]> git.pld-linux.org Git - packages/kernel.git/blame - linux-cluster-cman.patch
+CONFIG_IP_NF_MATCH_LAYER7=m
[packages/kernel.git] / linux-cluster-cman.patch
CommitLineData
50905e9e
AM
1diff -urN linux-2.6.7clean/arch/alpha/Kconfig linux-2.6.7/arch/alpha/Kconfig
2--- linux-2.6.7clean/arch/alpha/Kconfig 2004-07-05 10:44:04.000000000 +0100
3+++ linux-2.6.7/arch/alpha/Kconfig 2004-07-05 10:42:50.000000000 +0100
4@@ -698,3 +698,4 @@
4bf12011 5
50905e9e
AM
6 source "lib/Kconfig"
7
8+source "cluster/Kconfig"
9diff -urN linux-2.6.7clean/arch/arm/Kconfig linux-2.6.7/arch/arm/Kconfig
10--- linux-2.6.7clean/arch/arm/Kconfig 2004-07-05 10:44:04.000000000 +0100
11+++ linux-2.6.7/arch/arm/Kconfig 2004-07-05 10:50:40.000000000 +0100
12@@ -814,3 +814,4 @@
4bf12011 13
14 source "lib/Kconfig"
15
16+source "cluster/Kconfig"
50905e9e
AM
17diff -urN linux-2.6.7clean/arch/arm26/Kconfig linux-2.6.7/arch/arm26/Kconfig
18--- linux-2.6.7clean/arch/arm26/Kconfig 2004-07-05 10:43:54.000000000 +0100
19+++ linux-2.6.7/arch/arm26/Kconfig 2004-07-05 10:51:42.000000000 +0100
20@@ -333,3 +333,4 @@
21
22 source "lib/Kconfig"
23
24+source "cluster/Kconfig"
25diff -urN linux-2.6.7clean/arch/cris/Kconfig linux-2.6.7/arch/cris/Kconfig
26--- linux-2.6.7clean/arch/cris/Kconfig 2004-07-05 10:44:04.000000000 +0100
27+++ linux-2.6.7/arch/cris/Kconfig 2004-07-05 10:50:30.000000000 +0100
28@@ -216,3 +216,4 @@
29
30 source "lib/Kconfig"
31
32+source "cluster/Kconfig"
33diff -urN linux-2.6.7clean/arch/i386/Kconfig linux-2.6.7/arch/i386/Kconfig
34--- linux-2.6.7clean/arch/i386/Kconfig 2004-07-05 10:44:04.000000000 +0100
35+++ linux-2.6.7/arch/i386/Kconfig 2004-07-05 10:42:50.000000000 +0100
36@@ -1315,6 +1315,8 @@
4bf12011 37
38 source "lib/Kconfig"
39
40+source "cluster/Kconfig"
41+
42 config X86_SMP
43 bool
44 depends on SMP && !X86_VOYAGER
50905e9e
AM
45diff -urN linux-2.6.7clean/arch/ia64/Kconfig linux-2.6.7/arch/ia64/Kconfig
46--- linux-2.6.7clean/arch/ia64/Kconfig 2004-07-05 10:44:04.000000000 +0100
47+++ linux-2.6.7/arch/ia64/Kconfig 2004-07-05 10:49:31.000000000 +0100
48@@ -503,3 +503,5 @@
49 source "security/Kconfig"
50
51 source "crypto/Kconfig"
52+
53+source "cluster/Kconfig"
54diff -urN linux-2.6.7clean/arch/m68k/Kconfig linux-2.6.7/arch/m68k/Kconfig
55--- linux-2.6.7clean/arch/m68k/Kconfig 2004-07-05 10:44:10.000000000 +0100
56+++ linux-2.6.7/arch/m68k/Kconfig 2004-07-05 10:49:11.000000000 +0100
57@@ -696,3 +696,4 @@
58
59 source "lib/Kconfig"
60
61+source "cluster/Kconfig"
62diff -urN linux-2.6.7clean/arch/mips/Kconfig linux-2.6.7/arch/mips/Kconfig
63--- linux-2.6.7clean/arch/mips/Kconfig 2004-07-05 10:44:10.000000000 +0100
64+++ linux-2.6.7/arch/mips/Kconfig 2004-07-05 10:48:56.000000000 +0100
65@@ -1651,3 +1651,5 @@
66 source "crypto/Kconfig"
67
68 source "lib/Kconfig"
69+
70+source "cluster/Kconfig"
71diff -urN linux-2.6.7clean/arch/parisc/Kconfig linux-2.6.7/arch/parisc/Kconfig
72--- linux-2.6.7clean/arch/parisc/Kconfig 2004-07-05 10:44:10.000000000 +0100
73+++ linux-2.6.7/arch/parisc/Kconfig 2004-07-05 10:42:50.000000000 +0100
74@@ -229,3 +229,4 @@
75
76 source "lib/Kconfig"
77
78+source "cluster/Kconfig"
79diff -urN linux-2.6.7clean/arch/ppc/Kconfig linux-2.6.7/arch/ppc/Kconfig
80--- linux-2.6.7clean/arch/ppc/Kconfig 2004-07-05 10:44:10.000000000 +0100
81+++ linux-2.6.7/arch/ppc/Kconfig 2004-07-05 10:48:34.000000000 +0100
82@@ -1281,3 +1281,5 @@
83 source "security/Kconfig"
84
85 source "crypto/Kconfig"
86+
87+source "cluster/Kconfig"
88diff -urN linux-2.6.7clean/arch/ppc64/Kconfig linux-2.6.7/arch/ppc64/Kconfig
89--- linux-2.6.7clean/arch/ppc64/Kconfig 2004-07-05 10:44:10.000000000 +0100
90+++ linux-2.6.7/arch/ppc64/Kconfig 2004-07-05 10:48:43.000000000 +0100
91@@ -443,3 +443,4 @@
92
93 source "lib/Kconfig"
94
95+source "cluster/Kconfig"
96diff -urN linux-2.6.7clean/arch/s390/Kconfig linux-2.6.7/arch/s390/Kconfig
97--- linux-2.6.7clean/arch/s390/Kconfig 2004-07-05 10:44:02.000000000 +0100
98+++ linux-2.6.7/arch/s390/Kconfig 2004-07-05 10:48:22.000000000 +0100
99@@ -431,3 +431,4 @@
4bf12011 100
101 source "lib/Kconfig"
102
103+source "cluster/Kconfig"
50905e9e
AM
104diff -urN linux-2.6.7clean/arch/sh/Kconfig linux-2.6.7/arch/sh/Kconfig
105--- linux-2.6.7clean/arch/sh/Kconfig 2004-07-05 10:43:55.000000000 +0100
106+++ linux-2.6.7/arch/sh/Kconfig 2004-07-05 10:48:12.000000000 +0100
107@@ -798,3 +798,4 @@
4bf12011 108
109 source "lib/Kconfig"
110
111+source "cluster/Kconfig"
50905e9e
AM
112diff -urN linux-2.6.7clean/arch/sparc/Kconfig linux-2.6.7/arch/sparc/Kconfig
113--- linux-2.6.7clean/arch/sparc/Kconfig 2004-07-05 10:43:55.000000000 +0100
114+++ linux-2.6.7/arch/sparc/Kconfig 2004-07-05 10:47:47.000000000 +0100
115@@ -456,3 +456,4 @@
116
117 source "lib/Kconfig"
118
119+source "cluster/Kconfig"
120diff -urN linux-2.6.7clean/arch/sparc64/Kconfig linux-2.6.7/arch/sparc64/Kconfig
121--- linux-2.6.7clean/arch/sparc64/Kconfig 2004-07-05 10:44:10.000000000 +0100
122+++ linux-2.6.7/arch/sparc64/Kconfig 2004-07-05 10:42:50.000000000 +0100
123@@ -713,3 +713,4 @@
124
125 source "lib/Kconfig"
126
127+source "cluster/Kconfig"
128diff -urN linux-2.6.7clean/arch/um/Kconfig linux-2.6.7/arch/um/Kconfig
129--- linux-2.6.7clean/arch/um/Kconfig 2004-07-05 10:44:02.000000000 +0100
130+++ linux-2.6.7/arch/um/Kconfig 2004-07-05 10:47:29.000000000 +0100
131@@ -194,6 +194,8 @@
132
133 source "lib/Kconfig"
134
135+source "cluster/Kconfig"
136+
137 menu "SCSI support"
138
139 config SCSI
140diff -urN linux-2.6.7clean/arch/x86_64/Kconfig linux-2.6.7/arch/x86_64/Kconfig
141--- linux-2.6.7clean/arch/x86_64/Kconfig 2004-07-05 10:44:10.000000000 +0100
142+++ linux-2.6.7/arch/x86_64/Kconfig 2004-07-05 10:46:26.000000000 +0100
143@@ -505,3 +505,4 @@
144
145 source "lib/Kconfig"
146
147+source "cluster/Kconfig"
148diff -urN linux-2.6.7clean/cluster/cman/Makefile linux-2.6.7/cluster/cman/Makefile
149--- linux-2.6.7clean/cluster/cman/Makefile 1970-01-01 01:00:00.000000000 +0100
150+++ linux-2.6.7/cluster/cman/Makefile 2004-07-05 10:42:50.000000000 +0100
151@@ -0,0 +1,6 @@
152+cman-objs := cnxman.o config.o membership.o proc.o\
153+ sm_barrier.o sm_control.o sm_daemon.o sm_joinleave.o\
154+ sm_membership.o sm_message.o sm_misc.o sm_recover.o sm_services.o \
155+ sm_user.o
156+
157+obj-$(CONFIG_CLUSTER) := cman.o
158diff -urN linux-2.6.7clean/cluster/Kconfig linux-2.6.7/cluster/Kconfig
159--- linux-2.6.7clean/cluster/Kconfig 1970-01-01 01:00:00.000000000 +0100
160+++ linux-2.6.7/cluster/Kconfig 2004-07-05 10:42:50.000000000 +0100
4bf12011 161@@ -0,0 +1,13 @@
162+menu "Cluster Support"
163+
164+config CLUSTER
165+ tristate "Cluster support"
166+ ---help---
167+ Enable clustering support. This is not the high-performance clustering
168+ made famous by beowulf. It is a high-availability cluster often using
169+ shared storage.
170+ The cluster manager is the heart(beat) of the cluster system. It is
171+ needed by all the other components. It provides membership services
172+ for those other subsystems.
173+
174+endmenu
50905e9e
AM
175diff -urN linux-2.6.7clean/cluster/Makefile linux-2.6.7/cluster/Makefile
176--- linux-2.6.7clean/cluster/Makefile 1970-01-01 01:00:00.000000000 +0100
177+++ linux-2.6.7/cluster/Makefile 2004-07-05 10:42:50.000000000 +0100
4bf12011 178@@ -0,0 +1,3 @@
179+obj-y := nocluster.o
180+
181+obj-$(CONFIG_CLUSTER) += cman/
50905e9e
AM
182diff -urN linux-2.6.7clean/cluster/nocluster.c linux-2.6.7/cluster/nocluster.c
183--- linux-2.6.7clean/cluster/nocluster.c 1970-01-01 01:00:00.000000000 +0100
184+++ linux-2.6.7/cluster/nocluster.c 2004-07-05 10:42:50.000000000 +0100
4bf12011 185@@ -0,0 +1,20 @@
186+/*
187+ * cluster/nocluster.c
188+ *
189+ * Copy from net/nonet.c
190+ * Dummy functions to allow us to configure cluster support entirely
191+ * out of the kernel.
192+ *
193+ * Distributed under the terms of the GNU GPL version 2.
194+ * Copyright (c) Matthew Wilcox 2003
195+ */
196+
197+#include <linux/module.h>
198+#include <linux/errno.h>
199+#include <linux/fs.h>
200+#include <linux/init.h>
201+#include <linux/kernel.h>
202+
203+void __init nocluster_init(void)
204+{
205+}
50905e9e
AM
206diff -urN linux-2.6.7clean/Makefile linux-2.6.7/Makefile
207--- linux-2.6.7clean/Makefile 2004-07-05 10:44:04.000000000 +0100
208+++ linux-2.6.7/Makefile 2004-07-05 10:42:50.000000000 +0100
209@@ -418,7 +418,7 @@
210
211 # Objects we will link into vmlinux / subdirs we need to visit
212 init-y := init/
213-drivers-y := drivers/ sound/
214+drivers-y := drivers/ sound/ cluster/
215 net-y := net/
216 libs-y := lib/
217 core-y := usr/
4bf12011 218diff -urN linux-orig/cluster/cman/cnxman-private.h linux-patched/cluster/cman/cnxman-private.h
219--- linux-orig/cluster/cman/cnxman-private.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 220+++ linux-patched/cluster/cman/cnxman-private.h 2004-06-29 20:07:50.000000000 +0800
4bf12011 221@@ -0,0 +1,427 @@
222+/******************************************************************************
223+*******************************************************************************
224+**
225+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
226+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
227+**
228+** This copyrighted material is made available to anyone wishing to use,
229+** modify, copy, or redistribute it subject to the terms and conditions
230+** of the GNU General Public License v.2.
231+**
232+*******************************************************************************
233+******************************************************************************/
234+
235+#ifndef __CNXMAN_PRIVATE_H
236+#define __CNXMAN_PRIVATE_H
237+
238+/* Version triplet */
239+#define CNXMAN_MAJOR_VERSION 2
240+#define CNXMAN_MINOR_VERSION 0
241+#define CNXMAN_PATCH_VERSION 1
242+
243+#define MAX_RETRIES 3 /* Maximum number of send retries */
244+#define CAP_CLUSTER CAP_SYS_ADMIN /* Capability needed to manage the
245+ * cluster */
246+#ifdef __KERNEL__
247+
248+/* How we announce ourself in console events */
249+#define CMAN_NAME "CMAN"
250+
251+/* One of these per AF_CLUSTER socket */
252+struct cluster_sock {
253+ /* WARNING: sk has to be the first member */
254+ struct sock sk;
255+
256+ unsigned char port; /* Bound port or zero */
257+ int (*kernel_callback) (char *, int, char *, int, unsigned int);
258+ void *service_data;
259+};
260+
261+#define cluster_sk(__sk) ((struct cluster_sock *)__sk)
262+
263+/* We have one of these for each socket we use for communications */
264+struct cl_comms_socket {
265+ struct socket *sock;
266+ int broadcast; /* This is a broadcast socket */
267+ int recv_only; /* This is the unicast receive end of a
268+ * multicast socket */
269+ struct sockaddr_in6 saddr; /* Socket address, contains the sockaddr for
270+ * the remote end(s) */
271+ int addr_len; /* Length of above */
272+ int number; /* Internal socket number, used to cycle around
273+ * sockets in case of network errors */
274+ struct file *file; /* file pointer for user-passed in sockets */
275+
276+ wait_queue_t wait;
277+
278+ /* The socket list */
279+ struct list_head list;
280+
281+ /* On here when it has something to say */
282+ struct list_head active_list;
283+ unsigned long active;
284+};
285+
286+/* A client socket. We keep a list of these so we can notify clients of cluster
287+ * events */
288+struct cl_client_socket {
289+ struct socket *sock;
290+ struct list_head list;
291+};
292+
293+/* This structure is tacked onto the start of a cluster message packet for our
294+ * own nefarious purposes. */
295+struct cl_protheader {
296+ unsigned char port;
297+ unsigned char flags;
298+ unsigned short cluster; /* Our cluster number, little-endian */
299+ unsigned short seq; /* Packet sequence number, little-endian */
300+ int srcid; /* Node ID of the sender */
301+ int tgtid; /* Node ID of the target or 0 for multicast
302+ * messages */
303+};
304+
305+/* A cluster internal protocol message - port number 0 */
306+struct cl_protmsg {
307+ struct cl_protheader header;
308+ unsigned char cmd;
309+};
310+
311+/* A Cluster ACK message */
312+struct cl_ackmsg {
313+ struct cl_protheader header;
314+ unsigned char cmd; /* Always CLUSTER_CMD_ACK */
315+ unsigned char remport; /* Remoye port number the original message was
316+ * for */
317+ unsigned char aflags; /* ACK flags 0=OK, 1=No listener */
318+ unsigned char pad;
319+ unsigned short seq; /* Sequence number we are acking */
320+};
321+
322+/* A Cluster LISTENREQ/LISTENRESP message */
323+struct cl_listenmsg {
324+ unsigned char cmd; /* CLUSTER_CMD_LISTENRESP/REQ */
325+ unsigned char target_port; /* Port to probe */
326+ unsigned char listening; /* Always 0 for LISTENREQ */
327+ unsigned char pad;
328+ unsigned short tag; /* PID of remote waiting process */
329+};
330+
331+/* A Cluster PORTCLOSED message */
332+struct cl_closemsg {
333+ unsigned char cmd; /* CLUSTER_CMD_PORTCLOSED */
334+ unsigned char port;
335+};
336+
337+/* Structure of a newly dead node, passed from cnxman to kmembershipd */
338+struct cl_new_dead_node {
339+ struct list_head list;
340+ struct cluster_node *node;
341+};
342+
343+/* Subcommands for BARRIER message */
344+#define BARRIER_REGISTER 1
345+#define BARRIER_CHANGE 2
346+#define BARRIER_WAIT 4
347+#define BARRIER_COMPLETE 5
348+
349+/* A Cluster BARRIER message */
350+struct cl_barriermsg {
351+ unsigned char cmd; /* CLUSTER_CMD_BARRIER */
352+ unsigned char subcmd; /* BARRIER sub command */
353+ unsigned short pad;
354+ unsigned int flags;
355+ unsigned int nodes;
356+ char name[MAX_BARRIER_NAME_LEN];
357+};
358+
359+/* Membership services messages, the cl_protheader is added transparently */
360+struct cl_mem_hello_msg {
361+ unsigned char cmd;
362+ unsigned char flags;
363+ unsigned short members; /* Number of nodes in the cluster,
364+ * little-endian */
365+ unsigned int generation; /* Current cluster generation number */
366+};
367+
368+struct cl_mem_endtrans_msg {
369+ unsigned char cmd;
370+ unsigned char pad1;
371+ unsigned short pad2;
372+ unsigned int quorum;
373+ unsigned int total_votes;
374+ unsigned int generation; /* Current cluster generation number */
375+ unsigned int new_node_id; /* If reason is a new node joining */
376+};
377+
378+/* ACK types for JOINACK message */
379+#define JOINACK_TYPE_OK 1 /* You can join */
380+#define JOINACK_TYPE_NAK 2 /* You can NOT join */
381+#define JOINACK_TYPE_WAIT 3 /* Wait a bit longer - cluster is in transition
382+ * already */
383+
384+struct cl_mem_joinack_msg {
385+ unsigned char cmd;
386+ unsigned char acktype;
387+};
388+
389+/* This is used by JOINREQ message */
390+struct cl_mem_join_msg {
391+ unsigned char cmd;
392+ unsigned char votes;
393+ unsigned short num_addr; /* Number of addresses for this node */
394+ unsigned int expected_votes;
395+ unsigned int members; /* Number of nodes in the cluster,
396+ * little-endian */
397+ unsigned int major_version; /* Not backwards compatible */
398+ unsigned int minor_version; /* Backwards compatible */
399+ unsigned int patch_version; /* Backwards/forwards compatible */
400+ unsigned int config_version;
401+ unsigned int addr_len; /* length of node addresses */
402+ char clustername[16];
403+ /* Followed by <num_addr> addresses of `address_length` bytes and a
404+ * NUL-terminated node name */
405+};
406+
407+/* State transition start reasons: */
408+#define TRANS_NEWNODE 1 /* A new node is joining the cluster */
409+#define TRANS_REMNODE 2 /* a node has left the cluster */
410+#define TRANS_ANOTHERREMNODE 3 /* A node left the cluster while we were in
411+ * transition */
412+#define TRANS_NEWMASTER 4 /* We have had an election and I am the new
413+ * master */
414+#define TRANS_CHECK 5 /* A consistency check was called for */
415+#define TRANS_RESTART 6 /* Transition restarted because of a previous
416+ * timeout */
417+#define TRANS_DEADMASTER 7 /* The master died during transition and I have
418+ * taken over */
419+
420+/* This is used to start a state transition */
421+struct cl_mem_starttrans_msg {
422+ unsigned char cmd;
423+ unsigned char reason; /* Why a start transition was started - see
424+ * above */
425+ unsigned char flags;
426+ unsigned char votes;
427+ unsigned int expected_votes;
428+ unsigned int generation; /* Incremented for each STARTTRANS sent
429+ */
430+ int nodeid; /* Node to be removed */
431+ unsigned short num_addrs;
432+ /* If reason == TRANS_NEWNODE: Followed by <num_addr> addresses of
433+ * `address_length` bytes and a NUL-terminated node name */
434+};
435+
436+struct cl_mem_startack_msg {
437+ unsigned char cmd;
438+ unsigned char reason;
439+ unsigned short pad;
440+ unsigned int generation;
441+ unsigned int node_id; /* node_id we think new node should have */
442+ unsigned int highest_node_id; /* highest node_id on this system */
443+};
444+
445+/* Reconfigure a cluster parameter */
446+struct cl_mem_reconfig_msg {
447+ unsigned char cmd;
448+ unsigned char param;
449+ unsigned short pad;
450+ unsigned int value;
451+};
452+
453+/* Structure containing information about an outstanding listen request */
454+struct cl_waiting_listen_request {
455+ wait_queue_head_t waitq;
456+ int result;
457+ int waiting;
458+ unsigned short tag;
459+ int nodeid;
460+ struct list_head list;
461+};
462+
463+/* Messages from membership services */
464+#define CLUSTER_MEM_JOINCONF 1
465+#define CLUSTER_MEM_JOINREQ 2
466+#define CLUSTER_MEM_LEAVE 3
467+#define CLUSTER_MEM_HELLO 4
468+#define CLUSTER_MEM_KILL 5
469+#define CLUSTER_MEM_JOINACK 6
470+#define CLUSTER_MEM_ENDTRANS 7
471+#define CLUSTER_MEM_RECONFIG 8
472+#define CLUSTER_MEM_MASTERVIEW 9
473+#define CLUSTER_MEM_STARTTRANS 10
474+#define CLUSTER_MEM_JOINREJ 11
475+#define CLUSTER_MEM_VIEWACK 12
476+#define CLUSTER_MEM_STARTACK 13
477+#define CLUSTER_MEM_TRANSITION 14
478+#define CLUSTER_MEM_NEWCLUSTER 15
479+#define CLUSTER_MEM_CONFACK 16
480+#define CLUSTER_MEM_NOMINATE 17
481+
482+/* Parameters for RECONFIG command */
483+#define RECONFIG_PARAM_EXPECTED_VOTES 1
484+#define RECONFIG_PARAM_NODE_VOTES 2
485+#define RECONFIG_PARAM_CONFIG_VERSION 3
486+
487+/* Data associated with an outgoing socket */
488+struct cl_socket {
489+ struct file *file; /* The real file */
490+ struct socket *socket; /* The real sock */
491+ struct cl_multicast_sock multicast_info;
492+ int num_nodes; /* On this link */
493+ int retransmit_count;
494+};
495+
496+/* There's one of these for each node in the cluster */
497+struct cluster_node {
498+ struct list_head list;
499+ char *name; /* Node/host name of node */
500+ struct list_head addr_list;
501+ int us; /* This node is us */
502+ unsigned int node_id; /* Unique node ID */
503+ nodestate_t state;
504+ unsigned short last_seq_recv;
505+ unsigned short last_seq_acked;
506+ unsigned short last_seq_sent;
507+ unsigned int votes;
508+ unsigned int expected_votes;
509+ unsigned int leave_reason;
510+ unsigned int incarnation; /* Incremented each time a node joins
511+ * the cluster */
512+ unsigned long last_hello; /* Jiffies */
513+};
514+
515+/* This is how we keep a list of user processes that are listening for cluster
516+ * membership events */
517+struct notify_struct {
518+ struct list_head list;
519+ pid_t pid;
520+ int signal;
521+};
522+
523+/* This is how we keep a list of kernel callbacks that are registered for
524+ * cluster membership events */
525+struct kernel_notify_struct {
526+ struct list_head list;
527+ void (*callback) (kcl_callback_reason, long arg);
528+};
529+
530+/* A message waiting to be sent */
531+struct queued_message {
532+ struct list_head list;
533+
534+ struct socket *socket;
535+ struct sockaddr_cl addr;
536+ int addr_len;
537+ int msg_len;
538+ unsigned char port;
539+ unsigned int flags;
540+ char msg_buffer[MAX_CLUSTER_MESSAGE];
541+};
542+
543+/* A barrier */
544+struct cl_barrier {
545+ struct list_head list;
546+
547+ char name[MAX_BARRIER_NAME_LEN];
548+ unsigned int flags;
549+ enum { BARRIER_STATE_WAITING, BARRIER_STATE_INACTIVE,
550+ BARRIER_STATE_COMPLETE } state;
551+ unsigned int expected_nodes;
552+ unsigned int registered_nodes;
553+ atomic_t got_nodes;
554+ atomic_t completed_nodes;
555+ unsigned int inuse;
556+ unsigned int waitsent;
557+ unsigned int phase; /* Completion phase */
558+ unsigned int endreason; /* Reason we were woken, usually 0 */
559+ unsigned long timeout; /* In seconds */
560+
561+ void (*callback) (char *name, int status);
562+ wait_queue_head_t waitq;
563+ struct semaphore lock; /* To synch with cnxman messages */
564+ spinlock_t phase2_spinlock; /* Need to synchronise with timer
565+ * interrupts */
566+ struct timer_list timer;
567+};
568+
569+/* Cluster protocol commands sent to port 0 */
570+#define CLUSTER_CMD_ACK 1
571+#define CLUSTER_CMD_LISTENREQ 2
572+#define CLUSTER_CMD_LISTENRESP 3
573+#define CLUSTER_CMD_PORTCLOSED 4
574+#define CLUSTER_CMD_BARRIER 5
575+
576+extern struct cluster_node *find_node_by_addr(unsigned char *addr,
577+ int addr_len);
578+extern struct cluster_node *find_node_by_nodeid(unsigned int id);
579+extern struct cluster_node *find_node_by_name(char *name);
580+extern void set_quorate(int);
581+extern void notify_kernel_listeners(kcl_callback_reason reason, long arg);
582+extern void notify_listeners(void);
583+extern void free_nodeid_array(void);
584+extern int send_reconfigure(int param, unsigned int value);
585+extern int calculate_quorum(int, int, int *);
586+extern void recalculate_quorum(int);
587+extern int send_leave(unsigned char);
588+extern int get_quorum(void);
589+extern void set_votes(int, int);
590+extern void kcl_wait_for_all_acks(void);
591+extern char *membership_state(char *, int);
592+extern void a_node_just_died(struct cluster_node *node);
593+extern void check_barrier_returns(void);
594+extern int in_transition(void);
595+extern void get_local_addresses(struct cluster_node *node);
596+extern int add_node_address(struct cluster_node *node, unsigned char *addr, int len);
597+extern void create_proc_entries(void);
598+extern void cleanup_proc_entries(void);
599+extern unsigned int get_highest_nodeid(void);
600+extern int allocate_nodeid_array(void);
601+extern void queue_oob_skb(struct socket *sock, int cmd);
602+extern int new_temp_nodeid(char *addr, int addrlen);
603+extern int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen);
604+extern void remove_temp_nodeid(int nodeid);
605+extern inline char *print_addr(unsigned char *addr, int len, char *buf)
606+{
607+ int i;
608+ int ptr = 0;
609+
610+ for (i = 0; i < len; i++)
611+ ptr += sprintf(buf + ptr, "%02x ", addr[i]);
612+
613+ return buf;
614+}
615+
616+#define MAX_ADDR_PRINTED_LEN (address_length*3 + 1)
617+
618+/* Debug enabling macros. Sorry about the C++ comments but they're easier to
619+ * get rid of than C ones... */
620+
621+// #define DEBUG_MEMB
622+// #define DEBUG_COMMS
623+// #define DEBUG_BARRIER
624+
625+/* Debug macros */
626+#ifdef DEBUG_COMMS
627+#define P_COMMS(fmt, args...) printk(KERN_DEBUG "cman comms: " fmt, ## args)
628+#else
629+#define P_COMMS(fmt, args...)
630+#endif
631+
632+#ifdef DEBUG_BARRIER
633+#define P_BARRIER(fmt, args...) printk(KERN_DEBUG "cman barrier: " fmt, ## args)
634+#else
635+#define P_BARRIER(fmt, args...)
636+#endif
637+
638+#ifdef DEBUG_MEMB
639+#define P_MEMB(fmt, args...) printk(KERN_DEBUG "cman memb: " fmt, ## args)
640+#define C_MEMB(fmt, args...) printk(fmt, ## args)
641+#else
642+#define P_MEMB(fmt, args...)
643+#define C_MEMB(fmt, args...)
644+#endif
645+
646+#endif /* __KERNEL */
647+
648+#endif
649diff -urN linux-orig/cluster/cman/cnxman.c linux-patched/cluster/cman/cnxman.c
650--- linux-orig/cluster/cman/cnxman.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 651+++ linux-patched/cluster/cman/cnxman.c 2004-06-29 20:07:50.000000000 +0800
4bf12011 652@@ -0,0 +1,4080 @@
653+/******************************************************************************
654+*******************************************************************************
655+**
656+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
657+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
658+**
659+** This copyrighted material is made available to anyone wishing to use,
660+** modify, copy, or redistribute it subject to the terms and conditions
661+** of the GNU General Public License v.2.
662+**
663+*******************************************************************************
664+******************************************************************************/
665+
666+#define EXPORT_SYMTAB
667+#include <linux/init.h>
668+#include <linux/socket.h>
669+#include <linux/kernel.h>
670+#include <linux/sched.h>
671+#include <linux/file.h>
672+#include <linux/utsname.h>
673+#include <net/sock.h>
674+#include <linux/proc_fs.h>
675+#include <linux/poll.h>
676+#include <linux/module.h>
677+#include <linux/list.h>
678+#include <cluster/cnxman.h>
679+#include <cluster/service.h>
680+
681+#include "cnxman-private.h"
682+#include "sm_control.h"
683+#include "sm_user.h"
684+#include "config.h"
685+
686+#define CMAN_RELEASE_NAME "<CVS>"
687+
688+static int __cl_setsockopt(struct socket *sock, int level, int optname,
689+ char *optval, int optlen, int flags);
690+static int __cl_getsockopt(struct socket *sock, int level, int optname,
691+ char *optval, int *optlen, int flags);
692+static void send_to_userport(struct cl_comms_socket *csock, char *data, int len,
693+ char *addr, int addrlen);
694+static int cl_sendack(struct cl_comms_socket *sock, unsigned short seq,
695+ int addr_len, char *addr, unsigned char remport,
696+ unsigned char flag);
697+static void send_listen_request(int nodeid, unsigned char port);
698+static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
699+ unsigned char port, unsigned short tag);
700+static void resend_last_message(void);
701+static void start_ack_timer(void);
702+static int send_queued_message(struct queued_message *qmsg);
703+static void send_port_close_oob(unsigned char port);
704+static void post_close_oob(unsigned char port, int nodeid);
705+static void process_barrier_msg(struct cl_barriermsg *msg,
706+ struct cluster_node *node);
707+static struct cl_barrier *find_barrier(char *name);
708+static void node_shutdown(void);
709+static void node_cleanup(void);
710+static int send_or_queue_message(void *buf, int len, struct sockaddr_cl *caddr,
711+ unsigned char port);
712+static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur);
713+static void check_for_unacked_nodes(void);
714+static void free_cluster_sockets(void);
715+static uint16_t generate_cluster_id(char *name);
716+
717+static int is_valid_temp_nodeid(int nodeid);
718+
719+extern int start_membership_services(pid_t);
720+extern int kcl_leave_cluster(int remove);
721+extern int send_kill(int nodeid);
722+
723+static struct proto_ops cl_proto_ops;
724+static struct sock *master_sock;
725+static kmem_cache_t *cluster_sk_cachep;
726+
727+/* Pointer to the pseudo node that maintains quorum in a 2node system */
728+struct cluster_node *quorum_device = NULL;
729+
730+/* Array of "ports" allocated. This is just a list of pointers to the sock that
731+ * has this port bound. Speed is a major issue here so 1-2K of allocated
732+ * storage is worth sacrificing. Port 0 is reserved for protocol messages */
733+static struct sock *port_array[256];
734+static struct semaphore port_array_lock;
735+
736+/* Our cluster name & number */
737+unsigned short cluster_id;
738+char cluster_name[MAX_CLUSTER_NAME_LEN+1];
739+
740+/* Two-node mode: causes cluster to remain quorate if one of two nodes fails.
741+ * No more than two nodes are permitted to join the cluster. */
742+unsigned short two_node;
743+
744+/* Cluster configuration version that must be the same among members. */
745+unsigned int config_version;
746+
747+/* Reference counting for cluster applications */
748+atomic_t use_count;
749+
750+/* Length of sockaddr address for our comms protocol */
751+unsigned int address_length;
752+
753+/* Message sending */
754+static unsigned short cur_seq; /* Last message sent */
755+static unsigned int ack_count; /* Number of acks received for message
756+ * 'cur_seq' */
757+static unsigned int acks_expected; /* Number of acks we expect to receive */
758+static struct semaphore send_lock;
759+static struct timer_list ack_timer;
760+
761+/* Saved packet information in case we need to resend it */
762+static char saved_msg_buffer[MAX_CLUSTER_MESSAGE];
763+static int saved_msg_len;
764+static int retry_count;
765+
766+/* Task variables */
767+static pid_t kcluster_pid;
768+static pid_t membership_pid;
769+extern int quit_threads;
770+
771+wait_queue_head_t cnxman_waitq;
772+
773+/* Variables owned by membership services */
774+extern int cluster_members;
775+extern struct list_head cluster_members_list;
776+extern struct semaphore cluster_members_lock;
777+extern int we_are_a_cluster_member;
778+extern int cluster_is_quorate;
779+extern struct cluster_node *us;
780+extern struct list_head new_dead_node_list;
781+extern struct semaphore new_dead_node_lock;
782+extern char nodename[];
783+
784+/* A list of processes listening for membership events */
785+static struct list_head event_listener_list;
786+static struct semaphore event_listener_lock;
787+
788+/* A list of kernel callbacks listening for membership events */
789+static struct list_head kernel_listener_list;
790+static struct semaphore kernel_listener_lock;
791+
792+/* A list of sockets we are listening on (and can transmit on...later) */
793+static struct list_head socket_list;
794+
795+/* A list of all open cluster client sockets */
796+static struct list_head client_socket_list;
797+static struct semaphore client_socket_lock;
798+
799+/* A list of all current barriers */
800+static struct list_head barrier_list;
801+static struct semaphore barrier_list_lock;
802+
803+/* When a socket is read for reading it goes on this queue */
804+static spinlock_t active_socket_lock;
805+static struct list_head active_socket_list;
806+
807+/* If the cnxman process is running and available for work */
808+atomic_t cnxman_running;
809+
810+/* Fkags set by timers etc for the mainloop to detect and act upon */
811+static unsigned long mainloop_flags;
812+
813+#define ACK_TIMEOUT 1
814+#define RESEND_NEEDED 2
815+
816+/* A queue of messages waiting to be sent. If kcl_sendmsg is called outside of
817+ * process context then the messages get put in here */
818+static struct list_head messages_list;
819+static struct semaphore messages_list_lock;
820+
821+static struct semaphore start_thread_sem;
822+
823+/* List of outstanding ISLISTENING requests */
824+static struct list_head listenreq_list;
825+static struct semaphore listenreq_lock;
826+
827+/* Any sending requests wait on this queue if necessary (eg inquorate, waiting
828+ * ACK) */
829+static DECLARE_WAIT_QUEUE_HEAD(socket_waitq);
830+
831+/* Wait for thread to exit properly */
832+struct completion cluster_thread_comp;
833+struct completion member_thread_comp;
834+
835+/* The resend delay to use, We increase this geometrically(word?) each time a
836+ * send is delayed. in deci-seconds */
837+static int resend_delay = 1;
838+
839+/* Highest numbered interface and the current default */
840+static int num_interfaces = 0;
841+static struct cl_comms_socket *current_interface = NULL;
842+
843+struct temp_node
844+{
845+ int nodeid;
846+ char addr[sizeof(struct sockaddr_in6)];
847+ int addrlen;
848+ struct list_head list;
849+};
850+static struct list_head tempnode_list;
851+static struct semaphore tempnode_lock;
852+
853+/* Wake up any processes that are waiting to send. This is usually called when
854+ * all the ACKs have been gathered up or when a node has left the cluster
855+ * unexpectedly and we reckon there are no more acks to collect */
856+static void unjam(void)
857+{
858+ wake_up_interruptible(&socket_waitq);
859+ wake_up_interruptible(&cnxman_waitq);
860+}
861+
862+/* Used by the data_ready routine to locate a connection given the socket */
863+static inline struct cl_comms_socket *find_comms_by_sock(struct sock *sk)
864+{
865+ struct list_head *conlist;
866+
867+ list_for_each(conlist, &socket_list) {
868+ struct cl_comms_socket *clsock =
869+ list_entry(conlist, struct cl_comms_socket, list);
870+ if (clsock->sock->sk == sk) {
871+ return clsock;
872+ }
873+ }
874+ return NULL;
875+}
876+
877+/* Data available on socket */
878+static void cnxman_data_ready(struct sock *sk, int count_unused)
879+{
880+ struct cl_comms_socket *clsock = find_comms_by_sock(sk);
881+
882+	if (clsock == NULL)	/* not in socket_list - should not happen */
883+ return;
884+
885+ /* If we're already on the list then don't do it again */
886+ if (test_and_set_bit(1, &clsock->active))
887+ return;
888+
889+ spin_lock_irq(&active_socket_lock);
890+ list_add(&clsock->active_list, &active_socket_list);
891+ spin_unlock_irq(&active_socket_lock);
892+
893+ wake_up_interruptible(&cnxman_waitq);
894+}
895+
896+static int receive_message(struct cl_comms_socket *csock, char *iobuf)
897+{
898+ struct msghdr msg;
899+ struct iovec iov;
900+ struct sockaddr_in6 sin;
901+ int len;
902+ mm_segment_t fs;
903+
904+ memset(&sin, 0, sizeof (sin));
905+
906+ msg.msg_control = NULL;
907+ msg.msg_controllen = 0;
908+ msg.msg_iovlen = 1;
909+ msg.msg_iov = &iov;
910+ msg.msg_name = &sin;
911+ msg.msg_namelen = sizeof (sin);
912+ msg.msg_flags = 0;
913+
914+ iov.iov_len = MAX_CLUSTER_MESSAGE;
915+ iov.iov_base = iobuf;
916+
917+ fs = get_fs();
918+ set_fs(get_ds());
919+
920+ len = sock_recvmsg(csock->sock, &msg, MAX_CLUSTER_MESSAGE, MSG_DONTWAIT);
921+ set_fs(fs);
922+
923+ if (len > 0) {
924+ if (len > MAX_CLUSTER_MESSAGE) {
925+ printk(KERN_CRIT CMAN_NAME
926+ ": %d byte message far too big\n", len);
927+ return 0;
928+ }
929+ send_to_userport(csock, iobuf, len, msg.msg_name, msg.msg_namelen);
930+ }
931+ else {
932+ if (len != -EAGAIN)
933+ printk(KERN_CRIT CMAN_NAME ": recvmsg failed: %d\n",
934+ len);
935+ }
936+ return len;
937+}
938+
939+static int cluster_kthread(void *unused)
940+{
941+ int len;
942+ char *iobuf;
943+ struct list_head *socklist;
944+ struct cl_comms_socket *csock;
945+ wait_queue_t cnxman_waitq_head;
946+ sigset_t tmpsig;
947+
948+ daemonize("cman_comms");
949+
950+	/* Block SIGKILL/SIGSTOP/SIGTERM (NOTE(review): siginitset+SIG_BLOCK blocks these three; "block everything but" suggests siginitsetinv was intended - confirm) */
951+ siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
952+ sigprocmask(SIG_BLOCK, &tmpsig, NULL);
953+
954+ /* This is the waitq we can wake the process up with */
955+ init_waitqueue_head(&cnxman_waitq);
956+ init_waitqueue_entry(&cnxman_waitq_head, current);
957+ add_wait_queue(&cnxman_waitq, &cnxman_waitq_head);
958+
959+ set_user_nice(current, -6);
960+
961+ /* Allow the sockets to start receiving */
962+ list_for_each(socklist, &socket_list) {
963+ csock = list_entry(socklist, struct cl_comms_socket, list);
964+
965+ clear_bit(1, &csock->active);
966+ }
967+
968+ iobuf = kmalloc(MAX_CLUSTER_MESSAGE, GFP_KERNEL);
969+ if (!iobuf) {
970+ printk(KERN_CRIT CMAN_NAME
971+ ": Cannot allocate receive buffer for cluster comms\n");
972+ return -1;
973+ }
974+
975+ complete(&cluster_thread_comp);
976+
977+ for (;;) {
978+ struct list_head *temp;
979+
980+ /* Wait for activity on any of the sockets */
981+ set_task_state(current, TASK_INTERRUPTIBLE);
982+
983+ if (list_empty(&active_socket_list))
984+ schedule();
985+ set_task_state(current, TASK_RUNNING);
986+
987+ if (quit_threads)
988+ break;
989+
990+ if (test_and_clear_bit(ACK_TIMEOUT, &mainloop_flags)) {
991+ check_for_unacked_nodes();
992+ }
993+
994+ /* Now receive any messages waiting for us */
995+ spin_lock_irq(&active_socket_lock);
996+ list_for_each_safe(socklist, temp, &active_socket_list) {
997+ csock =
998+ list_entry(socklist, struct cl_comms_socket,
999+ active_list);
1000+
1001+ list_del(&csock->active_list);
1002+ clear_bit(1, &csock->active);
1003+
1004+ spin_unlock_irq(&active_socket_lock);
1005+
1006+ do {
1007+ len = receive_message(csock, iobuf);
1008+ }
1009+ while (len > 0);
1010+
1011+ spin_lock_irq(&active_socket_lock);
1012+
1013+ if (len == 0)
1014+ break; /* EOF on socket */
1015+ }
1016+ spin_unlock_irq(&active_socket_lock);
1017+
1018+ /* Resend any unacked messages */
1019+ if (test_and_clear_bit(RESEND_NEEDED, &mainloop_flags)
1020+ && acks_expected) {
1021+ resend_last_message();
1022+ }
1023+
1024+ /* Send any queued messages */
1025+ if (acks_expected == 0) {
1026+ struct list_head *temp;
1027+ struct list_head *msglist;
1028+
1029+ down(&messages_list_lock);
1030+ list_for_each_safe(msglist, temp, &messages_list) {
1031+ struct queued_message *qmsg =
1032+ list_entry(msglist, struct queued_message,
1033+ list);
1034+ int status = send_queued_message(qmsg);
1035+
1036+ if (status >= 0) {
1037+				/* Succeeded, remove it from the queue */
1038+ list_del(&qmsg->list);
1039+ kfree(qmsg);
1040+ }
1041+ /* Did it fail horribly ?? */
1042+ if (status < 0 && status != -EAGAIN) {
1043+ printk(KERN_INFO CMAN_NAME
1044+ ": send_queued_message failed, error %d\n",
1045+ status);
1046+ list_del(&qmsg->list);
1047+ kfree(qmsg);
1048+ }
1049+ break; /* Only send one message at a time */
1050+ }
1051+ up(&messages_list_lock);
1052+ }
1053+
1054+ if (signal_pending(current))
1055+ break;
1056+ }
1057+ P_COMMS("closing down\n");
1058+
1059+ if (we_are_a_cluster_member)
1060+ send_leave(us->leave_reason);
1061+
1062+ kfree(iobuf);
1063+ quit_threads = 1; /* force other thread to die too */
1064+ node_shutdown();
1065+
1066+ if (timer_pending(&ack_timer))
1067+ del_timer(&ack_timer);
1068+
1069+ /* Wait for membership thread to die */
1070+ wait_for_completion(&member_thread_comp);
1071+
1072+ node_cleanup();
1073+
1074+ complete(&cluster_thread_comp);
1075+ return 0;
1076+}
1077+
1078+void notify_kernel_listeners(kcl_callback_reason reason, long arg)
1079+{
1080+ struct kernel_notify_struct *knotify;
1081+ struct list_head *proclist;
1082+
1083+ down(&kernel_listener_lock);
1084+ list_for_each(proclist, &kernel_listener_list) {
1085+ knotify =
1086+ list_entry(proclist, struct kernel_notify_struct, list);
1087+ knotify->callback(reason, arg);
1088+ }
1089+ up(&kernel_listener_lock);
1090+}
1091+
1092+static void check_for_unacked_nodes()
1093+{
1094+ struct list_head *nodelist;
1095+ struct cluster_node *node;
1096+
1097+ clear_bit(RESEND_NEEDED, &mainloop_flags);
1098+ retry_count = 0;
1099+
1100+ P_COMMS("Retry count exceeded -- looking for dead node\n");
1101+
1102+ /* Node did not ACK a message after <n> tries, remove it from the
1103+ * cluster */
1104+ down(&cluster_members_lock);
1105+ list_for_each(nodelist, &cluster_members_list) {
1106+ node = list_entry(nodelist, struct cluster_node, list);
1107+
1108+ P_COMMS
1109+ ("checking node %s: last_acked = %d, last_seq_sent = %d\n",
1110+ node->name, node->last_seq_acked, node->last_seq_sent);
1111+ if (node->state != NODESTATE_DEAD
1112+ && node->last_seq_acked != node->last_seq_sent && !node->us) {
1113+ printk(KERN_WARNING CMAN_NAME
1114+ ": node %s is not responding - removing from the cluster\n",
1115+ node->name);
1116+
1117+ /* Start a state transition */
1118+ a_node_just_died(node);
1119+ }
1120+ }
1121+ up(&cluster_members_lock);
1122+ acks_expected = ack_count = 0;
1123+ unjam();
1124+ return;
1125+}
1126+
1127+static void ack_timer_fn(unsigned long arg)
1128+{
1129+ P_COMMS("%ld: ack_timer fired, retries=%d\n", jiffies, retry_count);
1130+
1131+ /* Too many retries ? */
1132+ if (++retry_count > MAX_RETRIES) {
1133+ set_bit(ACK_TIMEOUT, &mainloop_flags);
1134+ wake_up_interruptible(&cnxman_waitq);
1135+ }
1136+ else {
1137+ /* Resend last message */
1138+ set_bit(RESEND_NEEDED, &mainloop_flags);
1139+ wake_up_interruptible(&cnxman_waitq);
1140+ }
1141+}
1142+
1143+/* Called to resend a packet if sock_sendmsg was busy */
1144+static void short_timer_fn(unsigned long arg)
1145+{
1146+ P_COMMS("short_timer fired\n");
1147+
1148+ /* Resend last message */
1149+ resend_delay <<= 1;
1150+ set_bit(RESEND_NEEDED, &mainloop_flags);
1151+ wake_up_interruptible(&cnxman_waitq);
1152+}
1153+
1154+static void start_ack_timer()
1155+{
1156+ ack_timer.function = ack_timer_fn;
1157+ ack_timer.data = 0L;
1158+ mod_timer(&ack_timer, jiffies + HZ);
1159+}
1160+
1161+static void start_short_timer(void)
1162+{
1163+ ack_timer.function = short_timer_fn;
1164+ ack_timer.data = 0L;
1165+ mod_timer(&ack_timer, jiffies + (resend_delay * HZ));
1166+}
1167+
1168+
1169+static struct cl_waiting_listen_request *find_listen_request(unsigned short tag)
1170+{
1171+ struct list_head *llist;
1172+ struct cl_waiting_listen_request *listener;
1173+
1174+ down(&listenreq_lock);
1175+ list_for_each(llist, &listenreq_list) {
1176+ listener =
1177+ list_entry(llist, struct cl_waiting_listen_request, list);
1178+ if (listener->tag == tag) {
1179+ up(&listenreq_lock);
1180+ return listener;
1181+ }
1182+ }
1183+ up(&listenreq_lock);
1184+ return NULL;
1185+}
1186+
1187+static void process_cnxman_message(struct cl_comms_socket *csock, char *data,
1188+ int len, char *addr, int addrlen,
1189+ struct cluster_node *rem_node)
1190+{
1191+ struct cl_protmsg *msg = (struct cl_protmsg *) data;
1192+ struct cl_protheader *header = (struct cl_protheader *) data;
1193+ struct cl_ackmsg *ackmsg;
1194+ struct cl_listenmsg *listenmsg;
1195+ struct cl_closemsg *closemsg;
1196+ struct cl_barriermsg *barriermsg;
1197+ struct cl_waiting_listen_request *listen_request;
1198+
1199+ P_COMMS("Message on port 0 is %d\n", msg->cmd);
1200+ switch (msg->cmd) {
1201+ case CLUSTER_CMD_ACK:
1202+ ackmsg = (struct cl_ackmsg *) data;
1203+
1204+ if (ackmsg->aflags & 1) {
1205+ if (net_ratelimit())
1206+ printk(KERN_INFO CMAN_NAME
1207+ ": WARNING no listener for port %d on node %s\n",
1208+ ackmsg->remport, rem_node->name);
1209+ }
1210+ P_COMMS("Got ACK from %s. seq=%d (cur=%d)\n",
1211+ rem_node ? rem_node->name : "Unknown",
1212+ le16_to_cpu(ackmsg->seq), cur_seq);
1213+
1214+ if (rem_node && rem_node->state != NODESTATE_DEAD) {
1215+ /* This copes with duplicate acks from a multipathed
1216+ * host */
1217+ if (rem_node->last_seq_acked !=
1218+ le16_to_cpu(ackmsg->seq)) {
1219+ rem_node->last_seq_acked =
1220+ le16_to_cpu(ackmsg->seq);
1221+
1222+ /* Got em all */
1223+ if (++ack_count >= acks_expected) {
1224+
1225+ /* Cancel the timer */
1226+ del_timer(&ack_timer);
1227+ acks_expected = 0;
1228+ unjam();
1229+ }
1230+ }
1231+ }
1232+ else {
1233+ if (cluster_members) {
1234+#ifdef DEBUG_COMMS
1235+ char buf[MAX_ADDR_PRINTED_LEN];
1236+
1237+ printk(KERN_INFO CMAN_NAME
1238+ ": got ack from unknown or dead node: %s\n",
1239+ print_addr(addr, addrlen, buf));
1240+#endif
1241+ }
1242+ }
1243+ break;
1244+
1245+ /* Return 1 if we have a listener on this port, 0 if not */
1246+ case CLUSTER_CMD_LISTENREQ:
1247+ listenmsg =
1248+ (struct cl_listenmsg *) (data +
1249+ sizeof (struct cl_protheader));
1250+ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
1251+ send_listen_response(csock, le32_to_cpu(header->srcid),
1252+ listenmsg->target_port, listenmsg->tag);
1253+ break;
1254+
1255+ case CLUSTER_CMD_LISTENRESP:
1256+ /* Wake up process waiting for listen response */
1257+ listenmsg =
1258+ (struct cl_listenmsg *) (data +
1259+ sizeof (struct cl_protheader));
1260+ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
1261+ listen_request = find_listen_request(listenmsg->tag);
1262+ if (listen_request) {
1263+ listen_request->result = listenmsg->listening;
1264+ listen_request->waiting = 0;
1265+ wake_up_interruptible(&listen_request->waitq);
1266+ }
1267+ break;
1268+
1269+ case CLUSTER_CMD_PORTCLOSED:
1270+ closemsg =
1271+ (struct cl_closemsg *) (data +
1272+ sizeof (struct cl_protheader));
1273+ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
1274+ post_close_oob(closemsg->port, le32_to_cpu(header->srcid));
1275+ break;
1276+
1277+ case CLUSTER_CMD_BARRIER:
1278+ barriermsg =
1279+ (struct cl_barriermsg *) (data +
1280+ sizeof (struct cl_protheader));
1281+ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
1282+ process_barrier_msg(barriermsg, rem_node);
1283+ break;
1284+
1285+ default:
1286+ printk(KERN_ERR CMAN_NAME
1287+ ": Unknown protocol message %d received\n", msg->cmd);
1288+ break;
1289+
1290+ }
1291+ return;
1292+}
1293+
1294+static void send_to_userport(struct cl_comms_socket *csock, char *data, int len,
1295+ char *addr, int addrlen)
1296+{
1297+ int err;
1298+ struct cl_protheader *header = (struct cl_protheader *) data;
1299+ struct cluster_node *rem_node =
1300+ find_node_by_nodeid(le32_to_cpu(header->srcid));
1301+ struct sk_buff *skb = NULL;
1302+
1303+ P_COMMS
1304+ ("seen message, from %d for %d, sequence num = %d, rem_node=%p, state=%d\n",
1305+ le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
1306+ le16_to_cpu(header->seq), rem_node,
1307+ rem_node ? rem_node->state : -1);
1308+
1309+ /* If the remote end is being coy about its node ID then look it up by
1310+ * address */
1311+ if (!rem_node && header->srcid == 0) {
1312+ rem_node = find_node_by_addr(addr, addrlen);
1313+ }
1314+
1315+ /* If this node is an ex-member then treat it as unknown */
1316+ if (rem_node && rem_node->state != NODESTATE_MEMBER
1317+ && rem_node->state != NODESTATE_JOINING)
1318+ rem_node = NULL;
1319+
1320+ /* Ignore messages not for our cluster */
1321+ if (le16_to_cpu(header->cluster) != cluster_id) {
1322+ P_COMMS("Dumping message - wrong cluster ID (us=%d, msg=%d)\n",
1323+ cluster_id, header->cluster);
1324+ goto userport_finish;
1325+ }
1326+
1327+ /* If the message is from us then just dump it */
1328+ if (rem_node && rem_node->us)
1329+ goto userport_finish;
1330+
1331+ /* If we can't find the nodeid then check for our own messages the hard
1332+ * way - this only happens during joining */
1333+ if (!rem_node) {
1334+ struct list_head *socklist;
1335+ struct cl_comms_socket *clsock;
1336+
1337+ list_for_each(socklist, &socket_list) {
1338+ clsock =
1339+ list_entry(socklist, struct cl_comms_socket, list);
1340+
1341+ if (clsock->recv_only) {
1342+
1343+ if (memcmp(addr, &clsock->saddr, address_length) == 0) {
1344+ goto userport_finish;
1345+ }
1346+ }
1347+ }
1348+
1349+ }
1350+
1351+ /* Ignore messages not for us */
1352+ if (le32_to_cpu(header->tgtid) > 0 && us
1353+ && le32_to_cpu(header->tgtid) != us->node_id) {
1354+ goto userport_finish;
1355+ }
1356+
1357+ P_COMMS("got message, from %d for %d, sequence num = %d\n",
1358+ le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
1359+ le16_to_cpu(header->seq));
1360+
1361+ /* Have we received this message before ? If so just ignore it, it's a
1362+ * resend for someone else's benefit */
1363+ if (!(header->flags & (MSG_NOACK >> 16)) &&
1364+ rem_node && le16_to_cpu(header->seq) == rem_node->last_seq_recv) {
1365+ P_COMMS
1366+ ("Discarding message - Already seen this sequence number %d\n",
1367+ rem_node->last_seq_recv);
1368+ /* Still need to ACK it though, in case it was the ACK that got
1369+ * lost */
1370+ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
1371+ goto userport_finish;
1372+ }
1373+
1374+ /* If it's a new node then assign it a temporary node ID */
1375+ if (!rem_node)
1376+ header->srcid = cpu_to_le32(new_temp_nodeid(addr, addrlen));
1377+
1378+ P_COMMS("Got message: flags = %x, port = %d, we_are_a_member = %d\n",
1379+ header->flags, header->port, we_are_a_cluster_member);
1380+
1381+
1382+ /* If we are not part of the cluster then ignore multicast messages
1383+ * that need an ACK as we will confuse the sender who is only expecting
1384+ * ACKS from bona fide members */
1385+ if (header->flags & (MSG_MULTICAST >> 16) &&
1386+ !(header->flags & (MSG_NOACK >> 16)) && !we_are_a_cluster_member) {
1387+ P_COMMS
1388+ ("Discarding message - multicast and we are not a cluster member. port=%d flags=%x\n",
1389+ header->port, header->flags);
1390+ goto userport_finish;
1391+ }
1392+
1393+ /* Save the sequence number of this message so we can ignore duplicates
1394+ * (above) */
1395+ if (!(header->flags & (MSG_NOACK >> 16)) && rem_node) {
1396+ P_COMMS("Saving seq %d for node %s\n", le16_to_cpu(header->seq),
1397+ rem_node->name);
1398+ rem_node->last_seq_recv = le16_to_cpu(header->seq);
1399+ }
1400+
1401+ /* Is it a protocol message? */
1402+ if (header->port == 0) {
1403+ process_cnxman_message(csock, data, len, addr, addrlen,
1404+ rem_node);
1405+ goto userport_finish;
1406+ }
1407+
1408+ /* Skip past the header to the data */
1409+ data += sizeof (struct cl_protheader);
1410+ len -= sizeof (struct cl_protheader);
1411+
1412+ /* Get the port number and look for a listener */
1413+ down(&port_array_lock);
1414+ if (port_array[header->port]) {
1415+ int native_srcid;
1416+ struct cluster_sock *c = cluster_sk(port_array[header->port]);
1417+
1418+ /* ACK it */
1419+ if (!(header->flags & (MSG_NOACK >> 16)))
1420+ cl_sendack(csock, header->seq, addrlen, addr,
1421+ header->port, 0);
1422+
1423+ /* Call a callback if there is one */
1424+ if (c->kernel_callback) {
1425+ up(&port_array_lock);
1426+ c->kernel_callback(data, len, addr, addrlen,
1427+ le32_to_cpu(header->srcid));
1428+ goto userport_finish;
1429+ }
1430+
1431+ /* Otherwise put it into an SKB and pass it onto the recvmsg
1432+ * mechanism */
1433+ skb = alloc_skb(len, GFP_KERNEL);
1434+ if (!skb) {
1435+ up(&port_array_lock);
1436+ printk(KERN_INFO CMAN_NAME
1437+ ": Failed to allocate skb\n");
1438+ return;
1439+ }
1440+
1441+ skb_put(skb, len);
1442+ memcpy(skb->data, data, len);
1443+
1444+ /* Put the nodeid into cb so we can pass it to the clients */
1445+ skb->cb[0] = 0; /* Clear flags */
1446+ native_srcid = le32_to_cpu(header->srcid);
1447+ memcpy(skb->cb + 1, &native_srcid, sizeof(int));
1448+
1449+ if ((err =
1450+ sock_queue_rcv_skb(port_array[header->port], skb)) < 0) {
1451+
1452+ printk(KERN_INFO CMAN_NAME
1453+ ": Error queueing request to port %d: %d\n",
1454+ header->port, err);
1455+ kfree_skb(skb);
1456+
1457+ /* If the port was MEMBERSHIP then we have to die */
1458+ if (header->port == CLUSTER_PORT_MEMBERSHIP) {
1459+ up(&port_array_lock);
1460+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
1461+ panic("membership stopped responding");
1462+ }
1463+ }
1464+ up(&port_array_lock);
1465+
1466+ }
1467+ else {
1468+ /* ACK it, but set the flag bit so remote end knows no-one
1469+ * caught it */
1470+ if (!(header->flags & (MSG_NOACK >> 16)))
1471+ cl_sendack(csock, header->seq, addrlen, addr,
1472+ header->port, 1);
1473+
1474+ /* Nobody listening, drop it */
1475+ up(&port_array_lock);
1476+ }
1477+
1478+ userport_finish:
1479+ return;
1480+}
1481+
1482+static struct sock *cl_alloc_sock(struct socket *sock, int gfp)
1483+{
1484+ struct sock *sk;
1485+ struct cluster_sock *c;
1486+
1487+ if ((sk =
1488+ sk_alloc(AF_CLUSTER, gfp, sizeof (struct cluster_sock),
1489+ cluster_sk_cachep)) == NULL)
1490+ goto no_sock;
1491+
1492+ if (sock) {
1493+ sock->ops = &cl_proto_ops;
1494+ }
1495+ sock_init_data(sock, sk);
1496+
1497+ sk->sk_destruct = NULL;
1498+ sk->sk_no_check = 1;
1499+ sk->sk_family = PF_CLUSTER;
1500+ sk->sk_allocation = gfp;
1501+
1502+ c = cluster_sk(sk);
1503+ c->port = 0;
1504+ c->service_data = NULL;
1505+
1506+ return sk;
1507+ no_sock:
1508+ return NULL;
1509+}
1510+
1511+static int cl_release(struct socket *sock)
1512+{
1513+ struct sock *sk = sock->sk;
1514+ struct cl_client_socket *csock;
1515+ struct list_head *socklist;
1516+ struct list_head *tmp;
1517+
1518+ down(&client_socket_lock);
1519+ if (sk) {
1520+ /* Remove port allocations if it's a bound socket */
1521+ struct cluster_sock *c = cluster_sk(sk);
1522+
1523+ down(&port_array_lock);
1524+ if (c->port) {
1525+ port_array[c->port] = NULL;
1526+ }
1527+ up(&port_array_lock);
1528+
1529+ /* Tell other nodes in the cluster that this listener is going
1530+ * away */
1531+ if (atomic_read(&cnxman_running) && c->port)
1532+ send_port_close_oob(c->port);
1533+
1534+ if (c->service_data)
1535+ sm_sock_release(sock);
1536+
1537+ /* Master socket released ? */
1538+ if (sk->sk_protocol == CLPROTO_MASTER) {
1539+ master_sock = NULL;
1540+
1541+ /* If this socket is being freed and cnxman is not
1542+ * started then free all the comms sockets as either
1543+ * the userland "join" process has crashed or the
1544+ * join failed.
1545+ */
1546+ if (!atomic_read(&cnxman_running)) {
1547+ quit_threads = 1;
1548+ free_cluster_sockets();
1549+ }
1550+ }
1551+
1552+ sock_orphan(sk);
1553+ sock_hold(sk);
1554+ lock_sock(sk);
1555+ release_sock(sk);
1556+ sock_put(sk);
1557+ sock_put(sk);
1558+ sock->sk = NULL;
1559+ }
1560+
1561+ /* Remove it from the list of clients */
1562+ list_for_each_safe(socklist, tmp, &client_socket_list) {
1563+ csock = list_entry(socklist, struct cl_client_socket, list);
1564+
1565+ if (csock->sock == sock) {
1566+ list_del(&csock->list);
1567+ kfree(csock);
1568+ break;
1569+ }
1570+ }
1571+ up(&client_socket_lock);
1572+
1573+ return 0;
1574+}
1575+
1576+static int cl_create(struct socket *sock, int protocol)
1577+{
1578+ struct sock *sk;
1579+
1580+ /* All are datagrams */
1581+ if (sock->type != SOCK_DGRAM)
1582+ return -ESOCKTNOSUPPORT;
1583+
1584+ if (protocol == CLPROTO_MASTER && !capable(CAP_CLUSTER))
1585+ return -EPERM;
1586+
1587+ /* Can only have one master socket */
1588+ if (master_sock && protocol == CLPROTO_MASTER)
1589+ return -EBUSY;
1590+
1591+ /* cnxman not running and a client was requested */
1592+ if (!atomic_read(&cnxman_running) && protocol != CLPROTO_MASTER)
1593+ return -ENETDOWN;
1594+
1595+ if ((sk = cl_alloc_sock(sock, GFP_KERNEL)) == NULL)
1596+ return -ENOBUFS;
1597+
1598+ sk->sk_protocol = protocol;
1599+
1600+ if (protocol == CLPROTO_MASTER)
1601+ master_sock = sk;
1602+
1603+ /* Add client sockets to the list */
1604+ if (protocol == CLPROTO_CLIENT) {
1605+ struct cl_client_socket *clsock =
1606+ kmalloc(sizeof (struct cl_client_socket), GFP_KERNEL);
1607+ if (!clsock) {
1608+ cl_release(sock);
1609+ return -ENOMEM;
1610+ }
1611+ clsock->sock = sock;
1612+ down(&client_socket_lock);
1613+ list_add(&clsock->list, &client_socket_list);
1614+ up(&client_socket_lock);
1615+ }
1616+
1617+ return 0;
1618+}
1619+
1620+static int cl_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1621+{
1622+ struct sock *sk = sock->sk;
1623+ struct sockaddr_cl *saddr = (struct sockaddr_cl *) uaddr;
1624+ struct cluster_sock *c = cluster_sk(sk);
1625+
1626+ if (!capable(CAP_NET_BIND_SERVICE))
1627+ return -EPERM;
1628+
1629+ if (sk->sk_zapped == 0)
1630+ return -EINVAL;
1631+
1632+ if (addr_len != sizeof (struct sockaddr_cl))
1633+ return -EINVAL;
1634+
1635+ if (saddr->scl_family != AF_CLUSTER)
1636+ return -EINVAL;
1637+
1638+ if (saddr->scl_port == 0)
1639+ return -EINVAL; /* Port 0 is reserved for protocol messages */
1640+
1641+ down(&port_array_lock);
1642+
1643+ if (port_array[saddr->scl_port]) {
1644+ up(&port_array_lock);
1645+ return -EADDRINUSE;
1646+ }
1647+
1648+ port_array[saddr->scl_port] = sk;
1649+
1650+ up(&port_array_lock);
1651+
1652+ c->port = saddr->scl_port;
1653+ sk->sk_zapped = 0;
1654+
1655+ /* If we are not a cluster member yet then make the client wait until
1656+ * we are, this allows nodes to start cluster clients at the same time
1657+ * as cluster services but they will wait until membership is achieved.
1658+ * This looks odd in bind() (open would seem more obvious) but we need
1659+ * to know which port number is being used so that things like
1660+ * membership services don't get blocked
1661+ */
1662+
1663+ if (saddr->scl_port > HIGH_PROTECTED_PORT)
1664+ while (!we_are_a_cluster_member || !cluster_is_quorate
1665+ || in_transition()) {
1666+ DECLARE_WAITQUEUE(wq, current);
1667+ struct task_struct *tsk = current;
1668+
1669+ set_task_state(tsk, TASK_INTERRUPTIBLE);
1670+ add_wait_queue(&socket_waitq, &wq);
1671+
1672+ if (!we_are_a_cluster_member || !cluster_is_quorate
1673+ || in_transition())
1674+ schedule();
1675+
1676+ set_task_state(tsk, TASK_RUNNING);
1677+ remove_wait_queue(&socket_waitq, &wq);
1678+
1679+ /* We were woken up because the cluster is going down,
1680+ * ...and we never got a chance to do any work! (sob) */
1681+ if (atomic_read(&cnxman_running) == 0 || quit_threads) {
1682+ return -ENOTCONN;
1683+ }
1684+ }
1685+
1686+ return 0;
1687+}
1688+
1689+static int cl_getname(struct socket *sock, struct sockaddr *uaddr,
1690+ int *uaddr_len, int peer)
1691+{
1692+ struct sockaddr_cl *sa = (struct sockaddr_cl *) uaddr;
1693+ struct sock *sk = sock->sk;
1694+ struct cluster_sock *c = cluster_sk(sk);
1695+
1696+ *uaddr_len = sizeof (struct sockaddr_cl);
1697+
1698+ lock_sock(sk);
1699+
1700+ sa->scl_port = c->port;
1701+ sa->scl_flags = 0;
1702+ sa->scl_family = AF_CLUSTER;
1703+
1704+ release_sock(sk);
1705+
1706+ return 0;
1707+}
1708+
1709+static unsigned int cl_poll(struct file *file, struct socket *sock,
1710+ poll_table * wait)
1711+{
1712+ return datagram_poll(file, sock, wait);
1713+}
1714+
1715+/* Copy internal node format to userland format */
1716+void copy_to_usernode(struct cluster_node *node,
1717+ struct cl_cluster_node *unode)
1718+{
1719+ strcpy(unode->name, node->name);
1720+ unode->size = sizeof (struct cl_cluster_node);
1721+ unode->votes = node->votes;
1722+ unode->state = node->state;
1723+ unode->us = node->us;
1724+ unode->node_id = node->node_id;
1725+ unode->leave_reason = node->leave_reason;
1726+ unode->incarnation = node->incarnation;
1727+}
1728+
1729+/* ioctl processing functions */
1730+
1731+static int do_ioctl_set_version(unsigned long arg)
1732+{
1733+ struct cl_version version, *u_version;
1734+
1735+ if (!capable(CAP_CLUSTER))
1736+ return -EPERM;
1737+ if (arg == 0)
1738+ return -EINVAL;
1739+
1740+ u_version = (struct cl_version *) arg;
1741+
1742+ if (copy_from_user(&version, u_version, sizeof(struct cl_version)))
1743+ return -EFAULT;
1744+
1745+ if (version.major != CNXMAN_MAJOR_VERSION ||
1746+ version.minor != CNXMAN_MINOR_VERSION ||
1747+ version.patch != CNXMAN_PATCH_VERSION)
1748+ return -EINVAL;
1749+
1750+ if (config_version == version.config)
1751+ return 0;
1752+
1753+ config_version = version.config;
1754+ send_reconfigure(RECONFIG_PARAM_CONFIG_VERSION, config_version);
1755+ return 0;
1756+}
1757+
1758+static int do_ioctl_get_members(unsigned long arg)
1759+{
1760+ struct cluster_node *node;
1761+ /* Kernel copies */
1762+ struct cl_cluster_node user_format_node;
1763+ struct cl_cluster_nodelist user_format_nodelist;
1764+ /* User space array ptr */
1765+ struct cl_cluster_node *user_node;
1766+ struct list_head *nodelist;
1767+ int num_nodes = 0;
1768+
1769+ if (arg == 0)
1770+ return cluster_members;
1771+
1772+ if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
1773+ return -EFAULT;
1774+
1775+ down(&cluster_members_lock);
1776+
1777+ if (user_format_nodelist.max_members < cluster_members) {
1778+ up(&cluster_members_lock);
1779+ return -E2BIG;
1780+ }
1781+
1782+ user_node = user_format_nodelist.nodes;
1783+
1784+ list_for_each(nodelist, &cluster_members_list) {
1785+ node = list_entry(nodelist, struct cluster_node, list);
1786+ if (node->state == NODESTATE_MEMBER) {
1787+ copy_to_usernode(node, &user_format_node);
1788+ if (copy_to_user(user_node, &user_format_node,
1789+ sizeof (struct cl_cluster_node))) {
1790+ up(&cluster_members_lock);
1791+ return -EFAULT;
1792+ }
1793+ user_node++;
1794+ num_nodes++;
1795+ }
1796+ }
1797+ up(&cluster_members_lock);
1798+
1799+ return num_nodes;
1800+}
1801+
1802+static int do_ioctl_get_all_members(unsigned long arg)
1803+{
1804+ struct cluster_node *node;
1805+ /* Kernel copies */
1806+ struct cl_cluster_node user_format_node;
1807+ struct cl_cluster_nodelist user_format_nodelist;
1808+ /* User space array ptr*/
1809+ struct cl_cluster_node *user_node;
1810+ struct list_head *nodelist;
1811+ int num_nodes = 0;
1812+
1813+ if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
1814+ return -EFAULT;
1815+
1816+ down(&cluster_members_lock);
1817+
1818+ user_node = user_format_nodelist.nodes;
1819+
1820+ list_for_each(nodelist, &cluster_members_list) {
1821+ node = list_entry(nodelist, struct cluster_node, list);
1822+ if (arg) {
1823+ copy_to_usernode(node,
1824+ &user_format_node);
1825+
1826+ if (copy_to_user(user_node, &user_format_node,
1827+ sizeof (struct cl_cluster_node))) {
1828+ up(&cluster_members_lock);
1829+ return -EFAULT;
1830+ }
1831+ user_node++;
1832+ if (--user_format_nodelist.max_members < 0) {
1833+ num_nodes = -EFAULT;
1834+ goto err_exit;
1835+ }
1836+
1837+ }
1838+ num_nodes++;
1839+ }
1840+ err_exit:
1841+ up(&cluster_members_lock);
1842+
1843+ return num_nodes;
1844+}
1845+
1846+static int do_ioctl_get_node(unsigned long arg)
1847+{
1848+ struct cluster_node *node;
1849+ struct cl_cluster_node k_node, *u_node;
1850+
1851+ u_node = (struct cl_cluster_node *) arg;
1852+
1853+ if (copy_from_user(&k_node, u_node, sizeof(struct cl_cluster_node)))
1854+ return -EFAULT;
1855+
1856+ if (k_node.node_id)
1857+ node = find_node_by_nodeid(k_node.node_id);
1858+ else
1859+ node = find_node_by_name(k_node.name);
1860+
1861+ if (!node)
1862+ return -ENOENT;
1863+
1864+ copy_to_usernode(node, &k_node);
1865+
1866+ if (copy_to_user(u_node, &k_node, sizeof(struct cl_cluster_node)))
1867+ return -EFAULT;
1868+
1869+ return 0;
1870+}
1871+
1872+static int do_ioctl_set_expected(unsigned long arg)
1873+{
1874+ struct list_head *nodelist;
1875+ struct cluster_node *node;
1876+ unsigned int total_votes;
1877+ unsigned int newquorum;
1878+
1879+ if (!capable(CAP_CLUSTER))
1880+ return -EPERM;
1881+ if (arg == 0)
1882+ return -EINVAL;
1883+
1884+ newquorum = calculate_quorum(1, arg, &total_votes);
1885+
1886+ if (newquorum < total_votes / 2
1887+ || newquorum > total_votes) {
1888+ return -EINVAL;
1889+ }
1890+
1891+ /* Now do it */
1892+ down(&cluster_members_lock);
1893+ list_for_each(nodelist, &cluster_members_list) {
1894+ node = list_entry(nodelist, struct cluster_node, list);
1895+ if (node->state == NODESTATE_MEMBER
1896+ && node->expected_votes > arg) {
1897+ node->expected_votes = arg;
1898+ }
1899+ }
1900+ up(&cluster_members_lock);
1901+
1902+ recalculate_quorum(1);
1903+
1904+ send_reconfigure(RECONFIG_PARAM_EXPECTED_VOTES, arg);
1905+ sm_member_update(cluster_is_quorate);
1906+
1907+ return 0;
1908+}
1909+
1910+static int do_ioctl_kill_node(unsigned long arg)
1911+{
1912+ struct cluster_node *node;
1913+
1914+ if (!capable(CAP_CLUSTER))
1915+ return -EPERM;
1916+
1917+
1918+ if ((node = find_node_by_nodeid(arg)) == NULL)
1919+ return -EINVAL;
1920+
1921+ /* Can't kill us */
1922+ if (node->us)
1923+ return -EINVAL;
1924+
1925+ if (node->state != NODESTATE_MEMBER)
1926+ return -EINVAL;
1927+
1928+ /* Just in case it is alive, send a KILL message */
1929+ send_kill(arg);
1930+
1931+ node->leave_reason = CLUSTER_LEAVEFLAG_KILLED;
1932+ a_node_just_died(node);
1933+
1934+ return 0;
1935+}
1936+
1937+static int do_ioctl_barrier(unsigned long arg)
1938+{
1939+ struct cl_barrier_info info;
1940+
1941+ if (!capable(CAP_CLUSTER))
1942+ return -EPERM;
1943+
1944+ if (copy_from_user(&info, (void *)arg, sizeof(info)) != 0)
1945+ return -EFAULT;
1946+
1947+ switch (info.cmd) {
1948+ case BARRIER_IOCTL_REGISTER:
1949+ return kcl_barrier_register(info.name,
1950+ info.flags,
1951+ info.arg);
1952+ case BARRIER_IOCTL_CHANGE:
1953+ return kcl_barrier_setattr(info.name,
1954+ info.flags,
1955+ info.arg);
1956+ case BARRIER_IOCTL_WAIT:
1957+ return kcl_barrier_wait(info.name);
1958+ case BARRIER_IOCTL_DELETE:
1959+ return kcl_barrier_delete(info.name);
1960+ default:
1961+ return -EINVAL;
1962+ }
1963+}
1964+
1965+static int do_ioctl_islistening(unsigned long arg)
1966+{
1967+ DECLARE_WAITQUEUE(wq, current);
1968+ struct cl_listen_request rq;
1969+ struct cluster_node *rem_node;
1970+ int nodeid;
1971+ int result;
1972+ struct cl_waiting_listen_request *listen_request;
1973+
1974+ if (!arg)
1975+ return -EINVAL;
1976+
1977+ if (copy_from_user(&rq, (void *) arg, sizeof (rq)) != 0)
1978+ return -EFAULT;
1979+
1980+ nodeid = rq.nodeid;
1981+
1982+ rem_node = find_node_by_nodeid(nodeid);
1983+
1984+ /* Node not in the cluster */
1985+ if (!rem_node)
1986+ return -ENOENT;
1987+
1988+ if (rem_node->state != NODESTATE_MEMBER)
1989+ return -ENOTCONN;
1990+
1991+ /* If the request is for us then just look in the ports
1992+ * array */
1993+ if (nodeid == us->node_id)
1994+ return (port_array[rq.port] != 0) ? 1 : 0;
1995+
1996+ /* For a remote node we need to send a request out */
1997+
1998+ /* If we are in transition then wait until we are not */
1999+ while (in_transition()) {
2000+ set_task_state(current, TASK_INTERRUPTIBLE);
2001+ add_wait_queue(&socket_waitq, &wq);
2002+
2003+ if (in_transition())
2004+ schedule();
2005+
2006+ set_task_state(current, TASK_RUNNING);
2007+ remove_wait_queue(&socket_waitq, &wq);
2008+
2009+ if (signal_pending(current))
2010+ return -EINTR;
2011+ }
2012+
2013+ /* Were we shut down before it completed ? */
2014+ if (!atomic_read(&cnxman_running))
2015+ return -ENOTCONN;
2016+
2017+ listen_request =
2018+ kmalloc(sizeof (struct cl_waiting_listen_request),
2019+ GFP_KERNEL);
2020+ if (!listen_request)
2021+ return -ENOMEM;
2022+
2023+ /* Build the request */
2024+ listen_request->waiting = 1;
2025+ listen_request->result = 0;
2026+ listen_request->tag = current->pid;
2027+ listen_request->nodeid = nodeid;
2028+ init_waitqueue_head(&listen_request->waitq);
2029+
2030+ down(&listenreq_lock);
2031+ list_add(&listen_request->list, &listenreq_list);
2032+ up(&listenreq_lock);
2033+
2034+ /* Now wait for the response to come back */
2035+ send_listen_request(rq.nodeid, rq.port);
2036+
2037+ while (listen_request->waiting) {
2038+ set_task_state(current, TASK_INTERRUPTIBLE);
2039+ add_wait_queue(&listen_request->waitq, &wq);
2040+
2041+ if (listen_request->waiting)
2042+ schedule();
2043+
2044+ set_task_state(current, TASK_RUNNING);
2045+ remove_wait_queue(&listen_request->waitq, &wq);
2046+
2047+ if (signal_pending(current)) {
2048+ list_del(&listen_request->list);
2049+ kfree(listen_request);
2050+ return -ERESTARTSYS;
2051+ }
2052+ }
2053+ result = listen_request->result;
2054+ list_del(&listen_request->list);
2055+ kfree(listen_request);
2056+ return result;
2057+}
2058+
2059+static int do_ioctl_set_votes(unsigned long arg)
2060+{
2061+ unsigned int total_votes;
2062+ unsigned int newquorum;
2063+ int saved_votes;
2064+
2065+ if (!capable(CAP_CLUSTER))
2066+ return -EPERM;
2067+
2068+ /* Check votes is valid */
2069+ saved_votes = us->votes;
2070+ us->votes = arg;
2071+
2072+ newquorum = calculate_quorum(1, 0, &total_votes);
2073+
2074+ if (newquorum < total_votes / 2 || newquorum > total_votes) {
2075+ us->votes = saved_votes;
2076+ return -EINVAL;
2077+ }
2078+
2079+ recalculate_quorum(1);
2080+
2081+ send_reconfigure(RECONFIG_PARAM_NODE_VOTES, arg);
2082+
2083+ return 0;
2084+}
2085+
2086+static int cl_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2087+{
2088+ int err = -EOPNOTSUPP;
2089+ struct list_head *proclist;
2090+ struct list_head *tmp;
2091+ struct notify_struct *notify;
2092+ struct cl_version cnxman_version;
2093+
2094+ switch (cmd) {
2095+ /* Process requests notification of cluster events */
2096+ case SIOCCLUSTER_NOTIFY:
2097+ notify = kmalloc(sizeof (struct notify_struct), GFP_KERNEL);
2098+ if (!notify)
2099+ return -ENOMEM;
2100+ notify->pid = current->pid;
2101+ notify->signal = arg;
2102+ down(&event_listener_lock);
2103+ list_add(&notify->list, &event_listener_list);
2104+ up(&event_listener_lock);
2105+ err = 0;
2106+ break;
2107+
2108+ /* Process is no longer interested cluster events */
2109+ case SIOCCLUSTER_REMOVENOTIFY:
2110+ err = EINVAL;
2111+
2112+ down(&event_listener_lock);
2113+ list_for_each_safe(proclist, tmp, &event_listener_list) {
2114+ notify =
2115+ list_entry(proclist, struct notify_struct, list);
2116+ if (notify->pid == current->pid) {
2117+ list_del(&notify->list);
2118+ kfree(notify);
2119+ err = 0;
2120+ }
2121+ }
2122+ up(&event_listener_lock);
2123+ break;
2124+
2125+ /* Return the cnxman version number */
2126+ case SIOCCLUSTER_GET_VERSION:
2127+ if (!arg)
2128+ return -EINVAL;
2129+ err = 0;
2130+ cnxman_version.major = CNXMAN_MAJOR_VERSION;
2131+ cnxman_version.minor = CNXMAN_MINOR_VERSION;
2132+ cnxman_version.patch = CNXMAN_PATCH_VERSION;
2133+ if (copy_to_user((void *) arg, &cnxman_version,
2134+ sizeof (struct cl_version))) {
2135+ return -EFAULT;
2136+ }
2137+ break;
2138+
2139+ /* Set the cnxman config version number */
2140+ case SIOCCLUSTER_SET_VERSION:
2141+ err = do_ioctl_set_version(arg);
2142+ break;
2143+
2144+ /* Return the active membership list */
2145+ case SIOCCLUSTER_GETMEMBERS:
2146+ err = do_ioctl_get_members(arg);
2147+ break;
2148+
2149+ /* Return the full membership list include dead nodes */
2150+ case SIOCCLUSTER_GETALLMEMBERS:
2151+ err = do_ioctl_get_all_members(arg);
2152+ break;
2153+
2154+ case SIOCCLUSTER_GETNODE:
2155+ err = do_ioctl_get_node(arg);
2156+ break;
2157+
2158+ case SIOCCLUSTER_ISQUORATE:
2159+ return cluster_is_quorate;
2160+
2161+ case SIOCCLUSTER_ISACTIVE:
2162+ return atomic_read(&cnxman_running);
2163+
2164+ case SIOCCLUSTER_SETEXPECTED_VOTES:
2165+ err = do_ioctl_set_expected(arg);
2166+ break;
2167+
2168+ /* Change the number of votes for this node */
2169+ case SIOCCLUSTER_SET_VOTES:
2170+ err = do_ioctl_set_votes(arg);
2171+ break;
2172+
2173+ /* Return 1 if the specified node is listening on a given port */
2174+ case SIOCCLUSTER_ISLISTENING:
2175+ err = do_ioctl_islistening(arg);
2176+ break;
2177+
2178+ /* Forcibly kill a node */
2179+ case SIOCCLUSTER_KILLNODE:
2180+ err = do_ioctl_kill_node(arg);
2181+ break;
2182+
2183+ case SIOCCLUSTER_GET_JOINCOUNT:
2184+ if (!capable(CAP_CLUSTER))
2185+ return -EPERM;
2186+ else
2187+ return atomic_read(&use_count);
2188+
2189+ /* ioctl interface to the barrier system */
2190+ case SIOCCLUSTER_BARRIER:
2191+ err = do_ioctl_barrier(arg);
2192+ break;
2193+
2194+ default:
2195+ err = sm_ioctl(sock, cmd, arg);
2196+ }
2197+ return err;
2198+}
2199+
2200+static int cl_shutdown(struct socket *sock, int how)
2201+{
2202+ struct sock *sk = sock->sk;
2203+ int err = -ENOTCONN;
2204+
2205+ lock_sock(sk);
2206+
2207+ if (sock->state == SS_UNCONNECTED)
2208+ goto out;
2209+
2210+ err = 0;
2211+ if (sock->state == SS_DISCONNECTING)
2212+ goto out;
2213+
2214+ err = -EINVAL;
2215+
2216+ if (how != SHUTDOWN_MASK)
2217+ goto out;
2218+
2219+ sk->sk_shutdown = how;
2220+ err = 0;
2221+
2222+ out:
2223+ release_sock(sk);
2224+
2225+ return err;
2226+}
2227+
2228+static int cl_setsockopt(struct socket *sock, int level, int optname,
2229+ char *optval, int optlen)
2230+{
2231+ struct sock *sk = sock->sk;
2232+ int err;
2233+
2234+ if (sk != master_sock)
2235+ return -EPERM;
2236+
2237+ lock_sock(sk);
2238+ err = __cl_setsockopt(sock, level, optname, optval, optlen, 0);
2239+ release_sock(sk);
2240+
2241+ return err;
2242+}
2243+
2244+static int add_clsock(int broadcast, int number, struct socket *sock,
2245+ struct file *file)
2246+{
2247+ struct cl_comms_socket *newsock =
2248+ kmalloc(sizeof (struct cl_comms_socket), GFP_KERNEL);
2249+ if (!newsock)
2250+ return -ENOMEM;
2251+
2252+ memset(newsock, 0, sizeof (*newsock));
2253+ newsock->number = number;
2254+ newsock->sock = sock;
2255+ if (broadcast) {
2256+ newsock->broadcast = 1;
2257+ newsock->recv_only = 0;
2258+ }
2259+ else {
2260+ newsock->broadcast = 0;
2261+ newsock->recv_only = 1;
2262+ }
2263+
2264+ newsock->file = file;
2265+ newsock->addr_len = sizeof(struct sockaddr_in6);
2266+
2267+ /* Mark it active until cnxman thread is running and ready to process
2268+ * messages */
2269+ set_bit(1, &newsock->active);
2270+
2271+ /* Find out what it's bound to */
2272+ newsock->sock->ops->getname(newsock->sock,
2273+ (struct sockaddr *)&newsock->saddr,
2274+ &newsock->addr_len, 0);
2275+
2276+ num_interfaces = max(num_interfaces, newsock->number);
2277+ if (!current_interface && newsock->broadcast)
2278+ current_interface = newsock;
2279+
2280+ /* Hook data_ready */
2281+ newsock->sock->sk->sk_data_ready = cnxman_data_ready;
2282+
2283+ /* Make an attempt to keep them in order */
2284+ list_add_tail(&newsock->list, &socket_list);
2285+
2286+ address_length = newsock->addr_len;
2287+ return 0;
2288+}
2289+
2290+static int __cl_setsockopt(struct socket *sock, int level, int optname,
2291+ char *optval, int optlen, int flags)
2292+{
2293+ struct file *file;
2294+ struct cl_join_cluster_info join_info;
2295+ int error;
2296+ int leave_flags;
2297+ struct cl_multicast_sock multicast_info;
2298+
2299+ if (optlen && !optval)
2300+ return -EINVAL;
2301+
2302+ switch (optname) {
2303+ case CLU_SET_MULTICAST:
2304+ case CLU_SET_RCVONLY:
2305+ if (!capable(CAP_CLUSTER))
2306+ return -EPERM;
2307+
2308+ if (optlen != sizeof (struct cl_multicast_sock))
2309+ return -EINVAL;
2310+
2311+ if (atomic_read(&cnxman_running))
2312+ return -EINVAL;
2313+
2314+ error = -EBADF;
2315+
2316+ if (copy_from_user(&multicast_info, optval, optlen))
2317+ return -EFAULT;
2318+
2319+ file = fget(multicast_info.fd);
2320+ if (file) {
2321+ struct inode *inode = file->f_dentry->d_inode;
2322+
2323+ error =
2324+ add_clsock(optname == CLU_SET_MULTICAST,
2325+ multicast_info.number, SOCKET_I(inode),
2326+ file);
2327+ if (error)
2328+ fput(file);
2329+ }
2330+ return error;
2331+
2332+ case CLU_SET_NODENAME:
2333+ if (!capable(CAP_CLUSTER))
2334+ return -EPERM;
2335+
2336+ if (atomic_read(&cnxman_running))
2337+ return -EINVAL;
2338+
2339+ if (optlen > MAX_CLUSTER_MEMBER_NAME_LEN)
2340+ return -EINVAL;
2341+
2342+ if (copy_from_user(nodename, optval, optlen))
2343+ return -EFAULT;
2344+ break;
2345+
2346+ case CLU_JOIN_CLUSTER:
2347+ if (!capable(CAP_CLUSTER))
2348+ return -EPERM;
2349+
2350+ if (atomic_read(&cnxman_running))
2351+ return -EALREADY;
2352+
2353+ if (optlen != sizeof (struct cl_join_cluster_info))
2354+ return -EINVAL;
2355+
2356+ if (copy_from_user(&join_info, optval, optlen))
2357+ return -EFAULT;
2358+
2359+ if (strlen(join_info.cluster_name) > MAX_CLUSTER_NAME_LEN)
2360+ return -EINVAL;
2361+
2362+ if (list_empty(&socket_list))
2363+ return -ENOTCONN;
2364+
2365+ set_votes(join_info.votes, join_info.expected_votes);
2366+ cluster_id = generate_cluster_id(join_info.cluster_name);
2367+ strncpy(cluster_name, join_info.cluster_name, MAX_CLUSTER_NAME_LEN);
2368+ two_node = join_info.two_node;
2369+ config_version = join_info.config_version;
2370+
2371+ quit_threads = 0;
2372+ acks_expected = 0;
2373+ init_completion(&cluster_thread_comp);
2374+ init_completion(&member_thread_comp);
2375+ if (allocate_nodeid_array())
2376+ return -ENOMEM;
2377+
2378+ kcluster_pid = kernel_thread(cluster_kthread, NULL, 0);
2379+ if (kcluster_pid < 0)
2380+ return kcluster_pid;
2381+
2382+ wait_for_completion(&cluster_thread_comp);
2383+ init_completion(&cluster_thread_comp);
2384+
2385+ atomic_set(&cnxman_running, 1);
2386+
2387+ /* Make sure we have a node name */
2388+ if (nodename[0] == '\0')
2389+ strcpy(nodename, system_utsname.nodename);
2390+
2391+ membership_pid = start_membership_services(kcluster_pid);
2392+ if (membership_pid < 0) {
2393+ quit_threads = 1;
2394+ wait_for_completion(&cluster_thread_comp);
2395+ init_completion(&member_thread_comp);
2396+ return membership_pid;
2397+ }
2398+
2399+ sm_start();
2400+ break;
2401+
2402+ case CLU_LEAVE_CLUSTER:
2403+ if (!capable(CAP_CLUSTER))
2404+ return -EPERM;
2405+
2406+ if (optlen != sizeof (int))
2407+ return -EINVAL;
2408+
2409+ if (copy_from_user(&leave_flags, optval, optlen))
2410+ return -EFAULT;
2411+
2412+ if (!atomic_read(&cnxman_running))
2413+ return -ENOTCONN;
2414+
2415+ if (in_transition())
2416+ return -EBUSY;
2417+
2418+ /* Ignore the use count if FORCE is set */
2419+ if (!(leave_flags & CLUSTER_LEAVEFLAG_FORCE)) {
2420+ if (atomic_read(&use_count))
2421+ return -ENOTCONN;
2422+ }
2423+
2424+ us->leave_reason = leave_flags;
2425+ quit_threads = 1;
2426+ wake_up_interruptible(&cnxman_waitq);
2427+
2428+ wait_for_completion(&cluster_thread_comp);
2429+ break;
2430+
2431+ default:
2432+ return -ENOPROTOOPT;
2433+ }
2434+
2435+ return 0;
2436+}
2437+
2438+static int cl_getsockopt(struct socket *sock, int level, int optname,
2439+ char *optval, int *optlen)
2440+{
2441+ struct sock *sk = sock->sk;
2442+ int err;
2443+
2444+ lock_sock(sk);
2445+ err = __cl_getsockopt(sock, level, optname, optval, optlen, 0);
2446+ release_sock(sk);
2447+
2448+ return err;
2449+}
2450+
2451+static int __cl_getsockopt(struct socket *sock, int level, int optname,
2452+ char *optval, int *optlen, int flags)
2453+{
2454+
2455+ switch (optname) {
2456+ default:
2457+ return -ENOPROTOOPT;
2458+ }
2459+
2460+ return 0;
2461+}
2462+
2463+/* We'll be giving out reward points next... */
2464+/* Send the packet and save a copy in case someone loses theirs. Should be
2465+ * protected by the send mutexphore */
2466+static int __send_and_save(struct cl_comms_socket *csock, struct msghdr *msg,
2467+ int size, int needack)
2468+{
2469+ mm_segment_t fs;
2470+ int result;
2471+ struct iovec save_vectors[msg->msg_iovlen];
2472+
2473+ /* Save a copy of the IO vectors as send_msg mucks around with them and
2474+ * we may want to send the same stuff out more than once (for different
2475+ * interfaces)
2476+ */
2477+ memcpy(save_vectors, msg->msg_iov,
2478+ sizeof (struct iovec) * msg->msg_iovlen);
2479+
2480+ fs = get_fs();
2481+ set_fs(get_ds());
2482+
2483+ result = sock_sendmsg(csock->sock, msg, size);
2484+
2485+ set_fs(fs);
2486+
2487+ if (result >= 0 && acks_expected && needack) {
2488+
2489+ /* Start retransmit timer if it didn't go */
2490+ if (result == 0) {
2491+ start_short_timer();
2492+ }
2493+ else {
2494+ resend_delay = 1;
2495+ }
2496+ }
2497+
2498+ /* Restore IOVs */
2499+ memcpy(msg->msg_iov, save_vectors,
2500+ sizeof (struct iovec) * msg->msg_iovlen);
2501+
2502+ return result;
2503+}
2504+
2505+static void resend_last_message()
2506+{
2507+ struct msghdr msg;
2508+ struct iovec vec[1];
2509+ mm_segment_t fs;
2510+ int result;
2511+
2512+ P_COMMS("%ld resending last message: %d bytes: port=%d, cmd=%d\n",
2513+ jiffies, saved_msg_len, saved_msg_buffer[0],
2514+ saved_msg_buffer[6]);
2515+
2516+ /* Assume there is something wrong with the last interface */
2517+ current_interface = get_next_interface(current_interface);
2518+ if (num_interfaces > 1)
2519+ printk(KERN_WARNING CMAN_NAME ": Now using interface %d\n",
2520+ current_interface->number);
2521+
2522+ vec[0].iov_base = saved_msg_buffer;
2523+ vec[0].iov_len = saved_msg_len;
2524+
2525+ memset(&msg, 0, sizeof (msg));
2526+ msg.msg_name = &current_interface->saddr;
2527+ msg.msg_namelen = current_interface->addr_len;
2528+ msg.msg_iovlen = 1;
2529+ msg.msg_iov = vec;
2530+
2531+ fs = get_fs();
2532+ set_fs(get_ds());
2533+
2534+ result = sock_sendmsg(current_interface->sock, &msg, saved_msg_len);
2535+
2536+ set_fs(fs);
2537+
2538+ if (result < 0)
2539+ printk(KERN_ERR CMAN_NAME ": resend failed: %d\n", result);
2540+
2541+ /* Try indefinitely to send this, the backlog must die down eventually
2542+ * !? */
2543+ if (result == 0)
2544+ start_short_timer();
2545+
2546+ /* Send succeeded, continue waiting for ACKS */
2547+ if (result > 0)
2548+ start_ack_timer();
2549+
2550+}
2551+
2552+static int cl_recvmsg(struct kiocb *iocb, struct socket *sock,
2553+ struct msghdr *msg, size_t size, int flags)
2554+{
2555+ struct sock *sk = sock->sk;
2556+ struct sockaddr_cl *sin = (struct sockaddr_cl *) msg->msg_name;
2557+ struct cluster_sock *c = cluster_sk(sk);
2558+ struct sk_buff *skb;
2559+ int copied, err = 0;
2560+ int isoob = 0;
2561+
2562+ /* Socket was notified of shutdown, remove any pending skbs and return
2563+ * EOF */
2564+ if (!atomic_read(&cnxman_running)) {
2565+ while ((skb = skb_recv_datagram(sk, flags, MSG_DONTWAIT, &err)))
2566+ skb_free_datagram(sk, skb);
2567+ return 0; /* cnxman has left the building */
2568+ }
2569+
2570+ /* Generic datagram code does most of the work. If the user is not
2571+ * interested in OOB messages then ignore them */
2572+ do {
2573+ skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
2574+ if (!skb)
2575+ goto out;
2576+
2577+ /* Is it OOB */
2578+ if (skb->cb[0] & 0x80)
2579+ isoob = 1;
2580+ else
2581+ isoob = 0;
2582+
2583+ /* If it is and the user doesn't want it, then throw it away. */
2584+ if (isoob && !(flags & MSG_OOB)) {
2585+ skb_free_datagram(sk, skb);
2586+
2587+ /* If we peeked (?) an OOB but the user doesn't want it
2588+ then we need to discard it or we'll loop forever */
2589+ if (flags & MSG_PEEK) {
2590+ skb = skb_recv_datagram(sk, flags & ~MSG_PEEK,
2591+ MSG_DONTWAIT, &err);
2592+ if (skb)
2593+ skb_free_datagram(sk, skb);
2594+ }
2595+ }
2596+ }
2597+ while (isoob && !(flags & MSG_OOB));
2598+
2599+ copied = skb->len;
2600+ if (copied > size) {
2601+ copied = size;
2602+ msg->msg_flags |= MSG_TRUNC;
2603+ }
2604+ err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
2605+
2606+ if (err)
2607+ goto out_free;
2608+
2609+ if (msg->msg_name && msg->msg_namelen) {
2610+ memset(msg->msg_name, 0, msg->msg_namelen);
2611+
2612+ if (msg->msg_namelen >= sizeof (struct sockaddr_cl)) {
2613+
2614+ /* Nodeid is in native byte order - anything else is just
2615+ * perverse */
2616+ memcpy(&sin->scl_nodeid, skb->cb + 1, sizeof(int));
2617+ }
2618+ msg->msg_namelen = sizeof (struct sockaddr_cl);
2619+ sin->scl_port = c->port;
2620+ }
2621+
2622+ /* Top bit set in cb[0] means this is an OOB message */
2623+ if (skb->cb[0] & 0x80) {
2624+ msg->msg_flags |= MSG_OOB;
2625+ }
2626+
2627+ sock_recv_timestamp(msg, sk, skb);
2628+
2629+ err = copied;
2630+
2631+ out_free:
2632+ skb_free_datagram(sk, skb);
2633+
2634+ out:
2635+ return err;
2636+}
2637+
2638+/* Send a message out on all interfaces */
2639+static int send_to_all_ints(int nodeid, struct msghdr *our_msg, int size, int flags)
2640+{
2641+ struct sockaddr_in6 daddr;
2642+ struct cl_comms_socket *clsock;
2643+ int result = 0;
2644+
2645+ our_msg->msg_name = &daddr;
2646+
2647+ list_for_each_entry(clsock, &socket_list, list) {
2648+
2649+ /* Don't send out a recv-only socket */
2650+ if (!clsock->recv_only) {
2651+
2652+ /* For temporary node IDs send to the node's real IP address */
2653+ if (nodeid < 0) {
2654+ get_addr_from_temp_nodeid(nodeid, (char *)&daddr, &our_msg->msg_namelen);
2655+ }
2656+ else {
2657+ memcpy(&daddr, &clsock->saddr, clsock->addr_len);
2658+ our_msg->msg_namelen = clsock->addr_len;
2659+ }
2660+
2661+ result = __send_and_save(clsock, our_msg,
2662+ size + sizeof (struct cl_protheader),
2663+ !(flags & MSG_NOACK));
2664+ }
2665+ }
2666+ return result;
2667+}
2668+
2669+
2670+/* Internal common send message routine */
2671+static int __sendmsg(struct socket *sock, struct msghdr *msg, int size,
2672+ unsigned char port)
2673+{
2674+ int result = 0, i;
2675+ int flags = msg->msg_flags;
2676+ struct msghdr our_msg;
2677+ struct sockaddr_cl *caddr = msg->msg_name;
2678+ struct cl_protheader header;
2679+ struct iovec vectors[msg->msg_iovlen + 1];
2680+ int nodeid = 0;
2681+
2682+ if (size > MAX_CLUSTER_MESSAGE)
2683+ return -EINVAL;
2684+ if (!atomic_read(&cnxman_running))
2685+ return -ENOTCONN;
2686+
2687+ if (caddr)
2688+ nodeid = caddr->scl_nodeid;
2689+
2690+ /* Check that the node id (if present) is valid */
2691+ if (msg->msg_namelen && (!find_node_by_nodeid(nodeid) &&
2692+ !is_valid_temp_nodeid(nodeid))) {
2693+ return -ENOTCONN;
2694+ }
2695+
2696+ /* We can only have one send outstanding at a time so we might as well
2697+ * lock the whole send mechanism */
2698+ down(&send_lock);
2699+
2700+ while ((port > HIGH_PROTECTED_PORT
2701+ && (!cluster_is_quorate || in_transition()))
2702+ || (acks_expected > 0 && !(msg->msg_flags & MSG_NOACK))) {
2703+
2704+ DECLARE_WAITQUEUE(wq, current);
2705+ struct task_struct *tsk = current;
2706+
2707+ if (flags & MSG_DONTWAIT) {
2708+ up(&send_lock);
2709+ return -EAGAIN;
2710+ }
2711+
2712+ if (current->pid == kcluster_pid) {
2713+ P_COMMS
2714+ ("Tried to make kclusterd wait, port=%d, acks_count=%d, expected=%d\n",
2715+ port, ack_count, acks_expected);
2716+ up(&send_lock);
2717+ return -EAGAIN;
2718+ }
2719+
2720+ P_COMMS("%s process waiting. acks=%d, expected=%d\n", tsk->comm,
2721+ ack_count, acks_expected);
2722+
2723+ set_task_state(tsk, TASK_INTERRUPTIBLE);
2724+ add_wait_queue(&socket_waitq, &wq);
2725+
2726+ if ((port > HIGH_PROTECTED_PORT
2727+ && (!cluster_is_quorate || in_transition()))
2728+ || (acks_expected > 0)) {
2729+
2730+ up(&send_lock);
2731+ schedule();
2732+ down(&send_lock);
2733+ }
2734+
2735+ /* Going down */
2736+ if (quit_threads) {
2737+ up(&send_lock);
2738+ return -ENOTCONN;
2739+ }
2740+
2741+ set_task_state(tsk, TASK_RUNNING);
2742+ remove_wait_queue(&socket_waitq, &wq);
2743+
2744+ if (signal_pending(current)) {
2745+ up(&send_lock);
2746+ return -ERESTARTSYS;
2747+ }
2748+
2749+ /* Were we shut down in the meantime ? */
2750+ if (!atomic_read(&cnxman_running)) {
2751+ up(&send_lock);
2752+ return -ENOTCONN;
2753+ }
2754+
2755+ }
2756+
2757+ memset(&our_msg, 0, sizeof (our_msg));
2758+
2759+ /* Build the header */
2760+ header.port = port;
2761+ header.flags = msg->msg_flags >> 16;
2762+ header.cluster = cpu_to_le16(cluster_id);
2763+ header.srcid = us ? cpu_to_le32(us->node_id) : 0;
2764+ header.tgtid = caddr ? cpu_to_le32(nodeid) : 0;
2765+
2766+ ++cur_seq;
2767+ header.seq = cpu_to_le16(cur_seq);
2768+
2769+ /* Set the MULTICAST flag on messages with no particular destination */
2770+ if (!msg->msg_namelen) {
2771+ header.flags |= MSG_MULTICAST >> 16;
2772+ header.tgtid = 0;
2773+ }
2774+
2775+ /* Copy the existing iovecs into our array and add the header on at the
2776+ * beginning */
2777+ vectors[0].iov_base = &header;
2778+ vectors[0].iov_len = sizeof (header);
2779+ for (i = 0; i < msg->msg_iovlen; i++) {
2780+ vectors[i + 1] = msg->msg_iov[i];
2781+ }
2782+
2783+ our_msg.msg_iovlen = msg->msg_iovlen + 1;
2784+ our_msg.msg_iov = vectors;
2785+
2786+ /* Work out how many ACKS are wanted - *don't* reset acks_expected to
2787+ * zero if no acks are required as an ACK-needed message may still be
2788+ * outstanding */
2789+ if (!(msg->msg_flags & MSG_NOACK)) {
2790+ if (msg->msg_namelen)
2791+ acks_expected = 1; /* Unicast */
2792+ else
2793+ acks_expected = max(cluster_members - 1, 0);
2794+
2795+ }
2796+
2797+ P_COMMS
2798+ ("Sending message - tgt=%d port %d required %d acks, seq=%d, flags=%x\n",
2799+ nodeid, header.port,
2800+ (msg->msg_flags & MSG_NOACK) ? 0 : acks_expected,
2801+ le16_to_cpu(header.seq), header.flags);
2802+
2803+ /* Don't include temp nodeids in the message itself */
2804+ if (header.tgtid < 0)
2805+ header.tgtid = 0;
2806+
2807+ /* For non-member sends we use all the interfaces */
2808+ if ((nodeid < 0) || (flags & MSG_ALLINT)) {
2809+
2810+ result = send_to_all_ints(nodeid, &our_msg, size, msg->msg_flags);
2811+ }
2812+ else {
2813+ /* Send to only the current socket - resends will use the
2814+ * others if necessary */
2815+ our_msg.msg_name = &current_interface->saddr;
2816+ our_msg.msg_namelen = current_interface->addr_len;
2817+
2818+ result =
2819+ __send_and_save(current_interface, &our_msg,
2820+ size + sizeof (header),
2821+ !(msg->msg_flags & MSG_NOACK));
2822+ }
2823+
2824+ /* Make a note in each nodes' structure that it has been sent a message
2825+ * so we can see which ones went astray */
2826+ if (!(flags & MSG_NOACK) && nodeid >= 0) {
2827+ if (msg->msg_namelen) {
2828+ struct cluster_node *node;
2829+
2830+ node = find_node_by_nodeid(le32_to_cpu(header.tgtid));
2831+ if (node)
2832+ node->last_seq_sent = cur_seq;
2833+ }
2834+ else {
2835+ struct cluster_node *node;
2836+ struct list_head *nodelist;
2837+
2838+ list_for_each(nodelist, &cluster_members_list) {
2839+ node =
2840+ list_entry(nodelist, struct cluster_node,
2841+ list);
2842+ if (node->state == NODESTATE_MEMBER) {
2843+ node->last_seq_sent = cur_seq;
2844+ }
2845+ }
2846+ }
2847+ }
2848+
2849+ /* Save a copy of the message if we're expecting an ACK */
2850+ if (!(flags & MSG_NOACK) && acks_expected) {
2851+ mm_segment_t fs;
2852+
2853+ fs = get_fs();
2854+ set_fs(get_ds());
2855+
2856+ memcpy_fromiovec(saved_msg_buffer, our_msg.msg_iov,
2857+ size + sizeof (header));
2858+ set_fs(fs);
2859+
2860+ saved_msg_len = size + sizeof (header);
2861+ retry_count = ack_count = 0;
2862+ clear_bit(RESEND_NEEDED, &mainloop_flags);
2863+
2864+ start_ack_timer();
2865+ }
2866+
2867+ up(&send_lock);
2868+ return result;
2869+}
2870+
2871+static int queue_message(void *buf, int len, struct sockaddr_cl *caddr,
2872+ unsigned char port, int flags)
2873+{
2874+ struct queued_message *qmsg;
2875+
2876+ qmsg = kmalloc(sizeof (struct queued_message),
2877+ (in_atomic()
2878+ || irqs_disabled())? GFP_ATOMIC : GFP_KERNEL);
2879+ if (qmsg == NULL)
2880+ return -1;
2881+
2882+ memcpy(qmsg->msg_buffer, buf, len);
2883+ qmsg->msg_len = len;
2884+ if (caddr) {
2885+ memcpy(&qmsg->addr, caddr, sizeof (struct sockaddr_cl));
2886+ qmsg->addr_len = sizeof (struct sockaddr_cl);
2887+ }
2888+ else {
2889+ qmsg->addr_len = 0;
2890+ }
2891+ qmsg->flags = flags;
2892+ qmsg->port = port;
2893+ qmsg->socket = NULL;
2894+
2895+ down(&messages_list_lock);
2896+ list_add_tail(&qmsg->list, &messages_list);
2897+ up(&messages_list_lock);
2898+
2899+ wake_up_interruptible(&cnxman_waitq);
2900+
2901+ return 0;
2902+}
2903+
2904+static int cl_sendmsg(struct kiocb *iocb, struct socket *sock,
2905+ struct msghdr *msg, size_t size)
2906+{
2907+ struct cluster_sock *c = cluster_sk(sock->sk);
2908+ char *buffer;
2909+ int status;
2910+ int saved_iovlen;
2911+ uint8_t port;
2912+ struct iovec iov;
2913+ struct iovec *saved_iov;
2914+ struct sockaddr_cl *caddr = msg->msg_name;
2915+
2916+ if (sock->sk->sk_protocol == CLPROTO_MASTER)
2917+ return -EOPNOTSUPP;
2918+
2919+ port = c->port;
2920+
2921+ /* Only capable users can override the port number */
2922+ if (caddr && capable(CAP_CLUSTER) && caddr->scl_port)
2923+ port = caddr->scl_port;
2924+
2925+ if (port == 0)
2926+ return -EDESTADDRREQ;
2927+
2928+ /* Hmmm. On machines with segmented user/kernel space (sparc64, hppa &
2929+ * m68k AFAICT) we can't mix user and kernel space addresses in the
2930+ * IOV. This stymies __sendmsg a little as it tries to add a header to
2931+ * what could possibly be a userspace iov. So, here (where all the
2932+ * userspace sends come) we copy it to a kernel space buffer first. If
2933+ * performance is a big problem here then I might #ifdef it for the
2934+ * affected architectures but for now I think it will probably be OK */
2935+ buffer = kmalloc(size, GFP_KERNEL);
2936+ if (!buffer)
2937+ return -ENOMEM;
2938+
2939+ memcpy_fromiovec(buffer, msg->msg_iov, size);
2940+ iov.iov_len = size;
2941+ iov.iov_base = buffer;
2942+
2943+ saved_iov = msg->msg_iov;
2944+ saved_iovlen = msg->msg_iovlen;
2945+ msg->msg_iov = &iov;
2946+ msg->msg_iovlen = 1;
2947+
2948+ status = __sendmsg(sock, msg, size, port);
2949+ msg->msg_iov = saved_iov;
2950+ msg->msg_iovlen = saved_iovlen;
2951+
2952+ kfree(buffer);
2953+
2954+ return status;
2955+}
2956+
2957+/* Kernel call to sendmsg */
2958+int kcl_sendmsg(struct socket *sock, void *buf, int size,
2959+ struct sockaddr_cl *caddr, int addr_len, unsigned int flags)
2960+{
2961+ struct iovec iovecs[1];
2962+ struct msghdr msg;
2963+ struct cluster_sock *c = cluster_sk(sock->sk);
2964+ unsigned char port;
2965+
2966+ if (size > MAX_CLUSTER_MESSAGE)
2967+ return -EINVAL;
2968+ if (!atomic_read(&cnxman_running))
2969+ return -ENOTCONN;
2970+
2971+ port = c->port;
2972+ if (caddr && caddr->scl_port)
2973+ port = caddr->scl_port;
2974+
2975+ if (port == 0)
2976+ return -EDESTADDRREQ;
2977+
2978+ /* If we have no process context then queue it up for kclusterd to
2979+ * send. */
2980+ if (in_interrupt() || flags & MSG_QUEUE) {
2981+ return queue_message(buf, size, caddr, port,
2982+ flags & ~MSG_QUEUE);
2983+ }
2984+
2985+ iovecs[0].iov_base = buf;
2986+ iovecs[0].iov_len = size;
2987+
2988+ memset(&msg, 0, sizeof (msg));
2989+ msg.msg_name = caddr;
2990+ msg.msg_namelen = addr_len;
2991+ msg.msg_iovlen = 1;
2992+ msg.msg_iov = iovecs;
2993+ msg.msg_flags = flags;
2994+
2995+ return __sendmsg(sock, &msg, size, port);
2996+}
2997+
2998+static int send_queued_message(struct queued_message *qmsg)
2999+{
3000+ struct iovec iovecs[1];
3001+ struct msghdr msg;
3002+
3003+ /* Don't send blocked messages */
3004+ if (qmsg->port > HIGH_PROTECTED_PORT
3005+ && (!cluster_is_quorate || in_transition()))
3006+ return -EAGAIN;
3007+
3008+ iovecs[0].iov_base = qmsg->msg_buffer;
3009+ iovecs[0].iov_len = qmsg->msg_len;
3010+
3011+ memset(&msg, 0, sizeof (msg));
3012+ msg.msg_name = qmsg->addr_len ? &qmsg->addr : NULL;
3013+ msg.msg_namelen = qmsg->addr_len;
3014+ msg.msg_iovlen = 1;
3015+ msg.msg_iov = iovecs;
3016+ msg.msg_flags = qmsg->flags;
3017+
3018+ return __sendmsg(qmsg->socket, &msg, qmsg->msg_len, qmsg->port);
3019+}
3020+
3021+int kcl_register_read_callback(struct socket *sock,
3022+ int (*routine) (char *, int, char *, int,
3023+ unsigned int))
3024+{
3025+ struct cluster_sock *c = cluster_sk(sock->sk);
3026+
3027+ c->kernel_callback = routine;
3028+
3029+ return 0;
3030+}
3031+
3032+/* Used where we are in kclusterd context and we can't allow the task to wait
3033+ * as we are also responsible to processing the ACKs that do the wake up. Try
3034+ * to send the message immediately and queue it if that's not possible */
3035+static int send_or_queue_message(void *buf, int len, struct sockaddr_cl *caddr,
3036+ unsigned char port)
3037+{
3038+ struct iovec iovecs[1];
3039+ struct msghdr msg;
3040+
3041+ int status;
3042+
3043+ /* Don't send blocked messages */
3044+ if (port > HIGH_PROTECTED_PORT
3045+ && (!cluster_is_quorate || in_transition())) {
3046+ return queue_message(buf, len, caddr, port, 0);
3047+ }
3048+
3049+ iovecs[0].iov_base = buf;
3050+ iovecs[0].iov_len = len;
3051+
3052+ memset(&msg, 0, sizeof (msg));
3053+ msg.msg_name = caddr;
3054+ msg.msg_namelen = caddr ? sizeof (struct sockaddr_cl) : 0;
3055+ msg.msg_iovlen = 1;
3056+ msg.msg_iov = iovecs;
3057+ msg.msg_flags = MSG_DONTWAIT;
3058+
3059+ status = __sendmsg(NULL, &msg, len, port);
3060+
3061+ /* Did it work ? */
3062+ if (status > 0) {
3063+ return 0;
3064+ }
3065+
3066+ /* Failure other than EAGAIN is fatal */
3067+ if (status != -EAGAIN) {
3068+ return status;
3069+ }
3070+
3071+ return queue_message(buf, len, caddr, port, 0);
3072+}
3073+
3074+/* Send a listen request to a node */
3075+static void send_listen_request(int nodeid, unsigned char port)
3076+{
3077+ struct cl_listenmsg listenmsg;
3078+ struct sockaddr_cl caddr;
3079+
3080+ memset(&caddr, 0, sizeof (caddr));
3081+
3082+ /* Build the header */
3083+ listenmsg.cmd = CLUSTER_CMD_LISTENREQ;
3084+ listenmsg.target_port = port;
3085+ listenmsg.listening = 0;
3086+ listenmsg.tag = current->pid;
3087+
3088+ caddr.scl_family = AF_CLUSTER;
3089+ caddr.scl_port = 0;
3090+ caddr.scl_nodeid = nodeid;
3091+
3092+ send_or_queue_message(&listenmsg, sizeof(listenmsg), &caddr, 0);
3093+ return;
3094+}
3095+
3096+/* Return 1 or 0 to indicate if we have a listener on the requested port */
3097+static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
3098+ unsigned char port, unsigned short tag)
3099+{
3100+ struct cl_listenmsg listenmsg;
3101+ struct sockaddr_cl caddr;
3102+ int status;
3103+
3104+ memset(&caddr, 0, sizeof (caddr));
3105+
3106+ /* Build the message */
3107+ listenmsg.cmd = CLUSTER_CMD_LISTENRESP;
3108+ listenmsg.target_port = port;
3109+ listenmsg.tag = tag;
3110+ listenmsg.listening = (port_array[port] != 0) ? 1 : 0;
3111+
3112+ caddr.scl_family = AF_CLUSTER;
3113+ caddr.scl_port = 0;
3114+ caddr.scl_nodeid = nodeid;
3115+
3116+ status = send_or_queue_message(&listenmsg,
3117+ sizeof (listenmsg),
3118+ &caddr, 0);
3119+
3120+ return;
3121+}
3122+
3123+/* Send an ACK */
3124+static int cl_sendack(struct cl_comms_socket *csock, unsigned short seq,
3125+ int addr_len, char *addr, unsigned char remport,
3126+ unsigned char flag)
3127+{
3128+ mm_segment_t fs;
3129+ struct iovec vec;
3130+ struct cl_ackmsg ackmsg;
3131+ struct msghdr msg;
3132+ struct sockaddr_in6 daddr;
3133+ int result;
3134+
3135+#ifdef DEBUG_COMMS
3136+ char buf[MAX_ADDR_PRINTED_LEN];
3137+
3138+ P_COMMS("Sending ACK to %s, seq=%d\n",
3139+ print_addr(addr, address_length, buf), le16_to_cpu(seq));
3140+#endif
3141+
3142+ if (addr) {
3143+ memcpy(&daddr, addr, addr_len);
3144+ }
3145+ else {
3146+ memcpy(&daddr, &csock->saddr, csock->addr_len);
3147+ addr_len = csock->addr_len;
3148+ }
3149+
3150+ /* Build the header */
3151+ ackmsg.header.port = 0; /* Protocol port */
3152+ ackmsg.header.seq = 0;
3153+ ackmsg.header.flags = MSG_NOACK >> 16;
3154+ ackmsg.header.cluster = cpu_to_le16(cluster_id);
3155+ ackmsg.header.srcid = us ? cpu_to_le32(us->node_id) : 0;
3156+ ackmsg.header.tgtid = 0; /* ACKS are unicast so we don't bother
3157+ * to look this up */
3158+ ackmsg.cmd = CLUSTER_CMD_ACK;
3159+ ackmsg.remport = remport;
3160+ ackmsg.aflags = flag;
3161+ ackmsg.seq = seq; /* Already in LE order */
3162+ vec.iov_base = &ackmsg;
3163+ vec.iov_len = sizeof (ackmsg);
3164+
3165+ memset(&msg, 0, sizeof (msg));
3166+ msg.msg_name = &daddr;
3167+ msg.msg_namelen = addr_len;
3168+ msg.msg_iovlen = 1;
3169+ msg.msg_iov = &vec;
3170+
3171+ fs = get_fs();
3172+ set_fs(get_ds());
3173+
3174+ result = sock_sendmsg(csock->sock, &msg, sizeof (ackmsg));
3175+
3176+ set_fs(fs);
3177+
3178+ if (result < 0)
3179+ printk(KERN_CRIT CMAN_NAME ": error sending ACK: %d\n", result);
3180+
3181+ return result;
3182+
3183+}
3184+
3185+/* Wait for all ACKS to be gathered */
3186+void kcl_wait_for_all_acks()
3187+{
3188+ while (ack_count < acks_expected) {
3189+
3190+ DECLARE_WAITQUEUE(wq, current);
3191+ struct task_struct *tsk = current;
3192+
3193+ set_task_state(tsk, TASK_INTERRUPTIBLE);
3194+ add_wait_queue(&socket_waitq, &wq);
3195+
3196+ if (ack_count < acks_expected) {
3197+ schedule();
3198+ }
3199+
3200+ set_task_state(tsk, TASK_RUNNING);
3201+ remove_wait_queue(&socket_waitq, &wq);
3202+ }
3203+}
3204+
3205+/* Send a closedown OOB message to all cluster nodes - this tells them that a
3206+ * port listener has gone away */
3207+static void send_port_close_oob(unsigned char port)
3208+{
3209+ struct cl_closemsg closemsg;
3210+
3211+ /* Build the header */
3212+ closemsg.cmd = CLUSTER_CMD_PORTCLOSED;
3213+ closemsg.port = port;
3214+
3215+ send_or_queue_message(&closemsg, sizeof (closemsg), NULL, 0);
3216+ return;
3217+}
3218+
3219+/* A remote port has been closed - post an OOB message to the local listen on
3220+ * that port (if there is one) */
3221+static void post_close_oob(unsigned char port, int nodeid)
3222+{
3223+ struct cl_portclosed_oob *oobmsg;
3224+ struct sk_buff *skb;
3225+ struct sock *sock = port_array[port];
3226+
3227+ if (!sock) {
3228+ return; /* No-one listening */
3229+ }
3230+
3231+ skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
3232+ if (!skb)
3233+ return;
3234+
3235+ skb_put(skb, sizeof (*oobmsg));
3236+ oobmsg = (struct cl_portclosed_oob *) skb->data;
3237+ oobmsg->port = port;
3238+ oobmsg->cmd = CLUSTER_OOB_MSG_PORTCLOSED;
3239+ skb->cb[0] = 0x80;
3240+ memcpy(skb->cb + 1, &nodeid, sizeof(int));
3241+
3242+ sock_queue_rcv_skb(sock, skb);
3243+
3244+}
3245+
3246+/* Leave the cluster */
3247+static void node_shutdown()
3248+{
3249+ struct cl_barrier *barrier;
3250+ struct list_head *blist;
3251+ struct list_head *temp;
3252+ struct list_head *socklist;
3253+ struct cl_client_socket *csock;
3254+ struct sk_buff *null_skb;
3255+
3256+ printk(KERN_INFO CMAN_NAME ": we are leaving the cluster\n");
3257+
3258+ atomic_set(&cnxman_running, 0);
3259+ unjam();
3260+
3261+ /* Notify kernel listeners first */
3262+ notify_kernel_listeners(LEAVING, 0);
3263+
3264+ /* Notify client sockets */
3265+ down(&client_socket_lock);
3266+ list_for_each_safe(socklist, temp, &client_socket_list) {
3267+ csock = list_entry(socklist, struct cl_client_socket, list);
3268+
3269+ null_skb = alloc_skb(0, GFP_KERNEL);
3270+ if (null_skb)
3271+ sock_queue_rcv_skb(csock->sock->sk, null_skb);
3272+ list_del(&csock->list);
3273+ kfree(csock);
3274+ }
3275+ up(&client_socket_lock);
3276+ we_are_a_cluster_member = 0;
3277+
3278+ sm_stop(1);
3279+
3280+ /* Wake up any processes waiting for barriers */
3281+ down(&barrier_list_lock);
3282+ list_for_each(blist, &barrier_list) {
3283+ barrier = list_entry(blist, struct cl_barrier, list);
3284+
3285+ /* Cancel any timers */
3286+ if (timer_pending(&barrier->timer))
3287+ del_timer(&barrier->timer);
3288+
3289+ /* Force it to be auto-delete so it discards itself */
3290+ if (barrier->state == BARRIER_STATE_WAITING) {
3291+ barrier->flags |= BARRIER_ATTR_AUTODELETE;
3292+ wake_up_interruptible(&barrier->waitq);
3293+ }
3294+ else {
3295+ if (barrier->callback) {
3296+ barrier->callback(barrier->name, -ENOTCONN);
3297+ barrier->callback = NULL;
3298+ }
3299+ }
3300+ }
3301+ up(&barrier_list_lock);
3302+
3303+ /* Wake up any processes waiting for ISLISTENING requests */
3304+ down(&listenreq_lock);
3305+ list_for_each(blist, &listenreq_list) {
3306+ struct cl_waiting_listen_request *lrequest =
3307+ list_entry(blist, struct cl_waiting_listen_request, list);
3308+
3309+ if (lrequest->waiting)
3310+ wake_up_interruptible(&lrequest->waitq);
3311+ }
3312+ up(&listenreq_lock);
3313+}
3314+
3315+static void free_cluster_sockets()
3316+{
3317+ struct list_head *socklist;
3318+ struct cl_comms_socket *sock;
3319+ struct list_head *temp;
3320+
3321+ list_for_each_safe(socklist, temp, &socket_list) {
3322+ sock = list_entry(socklist, struct cl_comms_socket, list);
3323+
3324+ list_del(&sock->list);
3325+ fput(sock->file);
3326+ kfree(sock);
3327+ }
3328+ num_interfaces = 0;
3329+ current_interface = NULL;
3330+}
3331+
3332+/* Tidy up after all the rest of the cluster bits have shut down */
3333+static void node_cleanup()
3334+{
3335+ struct list_head *nodelist;
3336+ struct list_head *proclist;
3337+ struct list_head *temp;
3338+ struct list_head *socklist;
3339+ struct list_head *blist;
3340+ struct cl_comms_socket *sock;
3341+ struct kernel_notify_struct *knotify;
3342+
3343+ /* Free list of kernel listeners */
3344+ list_for_each_safe(proclist, temp, &kernel_listener_list) {
3345+ knotify =
3346+ list_entry(proclist, struct kernel_notify_struct, list);
3347+ list_del(&knotify->list);
3348+ kfree(knotify);
3349+ }
3350+
3351+ /* Mark the sockets as busy so they don't get added to the active
3352+ * sockets list in the next few lines of code before we free them */
3353+ list_for_each_safe(socklist, temp, &socket_list) {
3354+ sock = list_entry(socklist, struct cl_comms_socket, list);
3355+
3356+ set_bit(1, &sock->active);
3357+ }
3358+
3359+ /* Tidy the active sockets list */
3360+ list_for_each_safe(socklist, temp, &active_socket_list) {
3361+ sock =
3362+ list_entry(socklist, struct cl_comms_socket, active_list);
3363+ list_del(&sock->active_list);
3364+ }
3365+
3366+ /* Free the memory allocated to cluster nodes */
3367+ free_nodeid_array();
3368+ down(&cluster_members_lock);
3369+ us = NULL;
3370+ list_for_each_safe(nodelist, temp, &cluster_members_list) {
3371+
3372+ struct list_head *addrlist;
3373+ struct list_head *addrtemp;
3374+ struct cluster_node *node;
3375+ struct cluster_node_addr *nodeaddr;
3376+
3377+ node = list_entry(nodelist, struct cluster_node, list);
3378+
3379+ list_for_each_safe(addrlist, addrtemp, &node->addr_list) {
3380+ nodeaddr =
3381+ list_entry(addrlist, struct cluster_node_addr,
3382+ list);
3383+
3384+ list_del(&nodeaddr->list);
3385+ kfree(nodeaddr);
3386+ }
3387+ list_del(&node->list);
3388+ kfree(node->name);
3389+ kfree(node);
3390+ }
3391+ cluster_members = 0;
3392+ up(&cluster_members_lock);
3393+
3394+ /* Free the memory allocated to the outgoing sockets */
3395+ free_cluster_sockets();
3396+
3397+ /* Make sure that all the barriers are deleted */
3398+ down(&barrier_list_lock);
3399+ list_for_each_safe(blist, temp, &barrier_list) {
3400+ struct cl_barrier *barrier =
3401+ list_entry(blist, struct cl_barrier, list);
3402+
3403+ list_del(&barrier->list);
3404+ kfree(barrier);
3405+ }
3406+ up(&barrier_list_lock);
3407+
3408+ kcluster_pid = 0;
3409+ clear_bit(RESEND_NEEDED, &mainloop_flags);
3410+ acks_expected = 0;
3411+}
3412+
3413+/* If "cluster_is_quorate" is 0 then all activity apart from protected ports is
3414+ * blocked. */
3415+void set_quorate(int total_votes)
3416+{
3417+ int quorate;
3418+
3419+ if (get_quorum() > total_votes) {
3420+ quorate = 0;
3421+ }
3422+ else {
3423+ quorate = 1;
3424+ }
3425+
3426+ /* Hide messages during startup state transition */
3427+ if (we_are_a_cluster_member) {
3428+ if (cluster_is_quorate && !quorate)
3429+ printk(KERN_CRIT CMAN_NAME
3430+ ": quorum lost, blocking activity\n");
3431+ if (!cluster_is_quorate && quorate)
3432+ printk(KERN_CRIT CMAN_NAME
3433+ ": quorum regained, resuming activity\n");
3434+ }
3435+ cluster_is_quorate = quorate;
3436+
3437+ /* Wake up any sleeping processes */
3438+ if (cluster_is_quorate) {
3439+ unjam();
3440+ }
3441+
3442+}
3443+
3444+void queue_oob_skb(struct socket *sock, int cmd)
3445+{
3446+ struct sk_buff *skb;
3447+ struct cl_portclosed_oob *oobmsg;
3448+
3449+ skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
3450+ if (!skb)
3451+ return;
3452+
3453+ skb_put(skb, sizeof (*oobmsg));
3454+ oobmsg = (struct cl_portclosed_oob *) skb->data;
3455+ oobmsg->port = 0;
3456+ oobmsg->cmd = cmd;
3457+
3458+ /* There is no remote node associated with this so
3459+ clear out the field to avoid any accidents */
3460+ memset(skb->cb, 0, sizeof(int));
3461+ skb->cb[0] = 0x80;
3462+
3463+ sock_queue_rcv_skb(sock->sk, skb);
3464+}
3465+
3466+/* Notify interested parties that the cluster configuration has changed */
3467+void notify_listeners()
3468+{
3469+ struct notify_struct *notify;
3470+ struct list_head *proclist;
3471+ struct list_head *socklist;
3472+ struct list_head *temp;
3473+
3474+ /* Do kernel listeners first */
3475+ notify_kernel_listeners(CLUSTER_RECONFIG, 0);
3476+
3477+ /* Now we deign to tell userspace */
3478+ down(&event_listener_lock);
3479+ list_for_each_safe(proclist, temp, &event_listener_list) {
3480+ notify = list_entry(proclist, struct notify_struct, list);
3481+
3482+ /* If the kill fails then remove the process from the list */
3483+ if (kill_proc(notify->pid, notify->signal, 0) == -ESRCH) {
3484+ list_del(&notify->list);
3485+ kfree(notify);
3486+ }
3487+ }
3488+ up(&event_listener_lock);
3489+
3490+ /* Tell userspace processes which want OOB messages */
3491+ down(&client_socket_lock);
3492+ list_for_each(socklist, &client_socket_list) {
3493+ struct cl_client_socket *csock;
3494+ csock = list_entry(socklist, struct cl_client_socket, list);
3495+ queue_oob_skb(csock->sock, CLUSTER_OOB_MSG_STATECHANGE);
3496+ }
3497+ up(&client_socket_lock);
3498+}
3499+
3500+/* This fills in the list of all addresses for the local node */
3501+void get_local_addresses(struct cluster_node *node)
3502+{
3503+ struct list_head *socklist;
3504+ struct cl_comms_socket *sock;
3505+
3506+ list_for_each(socklist, &socket_list) {
3507+ sock = list_entry(socklist, struct cl_comms_socket, list);
3508+
3509+ if (sock->recv_only) {
3510+ add_node_address(node, (char *) &sock->saddr, address_length);
3511+ }
3512+ }
3513+}
3514+
3515+
3516+static uint16_t generate_cluster_id(char *name)
3517+{
3518+ int i;
3519+ int value = 0;
3520+
3521+ for (i=0; i<strlen(name); i++) {
3522+ value <<= 1;
3523+ value += name[i];
3524+ }
3525+ return value & 0xFFFF;
3526+}
3527+
3528+/* Return the next comms socket we can use. */
3529+static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur)
3530+{
3531+ int next;
3532+ struct list_head *socklist;
3533+
3534+ /* Fast path for single interface systems */
3535+ if (num_interfaces <= 1)
3536+ return cur;
3537+
3538+ /* Next number */
3539+ next = cur->number + 1;
3540+ if (next > num_interfaces)
3541+ next = 1;
3542+
3543+ /* Find the socket with this number, I could optimise this by starting
3544+ * at the current i/f but most systems are going to have a small number
3545+ * of them anyway */
3546+ list_for_each(socklist, &socket_list) {
3547+ struct cl_comms_socket *sock;
3548+ sock = list_entry(socklist, struct cl_comms_socket, list);
3549+
3550+ if (!sock->recv_only && sock->number == next)
3551+ return sock;
3552+ }
3553+
3554+ BUG();
3555+ return NULL;
3556+}
3557+
3558+/* MUST be called with the barrier list lock held */
3559+static struct cl_barrier *find_barrier(char *name)
3560+{
3561+ struct list_head *blist;
3562+ struct cl_barrier *bar;
3563+
3564+ list_for_each(blist, &barrier_list) {
3565+ bar = list_entry(blist, struct cl_barrier, list);
3566+
3567+ if (strcmp(name, bar->name) == 0)
3568+ return bar;
3569+ }
3570+ return NULL;
3571+}
3572+
3573+/* Do the stuff we need to do when the barrier has completed phase 1 */
3574+static void check_barrier_complete_phase1(struct cl_barrier *barrier)
3575+{
3576+ if (atomic_read(&barrier->got_nodes) == ((barrier->expected_nodes != 0)
3577+ ? barrier->expected_nodes :
3578+ cluster_members)) {
3579+
3580+ struct cl_barriermsg bmsg;
3581+
3582+ atomic_inc(&barrier->completed_nodes); /* We have completed */
3583+ barrier->phase = 2; /* Wait for complete phase II */
3584+
3585+ /* Send completion message, remember: we are in cnxman context
3586+ * and must not block */
3587+ bmsg.cmd = CLUSTER_CMD_BARRIER;
3588+ bmsg.subcmd = BARRIER_COMPLETE;
3589+ bmsg.flags = 0;
3590+ strcpy(bmsg.name, barrier->name);
3591+
3592+ P_BARRIER("Sending COMPLETE for %s\n", barrier->name);
3593+ queue_message((char *) &bmsg, sizeof (bmsg), NULL, 0, 0);
3594+ }
3595+}
3596+
3597+/* Do the stuff we need to do when the barrier has been reached */
3598+/* Return 1 if we deleted the barrier */
3599+static int check_barrier_complete_phase2(struct cl_barrier *barrier, int status)
3600+{
3601+ spin_lock_irq(&barrier->phase2_spinlock);
3602+
3603+ if (barrier->state != BARRIER_STATE_COMPLETE &&
3604+ (status == -ETIMEDOUT ||
3605+ atomic_read(&barrier->completed_nodes) ==
3606+ ((barrier->expected_nodes != 0)
3607+ ? barrier->expected_nodes : cluster_members))) {
3608+
3609+ if (status == 0 && barrier->timeout)
3610+ del_timer(&barrier->timer);
3611+ barrier->endreason = status;
3612+
3613+ /* Wake up listener */
3614+ if (barrier->state == BARRIER_STATE_WAITING) {
3615+ wake_up_interruptible(&barrier->waitq);
3616+ }
3617+ else {
3618+ /* Additional tasks we have to do if the user was not
3619+ * waiting... */
3620+ /* Call the callback */
3621+ if (barrier->callback) {
3622+ barrier->callback(barrier->name, 0);
3623+ barrier->callback = NULL;
3624+ }
3625+ /* Remove it if it's AUTO-DELETE */
3626+ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
3627+ list_del(&barrier->list);
3628+ spin_unlock_irq(&barrier->phase2_spinlock);
3629+ kfree(barrier);
3630+ return 1;
3631+ }
3632+ }
3633+ barrier->state = BARRIER_STATE_COMPLETE;
3634+ }
3635+ spin_unlock_irq(&barrier->phase2_spinlock);
3636+ return 0;
3637+}
3638+
3639+/* Called if a barrier timeout happens */
3640+static void barrier_timer_fn(unsigned long arg)
3641+{
3642+ struct cl_barrier *barrier = (struct cl_barrier *) arg;
3643+
3644+ /* Ignore any futher messages, they are too late. */
3645+ barrier->phase = 0;
3646+
3647+ /* and cause it to timeout */
3648+ check_barrier_complete_phase2(barrier, -ETIMEDOUT);
3649+}
3650+
3651+/* Process BARRIER messages from other nodes */
3652+static void process_barrier_msg(struct cl_barriermsg *msg,
3653+ struct cluster_node *node)
3654+{
3655+ struct cl_barrier *barrier;
3656+
3657+ down(&barrier_list_lock);
3658+ barrier = find_barrier(msg->name);
3659+ up(&barrier_list_lock);
3660+
3661+ /* Ignore other peoples messages, in_transition() is needed here so
3662+ * that joining nodes will see their barrier messages before the
3663+ * we_are_a_cluster_member is set */
3664+ if (!we_are_a_cluster_member && !in_transition())
3665+ return;
3666+ if (!barrier)
3667+ return;
3668+
3669+ P_BARRIER("Got %d for %s, from node %s\n", msg->subcmd, msg->name,
3670+ node ? node->name : "unknown");
3671+
3672+ switch (msg->subcmd) {
3673+ case BARRIER_WAIT:
3674+ down(&barrier->lock);
3675+ if (barrier->phase == 0)
3676+ barrier->phase = 1;
3677+
3678+ if (barrier->phase == 1) {
3679+ atomic_inc(&barrier->got_nodes);
3680+ check_barrier_complete_phase1(barrier);
3681+ }
3682+ else {
3683+ printk(KERN_WARNING CMAN_NAME
3684+ ": got WAIT barrier not in phase 1 %s (%d)\n",
3685+ msg->name, barrier->phase);
3686+
3687+ }
3688+ up(&barrier->lock);
3689+ break;
3690+
3691+ case BARRIER_COMPLETE:
3692+ down(&barrier->lock);
3693+ atomic_inc(&barrier->completed_nodes);
3694+
3695+ /* First node to get all the WAIT messages sends COMPLETE, so
3696+ * we all complete */
3697+ if (barrier->phase == 1) {
3698+ atomic_set(&barrier->got_nodes,
3699+ barrier->expected_nodes);
3700+ check_barrier_complete_phase1(barrier);
3701+ }
3702+
3703+ if (barrier->phase == 2) {
3704+ /* If it was deleted (ret==1) then no need to unlock
3705+ * the mutex */
3706+ if (check_barrier_complete_phase2(barrier, 0) == 1)
3707+ return;
3708+ }
3709+ up(&barrier->lock);
3710+ break;
3711+ }
3712+}
3713+
3714+/* In-kernel membership API */
3715+int kcl_add_callback(void (*callback) (kcl_callback_reason, long arg))
3716+{
3717+ struct kernel_notify_struct *notify;
3718+
3719+ notify = kmalloc(sizeof (struct kernel_notify_struct), GFP_KERNEL);
3720+ if (!notify)
3721+ return -ENOMEM;
3722+ notify->callback = callback;
3723+
3724+ down(&kernel_listener_lock);
3725+ list_add(&notify->list, &kernel_listener_list);
3726+ up(&kernel_listener_lock);
3727+
3728+ return 0;
3729+}
3730+
3731+int kcl_remove_callback(void (*callback) (kcl_callback_reason, long arg))
3732+{
3733+ struct list_head *calllist;
3734+ struct list_head *temp;
3735+ struct kernel_notify_struct *notify;
3736+
3737+ down(&kernel_listener_lock);
3738+ list_for_each_safe(calllist, temp, &kernel_listener_list) {
3739+ notify = list_entry(calllist, struct kernel_notify_struct, list);
3740+ if (notify->callback == callback){
3741+ list_del(&notify->list);
3742+ kfree(notify);
3743+ up(&kernel_listener_lock);
3744+ return 0;
3745+ }
3746+ }
3747+ up(&kernel_listener_lock);
3748+ return -EINVAL;
3749+}
3750+
3751+/* Return quorate status */
3752+int kcl_is_quorate()
3753+{
3754+ return cluster_is_quorate;
3755+}
3756+
3757+/* Return the address list for a node */
3758+struct list_head *kcl_get_node_addresses(int nodeid)
3759+{
3760+ struct cluster_node *node = find_node_by_nodeid(nodeid);
3761+
3762+ if (node)
3763+ return &node->addr_list;
3764+ else
3765+ return NULL;
3766+}
3767+
3768+static void copy_to_kclnode(struct cluster_node *node,
3769+ struct kcl_cluster_node *knode)
3770+{
3771+ strcpy(knode->name, node->name);
3772+ knode->size = sizeof (struct kcl_cluster_node);
3773+ knode->votes = node->votes;
3774+ knode->state = node->state;
3775+ knode->node_id = node->node_id;
3776+ knode->us = node->us;
3777+ knode->leave_reason = node->leave_reason;
3778+ knode->incarnation = node->incarnation;
3779+}
3780+
3781+/* Return the info for a node given it's address. if addr is NULL then return
3782+ * OUR info */
3783+int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
3784+ struct kcl_cluster_node *n)
3785+{
3786+ struct cluster_node *node;
3787+
3788+ /* They want us */
3789+ if (addr == NULL) {
3790+ node = us;
3791+ }
3792+ else {
3793+ node = find_node_by_addr(addr, addr_len);
3794+ if (!node)
3795+ return -1;
3796+ }
3797+
3798+ /* Copy to user's buffer */
3799+ copy_to_kclnode(node, n);
3800+ return 0;
3801+}
3802+
3803+int kcl_get_node_by_name(unsigned char *name, struct kcl_cluster_node *n)
3804+{
3805+ struct cluster_node *node;
3806+
3807+ /* They want us */
3808+ if (name == NULL) {
3809+ node = us;
3810+ if (node == NULL)
3811+ return -1;
3812+ }
3813+ else {
3814+ node = find_node_by_name(name);
3815+ if (!node)
3816+ return -1;
3817+ }
3818+
3819+ /* Copy to user's buffer */
3820+ copy_to_kclnode(node, n);
3821+ return 0;
3822+}
3823+
3824+/* As above but by node id. MUCH faster */
3825+int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n)
3826+{
3827+ struct cluster_node *node;
3828+
3829+ /* They want us */
3830+ if (nodeid == 0) {
3831+ node = us;
3832+ if (node == NULL)
3833+ return -1;
3834+ }
3835+ else {
3836+ node = find_node_by_nodeid(nodeid);
3837+ if (!node)
3838+ return -1;
3839+ }
3840+
3841+ /* Copy to user's buffer */
3842+ copy_to_kclnode(node, n);
3843+ return 0;
3844+}
3845+
3846+/* Return a list of all cluster members ever */
3847+int kcl_get_all_members(struct list_head *list)
3848+{
3849+ struct list_head *nodelist;
3850+ struct cluster_node *node;
3851+ struct kcl_cluster_node *newnode;
3852+ int num_nodes = 0;
3853+
3854+ down(&cluster_members_lock);
3855+ list_for_each(nodelist, &cluster_members_list) {
3856+ if (list) {
3857+ node = list_entry(nodelist, struct cluster_node, list);
3858+ newnode =
3859+ kmalloc(sizeof (struct kcl_cluster_node),
3860+ GFP_KERNEL);
3861+ if (newnode) {
3862+ copy_to_kclnode(node, newnode);
3863+ list_add(&newnode->list, list);
3864+ num_nodes++;
3865+ }
3866+ }
3867+ else {
3868+ num_nodes++;
3869+ }
3870+ }
3871+ up(&cluster_members_lock);
3872+
3873+ return num_nodes;
3874+}
3875+
3876+/* Return a list of cluster members */
3877+int kcl_get_members(struct list_head *list)
3878+{
3879+ struct list_head *nodelist;
3880+ struct cluster_node *node;
3881+ struct kcl_cluster_node *newnode;
3882+ int num_nodes = 0;
3883+
3884+ down(&cluster_members_lock);
3885+ list_for_each(nodelist, &cluster_members_list) {
3886+ node = list_entry(nodelist, struct cluster_node, list);
3887+
3888+ if (node->state == NODESTATE_MEMBER) {
3889+ if (list) {
3890+ newnode =
3891+ kmalloc(sizeof (struct kcl_cluster_node),
3892+ GFP_KERNEL);
3893+ if (newnode) {
3894+ copy_to_kclnode(node, newnode);
3895+ list_add(&newnode->list, list);
3896+ num_nodes++;
3897+ }
3898+ }
3899+ else {
3900+ num_nodes++;
3901+ }
3902+ }
3903+ }
3904+ up(&cluster_members_lock);
3905+
3906+ return num_nodes;
3907+}
3908+
3909+/* Copy current member's nodeids into buffer */
3910+int kcl_get_member_ids(uint32_t *idbuf, int size)
3911+{
3912+ struct list_head *nodelist;
3913+ struct cluster_node *node;
3914+ int num_nodes = 0;
3915+
3916+ down(&cluster_members_lock);
3917+ list_for_each(nodelist, &cluster_members_list) {
3918+ node = list_entry(nodelist, struct cluster_node, list);
3919+
3920+ if (node->state == NODESTATE_MEMBER) {
3921+ if (idbuf && size) {
3922+ idbuf[num_nodes] = node->node_id;
3923+ num_nodes++;
3924+ size--;
3925+ }
3926+ else {
3927+ num_nodes++;
3928+ }
3929+ }
3930+ }
3931+ up(&cluster_members_lock);
3932+
3933+ return num_nodes;
3934+}
3935+
3936+/* Barrier API */
3937+int kcl_barrier_register(char *name, unsigned int flags, unsigned int nodes)
3938+{
3939+ struct cl_barrier *barrier;
3940+
3941+ /* We are not joined to a cluster */
3942+ if (!we_are_a_cluster_member)
3943+ return -ENOTCONN;
3944+
3945+ /* Must have a valid name */
3946+ if (name == NULL || strlen(name) > MAX_BARRIER_NAME_LEN - 1)
3947+ return -EINVAL;
3948+
3949+ /* We don't do this yet */
3950+ if (flags & BARRIER_ATTR_MULTISTEP)
3951+ return -ENOTSUPP;
3952+
3953+ down(&barrier_list_lock);
3954+
3955+ /* See if it already exists */
3956+ if ((barrier = find_barrier(name))) {
3957+ up(&barrier_list_lock);
3958+ if (nodes != barrier->expected_nodes) {
3959+ printk(KERN_WARNING CMAN_NAME
3960+ ": Barrier registration failed for '%s', expected nodes=%d, requested=%d\n",
3961+ name, barrier->expected_nodes, nodes);
3962+ up(&barrier_list_lock);
3963+ return -EINVAL;
3964+ }
3965+ else
3966+ return 0;
3967+ }
3968+
3969+ /* Build a new struct and add it to the list */
3970+ barrier = kmalloc(sizeof (struct cl_barrier), GFP_KERNEL);
3971+ if (barrier == NULL) {
3972+ up(&barrier_list_lock);
3973+ return -ENOMEM;
3974+ }
3975+ memset(barrier, 0, sizeof (*barrier));
3976+
3977+ strcpy(barrier->name, name);
3978+ barrier->flags = flags;
3979+ barrier->expected_nodes = nodes;
3980+ atomic_set(&barrier->got_nodes, 0);
3981+ atomic_set(&barrier->completed_nodes, 0);
3982+ barrier->endreason = 0;
3983+ barrier->registered_nodes = 1;
3984+ spin_lock_init(&barrier->phase2_spinlock);
3985+ barrier->state = BARRIER_STATE_INACTIVE;
3986+ init_MUTEX(&barrier->lock);
3987+
3988+ list_add(&barrier->list, &barrier_list);
3989+ up(&barrier_list_lock);
3990+
3991+ return 0;
3992+}
3993+
3994+static int barrier_setattr_enabled(struct cl_barrier *barrier,
3995+ unsigned int attr, unsigned long arg)
3996+{
3997+ int status;
3998+
3999+ /* Can't disable a barrier */
4000+ if (!arg) {
4001+ up(&barrier->lock);
4002+ return -EINVAL;
4003+ }
4004+
4005+ /* We need to send WAIT now because the user may not
4006+ * actually call kcl_barrier_wait() */
4007+ if (!barrier->waitsent) {
4008+ struct cl_barriermsg bmsg;
4009+
4010+ /* Send it to the rest of the cluster */
4011+ bmsg.cmd = CLUSTER_CMD_BARRIER;
4012+ bmsg.subcmd = BARRIER_WAIT;
4013+ strcpy(bmsg.name, barrier->name);
4014+
4015+ barrier->waitsent = 1;
4016+ barrier->phase = 1;
4017+
4018+ atomic_inc(&barrier->got_nodes);
4019+
4020+ /* Start the timer if one was wanted */
4021+ if (barrier->timeout) {
4022+ init_timer(&barrier->timer);
4023+ barrier->timer.function = barrier_timer_fn;
4024+ barrier->timer.data = (long) barrier;
4025+ mod_timer(&barrier->timer, jiffies + (barrier->timeout * HZ));
4026+ }
4027+
4028+ /* Barrier WAIT and COMPLETE messages are
4029+ * always queued - that way they always get
4030+ * sent out in the right order. If we don't do
4031+ * this then one can get sent out in the
4032+ * context of the user process and the other in
4033+ * cnxman and COMPLETE may /just/ slide in
4034+ * before WAIT if its in the queue
4035+ */
4036+ P_BARRIER("Sending WAIT for %s\n", name);
4037+ status = queue_message(&bmsg, sizeof (bmsg), NULL, 0, 0);
4038+ if (status < 0) {
4039+ up(&barrier->lock);
4040+ return status;
4041+ }
4042+
4043+ /* It might have been reached now */
4044+ if (barrier
4045+ && barrier->state != BARRIER_STATE_COMPLETE
4046+ && barrier->phase == 1)
4047+ check_barrier_complete_phase1(barrier);
4048+ }
4049+ if (barrier && barrier->state == BARRIER_STATE_COMPLETE) {
4050+ up(&barrier->lock);
4051+ return barrier->endreason;
4052+ }
4053+ up(&barrier->lock);
4054+ return 0; /* Nothing to propogate */
4055+}
4056+
4057+int kcl_barrier_setattr(char *name, unsigned int attr, unsigned long arg)
4058+{
4059+ struct cl_barrier *barrier;
4060+
4061+ /* See if it already exists */
4062+ down(&barrier_list_lock);
4063+ if (!(barrier = find_barrier(name))) {
4064+ up(&barrier_list_lock);
4065+ return -ENOENT;
4066+ }
4067+ up(&barrier_list_lock);
4068+
4069+ down(&barrier->lock);
4070+ if (barrier->state == BARRIER_STATE_COMPLETE) {
4071+ up(&barrier->lock);
4072+ return 0;
4073+ }
4074+
4075+ switch (attr) {
4076+ case BARRIER_SETATTR_AUTODELETE:
4077+ if (arg)
4078+ barrier->flags |= BARRIER_ATTR_AUTODELETE;
4079+ else
4080+ barrier->flags &= ~BARRIER_ATTR_AUTODELETE;
4081+ up(&barrier->lock);
4082+ return 0;
4083+ break;
4084+
4085+ case BARRIER_SETATTR_TIMEOUT:
4086+ /* Can only change the timout of an inactive barrier */
4087+ if (barrier->state == BARRIER_STATE_WAITING
4088+ || barrier->waitsent) {
4089+ up(&barrier->lock);
4090+ return -EINVAL;
4091+ }
4092+ barrier->timeout = arg;
4093+ up(&barrier->lock);
4094+ return 0;
4095+
4096+ case BARRIER_SETATTR_MULTISTEP:
4097+ up(&barrier->lock);
4098+ return -ENOTSUPP;
4099+
4100+ case BARRIER_SETATTR_ENABLED:
4101+ return barrier_setattr_enabled(barrier, attr, arg);
4102+
4103+ case BARRIER_SETATTR_NODES:
4104+ /* Can only change the expected node count of an inactive
4105+ * barrier */
4106+ if (barrier->state == BARRIER_STATE_WAITING
4107+ || barrier->waitsent)
4108+ return -EINVAL;
4109+ barrier->expected_nodes = arg;
4110+ break;
4111+
4112+ case BARRIER_SETATTR_CALLBACK:
4113+ if (barrier->state == BARRIER_STATE_WAITING
4114+ || barrier->waitsent)
4115+ return -EINVAL;
4116+ barrier->callback = (void (*)(char *, int)) arg;
4117+ up(&barrier->lock);
4118+ return 0; /* Don't propgate this to other nodes */
4119+ }
4120+
4121+ up(&barrier->lock);
4122+ return 0;
4123+}
4124+
4125+int kcl_barrier_delete(char *name)
4126+{
4127+ struct cl_barrier *barrier;
4128+
4129+ down(&barrier_list_lock);
4130+ /* See if it exists */
4131+ if (!(barrier = find_barrier(name))) {
4132+ up(&barrier_list_lock);
4133+ return -ENOENT;
4134+ }
4135+
4136+ /* Delete it */
4137+ list_del(&barrier->list);
4138+ kfree(barrier);
4139+
4140+ up(&barrier_list_lock);
4141+
4142+ return 0;
4143+}
4144+
4145+int kcl_barrier_cancel(char *name)
4146+{
4147+ struct cl_barrier *barrier;
4148+
4149+ /* See if it exists */
4150+ down(&barrier_list_lock);
4151+ if (!(barrier = find_barrier(name))) {
4152+ up(&barrier_list_lock);
4153+ return -ENOENT;
4154+ }
4155+ down(&barrier->lock);
4156+
4157+ barrier->endreason = -ENOTCONN;
4158+
4159+ if (barrier->callback) {
4160+ barrier->callback(barrier->name, -ECONNRESET);
4161+ barrier->callback = NULL;
4162+ }
4163+
4164+ if (barrier->timeout)
4165+ del_timer(&barrier->timer);
4166+
4167+ /* Remove it if it's AUTO-DELETE */
4168+ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
4169+ list_del(&barrier->list);
4170+ up(&barrier->lock);
4171+ kfree(barrier);
4172+ up(&barrier_list_lock);
4173+ return 0;
4174+ }
4175+
4176+ if (barrier->state == BARRIER_STATE_WAITING)
4177+ wake_up_interruptible(&barrier->waitq);
4178+
4179+ up(&barrier->lock);
4180+ up(&barrier_list_lock);
4181+ return 0;
4182+}
4183+
4184+int kcl_barrier_wait(char *name)
4185+{
4186+ struct cl_barrier *barrier;
4187+ int ret;
4188+
4189+ if (!atomic_read(&cnxman_running))
4190+ return -ENOTCONN;
4191+
4192+ /* Enable it */
4193+ kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, 1L);
4194+
4195+ down(&barrier_list_lock);
4196+
4197+ /* See if it still exists - enable may have deleted it! */
4198+ if (!(barrier = find_barrier(name))) {
4199+ up(&barrier_list_lock);
4200+ return -ENOENT;
4201+ }
4202+
4203+ down(&barrier->lock);
4204+
4205+ up(&barrier_list_lock);
4206+
4207+ /* If it has already completed then return the status */
4208+ if (barrier->state == BARRIER_STATE_COMPLETE) {
4209+ up(&barrier->lock);
4210+ return barrier->endreason;
4211+ }
4212+
4213+ barrier->state = BARRIER_STATE_WAITING;
4214+
4215+ /* Have we all reached the barrier? */
4216+ while (atomic_read(&barrier->completed_nodes) !=
4217+ ((barrier->expected_nodes == 0)
4218+ ? cluster_members : barrier->expected_nodes)
4219+ && barrier->endreason == 0) {
4220+
4221+ wait_queue_t wq;
4222+
4223+ init_waitqueue_entry(&wq, current);
4224+ init_waitqueue_head(&barrier->waitq);
4225+
4226+ /* Wait for em all */
4227+ set_task_state(current, TASK_INTERRUPTIBLE);
4228+ add_wait_queue(&barrier->waitq, &wq);
4229+
4230+ if (atomic_read(&barrier->completed_nodes) !=
4231+ ((barrier->expected_nodes ==
4232+ 0) ? cluster_members : barrier->expected_nodes)
4233+ && barrier->endreason == 0) {
4234+ up(&barrier->lock);
4235+ schedule();
4236+ down(&barrier->lock);
4237+ }
4238+
4239+ remove_wait_queue(&barrier->waitq, &wq);
4240+ set_task_state(current, TASK_RUNNING);
4241+
4242+ if (signal_pending(current)) {
4243+ barrier->endreason = -EINTR;
4244+ break;
4245+ }
4246+ }
4247+ barrier->state = BARRIER_STATE_INACTIVE;
4248+
4249+ if (barrier->timeout)
4250+ del_timer(&barrier->timer);
4251+
4252+ /* Barrier has been reached on all nodes, call the callback */
4253+ if (barrier->callback) {
4254+ barrier->callback(barrier->name, barrier->endreason);
4255+ barrier->callback = NULL;
4256+ }
4257+
4258+ atomic_set(&barrier->got_nodes, 0);
4259+
4260+ /* Return the reason we were woken */
4261+ ret = barrier->endreason;
4262+
4263+ /* Remove it if it's AUTO-DELETE */
4264+ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
4265+ down(&barrier_list_lock);
4266+ list_del(&barrier->list);
4267+ up(&barrier_list_lock);
4268+ up(&barrier->lock);
4269+ kfree(barrier);
4270+ }
4271+ else {
4272+ up(&barrier->lock);
4273+ }
4274+
4275+ /* We were woken up because the node left the cluster ? */
4276+ if (!atomic_read(&cnxman_running))
4277+ ret = -ENOTCONN;
4278+
4279+ return ret;
4280+}
4281+
4282+/* This is called from membership services when a node has left the cluster -
4283+ * we signal all waiting barriers with -ESRCH so they know to do something
4284+ * else, if the number of nodes is left at 0 then we compare the new number of
4285+ * nodes in the cluster with that at the barrier and return 0 (success) in that
4286+ * case */
4287+void check_barrier_returns()
4288+{
4289+ struct list_head *blist;
4290+ struct list_head *llist;
4291+ struct cl_barrier *barrier;
4292+ int status = 0;
4293+
4294+ down(&barrier_list_lock);
4295+ list_for_each(blist, &barrier_list) {
4296+ barrier = list_entry(blist, struct cl_barrier, list);
4297+
4298+ if (barrier->waitsent) {
4299+ int wakeit = 0;
4300+
4301+ /* Check for a dynamic member barrier */
4302+ if (barrier->expected_nodes == 0) {
4303+ if (barrier->registered_nodes ==
4304+ cluster_members) {
4305+ status = 0;
4306+ wakeit = 1;
4307+ }
4308+ }
4309+ else {
4310+ status = -ESRCH;
4311+ wakeit = 1;
4312+ }
4313+
4314+ /* Do we need to tell the barrier? */
4315+ if (wakeit) {
4316+ if (barrier->state == BARRIER_STATE_WAITING) {
4317+ barrier->endreason = status;
4318+ wake_up_interruptible(&barrier->waitq);
4319+ }
4320+ else {
4321+ if (barrier->callback) {
4322+ barrier->callback(barrier->name,
4323+ status);
4324+ }
4325+ }
4326+ }
4327+ }
4328+ }
4329+ up(&barrier_list_lock);
4330+
4331+ /* Part 2 check for outstanding listen requests for dead nodes and
4332+ * cancel them */
4333+ down(&listenreq_lock);
4334+ list_for_each(llist, &listenreq_list) {
4335+ struct cl_waiting_listen_request *lrequest =
4336+ list_entry(llist, struct cl_waiting_listen_request, list);
4337+ struct cluster_node *node =
4338+ find_node_by_nodeid(lrequest->nodeid);
4339+
4340+ if (node && node->state != NODESTATE_MEMBER) {
4341+ lrequest->result = -ENOTCONN;
4342+ lrequest->waiting = 0;
4343+ wake_up_interruptible(&lrequest->waitq);
4344+ }
4345+ }
4346+ up(&listenreq_lock);
4347+}
4348+
4349+int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen)
4350+{
4351+ struct temp_node *tn;
4352+ int err = 1; /* true */
4353+#ifdef DEBUG_COMMS
4354+ char buf[MAX_ADDR_PRINTED_LEN];
4355+#endif
4356+
4357+ down(&tempnode_lock);
4358+
4359+ list_for_each_entry(tn, &tempnode_list, list) {
4360+ if (tn->nodeid == nodeid) {
4361+ memcpy(addr, tn->addr, tn->addrlen);
4362+ *addrlen = tn->addrlen;
4363+ P_COMMS("get_temp_nodeid. id %d:\n: %s\n",
4364+ tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
4365+
4366+ goto out;
4367+ }
4368+ }
4369+ err = 0;
4370+
4371+ out:
4372+ up(&tempnode_lock);
4373+ return err;
4374+}
4375+
4376+/* Create a new temporary node ID. This list will only ever be very small
4377+ (usaully only 1 item) but I can't take the risk that someone won't try to
4378+ boot 128 nodes all at exactly the same time. */
4379+int new_temp_nodeid(char *addr, int addrlen)
4380+{
4381+ struct temp_node *tn;
4382+ int err = -1;
4383+ int try_nodeid = 0;
4384+#ifdef DEBUG_COMMS
4385+ char buf[MAX_ADDR_PRINTED_LEN];
4386+#endif
4387+
4388+ P_COMMS("new_temp_nodeid needed for\n: %s\n",
4389+ print_addr(addr, addrlen, buf));
4390+
4391+ down(&tempnode_lock);
4392+
4393+ /* First see if we already know about this node */
4394+ list_for_each_entry(tn, &tempnode_list, list) {
4395+
4396+ P_COMMS("new_temp_nodeid list. id %d:\n: %s\n",
4397+ tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
4398+
4399+ /* We're already in here... */
4400+ if (tn->addrlen == addrlen &&
4401+ memcmp(tn->addr, addr, addrlen) == 0) {
4402+ P_COMMS("reused temp node ID %d\n", tn->nodeid);
4403+ err = tn->nodeid;
4404+ goto out;
4405+ }
4406+ }
4407+
4408+ /* Nope, OK, invent a suitable number */
4409+ retry:
4410+ try_nodeid -= 1;
4411+ list_for_each_entry(tn, &tempnode_list, list) {
4412+
4413+ if (tn->nodeid == try_nodeid)
4414+ goto retry;
4415+ }
4416+
4417+ tn = kmalloc(sizeof(struct temp_node), GFP_KERNEL);
4418+ if (!tn)
4419+ goto out;
4420+
4421+ memcpy(tn->addr, addr, addrlen);
4422+ tn->addrlen = addrlen;
4423+ tn->nodeid = try_nodeid;
4424+ list_add_tail(&tn->list, &tempnode_list);
4425+ err = try_nodeid;
4426+ P_COMMS("new temp nodeid = %d\n", try_nodeid);
4427+ out:
4428+ up(&tempnode_lock);
4429+ return err;
4430+}
4431+
4432+static int is_valid_temp_nodeid(int nodeid)
4433+{
4434+ struct temp_node *tn;
4435+ int err = 1; /* true */
4436+
4437+ down(&tempnode_lock);
4438+
4439+ list_for_each_entry(tn, &tempnode_list, list) {
4440+ if (tn->nodeid == nodeid)
4441+ goto out;
4442+ }
4443+ err = 0;
4444+
4445+ out:
4446+ P_COMMS("is_valid_temp_nodeid. %d = %d\n", nodeid, err);
4447+ up(&tempnode_lock);
4448+ return err;
4449+}
4450+
4451+/* TODO: This needs to clean the list more fully of
4452+ nodes that are now full members but we did not master the transition */
4453+void remove_temp_nodeid(int nodeid)
4454+{
4455+ struct temp_node *tn;
4456+ struct temp_node *tmp;
4457+
4458+ down(&tempnode_lock);
4459+
4460+ list_for_each_entry_safe(tn, tmp, &tempnode_list, list) {
4461+ if (nodeid == tn->nodeid) {
4462+ list_del(&tn->list);
4463+ kfree(tn);
4464+ up(&tempnode_lock);
4465+ return;
4466+ }
4467+ }
4468+
4469+ up(&tempnode_lock);
4470+}
4471+
4472+/* Quorum device functions */
4473+int kcl_register_quorum_device(char *name, int votes)
4474+{
4475+ if (quorum_device)
4476+ return -EBUSY;
4477+
4478+ if (find_node_by_name(name))
4479+ return -EINVAL;
4480+
4481+ quorum_device = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
4482+ if (!quorum_device)
4483+ return -ENOMEM;
4484+ memset(quorum_device, 0, sizeof (struct cluster_node));
4485+
4486+ quorum_device->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
4487+ if (!quorum_device->name) {
4488+ kfree(quorum_device);
4489+ quorum_device = NULL;
4490+ return -ENOMEM;
4491+ }
4492+
4493+ strcpy(quorum_device->name, name);
4494+ quorum_device->votes = votes;
4495+ quorum_device->state = NODESTATE_DEAD;
4496+
4497+ /* Keep this list valid so it doesn't confuse other code */
4498+ INIT_LIST_HEAD(&quorum_device->addr_list);
4499+
4500+ return 0;
4501+}
4502+
4503+int kcl_unregister_quorum_device(void)
4504+{
4505+ if (!quorum_device)
4506+ return -EINVAL;
4507+ if (quorum_device->state == NODESTATE_MEMBER)
4508+ return -EINVAL;
4509+
4510+ quorum_device = NULL;
4511+
4512+ return 0;
4513+}
4514+
4515+int kcl_quorum_device_available(int yesno)
4516+{
4517+ if (!quorum_device)
4518+ return -EINVAL;
4519+
4520+ if (yesno) {
4521+ quorum_device->last_hello = jiffies;
4522+ if (quorum_device->state == NODESTATE_DEAD) {
4523+ quorum_device->state = NODESTATE_MEMBER;
4524+ recalculate_quorum(0);
4525+ }
4526+ }
4527+ else {
4528+ if (quorum_device->state == NODESTATE_MEMBER) {
4529+ quorum_device->state = NODESTATE_DEAD;
4530+ recalculate_quorum(0);
4531+ }
4532+ }
4533+
4534+ return 0;
4535+}
4536+
4537+/* APIs for cluster ref counting. */
4538+int kcl_addref_cluster()
4539+{
4540+ int ret = -ENOTCONN;
4541+
4542+ if (!atomic_read(&cnxman_running))
4543+ goto addref_ret;
4544+
4545+ if (try_module_get(THIS_MODULE)) {
4546+ atomic_inc(&use_count);
4547+ ret = 0;
4548+ }
4549+
4550+ addref_ret:
4551+ return ret;
4552+}
4553+
4554+int kcl_releaseref_cluster()
4555+{
4556+ if (!atomic_read(&cnxman_running))
4557+ return -ENOTCONN;
4558+ atomic_dec(&use_count);
4559+ module_put(THIS_MODULE);
4560+ return 0;
4561+}
4562+
4563+int kcl_cluster_name(char **cname)
4564+{
4565+ char *name;
4566+
4567+ name = kmalloc(strlen(cluster_name) + 1, GFP_KERNEL);
4568+ if (!name)
4569+ return -ENOMEM;
4570+
4571+ strncpy(name, cluster_name, strlen(cluster_name)+1);
4572+ *cname = name;
4573+ return 0;
4574+}
4575+
4576+int kcl_get_current_interface(void)
4577+{
4578+ return current_interface->number;
4579+}
4580+
4581+/* Socket registration stuff */
4582+static struct net_proto_family cl_family_ops = {
4583+ .family = AF_CLUSTER,
4584+ .create = cl_create
4585+};
4586+
4587+static struct proto_ops cl_proto_ops = {
4588+ .family = AF_CLUSTER,
4589+
4590+ .release = cl_release,
4591+ .bind = cl_bind,
4592+ .connect = sock_no_connect,
4593+ .socketpair = sock_no_socketpair,
4594+ .accept = sock_no_accept,
4595+ .getname = cl_getname,
4596+ .poll = cl_poll,
4597+ .ioctl = cl_ioctl,
4598+ .listen = sock_no_listen,
4599+ .shutdown = cl_shutdown,
4600+ .setsockopt = cl_setsockopt,
4601+ .getsockopt = cl_getsockopt,
4602+ .sendmsg = cl_sendmsg,
4603+ .recvmsg = cl_recvmsg,
4604+ .mmap = sock_no_mmap,
4605+ .sendpage = sock_no_sendpage,
4606+};
4607+
4608+#ifdef MODULE
4609+MODULE_DESCRIPTION("Cluster Connection and Service Manager");
4610+MODULE_AUTHOR("Red Hat, Inc");
4611+MODULE_LICENSE("GPL");
4612+#endif
4613+
4614+static int __init cluster_init(void)
4615+{
4616+ printk("CMAN %s (built %s %s) installed\n",
4617+ CMAN_RELEASE_NAME, __DATE__, __TIME__);
4618+
4619+ /* allocate our sock slab cache */
4620+ cluster_sk_cachep = kmem_cache_create("cluster_sock",
4621+ sizeof (struct cluster_sock), 0,
4622+ SLAB_HWCACHE_ALIGN, 0, 0);
4623+ if (!cluster_sk_cachep) {
4624+ printk(KERN_CRIT
4625+ "cluster_init: Cannot create cluster_sock SLAB cache\n");
4626+ return -1;
4627+
4628+ }
4629+
4630+ if (sock_register(&cl_family_ops)) {
4631+ printk(KERN_INFO "Unable to register cluster socket type\n");
4632+ kmem_cache_destroy(cluster_sk_cachep);
4633+ return -1;
4634+ }
4635+
4636+
4637+#ifdef CONFIG_PROC_FS
4638+ create_proc_entries();
4639+#endif
4640+
4641+ init_MUTEX(&start_thread_sem);
4642+ init_MUTEX(&send_lock);
4643+ init_MUTEX(&barrier_list_lock);
4644+ init_MUTEX(&cluster_members_lock);
4645+ init_MUTEX(&port_array_lock);
4646+ init_MUTEX(&messages_list_lock);
4647+ init_MUTEX(&listenreq_lock);
4648+ init_MUTEX(&client_socket_lock);
4649+ init_MUTEX(&new_dead_node_lock);
4650+ init_MUTEX(&event_listener_lock);
4651+ init_MUTEX(&kernel_listener_lock);
4652+ init_MUTEX(&tempnode_lock);
4653+ spin_lock_init(&active_socket_lock);
4654+ init_timer(&ack_timer);
4655+
4656+ INIT_LIST_HEAD(&event_listener_list);
4657+ INIT_LIST_HEAD(&kernel_listener_list);
4658+ INIT_LIST_HEAD(&socket_list);
4659+ INIT_LIST_HEAD(&client_socket_list);
4660+ INIT_LIST_HEAD(&active_socket_list);
4661+ INIT_LIST_HEAD(&barrier_list);
4662+ INIT_LIST_HEAD(&messages_list);
4663+ INIT_LIST_HEAD(&listenreq_list);
4664+ INIT_LIST_HEAD(&cluster_members_list);
4665+ INIT_LIST_HEAD(&new_dead_node_list);
4666+ INIT_LIST_HEAD(&tempnode_list);
4667+
4668+ atomic_set(&cnxman_running, 0);
4669+
4670+ sm_init();
4671+
4672+ return 0;
4673+}
4674+
4675+static void __exit cluster_exit(void)
4676+{
4677+#ifdef CONFIG_PROC_FS
4678+ cleanup_proc_entries();
4679+#endif
4680+
4681+ sock_unregister(AF_CLUSTER);
4682+ kmem_cache_destroy(cluster_sk_cachep);
4683+}
4684+
4685+module_init(cluster_init);
4686+module_exit(cluster_exit);
4687+
4688+EXPORT_SYMBOL(kcl_sendmsg);
4689+EXPORT_SYMBOL(kcl_register_read_callback);
4690+EXPORT_SYMBOL(kcl_add_callback);
4691+EXPORT_SYMBOL(kcl_remove_callback);
4692+EXPORT_SYMBOL(kcl_get_members);
4693+EXPORT_SYMBOL(kcl_get_member_ids);
4694+EXPORT_SYMBOL(kcl_get_all_members);
4695+EXPORT_SYMBOL(kcl_is_quorate);
4696+EXPORT_SYMBOL(kcl_get_node_by_addr);
4697+EXPORT_SYMBOL(kcl_get_node_by_name);
4698+EXPORT_SYMBOL(kcl_get_node_by_nodeid);
4699+EXPORT_SYMBOL(kcl_get_node_addresses);
4700+EXPORT_SYMBOL(kcl_addref_cluster);
4701+EXPORT_SYMBOL(kcl_releaseref_cluster);
4702+EXPORT_SYMBOL(kcl_cluster_name);
4703+
4704+EXPORT_SYMBOL(kcl_barrier_register);
4705+EXPORT_SYMBOL(kcl_barrier_setattr);
4706+EXPORT_SYMBOL(kcl_barrier_delete);
4707+EXPORT_SYMBOL(kcl_barrier_wait);
4708+EXPORT_SYMBOL(kcl_barrier_cancel);
4709+
4710+EXPORT_SYMBOL(kcl_register_quorum_device);
4711+EXPORT_SYMBOL(kcl_unregister_quorum_device);
4712+EXPORT_SYMBOL(kcl_quorum_device_available);
4713+
4714+EXPORT_SYMBOL(kcl_register_service);
4715+EXPORT_SYMBOL(kcl_unregister_service);
4716+EXPORT_SYMBOL(kcl_join_service);
4717+EXPORT_SYMBOL(kcl_leave_service);
4718+EXPORT_SYMBOL(kcl_global_service_id);
4719+EXPORT_SYMBOL(kcl_start_done);
4720+EXPORT_SYMBOL(kcl_get_services);
4721+EXPORT_SYMBOL(kcl_get_current_interface);
4722+
4723+/*
4724+ * Overrides for Emacs so that we follow Linus's tabbing style.
4725+ * Emacs will notice this stuff at the end of the file and automatically
4726+ * adjust the settings for this buffer only. This must remain at the end
4727+ * of the file.
4728+ * ---------------------------------------------------------------------------
4729+ * Local variables:
4730+ * c-file-style: "linux"
4731+ * End:
4732+ */
4733diff -urN linux-orig/cluster/cman/config.c linux-patched/cluster/cman/config.c
4734--- linux-orig/cluster/cman/config.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 4735+++ linux-patched/cluster/cman/config.c 2004-06-29 20:07:50.000000000 +0800
4bf12011 4736@@ -0,0 +1,46 @@
4737+/******************************************************************************
4738+*******************************************************************************
4739+**
4740+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4741+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4742+**
4743+** This copyrighted material is made available to anyone wishing to use,
4744+** modify, copy, or redistribute it subject to the terms and conditions
4745+** of the GNU General Public License v.2.
4746+**
4747+*******************************************************************************
4748+******************************************************************************/
4749+
4750+#include "config.h"
4751+
4752+/* Config file defaults */
4753+
4754+#define DEFAULT_JOIN_WAIT_TIME 11 /* Time to wait while sending JOINREQ
4755+ * messages. Should be at least twice
4756+ * the HELLO timer */
4757+#define DEFAULT_JOIN_TIMEOUT 30 /* How long we wait after getting a
4758+ * JOINACK to regarding that node as
4759+ * dead */
4760+#define DEFAULT_HELLO_TIMER 5 /* Period between HELLO messages */
4761+#define DEFAULT_DEADNODE_TIMER 21 /* If we don't get a message from a
4762+ * node in this period kill it */
4763+#define DEFAULT_TRANSITION_TIMER 15 /* Maximum time a state transition
4764+ * should take */
4765+#define DEFAULT_JOINCONF_TIMER 5 /* Time allowed to a node to respond to
4766+ * a JOINCONF message */
4767+#define DEFAULT_MAX_NODES 128 /* Max allowed nodes */
4768+#define DEFAULT_TRANSITION_RESTARTS 10 /* Maximum number of transition
4769+ * restarts before we die */
4770+#define DEFAULT_SM_DEBUG_SIZE 256 /* Size in bytes of SM debug buffer */
4771+
4772+struct config_info cman_config = {
4773+ .joinwait_timeout = DEFAULT_JOIN_WAIT_TIME,
4774+ .joinconf_timeout = DEFAULT_JOINCONF_TIMER,
4775+ .join_timeout = DEFAULT_JOIN_TIMEOUT,
4776+ .hello_timer = DEFAULT_HELLO_TIMER,
4777+ .deadnode_timeout = DEFAULT_DEADNODE_TIMER,
4778+ .transition_timeout = DEFAULT_TRANSITION_TIMER,
4779+ .transition_restarts = DEFAULT_TRANSITION_RESTARTS,
4780+ .max_nodes = DEFAULT_MAX_NODES,
4781+ .sm_debug_size = DEFAULT_SM_DEBUG_SIZE,
4782+};
4783diff -urN linux-orig/cluster/cman/config.h linux-patched/cluster/cman/config.h
4784--- linux-orig/cluster/cman/config.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 4785+++ linux-patched/cluster/cman/config.h 2004-06-29 20:07:50.000000000 +0800
4bf12011 4786@@ -0,0 +1,31 @@
4787+/******************************************************************************
4788+*******************************************************************************
4789+**
4790+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4791+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4792+**
4793+** This copyrighted material is made available to anyone wishing to use,
4794+** modify, copy, or redistribute it subject to the terms and conditions
4795+** of the GNU General Public License v.2.
4796+**
4797+*******************************************************************************
4798+******************************************************************************/
4799+
4800+#ifndef __CONFIG_DOT_H__
4801+#define __CONFIG_DOT_H__
4802+
4803+struct config_info {
4804+ int joinwait_timeout;
4805+ int joinconf_timeout;
4806+ int join_timeout;
4807+ int hello_timer;
4808+ int deadnode_timeout;
4809+ int transition_timeout;
4810+ int transition_restarts;
4811+ int max_nodes;
4812+ int sm_debug_size;
4813+};
4814+
4815+extern struct config_info cman_config;
4816+
4817+#endif /* __CONFIG_DOT_H__ */
4818diff -urN linux-orig/cluster/cman/kjoin.c linux-patched/cluster/cman/kjoin.c
4819--- linux-orig/cluster/cman/kjoin.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 4820+++ linux-patched/cluster/cman/kjoin.c 2004-06-29 20:07:50.000000000 +0800
4bf12011 4821@@ -0,0 +1,238 @@
4822+/******************************************************************************
4823+*******************************************************************************
4824+**
4825+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4826+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4827+**
4828+** This copyrighted material is made available to anyone wishing to use,
4829+** modify, copy, or redistribute it subject to the terms and conditions
4830+** of the GNU General Public License v.2.
4831+**
4832+*******************************************************************************
4833+******************************************************************************/
4834+
4835+#include <linux/socket.h>
4836+#include <net/sock.h>
4837+#include <linux/list.h>
4838+#include <cluster/cnxman.h>
4839+#include <linux/in.h>
4840+
4841+#include "cnxman-private.h"
4842+
4843+static struct socket *mcast_sock;
4844+static struct socket *recv_sock;
4845+static struct socket *cluster_sock;
4846+
4847+extern short cluster_id;
4848+extern int join_count;
4849+extern struct semaphore join_count_lock;
4850+extern atomic_t cnxman_running;
4851+
4852+int kcl_join_cluster(struct cl_join_cluster_info *join_info)
4853+{
4854+ int result;
4855+ int one = 1, error;
4856+ unsigned int ipaddr = join_info->ipaddr, brdaddr = join_info->brdaddr;
4857+ unsigned short port = join_info->port;
4858+ mm_segment_t fs;
4859+ struct sockaddr_in saddr;
4860+ struct kcl_multicast_sock mcast_info;
4861+
4862+ down(&join_count_lock);
4863+ if (atomic_read(&cnxman_running))
4864+ {
4865+ error = 0;
4866+ if (join_info->cluster_id == cluster_id)
4867+ join_count++;
4868+ else
4869+ error = -EINVAL;
4870+ up(&join_count_lock);
4871+ return error;
4872+ }
4873+ up(&join_count_lock);
4874+
4875+ result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &mcast_sock);
4876+ if (result < 0)
4877+ {
4878+ printk(KERN_ERR CMAN_NAME ": Can't create Multicast socket\n");
4879+ return result;
4880+ }
4881+
4882+ result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &recv_sock);
4883+ if (result < 0)
4884+ {
4885+ printk(KERN_ERR CMAN_NAME ": Can't create Receive socket\n");
4886+ return result;
4887+ }
4888+
4889+ fs = get_fs();
4890+ set_fs(get_ds());
4891+
4892+ if ((error = sock_setsockopt(mcast_sock, SOL_SOCKET, SO_BROADCAST,
4893+ (void *) &one, sizeof (int))))
4894+ {
4895+ set_fs(fs);
4896+ printk("Error %d Setting master socket to SO_BROADCAST\n",
4897+ error);
4898+ sock_release(mcast_sock);
4899+ return -1;
4900+ }
4901+ set_fs(fs);
4902+
4903+ /* Bind the multicast socket */
4904+ saddr.sin_family = AF_INET;
4905+ saddr.sin_port = htons(port);
4906+ saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
4907+ result =
4908+ mcast_sock->ops->bind(mcast_sock, (struct sockaddr *) &saddr,
4909+ sizeof (saddr));
4910+ if (result < 0)
4911+ {
4912+ printk(KERN_ERR CMAN_NAME ": Can't bind multicast socket\n");
4913+ sock_release(mcast_sock);
4914+ sock_release(recv_sock);
4915+ return result;
4916+ }
4917+
4918+ /* Bind the receive socket to our IP address */
4919+ saddr.sin_family = AF_INET;
4920+ saddr.sin_port = htons(port);
4921+ saddr.sin_addr.s_addr = cpu_to_be32(ipaddr);
4922+ result =
4923+ recv_sock->ops->bind(recv_sock, (struct sockaddr *) &saddr,
4924+ sizeof (saddr));
4925+ if (result < 0)
4926+ {
4927+ printk(KERN_ERR CMAN_NAME ": Can't bind receive socket\n");
4928+ sock_release(mcast_sock);
4929+ sock_release(recv_sock);
4930+ return result;
4931+ }
4932+
4933+ /* Create the cluster master socket */
4934+ result =
4935+ sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER, &cluster_sock);
4936+ if (result < 0)
4937+ {
4938+ printk(KERN_ERR CMAN_NAME
4939+ ": Can't create cluster master socket\n");
4940+ sock_release(mcast_sock);
4941+ sock_release(recv_sock);
4942+ return result;
4943+ }
4944+
4945+ /* This is the broadcast transmit address */
4946+ saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
4947+
4948+ /* Pass the multicast socket to kernel space */
4949+ mcast_info.sock = mcast_sock;
4950+ mcast_info.number = 1;
4951+
4952+ fs = get_fs();
4953+ set_fs(get_ds());
4954+
4955+ if ((error = cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
4956+ KCL_SET_MULTICAST,
4957+ (void *) &mcast_info,
4958+ sizeof (mcast_info))))
4959+ {
4960+ set_fs(fs);
4961+ printk(CMAN_NAME
4962+ ": Unable to pass multicast socket to cnxman, %d\n",
4963+ error);
4964+ sock_release(mcast_sock);
4965+ sock_release(recv_sock);
4966+ sock_release(cluster_sock);
4967+ return -1;
4968+ }
4969+
4970+ mcast_info.sock = recv_sock;
4971+ if ((error =
4972+ cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
4973+ KCL_SET_RCVONLY,
4974+ (void *) &mcast_info,
4975+ sizeof (mcast_info))))
4976+ {
4977+ set_fs(fs);
4978+ printk(CMAN_NAME
4979+ ": Unable to pass receive socket to cnxman, %d\n",
4980+ error);
4981+ sock_release(mcast_sock);
4982+ sock_release(recv_sock);
4983+ sock_release(cluster_sock);
4984+ return -1;
4985+ }
4986+
4987+ /* This setsockopt expects usermode variables */
4988+
4989+ if (cluster_sock->ops->
4990+ setsockopt(cluster_sock, CLPROTO_MASTER, CLU_JOIN_CLUSTER,
4991+ (void *) join_info,
4992+ sizeof (struct cl_join_cluster_info)))
4993+
4994+ {
4995+ set_fs(fs);
4996+ printk(CMAN_NAME ": Unable to join cluster\n");
4997+ sock_release(mcast_sock);
4998+ sock_release(recv_sock);
4999+ sock_release(cluster_sock);
5000+ return -1;
5001+ }
5002+ set_fs(fs);
5003+
5004+ return 0;
5005+}
5006+
5007+int kcl_leave_cluster(int remove)
5008+{
5009+ mm_segment_t fs;
5010+ int rem = remove;
5011+ int ret = 0;
5012+ struct socket *shutdown_sock = cluster_sock;
5013+
5014+ cluster_sock = NULL;
5015+
5016+ if (!shutdown_sock)
5017+ {
5018+ /* Create the cluster master socket */
5019+ int result =
5020+ sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER,
5021+ &shutdown_sock);
5022+ if (result < 0)
5023+ {
5024+ printk(KERN_ERR CMAN_NAME
5025+ ": Can't create cluster master socket\n");
5026+ sock_release(mcast_sock);
5027+ sock_release(recv_sock);
5028+ return result;
5029+ }
5030+ }
5031+
5032+ fs = get_fs();
5033+ set_fs(get_ds());
5034+
5035+ if ((ret =
5036+ shutdown_sock->ops->setsockopt(shutdown_sock, CLPROTO_MASTER,
5037+ CLU_LEAVE_CLUSTER, (void *) &rem,
5038+ sizeof (int))))
5039+ {
5040+ printk(KERN_ERR CMAN_NAME ": Unable to leave cluster, %d\n",
5041+ ret);
5042+ }
5043+ set_fs(fs);
5044+
5045+ sock_release(shutdown_sock);
5046+
5047+ return ret;
5048+}
5049+
5050+/*
5051+ * Overrides for Emacs so that we follow Linus's tabbing style.
5052+ * Emacs will notice this stuff at the end of the file and automatically
5053+ * adjust the settings for this buffer only. This must remain at the end
5054+ * of the file.
5055+ * ---------------------------------------------------------------------------
5056+ * Local variables:
5057+ * c-file-style: "linux"
5058+ * End:
5059+ */
5060diff -urN linux-orig/cluster/cman/membership.c linux-patched/cluster/cman/membership.c
5061--- linux-orig/cluster/cman/membership.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 5062+++ linux-patched/cluster/cman/membership.c 2004-06-29 20:07:50.000000000 +0800
4bf12011 5063@@ -0,0 +1,3069 @@
5064+/******************************************************************************
5065+*******************************************************************************
5066+**
5067+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5068+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5069+**
5070+** This copyrighted material is made available to anyone wishing to use,
5071+** modify, copy, or redistribute it subject to the terms and conditions
5072+** of the GNU General Public License v.2.
5073+**
5074+*******************************************************************************
5075+******************************************************************************/
5076+
5077+#include <linux/socket.h>
5078+#include <net/sock.h>
5079+#include <linux/slab.h>
5080+#include <linux/spinlock.h>
5081+#include <linux/vmalloc.h>
5082+#include <asm/uaccess.h>
5083+#include <linux/list.h>
5084+#include <cluster/cnxman.h>
5085+
5086+#include "cnxman-private.h"
5087+#include "config.h"
5088+#include "sm_control.h"
5089+
5090+#ifndef TRUE
5091+#define TRUE 1
5092+#endif
5093+
5094+/* Barrier name for membership transitions. %d is the cluster generation number
5095+ */
5096+#define MEMBERSHIP_BARRIER_NAME "TRANSITION.%d"
5097+
5098+/* Variables also used by connection manager */
5099+struct list_head cluster_members_list;
5100+struct semaphore cluster_members_lock;
5101+int cluster_members; /* Number of ACTIVE members, not a count of
5102+ * nodes in the list */
5103+int we_are_a_cluster_member = 0;
5104+int cluster_is_quorate;
5105+int quit_threads = 0;
5106+struct task_struct *membership_task;
5107+struct cluster_node *us;
5108+
5109+static struct task_struct *hello_task;
5110+static struct semaphore hello_task_lock;
5111+
5112+/* Variables that belong to the connection manager */
5113+extern wait_queue_head_t cnxman_waitq;
5114+extern struct completion member_thread_comp;
5115+extern struct cluster_node *quorum_device;
5116+extern unsigned short two_node;
5117+extern char cluster_name[];
5118+extern unsigned int config_version;
5119+extern unsigned int address_length;
5120+
5121+static struct socket *mem_socket;
5122+static pid_t kcluster_pid;
5123+
5124+static char iobuf[MAX_CLUSTER_MESSAGE];
5125+static char scratchbuf[MAX_CLUSTER_MESSAGE + 100];
5126+
5127+/* Our node name, usually system_utsname.nodename, but can be overridden */
5128+char nodename[MAX_CLUSTER_MEMBER_NAME_LEN + 1];
5129+
5130+static spinlock_t members_by_nodeid_lock;
5131+static int sizeof_members_array = 0; /* Can dynamically increase (vmalloc
5132+ * permitting) */
5133+static struct cluster_node **members_by_nodeid;
5134+
5135+#define MEMBER_INCREMENT_SIZE 10
5136+
5137+static int votes = 1; /* Votes this node has */
5138+static int expected_votes = 1; /* Total expected votes in the cluster */
5139+static unsigned int quorum; /* Quorum, fewer votes than this and we stop
5140+ * work */
5141+static int leavereason; /* Saved for the duration of a state transition */
5142+static int transitionreason; /* Reason this transition was initiated */
5143+static unsigned int highest_nodeid; /* Highest node ID known to the cluster */
5144+static struct timer_list transition_timer; /* Kicks in if the transition
5145+ * doesn't complete in a
5146+ * reasonable time */
5147+static struct timer_list hello_timer; /* Timer to send HELLOs on */
5148+static unsigned long join_time; /* The time that we got our JOIN-ACK */
5149+static unsigned long start_time; /* The time that we were started */
5150+static int joinconf_count; /* Number of JOINCONF messages we have sent to
5151+ * a new node */
5152+static unsigned long wake_flags;/* Reason we were woken */
5153+
5154+/* Flags in above */
5155+#define WAKE_FLAG_DEADNODE 1
5156+#define WAKE_FLAG_TRANSTIMER 2
5157+
5158+/* The time the transition finished */
5159+static unsigned long transition_end_time;
5160+
5161+/* A list of nodes that cnxman tells us are dead. I hope this never has more
5162+ * than one element in it but I can't take that chance. only non-static so it
5163+ * can be initialised in module_load. */
5164+struct list_head new_dead_node_list;
5165+struct semaphore new_dead_node_lock;
5166+
5167+static int do_membership_packet(struct msghdr *msg, int len);
5168+static int do_process_joinreq(struct msghdr *msg, int len);
5169+static int do_process_joinack(struct msghdr *msg, int len);
5170+static int do_process_joinconf(struct msghdr *msg, int len);
5171+static int do_process_leave(struct msghdr *msg, int len);
5172+static int do_process_hello(struct msghdr *msg, int len);
5173+static int do_process_kill(struct msghdr *msg, int len);
5174+static int do_process_reconfig(struct msghdr *msg, int len);
5175+static int do_process_starttrans(struct msghdr *msg, int len);
5176+static int do_process_masterview(struct msghdr *msg, int len);
5177+static int do_process_endtrans(struct msghdr *msg, int len);
5178+static int do_process_viewack(struct msghdr *msg, int len);
5179+static int do_process_startack(struct msghdr *msg, int len);
5180+static int do_process_newcluster(struct msghdr *msg, int len);
5181+static int do_process_nominate(struct msghdr *msg, int len);
5182+static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
5183+ unsigned int flags);
5184+static int send_joinreq(struct sockaddr_cl *addr, int addr_len);
5185+static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id);
5186+static int send_hello(void);
5187+static int send_master_hello(void);
5188+static int send_newcluster(void);
5189+static int end_transition(void);
5190+static int dispatch_messages(struct socket *mem_socket);
5191+static void check_for_dead_nodes(void);
5192+static void confirm_joiner(void);
5193+static void reset_hello_time(void);
5194+static int add_us(void);
5195+static int send_joinconf(void);
5196+static int init_membership_services(void);
5197+static int elect_master(struct cluster_node **);
5198+static void trans_timer_expired(unsigned long arg);
5199+static void hello_timer_expired(unsigned long arg);
5200+static void join_or_form_cluster(void);
5201+static int do_timer_wakeup(void);
5202+static int start_transition(unsigned char reason, struct cluster_node *node);
5203+int send_leave(unsigned char);
5204+int send_reconfigure(int, unsigned int);
5205+
5206+#ifdef DEBUG_MEMB
5207+static char *msgname(int msg);
5208+static int debug_sendmsg(struct socket *sock, void *buf, int size,
5209+ struct sockaddr_cl *caddr, int addr_len,
5210+ unsigned int flags)
5211+{
5212+ P_MEMB("%ld: sending %s, len=%d\n", jiffies, msgname(((char *) buf)[0]),
5213+ size);
5214+ return kcl_sendmsg(sock, buf, size, caddr, addr_len, flags);
5215+}
5216+
5217+#define kcl_sendmsg debug_sendmsg
5218+#endif
5219+
/* State of the node */
static enum { STARTING, JOINING, JOINWAIT, JOINACK, TRANSITION,
	      TRANSITION_COMPLETE, MEMBER, REJECTED, LEFT_CLUSTER, MASTER
} node_state = STARTING;

/* Sub-state when we are MASTER */
static enum { MASTER_START, MASTER_COLLECT, MASTER_CONFIRM,
	      MASTER_COMPLETE } master_state;

/* Number of responses collected while a master controlling a state transition */
static int responses_collected;
static int responses_expected;

/* Current cluster generation number */
static int cluster_generation = 1;

/* When another node initiates a transition then store its pointer in here so
 * we can check for other nodes trying to spoof us */
static struct cluster_node *master_node = NULL;

/* The node wanting to join us */
static struct cluster_node *joining_node = NULL;
static int joining_temp_nodeid = 0;

/* Last time a HELLO message was sent */
unsigned long last_hello = 0;

/* When we got our JOINWAIT or NEWCLUSTER */
unsigned long joinwait_time = 0;

/* Number of times a transition has restarted when we were master */
int transition_restarts = 0;

/* Variables used by the master to collect cluster status during a transition */
static int agreeing_nodes = 0;
static int dissenting_nodes = 0;
static uint8_t *node_opinion = NULL;
#define OPINION_AGREE    1
#define OPINION_DISAGREE 2
5259+
5260+/* Set node id of a node, also add it to the members array and expand the array
5261+ * if necessary */
5262+static inline void set_nodeid(struct cluster_node *node, int nodeid)
5263+{
5264+ if (!nodeid)
5265+ return;
5266+
5267+ node->node_id = nodeid;
5268+ if (nodeid > sizeof_members_array) {
5269+ int new_size = sizeof_members_array + MEMBER_INCREMENT_SIZE;
5270+ struct cluster_node **new_array =
5271+ vmalloc((new_size) * sizeof (struct cluster_node *));
5272+ if (new_array) {
5273+ spin_lock(&members_by_nodeid_lock);
5274+ memcpy(new_array, members_by_nodeid,
5275+ sizeof_members_array *
5276+ sizeof (struct cluster_node *));
5277+ memset(&new_array[sizeof_members_array], 0,
5278+ MEMBER_INCREMENT_SIZE *
5279+ sizeof (struct cluster_node *));
5280+ vfree(members_by_nodeid);
5281+ members_by_nodeid = new_array;
5282+ sizeof_members_array = new_size;
5283+ spin_unlock(&members_by_nodeid_lock);
5284+ }
5285+ else {
5286+ panic("No memory for more nodes");
5287+ }
5288+ }
5289+ notify_kernel_listeners(NEWNODE, (long) nodeid);
5290+
5291+ spin_lock(&members_by_nodeid_lock);
5292+ members_by_nodeid[nodeid] = node;
5293+ spin_unlock(&members_by_nodeid_lock);
5294+}
5295+
/* Heartbeat kernel thread ("cman_hbeat"): periodically broadcasts HELLO
 * messages and, once we are a full member, scans for dead nodes.  Woken
 * by hello_timer_expired() (or by the membership thread on shutdown);
 * exits when the node leaves or is rejected from the cluster. */
static int hello_kthread(void *unused)
{
	struct task_struct *tsk = current;
	sigset_t tmpsig;

	daemonize("cman_hbeat");

	/* Block everything but SIGKILL/SIGSTOP/SIGTERM */
	/* NOTE(review): siginitset() takes a bit mask; ORing raw signal
	 * numbers here looks suspect — confirm whether
	 * sigmask(SIGKILL)|sigmask(SIGSTOP)|sigmask(SIGTERM) was intended. */
	siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
	sigprocmask(SIG_BLOCK, &tmpsig, NULL);

	/* Publish our task pointer so other contexts can wake us */
	down(&hello_task_lock);
	hello_task = tsk;
	up(&hello_task_lock);

	set_user_nice(current, -6);

	while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
		send_hello();

		/* Scan the nodes list for dead nodes */
		if (node_state == MEMBER)
			check_for_dead_nodes();

		/* Sleep until the hello timer (or shutdown) wakes us */
		set_task_state(current, TASK_INTERRUPTIBLE);
		schedule();
		set_task_state(current, TASK_RUNNING);
	}
	/* Clear the task pointer under the lock before exiting */
	down(&hello_task_lock);
	hello_task = NULL;
	up(&hello_task_lock);
	P_MEMB("heartbeat closing down\n");
	return 0;
}
5330+
/* This is the membership "daemon" ("cman_memb").  A client of cnxman (but
 * symbiotic with it) that keeps track of and controls cluster membership.
 * It sleeps on the membership socket's waitqueue and is woken either by
 * incoming membership messages or by the WAKE_FLAG_* bits set from timer
 * and dead-node callbacks. */
static int membership_kthread(void *unused)
{
	struct task_struct *tsk = current;
	struct socket *tmp_socket;
	sigset_t tmpsig;

	daemonize("cman_memb");

	/* Block everything but SIGKILL/SIGSTOP/SIGTERM */
	/* NOTE(review): as in hello_kthread(), ORing raw signal numbers
	 * into siginitset() looks suspect — confirm the intended mask. */
	siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
	sigprocmask(SIG_BLOCK, &tmpsig, NULL);

	membership_task = tsk;
	set_user_nice(current, -5);

	/* Open the socket */
	if (init_membership_services())
		return -1;

	add_us();
	joining_node = us;

	init_timer(&hello_timer);
	hello_timer.function = hello_timer_expired;
	hello_timer.data = 0L;

	/* Do joining stuff */
	join_or_form_cluster();

	transition_end_time = jiffies;

	/* Main loop */
	while (node_state != REJECTED && node_state != LEFT_CLUSTER) {

		struct task_struct *tsk = current;

		DECLARE_WAITQUEUE(wait, tsk);

		/* Sleep only if there is nothing queued on the socket and
		 * no wake flags are pending; JOINACK/JOINWAIT states poll
		 * with a 1s timeout so their timeouts below can fire */
		tsk->state = TASK_INTERRUPTIBLE;
		add_wait_queue(mem_socket->sk->sk_sleep, &wait);

		if (!skb_peek(&mem_socket->sk->sk_receive_queue) &&
		    wake_flags == 0) {
			if (node_state == JOINACK ||
			    node_state == JOINWAIT)
				schedule_timeout(HZ);
			else
				schedule();
		}

		tsk->state = TASK_RUNNING;
		remove_wait_queue(mem_socket->sk->sk_sleep, &wait);

		/* Are we being shut down? */
		if (node_state == LEFT_CLUSTER || quit_threads ||
		    signal_pending(current))
			break;

		/* Were we woken by a dead node passed down from cnxman ? */
		if (test_and_clear_bit(WAKE_FLAG_DEADNODE, &wake_flags)) {
			struct list_head *nodelist, *tmp;
			struct cl_new_dead_node *deadnode;

			/* Drain the queued dead-node list under its lock */
			down(&new_dead_node_lock);
			list_for_each_safe(nodelist, tmp, &new_dead_node_list) {
				deadnode =
				    list_entry(nodelist,
					       struct cl_new_dead_node, list);

				if (deadnode->node->state == NODESTATE_MEMBER)
					a_node_just_died(deadnode->node);
				list_del(&deadnode->list);
				kfree(deadnode);
			}
			up(&new_dead_node_lock);
		}

		/* Process received messages. If dispatch_message() returns an
		 * error then we shut down */
		if (skb_peek(&mem_socket->sk->sk_receive_queue)) {
			if (dispatch_messages(mem_socket) < 0)
				goto leave_cluster;

		}

		/* Were we woken by the transition timer firing ? */
		if (test_and_clear_bit(WAKE_FLAG_TRANSTIMER, &wake_flags)) {
			switch (do_timer_wakeup()) {
			case -1:
				continue;
			case 0:
				break;
			case +1:
				goto leave_cluster;
			}
		}

		/* Got a JOINACK but no JOIN-CONF, start waiting for HELLO
		 * messages again */
		if (node_state == JOINACK
		    && time_after(jiffies,
				  join_time + cman_config.join_timeout * HZ)) {
			P_MEMB
			    ("Waited a long time for a join-conf, going back to JOINWAIT state\n");
			node_state = JOINWAIT;
			joinwait_time = jiffies;
		}

		/* Have we been in joinwait for too long... */
		if (node_state == JOINWAIT
		    && time_after(jiffies, joinwait_time +
				  cman_config.join_timeout * HZ)) {
			printk(CMAN_NAME
			       ": Been in JOINWAIT for too long - giving up\n");
			goto leave_cluster;
		}
	}

      leave_cluster:

	/* Wake up the heartbeat thread so it can exit */
	down(&hello_task_lock);
	if (hello_task)
		wake_up_process(hello_task);
	up(&hello_task_lock);

	if (timer_pending(&hello_timer))
		del_timer(&hello_timer);

	if (timer_pending(&transition_timer))
		del_timer(&transition_timer);

	node_state = LEFT_CLUSTER;
	P_MEMB("closing down\n");
	quit_threads = 1;	/* force other thread to exit too */

	/* Close the socket, NULL the pointer first so it doesn't get used
	 * by send_leave()
	 */
	tmp_socket = mem_socket;
	mem_socket = NULL;
	sock_release(tmp_socket);
	highest_nodeid = 0;
	complete(&member_thread_comp);
	return 0;
}
5479+
5480+/* Things to do in the main thread when the transition timer has woken us.
5481+ * Usually this happens when a transition is taking too long and we need to
5482+ * take remedial action.
5483+ *
5484+ * returns: -1 continue; 0 carry on processing +1 leave cluster; */
5485+static int do_timer_wakeup()
5486+{
5487+ P_MEMB("Timer wakeup - checking for dead master node %ld\n", jiffies);
5488+
5489+ /* Resend JOINCONF if it got lost on the wire */
5490+ if (node_state == MASTER && master_state == MASTER_CONFIRM) {
5491+ mod_timer(&transition_timer,
5492+ jiffies + cman_config.joinconf_timeout * HZ);
5493+ if (++joinconf_count < MAX_RETRIES) {
5494+ P_MEMB("Resending JOINCONF\n");
5495+ send_joinconf();
5496+ }
5497+ else {
5498+ P_MEMB("JOINCONF not acked, cancelling transition\n");
5499+ end_transition();
5500+ }
5501+ return -1;
5502+ }
5503+
5504+ /* A joining node probably died */
5505+ if (cluster_members == 1) {
5506+ end_transition();
5507+ return -1;
5508+ }
5509+
5510+ /* See if the master is still there */
5511+ if (node_state == TRANSITION || node_state == TRANSITION_COMPLETE) {
5512+
5513+ /* If we are in transition and master_node is NULL then we are
5514+ * waiting for ENDTRANS after JOIN-CONF */
5515+ if (!master_node) {
5516+ /* Hmmm. master died after sending JOINCONF, we'll have
5517+ * to die as we are in mid-transition */
5518+ printk(KERN_INFO CMAN_NAME
5519+ ": Master died after JOINCONF, we must leave the cluster\n");
5520+ quit_threads = 1;
5521+ return +1;
5522+ }
5523+
5524+ /* No messages from the master - see if it's stil there */
5525+ if (master_node->state == NODESTATE_MEMBER) {
5526+ send_master_hello();
5527+ mod_timer(&transition_timer,
5528+ jiffies +
5529+ cman_config.transition_timeout * HZ);
5530+ }
5531+
5532+ /* If the master is dead then elect a new one */
5533+ if (master_node->state == NODESTATE_DEAD) {
5534+
5535+ struct cluster_node *node;
5536+
5537+ P_MEMB("Master node is dead...Election!\n");
5538+ if (elect_master(&node)) {
5539+
5540+ /* We are master now, all kneel */
5541+ start_transition(TRANS_DEADMASTER, master_node);
5542+ }
5543+ else {
5544+ /* Leave the job to someone on more pay */
5545+ master_node = node;
5546+ mod_timer(&transition_timer,
5547+ jiffies +
5548+ cman_config.transition_timeout * HZ);
5549+ }
5550+ }
5551+ }
5552+
5553+ /* If we are the master node then restart the transition */
5554+ if (node_state == MASTER) {
5555+ start_transition(TRANS_RESTART, us);
5556+ }
5557+
5558+ return 0;
5559+}
5560+
5561+static void form_cluster(void)
5562+{
5563+ printk(KERN_INFO CMAN_NAME ": forming a new cluster\n");
5564+ node_state = MEMBER;
5565+ we_are_a_cluster_member = TRUE;
5566+ us->node_id = 1;
5567+ us->state = NODESTATE_MEMBER;
5568+ set_nodeid(us, 1);
5569+ recalculate_quorum(0);
5570+ sm_member_update(cluster_is_quorate);
5571+ send_hello();
5572+ kernel_thread(hello_kthread, NULL, 0);
5573+ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
5574+}
5575+
5576+/* This does the initial JOIN part of the membership process. Actually most of
5577+ * is done in the message processing routines but this is the main loop that
5578+ * controls it. The side-effect of this routine is "node_state" which tells the
5579+ * real main loop (in the kernel thread routine) what to do next */
5580+static void join_or_form_cluster()
5581+{
5582+ start_time = jiffies;
5583+
5584+ printk(KERN_INFO CMAN_NAME
5585+ ": Waiting to join or form a Linux-cluster\n");
5586+ join_time = 0;
5587+ start_time = jiffies;
5588+ joinwait_time = jiffies;
5589+ last_hello = 0;
5590+ send_newcluster();
5591+
5592+ /* Listen for a reply */
5593+ do {
5594+ DECLARE_WAITQUEUE(wait, current);
5595+ set_task_state(current, TASK_INTERRUPTIBLE);
5596+ add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5597+
5598+ if (!skb_peek(&mem_socket->sk->sk_receive_queue))
5599+ schedule_timeout((cman_config.joinwait_timeout * HZ) /
5600+ 5);
5601+
5602+ set_task_state(current, TASK_RUNNING);
5603+ remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5604+
5605+ while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5606+ dispatch_messages(mem_socket);
5607+ }
5608+ if (quit_threads)
5609+ node_state = LEFT_CLUSTER;
5610+
5611+ }
5612+ while (time_before(jiffies, start_time + cman_config.joinwait_timeout * HZ) &&
5613+ node_state == STARTING);
5614+
5615+ /* If we didn't hear any HELLO messages then form a new cluster */
5616+ if (node_state == STARTING) {
5617+ form_cluster();
5618+ }
5619+ else
5620+ last_hello = jiffies;
5621+
5622+}
5623+
5624+int start_membership_services(pid_t cluster_pid)
5625+{
5626+ kcluster_pid = cluster_pid;
5627+
5628+ init_timer(&transition_timer);
5629+ transition_timer.function = trans_timer_expired;
5630+ transition_timer.data = 0L;
5631+
5632+ /* Start the thread */
5633+ return kernel_thread(membership_kthread, NULL, 0);
5634+}
5635+
5636+static int init_membership_services()
5637+{
5638+ int result;
5639+ struct sockaddr_cl saddr;
5640+ struct socket *sock;
5641+
5642+ init_MUTEX(&hello_task_lock);
5643+ /* Create a socket to communicate with */
5644+ result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
5645+ if (result < 0) {
5646+ printk(KERN_ERR CMAN_NAME
5647+ ": Can't create cluster socket for membership services\n");
5648+ return result;
5649+ }
5650+ mem_socket = sock;
5651+
5652+ /* Bind to our port */
5653+ saddr.scl_family = AF_CLUSTER;
5654+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5655+ result =
5656+ sock->ops->bind(sock, (struct sockaddr *) &saddr, sizeof (saddr));
5657+ if (result < 0) {
5658+ printk(KERN_ERR CMAN_NAME
5659+ ": Can't bind to cluster membership services port\n");
5660+ sock_release(sock);
5661+ return result;
5662+ }
5663+
5664+ node_state = STARTING;
5665+ return 0;
5666+}
5667+
5668+static int send_joinconf()
5669+{
5670+ struct sockaddr_cl saddr;
5671+ int status;
5672+
5673+ if (joining_temp_nodeid == 0) {
5674+ BUG();
5675+ }
5676+
5677+ master_state = MASTER_CONFIRM;
5678+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5679+ saddr.scl_family = AF_CLUSTER;
5680+ saddr.scl_nodeid = joining_temp_nodeid;
5681+ status = send_cluster_view(CLUSTER_MEM_JOINCONF, &saddr,
5682+ MSG_NOACK);
5683+
5684+ if (status < 0) {
5685+ printk("Error %d sending JOINCONF, aborting transition\n", status);
5686+ end_transition();
5687+ }
5688+ return status;
5689+}
5690+
/* Send a JOINREQ to the cluster we want to join.  The packet is built in
 * scratchbuf: a fixed cl_mem_join_msg header (votes, protocol/config
 * versions, cluster name) followed by all our interface addresses and
 * finally our NUL-terminated node name.  Multi-byte header fields are
 * little-endian on the wire. */
static int send_joinreq(struct sockaddr_cl *addr, int addr_len)
{
	char *msgbuf = scratchbuf;
	struct list_head *addrlist;
	int ptr = sizeof (struct cl_mem_join_msg);
	unsigned short num_addr = 0;
	struct cluster_node_addr *nodeaddr;
	struct cl_mem_join_msg *msg = (struct cl_mem_join_msg *) msgbuf;

	msg->cmd = CLUSTER_MEM_JOINREQ;
	msg->votes = votes;
	msg->expected_votes = cpu_to_le32(expected_votes);
	msg->major_version = cpu_to_le32(CNXMAN_MAJOR_VERSION);
	msg->minor_version = cpu_to_le32(CNXMAN_MINOR_VERSION);
	msg->patch_version = cpu_to_le32(CNXMAN_PATCH_VERSION);
	msg->config_version = cpu_to_le32(config_version);
	msg->addr_len = cpu_to_le32(address_length);
	strcpy(msg->clustername, cluster_name);

	/* Add our addresses, each address_length bytes, packed after the
	 * fixed header */
	list_for_each(addrlist, &us->addr_list) {
		nodeaddr = list_entry(addrlist, struct cluster_node_addr, list);

		memcpy(msgbuf + ptr, nodeaddr->addr, address_length);
		ptr += address_length;
		num_addr++;
	}
	msg->num_addr = cpu_to_le16(num_addr);

	/* And our name (including the trailing NUL) */
	strcpy(msgbuf + ptr, nodename);
	ptr += strlen(nodename) + 1;

	return kcl_sendmsg(mem_socket, msgbuf, ptr,
			   addr, addr_len, MSG_NOACK);
}
5727+
5728+static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id)
5729+{
5730+ struct cl_mem_startack_msg msg;
5731+
5732+ msg.cmd = CLUSTER_MEM_STARTACK;
5733+ msg.generation = cpu_to_le32(cluster_generation);
5734+ msg.node_id = cpu_to_le32(node_id);
5735+ msg.highest_node_id = cpu_to_le32(get_highest_nodeid());
5736+
5737+ return kcl_sendmsg(mem_socket, &msg, sizeof (msg), addr, addr_len, 0);
5738+}
5739+
5740+static int send_newcluster()
5741+{
5742+ char buf[1];
5743+
5744+ buf[0] = CLUSTER_MEM_NEWCLUSTER;
5745+
5746+ return kcl_sendmsg(mem_socket, buf, 1, NULL, 0,
5747+ MSG_NOACK);
5748+}
5749+
5750+static int send_hello()
5751+{
5752+ struct cl_mem_hello_msg hello_msg;
5753+ int status;
5754+
5755+ hello_msg.cmd = CLUSTER_MEM_HELLO;
5756+ hello_msg.members = cpu_to_le16(cluster_members);
5757+ hello_msg.flags = 0;
5758+ hello_msg.generation = cpu_to_le32(cluster_generation);
5759+
5760+ status =
5761+ kcl_sendmsg(mem_socket, &hello_msg, sizeof (hello_msg), NULL, 0,
5762+ MSG_NOACK | MSG_ALLINT);
5763+
5764+ last_hello = jiffies;
5765+
5766+ return status;
5767+}
5768+
5769+/* This is a special HELLO message that requires an ACK. clients in transition
5770+ * send these to the master to check it is till alive. if it does not ACK then
5771+ * cnxman will signal it dead and we can restart the transition */
5772+static int send_master_hello()
5773+{
5774+ struct cl_mem_hello_msg hello_msg;
5775+ int status;
5776+ struct sockaddr_cl saddr;
5777+
5778+ hello_msg.cmd = CLUSTER_MEM_HELLO;
5779+ hello_msg.members = cpu_to_le16(cluster_members);
5780+ hello_msg.flags = 1;
5781+ hello_msg.generation = cpu_to_le32(cluster_generation);
5782+
5783+ saddr.scl_family = AF_CLUSTER;
5784+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5785+ saddr.scl_nodeid = master_node->node_id;
5786+ status =
5787+ kcl_sendmsg(mem_socket, &hello_msg, sizeof (hello_msg),
5788+ &saddr, sizeof (saddr), 0);
5789+
5790+ last_hello = jiffies;
5791+
5792+ return status;
5793+}
5794+
/* Called when the transition timer has expired, meaning we sent a transition
 * message that was not ACKed.  Runs in timer (softirq) context, so it only
 * flags the event: set the wake bit first so it is visible before the
 * membership thread is woken to handle it in do_timer_wakeup(). */
static void trans_timer_expired(unsigned long arg)
{
	P_MEMB("Transition timer fired %ld\n", jiffies);

	set_bit(WAKE_FLAG_TRANSTIMER, &wake_flags);
	wake_up_process(membership_task);
}
5804+
/* Periodic hello timer callback (softirq context).  Re-arms itself and,
 * once we are at least in TRANSITION state, wakes the heartbeat thread
 * which does the actual HELLO send. */
static void hello_timer_expired(unsigned long arg)
{
	P_MEMB("Hello timer fired %ld\n", jiffies);

	mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);

	/* NOTE(review): hello_task is read here without hello_task_lock
	 * (we cannot take a semaphore in timer context) — looks racy
	 * against hello_kthread() clearing it on exit; confirm shutdown
	 * ordering guarantees the task is still valid. */
	if (node_state >= TRANSITION) {
		wake_up_process(hello_task);
	}
}
5815+
5816+static int wait_for_completion_barrier(void)
5817+{
5818+ int status;
5819+ char barriername[MAX_BARRIER_NAME_LEN];
5820+
5821+ sprintf(barriername, MEMBERSHIP_BARRIER_NAME, cluster_generation);
5822+
5823+ /* Make sure we all complete together */
5824+ P_MEMB("Waiting for completion barrier: %d members\n", cluster_members);
5825+ if ((status =
5826+ kcl_barrier_register(barriername, 0, cluster_members)) < 0) {
5827+ printk(CMAN_NAME ": Error registering barrier: %d\n", status);
5828+ return -1;
5829+ }
5830+ kcl_barrier_setattr(barriername, BARRIER_SETATTR_TIMEOUT,
5831+ cman_config.transition_timeout);
5832+ status = kcl_barrier_wait(barriername);
5833+ kcl_barrier_delete(barriername);
5834+
5835+ P_MEMB("Completion barrier reached : status = %d\n", status);
5836+ return status;
5837+}
5838+
/* Called at the end of a state transition when we are the master.
 * Confirms the joiner, recalculates quorum, broadcasts ENDTRANS with the
 * new generation number, waits for all ACKs and the completion barrier,
 * then moves back to MEMBER state.  If the barrier fails the whole
 * transition is restarted. */
static int end_transition()
{
	struct cl_mem_endtrans_msg msg;
	int total_votes;
	int status;

	/* Cancel the timer */
	del_timer(&transition_timer);

	confirm_joiner();

	quorum = calculate_quorum(leavereason, 0, &total_votes);

	msg.cmd = CLUSTER_MEM_ENDTRANS;
	msg.quorum = cpu_to_le32(quorum);
	/* Bump the generation: ENDTRANS announces the new epoch */
	msg.generation = cpu_to_le32(++cluster_generation);
	msg.total_votes = cpu_to_le32(total_votes);
	if (joining_node && transitionreason == TRANS_NEWNODE) {
		msg.new_node_id = cpu_to_le32(joining_node->node_id);
	}
	else {
		msg.new_node_id = 0;
	}
	/* NOTE(review): the send status is captured but never checked —
	 * confirm whether a failed ENDTRANS send should abort here */
	status = kcl_sendmsg(mem_socket, &msg, sizeof (msg), NULL, 0, 0);

	/* When that's all settled down, do the transition completion barrier */
	kcl_wait_for_all_acks();

	if (wait_for_completion_barrier() != 0) {
		P_MEMB("Barrier timed out - restart\n");
		start_transition(TRANS_RESTART, us);
		return 0;
	}

	set_quorate(total_votes);

	notify_listeners();
	reset_hello_time();

	/* Tell any waiting barriers that we had a transition */
	check_barrier_returns();

	leavereason = 0;
	node_state = MEMBER;
	transition_end_time = jiffies;

	sm_member_update(cluster_is_quorate);

	return 0;
}
5890+
5891+int send_reconfigure(int param, unsigned int value)
5892+{
5893+ char msgbuf[66];
5894+ struct cl_mem_reconfig_msg *msg =
5895+ (struct cl_mem_reconfig_msg *) &msgbuf;
5896+
5897+ if (param == RECONFIG_PARAM_EXPECTED_VOTES && expected_votes > value)
5898+ expected_votes = value;
5899+
5900+ msg->cmd = CLUSTER_MEM_RECONFIG;
5901+ msg->param = param;
5902+ msg->value = cpu_to_le32(value);
5903+
5904+ return kcl_sendmsg(mem_socket, &msgbuf, sizeof (*msg), NULL, 0, 0);
5905+}
5906+
5907+static int send_joinack(char *addr, int addr_len, unsigned char acktype)
5908+{
5909+ struct cl_mem_joinack_msg msg;
5910+
5911+ msg.cmd = CLUSTER_MEM_JOINACK;
5912+ msg.acktype = acktype;
5913+
5914+ return kcl_sendmsg(mem_socket, &msg, sizeof (msg),
5915+ (struct sockaddr_cl *)addr, addr_len, MSG_NOACK);
5916+}
5917+
/* Only send a leave message to one node in the cluster so that it can master
 * the state transition, otherwise we get a "thundering herd" of potential
 * masters fighting it out.  Picks the current transition master if there
 * is one, otherwise the first other member found; if we are the only
 * member there is no-one to tell.  Always leaves the cluster locally by
 * setting LEFT_CLUSTER and waking the membership thread. */
int send_leave(unsigned char flags)
{
	unsigned char msg[2];
	struct sockaddr_cl saddr;
	struct cluster_node *node = NULL;
	int status;

	/* Socket already torn down (membership thread NULLs it on exit) */
	if (!mem_socket)
		return 0;

	saddr.scl_family = AF_CLUSTER;
	saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;

	/* If we are in transition then use the current master */
	if (node_state == TRANSITION) {
		node = master_node;
	}
	if (!node) {
		/* If we are the master or not in transition then pick a node
		 * almost at random */
		struct list_head *nodelist;

		down(&cluster_members_lock);
		list_for_each(nodelist, &cluster_members_list) {
			node = list_entry(nodelist, struct cluster_node, list);

			if (node->state == NODESTATE_MEMBER && !node->us)
				break;
		}
		up(&cluster_members_lock);
	}

	/* we are the only member of the cluster - there is no-one to tell */
	if (node && !node->us) {
		saddr.scl_nodeid = node->node_id;

		P_MEMB("Sending LEAVE to %s\n", node->name);
		msg[0] = CLUSTER_MEM_LEAVE;
		msg[1] = flags;
		status =
		    kcl_sendmsg(mem_socket, msg, 2,
				&saddr, sizeof (saddr),
				MSG_NOACK);

		if (status < 0)
			return status;
	}

	/* And exit */
	node_state = LEFT_CLUSTER;
	wake_up_process(membership_task);
	return 0;
}
5974+
5975+int send_kill(int nodeid)
5976+{
5977+ char killmsg;
5978+ struct sockaddr_cl saddr;
5979+
5980+ killmsg = CLUSTER_MEM_KILL;
5981+
5982+ saddr.scl_family = AF_CLUSTER;
5983+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5984+ saddr.scl_nodeid = nodeid;
5985+ return kcl_sendmsg(mem_socket, &killmsg, 1, &saddr,
5986+ sizeof (struct sockaddr_cl), MSG_NOACK);
5987+}
5988+
/* Process a message: dispatch one received membership packet (first byte
 * of the payload is the command) to its handler.  Several commands are
 * guarded by the current node/master state so stale or spoofed messages
 * are ignored.  Returns the handler's result, or -1 for ignored/unknown
 * messages. */
static int do_membership_packet(struct msghdr *msg, int len)
{
	int result = -1;
	unsigned char *buf = msg->msg_iov->iov_base;
	struct sockaddr_cl *saddr = msg->msg_name;
	struct cluster_node *node;

	node = find_node_by_nodeid(saddr->scl_nodeid);

	/* NOTE(review): msgname() is only declared under DEBUG_MEMB —
	 * presumably P_MEMB compiles to nothing otherwise; confirm. */
	P_MEMB("got membership message : %s, from (%d) %s, len = %d\n",
	       msgname(*buf), saddr->scl_nodeid, node ? node->name : "unknown", len);

	switch (*buf) {
	case CLUSTER_MEM_JOINREQ:
		result = do_process_joinreq(msg, len);
		break;

	case CLUSTER_MEM_LEAVE:
		if (we_are_a_cluster_member)
			result = do_process_leave(msg, len);
		break;

	case CLUSTER_MEM_HELLO:
		result = do_process_hello(msg, len);
		break;

	case CLUSTER_MEM_KILL:
		if (we_are_a_cluster_member)
			result = do_process_kill(msg, len);
		break;

	case CLUSTER_MEM_JOINCONF:
		/* Only meaningful while we are waiting for confirmation */
		if (node_state == JOINACK) {
			do_process_joinconf(msg, len);
		}
		break;

	case CLUSTER_MEM_CONFACK:
		/* Joiner acknowledged our JOINCONF - finish the transition */
		if (node_state == MASTER && master_state == MASTER_CONFIRM) {
			end_transition();
		}
		break;

	case CLUSTER_MEM_MASTERVIEW:
		if (node_state == TRANSITION)
			do_process_masterview(msg, len);
		break;

	case CLUSTER_MEM_JOINACK:
		if (node_state == JOINING || node_state == JOINWAIT) {
			do_process_joinack(msg, len);
		}
		break;
	case CLUSTER_MEM_RECONFIG:
		if (we_are_a_cluster_member) {
			do_process_reconfig(msg, len);
		}
		break;

	case CLUSTER_MEM_STARTTRANS:
		result = do_process_starttrans(msg, len);
		break;

	case CLUSTER_MEM_ENDTRANS:
		result = do_process_endtrans(msg, len);
		break;

	case CLUSTER_MEM_VIEWACK:
		result = do_process_viewack(msg, len);
		break;

	case CLUSTER_MEM_STARTACK:
		/* Only the master collects STARTACKs */
		if (node_state == MASTER)
			result = do_process_startack(msg, len);
		break;

	case CLUSTER_MEM_NEWCLUSTER:
		result = do_process_newcluster(msg, len);
		break;

	case CLUSTER_MEM_NOMINATE:
		if (node_state != MASTER)
			result = do_process_nominate(msg, len);
		break;

	default:
		printk(KERN_ERR CMAN_NAME
		       ": Unknown membership services message %d received\n",
		       *buf);
		break;

	}
	return result;
}
6084+
/* Returns -ve to reject membership of the cluster, 0 to accept membership,
 * +ve to ignore the request (node already joining).  A joiner is rejected
 * if its name is too long, or if a live node with the same name or the
 * same address already exists. */
static int check_duplicate_node(char *name, struct msghdr *msg, int len)
{
	struct cluster_node *node;
	struct sockaddr_cl *saddr = (struct sockaddr_cl *)msg->msg_name;
	/* NOTE(review): variable-length array on the kernel stack —
	 * address_length is presumably small/bounded; confirm. */
	char addr[address_length];
	int addrlen;

	if (strlen(name) >= MAX_CLUSTER_MEMBER_NAME_LEN)
		return -3;

	/* See if we already have a cluster member with that name... */
	node = find_node_by_name(name);
	if (node && node->state != NODESTATE_DEAD) {

		/* Same node mid-join: ignore rather than reject */
		if ((node->state == NODESTATE_JOINING ||
		     node->state == NODESTATE_REMOTEMEMBER))
			return +1;

		printk(KERN_WARNING CMAN_NAME
		       ": Rejecting cluster membership application from %s - already have a node with that name\n",
		       name);
		return -1;

	}

	/* Need to check the node's address too */
	if (get_addr_from_temp_nodeid(saddr->scl_nodeid, addr, &addrlen) &&
	    (node = find_node_by_addr(addr, addrlen)) &&
	    node->state != NODESTATE_DEAD) {

		if ((node->state == NODESTATE_JOINING ||
		     node->state == NODESTATE_REMOTEMEMBER))
			return +1;

		printk(KERN_WARNING CMAN_NAME
		       ": Rejecting cluster membership application from %s - already have a node with that address\n",
		       name);
		return -1;
	}
	return 0;
}
6128+
/* Start the state transition: become master for it, broadcast STARTTRANS
 * (carrying the triggering node's details, plus its addresses and name
 * for a new joiner) and arm the transition timer.  On TRANS_RESTART the
 * response counters are reset, and after too many restarts we give up
 * and leave the cluster.  Returns 0. */
static int start_transition(unsigned char reason, struct cluster_node *node)
{
	char *startbuf = scratchbuf;
	struct cl_mem_starttrans_msg *msg =
	    (struct cl_mem_starttrans_msg *) startbuf;

	P_MEMB("Start transition - reason = %d\n", reason);

	/* If this is a restart then zero the counters */
	if (reason == TRANS_RESTART) {
		agreeing_nodes = 0;
		dissenting_nodes = 0;
		if (node_opinion) {
			kfree(node_opinion);
			node_opinion = NULL;
		}
		responses_collected = 0;
	}

	/* If we have timed out too many times then just die */
	if (reason == TRANS_RESTART
	    && ++transition_restarts > cman_config.transition_restarts) {
		printk(KERN_WARNING CMAN_NAME
		       ": too many transition restarts - will die\n");
		send_leave(CLUSTER_LEAVEFLAG_INCONSISTENT);
		node_state = LEFT_CLUSTER;
		quit_threads = 1;
		wake_up_process(membership_task);
		wake_up_interruptible(&cnxman_waitq);
		return 0;
	}
	if (reason != TRANS_RESTART)
		transition_restarts = 0;

	/* Only keep the original state transition reason in the global
	 * variable. */
	if (reason != TRANS_ANOTHERREMNODE && reason != TRANS_NEWMASTER &&
	    reason != TRANS_RESTART && reason != TRANS_DEADMASTER)
		transitionreason = reason;

	/* Save the info of the requesting node */
	if (reason == TRANS_NEWNODE)
		joining_node = node;

	node_state = MASTER;
	master_state = MASTER_START;
	responses_collected = 0;
	responses_expected = cluster_members - 1;

	/* If we are on our own then just do it */
	if (responses_expected == 0) {
		P_MEMB("We are on our own...lonely here\n");
		/* Fake the missing response so the count balances */
		responses_collected--;
		do_process_startack(NULL, 0);
	}
	else {
		int ptr = sizeof (struct cl_mem_starttrans_msg);
		struct list_head *addrlist;
		unsigned short num_addrs = 0;
		int flags = 0;

		/* Send the STARTTRANS message */
		msg->cmd = CLUSTER_MEM_STARTTRANS;
		msg->reason = reason;
		msg->votes = node->votes;
		msg->expected_votes = cpu_to_le32(node->expected_votes);
		msg->generation = cpu_to_le32(++cluster_generation);
		msg->nodeid = cpu_to_le32(node->node_id);

		if (reason == TRANS_NEWNODE) {
			/* Add the addresses */
			list_for_each(addrlist, &node->addr_list) {
				struct cluster_node_addr *nodeaddr =
				    list_entry(addrlist,
					       struct cluster_node_addr, list);

				memcpy(startbuf + ptr, nodeaddr->addr,
				       address_length);
				ptr += address_length;
				num_addrs++;
			}

			/* And the name */
			strcpy(startbuf + ptr, node->name);
			ptr += strlen(node->name) + 1;
		}

		/* If another node died then we must queue the STARTTRANS
		 * messages so that membershipd can carry on processing the
		 * other replies */
		if (reason == TRANS_ANOTHERREMNODE)
			flags |= MSG_QUEUE;

		msg->num_addrs = cpu_to_le16(num_addrs);
		kcl_sendmsg(mem_socket, msg, ptr, NULL, 0, flags);
	}
	/* Set a timer in case we don't get 'em all back */
	mod_timer(&transition_timer,
		  jiffies + cman_config.transition_timeout * HZ);
	return 0;
}
6231+
/* A node has died - decide what to do.  Must ultimately run in the
 * membership thread; callers in other contexts queue the node and wake
 * kmembershipd.  Depending on our current state we either start a
 * transition as master, elect a replacement master, or restart the
 * transition we are already mastering. */
void a_node_just_died(struct cluster_node *node)
{
	/* If we are not in the context of kmembershipd then stick it on the
	 * list and wake it */
	if (current != membership_task) {
		struct cl_new_dead_node *newnode =
		    kmalloc(sizeof (struct cl_new_dead_node), GFP_KERNEL);
		if (!newnode)
			return;
		newnode->node = node;
		down(&new_dead_node_lock);
		list_add_tail(&newnode->list, &new_dead_node_list);
		set_bit(WAKE_FLAG_DEADNODE, &wake_flags);
		up(&new_dead_node_lock);
		wake_up_process(membership_task);
		P_MEMB("Passing dead node %s onto kmembershipd\n", node->name);
		return;
	}

	/* Remove it */
	down(&cluster_members_lock);
	if (node->state == NODESTATE_MEMBER)
		cluster_members--;
	node->state = NODESTATE_DEAD;
	up(&cluster_members_lock);

	/* Notify listeners */
	notify_kernel_listeners(DIED, (long) node->node_id);

	/* If we are in normal operation then become master and initiate a
	 * state-transition */
	if (node_state == MEMBER) {
		start_transition(TRANS_REMNODE, node);
		return;
	}

	/* If we are a slave in transition then see if it's the master that has
	 * failed. If not then ignore it. If it /is/ the master then elect a
	 * new one */
	if (node_state == TRANSITION) {
		if (master_node == node) {
			if (elect_master(&node)) {
				del_timer(&transition_timer);
				node_state = MASTER;

				start_transition(TRANS_DEADMASTER, master_node);
			}
			else {
				/* Someone else can be in charge - phew! */
			}
		}
		return;
	}

	/* If we are the master then we need to start the transition all over
	 * again */
	if (node_state == MASTER) {
		/* Cancel timer */
		del_timer(&transition_timer);

		/* Restart the transition */
		start_transition(TRANS_ANOTHERREMNODE, node);
		transition_restarts = 0;
		return;
	}
}
6299+
6300+/*
6301+ * Build up and send a set of messages consisting of the whole cluster view.
6302+ * The first byte is the command (cmd as passed in), the second is a flag byte:
6303+ * bit 0 is set in the first message, bit 1 in the last (NOTE both may be set if
6304+ * this is the only message sent The rest is a set of packed node entries, which
6305+ * are NOT split over packets. */
6306+static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
6307+ unsigned int flags)
6308+{
6309+ int ptr = 2;
6310+ int len;
6311+ int status = 0;
6312+ int last_node_start = 2;
6313+ unsigned char first_packet_flag = 1;
6314+ struct list_head *nodelist;
6315+ struct list_head *temp;
6316+ struct cluster_node *node;
6317+ char *message = scratchbuf;
6318+
6319+ message[0] = cmd;
6320+
6321+ down(&cluster_members_lock);
6322+ list_for_each_safe(nodelist, temp, &cluster_members_list) {
6323+ node = list_entry(nodelist, struct cluster_node, list);
6324+
6325+ if (node->state == NODESTATE_MEMBER) {
6326+ unsigned int evotes;
6327+ unsigned int node_id;
6328+ unsigned short num_addrs = 0;
6329+ unsigned short num_addrs_le;
6330+ struct list_head *addrlist;
6331+
6332+ last_node_start = ptr;
6333+
6334+ message[ptr++] = len = strlen(node->name);
6335+ strcpy(&message[ptr], node->name);
6336+ ptr += len;
6337+
6338+ /* Count the number of addresses this node has */
6339+ list_for_each(addrlist, &node->addr_list) {
6340+ num_addrs++;
6341+ }
6342+
6343+ num_addrs_le = cpu_to_le16(num_addrs);
6344+ memcpy(&message[ptr], &num_addrs_le, sizeof (short));
6345+ ptr += sizeof (short);
6346+
6347+ /* Pack em in */
6348+ list_for_each(addrlist, &node->addr_list) {
6349+
6350+ struct cluster_node_addr *nodeaddr =
6351+ list_entry(addrlist,
6352+ struct cluster_node_addr, list);
6353+
6354+ memcpy(&message[ptr], nodeaddr->addr,
6355+ address_length);
6356+ ptr += address_length;
6357+ }
6358+
6359+ message[ptr++] = node->votes;
6360+
6361+ evotes = cpu_to_le32(node->expected_votes);
6362+ memcpy(&message[ptr], &evotes, sizeof (int));
6363+ ptr += sizeof (int);
6364+
6365+ node_id = cpu_to_le32(node->node_id);
6366+ memcpy(&message[ptr], &node_id, sizeof (int));
6367+ ptr += sizeof (int);
6368+
6369+ /* If the block is full then send it */
6370+ if (ptr > MAX_CLUSTER_MESSAGE) {
6371+ message[1] = first_packet_flag;
6372+
6373+ up(&cluster_members_lock);
6374+ status =
6375+ kcl_sendmsg(mem_socket, message,
6376+ last_node_start, saddr,
6377+ saddr ? sizeof (struct sockaddr_cl) : 0,
6378+ flags);
6379+
6380+ if (status < 0)
6381+ goto send_fail;
6382+
6383+ down(&cluster_members_lock);
6384+
6385+ first_packet_flag = 0;
6386+ /* Copy the overflow back to the start of the
6387+ * buffer for the next send */
6388+ memcpy(&message[2], &message[last_node_start],
6389+ ptr - last_node_start);
6390+ ptr = ptr - last_node_start + 2;
6391+ }
6392+ }
6393+ }
6394+
6395+ up(&cluster_members_lock);
6396+
6397+ message[1] = first_packet_flag | 2; /* The last may also be first */
6398+ status = kcl_sendmsg(mem_socket, message, ptr,
6399+ saddr, saddr ? sizeof (struct sockaddr_cl) : 0,
6400+ flags);
6401+ send_fail:
6402+
6403+ return status;
6404+}
6405+
6406+/* Make the JOINING node into a MEMBER */
6407+static void confirm_joiner()
6408+{
6409+ if (joining_node && joining_node->state == NODESTATE_JOINING) {
6410+ down(&cluster_members_lock);
6411+ joining_node->state = NODESTATE_MEMBER;
6412+ cluster_members++;
6413+ up(&cluster_members_lock);
6414+ }
6415+ remove_temp_nodeid(joining_temp_nodeid);
6416+ joining_temp_nodeid = 0;
6417+}
6418+
6419+/* Reset HELLO timers for all nodes We do this after a state-transition as we
6420+ * have had HELLOS disabled during the transition and if we don't do this the
6421+ * nodes will go on an uncontrolled culling-spree afterwards */
6422+static void reset_hello_time()
6423+{
6424+ struct list_head *nodelist;
6425+ struct cluster_node *node;
6426+
6427+ down(&cluster_members_lock);
6428+ list_for_each(nodelist, &cluster_members_list) {
6429+ node = list_entry(nodelist, struct cluster_node, list);
6430+
6431+ if (node->state == NODESTATE_MEMBER) {
6432+ node->last_hello = jiffies;
6433+ }
6434+
6435+ }
6436+ up(&cluster_members_lock);
6437+}
6438+
6439+/* Calculate the new quorum and return the value. do *not* set it in here as
6440+ * cnxman calls this to check if a new expected_votes value is valid. It
6441+ * (optionally) returns the total number of votes in the cluster */
6442+int calculate_quorum(int allow_decrease, int max_expected, int *ret_total_votes)
6443+{
6444+ struct list_head *nodelist;
6445+ struct cluster_node *node;
6446+ unsigned int total_votes = 0;
6447+ unsigned int highest_expected = 0;
6448+ unsigned int newquorum, q1, q2;
6449+
6450+ down(&cluster_members_lock);
6451+ list_for_each(nodelist, &cluster_members_list) {
6452+ node = list_entry(nodelist, struct cluster_node, list);
6453+
6454+ if (node->state == NODESTATE_MEMBER) {
6455+ highest_expected =
6456+ max(highest_expected, node->expected_votes);
6457+ total_votes += node->votes;
6458+ }
6459+ }
6460+ up(&cluster_members_lock);
6461+ if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
6462+ total_votes += quorum_device->votes;
6463+
6464+ if (max_expected > 0)
6465+ highest_expected = max_expected;
6466+
6467+ /* This quorum calculation is taken from the OpenVMS Cluster Systems
6468+ * manual, but, then, you guessed that didn't you */
6469+ q1 = (highest_expected + 2) / 2;
6470+ q2 = (total_votes + 2) / 2;
6471+ newquorum = max(q1, q2);
6472+
6473+ /* Normally quorum never decreases but the system administrator can
6474+ * force it down by setting expected votes to a maximum value */
6475+ if (!allow_decrease)
6476+ newquorum = max(quorum, newquorum);
6477+
6478+ /* The special two_node mode allows each of the two nodes to retain
6479+ * quorum if the other fails. Only one of the two should live past
6480+ * fencing (as both nodes try to fence each other in split-brain.) */
6481+ if (two_node)
6482+ newquorum = 1;
6483+
6484+ if (ret_total_votes)
6485+ *ret_total_votes = total_votes;
6486+ return newquorum;
6487+}
6488+
6489+/* Recalculate cluster quorum, set quorate and notify changes */
6490+void recalculate_quorum(int allow_decrease)
6491+{
6492+ int total_votes;
6493+
6494+ quorum = calculate_quorum(allow_decrease, 0, &total_votes);
6495+ set_quorate(total_votes);
6496+ notify_listeners();
6497+}
6498+
6499+/* Add new node address to an existing node */
6500+int add_node_address(struct cluster_node *node, unsigned char *addr, int len)
6501+{
6502+ struct cluster_node_addr *newaddr;
6503+
6504+ newaddr = kmalloc(sizeof (struct cluster_node_addr), GFP_KERNEL);
6505+ if (!newaddr)
6506+ return -1;
6507+
6508+ memcpy(newaddr->addr, addr, len);
6509+ newaddr->addr_len = len;
6510+ list_add_tail(&newaddr->list, &node->addr_list);
6511+
6512+ return 0;
6513+}
6514+
6515+static struct cluster_node *add_new_node(char *name, unsigned char votes,
6516+ unsigned int expected_votes,
6517+ int node_id, int state)
6518+{
6519+ struct cluster_node *newnode;
6520+
6521+ /* Look for a dead node with this name */
6522+ newnode = find_node_by_name(name);
6523+
6524+ /* Is it already joining */
6525+ if (newnode && newnode->state == NODESTATE_JOINING)
6526+ return NULL;
6527+
6528+ /* Update existing information */
6529+ if (newnode && newnode->state == NODESTATE_DEAD) {
6530+ newnode->last_hello = jiffies;
6531+ newnode->votes = votes;
6532+ newnode->expected_votes = expected_votes;
6533+ newnode->state = state;
6534+ newnode->us = 0;
6535+ newnode->leave_reason = 0;
6536+ newnode->last_seq_recv = 0;
6537+ newnode->last_seq_acked = 0;
6538+ newnode->last_seq_sent = 0;
6539+ newnode->incarnation++;
6540+ /* Don't overwrite the node ID */
6541+
6542+ if (state == NODESTATE_MEMBER) {
6543+ down(&cluster_members_lock);
6544+ cluster_members++;
6545+ up(&cluster_members_lock);
6546+ }
6547+
6548+ printk(KERN_INFO CMAN_NAME ": node %s rejoining\n", name);
6549+ return newnode;
6550+ }
6551+
6552+ newnode = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
6553+ if (!newnode)
6554+ goto alloc_err;
6555+
6556+ memset(newnode, 0, sizeof (struct cluster_node));
6557+ newnode->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
6558+ if (!newnode->name)
6559+ goto alloc_err1;
6560+
6561+ strcpy(newnode->name, name);
6562+ newnode->last_hello = jiffies;
6563+ newnode->votes = votes;
6564+ newnode->expected_votes = expected_votes;
6565+ newnode->state = state;
6566+ newnode->node_id = node_id;
6567+ newnode->us = 0;
6568+ newnode->leave_reason = 0;
6569+ newnode->last_seq_recv = 0;
6570+ newnode->last_seq_acked = 0;
6571+ newnode->last_seq_sent = 0;
6572+ newnode->incarnation = 0;
6573+ INIT_LIST_HEAD(&newnode->addr_list);
6574+ set_nodeid(newnode, node_id);
6575+
6576+ /* Add the new node to the list */
6577+ down(&cluster_members_lock);
6578+ list_add(&newnode->list, &cluster_members_list);
6579+ if (state == NODESTATE_MEMBER)
6580+ cluster_members++;
6581+ up(&cluster_members_lock);
6582+
6583+ printk(KERN_INFO CMAN_NAME ": got node %s\n", name);
6584+ return newnode;
6585+
6586+ alloc_err1:
6587+ kfree(newnode);
6588+ alloc_err:
6589+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
6590+
6591+ printk(KERN_CRIT CMAN_NAME
6592+ ": Cannot allocate memory for new cluster node %s\n", name);
6593+
6594+ panic("cluster memory allocation failed");
6595+
6596+ return NULL;
6597+}
6598+
6599+/* Remove node from a STARTTRANS message */
6600+static struct cluster_node *remove_node(int nodeid)
6601+{
6602+ struct cluster_node *node = find_node_by_nodeid(nodeid);
6603+
6604+ if (node && node->state == NODESTATE_MEMBER) {
6605+ P_MEMB("starttrans removes node %s\n", node->name);
6606+ down(&cluster_members_lock);
6607+ node->state = NODESTATE_DEAD;
6608+ cluster_members--;
6609+ up(&cluster_members_lock);
6610+
6611+ notify_kernel_listeners(DIED, (long) nodeid);
6612+
6613+ /* If this node is us then go quietly */
6614+ if (node->us) {
6615+ printk(KERN_INFO CMAN_NAME
6616+ ": killed by STARTTRANS or NOMINATE\n");
6617+ quit_threads = 1;
6618+ wake_up_process(membership_task);
6619+ wake_up_interruptible(&cnxman_waitq);
6620+ }
6621+ }
6622+ return node;
6623+}
6624+
6625+/* Add a node from a STARTTRANS or NOMINATE message */
6626+static void add_node_from_starttrans(struct msghdr *msg, int len)
6627+{
6628+ /* Add the new node but don't fill in the ID until the master has
6629+ * confirmed it */
6630+ struct cl_mem_starttrans_msg *startmsg =
6631+ (struct cl_mem_starttrans_msg *) msg->msg_iov->iov_base;
6632+ char *msgbuf = (char *) msg->msg_iov->iov_base;
6633+ int ptr = sizeof (struct cl_mem_starttrans_msg);
6634+ char *name =
6635+ msgbuf + ptr + le16_to_cpu(startmsg->num_addrs) * address_length;
6636+ int i;
6637+
6638+ joining_node = add_new_node(name, startmsg->votes,
6639+ le32_to_cpu(startmsg->expected_votes),
6640+ 0, NODESTATE_JOINING);
6641+
6642+ /* add_new_node returns NULL if the node already exists */
6643+ if (!joining_node)
6644+ joining_node = find_node_by_name(name);
6645+
6646+ /* Add the node's addresses */
6647+ if (list_empty(&joining_node->addr_list)) {
6648+ for (i = 0; i < le16_to_cpu(startmsg->num_addrs); i++) {
6649+ add_node_address(joining_node, msgbuf + ptr, address_length);
6650+ ptr += address_length;
6651+ }
6652+ }
6653+}
6654+
6655+/* We have been nominated as master for a transition */
6656+static int do_process_nominate(struct msghdr *msg, int len)
6657+{
6658+ struct cl_mem_starttrans_msg *startmsg =
6659+ (struct cl_mem_starttrans_msg *)msg->msg_iov->iov_base;
6660+ struct cluster_node *node = NULL;
6661+ char *nodeaddr = msg->msg_iov->iov_base + sizeof(struct cl_mem_starttrans_msg);
6662+
6663+ P_MEMB("nominate reason is %d\n", startmsg->reason);
6664+
6665+ if (startmsg->reason == TRANS_REMNODE) {
6666+ node = remove_node(le32_to_cpu(startmsg->nodeid));
6667+ }
6668+
6669+ if (startmsg->reason == TRANS_NEWNODE) {
6670+ add_node_from_starttrans(msg, len);
6671+ node = joining_node;
6672+ /* Make sure we have a temp nodeid for the new node */
6673+ joining_temp_nodeid = new_temp_nodeid(nodeaddr,
6674+ address_length);
6675+ }
6676+
6677+ /* This should be a TRANS_CHECK but start_transition needs some node
6678+ * info */
6679+ if (node == NULL)
6680+ node = us;
6681+ start_transition(startmsg->reason, node);
6682+ return 0;
6683+}
6684+
6685+/* Got a STARTACK response from a node */
6686+static int do_process_startack(struct msghdr *msg, int len)
6687+{
6688+ if (node_state != MASTER && master_state != MASTER_START) {
6689+ P_MEMB("Got StartACK when not in MASTER_STARTING substate\n");
6690+ return 0;
6691+ }
6692+
6693+ /* msg is NULL if we are called directly from start_transition */
6694+ if (msg) {
6695+ struct cl_mem_startack_msg *ackmsg = msg->msg_iov->iov_base;
6696+
6697+ /* Ignore any messages wil old generation numbers in them */
6698+ if (le32_to_cpu(ackmsg->generation) != cluster_generation) {
6699+ P_MEMB("Got old generation START-ACK msg - ignoring\n");
6700+ return 0;
6701+ }
6702+ }
6703+
6704+ /* If the node_id is non-zero then use it. */
6705+ if (transitionreason == TRANS_NEWNODE && joining_node && msg) {
6706+ struct cl_mem_startack_msg *ackmsg = msg->msg_iov->iov_base;
6707+
6708+ if (ackmsg->node_id) {
6709+ set_nodeid(joining_node, le32_to_cpu(ackmsg->node_id));
6710+ }
6711+ highest_nodeid =
6712+ max(highest_nodeid, le32_to_cpu(ackmsg->highest_node_id));
6713+ P_MEMB("Node id = %d, highest node id = %d\n",
6714+ le32_to_cpu(ackmsg->node_id),
6715+ le32_to_cpu(ackmsg->highest_node_id));
6716+ }
6717+
6718+ /* If we have all the responses in then move to the next stage */
6719+ if (++responses_collected == responses_expected) {
6720+
6721+ /* If the new node has no node_id (ie nobody in the cluster has
6722+ * heard of it before) then assign it a new one */
6723+ if (transitionreason == TRANS_NEWNODE && joining_node) {
6724+ highest_nodeid =
6725+ max(highest_nodeid, get_highest_nodeid());
6726+ if (joining_node->node_id == 0) {
6727+ set_nodeid(joining_node, ++highest_nodeid);
6728+ }
6729+ P_MEMB("nodeIDs: new node: %d, highest: %d\n",
6730+ joining_node->node_id, highest_nodeid);
6731+ }
6732+
6733+ /* Behave a little differently if we are on our own */
6734+ if (cluster_members == 1) {
6735+ if (transitionreason == TRANS_NEWNODE) {
6736+ /* If the cluster is just us then confirm at
6737+ * once */
6738+ joinconf_count = 0;
6739+ mod_timer(&transition_timer,
6740+ jiffies +
6741+ cman_config.joinconf_timeout * HZ);
6742+ send_joinconf();
6743+ return 0;
6744+ }
6745+ else { /* Node leaving the cluster */
6746+ recalculate_quorum(leavereason);
6747+ leavereason = 0;
6748+ node_state = MEMBER;
6749+ }
6750+ }
6751+ else {
6752+ master_state = MASTER_COLLECT;
6753+ responses_collected = 0;
6754+ responses_expected = cluster_members - 1;
6755+ P_MEMB("Sending MASTERVIEW: expecting %d responses\n",
6756+ responses_expected);
6757+
6758+ send_cluster_view(CLUSTER_MEM_MASTERVIEW, NULL, 0);
6759+
6760+ /* Set a timer in case we don't get 'em all back */
6761+ mod_timer(&transition_timer,
6762+ jiffies +
6763+ cman_config.transition_timeout * HZ);
6764+ }
6765+ }
6766+ return 0;
6767+}
6768+
6769+/* Got a VIEWACK response from a node */
6770+static int do_process_viewack(struct msghdr *msg, int len)
6771+{
6772+ char *reply = msg->msg_iov->iov_base;
6773+ struct sockaddr_cl *saddr = msg->msg_name;
6774+
6775+ if (master_state != MASTER_COLLECT) {
6776+ printk(KERN_INFO CMAN_NAME
6777+ ": got VIEWACK while not in state transition\n");
6778+ return 0;
6779+ }
6780+
6781+ if (node_opinion == NULL) {
6782+ node_opinion =
6783+ kmalloc((1 + highest_nodeid) * sizeof (uint8_t), GFP_KERNEL);
6784+ if (!node_opinion) {
6785+ panic(": malloc agree/dissent failed\n");
6786+ }
6787+ memset(node_opinion, 0, (1 + highest_nodeid) * sizeof (uint8_t));
6788+ }
6789+
6790+ /* Keep a list of agreeing and dissenting nodes */
6791+ if (reply[1] == 1) {
6792+ /* ACK - remote node agrees with me */
6793+ P_MEMB("Node agrees\n");
6794+ node_opinion[saddr->scl_nodeid] = OPINION_AGREE;
6795+ agreeing_nodes++;
6796+ }
6797+ else {
6798+ /* Remote node disagrees */
6799+ P_MEMB("Node disagrees\n");
6800+ node_opinion[saddr->scl_nodeid] = OPINION_DISAGREE;
6801+ dissenting_nodes++;
6802+ }
6803+
6804+ P_MEMB("got %d responses, expected %d\n", responses_collected + 1,
6805+ responses_expected);
6806+
6807+ /* Are all the results in yet ? */
6808+ if (++responses_collected == responses_expected) {
6809+ del_timer(&transition_timer);
6810+
6811+ P_MEMB("The results are in: %d agree, %d dissent\n",
6812+ agreeing_nodes, dissenting_nodes);
6813+
6814+ if (agreeing_nodes > dissenting_nodes) {
6815+ /* Kill dissenting nodes */
6816+ int i;
6817+
6818+ for (i = 1; i <= responses_collected; i++) {
6819+ if (node_opinion[i] == OPINION_DISAGREE)
6820+ send_kill(i);
6821+ }
6822+ }
6823+ else {
6824+ /* We must leave the cluster as we are in a minority,
6825+ * the rest of them can fight it out amongst
6826+ * themselves. */
6827+ send_leave(CLUSTER_LEAVEFLAG_INCONSISTENT);
6828+
6829+ agreeing_nodes = 0;
6830+ dissenting_nodes = 0;
6831+ kfree(node_opinion);
6832+ node_opinion = NULL;
6833+ node_state = LEFT_CLUSTER;
6834+ quit_threads = 1;
6835+ wake_up_process(membership_task);
6836+ wake_up_interruptible(&cnxman_waitq);
6837+ return -1;
6838+ }
6839+
6840+ /* Reset counters */
6841+ agreeing_nodes = 0;
6842+ dissenting_nodes = 0;
6843+ kfree(node_opinion);
6844+ node_opinion = NULL;
6845+
6846+ /* Confirm new node */
6847+ if (transitionreason == TRANS_NEWNODE) {
6848+ mod_timer(&transition_timer,
6849+ jiffies + cman_config.joinconf_timeout * HZ);
6850+ joinconf_count = 0;
6851+ send_joinconf();
6852+ return 0;
6853+ }
6854+
6855+ master_state = MASTER_COMPLETE;
6856+
6857+ end_transition();
6858+ }
6859+
6860+ return 0;
6861+}
6862+
6863+/* Got an ENDTRANS message */
6864+static int do_process_endtrans(struct msghdr *msg, int len)
6865+{
6866+ struct cl_mem_endtrans_msg *endmsg =
6867+ (struct cl_mem_endtrans_msg *) msg->msg_iov->iov_base;
6868+ struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
6869+
6870+ /* Someone else's state transition */
6871+ if (node_state != TRANSITION && node_state != JOINACK)
6872+ return 0;
6873+
6874+ /* Check we got it from the MASTER node */
6875+ if (master_node && master_node->node_id != saddr->scl_nodeid) {
6876+ printk(KERN_INFO
6877+ "Got ENDTRANS from a node not the master: master: %d, sender: %d\n",
6878+ master_node->node_id, saddr->scl_nodeid);
6879+ return 0;
6880+ }
6881+
6882+ del_timer(&transition_timer);
6883+
6884+ /* Set node ID on new node */
6885+ if (endmsg->new_node_id) {
6886+ set_nodeid(joining_node, le32_to_cpu(endmsg->new_node_id));
6887+ P_MEMB("new node %s has ID %d\n", joining_node->name,
6888+ joining_node->node_id);
6889+ }
6890+
6891+ node_state = TRANSITION_COMPLETE;
6892+
6893+ /* Need to set this here or the barrier code will reject us if we've
6894+ * just joined */
6895+ we_are_a_cluster_member = TRUE;
6896+
6897+ confirm_joiner();
6898+ cluster_generation = le32_to_cpu(endmsg->generation);
6899+
6900+ if (wait_for_completion_barrier() != 0) {
6901+ P_MEMB("Barrier timed out - restart\n");
6902+ node_state = TRANSITION;
6903+ mod_timer(&transition_timer,
6904+ jiffies + cman_config.transition_timeout * HZ);
6905+ return 0;
6906+ }
6907+
6908+ quorum = le32_to_cpu(endmsg->quorum);
6909+ set_quorate(le32_to_cpu(endmsg->total_votes));
6910+
6911+ /* Tell any waiting barriers that we had a transition */
6912+ check_barrier_returns();
6913+
6914+ /* Clear the master node */
6915+ master_node = NULL;
6916+
6917+ node_state = MEMBER;
6918+
6919+ /* Notify other listeners that transition has completed */
6920+ notify_listeners();
6921+ reset_hello_time();
6922+ transition_end_time = jiffies;
6923+
6924+ sm_member_update(cluster_is_quorate);
6925+ return 0;
6926+}
6927+
6928+/* Turn a STARTTRANS message into NOMINATE and send it to the new master */
6929+static int send_nominate(struct cl_mem_starttrans_msg *startmsg, int msglen,
6930+ int nodeid)
6931+{
6932+ struct sockaddr_cl maddr;
6933+
6934+ maddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
6935+ maddr.scl_family = AF_CLUSTER;
6936+ maddr.scl_nodeid = nodeid;
6937+
6938+ startmsg->cmd = CLUSTER_MEM_NOMINATE;
6939+ return kcl_sendmsg(mem_socket, startmsg, msglen,
6940+ &maddr, sizeof (maddr), 0);
6941+}
6942+
6943+/* Got a STARTTRANS message */
6944+static int do_process_starttrans(struct msghdr *msg, int len)
6945+{
6946+ struct cl_mem_starttrans_msg *startmsg =
6947+ (struct cl_mem_starttrans_msg *) msg->msg_iov->iov_base;
6948+ struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
6949+ struct cluster_node *node;
6950+ unsigned int newgen = le32_to_cpu(startmsg->generation);
6951+
6952+ /* Got a WHAT from WHOM? */
6953+ node = find_node_by_nodeid(saddr->scl_nodeid);
6954+ if (!node || node->state != NODESTATE_MEMBER)
6955+ return 0;
6956+
6957+ /* Someone else's state transition */
6958+ if (node_state != MEMBER &&
6959+ node_state != TRANSITION && node_state != MASTER)
6960+ return 0;
6961+
6962+ /* Ignore old generation STARTTRANS messages */
6963+ if ((newgen < cluster_generation) ||
6964+ (newgen == 0xFFFFFFFF && cluster_generation == 0)) {
6965+ P_MEMB("Ignoring STARTTRANS with old generation number\n");
6966+ return 0;
6967+ }
6968+
6969+ P_MEMB("Got starttrans: newgen = %d, oldgen = %d, reason = %d\n",
6970+ newgen, cluster_generation, startmsg->reason);
6971+
6972+ /* Up the generation number */
6973+ cluster_generation = newgen;
6974+
6975+ /* If we are also a master then decide between us */
6976+ if (node_state == MASTER) {
6977+
6978+ /* See if we really want the responsibility of being master */
6979+ if (elect_master(&node)) {
6980+
6981+ /* I reluctantly accept this position of responsibility
6982+ */
6983+ P_MEMB("I elected myself master\n");
6984+
6985+ /* start_transition will re-establish this */
6986+ del_timer(&transition_timer);
6987+
6988+ start_transition(TRANS_NEWMASTER, node);
6989+ return 0;
6990+ }
6991+ else {
6992+ /* Back down */
6993+ P_MEMB("Backing down from MASTER status\n");
6994+ master_node = node;
6995+ node_state = MEMBER;
6996+
6997+ /* If we were bringing a new node into the cluster then
6998+ * we will have to abandon that now and tell the new
6999+ * node to try again later */
7000+ if (transitionreason == TRANS_NEWNODE && joining_node) {
7001+ struct cluster_node_addr *first_addr =
7002+ (struct cluster_node_addr *) joining_node->
7003+ addr_list.next;
7004+
7005+ P_MEMB("Postponing membership of node %s\n",
7006+ joining_node->name);
7007+ send_joinack(first_addr->addr, address_length,
7008+ JOINACK_TYPE_WAIT);
7009+
7010+ /* Not dead, just sleeping */
7011+ joining_node->state = NODESTATE_DEAD;
7012+ joining_node = NULL;
7013+ }
7014+
7015+ /* If the new master is not us OR the node we just got
7016+ * the STARTTRANS from then make sure it knows it has
7017+ * to be master */
7018+ if (saddr->scl_nodeid != node->node_id) {
7019+ send_nominate(startmsg, len, node->node_id);
7020+ return 0;
7021+ }
7022+
7023+ /* Fall through into MEMBER code below if we are
7024+ * obeying the STARTTRANS we just received */
7025+ }
7026+ }
7027+
7028+ /* Do non-MASTER STARTTRANS bits */
7029+ if (node_state == MEMBER) {
7030+ int ptr = sizeof (struct cl_mem_starttrans_msg);
7031+ int node_id = 0;
7032+
7033+ P_MEMB("Normal transition start\n");
7034+
7035+ /* If the master is adding a new node and we know it's node ID
7036+ * then ACK with it. */
7037+ if (startmsg->reason == TRANS_NEWNODE) {
7038+ struct cluster_node *node =
7039+ find_node_by_addr((char *) startmsg + ptr,
7040+ address_length);
7041+ if (node)
7042+ node_id = node->node_id;
7043+ }
7044+
7045+ /* Save the master info */
7046+ master_node = find_node_by_nodeid(saddr->scl_nodeid);
7047+ node_state = TRANSITION;
7048+
7049+ if (startmsg->reason == TRANS_NEWNODE) {
7050+ add_node_from_starttrans(msg, len);
7051+ }
7052+
7053+ if (startmsg->reason == TRANS_REMNODE ||
7054+ startmsg->reason == TRANS_ANOTHERREMNODE) {
7055+ remove_node(le32_to_cpu(startmsg->nodeid));
7056+ }
7057+
7058+ send_startack(saddr, msg->msg_namelen,
7059+ node_id);
7060+
7061+ /* Establish timer in case the master dies */
7062+ mod_timer(&transition_timer,
7063+ jiffies + cman_config.transition_timeout * HZ);
7064+
7065+ return 0;
7066+ }
7067+
7068+ /* We are in transition but this may be a restart */
7069+ if (node_state == TRANSITION) {
7070+
7071+ master_node = find_node_by_nodeid(saddr->scl_nodeid);
7072+ send_startack(saddr, msg->msg_namelen, 0);
7073+
7074+ /* Is it a new joining node ? This happens if a master is
7075+ * usurped */
7076+ if (startmsg->reason == TRANS_NEWNODE) {
7077+ struct cluster_node *oldjoin = joining_node;
7078+
7079+ add_node_from_starttrans(msg, len);
7080+
7081+ /* If this is a different node joining than the one we
7082+ * were previously joining (probably cos the master is
7083+ * a nominated one) then mark our "old" joiner as DEAD.
7084+ * The original master will already have told the node
7085+ * to go back into JOINWAIT state */
7086+ if (oldjoin && oldjoin != joining_node
7087+ && oldjoin->state == NODESTATE_JOINING)
7088+ oldjoin->state = NODESTATE_DEAD;
7089+ }
7090+
7091+ /* Is it a new master node? */
7092+ if (startmsg->reason == TRANS_NEWMASTER ||
7093+ startmsg->reason == TRANS_DEADMASTER) {
7094+ P_MEMB("starttrans %s, node=%d\n",
7095+ startmsg->reason ==
7096+ TRANS_NEWMASTER ? "NEWMASTER" : "DEADMASTER",
7097+ le32_to_cpu(startmsg->nodeid));
7098+
7099+ /* If the old master has died then remove it */
7100+ node =
7101+ find_node_by_nodeid(le32_to_cpu(startmsg->nodeid));
7102+
7103+ if (startmsg->reason == TRANS_DEADMASTER &&
7104+ node && node->state == NODESTATE_MEMBER) {
7105+ down(&cluster_members_lock);
7106+ node->state = NODESTATE_DEAD;
7107+ cluster_members--;
7108+ up(&cluster_members_lock);
7109+ }
7110+
7111+ /* Store new master */
7112+ master_node = find_node_by_nodeid(saddr->scl_nodeid);
7113+ }
7114+
7115+ /* Another node has died (or been killed) */
7116+ if (startmsg->reason == TRANS_ANOTHERREMNODE) {
7117+ /* Remove new dead node */
7118+ node =
7119+ find_node_by_nodeid(le32_to_cpu(startmsg->nodeid));
7120+ if (node && node->state == NODESTATE_MEMBER) {
7121+ down(&cluster_members_lock);
7122+ node->state = NODESTATE_DEAD;
7123+ cluster_members--;
7124+ up(&cluster_members_lock);
7125+ }
7126+ }
7127+ /* Restart the timer */
7128+ del_timer(&transition_timer);
7129+ mod_timer(&transition_timer,
7130+ jiffies + cman_config.transition_timeout * HZ);
7131+ }
7132+
7133+ return 0;
7134+}
7135+
7136+/* Change a cluster parameter */
7137+static int do_process_reconfig(struct msghdr *msg, int len)
7138+{
7139+ struct cl_mem_reconfig_msg *confmsg;
7140+ struct sockaddr_cl *saddr = msg->msg_name;
7141+ struct cluster_node *node;
7142+ unsigned int val;
7143+
7144+ if (len < sizeof(struct cl_mem_reconfig_msg))
7145+ return -1;
7146+
7147+ confmsg = (struct cl_mem_reconfig_msg *) msg->msg_iov->iov_base;
7148+ val = le32_to_cpu(confmsg->value);
7149+
7150+ switch (confmsg->param) {
7151+
7152+ case RECONFIG_PARAM_EXPECTED_VOTES:
7153+ /* Set any nodes with expected_votes higher than the new value
7154+ * down */
7155+ if (val > 0) {
7156+ struct cluster_node *node;
7157+
7158+ down(&cluster_members_lock);
7159+ list_for_each_entry(node, &cluster_members_list, list) {
7160+ if (node->state == NODESTATE_MEMBER &&
7161+ node->expected_votes > val) {
7162+ node->expected_votes = val;
7163+ }
7164+ }
7165+ up(&cluster_members_lock);
7166+ if (expected_votes > val)
7167+ expected_votes = val;
7168+ }
7169+ recalculate_quorum(1); /* Allow decrease */
7170+ sm_member_update(cluster_is_quorate);
7171+ break;
7172+
7173+ case RECONFIG_PARAM_NODE_VOTES:
7174+ node = find_node_by_nodeid(saddr->scl_nodeid);
7175+ node->votes = val;
7176+ recalculate_quorum(1); /* Allow decrease */
7177+ sm_member_update(cluster_is_quorate);
7178+ break;
7179+
7180+ case RECONFIG_PARAM_CONFIG_VERSION:
7181+ config_version = val;
7182+ break;
7183+
7184+ default:
7185+ printk(KERN_INFO CMAN_NAME
7186+ ": got unknown parameter in reconfigure message. %d\n",
7187+ confmsg->param);
7188+ break;
7189+ }
7190+ return 0;
7191+}
7192+
7193+/* Response from master node */
7194+static int do_process_joinack(struct msghdr *msg, int len)
7195+{
7196+ struct cl_mem_joinack_msg *ackmsg = msg->msg_iov->iov_base;
7197+
7198+ join_time = jiffies;
7199+ if (ackmsg->acktype == JOINACK_TYPE_OK) {
7200+ node_state = JOINACK;
7201+ }
7202+
7203+ if (ackmsg->acktype == JOINACK_TYPE_NAK) {
7204+ printk(KERN_WARNING CMAN_NAME
7205+ ": Cluster membership rejected\n");
7206+ P_MEMB("Got JOINACK NACK\n");
7207+ node_state = REJECTED;
7208+ }
7209+
7210+ if (ackmsg->acktype == JOINACK_TYPE_WAIT) {
7211+ P_MEMB("Got JOINACK WAIT\n");
7212+ node_state = JOINWAIT;
7213+ joinwait_time = jiffies;
7214+ }
7215+
7216+ return 0;
7217+}
7218+
7219+/* Request to join the cluster. This makes us the master for this state
7220+ * transition */
7221+static int do_process_joinreq(struct msghdr *msg, int len)
7222+{
7223+ int status;
7224+ static unsigned long last_joinreq = 0;
7225+ static char last_name[MAX_CLUSTER_MEMBER_NAME_LEN];
7226+ struct cl_mem_join_msg *joinmsg = msg->msg_iov->iov_base;
7227+ struct cluster_node *node;
7228+
7229+ /* If we are in a state transition then tell the new node to wait a bit
7230+ * longer */
7231+ if (node_state != MEMBER) {
7232+ if (node_state == MASTER || node_state == TRANSITION) {
7233+ send_joinack(msg->msg_name, msg->msg_namelen,
7234+ JOINACK_TYPE_WAIT);
7235+ }
7236+ return 0;
7237+ }
7238+
7239+ /* Check version number */
7240+ if (le32_to_cpu(joinmsg->major_version) == CNXMAN_MAJOR_VERSION) {
7241+ char *ptr = (char *) joinmsg;
7242+ char *name;
7243+
7244+ /* Sanity-check the num_addrs field otherwise we could oops */
7245+ if (le16_to_cpu(joinmsg->num_addr) * address_length > len) {
7246+ printk(KERN_WARNING CMAN_NAME
7247+ ": num_addr in JOIN-REQ message is rubbish: %d\n",
7248+ le16_to_cpu(joinmsg->num_addr));
7249+ return 0;
7250+ }
7251+
7252+ /* Check the cluster name matches */
7253+ if (strcmp(cluster_name, joinmsg->clustername)) {
7254+ printk(KERN_WARNING CMAN_NAME
7255+ ": attempt to join with cluster name '%s' refused\n",
7256+ joinmsg->clustername);
7257+ send_joinack(msg->msg_name, msg->msg_namelen,
7258+ JOINACK_TYPE_NAK);
7259+ return 0;
7260+ }
7261+
7262+ ptr += sizeof (*joinmsg);
7263+ name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
7264+
7265+ /* Check we are not exceeding the maximum number of nodes */
7266+ if (cluster_members > cman_config.max_nodes) {
7267+ printk(KERN_WARNING CMAN_NAME
7268+ ": Join request from %s rejected, exceeds maximum number of nodes\n",
7269+ name);
7270+ send_joinack(msg->msg_name, msg->msg_namelen,
7271+ JOINACK_TYPE_NAK);
7272+ return 0;
7273+ }
7274+
7275+ /* Check that we don't exceed the two_node limit */
7276+ if (two_node && cluster_members == 2) {
7277+ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7278+ "rejected, exceeds two node limit\n", name);
7279+ send_joinack(msg->msg_name, msg->msg_namelen,
7280+ JOINACK_TYPE_NAK);
7281+ return 0;
7282+ }
7283+
7284+ if (le16_to_cpu(joinmsg->config_version) != config_version) {
7285+ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7286+ "rejected, config version local %u remote %u\n",
7287+ name, config_version,
7288+ le16_to_cpu(joinmsg->config_version));
7289+ send_joinack(msg->msg_name, msg->msg_namelen,
7290+ JOINACK_TYPE_NAK);
7291+ return 0;
7292+ }
7293+
7294+ /* If these don't match then I don't know how the message
7295+ arrived! However, I can't take the chance */
7296+ if (le32_to_cpu(joinmsg->addr_len) != address_length) {
7297+ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7298+ "rejected, address length local: %u remote %u\n",
7299+ name, address_length,
7300+ le32_to_cpu(joinmsg->addr_len));
7301+ send_joinack(msg->msg_name, msg->msg_namelen,
7302+ JOINACK_TYPE_NAK);
7303+ return 0;
7304+ }
7305+
7306+ /* Duplicate checking: Because joining messages do not have
7307+ * sequence numbers we may get as many JOINREQ messages as we
7308+ * have interfaces. This bit of code here just checks for
7309+ * JOINREQ messages that come in from the same node in a small
7310+ * period of time and removes the duplicates */
7311+ if (time_before(jiffies, last_joinreq + 10 * HZ)
7312+ && strcmp(name, last_name) == 0) {
7313+ return 0;
7314+ }
7315+
7316+ /* Do we already know about this node? */
7317+ status = check_duplicate_node(name, msg, len);
7318+
7319+ if (status < 0) {
7320+ send_joinack(msg->msg_name, msg->msg_namelen,
7321+ JOINACK_TYPE_NAK);
7322+ return 0;
7323+ }
7324+
7325+ /* OK, you can be in my gang */
7326+ if (status == 0) {
7327+ int i;
7328+ struct sockaddr_cl *addr = msg->msg_name;
7329+
7330+ last_joinreq = jiffies;
7331+ strcpy(last_name, name);
7332+
7333+ node =
7334+ add_new_node(name, joinmsg->votes,
7335+ le32_to_cpu(joinmsg->expected_votes),
7336+ 0, NODESTATE_JOINING);
7337+
7338+ /* Add the node's addresses */
7339+ if (list_empty(&node->addr_list)) {
7340+ for (i = 0; i < le16_to_cpu(joinmsg->num_addr);
7341+ i++) {
7342+ add_node_address(node, ptr, address_length);
7343+ ptr += address_length;
7344+ }
7345+ }
7346+
7347+ send_joinack(msg->msg_name, msg->msg_namelen,
7348+ JOINACK_TYPE_OK);
7349+ joining_node = node;
7350+ joining_temp_nodeid = addr->scl_nodeid;
7351+
7352+ /* Start the state transition */
7353+ start_transition(TRANS_NEWNODE, node);
7354+ }
7355+ }
7356+ else {
7357+ /* Version number mismatch, don't use any part of the message
7358+ * other than the version numbers as things may have moved */
7359+ char buf[MAX_ADDR_PRINTED_LEN];
7360+
7361+ printk(KERN_INFO CMAN_NAME
7362+ ": Got join message from node running incompatible software. (us: %d.%d.%d, them: %d.%d.%d) addr: %s\n",
7363+ CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
7364+ CNXMAN_PATCH_VERSION,
7365+ le32_to_cpu(joinmsg->major_version),
7366+ le32_to_cpu(joinmsg->minor_version),
7367+ le32_to_cpu(joinmsg->patch_version),
7368+ print_addr(msg->msg_name, msg->msg_namelen, buf));
7369+
7370+ send_joinack(msg->msg_name, msg->msg_namelen,
7371+ JOINACK_TYPE_NAK);
7372+ return 0;
7373+ }
7374+
7375+ return 0;
7376+}
7377+
7378+/* A simple function to invent a small number based
7379+ on the node name */
7380+static int node_hash(void)
7381+{
7382+ int i;
7383+ int value = 0;
7384+
7385+ for (i=0; i<strlen(nodename); i++) {
7386+ value += nodename[i];
7387+ }
7388+ return value & 0xF;
7389+}
7390+
7391+/* A new node has stated its intent to form a new cluster. we may have
7392+ * something to say about that... */
7393+static int do_process_newcluster(struct msghdr *msg, int len)
7394+{
7395+ /* If we are also in STARTING state then back down for a random period
7396+ * of time */
7397+ if (node_state == STARTING) {
7398+ P_MEMB("got NEWCLUSTER, backing down for %d seconds\n", node_hash());
7399+ start_time = jiffies + node_hash() * HZ;
7400+ }
7401+
7402+ return 0;
7403+}
7404+
7405+/* Called for each node by the node-message unpacker. Returns -1 if there is a
7406+ * mismatch and the caller will stop processing */
7407+static int check_node(struct cluster_node *newnode, char *addrs,
7408+ unsigned short num_addr)
7409+{
7410+ struct cluster_node *node = find_node_by_name(newnode->name);
7411+
7412+ P_MEMB("check_node: %s", newnode->name);
7413+
7414+ if (!node) {
7415+ C_MEMB(" - not found\n");
7416+ return -1;
7417+ }
7418+
7419+ if (node->votes != newnode->votes ||
7420+ node->node_id != newnode->node_id ||
7421+ node->state != NODESTATE_MEMBER) {
7422+ C_MEMB
7423+ (" - wrong info: votes=%d(exp: %d) id=%d(exp: %d) state = %d\n",
7424+ node->votes, newnode->votes, node->node_id,
7425+ newnode->node_id, node->state);
7426+ return -1;
7427+ }
7428+ C_MEMB(" - OK\n");
7429+ return 0;
7430+}
7431+
7432+/* Called for each new node found in a JOINCONF message. Create a new node
7433+ * entry */
7434+static int add_node(struct cluster_node *node, char *addrs,
7435+ unsigned short num_addr)
7436+{
7437+ P_MEMB("add_node: %s, v:%d, e:%d, i:%d\n", node->name, node->votes,
7438+ node->expected_votes, node->node_id);
7439+
7440+ if (!find_node_by_name(node->name)) {
7441+ struct cluster_node *newnode;
7442+ int i;
7443+
7444+ if ((newnode =
7445+ add_new_node(node->name, node->votes, node->expected_votes,
7446+ node->node_id, NODESTATE_MEMBER)) == NULL) {
7447+ P_MEMB("Error adding node\n");
7448+ return -1;
7449+ }
7450+ if (list_empty(&newnode->addr_list)) {
7451+ for (i = 0; i < num_addr; i++) {
7452+ add_node_address(newnode,
7453+ addrs + i * address_length, address_length);
7454+ }
7455+ }
7456+ return 0;
7457+ }
7458+ else {
7459+ P_MEMB("Already got node with name %s\n", node->name);
7460+ return -1;
7461+ }
7462+}
7463+
7464+/* Call a specified routine for each node unpacked from the message. Return
7465+ * either the number of nodes found or -1 for an error */
7466+static int unpack_nodes(unsigned char *buf, int len,
7467+ int (*routine) (struct cluster_node *, char *,
7468+ unsigned short))
7469+{
7470+ int ptr = 0;
7471+ int num_nodes = 0;
7472+ char nodename[MAX_CLUSTER_MEMBER_NAME_LEN];
7473+ struct cluster_node node;
7474+
7475+ node.name = nodename;
7476+
7477+ while (ptr < len) {
7478+ int namelen = buf[ptr++];
7479+ unsigned int evotes;
7480+ unsigned int node_id;
7481+ unsigned short num_addr;
7482+ unsigned char *addrs;
7483+
7484+ memcpy(nodename, &buf[ptr], namelen);
7485+ nodename[namelen] = '\0';
7486+ ptr += namelen;
7487+
7488+ memcpy(&num_addr, &buf[ptr], sizeof (short));
7489+ num_addr = le16_to_cpu(num_addr);
7490+ ptr += sizeof (short);
7491+
7492+ /* Just make a note of the addrs "array" */
7493+ addrs = &buf[ptr];
7494+ ptr += num_addr * address_length;
7495+
7496+ node.votes = buf[ptr++];
7497+
7498+ memcpy(&evotes, &buf[ptr], sizeof (int));
7499+ node.expected_votes = le32_to_cpu(evotes);
7500+ ptr += sizeof (int);
7501+
7502+ memcpy(&node_id, &buf[ptr], sizeof (int));
7503+ node.node_id = le32_to_cpu(node_id);
7504+ ptr += sizeof (int);
7505+
7506+ /* Call the callback routine */
7507+ if (routine(&node, addrs, num_addr) < 0)
7508+ return -1;
7509+ num_nodes++;
7510+ }
7511+ return num_nodes;
7512+}
7513+
7514+/* Got join confirmation from a master node. This message contains a list of
7515+ * cluster nodes which we unpack and build into our cluster nodes list. When we
7516+ * have the last message we can go into TRANSITION state */
7517+static int do_process_joinconf(struct msghdr *msg, int len)
7518+{
7519+ char *message = msg->msg_iov->iov_base;
7520+
7521+ if (unpack_nodes(message + 2, len - 2, add_node) < 0) {
7522+ printk(CMAN_NAME
7523+ ": Error procssing joinconf message - giving up on cluster join\n");
7524+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
7525+ return -1;
7526+ }
7527+
7528+ /* Last message in the list? */
7529+ if (message[1] & 2) {
7530+ char ackmsg;
7531+ struct sockaddr_cl *addr = msg->msg_name;
7532+
7533+ us->state = NODESTATE_MEMBER;
7534+ node_state = TRANSITION;
7535+ we_are_a_cluster_member = TRUE;
7536+
7537+ ackmsg = CLUSTER_MEM_CONFACK;
7538+ kcl_sendmsg(mem_socket, &ackmsg, 1, addr,
7539+ sizeof (struct sockaddr_cl),
7540+ MSG_NOACK);
7541+ kernel_thread(hello_kthread, NULL, 0);
7542+ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
7543+ }
7544+ return 0;
7545+}
7546+
7547+/* Got the master's view of the cluster - compare it with ours and tell it the
7548+ * result */
7549+static int do_process_masterview(struct msghdr *msg, int len)
7550+{
7551+ char reply[2] = { CLUSTER_MEM_VIEWACK, 0 };
7552+ char *message = msg->msg_iov->iov_base;
7553+ static int num_nodes;
7554+
7555+ /* Someone else's state transition */
7556+ if (node_state != MEMBER &&
7557+ node_state != TRANSITION && node_state != MASTER)
7558+ return 0;
7559+
7560+ /* First message, zero the counter */
7561+ if (message[1] & 1)
7562+ num_nodes = 0;
7563+
7564+ num_nodes +=
7565+ unpack_nodes(msg->msg_iov->iov_base + 2, len - 2, check_node);
7566+
7567+ /* Last message, check the count and reply */
7568+ if (message[1] & 2) {
7569+ if (num_nodes == cluster_members) {
7570+ /* Send ACK */
7571+ reply[1] = 1;
7572+ }
7573+ else {
7574+ P_MEMB
7575+ ("Got %d nodes in MASTERVIEW message, we think there s/b %d\n",
7576+ num_nodes, cluster_members);
7577+ /* Send NAK */
7578+ reply[1] = 0;
7579+ }
7580+ kcl_sendmsg(mem_socket, reply, 2, msg->msg_name,
7581+ msg->msg_namelen, 0);
7582+ }
7583+ return 0;
7584+}
7585+
7586+static int do_process_leave(struct msghdr *msg, int len)
7587+{
7588+ struct cluster_node *node;
7589+ struct sockaddr_cl *saddr = msg->msg_name;
7590+ unsigned char *leavemsg = (unsigned char *) msg->msg_iov->iov_base;
7591+
7592+ if ((node = find_node_by_nodeid(saddr->scl_nodeid))) {
7593+ unsigned char reason = leavemsg[1];
7594+
7595+ if (node->state != NODESTATE_DEAD) {
7596+ printk(KERN_INFO CMAN_NAME
7597+ ": Node %s is leaving the cluster, reason %d\n",
7598+ node->name, reason);
7599+
7600+ node->leave_reason = reason;
7601+ }
7602+ leavereason = (reason == CLUSTER_LEAVEFLAG_REMOVED ? 1 : 0);
7603+
7604+ a_node_just_died(node);
7605+
7606+ /* If it was the master node, then we have been nominated as
7607+ * the sucessor */
7608+ if (node == master_node) {
7609+ start_transition(TRANS_DEADMASTER, master_node);
7610+ }
7611+
7612+ }
7613+ return 0;
7614+}
7615+
7616+static int do_process_hello(struct msghdr *msg, int len)
7617+{
7618+ struct cluster_node *node;
7619+ struct cl_mem_hello_msg *hellomsg =
7620+ (struct cl_mem_hello_msg *) msg->msg_iov->iov_base;
7621+ struct sockaddr_cl *saddr = msg->msg_name;
7622+
7623+ /* We are starting up. Send a join message to the node whose HELLO we
7624+ * just received */
7625+ if (node_state == STARTING || node_state == JOINWAIT) {
7626+ struct sockaddr_cl *addr = msg->msg_name;
7627+
7628+ printk(KERN_INFO CMAN_NAME ": sending membership request\n");
7629+
7630+ send_joinreq(addr, msg->msg_namelen);
7631+ join_time = jiffies;
7632+ node_state = JOINING;
7633+ return 0;
7634+ }
7635+
7636+ /* Only process HELLOs if we are not in transition */
7637+ if (node_state == MEMBER) {
7638+ if (len < sizeof (struct cl_mem_hello_msg)) {
7639+ printk(KERN_ERR CMAN_NAME
7640+ ": short hello message from node %d\n",
7641+ saddr->scl_nodeid);
7642+ return -1;
7643+ }
7644+
7645+ node = find_node_by_nodeid(saddr->scl_nodeid);
7646+ if (node && node->state != NODESTATE_DEAD) {
7647+
7648+ /* Check the cluster generation in the HELLO message.
7649+ * NOTE: this may be different if the message crossed
7650+ * on the wire with an END-TRANS so we allow a period
7651+ * of grace in which this is allowable */
7652+ if (cluster_generation !=
7653+ le32_to_cpu(hellomsg->generation)
7654+ && node_state == MEMBER
7655+ && time_after(jiffies,
7656+ cman_config.hello_timer * HZ +
7657+ transition_end_time)) {
7658+ char killmsg;
7659+
7660+ printk(KERN_INFO CMAN_NAME
7661+ ": bad generation number %d in HELLO message, expected %d\n",
7662+ le32_to_cpu(hellomsg->generation),
7663+ cluster_generation);
7664+
7665+ notify_kernel_listeners(DIED,
7666+ (long) node->node_id);
7667+
7668+ killmsg = CLUSTER_MEM_KILL;
7669+ kcl_sendmsg(mem_socket, &killmsg, 1,
7670+ saddr, sizeof (struct sockaddr_cl),
7671+ MSG_NOACK);
7672+ return 0;
7673+ }
7674+
7675+ if (cluster_members != le16_to_cpu(hellomsg->members)
7676+ && node_state == MEMBER) {
7677+ printk(KERN_INFO CMAN_NAME
7678+ ": nmembers in HELLO message does not match our view\n");
7679+ start_transition(TRANS_CHECK, node);
7680+ return 0;
7681+ }
7682+ /* The message is OK - save the time */
7683+ node->last_hello = jiffies;
7684+
7685+ }
7686+ else {
7687+ struct sockaddr_cl *addr = msg->msg_name;
7688+
7689+ /* This node is a danger to our valid cluster */
7690+ if (cluster_is_quorate) {
7691+ char killmsg;
7692+
7693+ killmsg = CLUSTER_MEM_KILL;
7694+ kcl_sendmsg(mem_socket, &killmsg, 1, addr,
7695+ sizeof (struct sockaddr_cl),
7696+ MSG_NOACK);
7697+ }
7698+
7699+ }
7700+ }
7701+
7702+ return 0;
7703+
7704+}
7705+
7706+static int do_process_kill(struct msghdr *msg, int len)
7707+{
7708+ struct sockaddr_cl *saddr = msg->msg_name;
7709+ struct cluster_node *node;
7710+
7711+ node = find_node_by_nodeid(saddr->scl_nodeid);
7712+ if (node && node->state == NODESTATE_MEMBER) {
7713+
7714+ printk(KERN_INFO CMAN_NAME
7715+ ": Being told to leave the cluster by node %d\n",
7716+ saddr->scl_nodeid);
7717+
7718+ node_state = LEFT_CLUSTER;
7719+ quit_threads = 1;
7720+ wake_up_process(membership_task);
7721+ wake_up_interruptible(&cnxman_waitq);
7722+ }
7723+ else {
7724+ P_MEMB("Asked to leave the cluster by a non-member. What a nerve!\n");
7725+ }
7726+ return 0;
7727+}
7728+
7729+/* Some cluster membership utility functions */
7730+struct cluster_node *find_node_by_name(char *name)
7731+{
7732+ struct list_head *nodelist;
7733+ struct cluster_node *node;
7734+
7735+ down(&cluster_members_lock);
7736+ list_for_each(nodelist, &cluster_members_list) {
7737+ node = list_entry(nodelist, struct cluster_node, list);
7738+
7739+ if (strcmp(node->name, name) == 0) {
7740+ up(&cluster_members_lock);
7741+ return node;
7742+ }
7743+ }
7744+ up(&cluster_members_lock);
7745+ return NULL;
7746+}
7747+
7748+/* Try to avoid using this as it's slow and holds the members lock */
7749+struct cluster_node *find_node_by_addr(unsigned char *addr, int addr_len)
7750+{
7751+ struct list_head *nodelist;
7752+ struct list_head *addrlist;
7753+ struct cluster_node *node;
7754+ struct cluster_node_addr *nodeaddr;
7755+
7756+ down(&cluster_members_lock);
7757+
7758+ list_for_each(nodelist, &cluster_members_list) {
7759+ node = list_entry(nodelist, struct cluster_node, list);
7760+
7761+ list_for_each(addrlist, &node->addr_list) {
7762+ nodeaddr =
7763+ list_entry(addrlist, struct cluster_node_addr,
7764+ list);
7765+
7766+ if (memcmp(nodeaddr->addr, addr, address_length) == 0) {
7767+ up(&cluster_members_lock);
7768+ return node;
7769+ }
7770+ }
7771+ }
7772+
7773+ up(&cluster_members_lock);
7774+ return NULL;
7775+}
7776+
7777+/* This is the quick way to find a node */
7778+struct cluster_node *find_node_by_nodeid(unsigned int id)
7779+{
7780+ struct cluster_node *node;
7781+
7782+ if (id > sizeof_members_array)
7783+ return NULL;
7784+
7785+ spin_lock(&members_by_nodeid_lock);
7786+ node = members_by_nodeid[id];
7787+ spin_unlock(&members_by_nodeid_lock);
7788+ return node;
7789+}
7790+
7791+static int dispatch_messages(struct socket *mem_socket)
7792+{
7793+ int err = 0;
7794+
7795+ while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
7796+ struct msghdr msg;
7797+ struct iovec iov;
7798+ struct sockaddr_cl sin;
7799+ int len;
7800+ mm_segment_t fs;
7801+
7802+ memset(&sin, 0, sizeof (sin));
7803+
7804+ msg.msg_control = NULL;
7805+ msg.msg_controllen = 0;
7806+ msg.msg_iovlen = 1;
7807+ msg.msg_iov = &iov;
7808+ msg.msg_name = &sin;
7809+ msg.msg_namelen = sizeof (sin);
7810+ msg.msg_flags = 0;
7811+
7812+ iov.iov_len = MAX_CLUSTER_MESSAGE;
7813+ iov.iov_base = iobuf;
7814+
7815+ fs = get_fs();
7816+ set_fs(get_ds());
7817+
7818+ len =
7819+ sock_recvmsg(mem_socket, &msg, MAX_CLUSTER_MESSAGE,
7820+ MSG_DONTWAIT);
7821+ set_fs(fs);
7822+ if (len > 0) {
7823+ iov.iov_base = iobuf; /* Reinstate pointer */
7824+ msg.msg_name = &sin;
7825+ do_membership_packet(&msg, len);
7826+ }
7827+ else {
7828+ if (len == -EAGAIN)
7829+ err = 0;
7830+ else
7831+ err = -1;
7832+ break;
7833+ }
7834+ }
7835+ return err;
7836+}
7837+
7838+/* Scan the nodes list for dead nodes */
7839+static void check_for_dead_nodes()
7840+{
7841+ struct list_head *nodelist;
7842+ struct cluster_node *node;
7843+
7844+ down(&cluster_members_lock);
7845+ list_for_each(nodelist, &cluster_members_list) {
7846+ node = list_entry(nodelist, struct cluster_node, list);
7847+
7848+ if (node->state != NODESTATE_DEAD &&
7849+ time_after(jiffies,
7850+ node->last_hello +
7851+ cman_config.deadnode_timeout * HZ) && !node->us) {
7852+
7853+ up(&cluster_members_lock);
7854+
7855+ printk(KERN_WARNING CMAN_NAME
7856+ ": no HELLO from %s, removing from the cluster\n",
7857+ node->name);
7858+
7859+ P_MEMB("last hello was %ld, current time is %ld\n",
7860+ node->last_hello, jiffies);
7861+
7862+ node->leave_reason = CLUSTER_LEAVEFLAG_DEAD;
7863+ leavereason = 0;
7864+
7865+ /* This is unlikely to work but it's worth a try! */
7866+ send_kill(node->node_id);
7867+
7868+ /* Start state transition */
7869+ a_node_just_died(node);
7870+ return;
7871+ }
7872+ }
7873+ up(&cluster_members_lock);
7874+
7875+ /* Also check for a dead quorum device */
7876+ if (quorum_device) {
7877+ if (quorum_device->state == NODESTATE_MEMBER &&
7878+ time_after(jiffies,
7879+ quorum_device->last_hello +
7880+ cman_config.deadnode_timeout * HZ)) {
7881+ quorum_device->state = NODESTATE_DEAD;
7882+ printk(KERN_WARNING CMAN_NAME
7883+ ": Quorum device %s timed out\n",
7884+ quorum_device->name);
7885+ recalculate_quorum(0);
7886+ }
7887+ }
7888+
7889+ return;
7890+}
7891+
7892+/* add "us" as a node in the cluster */
7893+static int add_us()
7894+{
7895+ struct cluster_node *newnode =
7896+ kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
7897+
7898+ if (!newnode) {
7899+ /* Oh shit, we have to commit hara kiri here for the greater
7900+ * good of the cluster */
7901+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
7902+
7903+ printk(KERN_CRIT CMAN_NAME
7904+ ": Cannot allocate memory for our node structure\n");
7905+ panic("Must die");
7906+
7907+ return -1;
7908+ }
7909+
7910+ memset(newnode, 0, sizeof (struct cluster_node));
7911+ newnode->name = kmalloc(strlen(nodename) + 1, GFP_KERNEL);
7912+ if (!newnode->name) {
7913+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
7914+
7915+ printk(KERN_CRIT CMAN_NAME
7916+ ": Cannot allocate memory for node name\n");
7917+ kfree(newnode);
7918+
7919+ panic("Must die");
7920+
7921+ return -1;
7922+ }
7923+
7924+ strcpy(newnode->name, nodename);
7925+ newnode->last_hello = jiffies;
7926+ newnode->votes = votes;
7927+ newnode->expected_votes = expected_votes;
7928+ newnode->state = NODESTATE_JOINING;
7929+ newnode->node_id = 0; /* Will get filled in by ENDTRANS message */
7930+ newnode->us = 1;
7931+ newnode->leave_reason = 0;
7932+ INIT_LIST_HEAD(&newnode->addr_list);
7933+ get_local_addresses(newnode); /* Get from cnxman socket info */
7934+
7935+ /* Add the new node to the list */
7936+ down(&cluster_members_lock);
7937+ list_add(&newnode->list, &cluster_members_list);
7938+ cluster_members++;
7939+ up(&cluster_members_lock);
7940+ us = newnode;
7941+
7942+ return 0;
7943+}
7944+
7945+/* Return the highest known node_id */
7946+unsigned int get_highest_nodeid()
7947+{
7948+ struct list_head *nodelist;
7949+ struct cluster_node *node = NULL;
7950+ unsigned int highest = 0;
7951+
7952+ down(&cluster_members_lock);
7953+ list_for_each(nodelist, &cluster_members_list) {
7954+ node = list_entry(nodelist, struct cluster_node, list);
7955+
7956+ if (node->node_id > highest)
7957+ highest = node->node_id;
7958+ }
7959+ up(&cluster_members_lock);
7960+
7961+ return highest;
7962+}
7963+
7964+/* Elect a new master if there is a clash. Returns 1 if we are the new master,
7965+ * the master's struct will also be returned. This, rather primitively, uses
7966+ * the lowest node ID */
7967+static int elect_master(struct cluster_node **master_node)
7968+{
7969+ int i;
7970+
7971+ for (i = 1; i < sizeof_members_array; i++) {
7972+ if (members_by_nodeid[i]
7973+ && members_by_nodeid[i]->state == NODESTATE_MEMBER) {
7974+ *master_node = members_by_nodeid[i];
7975+ P_MEMB("Elected master is %s\n", (*master_node)->name);
7976+ return (*master_node)->us;
7977+ }
7978+ }
7979+ BUG();
7980+ return 0;
7981+}
7982+
7983+/* Called by node_cleanup in cnxman when we have left the cluster */
7984+void free_nodeid_array()
7985+{
7986+ vfree(members_by_nodeid);
7987+ members_by_nodeid = NULL;
7988+ sizeof_members_array = 0;
7989+}
7990+
7991+int allocate_nodeid_array()
7992+{
7993+ /* Allocate space for the nodeid lookup array */
7994+ if (!members_by_nodeid) {
7995+ spin_lock_init(&members_by_nodeid_lock);
7996+ members_by_nodeid =
7997+ vmalloc(cman_config.max_nodes *
7998+ sizeof (struct cluster_member *));
7999+ }
8000+
8001+ if (!members_by_nodeid) {
8002+ printk(KERN_WARNING
8003+ "Unable to allocate members array for %d members\n",
8004+ cman_config.max_nodes);
8005+ return -ENOMEM;
8006+ }
8007+ memset(members_by_nodeid, 0,
8008+ cman_config.max_nodes * sizeof (struct cluster_member *));
8009+ sizeof_members_array = cman_config.max_nodes;
8010+
8011+ return 0;
8012+}
8013+
8014+/* Set the votes & expected_votes variables */
8015+void set_votes(int v, int e)
8016+{
8017+ votes = v;
8018+ expected_votes = e;
8019+}
8020+
8021+int get_quorum()
8022+{
8023+ return quorum;
8024+}
8025+
8026+/* Called by cnxman to see if activity should be blocked because we are in a
8027+ * state transition */
8028+int in_transition()
8029+{
8030+ return node_state == TRANSITION ||
8031+ node_state == TRANSITION_COMPLETE || node_state == MASTER;
8032+}
8033+
8034+/* Return the current membership state as a string for the main line to put
8035+ * into /proc . I really should be using snprintf rather than sprintf but it's
8036+ * not exported... */
8037+char *membership_state(char *buf, int buflen)
8038+{
8039+ switch (node_state) {
8040+ case STARTING:
8041+ strncpy(buf, "Starting", buflen);
8042+ break;
8043+ case JOINING:
8044+ strncpy(buf, "Joining", buflen);
8045+ break;
8046+ case JOINWAIT:
8047+ strncpy(buf, "Join-Wait", buflen);
8048+ break;
8049+ case JOINACK:
8050+ strncpy(buf, "Join-Ack", buflen);
8051+ break;
8052+ case TRANSITION:
8053+ sprintf(buf, "State-Transition: Master is %s",
8054+ master_node ? master_node->name : "Unknown");
8055+ break;
8056+ case MEMBER:
8057+ strncpy(buf, "Cluster-Member", buflen);
8058+ break;
8059+ case REJECTED:
8060+ strncpy(buf, "Rejected", buflen);
8061+ break;
8062+ case LEFT_CLUSTER:
8063+ strncpy(buf, "Left-Cluster", buflen);
8064+ break;
8065+ case TRANSITION_COMPLETE:
8066+ strncpy(buf, "Transition-Complete", buflen);
8067+ break;
8068+ case MASTER:
8069+ strncpy(buf, "Transition-Master", buflen);
8070+ break;
8071+ default:
8072+ sprintf(buf, "Unknown: code=%d", node_state);
8073+ break;
8074+ }
8075+
8076+ return buf;
8077+}
8078+
8079+#ifdef DEBUG_MEMB
8080+static char *msgname(int msg)
8081+{
8082+ switch (msg) {
8083+ case CLUSTER_MEM_JOINCONF:
8084+ return "JOINCONF";
8085+ case CLUSTER_MEM_JOINREQ:
8086+ return "JOINREQ";
8087+ case CLUSTER_MEM_LEAVE:
8088+ return "LEAVE";
8089+ case CLUSTER_MEM_HELLO:
8090+ return "HELLO";
8091+ case CLUSTER_MEM_KILL:
8092+ return "KILL";
8093+ case CLUSTER_MEM_JOINACK:
8094+ return "JOINACK";
8095+ case CLUSTER_MEM_ENDTRANS:
8096+ return "ENDTRANS";
8097+ case CLUSTER_MEM_RECONFIG:
8098+ return "RECONFIG";
8099+ case CLUSTER_MEM_MASTERVIEW:
8100+ return "MASTERVIEW";
8101+ case CLUSTER_MEM_STARTTRANS:
8102+ return "STARTTRANS";
8103+ case CLUSTER_MEM_JOINREJ:
8104+ return "JOINREJ";
8105+ case CLUSTER_MEM_VIEWACK:
8106+ return "VIEWACK";
8107+ case CLUSTER_MEM_STARTACK:
8108+ return "STARTACK";
8109+ case CLUSTER_MEM_NEWCLUSTER:
8110+ return "NEWCLUSTER";
8111+ case CLUSTER_MEM_CONFACK:
8112+ return "CONFACK";
8113+ case CLUSTER_MEM_NOMINATE:
8114+ return "NOMINATE";
8115+
8116+ default:
8117+ return "??UNKNOWN??";
8118+ }
8119+}
8120+
8121+#endif
8122+
8123+/*
8124+ * Overrides for Emacs so that we follow Linus's tabbing style.
8125+ * Emacs will notice this stuff at the end of the file and automatically
8126+ * adjust the settings for this buffer only. This must remain at the end
8127+ * of the file.
8128+ * ---------------------------------------------------------------------------
8129+ * Local variables:
8130+ * c-file-style: "linux"
8131+ * End:
8132+ */
8133diff -urN linux-orig/cluster/cman/proc.c linux-patched/cluster/cman/proc.c
8134--- linux-orig/cluster/cman/proc.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8135+++ linux-patched/cluster/cman/proc.c 2004-06-29 20:07:50.000000000 +0800
4bf12011 8136@@ -0,0 +1,364 @@
8137+/******************************************************************************
8138+*******************************************************************************
8139+**
8140+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8141+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8142+**
8143+** This copyrighted material is made available to anyone wishing to use,
8144+** modify, copy, or redistribute it subject to the terms and conditions
8145+** of the GNU General Public License v.2.
8146+**
8147+*******************************************************************************
8148+******************************************************************************/
8149+
8150+#include <linux/init.h>
8151+#include <linux/socket.h>
8152+#include <linux/kernel.h>
8153+#include <linux/sched.h>
8154+#include <linux/file.h>
8155+#include <linux/proc_fs.h>
8156+#include <linux/seq_file.h>
8157+#include <linux/list.h>
8158+#include <linux/in.h>
8159+#include <net/sock.h>
8160+#include <cluster/cnxman.h>
8161+#include <cluster/service.h>
8162+
8163+#include "cnxman-private.h"
8164+#include "config.h"
8165+
8166+extern int cluster_members;
8167+extern struct list_head cluster_members_list;
8168+extern struct semaphore cluster_members_lock;
8169+extern struct cluster_node *quorum_device;
8170+extern int we_are_a_cluster_member;
8171+extern int cluster_is_quorate;
8172+extern unsigned short cluster_id;
8173+extern atomic_t use_count;
8174+extern unsigned int address_length;
8175+extern unsigned int config_version;
8176+extern char cluster_name[];
8177+extern struct cluster_node *us;
8178+static struct seq_operations cluster_info_op;
8179+
8180+int sm_procdata(char *b, char **start, off_t offset, int length);
8181+int sm_debug_info(char *b, char **start, off_t offset, int length);
8182+
8183+/* /proc interface to the configuration struct */
8184+static struct config_proc_info {
8185+ char *name;
8186+ int *value;
8187+} config_proc[] = {
8188+ {
8189+ .name = "joinwait_timeout",
8190+ .value = &cman_config.joinwait_timeout,
8191+ },
8192+ {
8193+ .name = "joinconf_timeout",
8194+ .value = &cman_config.joinconf_timeout,
8195+ },
8196+ {
8197+ .name = "join_timeout",
8198+ .value = &cman_config.join_timeout,
8199+ },
8200+ {
8201+ .name = "hello_timer",
8202+ .value = &cman_config.hello_timer,
8203+ },
8204+ {
8205+ .name = "deadnode_timeout",
8206+ .value = &cman_config.deadnode_timeout,
8207+ },
8208+ {
8209+ .name = "transition_timeout",
8210+ .value = &cman_config.transition_timeout,
8211+ },
8212+ {
8213+ .name = "transition_restarts",
8214+ .value = &cman_config.transition_restarts,
8215+ },
8216+ {
8217+ .name = "max_nodes",
8218+ .value = &cman_config.max_nodes,
8219+ },
8220+ {
8221+ .name = "sm_debug_size",
8222+ .value = &cman_config.sm_debug_size,
8223+ },
8224+};
8225+
8226+
8227+static int proc_cluster_status(char *b, char **start, off_t offset, int length)
8228+{
8229+ struct list_head *nodelist;
8230+ struct cluster_node *node;
8231+ struct cluster_node_addr *node_addr;
8232+ unsigned int total_votes = 0;
8233+ unsigned int max_expected = 0;
8234+ int c = 0;
8235+ char node_buf[MAX_CLUSTER_MEMBER_NAME_LEN];
8236+
8237+ if (!we_are_a_cluster_member) {
8238+ c += sprintf(b+c, "Not a cluster member. State: %s\n",
8239+ membership_state(node_buf,
8240+ sizeof (node_buf)));
8241+ return c;
8242+ }
8243+
8244+ /* Total the votes */
8245+ down(&cluster_members_lock);
8246+ list_for_each(nodelist, &cluster_members_list) {
8247+ node = list_entry(nodelist, struct cluster_node, list);
8248+ if (node->state == NODESTATE_MEMBER) {
8249+ total_votes += node->votes;
8250+ max_expected =
8251+ max(max_expected, node->expected_votes);
8252+ }
8253+ }
8254+ up(&cluster_members_lock);
8255+
8256+ if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
8257+ total_votes += quorum_device->votes;
8258+
8259+ c += sprintf(b+c,
8260+ "Version: %d.%d.%d\nConfig version: %d\nCluster name: %s\nCluster ID: %d\nMembership state: %s\n",
8261+ CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
8262+ CNXMAN_PATCH_VERSION,
8263+ config_version,
8264+ cluster_name, cluster_id,
8265+ membership_state(node_buf, sizeof (node_buf)));
8266+ c += sprintf(b+c,
8267+ "Nodes: %d\nExpected_votes: %d\nTotal_votes: %d\nQuorum: %d %s\n",
8268+ cluster_members, max_expected, total_votes,
8269+ get_quorum(),
8270+ cluster_is_quorate ? " " : "Activity blocked");
8271+ c += sprintf(b+c, "Active subsystems: %d\n",
8272+ atomic_read(&use_count));
8273+
8274+
8275+ c += sprintf(b+c, "Node addresses: ");
8276+ list_for_each_entry(node_addr, &us->addr_list, list) {
8277+ struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)node_addr->addr;
8278+ if (saddr->sin6_family == AF_INET6) {
8279+ c += sprintf(b+c, "%x:%x:%x:%x:%x:%x:%x:%x ",
8280+ be16_to_cpu(saddr->sin6_addr.s6_addr16[0]),
8281+ be16_to_cpu(saddr->sin6_addr.s6_addr16[1]),
8282+ be16_to_cpu(saddr->sin6_addr.s6_addr16[2]),
8283+ be16_to_cpu(saddr->sin6_addr.s6_addr16[3]),
8284+ be16_to_cpu(saddr->sin6_addr.s6_addr16[4]),
8285+ be16_to_cpu(saddr->sin6_addr.s6_addr16[5]),
8286+ be16_to_cpu(saddr->sin6_addr.s6_addr16[6]),
8287+ be16_to_cpu(saddr->sin6_addr.s6_addr16[7]));
8288+ }
8289+ else {
8290+ struct sockaddr_in *saddr4 = (struct sockaddr_in *)saddr;
8291+ uint8_t *addr = (uint8_t *)&saddr4->sin_addr;
8292+ c+= sprintf(b+c, "%u.%u.%u.%u ",
8293+ addr[0], addr[1], addr[2], addr[3]);
8294+ }
8295+ }
8296+ c += sprintf(b+c, "\n\n");
8297+ return c;
8298+}
8299+
8300+
8301+/* Allocate one of these for /proc/cluster/nodes so we can keep a track of where
8302+ * we are */
8303+struct cluster_seq_info {
8304+ int nodeid;
8305+ int highest_nodeid;
8306+};
8307+
8308+static int cluster_open(struct inode *inode, struct file *file)
8309+{
8310+ return seq_open(file, &cluster_info_op);
8311+}
8312+
8313+static void *cluster_seq_start(struct seq_file *m, loff_t * pos)
8314+{
8315+ struct cluster_seq_info *csi =
8316+ kmalloc(sizeof (struct cluster_seq_info), GFP_KERNEL);
8317+
8318+ if (!csi)
8319+ return NULL;
8320+
8321+ /* Keep highest_nodeid here so we don't need to keep traversing the
8322+ * list to find it */
8323+ csi->nodeid = *pos;
8324+ csi->highest_nodeid = get_highest_nodeid();
8325+
8326+ /* Print the header */
8327+ if (*pos == 0) {
8328+ seq_printf(m,
8329+ "Node Votes Exp Sts Name\n");
8330+ return csi;
8331+ }
8332+ return csi;
8333+}
8334+
8335+static void *cluster_seq_next(struct seq_file *m, void *p, loff_t * pos)
8336+{
8337+ struct cluster_seq_info *csi = p;
8338+
8339+ *pos = ++csi->nodeid;
8340+ if (csi->nodeid > csi->highest_nodeid)
8341+ return NULL;
8342+
8343+ return csi;
8344+}
8345+
8346+static int cluster_seq_show(struct seq_file *m, void *p)
8347+{
8348+ char state = '?';
8349+ struct cluster_node *node;
8350+ struct cluster_seq_info *csi = p;
8351+
8352+ /*
8353+ * If we have "0" here then display the quorum device if
8354+ * there is one.
8355+ */
8356+ if (csi->nodeid == 0)
8357+ node = quorum_device;
8358+ else
8359+ node = find_node_by_nodeid(csi->nodeid);
8360+
8361+ if (!node)
8362+ return 0;
8363+
8364+ /* Make state printable */
8365+ switch (node->state) {
8366+ case NODESTATE_MEMBER:
8367+ state = 'M';
8368+ break;
8369+ case NODESTATE_JOINING:
8370+ state = 'J';
8371+ break;
8372+ case NODESTATE_REMOTEMEMBER:
8373+ state = 'R';
8374+ break;
8375+ case NODESTATE_DEAD:
8376+ state = 'X';
8377+ break;
8378+ }
8379+ seq_printf(m, " %3d %3d %3d %c %s\n",
8380+ node->node_id,
8381+ node->votes,
8382+ node->expected_votes,
8383+ state,
8384+ node->name);
8385+
8386+ return 0;
8387+}
8388+
8389+static void cluster_seq_stop(struct seq_file *m, void *p)
8390+{
8391+ kfree(p);
8392+}
8393+
8394+static struct seq_operations cluster_info_op = {
8395+ .start = cluster_seq_start,
8396+ .next = cluster_seq_next,
8397+ .stop = cluster_seq_stop,
8398+ .show = cluster_seq_show
8399+};
8400+
8401+static struct file_operations cluster_fops = {
8402+ .open = cluster_open,
8403+ .read = seq_read,
8404+ .llseek = seq_lseek,
8405+ .release = seq_release,
8406+};
8407+
8408+static int cman_config_read_proc(char *page, char **start, off_t off, int count,
8409+ int *eof, void *data)
8410+{
8411+ struct config_proc_info *cinfo = data;
8412+
8413+ return snprintf(page, count, "%d\n", *cinfo->value);
8414+}
8415+
8416+static int cman_config_write_proc(struct file *file, const char *buffer,
8417+ unsigned long count, void *data)
8418+{
8419+ struct config_proc_info *cinfo = data;
8420+ int value;
8421+ char *end;
8422+
8423+ value = simple_strtoul(buffer, &end, 10);
8424+ if (*end) {
8425+ *cinfo->value = value;
8426+ }
8427+ return count;
8428+}
8429+
8430+/* Base of the config directory for cman */
8431+static struct proc_dir_entry *proc_cman_config;
8432+void create_proc_entries(void)
8433+{
8434+ struct proc_dir_entry *procentry;
8435+ struct proc_dir_entry *proc_cluster;
8436+ int i;
8437+
8438+ proc_cluster = proc_mkdir("cluster", 0);
8439+ if (!proc_cluster)
8440+ return;
8441+ proc_cluster->owner = THIS_MODULE;
8442+
8443+ /* Config dir filled in by us and others */
8444+ if (!proc_mkdir("cluster/config", 0))
8445+ return;
8446+
8447+ /* Don't much care if this fails, it's hardly vital */
8448+ procentry = create_proc_entry("cluster/nodes", S_IRUGO, NULL);
8449+ if (procentry)
8450+ procentry->proc_fops = &cluster_fops;
8451+
8452+ procentry = create_proc_entry("cluster/status", S_IRUGO, NULL);
8453+ if (procentry)
8454+ procentry->get_info = proc_cluster_status;
8455+
8456+ procentry = create_proc_entry("cluster/services", S_IRUGO, NULL);
8457+ if (procentry)
8458+ procentry->get_info = sm_procdata;
8459+
8460+ /* Config entries */
8461+ proc_cman_config = proc_mkdir("cluster/config/cman", 0);
8462+ if (!proc_cman_config)
8463+ return;
8464+
8465+ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
8466+ procentry = create_proc_entry(config_proc[i].name, 0660,
8467+ proc_cman_config);
8468+ if (procentry) {
8469+ procentry->data = &config_proc[i];
8470+ procentry->write_proc = cman_config_write_proc;
8471+ procentry->read_proc = cman_config_read_proc;
8472+ }
8473+ }
8474+
8475+ procentry = create_proc_entry("cluster/sm_debug", S_IRUGO, NULL);
8476+ if (procentry)
8477+ procentry->get_info = sm_debug_info;
8478+}
8479+
8480+void cleanup_proc_entries(void)
8481+{
8482+ int i, config_count;
8483+
8484+ remove_proc_entry("cluster/sm_debug", NULL);
8485+
8486+ config_count = sizeof(config_proc) / sizeof(struct config_proc_info);
8487+
8488+ if (proc_cman_config) {
8489+ for (i=0; i<config_count; i++)
8490+ remove_proc_entry(config_proc[i].name, proc_cman_config);
8491+ }
8492+ remove_proc_entry("cluster/config/cman", NULL);
8493+ remove_proc_entry("cluster/config", NULL);
8494+
8495+ remove_proc_entry("cluster/nodes", NULL);
8496+ remove_proc_entry("cluster/status", NULL);
8497+ remove_proc_entry("cluster/services", NULL);
8498+ remove_proc_entry("cluster/config", NULL);
8499+ remove_proc_entry("cluster", NULL);
8500+}
8501diff -urN linux-orig/cluster/cman/sm.h linux-patched/cluster/cman/sm.h
8502--- linux-orig/cluster/cman/sm.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8503+++ linux-patched/cluster/cman/sm.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 8504@@ -0,0 +1,108 @@
8505+/******************************************************************************
8506+*******************************************************************************
8507+**
8508+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8509+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8510+**
8511+** This copyrighted material is made available to anyone wishing to use,
8512+** modify, copy, or redistribute it subject to the terms and conditions
8513+** of the GNU General Public License v.2.
8514+**
8515+*******************************************************************************
8516+******************************************************************************/
8517+
8518+#ifndef __SM_DOT_H__
8519+#define __SM_DOT_H__
8520+
8521+/*
8522+ * This is the main header file to be included in each Service Manager source
8523+ * file.
8524+ */
8525+
8526+#include <linux/list.h>
8527+#include <linux/socket.h>
8528+#include <linux/kernel.h>
8529+#include <linux/sched.h>
8530+#include <linux/file.h>
8531+#include <net/sock.h>
8532+
8533+#include <cluster/cnxman.h>
8534+#include <cluster/service.h>
8535+
8536+#define SG_LEVELS (4)
8537+
8538+#include "sm_internal.h"
8539+#include "sm_barrier.h"
8540+#include "sm_control.h"
8541+#include "sm_daemon.h"
8542+#include "sm_joinleave.h"
8543+#include "sm_membership.h"
8544+#include "sm_message.h"
8545+#include "sm_misc.h"
8546+#include "sm_recover.h"
8547+#include "sm_services.h"
8548+
8549+extern struct list_head sm_sg[SG_LEVELS];
8550+extern struct semaphore sm_sglock;
8551+
8552+#ifndef TRUE
8553+#define TRUE (1)
8554+#endif
8555+
8556+#ifndef FALSE
8557+#define FALSE (0)
8558+#endif
8559+
8560+#define SM_ASSERT(x, do) \
8561+{ \
8562+ if (!(x)) \
8563+ { \
8564+ printk("\nSM: Assertion failed on line %d of file %s\n" \
8565+ "SM: assertion: \"%s\"\n" \
8566+ "SM: time = %lu\n", \
8567+ __LINE__, __FILE__, #x, jiffies); \
8568+ {do} \
8569+ printk("\n"); \
8570+ panic("SM: Record message above and reboot.\n"); \
8571+ } \
8572+}
8573+
8574+#define SM_RETRY(do_this, until_this) \
8575+for (;;) \
8576+{ \
8577+ do { do_this; } while (0); \
8578+ if (until_this) \
8579+ break; \
8580+ printk("SM: out of memory: %s, %u\n", __FILE__, __LINE__); \
8581+ schedule();\
8582+}
8583+
8584+
8585+#define log_print(fmt, args...) printk("SM: "fmt"\n", ##args)
8586+
8587+#define log_error(sg, fmt, args...) \
8588+ printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
8589+
8590+
8591+#define SM_DEBUG_LOG
8592+
8593+#ifdef SM_DEBUG_CONSOLE
8594+#define log_debug(sg, fmt, args...) \
8595+ printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
8596+#endif
8597+
8598+#ifdef SM_DEBUG_LOG
8599+#define log_debug(sg, fmt, args...) sm_debug_log(sg, fmt, ##args);
8600+#endif
8601+
8602+#ifdef SM_DEBUG_ALL
8603+#define log_debug(sg, fmt, args...) \
8604+do \
8605+{ \
8606+ printk("SM: %08x "fmt"\n", (sg)->global_id, ##args); \
8607+ sm_debug_log(sg, fmt, ##args); \
8608+} \
8609+while (0)
8610+#endif
8611+
8612+#endif /* __SM_DOT_H__ */
8613diff -urN linux-orig/cluster/cman/sm_barrier.c linux-patched/cluster/cman/sm_barrier.c
8614--- linux-orig/cluster/cman/sm_barrier.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8615+++ linux-patched/cluster/cman/sm_barrier.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 8616@@ -0,0 +1,232 @@
8617+/******************************************************************************
8618+*******************************************************************************
8619+**
8620+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8621+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8622+**
8623+** This copyrighted material is made available to anyone wishing to use,
8624+** modify, copy, or redistribute it subject to the terms and conditions
8625+** of the GNU General Public License v.2.
8626+**
8627+*******************************************************************************
8628+******************************************************************************/
8629+
8630+#include "sm.h"
8631+
8632+static struct list_head barriers;
8633+static spinlock_t barriers_lock;
8634+
8635+struct bc_entry {
8636+ struct list_head list;
8637+ uint32_t gid;
8638+ int status;
8639+ char type;
8640+};
8641+typedef struct bc_entry bc_entry_t;
8642+
8643+void init_barriers(void)
8644+{
8645+ INIT_LIST_HEAD(&barriers);
8646+ spin_lock_init(&barriers_lock);
8647+}
8648+
8649+static int atoi(char *c)
8650+{
8651+ int x = 0;
8652+
8653+ while ('0' <= *c && *c <= '9') {
8654+ x = x * 10 + (*c - '0');
8655+ c++;
8656+ }
8657+ return x;
8658+}
8659+
8660+static void add_barrier_callback(char *name, int status, int type)
8661+{
8662+ char *p;
8663+ uint32_t gid;
8664+ bc_entry_t *be;
8665+
8666+ /* an ESRCH callback just means there was a cnxman transition */
8667+ if (status == -ESRCH)
8668+ return;
8669+
8670+ /* extract global id of SG from barrier name */
8671+ p = strstr(name, "sm.");
8672+
8673+ SM_ASSERT(p, printk("name=\"%s\" status=%d\n", name, status););
8674+
8675+ p += strlen("sm.");
8676+ gid = atoi(p);
8677+
8678+ SM_RETRY(be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC), be);
8679+
8680+ be->gid = gid;
8681+ be->status = status;
8682+ be->type = type;
8683+
8684+ spin_lock(&barriers_lock);
8685+ list_add_tail(&be->list, &barriers);
8686+ spin_unlock(&barriers_lock);
8687+
8688+ wake_serviced(DO_BARRIERS);
8689+}
8690+
8691+static void callback_recovery_barrier(char *name, int status)
8692+{
8693+ add_barrier_callback(name, status, SM_BARRIER_RECOVERY);
8694+}
8695+
8696+static void callback_startdone_barrier_new(char *name, int status)
8697+{
8698+ add_barrier_callback(name, status, SM_BARRIER_STARTDONE_NEW);
8699+}
8700+
8701+static void callback_startdone_barrier(char *name, int status)
8702+{
8703+ add_barrier_callback(name, status, SM_BARRIER_STARTDONE);
8704+}
8705+
8706+int sm_barrier(char *name, int count, int type)
8707+{
8708+ int error;
8709+ unsigned long fn = 0;
8710+
8711+ switch (type) {
8712+ case SM_BARRIER_STARTDONE:
8713+ fn = (unsigned long) callback_startdone_barrier;
8714+ break;
8715+ case SM_BARRIER_STARTDONE_NEW:
8716+ fn = (unsigned long) callback_startdone_barrier_new;
8717+ break;
8718+ case SM_BARRIER_RECOVERY:
8719+ fn = (unsigned long) callback_recovery_barrier;
8720+ break;
8721+ }
8722+
8723+ error = kcl_barrier_register(name, 0, count);
8724+ if (error) {
8725+ log_print("barrier register error %d", error);
8726+ goto fail;
8727+ }
8728+
8729+ error = kcl_barrier_setattr(name, BARRIER_SETATTR_AUTODELETE, TRUE);
8730+ if (error) {
8731+ log_print("barrier setattr autodel error %d", error);
8732+ goto fail_bar;
8733+ }
8734+
8735+ error = kcl_barrier_setattr(name, BARRIER_SETATTR_CALLBACK, fn);
8736+ if (error) {
8737+ log_print("barrier setattr cb error %d", error);
8738+ goto fail_bar;
8739+ }
8740+
8741+ error = kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, TRUE);
8742+ if (error) {
8743+ log_print("barrier setattr enabled error %d", error);
8744+ goto fail_bar;
8745+ }
8746+
8747+ return 0;
8748+
8749+ fail_bar:
8750+ kcl_barrier_delete(name);
8751+ fail:
8752+ return error;
8753+}
8754+
8755+void process_startdone_barrier_new(sm_group_t *sg, int status)
8756+{
8757+ sm_sevent_t *sev = sg->sevent;
8758+
8759+ if (!test_and_clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags)) {
8760+ log_debug(sev->se_sg, "ignore barrier cb status %d", status);
8761+ return;
8762+ }
8763+
8764+ sev->se_barrier_status = status;
8765+ sev->se_state = SEST_BARRIER_DONE;
8766+ set_bit(SEFL_CHECK, &sev->se_flags);
8767+ wake_serviced(DO_JOINLEAVE);
8768+}
8769+
8770+void process_startdone_barrier(sm_group_t *sg, int status)
8771+{
8772+ sm_uevent_t *uev = &sg->uevent;
8773+
8774+ if (!test_and_clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags)) {
8775+ log_debug(sg, "ignore barrier cb status %d", status);
8776+ return;
8777+ }
8778+
8779+ uev->ue_barrier_status = status;
8780+ uev->ue_state = UEST_BARRIER_DONE;
8781+ set_bit(UEFL_CHECK, &uev->ue_flags);
8782+ wake_serviced(DO_MEMBERSHIP);
8783+}
8784+
8785+void process_recovery_barrier(sm_group_t *sg, int status)
8786+{
8787+ if (status) {
8788+ log_error(sg, "process_recovery_barrier status=%d", status);
8789+ return;
8790+ }
8791+
8792+ if (sg->state != SGST_RECOVER ||
8793+ sg->recover_state != RECOVER_BARRIERWAIT) {
8794+ log_error(sg, "process_recovery_barrier state %d recover %d",
8795+ sg->state, sg->recover_state);
8796+ return;
8797+ }
8798+
8799+ if (!sg->recover_stop)
8800+ sg->recover_state = RECOVER_STOP;
8801+ else
8802+ sg->recover_state = RECOVER_BARRIERDONE;
8803+
8804+ wake_serviced(DO_RECOVERIES);
8805+}
8806+
8807+void process_barriers(void)
8808+{
8809+ sm_group_t *sg;
8810+ bc_entry_t *be;
8811+
8812+ while (1) {
8813+ be = NULL;
8814+
8815+ spin_lock(&barriers_lock);
8816+ if (!list_empty(&barriers)) {
8817+ be = list_entry(barriers.next, bc_entry_t, list);
8818+ list_del(&be->list);
8819+ }
8820+ spin_unlock(&barriers_lock);
8821+
8822+ if (!be)
8823+ break;
8824+
8825+ sg = sm_global_id_to_sg(be->gid);
8826+ if (!sg) {
8827+ log_print("process_barriers: no sg %08x", be->gid);
8828+ break;
8829+ }
8830+
8831+ switch (be->type) {
8832+ case SM_BARRIER_STARTDONE_NEW:
8833+ process_startdone_barrier_new(sg, be->status);
8834+ break;
8835+
8836+ case SM_BARRIER_STARTDONE:
8837+ process_startdone_barrier(sg, be->status);
8838+ break;
8839+
8840+ case SM_BARRIER_RECOVERY:
8841+ process_recovery_barrier(sg, be->status);
8842+ break;
8843+ }
8844+
8845+ kfree(be);
8846+ schedule();
8847+ }
8848+}
8849diff -urN linux-orig/cluster/cman/sm_barrier.h linux-patched/cluster/cman/sm_barrier.h
8850--- linux-orig/cluster/cman/sm_barrier.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8851+++ linux-patched/cluster/cman/sm_barrier.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 8852@@ -0,0 +1,29 @@
8853+/******************************************************************************
8854+*******************************************************************************
8855+**
8856+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8857+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8858+**
8859+** This copyrighted material is made available to anyone wishing to use,
8860+** modify, copy, or redistribute it subject to the terms and conditions
8861+** of the GNU General Public License v.2.
8862+**
8863+*******************************************************************************
8864+******************************************************************************/
8865+
8866+#ifndef __SM_BARRIER_DOT_H__
8867+#define __SM_BARRIER_DOT_H__
8868+
8869+#define SM_BARRIER_STARTDONE (0)
8870+#define SM_BARRIER_STARTDONE_NEW (1)
8871+#define SM_BARRIER_RECOVERY (2)
8872+#define SM_BARRIER_RESET (3)
8873+
8874+void init_barriers(void);
8875+void process_barriers(void);
8876+int sm_barrier(char *name, int count, int type);
8877+void process_startdone_barrier(sm_group_t *sg, int status);
8878+void process_startdone_barrier_new(sm_group_t *sg, int status);
8879+void process_recovery_barrier(sm_group_t *sg, int status);
8880+
8881+#endif
8882diff -urN linux-orig/cluster/cman/sm_control.c linux-patched/cluster/cman/sm_control.c
8883--- linux-orig/cluster/cman/sm_control.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8884+++ linux-patched/cluster/cman/sm_control.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 8885@@ -0,0 +1,156 @@
8886+/******************************************************************************
8887+*******************************************************************************
8888+**
8889+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8890+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8891+**
8892+** This copyrighted material is made available to anyone wishing to use,
8893+** modify, copy, or redistribute it subject to the terms and conditions
8894+** of the GNU General Public License v.2.
8895+**
8896+*******************************************************************************
8897+******************************************************************************/
8898+
8899+#include "sm.h"
8900+#include "config.h"
8901+
8902+struct socket * sm_socket;
8903+uint32_t * sm_new_nodeids;
8904+uint32_t sm_our_nodeid;
8905+int sm_quorum, sm_quorum_next;
8906+struct list_head sm_members;
8907+int sm_member_count;
8908+
8909+
8910+/*
8911+ * Context: cnxman
8912+ * Called by cnxman when it has a new member list.
8913+ */
8914+
8915+void sm_member_update(int quorate)
8916+{
8917+ sm_quorum_next = quorate;
8918+ wake_serviced(DO_START_RECOVERY);
8919+}
8920+
8921+/*
8922+ * Context: cnxman
8923+ * Called when module is loaded.
8924+ */
8925+
8926+void sm_init(void)
8927+{
8928+ sm_socket = NULL;
8929+ sm_new_nodeids = NULL;
8930+ sm_quorum = 0;
8931+ sm_quorum_next = 0;
8932+ sm_our_nodeid = 0;
8933+ INIT_LIST_HEAD(&sm_members);
8934+ sm_member_count = 0;
8935+
8936+ init_services();
8937+ init_messages();
8938+ init_barriers();
8939+ init_serviced();
8940+ init_recovery();
8941+ init_joinleave();
8942+ init_sm_misc();
8943+}
8944+
8945+/*
8946+ * Context: cnxman
8947+ * Called at beginning of cluster join procedure.
8948+ */
8949+
8950+void sm_start(void)
8951+{
8952+ struct sockaddr_cl saddr;
8953+ struct socket *sock;
8954+ int result;
8955+
8956+ /* Create a communication channel among service managers */
8957+
8958+ result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
8959+ if (result < 0) {
8960+ log_print("can't create socket %d", result);
8961+ goto fail;
8962+ }
8963+
8964+ sm_socket = sock;
8965+
8966+ saddr.scl_family = AF_CLUSTER;
8967+ saddr.scl_port = CLUSTER_PORT_SERVICES;
8968+
8969+ result = sock->ops->bind(sock, (struct sockaddr *) &saddr,
8970+ sizeof(saddr));
8971+ if (result < 0) {
8972+ log_print("can't bind socket %d", result);
8973+ goto fail_release;
8974+ }
8975+
8976+ result = kcl_register_read_callback(sm_socket, sm_cluster_message);
8977+ if (result < 0) {
8978+ log_print("can't register read callback %d", result);
8979+ goto fail_release;
8980+ }
8981+
8982+ sm_new_nodeids = (uint32_t *) kmalloc(cman_config.max_nodes *
8983+ sizeof(uint32_t),
8984+ GFP_KERNEL);
8985+ start_serviced();
8986+
8987+ /* cnxman should call sm_member_update() once we've joined - then we
8988+ * can get our first list of members and our own nodeid */
8989+
8990+ return;
8991+
8992+ fail_release:
8993+ sock_release(sm_socket);
8994+ sm_socket = NULL;
8995+
8996+ fail:
8997+ return;
8998+}
8999+
9000+/*
9001+ * Context: cnxman
9002+ * Called before cnxman leaves the cluster. If this returns an error to cman,
9003+ * cman should not leave the cluster but return EBUSY.
9004+ * If force is set we go away anyway. cman knows best in this case
9005+ */
9006+
9007+int sm_stop(int force)
9008+{
9009+ struct list_head *head;
9010+ sm_group_t *sg;
9011+ sm_node_t *node;
9012+ int i, busy = FALSE, error = -EBUSY;
9013+
9014+ for (i = 0; i < SG_LEVELS; i++) {
9015+ if (!list_empty(&sm_sg[i])) {
9016+ sg = list_entry(sm_sg[i].next, sm_group_t, list);
9017+ log_error(sg, "sm_stop: SG still joined");
9018+ busy = TRUE;
9019+ }
9020+ }
9021+
9022+ if (!busy || force) {
9023+ stop_serviced();
9024+
9025+ if (sm_socket)
9026+ sock_release(sm_socket);
9027+
9028+ head = &sm_members;
9029+ while (!list_empty(head)) {
9030+ node = list_entry(head->next, sm_node_t, list);
9031+ list_del(&node->list);
9032+ sm_member_count--;
9033+ kfree(node);
9034+ }
9035+
9036+ kfree(sm_new_nodeids);
9037+ sm_init();
9038+ error = 0;
9039+ }
9040+ return error;
9041+}
9042diff -urN linux-orig/cluster/cman/sm_control.h linux-patched/cluster/cman/sm_control.h
9043--- linux-orig/cluster/cman/sm_control.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9044+++ linux-patched/cluster/cman/sm_control.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 9045@@ -0,0 +1,22 @@
9046+/******************************************************************************
9047+*******************************************************************************
9048+**
9049+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9050+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9051+**
9052+** This copyrighted material is made available to anyone wishing to use,
9053+** modify, copy, or redistribute it subject to the terms and conditions
9054+** of the GNU General Public License v.2.
9055+**
9056+*******************************************************************************
9057+******************************************************************************/
9058+
9059+#ifndef __SM_CONTROL_DOT_H__
9060+#define __SM_CONTROL_DOT_H__
9061+
9062+void sm_init(void);
9063+void sm_start(void);
9064+int sm_stop(int force);
9065+void sm_member_update(int quorate);
9066+
9067+#endif
9068diff -urN linux-orig/cluster/cman/sm_daemon.c linux-patched/cluster/cman/sm_daemon.c
9069--- linux-orig/cluster/cman/sm_daemon.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9070+++ linux-patched/cluster/cman/sm_daemon.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 9071@@ -0,0 +1,120 @@
9072+/******************************************************************************
9073+*******************************************************************************
9074+**
9075+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9076+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9077+**
9078+** This copyrighted material is made available to anyone wishing to use,
9079+** modify, copy, or redistribute it subject to the terms and conditions
9080+** of the GNU General Public License v.2.
9081+**
9082+*******************************************************************************
9083+******************************************************************************/
9084+
9085+#include "sm.h"
9086+
9087+static unsigned long daemon_flags;
9088+static struct task_struct * daemon_task;
9089+static struct completion daemon_done;
9090+static wait_queue_head_t daemon_wait;
9091+extern int sm_quorum;
9092+
9093+void init_serviced(void)
9094+{
9095+ daemon_flags = 0;
9096+ daemon_task = NULL;
9097+ init_completion(&daemon_done);
9098+ init_waitqueue_head(&daemon_wait);
9099+}
9100+
9101+void wake_serviced(int do_flag)
9102+{
9103+ set_bit(do_flag, &daemon_flags);
9104+ wake_up(&daemon_wait);
9105+}
9106+
9107+static inline int got_work(void)
9108+{
9109+ int rv = 0;
9110+
9111+ rv = (test_bit(DO_START_RECOVERY, &daemon_flags) ||
9112+ test_bit(DO_MESSAGES, &daemon_flags) ||
9113+ test_bit(DO_BARRIERS, &daemon_flags) ||
9114+ test_bit(DO_CALLBACKS, &daemon_flags));
9115+
9116+ if (sm_quorum && !rv)
9117+ rv = (test_bit(DO_JOINLEAVE, &daemon_flags) ||
9118+ test_bit(DO_RECOVERIES, &daemon_flags) ||
9119+ test_bit(DO_MEMBERSHIP, &daemon_flags));
9120+ return rv;
9121+}
9122+
9123+static int serviced(void *arg)
9124+{
9125+ DECLARE_WAITQUEUE(wait, current);
9126+
9127+ daemonize("cman_serviced");
9128+ daemon_task = current;
9129+ set_bit(DO_RUN, &daemon_flags);
9130+ complete(&daemon_done);
9131+
9132+ for (;;) {
9133+ if (test_and_clear_bit(DO_START_RECOVERY, &daemon_flags))
9134+ process_nodechange();
9135+
9136+ if (test_and_clear_bit(DO_MESSAGES, &daemon_flags))
9137+ process_messages();
9138+
9139+ if (test_and_clear_bit(DO_BARRIERS, &daemon_flags))
9140+ process_barriers();
9141+
9142+ if (test_and_clear_bit(DO_CALLBACKS, &daemon_flags))
9143+ process_callbacks();
9144+
9145+ if (sm_quorum) {
9146+ if (test_and_clear_bit(DO_RECOVERIES, &daemon_flags))
9147+ process_recoveries();
9148+
9149+ if (test_and_clear_bit(DO_JOINLEAVE, &daemon_flags))
9150+ process_joinleave();
9151+
9152+ if (test_and_clear_bit(DO_MEMBERSHIP, &daemon_flags))
9153+ process_membership();
9154+ }
9155+
9156+ if (!test_bit(DO_RUN, &daemon_flags))
9157+ break;
9158+
9159+ current->state = TASK_INTERRUPTIBLE;
9160+ add_wait_queue(&daemon_wait, &wait);
9161+ if (!got_work() && test_bit(DO_RUN, &daemon_flags))
9162+ schedule();
9163+ remove_wait_queue(&daemon_wait, &wait);
9164+ current->state = TASK_RUNNING;
9165+ }
9166+
9167+ complete(&daemon_done);
9168+ return 0;
9169+}
9170+
9171+int start_serviced(void)
9172+{
9173+ int error;
9174+
9175+ error = kernel_thread(serviced, NULL, 0);
9176+ if (error < 0)
9177+ goto out;
9178+
9179+ error = 0;
9180+ wait_for_completion(&daemon_done);
9181+
9182+ out:
9183+ return error;
9184+}
9185+
9186+void stop_serviced(void)
9187+{
9188+ clear_bit(DO_RUN, &daemon_flags);
9189+ wake_up(&daemon_wait);
9190+ wait_for_completion(&daemon_done);
9191+}
9192diff -urN linux-orig/cluster/cman/sm_daemon.h linux-patched/cluster/cman/sm_daemon.h
9193--- linux-orig/cluster/cman/sm_daemon.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9194+++ linux-patched/cluster/cman/sm_daemon.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 9195@@ -0,0 +1,32 @@
9196+/******************************************************************************
9197+*******************************************************************************
9198+**
9199+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9200+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9201+**
9202+** This copyrighted material is made available to anyone wishing to use,
9203+** modify, copy, or redistribute it subject to the terms and conditions
9204+** of the GNU General Public License v.2.
9205+**
9206+*******************************************************************************
9207+******************************************************************************/
9208+
9209+#ifndef __SM_DAEMON_DOT_H__
9210+#define __SM_DAEMON_DOT_H__
9211+
9212+#define DO_RUN (0)
9213+#define DO_START_RECOVERY (1)
9214+#define DO_MESSAGES (2)
9215+#define DO_BARRIERS (3)
9216+#define DO_CALLBACKS (4)
9217+#define DO_JOINLEAVE (5)
9218+#define DO_RECOVERIES (6)
9219+#define DO_MEMBERSHIP (7)
9220+#define DO_RESET (8)
9221+
9222+void init_serviced(void);
9223+void wake_serviced(int do_flag);
9224+void stop_serviced(void);
9225+int start_serviced(void);
9226+
9227+#endif
9228diff -urN linux-orig/cluster/cman/sm_internal.h linux-patched/cluster/cman/sm_internal.h
9229--- linux-orig/cluster/cman/sm_internal.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9230+++ linux-patched/cluster/cman/sm_internal.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 9231@@ -0,0 +1,230 @@
9232+/******************************************************************************
9233+*******************************************************************************
9234+**
9235+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9236+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9237+**
9238+** This copyrighted material is made available to anyone wishing to use,
9239+** modify, copy, or redistribute it subject to the terms and conditions
9240+** of the GNU General Public License v.2.
9241+**
9242+*******************************************************************************
9243+******************************************************************************/
9244+
9245+#ifndef __SM_INTERNAL_DOT_H__
9246+#define __SM_INTERNAL_DOT_H__
9247+
9248+/*
9249+ * Any header files needed by this file should be included before it in sm.h.
9250+ * This file should only be included by sm.h.
9251+ */
9252+
9253+struct sm_group;
9254+struct sm_sevent;
9255+struct sm_uevent;
9256+struct sm_node;
9257+struct sm_msg;
9258+
9259+typedef struct sm_group sm_group_t;
9260+typedef struct sm_sevent sm_sevent_t;
9261+typedef struct sm_uevent sm_uevent_t;
9262+typedef struct sm_node sm_node_t;
9263+typedef struct sm_msg sm_msg_t;
9264+
9265+
9266+/*
9267+ * Number of seconds to wait before trying again to join or leave an SG
9268+ */
9269+#define RETRY_DELAY (2)
9270+
9271+
9272+/*
9273+ * Service Event - what a node uses to join or leave an sg
9274+ */
9275+
9276+/* SE Flags */
9277+#define SEFL_CHECK (0)
9278+#define SEFL_ALLOW_JOIN (1)
9279+#define SEFL_ALLOW_JSTOP (2)
9280+#define SEFL_ALLOW_LEAVE (3)
9281+#define SEFL_ALLOW_LSTOP (4)
9282+#define SEFL_ALLOW_STARTDONE (5)
9283+#define SEFL_ALLOW_BARRIER (6)
9284+#define SEFL_DELAY (7)
9285+#define SEFL_LEAVE (8)
9286+#define SEFL_CANCEL (9)
9287+
9288+/* SE States */
9289+#define SEST_JOIN_BEGIN (1)
9290+#define SEST_JOIN_ACKWAIT (2)
9291+#define SEST_JOIN_ACKED (3)
9292+#define SEST_JSTOP_ACKWAIT (4)
9293+#define SEST_JSTOP_ACKED (5)
9294+#define SEST_JSTART_SERVICEWAIT (6)
9295+#define SEST_JSTART_SERVICEDONE (7)
9296+#define SEST_BARRIER_WAIT (8)
9297+#define SEST_BARRIER_DONE (9)
9298+#define SEST_LEAVE_BEGIN (10)
9299+#define SEST_LEAVE_ACKWAIT (11)
9300+#define SEST_LEAVE_ACKED (12)
9301+#define SEST_LSTOP_ACKWAIT (13)
9302+#define SEST_LSTOP_ACKED (14)
9303+#define SEST_LSTART_WAITREMOTE (15)
9304+#define SEST_LSTART_REMOTEDONE (16)
9305+
9306+struct sm_sevent {
9307+ struct list_head se_list;
9308+ unsigned int se_id;
9309+ sm_group_t * se_sg;
9310+ unsigned long se_flags;
9311+ unsigned int se_state;
9312+
9313+ int se_node_count;
9314+ int se_memb_count;
9315+ int se_reply_count;
9316+
9317+ uint32_t * se_node_ids;
9318+ char * se_node_status;
9319+ int se_len_ids; /* length of node_ids */
9320+ int se_len_status; /* length of node_status */
9321+
9322+ int se_barrier_status;
9323+ struct timer_list se_restart_timer;
9324+};
9325+
9326+/*
9327+ * Update Event - what an sg member uses to respond to an sevent
9328+ */
9329+
9330+/* UE Flags */
9331+#define UEFL_ALLOW_STARTDONE (0)
9332+#define UEFL_ALLOW_BARRIER (1)
9333+#define UEFL_CANCEL (2)
9334+#define UEFL_LEAVE (3)
9335+#define UEFL_CHECK (4)
9336+
9337+/* UE States */
9338+#define UEST_JSTOP (1)
9339+#define UEST_JSTART_WAITCMD (2)
9340+#define UEST_JSTART (3)
9341+#define UEST_JSTART_SERVICEWAIT (4)
9342+#define UEST_JSTART_SERVICEDONE (5)
9343+#define UEST_BARRIER_WAIT (6)
9344+#define UEST_BARRIER_DONE (7)
9345+#define UEST_LSTOP (8)
9346+#define UEST_LSTART_WAITCMD (9)
9347+#define UEST_LSTART (10)
9348+#define UEST_LSTART_SERVICEWAIT (11)
9349+#define UEST_LSTART_SERVICEDONE (12)
9350+
9351+struct sm_uevent {
9352+ unsigned int ue_state;
9353+ unsigned long ue_flags;
9354+ uint32_t ue_id;
9355+ uint32_t ue_nodeid;
9356+ int ue_num_nodes;
9357+ int ue_barrier_status;
9358+ uint16_t ue_remote_seid;
9359+};
9360+
9361+/*
9362+ * Service Group
9363+ */
9364+
9365+#define RECOVER_NONE (0)
9366+#define RECOVER_STOP (1)
9367+#define RECOVER_START (2)
9368+#define RECOVER_STARTDONE (3)
9369+#define RECOVER_BARRIERWAIT (4)
9370+#define RECOVER_BARRIERDONE (5)
9371+
9372+/* SG Flags */
9373+#define SGFL_SEVENT (1)
9374+#define SGFL_UEVENT (2)
9375+#define SGFL_NEED_RECOVERY (3)
9376+
9377+/* SG States */
9378+#define SGST_NONE (0)
9379+#define SGST_JOIN (1)
9380+#define SGST_RUN (2)
9381+#define SGST_RECOVER (3)
9382+#define SGST_UEVENT (4)
9383+
9384+struct sm_group {
9385+ struct list_head list; /* list of sg's */
9386+ uint16_t level;
9387+ uint32_t local_id;
9388+ uint32_t global_id;
9389+ unsigned long flags;
9390+ int state;
9391+ int refcount; /* references from reg/unreg */
9392+ void * service_data; /* data from the service */
9393+ struct kcl_service_ops *ops; /* ops from the service */
9394+ struct completion event_comp;
9395+
9396+ struct list_head memb; /* Membership List for RC */
9397+ int memb_count; /* number of nodes in memb */
9398+ struct list_head joining; /* nodes joining the sg */
9399+ sm_sevent_t * sevent;
9400+ sm_uevent_t uevent;
9401+
9402+ int recover_state;
9403+ int recover_stop;
9404+ struct list_head recover_list; /* recovery event list */
9405+ void * recover_data;
9406+ char recover_barrier[MAX_BARRIER_NAME_LEN];
9407+
9408+ int namelen;
9409+ char name[1]; /* must be last field */
9410+};
9411+
9412+/*
9413+ * Service Message
9414+ */
9415+
9416+/* SMSG Type */
9417+#define SMSG_JOIN_REQ (1)
9418+#define SMSG_JOIN_REP (2)
9419+#define SMSG_JSTOP_REQ (3)
9420+#define SMSG_JSTOP_REP (4)
9421+#define SMSG_JSTART_CMD (5)
9422+#define SMSG_LEAVE_REQ (6)
9423+#define SMSG_LEAVE_REP (7)
9424+#define SMSG_LSTOP_REQ (8)
9425+#define SMSG_LSTOP_REP (9)
9426+#define SMSG_LSTART_CMD (10)
9427+#define SMSG_LSTART_DONE (11)
9428+#define SMSG_RECOVER (12)
9429+
9430+/* SMSG Status */
9431+#define STATUS_POS (1)
9432+#define STATUS_NEG (2)
9433+#define STATUS_WAIT (3)
9434+
9435+struct sm_msg {
9436+ uint8_t ms_type;
9437+ uint8_t ms_status;
9438+ uint16_t ms_sevent_id;
9439+ uint32_t ms_global_sgid;
9440+ uint32_t ms_global_lastid;
9441+ uint16_t ms_sglevel;
9442+ uint16_t ms_length;
9443+ /* buf of ms_length bytes follows */
9444+};
9445+
9446+/*
9447+ * Node structure
9448+ */
9449+
9450+#define SNFL_NEED_RECOVERY (0)
9451+#define SNFL_CLUSTER_MEMBER (1)
9452+#define SNFL_LEAVING (2)
9453+
9454+struct sm_node {
9455+ struct list_head list;
9456+ uint32_t id; /* node id from cnxman */
9457+ unsigned long flags;
9458+ int incarnation; /* node incarnation number */
9459+};
9460+
9461+#endif /* __SM_INTERNAL_DOT_H__ */
9462diff -urN linux-orig/cluster/cman/sm_joinleave.c linux-patched/cluster/cman/sm_joinleave.c
9463--- linux-orig/cluster/cman/sm_joinleave.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9464+++ linux-patched/cluster/cman/sm_joinleave.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 9465@@ -0,0 +1,1286 @@
9466+/******************************************************************************
9467+*******************************************************************************
9468+**
9469+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9470+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9471+**
9472+** This copyrighted material is made available to anyone wishing to use,
9473+** modify, copy, or redistribute it subject to the terms and conditions
9474+** of the GNU General Public License v.2.
9475+**
9476+*******************************************************************************
9477+******************************************************************************/
9478+
9479+#include "sm.h"
9480+
9481+/*
9482+ * Routines used by nodes that are joining or leaving a SG. These "sevent"
9483+ * routines initiate membership changes to a SG. Existing SG members respond
9484+ * using the "uevent" membership update routines.
9485+ */
9486+
9487+extern uint32_t sm_our_nodeid;
9488+extern struct list_head sm_members;
9489+static struct list_head new_event;
9490+static spinlock_t new_event_lock;
9491+static struct list_head joinleave_events;
9492+
9493+void init_joinleave(void)
9494+{
9495+ INIT_LIST_HEAD(&new_event);
9496+ spin_lock_init(&new_event_lock);
9497+ INIT_LIST_HEAD(&joinleave_events);
9498+}
9499+
9500+void new_joinleave(sm_sevent_t *sev)
9501+{
9502+ spin_lock(&new_event_lock);
9503+ list_add_tail(&sev->se_list, &new_event);
9504+ spin_unlock(&new_event_lock);
9505+ wake_serviced(DO_JOINLEAVE);
9506+}
9507+
9508+sm_sevent_t *find_sevent(unsigned int id)
9509+{
9510+ sm_sevent_t *sev;
9511+
9512+ list_for_each_entry(sev, &joinleave_events, se_list) {
9513+ if (sev->se_id == id)
9514+ return sev;
9515+ }
9516+ return NULL;
9517+}
9518+
9519+static void release_sevent(sm_sevent_t *sev)
9520+{
9521+ if (sev->se_len_ids) {
9522+ kfree(sev->se_node_ids);
9523+ sev->se_node_ids = NULL;
9524+ }
9525+
9526+ if (sev->se_len_status) {
9527+ kfree(sev->se_node_status);
9528+ sev->se_node_status = NULL;
9529+ }
9530+
9531+ sev->se_node_count = 0;
9532+ sev->se_memb_count = 0;
9533+ sev->se_reply_count = 0;
9534+}
9535+
9536+static int init_sevent(sm_sevent_t *sev)
9537+{
9538+ sm_node_t *node;
9539+ int len1, len2, count, cluster_members = 0;
9540+
9541+ /* clear state from any previous attempt */
9542+ release_sevent(sev);
9543+
9544+ list_for_each_entry(node, &sm_members, list) {
9545+ if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
9546+ cluster_members++;
9547+ }
9548+
9549+ sev->se_node_count = cluster_members;
9550+ sev->se_memb_count = sev->se_sg->memb_count;
9551+
9552+ /*
9553+ * When joining, we need a node array the size of the entire cluster
9554+ * member list because we get responses from all nodes. When leaving,
9555+ * we only get responses from SG members, so the node array need only
9556+ * be that large.
9557+ */
9558+
9559+ if (sev->se_state < SEST_LEAVE_BEGIN)
9560+ count = sev->se_node_count;
9561+ else
9562+ count = sev->se_memb_count;
9563+
9564+ len1 = count * sizeof(uint32_t);
9565+ sev->se_len_ids = len1;
9566+
9567+ sev->se_node_ids = (uint32_t *) kmalloc(len1, GFP_KERNEL);
9568+ if (!sev->se_node_ids)
9569+ goto fail;
9570+
9571+ len2 = count * sizeof (char);
9572+ sev->se_len_status = len2;
9573+
9574+ sev->se_node_status = (char *) kmalloc(len2, GFP_KERNEL);
9575+ if (!sev->se_node_status)
9576+ goto fail_free;
9577+
9578+ memset(sev->se_node_status, 0, len2);
9579+ memset(sev->se_node_ids, 0, len1);
9580+
9581+ return 0;
9582+
9583+ fail_free:
9584+ kfree(sev->se_node_ids);
9585+ sev->se_node_ids = NULL;
9586+ sev->se_len_ids = 0;
9587+
9588+ fail:
9589+ return -ENOMEM;
9590+}
9591+
9592+/* Context: timer */
9593+
9594+static void sev_restart(unsigned long data)
9595+{
9596+ sm_sevent_t *sev = (sm_sevent_t *) data;
9597+
9598+ clear_bit(SEFL_DELAY, &sev->se_flags);
9599+ set_bit(SEFL_CHECK, &sev->se_flags);
9600+ wake_serviced(DO_JOINLEAVE);
9601+}
9602+
9603+static void schedule_sev_restart(sm_sevent_t *sev)
9604+{
9605+ init_timer(&sev->se_restart_timer);
9606+ sev->se_restart_timer.function = sev_restart;
9607+ sev->se_restart_timer.data = (long) sev;
9608+ mod_timer(&sev->se_restart_timer, jiffies + (RETRY_DELAY * HZ));
9609+}
9610+
9611+void free_sg_memb(sm_group_t *sg)
9612+{
9613+ sm_node_t *node;
9614+
9615+ while (!list_empty(&sg->memb)) {
9616+ node = list_entry(sg->memb.next, sm_node_t, list);
9617+ list_del(&node->list);
9618+ kfree(node);
9619+ }
9620+ sg->memb_count = 0;
9621+}
9622+
9623+/*
9624+ * 1. First step in joining a SG - send a message to all nodes in the cluster
9625+ * asking to join the named SG. If any nodes are members they will reply with
9626+ * a POS, or a WAIT (wait means try again, only one node can join at a time).
9627+ * If no one knows about this SG, they all send NEG replies which means we form
9628+ * the SG with just ourself as a member.
9629+ */
9630+
9631+static int send_join_notice(sm_sevent_t *sev)
9632+{
9633+ sm_group_t *sg = sev->se_sg;
9634+ sm_node_t *node;
9635+ char *msg;
9636+ int i = 0, error, namelen, len = 0;
9637+
9638+ /*
9639+ * Create node array from member list in which to collect responses.
9640+ */
9641+
9642+ error = init_sevent(sev);
9643+ if (error)
9644+ goto out;
9645+
9646+ list_for_each_entry(node, &sm_members, list) {
9647+ if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
9648+ sev->se_node_ids[i++] = node->id;
9649+ }
9650+
9651+ /*
9652+ * Create and send a join request message.
9653+ *
9654+ * Other nodes then run process_join_request and reply to us; we
9655+ * collect the responses in process_reply and check them in
9656+ * check_join_notice.
9657+ */
9658+
9659+ namelen = sg->namelen;
9660+ msg = create_smsg(sg, SMSG_JOIN_REQ, namelen, &len, sev);
9661+ memcpy(msg + sizeof(sm_msg_t), sg->name, namelen);
9662+
9663+ error = send_broadcast_message_sev(msg, len, sev);
9664+
9665+ out:
9666+ return error;
9667+}
9668+
9669+/*
9670+ * 2. Second step in joining a SG - after we collect all replies to our join
9671+ * request, we look at them. If anyone told us to wait, we'll wait a while, go
9672+ * back and start at step 1 again.
9673+ */
9674+
9675+static int check_join_notice(sm_sevent_t *sev)
9676+{
9677+ int pos = 0, wait = 0, neg = 0, restart = 0, i, error = 0;
9678+
9679+ for (i = 0; i < sev->se_node_count; i++) {
9680+ switch (sev->se_node_status[i]) {
9681+ case STATUS_POS:
9682+ /* this node is in the SG and will be in new proposed
9683+ * memb list */
9684+ pos++;
9685+ break;
9686+
9687+ case STATUS_WAIT:
9688+ /* this node is in the SG but something else is
9689+ * happening with it at the moment. */
9690+ wait++;
9691+ break;
9692+
9693+ case STATUS_NEG:
9694+ /* this node has no record of the SG we're interested
9695+ * in */
9696+ neg++;
9697+
9698+ if (sev->se_node_ids[i] == sm_our_nodeid)
9699+ sev->se_node_status[i] = STATUS_POS;
9700+ break;
9701+
9702+ default:
9703+ /* we didn't get a valid response from this node,
9704+ * restart the entire sev. */
9705+ restart++;
9706+ break;
9707+ }
9708+ }
9709+
9710+ if (pos && !wait && !restart) {
9711+ /* all current members of this sg pos'ed our entry */
9712+ } else if (!pos && !wait && !restart && neg) {
9713+ /* we're the first in the cluster to join this sg */
9714+ sev->se_sg->global_id = sm_new_global_id(sev->se_sg->level);
9715+ } else
9716+ error = -1;
9717+
9718+ return error;
9719+}
9720+
9721+/*
9722+ * 3. Third step in joining the SG - tell the nodes that are already members
9723+ * to "stop" the service. We stop them so that everyone can restart with the
9724+ * new member (us!) added.
9725+ */
9726+
9727+static int send_join_stop(sm_sevent_t *sev)
9728+{
9729+ sm_group_t *sg = sev->se_sg;
9730+ sm_node_t *node;
9731+ char *msg;
9732+ uint32_t be_count;
9733+ int i, len = 0, error = 0;
9734+
9735+ /*
9736+ * Form the SG memb list with us in it.
9737+ */
9738+
9739+ for (i = 0; i < sev->se_node_count; i++) {
9740+ if (sev->se_node_status[i] != STATUS_POS)
9741+ continue;
9742+
9743+ node = sm_new_node(sev->se_node_ids[i]);
9744+ if (!node)
9745+ goto fail;
9746+
9747+ list_add_tail(&node->list, &sg->memb);
9748+ sg->memb_count++;
9749+ }
9750+
9751+ /*
9752+ * Re-init the node vector in which to collect responses again.
9753+ */
9754+
9755+ sev->se_memb_count = sg->memb_count;
9756+
9757+ memset(sev->se_node_status, 0, sev->se_len_status);
9758+ memset(sev->se_node_ids, 0, sev->se_len_ids);
9759+ i = 0;
9760+
9761+ list_for_each_entry(node, &sg->memb, list)
9762+ sev->se_node_ids[i++] = node->id;
9763+
9764+ /*
9765+ * Create and send a stop message.
9766+ *
9767+ * Other nodes then run process_stop_request and process_join_stop and
9768+ * reply to us. They stop the sg we're trying to join if they agree.
9769+ * We collect responses in process_reply and check them in
9770+ * check_join_stop.
9771+ */
9772+
9773+ msg = create_smsg(sg, SMSG_JSTOP_REQ, sizeof(uint32_t), &len, sev);
9774+ be_count = cpu_to_be32(sg->memb_count);
9775+ memcpy(msg + sizeof(sm_msg_t), &be_count, sizeof(uint32_t));
9776+
9777+ error = send_members_message_sev(sg, msg, len, sev);
9778+ if (error < 0)
9779+ goto fail;
9780+
9781+ return 0;
9782+
9783+ fail:
9784+ free_sg_memb(sg);
9785+ return error;
9786+}
9787+
9788+/*
9789+ * 4. Fourth step in joining the SG - after we collect replies to our stop
9790+ * request, we look at them. Everyone sending POS agrees with us joining and
9791+ * has stopped their SG. If some nodes sent NEG, something is wrong and we
9792+ * don't have a good way to address that yet since some nodes may have sent
9793+ * POS.
9794+ *
9795+ * FIXME: even nodes replying with NEG should stop their SG so we can send an
9796+ * abort and have everyone at the same place to start from again.
9797+ */
9798+
9799+static int check_join_stop(sm_sevent_t *sev)
9800+{
9801+ sm_group_t *sg = sev->se_sg;
9802+ int i, pos = 0, neg = 0;
9803+
9804+ for (i = 0; i < sev->se_memb_count; i++) {
9805+ switch (sev->se_node_status[i]) {
9806+ case STATUS_POS:
9807+ pos++;
9808+ break;
9809+
9810+ case STATUS_NEG:
9811+ log_error(sg, "check_join_stop: neg from nodeid %u "
9812+ "(%d, %d, %u)", sev->se_node_ids[i],
9813+ pos, neg, sev->se_memb_count);
9814+ neg++;
9815+ break;
9816+
9817+ default:
9818+ log_error(sg, "check_join_stop: unknown status=%u "
9819+ "nodeid=%u", sev->se_node_status[i],
9820+ sev->se_node_ids[i]);
9821+ neg++;
9822+ break;
9823+ }
9824+ }
9825+
9826+ if (pos == sg->memb_count)
9827+ return 0;
9828+
9829+ free_sg_memb(sg);
9830+ return -1;
9831+}
9832+
9833+/*
9834+ * 5. Fifth step in joining the SG - everyone has stopped their service and we
9835+ * all now start the service with us, the new member, added to the SG member
9836+ * list. We send start to our own service here and send a message to the other
9837+ * members that they should also start their service.
9838+ */
9839+
9840+static int send_join_start(sm_sevent_t *sev)
9841+{
9842+ sm_group_t *sg = sev->se_sg;
9843+ sm_node_t *node;
9844+ uint32_t *memb;
9845+ char *msg;
9846+ int error, count = 0, len = 0;
9847+
9848+ /*
9849+ * Create a start message and send it.
9850+ */
9851+
9852+ msg = create_smsg(sg, SMSG_JSTART_CMD, 0, &len, sev);
9853+
9854+ error = send_members_message(sg, msg, len);
9855+ if (error < 0)
9856+ goto fail;
9857+
9858+ /*
9859+ * Start the service ourself. The chunk of memory with the member ids
9860+ * must be freed by the service when it is done with it.
9861+ */
9862+
9863+ SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
9864+ memb);
9865+
9866+ list_for_each_entry(node, &sg->memb, list)
9867+ memb[count++] = node->id;
9868+
9869+ set_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
9870+
9871+ sg->ops->start(sg->service_data, memb, count, sev->se_id,
9872+ SERVICE_NODE_JOIN);
9873+ return 0;
9874+
9875+ fail:
9876+ free_sg_memb(sg);
9877+ return error;
9878+}
9879+
9880+/*
9881+ * 6. Sixth step in joining the SG - once the service has completed its start,
9882+ * it does a kcl_start_done() to signal us that it's done. That gets us here
9883+ * and we do a barrier with all other members which join the barrier when their
9884+ * service is done starting.
9885+ */
9886+
9887+static int startdone_barrier_new(sm_sevent_t *sev)
9888+{
9889+ sm_group_t *sg = sev->se_sg;
9890+ char bname[MAX_BARRIER_NAME_LEN];
9891+ int error;
9892+
9893+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
9894+ sev->se_barrier_status = -1;
9895+
9896+ set_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
9897+
9898+ /* If we're the only member, skip the barrier */
9899+ if (sg->memb_count == 1) {
9900+ process_startdone_barrier_new(sg, 0);
9901+ return 0;
9902+ }
9903+
9904+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
9905+ sg->global_id, sm_our_nodeid, sev->se_id, sg->memb_count);
9906+
9907+ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE_NEW);
9908+ if (error)
9909+ goto fail;
9910+
9911+ return 0;
9912+
9913+ fail:
9914+ clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
9915+ sg->ops->stop(sg->service_data);
9916+ free_sg_memb(sg);
9917+ return error;
9918+}
9919+
9920+/*
9921+ * 7. Seventh step in joining the SG - check that the barrier we joined with
9922+ * all other members returned with a successful status.
9923+ */
9924+
9925+static int check_startdone_barrier_new(sm_sevent_t *sev)
9926+{
9927+ sm_group_t *sg = sev->se_sg;
9928+ int error = sev->se_barrier_status;
9929+
9930+ if (error) {
9931+ sg->ops->stop(sg->service_data);
9932+ free_sg_memb(sg);
9933+ }
9934+ return error;
9935+}
9936+
9937+/*
9938+ * 8. Eigth step in joining the SG - send the service a "finish" indicating
9939+ * that all members have successfully started the service.
9940+ */
9941+
9942+static void do_finish_new(sm_sevent_t *sev)
9943+{
9944+ sm_group_t *sg = sev->se_sg;
9945+
9946+ sg->state = SGST_RUN;
9947+ sg->sevent = NULL;
9948+ clear_bit(SGFL_SEVENT, &sg->flags);
9949+
9950+ sg->ops->finish(sg->service_data, sev->se_id);
9951+}
9952+
9953+/*
9954+ * 9. Ninth step in joining the SG - it's done so get rid of the sevent stuff
9955+ * and tell the process which initiated the join that it's done.
9956+ */
9957+
9958+static void sevent_done(sm_sevent_t *sev)
9959+{
9960+ sm_group_t *sg = sev->se_sg;
9961+
9962+ list_del(&sev->se_list);
9963+ release_sevent(sev);
9964+ kfree(sev);
9965+ complete(&sg->event_comp);
9966+}
9967+
9968+/*
9969+ * Move through the steps of a join. Summary:
9970+ *
9971+ * 1. Send a join notice to all cluster members.
9972+ * 2. Collect and check replies to the join notice.
9973+ * 3. Send a stop message to all SG members.
9974+ * 4. Collect and check replies to the stop message.
9975+ * 5. Send a start message to all SG members and start service ourself.
9976+ * 6. Use barrier to wait for all nodes to complete the start.
9977+ * 7. Check that all SG members joined the barrier.
9978+ * 8. Send finish to the service indicating that all nodes started it.
9979+ * 9. Clean up sevent and signal completion to the process that started the join
9980+ */
9981+
9982+static void process_join_sevent(sm_sevent_t *sev)
9983+{
9984+ int error = 0;
9985+
9986+ /*
9987+ * We may cancel the current join attempt if another node is also
9988+ * attempting to join or leave. (Only a single node can join or leave
9989+ * at once.) If cancelled, 0ur join attempt will be restarted later.
9990+ */
9991+
9992+ if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
9993+ error = -1;
9994+ goto cancel;
9995+ }
9996+
9997+ log_debug(sev->se_sg, "sevent state %u", sev->se_state);
9998+
9999+ switch (sev->se_state) {
10000+
10001+ /*
10002+ * An sevent is created in kcl_join_service with a state of
10003+ * JOIN_BEGIN.
10004+ */
10005+
10006+ case SEST_JOIN_BEGIN:
10007+ sev->se_state = SEST_JOIN_ACKWAIT;
10008+ error = send_join_notice(sev);
10009+ break;
10010+
10011+ /*
10012+ * se_state is changed from JOIN_ACKWAIT to JOIN_ACKED in
10013+ * process_reply (when all the replies have been received)
10014+ */
10015+
10016+ case SEST_JOIN_ACKED:
10017+ error = check_join_notice(sev);
10018+ if (error)
10019+ break;
10020+
10021+ sev->se_state = SEST_JSTOP_ACKWAIT;
10022+ error = send_join_stop(sev);
10023+ break;
10024+
10025+ /*
10026+ * se_state is changed from JSTOP_ACKWAIT to JSTOP_ACKED in
10027+ * proces_reply (when all the replies have been received)
10028+ */
10029+
10030+ case SEST_JSTOP_ACKED:
10031+ error = check_join_stop(sev);
10032+ if (error)
10033+ break;
10034+
10035+ sev->se_state = SEST_JSTART_SERVICEWAIT;
10036+ error = send_join_start(sev);
10037+ break;
10038+
10039+ /*
10040+ * se_state is changed from JSTART_SERVICEWAIT to
10041+ * JSTART_SERVICEDONE in kcl_start_done
10042+ */
10043+
10044+ case SEST_JSTART_SERVICEDONE:
10045+ sev->se_state = SEST_BARRIER_WAIT;
10046+ error = startdone_barrier_new(sev);
10047+ break;
10048+
10049+ /*
10050+ * se_state is changed from BARRIER_WAIT to BARRIER_DONE in
10051+ * process_startdone_barrier_new
10052+ */
10053+
10054+ case SEST_BARRIER_DONE:
10055+ error = check_startdone_barrier_new(sev);
10056+ if (error)
10057+ break;
10058+
10059+ do_finish_new(sev);
10060+ sevent_done(sev);
10061+ break;
10062+
10063+ default:
10064+ log_error(sev->se_sg, "no join processing for state %u",
10065+ sev->se_state);
10066+ }
10067+
10068+ cancel:
10069+ if (error) {
10070+ /* restart the sevent from the beginning */
10071+ sev->se_state = SEST_JOIN_BEGIN;
10072+ sev->se_sg->global_id = 0;
10073+ set_bit(SEFL_DELAY, &sev->se_flags);
10074+ schedule_sev_restart(sev);
10075+ }
10076+}
10077+
10078+/*
10079+ * 1. First step in leaving an SG - send a message to other SG members asking
10080+ * to leave the SG. Nodes that don't have another active sevent or uevent for
10081+ * this SG will return POS.
10082+ */
10083+
10084+static int send_leave_notice(sm_sevent_t *sev)
10085+{
10086+ sm_group_t *sg = sev->se_sg;
10087+ sm_node_t *node;
10088+ char *msg;
10089+ int i = 0, error = -1, len = 0;
10090+
10091+ /*
10092+ * Create a node array from member list in which to collect responses.
10093+ */
10094+
10095+ error = init_sevent(sev);
10096+ if (error)
10097+ goto out;
10098+
10099+ list_for_each_entry(node, &sg->memb, list)
10100+ sev->se_node_ids[i++] = node->id;
10101+
10102+ /*
10103+ * Create and send a leave request message.
10104+ */
10105+
10106+ msg = create_smsg(sg, SMSG_LEAVE_REQ, 0, &len, sev);
10107+
10108+ error = send_members_message_sev(sg, msg, len, sev);
10109+
10110+ out:
10111+ return error;
10112+}
10113+
10114+/*
10115+ * 2. Second step in leaving an SG - after we collect all replies to our leave
10116+ * request, we look at them. If anyone replied with WAIT, we abort our attempt
10117+ * at leaving and try again in a bit.
10118+ */
10119+
10120+static int check_leave_notice(sm_sevent_t *sev)
10121+{
10122+ int pos = 0, wait = 0, neg = 0, restart = 0, i;
10123+
10124+ for (i = 0; i < sev->se_memb_count; i++) {
10125+ switch (sev->se_node_status[i]) {
10126+ case STATUS_POS:
10127+ pos++;
10128+ break;
10129+
10130+ case STATUS_WAIT:
10131+ wait++;
10132+ break;
10133+
10134+ case STATUS_NEG:
10135+ neg++;
10136+ break;
10137+
10138+ default:
10139+ /* we didn't get a valid response from this node,
10140+ * restart the entire sev. */
10141+ restart++;
10142+ break;
10143+ }
10144+ }
10145+
10146+ /* all members approve */
10147+ if (pos && !wait && !restart)
10148+ return 0;
10149+
10150+ return -1;
10151+}
10152+
10153+/*
10154+ * 3. Third step in leaving the SG - tell the member nodes to "stop" the SG.
10155+ * They must be stopped in order to restart without us as a member.
10156+ */
10157+
10158+static int send_leave_stop(sm_sevent_t *sev)
10159+{
10160+ sm_group_t *sg = sev->se_sg;
10161+ char *msg;
10162+ int error, len = 0;
10163+
10164+ /*
10165+ * Re-init the status vector in which to collect responses.
10166+ */
10167+
10168+ memset(sev->se_node_status, 0, sev->se_len_status);
10169+
10170+ /*
10171+ * Create and send a stop message.
10172+ */
10173+
10174+ msg = create_smsg(sg, SMSG_LSTOP_REQ, 0, &len, sev);
10175+
10176+ error = send_members_message_sev(sg, msg, len, sev);
10177+ if (error < 0)
10178+ goto out;
10179+
10180+ /*
10181+ * we and all others stop the SG now
10182+ */
10183+
10184+ sg->ops->stop(sg->service_data);
10185+
10186+ out:
10187+ return error;
10188+}
10189+
10190+/*
10191+ * 4. Fourth step in leaving the SG - check the replies to our stop request.
10192+ * Same problem with getting different replies as check_join_stop.
10193+ */
10194+
10195+static int check_leave_stop(sm_sevent_t *sev)
10196+{
10197+ sm_group_t *sg = sev->se_sg;
10198+ int i, pos = 0, neg = 0;
10199+
10200+ for (i = 0; i < sev->se_memb_count; i++) {
10201+ switch (sev->se_node_status[i]) {
10202+ case STATUS_POS:
10203+ pos++;
10204+ break;
10205+
10206+ case STATUS_NEG:
10207+ log_error(sg, "check_leave_stop: fail from nodeid %u "
10208+ "(%d, %d, %u)", sev->se_node_ids[i],
10209+ pos, neg, sev->se_memb_count);
10210+ neg++;
10211+ break;
10212+
10213+ default:
10214+ log_error(sg, "check_leave_stop: status %u nodeid %u",
10215+ sev->se_node_status[i], sev->se_node_ids[i]);
10216+ neg++;
10217+ break;
10218+ }
10219+ }
10220+
10221+ if (pos == sg->memb_count)
10222+ return 0;
10223+
10224+ return -1;
10225+}
10226+
10227+/*
10228+ * 5. Fifth step in leaving the SG - tell the other SG members to restart the
10229+ * service without us. We, of course, don't start our own stopped service. If
10230+ * we're the last SG member and leaving, we jump right to the next step.
10231+ */
10232+
10233+static int send_leave_start(sm_sevent_t *sev)
10234+{
10235+ sm_group_t *sg = sev->se_sg;
10236+ char *msg;
10237+ int error = 0, len = 0;
10238+
10239+ if (sg->memb_count == 1) {
10240+ sev->se_state = SEST_LSTART_REMOTEDONE;
10241+ set_bit(SEFL_CHECK, &sev->se_flags);
10242+ wake_serviced(DO_JOINLEAVE);
10243+ } else {
10244+ msg = create_smsg(sg, SMSG_LSTART_CMD, 0, &len, sev);
10245+ error = send_members_message(sg, msg, len);
10246+ }
10247+ return error;
10248+}
10249+
10250+/*
10251+ * Move through the steps of a leave. Summary:
10252+ *
10253+ * 1. Send a leave notice to all SG members.
10254+ * 2. Collect and check replies to the leave notice.
10255+ * 3. Send a stop message to all SG members and stop our own SG.
10256+ * 4. Collect and check replies to the stop message.
10257+ * 5. Send a start message to SG members.
10258+ * 6. Clean up sevent and signal completion to the process that
10259+ * started the leave.
10260+ */
10261+
10262+static void process_leave_sevent(sm_sevent_t *sev)
10263+{
10264+ int error = 0;
10265+
10266+ /*
10267+ * We may cancel the current leave attempt if another node is also
10268+ * attempting to join or leave. (Only a single node can join or leave
10269+ * at once.) Our leave attempt will be restarted after being
10270+ * cancelled.
10271+ */
10272+
10273+ if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
10274+ error = 1;
10275+ goto cancel;
10276+ }
10277+
10278+ if (test_bit(SGFL_UEVENT, &sev->se_sg->flags)) {
10279+ error = 2;
10280+ goto cancel;
10281+ }
10282+
10283+ if (!list_empty(&sev->se_sg->joining)) {
10284+ error = 3;
10285+ goto cancel;
10286+ }
10287+
10288+ log_debug(sev->se_sg, "sevent state %u", sev->se_state);
10289+
10290+ switch (sev->se_state) {
10291+
10292+ /*
10293+ * An sevent is created in kcl_leave_service with a state of
10294+ * LEAVE_BEGIN.
10295+ */
10296+
10297+ case SEST_LEAVE_BEGIN:
10298+ sev->se_state = SEST_LEAVE_ACKWAIT;
10299+ error = send_leave_notice(sev);
10300+ break;
10301+
10302+ /*
10303+ * se_state is changed from LEAVE_ACKWAIT to LEAVE_ACKED in
10304+ * process_reply (when all the replies have been received)
10305+ */
10306+
10307+ case SEST_LEAVE_ACKED:
10308+ error = check_leave_notice(sev);
10309+ if (error)
10310+ break;
10311+
10312+ sev->se_state = SEST_LSTOP_ACKWAIT;
10313+ error = send_leave_stop(sev);
10314+ break;
10315+
10316+ /*
10317+ * se_state is changed from LSTOP_ACKWAIT to LSTOP_ACKED in
10318+ * process_reply
10319+ */
10320+
10321+ case SEST_LSTOP_ACKED:
10322+ error = check_leave_stop(sev);
10323+ if (error)
10324+ break;
10325+
10326+ sev->se_state = SEST_LSTART_WAITREMOTE;
10327+ error = send_leave_start(sev);
10328+ break;
10329+
10330+ /*
10331+ * se_state is changed from LSTART_WAITREMOTE to
10332+ * LSTART_REMOTEDONE in process_leave_done
10333+ */
10334+
10335+ case SEST_LSTART_REMOTEDONE:
10336+ sevent_done(sev);
10337+ break;
10338+
10339+ default:
10340+ log_error(sev->se_sg, "process_leave_sevent state=%u\n",
10341+ sev->se_state);
10342+ }
10343+
10344+ cancel:
10345+ if (error) {
10346+ /* restart the sevent from the beginning */
10347+ sev->se_state = SEST_LEAVE_BEGIN;
10348+ set_bit(SEFL_DELAY, &sev->se_flags);
10349+ schedule_sev_restart(sev);
10350+ }
10351+}
10352+
10353+/*
10354+ * Sevent backout code. Take appropriate steps when a recovery occurs while
10355+ * we're in the midst of an sevent. The recovery may or may not affect the
10356+ * sevent. If it does, it usually means cancelling the sevent and restarting
10357+ * it from the beginning once the recovery processing is done.
10358+ */
10359+
10360+/*
10361+ * If any of the nodes that replied with OK is dead, we give up on the current
10362+ * join attempt and restart. Otherwise, this sevent can continue.
10363+ */
10364+
10365+static int backout_join_acked(sm_sevent_t *sev)
10366+{
10367+ sm_node_t *node;
10368+ int i;
10369+
10370+ for (i = 0; i < sev->se_node_count; i++) {
10371+ if (sev->se_node_status[i] != STATUS_POS)
10372+ continue;
10373+
10374+ list_for_each_entry(node, &sm_members, list) {
10375+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags) &&
10376+ (node->id == sev->se_node_ids[i]))
10377+ return TRUE;
10378+ }
10379+ }
10380+ return FALSE;
10381+}
10382+
10383+/*
10384+ * In this state our sg member list exists and mark_affected_sgs() will have
10385+ * set NEED_RECOVERY if any of the nodes in the sg we're joining is dead. We
10386+ * restart the join process if this is the case, otherwise this sevent can
10387+ * continue.
10388+ */
10389+
10390+static int backout_jstop_ackwait(sm_sevent_t *sev)
10391+{
10392+ sm_group_t *sg = sev->se_sg;
10393+
10394+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10395+ return FALSE;
10396+
10397+ clear_bit(SEFL_ALLOW_JSTOP, &sev->se_flags);
10398+ free_sg_memb(sg);
10399+ return TRUE;
10400+}
10401+
10402+/*
10403+ * Same as previous.
10404+ */
10405+
10406+static int backout_jstop_acked(sm_sevent_t *sev)
10407+{
10408+ return backout_jstop_ackwait(sev);
10409+}
10410+
10411+/*
10412+ * If NEED_RECOVERY is set a member of the sg we're joining died while we were
10413+ * starting our service. The recovery process will restart the service on all
10414+ * the prior sg members (not including those that died or us). We will
10415+ * reattempt our join which should be accepted once the nodes are done with
10416+ * recovery.
10417+ */
10418+
10419+static int backout_jstart_servicewait(sm_sevent_t *sev)
10420+{
10421+ sm_group_t *sg = sev->se_sg;
10422+
10423+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10424+ return FALSE;
10425+
10426+ clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
10427+ sg->ops->stop(sg->service_data);
10428+ free_sg_memb(sg);
10429+ return TRUE;
10430+}
10431+
10432+/*
10433+ * Same as previous.
10434+ */
10435+
10436+static int backout_jstart_servicedone(sm_sevent_t *sev)
10437+{
10438+ return backout_jstart_servicewait(sev);
10439+}
10440+
10441+/*
10442+ * If NEED_RECOVERY is set a member of the sg we're joining died while we were
10443+ * waiting on the "all done" barrier. Stop our service that we just started
10444+ * and cancel the barrier. The recovery process will restart the service on
10445+ * all the prior sg members (not including those that died or us). We will
10446+ * reattempt our join which should be accepted once the nodes are done with
10447+ * recovery.
10448+ */
10449+
10450+static int backout_barrier_wait(sm_sevent_t *sev)
10451+{
10452+ sm_group_t *sg = sev->se_sg;
10453+ char bname[MAX_BARRIER_NAME_LEN];
10454+
10455+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10456+ return FALSE;
10457+
10458+ clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
10459+
10460+ sg->ops->stop(sg->service_data);
10461+
10462+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
10463+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
10464+ sg->global_id, sm_our_nodeid, sev->se_id,
10465+ sg->memb_count);
10466+ kcl_barrier_cancel(bname);
10467+
10468+ free_sg_memb(sg);
10469+ return TRUE;
10470+}
10471+
10472+/*
10473+ * If NEED_RECOVERY is set, a member of the sg we just joined has failed. The
10474+ * recovery began after the barrier callback. If the result in the callback is
10475+ * "success" then we are joined, this sevent is finished and we'll process the
10476+ * sg within the forthcoming recovery with the other members.
10477+ *
10478+ * We rely upon cnxman to guarantee that once all nodes have joined a barrier,
10479+ * all nodes will receive the corresponding barrier callback *before any*
10480+ * receive an sm_member_update() due to one of those nodes failing just after
10481+ * joining the barrier. If some nodes receive the sm_member_update() before
10482+ * the barrier callback and others receive the barrier callback before the
10483+ * sm_member_update() then they will disagree as to whether the node joining/
10484+ * leaving is in/out of the sg.
10485+ */
10486+
10487+static int backout_barrier_done(sm_sevent_t *sev)
10488+{
10489+ sm_group_t *sg = sev->se_sg;
10490+
10491+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10492+ return FALSE;
10493+
10494+ if (!sev->se_barrier_status) {
10495+ do_finish_new(sev);
10496+ sevent_done(sev);
10497+ return FALSE;
10498+ } else {
10499+ sg->ops->stop(sg->service_data);
10500+ free_sg_memb(sg);
10501+ return TRUE;
10502+ }
10503+}
10504+
10505+/*
10506+ * We've done nothing yet, just restart when recovery is done (if sg is flagged
10507+ * with recovery.)
10508+ */
10509+
10510+static int backout_leave_begin(sm_sevent_t *sev)
10511+{
10512+ sm_group_t *sg = sev->se_sg;
10513+
10514+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10515+ return FALSE;
10516+
10517+ return TRUE;
10518+}
10519+
10520+/*
10521+ * Ignore any replies to our leave notice and restart when recovery is done (if
10522+ * sg is flagged with recovery.)
10523+ */
10524+
10525+static int backout_leave_ackwait(sm_sevent_t *sev)
10526+{
10527+ sm_group_t *sg = sev->se_sg;
10528+
10529+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10530+ return FALSE;
10531+
10532+ clear_bit(SEFL_ALLOW_LEAVE, &sev->se_flags);
10533+
10534+ return TRUE;
10535+}
10536+
10537+/*
10538+ * Same as previous.
10539+ */
10540+
10541+static int backout_leave_acked(sm_sevent_t *sev)
10542+{
10543+ return backout_leave_ackwait(sev);
10544+}
10545+
10546+/*
10547+ * Ignore any stop replies. All the members will be stopped anyway to do the
10548+ * recovery. Let that happen and restart our leave when done.
10549+ */
10550+
10551+static int backout_lstop_ackwait(sm_sevent_t *sev)
10552+{
10553+ sm_group_t *sg = sev->se_sg;
10554+
10555+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10556+ return FALSE;
10557+
10558+ clear_bit(SEFL_ALLOW_LSTOP, &sev->se_flags);
10559+
10560+ return TRUE;
10561+}
10562+
10563+/*
10564+ * Same as previous.
10565+ */
10566+
10567+static int backout_lstop_acked(sm_sevent_t *sev)
10568+{
10569+ return backout_lstop_ackwait(sev);
10570+}
10571+
10572+/*
10573+ * All members will be stopped due to recovery and restarted by recovery
10574+ * processing. That includes us, we have to retry the leave once the recovery
10575+ * is done.
10576+ */
10577+
10578+static int backout_lstart_waitremote(sm_sevent_t *sev)
10579+{
10580+ sm_group_t *sg = sev->se_sg;
10581+
10582+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10583+ return FALSE;
10584+
10585+ return TRUE;
10586+}
10587+
10588+/*
10589+ * Reset an sevent to its beginning so it can be restarted. This is necessary
10590+ * when recovery affects an SG while we're trying to join or leave (ie. a node
10591+ * in the SG fails).
10592+ */
10593+
10594+void backout_sevents(void)
10595+{
10596+ sm_sevent_t *sev, *safe;
10597+ int delay;
10598+
10599+ list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
10600+
10601+ delay = FALSE;
10602+
10603+ log_debug(sev->se_sg, "backout sevent state %u", sev->se_state);
10604+
10605+ switch (sev->se_state) {
10606+
10607+ /* backout after kcl_join_service and before
10608+ * send_join_notice */
10609+ case SEST_JOIN_BEGIN:
10610+ break;
10611+
10612+ /* backout after send_join_notice and before final
10613+ * process_reply */
10614+ case SEST_JOIN_ACKWAIT:
10615+ clear_bit(SEFL_ALLOW_JOIN, &sev->se_flags);
10616+ sev->se_state = SEST_JOIN_BEGIN;
10617+ schedule_sev_restart(sev);
10618+ break;
10619+
10620+ /* backout after final process_reply and before
10621+ * check_join_notice */
10622+ case SEST_JOIN_ACKED:
10623+ delay = backout_join_acked(sev);
10624+ break;
10625+
10626+ /* backout after send_join_stop and before final
10627+ * process_reply */
10628+ case SEST_JSTOP_ACKWAIT:
10629+ delay = backout_jstop_ackwait(sev);
10630+ break;
10631+
10632+ /* backout after final process_reply and before
10633+ * check_join_stop */
10634+ case SEST_JSTOP_ACKED:
10635+ delay = backout_jstop_acked(sev);
10636+ break;
10637+
10638+ /* backout after send_join_start and before
10639+ * kcl_start_done */
10640+ case SEST_JSTART_SERVICEWAIT:
10641+ delay = backout_jstart_servicewait(sev);
10642+ break;
10643+
10644+ /* backout after kcl_start_done and before
10645+ * startdone_barrier_new */
10646+ case SEST_JSTART_SERVICEDONE:
10647+ delay = backout_jstart_servicedone(sev);
10648+ break;
10649+
10650+ /* backout after startdone_barrier_new and before
10651+ * callback_startdone_barrier_new */
10652+ case SEST_BARRIER_WAIT:
10653+ delay = backout_barrier_wait(sev);
10654+ break;
10655+
10656+ /* backout after callback_startdone_barrier_new and
10657+ * before check_startdone_barrier_new */
10658+ case SEST_BARRIER_DONE:
10659+ delay = backout_barrier_done(sev);
10660+ break;
10661+
10662+ /* backout after kcl_leave_service and before
10663+ * send_leave_notice */
10664+ case SEST_LEAVE_BEGIN:
10665+ delay = backout_leave_begin(sev);
10666+ break;
10667+
10668+ /* backout after send_leave_notice and before final
10669+ * process_reply */
10670+ case SEST_LEAVE_ACKWAIT:
10671+ delay = backout_leave_ackwait(sev);
10672+ break;
10673+
10674+ /* backout after final process_reply and before
10675+ * check_leave_notice */
10676+ case SEST_LEAVE_ACKED:
10677+ delay = backout_leave_acked(sev);
10678+ break;
10679+
10680+ /* backout after send_leave_stop and before final
10681+ * process_reply */
10682+ case SEST_LSTOP_ACKWAIT:
10683+ delay = backout_lstop_ackwait(sev);
10684+ break;
10685+
10686+ /* backout after final process_reply and before
10687+ * check_leave_stop */
10688+ case SEST_LSTOP_ACKED:
10689+ delay = backout_lstop_acked(sev);
10690+ break;
10691+
10692+ /* backout after send_leave_start and before
10693+ * process_lstart_done */
10694+ case SEST_LSTART_WAITREMOTE:
10695+ delay = backout_lstart_waitremote(sev);
10696+ break;
10697+
10698+ /* backout after process_lstart_done and before
10699+ * process_leave_sevent */
10700+ case SEST_LSTART_REMOTEDONE:
10701+ sevent_done(sev);
10702+ delay = FALSE;
10703+ break;
10704+
10705+ default:
10706+ log_error(sev->se_sg, "backout_sevents: bad state %d",
10707+ sev->se_state);
10708+ }
10709+
10710+ if (delay) {
10711+ set_bit(SEFL_DELAY, &sev->se_flags);
10712+
10713+ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
10714+ sev->se_state = SEST_LEAVE_BEGIN;
10715+ /* The DELAY flag will be cleared once recovery
10716+ * is done allowing the leave to be retried. */
10717+ } else {
10718+ sev->se_state = SEST_JOIN_BEGIN;
10719+ /* restart timer function will clear DELAY */
10720+ schedule_sev_restart(sev);
10721+ }
10722+ }
10723+ }
10724+}
10725+
10726+void process_joinleave(void)
10727+{
10728+ sm_sevent_t *sev = NULL, *safe;
10729+
10730+ spin_lock(&new_event_lock);
10731+ if (!list_empty(&new_event)) {
10732+ sev = list_entry(new_event.next, sm_sevent_t, se_list);
10733+ list_del(&sev->se_list);
10734+ list_add_tail(&sev->se_list, &joinleave_events);
10735+ set_bit(SEFL_CHECK, &sev->se_flags);
10736+ }
10737+ spin_unlock(&new_event_lock);
10738+
10739+ list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
10740+ if (!test_and_clear_bit(SEFL_CHECK, &sev->se_flags))
10741+ continue;
10742+
10743+ if (test_bit(SEFL_DELAY, &sev->se_flags))
10744+ continue;
10745+
10746+ if (sev->se_state < SEST_LEAVE_BEGIN)
10747+ process_join_sevent(sev);
10748+ else
10749+ process_leave_sevent(sev);
10750+ }
10751+}
10752diff -urN linux-orig/cluster/cman/sm_joinleave.h linux-patched/cluster/cman/sm_joinleave.h
10753--- linux-orig/cluster/cman/sm_joinleave.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 10754+++ linux-patched/cluster/cman/sm_joinleave.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 10755@@ -0,0 +1,23 @@
10756+/******************************************************************************
10757+*******************************************************************************
10758+**
10759+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10760+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10761+**
10762+** This copyrighted material is made available to anyone wishing to use,
10763+** modify, copy, or redistribute it subject to the terms and conditions
10764+** of the GNU General Public License v.2.
10765+**
10766+*******************************************************************************
10767+******************************************************************************/
10768+
10769+#ifndef __SM_JOINLEAVE_DOT_H__
10770+#define __SM_JOINLEAVE_DOT_H__
10771+
10772+void init_joinleave(void);
10773+void new_joinleave(sm_sevent_t *sev);
10774+void process_joinleave(void);
10775+void backout_sevents(void);
10776+sm_sevent_t *find_sevent(unsigned int id);
10777+
10778+#endif
10779diff -urN linux-orig/cluster/cman/sm_membership.c linux-patched/cluster/cman/sm_membership.c
10780--- linux-orig/cluster/cman/sm_membership.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 10781+++ linux-patched/cluster/cman/sm_membership.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 10782@@ -0,0 +1,696 @@
10783+/******************************************************************************
10784+*******************************************************************************
10785+**
10786+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10787+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10788+**
10789+** This copyrighted material is made available to anyone wishing to use,
10790+** modify, copy, or redistribute it subject to the terms and conditions
10791+** of the GNU General Public License v.2.
10792+**
10793+*******************************************************************************
10794+******************************************************************************/
10795+
10796+#include "sm.h"
10797+
10798+extern struct list_head sm_members;
10799+
10800+/*
10801+ * Routines for SG members to handle other nodes joining or leaving the SG.
10802+ * These "uevent" membership update routines are the response to an "sevent" on
10803+ * a joining/leaving node.
10804+ */
10805+
10806+static void del_memb_node(sm_group_t *sg, uint32_t nodeid)
10807+{
10808+ sm_node_t *node;
10809+
10810+ list_for_each_entry(node, &sg->memb, list) {
10811+ if (node->id != nodeid)
10812+ continue;
10813+ list_del(&node->list);
10814+ kfree(node);
10815+ sg->memb_count--;
10816+ log_debug(sg, "del node %u count %d", nodeid, sg->memb_count);
10817+ break;
10818+ }
10819+}
10820+
10821+static void add_memb_node(sm_group_t *sg, sm_node_t *node)
10822+{
10823+ list_add_tail(&node->list, &sg->memb);
10824+ sg->memb_count++;
10825+ log_debug(sg, "add node %u count %d", node->id, sg->memb_count);
10826+}
10827+
10828+/*
10829+ * Join 1. The receive end of send_join_stop() from a node requesting to join
10830+ * the SG. We stop the service so it can be restarted with the new node.
10831+ */
10832+
10833+static int process_join_stop(sm_group_t *sg)
10834+{
10835+ sm_uevent_t *uev = &sg->uevent;
10836+ sm_node_t *node;
10837+ sm_msg_t reply;
10838+ int error;
10839+
10840+ if (uev->ue_num_nodes != sg->memb_count + 1) {
10841+ log_error(sg, "process_join_stop: bad num nodes %u %u",
10842+ uev->ue_num_nodes, sg->memb_count);
10843+ return -1;
10844+ }
10845+
10846+ sm_set_event_id(&uev->ue_id);
10847+
10848+ node = sm_find_joiner(sg, uev->ue_nodeid);
10849+ SM_ASSERT(node,);
10850+
10851+ sg->state = SGST_UEVENT;
10852+ sg->ops->stop(sg->service_data);
10853+
10854+ reply.ms_type = SMSG_JSTOP_REP;
10855+ reply.ms_status = STATUS_POS;
10856+ reply.ms_sevent_id = uev->ue_remote_seid;
10857+ smsg_bswap_out(&reply);
10858+
10859+ error = send_nodeid_message((char *) &reply, sizeof(reply),
10860+ uev->ue_nodeid);
10861+ if (error < 0)
10862+ return error;
10863+ return 0;
10864+}
10865+
10866+/*
10867+ * Join 2. The receive end of send_join_start() from a node joining the SG.
10868+ * We are re-starting the service with the new member added.
10869+ */
10870+
10871+static int process_join_start(sm_group_t *sg)
10872+{
10873+ sm_uevent_t *uev = &sg->uevent;
10874+ sm_node_t *node;
10875+ uint32_t *memb;
10876+ int count = 0;
10877+
10878+ /* this memory is passed to the service which must free it */
10879+ SM_RETRY(memb =
10880+ kmalloc((sg->memb_count + 1) * sizeof(uint32_t), GFP_KERNEL),
10881+ memb);
10882+
10883+ /* transfer joining node from joining list to member list */
10884+ node = sm_find_joiner(sg, uev->ue_nodeid);
10885+ SM_ASSERT(node, printk("nodeid=%u\n", uev->ue_nodeid););
10886+ list_del(&node->list);
10887+ add_memb_node(sg, node);
10888+
10889+ /* the new member list for the service */
10890+ list_for_each_entry(node, &sg->memb, list)
10891+ memb[count++] = node->id;
10892+
10893+ set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
10894+
10895+ sg->ops->start(sg->service_data, memb, count, uev->ue_id,
10896+ SERVICE_NODE_JOIN);
10897+ return 0;
10898+}
10899+
10900+/*
10901+ * Join 3. When done starting their local service, every previous SG member
10902+ * calls startdone_barrier() and the new/joining member calls
10903+ * startdone_barrier_new(). The barrier returns when everyone has started
10904+ * their service and joined the barrier.
10905+ */
10906+
10907+static int startdone_barrier(sm_group_t *sg)
10908+{
10909+ sm_uevent_t *uev = &sg->uevent;
10910+ char bname[MAX_BARRIER_NAME_LEN];
10911+ int error;
10912+
10913+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
10914+ uev->ue_barrier_status = -1;
10915+
10916+ set_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
10917+
10918+ /* If we're the only member, skip the barrier */
10919+ if (sg->memb_count == 1) {
10920+ process_startdone_barrier(sg, 0);
10921+ return 0;
10922+ }
10923+
10924+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
10925+ sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
10926+ sg->memb_count);
10927+
10928+ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE);
10929+
10930+ return error;
10931+}
10932+
10933+/*
10934+ * Join 4. Check that the "all started" barrier returned a successful status.
10935+ * The newly joined member calls check_startdone_barrier_new().
10936+ */
10937+
10938+static int check_startdone_barrier(sm_group_t *sg)
10939+{
10940+ int error = sg->uevent.ue_barrier_status;
10941+ return error;
10942+}
10943+
10944+/*
10945+ * Join 5. Send the service a "finish" indicating that all members have
10946+ * successfully started. The newly joined member calls do_finish_new().
10947+ */
10948+
10949+static void do_finish(sm_group_t *sg)
10950+{
10951+ sg->state = SGST_RUN;
10952+ clear_bit(SGFL_UEVENT, &sg->flags);
10953+ sg->ops->finish(sg->service_data, sg->uevent.ue_id);
10954+}
10955+
10956+/*
10957+ * Join 6. The uevent is done. If this was a uevent for a node leaving the
10958+ * SG, then send a final message to the departed node signalling that the
10959+ * remaining nodes have restarted since it left.
10960+ */
10961+
10962+static void uevent_done(sm_group_t *sg)
10963+{
10964+ sm_uevent_t *uev = &sg->uevent;
10965+ sm_msg_t reply;
10966+
10967+ if (test_bit(UEFL_LEAVE, &uev->ue_flags)) {
10968+ reply.ms_type = SMSG_LSTART_DONE;
10969+ reply.ms_status = STATUS_POS;
10970+ reply.ms_sevent_id = uev->ue_remote_seid;
10971+ smsg_bswap_out(&reply);
10972+ send_nodeid_message((char *) &reply, sizeof(reply),
10973+ uev->ue_nodeid);
10974+ }
10975+ memset(&sg->uevent, 0, sizeof(sm_uevent_t));
10976+}
10977+
10978+/*
10979+ * Leave 1. The receive end of send_leave_stop() from a node leaving the SG.
10980+ */
10981+
10982+static int process_leave_stop(sm_group_t *sg)
10983+{
10984+ sm_uevent_t *uev = &sg->uevent;
10985+ sm_msg_t reply;
10986+ int error;
10987+
10988+ sm_set_event_id(&uev->ue_id);
10989+
10990+ sg->state = SGST_UEVENT;
10991+ sg->ops->stop(sg->service_data);
10992+
10993+ reply.ms_type = SMSG_LSTOP_REP;
10994+ reply.ms_status = STATUS_POS;
10995+ reply.ms_sevent_id = uev->ue_remote_seid;
10996+ smsg_bswap_out(&reply);
10997+
10998+ error = send_nodeid_message((char *) &reply, sizeof(reply),
10999+ uev->ue_nodeid);
11000+ if (error < 0)
11001+ return error;
11002+ return 0;
11003+}
11004+
11005+/*
11006+ * Leave 2. The receive end of send_leave_start() from a node leaving the SG.
11007+ * We are re-starting the service (without the node that's left naturally.)
11008+ */
11009+
11010+static int process_leave_start(sm_group_t *sg)
11011+{
11012+ sm_uevent_t *uev = &sg->uevent;
11013+ sm_node_t *node;
11014+ uint32_t *memb;
11015+ int count = 0;
11016+
11017+ SM_ASSERT(sg->memb_count > 1,
11018+ printk("memb_count=%u\n", sg->memb_count););
11019+
11020+ /* this memory is passed to the service which must free it */
11021+ SM_RETRY(memb =
11022+ kmalloc((sg->memb_count - 1) * sizeof(uint32_t), GFP_KERNEL),
11023+ memb);
11024+
11025+ /* remove departed member from sg member list */
11026+ del_memb_node(sg, uev->ue_nodeid);
11027+
11028+ /* build member list to pass to service */
11029+ list_for_each_entry(node, &sg->memb, list)
11030+ memb[count++] = node->id;
11031+
11032+ /* allow us to accept the start_done callback for this start */
11033+ set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11034+
11035+ sg->ops->start(sg->service_data, memb, count, uev->ue_id,
11036+ SERVICE_NODE_LEAVE);
11037+ return 0;
11038+}
11039+
11040+/*
11041+ * Move through the steps of another node joining or leaving the SG.
11042+ */
11043+
11044+static void process_one_uevent(sm_group_t *sg)
11045+{
11046+ sm_uevent_t *uev = &sg->uevent;
11047+ int error = 0;
11048+
11049+ log_debug(sg, "uevent state %u node %u", uev->ue_state, uev->ue_nodeid);
11050+
11051+ switch (uev->ue_state) {
11052+
11053+ /*
11054+ * a uevent is initialized with state JSTOP in
11055+ * process_stop_request
11056+ */
11057+
11058+ case UEST_JSTOP:
11059+ uev->ue_state = UEST_JSTART_WAITCMD;
11060+ error = process_join_stop(sg);
11061+ break;
11062+
11063+ /*
11064+ * ue_state is changed from JSTART_WAITCMD to JSTART in
11065+ * process_start_request
11066+ */
11067+
11068+ case UEST_JSTART:
11069+ uev->ue_state = UEST_JSTART_SERVICEWAIT;
11070+ error = process_join_start(sg);
11071+ break;
11072+
11073+ /*
11074+ * ue_state is changed from JSTART_SERVICEWAIT to
11075+ * JSTART_SERVICEDONE in kcl_start_done
11076+ */
11077+
11078+ case UEST_JSTART_SERVICEDONE:
11079+ uev->ue_state = UEST_BARRIER_WAIT;
11080+ error = startdone_barrier(sg);
11081+ break;
11082+
11083+ /*
11084+ * ue_state is changed from BARRIER_WAIT to BARRIER_DONE in
11085+ * process_startdone_barrier
11086+ */
11087+
11088+ case UEST_BARRIER_DONE:
11089+ error = check_startdone_barrier(sg);
11090+ if (error)
11091+ break;
11092+
11093+ do_finish(sg);
11094+ uevent_done(sg);
11095+ break;
11096+
11097+ /*
11098+ * a uevent is initialized with state LSTOP in
11099+ * process_stop_request
11100+ */
11101+
11102+ case UEST_LSTOP:
11103+ uev->ue_state = UEST_LSTART_WAITCMD;
11104+ error = process_leave_stop(sg);
11105+ break;
11106+
11107+ /*
11108+ * a uevent is changed from LSTART_WAITCMD to LSTART in
11109+ * process_start_request
11110+ */
11111+
11112+ case UEST_LSTART:
11113+ uev->ue_state = UEST_LSTART_SERVICEWAIT;
11114+ error = process_leave_start(sg);
11115+ break;
11116+
11117+ /*
11118+ * a uevent is changed from LSTART_SERVICEWAIT to to
11119+ * LSTART_SERVICEDONE in kcl_start_done
11120+ */
11121+
11122+ case UEST_LSTART_SERVICEDONE:
11123+ uev->ue_state = UEST_BARRIER_WAIT;
11124+ error = startdone_barrier(sg);
11125+ break;
11126+
11127+ default:
11128+ error = -1;
11129+ }
11130+
11131+ /* If we encounter an error during these routines, we do nothing,
11132+ expecting that a node failure related to this sg will cause a
11133+ recovery event to arrive and call cancel_one_uevent(). */
11134+
11135+ if (error)
11136+ log_error(sg, "process_one_uevent error %d state %u",
11137+ error, uev->ue_state);
11138+}
11139+
11140+static sm_node_t *failed_memb(sm_group_t *sg, int *count)
11141+{
11142+ sm_node_t *node, *sm_node, *failed_uev_node = NULL;
11143+
11144+ list_for_each_entry(node, &sg->memb, list) {
11145+
11146+ sm_node = sm_find_member(node->id);
11147+ SM_ASSERT(sm_node, );
11148+
11149+ if (test_bit(SNFL_NEED_RECOVERY, &sm_node->flags)) {
11150+ (*count)++;
11151+ if (node->id == sg->uevent.ue_nodeid)
11152+ failed_uev_node = sm_node;
11153+ }
11154+ }
11155+ return failed_uev_node;
11156+}
11157+
11158+static void send_recover_msg(sm_group_t *sg)
11159+{
11160+ char *msg;
11161+ int len = 0;
11162+ msg = create_smsg(sg, SMSG_RECOVER, 0, &len, NULL);
11163+ send_members_message(sg, msg, len);
11164+}
11165+
11166+static void cancel_barrier(sm_group_t *sg)
11167+{
11168+ sm_uevent_t *uev = &sg->uevent;
11169+ char bname[MAX_BARRIER_NAME_LEN];
11170+
11171+ clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
11172+
11173+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
11174+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
11175+ sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
11176+ sg->memb_count);
11177+ kcl_barrier_cancel(bname);
11178+}
11179+
11180+static void cancel_one_uevent(sm_group_t *sg, int *effected)
11181+{
11182+ sm_uevent_t *uev = &sg->uevent;
11183+ int failed_count;
11184+ sm_node_t *node, *failed_joiner, *failed_leaver;
11185+
11186+ log_debug(sg, "cancel uevent state %u node %u", uev->ue_state,
11187+ uev->ue_nodeid);
11188+
11189+ switch (uev->ue_state) {
11190+
11191+ case UEST_JSTOP:
11192+ case UEST_JSTART_WAITCMD:
11193+ case UEST_JSTART:
11194+
11195+ sg->ops->stop(sg->service_data);
11196+
11197+ failed_count = 0;
11198+ failed_joiner = failed_memb(sg, &failed_count);
11199+ SM_ASSERT(!failed_joiner, );
11200+
11201+ node = sm_find_member(uev->ue_nodeid);
11202+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11203+ failed_joiner = node;
11204+
11205+ if (!failed_count) {
11206+ /* only joining node failed */
11207+ SM_ASSERT(failed_joiner, );
11208+ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11209+ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11210+ (*effected)++;
11211+ /* some nodes may not have gotten a JSTOP message
11212+ in which case this will tell them to begin
11213+ recovery for this sg. */
11214+ send_recover_msg(sg);
11215+
11216+ } else {
11217+ /* a member node failed (and possibly joining node, it
11218+ doesn't matter) */
11219+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11220+ }
11221+
11222+ clear_bit(SGFL_UEVENT, &sg->flags);
11223+ memset(uev, 0, sizeof(sm_uevent_t));
11224+ break;
11225+
11226+
11227+ case UEST_JSTART_SERVICEWAIT:
11228+ case UEST_JSTART_SERVICEDONE:
11229+
11230+ clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11231+ sg->ops->stop(sg->service_data);
11232+
11233+ failed_count = 0;
11234+ failed_joiner = failed_memb(sg, &failed_count);
11235+ SM_ASSERT(failed_count, );
11236+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11237+
11238+ if (failed_count == 1 && failed_joiner) {
11239+ /* only joining node failed */
11240+
11241+ } else if (failed_count && failed_joiner) {
11242+ /* joining node and another member failed */
11243+
11244+ } else {
11245+ /* other member failed, joining node still alive */
11246+ SM_ASSERT(!failed_joiner, );
11247+ del_memb_node(sg, uev->ue_nodeid);
11248+ }
11249+
11250+ clear_bit(SGFL_UEVENT, &sg->flags);
11251+ memset(uev, 0, sizeof(sm_uevent_t));
11252+ break;
11253+
11254+
11255+ case UEST_LSTOP:
11256+ case UEST_LSTART_WAITCMD:
11257+ case UEST_LSTART:
11258+
11259+ sg->ops->stop(sg->service_data);
11260+
11261+ failed_count = 0;
11262+ failed_leaver = failed_memb(sg, &failed_count);
11263+ SM_ASSERT(failed_count, );
11264+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11265+
11266+ if (failed_count == 1 && failed_leaver) {
11267+ /* only leaving node failed */
11268+
11269+ } else if (failed_count && failed_leaver) {
11270+ /* leaving node and another member failed */
11271+
11272+ } else {
11273+ /* other member failed, leaving node still alive */
11274+ SM_ASSERT(!failed_leaver, );
11275+ }
11276+
11277+ clear_bit(SGFL_UEVENT, &sg->flags);
11278+ memset(uev, 0, sizeof(sm_uevent_t));
11279+ break;
11280+
11281+
11282+ case UEST_LSTART_SERVICEWAIT:
11283+ case UEST_LSTART_SERVICEDONE:
11284+
11285+ clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11286+ sg->ops->stop(sg->service_data);
11287+
11288+ failed_count = 0;
11289+ failed_leaver = failed_memb(sg, &failed_count);
11290+ SM_ASSERT(!failed_leaver, );
11291+
11292+ node = sm_find_member(uev->ue_nodeid);
11293+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11294+ failed_leaver = node;
11295+
11296+ if (!failed_count) {
11297+ /* only leaving node failed */
11298+ SM_ASSERT(failed_leaver, );
11299+ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11300+ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11301+ (*effected)++;
11302+
11303+ } else if (failed_count && failed_leaver) {
11304+ /* leaving node and another member failed */
11305+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11306+
11307+ } else {
11308+ /* other member failed, leaving node still alive */
11309+ SM_ASSERT(failed_count, );
11310+ SM_ASSERT(!failed_leaver, );
11311+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11312+ node = sm_new_node(sg->uevent.ue_nodeid);
11313+ add_memb_node(sg, node);
11314+ }
11315+
11316+ clear_bit(SGFL_UEVENT, &sg->flags);
11317+ memset(uev, 0, sizeof(sm_uevent_t));
11318+ break;
11319+
11320+
11321+ case UEST_BARRIER_WAIT:
11322+
11323+ if (test_bit(UEFL_LEAVE, &uev->ue_flags))
11324+ goto barrier_wait_leave;
11325+
11326+ sg->ops->stop(sg->service_data);
11327+ cancel_barrier(sg);
11328+
11329+ barrier_wait_join:
11330+
11331+ failed_count = 0;
11332+ failed_joiner = failed_memb(sg, &failed_count);
11333+ SM_ASSERT(failed_count, );
11334+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11335+
11336+ if (failed_count == 1 && failed_joiner) {
11337+ /* only joining node failed */
11338+
11339+ } else if (failed_count && failed_joiner) {
11340+ /* joining node and another member failed */
11341+
11342+ } else {
11343+ /* other member failed, joining node still alive */
11344+ SM_ASSERT(!failed_joiner, );
11345+ del_memb_node(sg, uev->ue_nodeid);
11346+ }
11347+
11348+ clear_bit(SGFL_UEVENT, &sg->flags);
11349+ memset(uev, 0, sizeof(sm_uevent_t));
11350+ break;
11351+
11352+ barrier_wait_leave:
11353+
11354+ failed_count = 0;
11355+ failed_leaver = failed_memb(sg, &failed_count);
11356+ SM_ASSERT(!failed_leaver, );
11357+
11358+ node = sm_find_member(uev->ue_nodeid);
11359+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11360+ failed_leaver = node;
11361+
11362+ if (!failed_count) {
11363+ /* only leaving node failed */
11364+ SM_ASSERT(failed_leaver, );
11365+ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11366+ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11367+ (*effected)++;
11368+
11369+ } else if (failed_count && failed_leaver) {
11370+ /* leaving node and another member failed */
11371+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11372+
11373+ } else {
11374+ /* other member failed, leaving node still alive */
11375+ SM_ASSERT(failed_count, );
11376+ SM_ASSERT(!failed_leaver, );
11377+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11378+ node = sm_new_node(sg->uevent.ue_nodeid);
11379+ add_memb_node(sg, node);
11380+ }
11381+
11382+ clear_bit(SGFL_UEVENT, &sg->flags);
11383+ memset(uev, 0, sizeof(sm_uevent_t));
11384+ break;
11385+
11386+
11387+ case UEST_BARRIER_DONE:
11388+
11389+ if (!uev->ue_barrier_status) {
11390+ do_finish(sg);
11391+ uevent_done(sg);
11392+ break;
11393+ }
11394+
11395+ if (test_bit(UEFL_LEAVE, &uev->ue_flags))
11396+ goto barrier_wait_leave;
11397+ else
11398+ goto barrier_wait_join;
11399+
11400+
11401+ default:
11402+ log_error(sg, "cancel_one_uevent: state %d", uev->ue_state);
11403+ }
11404+}
11405+
11406+void cancel_uevents(int *effected)
11407+{
11408+ sm_group_t *sg;
11409+ sm_node_t *node, *sgnode;
11410+ int i;
11411+
11412+ list_for_each_entry(node, &sm_members, list) {
11413+ if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
11414+ continue;
11415+
11416+ /*
11417+ * Clear this dead node from the "interested in joining" list
11418+ * of any SG. The node is added to this list before the uevent
11419+ * begins.
11420+ */
11421+
11422+ for (i = 0; i < SG_LEVELS; i++) {
11423+ list_for_each_entry(sg, &sm_sg[i], list) {
11424+ sgnode = sm_find_joiner(sg, node->id);
11425+ if (sgnode) {
11426+ log_debug(sg, "clear joining node %u",
11427+ sgnode->id);
11428+ list_del(&sgnode->list);
11429+ kfree(sgnode);
11430+ }
11431+ }
11432+ }
11433+ }
11434+
11435+ /* Adjust any uevents in sg's effected by the failed node(s) */
11436+
11437+ for (i = 0; i < SG_LEVELS; i++) {
11438+ list_for_each_entry(sg, &sm_sg[i], list) {
11439+ if (!test_bit(SGFL_UEVENT, &sg->flags))
11440+ continue;
11441+
11442+ /* We may have some cancelling to do if this sg is
11443+ flagged as having a failed member, or if a joining
11444+ or leaving node has died. */
11445+
11446+ if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
11447+ cancel_one_uevent(sg, effected);
11448+ else if (sg->uevent.ue_nodeid) {
11449+ node = sm_find_member(sg->uevent.ue_nodeid);
11450+ SM_ASSERT(node, );
11451+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11452+ cancel_one_uevent(sg, effected);
11453+ }
11454+ }
11455+ }
11456+}
11457+
11458+void process_membership(void)
11459+{
11460+ sm_group_t *sg;
11461+ int i;
11462+
11463+ down(&sm_sglock);
11464+
11465+ for (i = 0; i < SG_LEVELS; i++) {
11466+ list_for_each_entry(sg, &sm_sg[i], list) {
11467+ if (!test_bit(SGFL_UEVENT, &sg->flags))
11468+ continue;
11469+
11470+ if (!test_and_clear_bit(UEFL_CHECK,
11471+ &sg->uevent.ue_flags))
11472+ continue;
11473+
11474+ process_one_uevent(sg);
11475+ }
11476+ }
11477+ up(&sm_sglock);
11478+}
11479diff -urN linux-orig/cluster/cman/sm_membership.h linux-patched/cluster/cman/sm_membership.h
11480--- linux-orig/cluster/cman/sm_membership.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11481+++ linux-patched/cluster/cman/sm_membership.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 11482@@ -0,0 +1,20 @@
11483+/******************************************************************************
11484+*******************************************************************************
11485+**
11486+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11487+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11488+**
11489+** This copyrighted material is made available to anyone wishing to use,
11490+** modify, copy, or redistribute it subject to the terms and conditions
11491+** of the GNU General Public License v.2.
11492+**
11493+*******************************************************************************
11494+******************************************************************************/
11495+
11496+#ifndef __SM_MEMBERSHIP_DOT_H__
11497+#define __SM_MEMBERSHIP_DOT_H__
11498+
11499+void process_membership(void);
11500+void cancel_uevents(int *effected);
11501+
11502+#endif
11503diff -urN linux-orig/cluster/cman/sm_message.c linux-patched/cluster/cman/sm_message.c
11504--- linux-orig/cluster/cman/sm_message.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11505+++ linux-patched/cluster/cman/sm_message.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 11506@@ -0,0 +1,867 @@
11507+/******************************************************************************
11508+*******************************************************************************
11509+**
11510+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11511+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11512+**
11513+** This copyrighted material is made available to anyone wishing to use,
11514+** modify, copy, or redistribute it subject to the terms and conditions
11515+** of the GNU General Public License v.2.
11516+**
11517+*******************************************************************************
11518+******************************************************************************/
11519+
11520+#include "sm.h"
11521+
11522+#define SMSG_BUF_SIZE (sizeof(sm_msg_t) + MAX_SERVICE_NAME_LEN + 1)
11523+
11524+extern struct socket * sm_socket;
11525+extern uint32_t sm_our_nodeid;
11526+static uint32_t global_last_id;
11527+static struct list_head messages;
11528+static spinlock_t message_lock;
11529+static char smsg_buf[SMSG_BUF_SIZE];
11530+
11531+int send_nodeid_message(char *msg, int len, uint32_t nodeid);
11532+
11533+struct rq_entry {
11534+ struct list_head list;
11535+ char *msg;
11536+ int len;
11537+ uint32_t nodeid;
11538+};
11539+typedef struct rq_entry rq_entry_t;
11540+
11541+void init_messages(void)
11542+{
11543+ global_last_id = 1;
11544+ INIT_LIST_HEAD(&messages);
11545+ spin_lock_init(&message_lock);
11546+}
11547+
11548+uint32_t sm_new_global_id(int level)
11549+{
11550+ uint32_t id = global_last_id++;
11551+ uint8_t l = (uint8_t) level;
11552+
11553+ if (level > 255)
11554+ return 0;
11555+
11556+ if (id > 0x00FFFFFF)
11557+ return 0;
11558+
11559+ id |= (l << 24);
11560+ return id;
11561+}
11562+
11563+static void smsg_copy_in(char *msg, sm_msg_t *smsg)
11564+{
11565+ sm_msg_t *in = (sm_msg_t *) msg;
11566+
11567+ smsg->ms_type = in->ms_type;
11568+ smsg->ms_status = in->ms_status;
11569+ smsg->ms_sevent_id = le16_to_cpu(in->ms_sevent_id);
11570+ smsg->ms_global_sgid = le32_to_cpu(in->ms_global_sgid);
11571+ smsg->ms_global_lastid = le32_to_cpu(in->ms_global_lastid);
11572+ smsg->ms_sglevel = le16_to_cpu(in->ms_sglevel);
11573+ smsg->ms_length = le16_to_cpu(in->ms_length);
11574+}
11575+
11576+/* swapping bytes in place is an easy source of errors - be careful not to
11577+ * access the fields after calling this */
11578+
11579+void smsg_bswap_out(sm_msg_t *smsg)
11580+{
11581+ smsg->ms_sevent_id = cpu_to_le16(smsg->ms_sevent_id);
11582+ smsg->ms_global_sgid = cpu_to_le32(smsg->ms_global_sgid);
11583+ smsg->ms_global_lastid = cpu_to_le32(smsg->ms_global_lastid);
11584+ smsg->ms_sglevel = cpu_to_le16(smsg->ms_sglevel);
11585+ smsg->ms_length = cpu_to_le16(smsg->ms_length);
11586+}
11587+
11588+char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
11589+ sm_sevent_t *sev)
11590+{
11591+ char *msg;
11592+ sm_msg_t *smsg;
11593+ int fulllen = sizeof(sm_msg_t) + datalen;
11594+
11595+ msg = smsg_buf;
11596+ memset(smsg_buf, 0, SMSG_BUF_SIZE);
11597+ SM_ASSERT(fulllen <= SMSG_BUF_SIZE,);
11598+
11599+ smsg = (sm_msg_t *) msg;
11600+ smsg->ms_type = type;
11601+ smsg->ms_global_sgid = sg->global_id;
11602+ smsg->ms_sglevel = sg->level;
11603+ smsg->ms_length = datalen;
11604+ smsg->ms_sevent_id = sev ? sev->se_id : 0;
11605+
11606+ smsg_bswap_out(smsg);
11607+ *msglen = fulllen;
11608+ return msg;
11609+}
11610+
11611+static unsigned int msgtype_to_flag(int type)
11612+{
11613+ unsigned int flag;
11614+
11615+ switch (type) {
11616+ case SMSG_JOIN_REP:
11617+ case SMSG_JOIN_REQ:
11618+ flag = SEFL_ALLOW_JOIN;
11619+ break;
11620+
11621+ case SMSG_JSTOP_REP:
11622+ case SMSG_JSTOP_REQ:
11623+ flag = SEFL_ALLOW_JSTOP;
11624+ break;
11625+
11626+ case SMSG_LEAVE_REP:
11627+ case SMSG_LEAVE_REQ:
11628+ flag = SEFL_ALLOW_LEAVE;
11629+ break;
11630+
11631+ case SMSG_LSTOP_REP:
11632+ case SMSG_LSTOP_REQ:
11633+ flag = SEFL_ALLOW_LSTOP;
11634+ break;
11635+
11636+ default:
11637+ SM_ASSERT(0, printk("msgtype_to_flag bad type %d\n", type););
11638+ }
11639+ return flag;
11640+}
11641+
11642+static int test_allowed_msgtype(sm_sevent_t * sev, int type)
11643+{
11644+ unsigned int flag = msgtype_to_flag(type);
11645+
11646+ return test_bit(flag, &sev->se_flags);
11647+}
11648+
11649+static void clear_allowed_msgtype(sm_sevent_t * sev, int type)
11650+{
11651+ unsigned int flag = msgtype_to_flag(type);
11652+
11653+ clear_bit(flag, &sev->se_flags);
11654+}
11655+
11656+static void set_allowed_msgtype(sm_sevent_t * sev, int type)
11657+{
11658+ unsigned int flag = msgtype_to_flag(type);
11659+
11660+ set_bit(flag, &sev->se_flags);
11661+}
11662+
11663+static int save_global_id(sm_sevent_t * sev, sm_msg_t * smsg)
11664+{
11665+ sm_group_t *sg = sev->se_sg;
11666+
11667+ if (!smsg->ms_global_sgid) {
11668+ log_error(sg, "save_global_id: zero sg id");
11669+ return -1;
11670+ }
11671+
11672+ if (!sg->global_id)
11673+ sg->global_id = smsg->ms_global_sgid;
11674+
11675+ if (sg->global_id != smsg->ms_global_sgid) {
11676+ log_error(sg, "save_global_id: id %x", smsg->ms_global_sgid);
11677+ return -1;
11678+ }
11679+ return 0;
11680+}
11681+
11682+static void save_lastid(sm_msg_t * smsg)
11683+{
11684+ uint32_t gid = smsg->ms_global_lastid & 0x00FFFFFF;
11685+
11686+ /*
11687+ * Keep track of the highst SG id which has been used
11688+ * in the cluster in case we need to choose a new SG id.
11689+ */
11690+
11691+ if (gid > global_last_id)
11692+ global_last_id = gid;
11693+}
11694+
11695+static int next_sev_state(int msg_type, int cur_state)
11696+{
11697+ int next = 0;
11698+
11699+ switch (msg_type) {
11700+ case SMSG_JOIN_REP:
11701+ SM_ASSERT(cur_state == SEST_JOIN_ACKWAIT,);
11702+ next = SEST_JOIN_ACKED;
11703+ break;
11704+
11705+ case SMSG_JSTOP_REP:
11706+ SM_ASSERT(cur_state == SEST_JSTOP_ACKWAIT,);
11707+ next = SEST_JSTOP_ACKED;
11708+ break;
11709+
11710+ case SMSG_LEAVE_REP:
11711+ SM_ASSERT(cur_state == SEST_LEAVE_ACKWAIT,);
11712+ next = SEST_LEAVE_ACKED;
11713+ break;
11714+
11715+ case SMSG_LSTOP_REP:
11716+ SM_ASSERT(cur_state == SEST_LSTOP_ACKWAIT,);
11717+ next = SEST_LSTOP_ACKED;
11718+ break;
11719+ }
11720+ return next;
11721+}
11722+
11723+/*
11724+ * Functions in sevent.c send messages to other nodes and then expect replies.
11725+ * This function collects the replies for the sevent messages and moves the
11726+ * sevent to the next stage when all the expected replies have been received.
11727+ */
11728+
11729+static void process_reply(sm_msg_t * smsg, uint32_t nodeid)
11730+{
11731+ sm_sevent_t *sev;
11732+ int i, expected, type = smsg->ms_type;
11733+
11734+ /*
11735+ * Find the relevant sevent.
11736+ */
11737+
11738+ sev = find_sevent(smsg->ms_sevent_id);
11739+ if (!sev) {
11740+ log_print("process_reply invalid id=%u nodeid=%u",
11741+ smsg->ms_sevent_id, nodeid);
11742+ goto out;
11743+ }
11744+
11745+ /*
11746+ * Check if this message type is what this sevent is waiting for.
11747+ */
11748+
11749+ if (!test_allowed_msgtype(sev, type)) {
11750+ log_debug(sev->se_sg, "process_reply ignored type=%u nodeid=%u " "id=%u", type, nodeid, sev->se_id);
11751+ goto out;
11752+ }
11753+
11754+ expected =
11755+ (type == SMSG_JOIN_REP) ? sev->se_node_count : sev->se_memb_count;
11756+
11757+ SM_ASSERT(expected * sizeof(uint32_t) <= sev->se_len_ids,
11758+ printk("type=%d expected=%d len_ids=%d node_count=%d "
11759+ "memb_count=%d\n", type, expected, sev->se_len_ids,
11760+ sev->se_node_count, sev->se_memb_count););
11761+
11762+ SM_ASSERT(expected * sizeof(char) <= sev->se_len_status,
11763+ printk("type=%d expected=%d len_status=%d node_count=%d "
11764+ "memb_count=%d\n", type, expected, sev->se_len_status,
11765+ sev->se_node_count, sev->se_memb_count););
11766+
11767+ for (i = 0; i < expected; i++) {
11768+ if (sev->se_node_ids[i] == nodeid) {
11769+ /*
11770+ * Save the status from the replying node
11771+ */
11772+
11773+ if (!sev->se_node_status[i])
11774+ sev->se_node_status[i] = smsg->ms_status;
11775+ else {
11776+ log_error(sev->se_sg, "process_reply duplicate"
11777+ "id=%u nodeid=%u %u/%u",
11778+ sev->se_id, nodeid,
11779+ sev->se_node_status[i],
11780+ smsg->ms_status);
11781+ goto out;
11782+ }
11783+
11784+ if (type == SMSG_JOIN_REP) {
11785+ save_lastid(smsg);
11786+
11787+ if (smsg->ms_status == STATUS_POS)
11788+ save_global_id(sev, smsg);
11789+ }
11790+
11791+ /*
11792+ * Signal sm if we have all replies
11793+ */
11794+
11795+ if (++sev->se_reply_count == expected) {
11796+ clear_allowed_msgtype(sev, type);
11797+ sev->se_state = next_sev_state(type,
11798+ sev->se_state);
11799+ set_bit(SEFL_CHECK, &sev->se_flags);
11800+ wake_serviced(DO_JOINLEAVE);
11801+ }
11802+
11803+ break;
11804+ }
11805+ }
11806+
11807+ out:
11808+ return;
11809+}
11810+
11811+/*
11812+ * A node wants to join an SG and has run send_join_notice. If we know nothing
11813+ * about the SG , then we have no objection - send back STATUS_POS. If we're a
11814+ * member of the SG, then send back STATUS_POS (go ahead and join) if there's
11815+ * no sevent or uevent of higher priority in progress (only a single join or
11816+ * leave is permitted for the SG at once). If there happens to be a higher
11817+ * priority sevent/uevent in progress, send back STATUS_WAIT to defer the
11818+ * requested join for a bit.
11819+ */
11820+
11821+static void process_join_request(sm_msg_t *smsg, uint32_t nodeid, char *name)
11822+{
11823+ sm_group_t *sg = NULL;
11824+ sm_sevent_t *sev = NULL;
11825+ sm_node_t *node;
11826+ int found = FALSE;
11827+ int level = smsg->ms_sglevel;
11828+ sm_msg_t reply;
11829+
11830+ memset(&reply, 0, sizeof(reply));
11831+
11832+ down(&sm_sglock);
11833+
11834+ if (nodeid == sm_our_nodeid)
11835+ goto next;
11836+
11837+ /*
11838+ * search SG list for an SG with given name/len
11839+ */
11840+
11841+ list_for_each_entry(sg, &sm_sg[level], list) {
11842+ if ((sg->namelen != smsg->ms_length) ||
11843+ memcmp(sg->name, name, sg->namelen))
11844+ continue;
11845+ found = TRUE;
11846+ break;
11847+ }
11848+
11849+ /*
11850+ * build reply message
11851+ */
11852+
11853+ next:
11854+
11855+ if (!found) {
11856+ reply.ms_type = SMSG_JOIN_REP;
11857+ reply.ms_status = STATUS_NEG;
11858+ reply.ms_global_lastid = global_last_id;
11859+ reply.ms_sevent_id = smsg->ms_sevent_id;
11860+ } else {
11861+ reply.ms_type = SMSG_JOIN_REP;
11862+ reply.ms_status = STATUS_POS;
11863+ reply.ms_sevent_id = smsg->ms_sevent_id;
11864+ reply.ms_global_sgid = sg->global_id;
11865+ reply.ms_global_lastid = global_last_id;
11866+
11867+ /*
11868+ * The node trying to join should wait and try again until
11869+ * we're done with recovery.
11870+ */
11871+
11872+ if (sg->state == SGST_RECOVER) {
11873+ reply.ms_status = STATUS_WAIT;
11874+ goto send;
11875+ }
11876+
11877+ /*
11878+ * An sevent node trying to join may have gotten as far as
11879+ * creating a uevent with us and then backed out. That node
11880+ * will retry joining from the beginning so we should not turn
11881+ * them away. If we're handling a uevent for another node,
11882+ * tell the joining node to wait.
11883+ */
11884+
11885+ if (test_bit(SGFL_UEVENT, &sg->flags)) {
11886+ if (sg->uevent.ue_nodeid != nodeid)
11887+ reply.ms_status = STATUS_WAIT;
11888+ goto send;
11889+ }
11890+
11891+ /*
11892+ * We're trying to join or leave the SG at the moment.
11893+ */
11894+
11895+ if (test_bit(SGFL_SEVENT, &sg->flags)) {
11896+ sev = sg->sevent;
11897+
11898+ /*
11899+ * We're trying to leave. Make the join wait until
11900+ * we've left if we're beyond LEAVE_ACKWAIT.
11901+ */
11902+
11903+ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
11904+ if (sev->se_state > SEST_LEAVE_ACKED)
11905+ reply.ms_status = STATUS_WAIT;
11906+ else {
11907+ reply.ms_status = STATUS_POS;
11908+ clear_bit(SEFL_ALLOW_LEAVE,
11909+ &sev->se_flags);
11910+ set_bit(SEFL_CANCEL, &sev->se_flags);
11911+ }
11912+ }
11913+
11914+ /*
11915+ * We're trying to join. Making the other join wait
11916+ * until we're joined if we're beyond JOIN_ACKWAIT or
11917+ * if we have a lower id. (Send NEG to allow the other
11918+ * node to go ahead because we're not in the SG.)
11919+ */
11920+
11921+ else {
11922+ if (sev->se_state > SEST_JOIN_ACKED)
11923+ reply.ms_status = STATUS_WAIT;
11924+ else if (sm_our_nodeid < nodeid)
11925+ reply.ms_status = STATUS_WAIT;
11926+ else {
11927+ reply.ms_status = STATUS_NEG;
11928+ clear_bit(SEFL_ALLOW_JOIN,
11929+ &sev->se_flags);
11930+ set_bit(SEFL_CANCEL, &sev->se_flags);
11931+ }
11932+ }
11933+
11934+ if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
11935+ set_bit(SEFL_CHECK, &sev->se_flags);
11936+ wake_serviced(DO_JOINLEAVE);
11937+ }
11938+ goto send;
11939+ }
11940+
11941+ /* no r,u,s event, stick with STATUS_POS */
11942+ }
11943+
11944+ send:
11945+
11946+ if (reply.ms_status == STATUS_POS) {
11947+ node = sm_find_joiner(sg, nodeid);
11948+ if (!node) {
11949+ node = sm_new_node(nodeid);
11950+ list_add_tail(&node->list, &sg->joining);
11951+ }
11952+ }
11953+
11954+ up(&sm_sglock);
11955+ smsg_bswap_out(&reply);
11956+ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
11957+}
11958+
11959+/*
11960+ * Another node wants us to stop a service so it can join or leave the SG. We
11961+ * do this by saving the request info in a uevent and having the sm thread do
11962+ * the processing and then replying.
11963+ */
11964+
11965+static void process_stop_request(sm_msg_t * smsg, uint32_t nodeid,
11966+ uint32_t * msgbuf)
11967+{
11968+ sm_group_t *sg;
11969+ sm_uevent_t *uev;
11970+ sm_msg_t reply;
11971+ int type = smsg->ms_type;
11972+
11973+ if (nodeid == sm_our_nodeid)
11974+ goto agree;
11975+
11976+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
11977+ if (!sg) {
11978+ log_print("process_stop_request: unknown sg id %x",
11979+ smsg->ms_global_sgid);
11980+ return;
11981+ }
11982+
11983+ /*
11984+ * We shouldn't get here with uevent already set.
11985+ */
11986+
11987+ if (test_and_set_bit(SGFL_UEVENT, &sg->flags)) {
11988+ log_error(sg, "process_stop_request: uevent already set");
11989+ return;
11990+ }
11991+
11992+ uev = &sg->uevent;
11993+ uev->ue_nodeid = nodeid;
11994+ uev->ue_remote_seid = smsg->ms_sevent_id;
11995+ uev->ue_state = (type == SMSG_JSTOP_REQ) ? UEST_JSTOP : UEST_LSTOP;
11996+
11997+ if (type == SMSG_JSTOP_REQ)
11998+ uev->ue_num_nodes = be32_to_cpu(*msgbuf);
11999+ else
12000+ set_bit(UEFL_LEAVE, &uev->ue_flags);
12001+
12002+ /*
12003+ * Do process_join_stop() or process_leave_stop().
12004+ */
12005+
12006+ set_bit(UEFL_CHECK, &uev->ue_flags);
12007+ wake_serviced(DO_MEMBERSHIP);
12008+ return;
12009+
12010+ agree:
12011+ reply.ms_status = STATUS_POS;
12012+ reply.ms_type =
12013+ (type == SMSG_JSTOP_REQ) ? SMSG_JSTOP_REP : SMSG_LSTOP_REP;
12014+ reply.ms_sevent_id = smsg->ms_sevent_id;
12015+ smsg_bswap_out(&reply);
12016+ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12017+}
12018+
12019+static void process_start_request(sm_msg_t * smsg, uint32_t nodeid)
12020+{
12021+ sm_group_t *sg;
12022+ sm_uevent_t *uev;
12023+ int type = smsg->ms_type;
12024+
12025+ if (nodeid == sm_our_nodeid)
12026+ return;
12027+
12028+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
12029+ if (!sg) {
12030+ log_print("process_start_request: unknown sg id %x",
12031+ smsg->ms_global_sgid);
12032+ return;
12033+ }
12034+
12035+ if (!test_bit(SGFL_UEVENT, &sg->flags)) {
12036+ log_error(sg, "process_start_request: no uevent");
12037+ return;
12038+ }
12039+
12040+ uev = &sg->uevent;
12041+
12042+ if (type == SMSG_JSTART_CMD)
12043+ uev->ue_state = UEST_JSTART;
12044+ else
12045+ uev->ue_state = UEST_LSTART;
12046+
12047+ set_bit(UEFL_CHECK, &uev->ue_flags);
12048+ wake_serviced(DO_MEMBERSHIP);
12049+}
12050+
12051+static void process_leave_request(sm_msg_t * smsg, uint32_t nodeid)
12052+{
12053+ sm_group_t *sg;
12054+ sm_node_t *node;
12055+ sm_msg_t reply;
12056+ sm_sevent_t *sev;
12057+ int found = FALSE;
12058+
12059+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
12060+ if (sg) {
12061+ if (nodeid == sm_our_nodeid)
12062+ found = TRUE;
12063+ else {
12064+ list_for_each_entry(node, &sg->memb, list) {
12065+ if (node->id != nodeid)
12066+ continue;
12067+ set_bit(SNFL_LEAVING, &node->flags);
12068+ found = TRUE;
12069+ break;
12070+ }
12071+ }
12072+ }
12073+
12074+ if (!found) {
12075+ reply.ms_type = SMSG_LEAVE_REP;
12076+ reply.ms_status = STATUS_NEG;
12077+ reply.ms_sevent_id = smsg->ms_sevent_id;
12078+ } else {
12079+ reply.ms_type = SMSG_LEAVE_REP;
12080+ reply.ms_status = STATUS_POS;
12081+ reply.ms_sevent_id = smsg->ms_sevent_id;
12082+
12083+ if (sg->state == SGST_RECOVER)
12084+ reply.ms_status = STATUS_WAIT;
12085+
12086+ else if (test_bit(SGFL_SEVENT, &sg->flags) &&
12087+ nodeid != sm_our_nodeid) {
12088+ sev = sg->sevent;
12089+
12090+ /*
12091+ * We're trying to join or leave at the moment. If
12092+ * we're past JOIN/LEAVE_ACKWAIT, we make the requestor
12093+ * wait. Otherwise, if joining we'll cancel to let the
12094+ * leave happen first, or if we're leaving allow the
12095+ * lower nodeid to leave first.
12096+ */
12097+
12098+ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
12099+ if (sev->se_state > SEST_LEAVE_ACKWAIT)
12100+ reply.ms_status = STATUS_WAIT;
12101+ else if (sm_our_nodeid < nodeid)
12102+ reply.ms_status = STATUS_WAIT;
12103+ else {
12104+ reply.ms_status = STATUS_POS;
12105+ clear_bit(SEFL_ALLOW_LEAVE,
12106+ &sev->se_flags);
12107+ set_bit(SEFL_CANCEL, &sev->se_flags);
12108+ }
12109+ } else {
12110+ if (sev->se_state > SEST_JOIN_ACKWAIT)
12111+ reply.ms_status = STATUS_WAIT;
12112+ else {
12113+ reply.ms_status = STATUS_NEG;
12114+ clear_bit(SEFL_ALLOW_JOIN,
12115+ &sev->se_flags);
12116+ set_bit(SEFL_CANCEL, &sev->se_flags);
12117+ }
12118+ }
12119+
12120+ if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
12121+ set_bit(SEFL_CHECK, &sev->se_flags);
12122+ wake_serviced(DO_JOINLEAVE);
12123+ }
12124+ }
12125+
12126+ else if (test_bit(SGFL_UEVENT, &sg->flags)) {
12127+ if (sg->uevent.ue_nodeid != nodeid)
12128+ reply.ms_status = STATUS_WAIT;
12129+ }
12130+
12131+ }
12132+
12133+ smsg_bswap_out(&reply);
12134+ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12135+}
12136+
12137+/*
12138+ * Each remaining node will send us a done message. We quit when we get the
12139+ * first. The subsequent done messages for the finished sevent get here and
12140+ * are ignored.
12141+ */
12142+
12143+static void process_lstart_done(sm_msg_t *smsg, uint32_t nodeid)
12144+{
12145+ sm_sevent_t *sev;
12146+
12147+ sev = find_sevent(smsg->ms_sevent_id);
12148+ if (!sev)
12149+ return;
12150+
12151+ if (sev->se_state != SEST_LSTART_WAITREMOTE)
12152+ return;
12153+
12154+ sev->se_state = SEST_LSTART_REMOTEDONE;
12155+ set_bit(SEFL_CHECK, &sev->se_flags);
12156+ wake_serviced(DO_JOINLEAVE);
12157+}
12158+
12159+/*
12160+ * This function and everything it calls always runs in sm context.
12161+ */
12162+
12163+static void process_message(char *msg, uint32_t nodeid)
12164+{
12165+ sm_msg_t smsg;
12166+
12167+ smsg_copy_in(msg, &smsg);
12168+
12169+ switch (smsg.ms_type) {
12170+ case SMSG_JOIN_REQ:
12171+ process_join_request(&smsg, nodeid, msg + sizeof(sm_msg_t));
12172+ break;
12173+
12174+ case SMSG_JSTOP_REQ:
12175+ process_stop_request(&smsg, nodeid,
12176+ (uint32_t *) (msg + sizeof(sm_msg_t)));
12177+ break;
12178+
12179+ case SMSG_LEAVE_REQ:
12180+ process_leave_request(&smsg, nodeid);
12181+ break;
12182+
12183+ case SMSG_LSTOP_REQ:
12184+ process_stop_request(&smsg, nodeid, NULL);
12185+ break;
12186+
12187+ case SMSG_JSTART_CMD:
12188+ case SMSG_LSTART_CMD:
12189+ process_start_request(&smsg, nodeid);
12190+ break;
12191+
12192+ case SMSG_LSTART_DONE:
12193+ process_lstart_done(&smsg, nodeid);
12194+ break;
12195+
12196+ case SMSG_JOIN_REP:
12197+ case SMSG_JSTOP_REP:
12198+ case SMSG_LEAVE_REP:
12199+ case SMSG_LSTOP_REP:
12200+ process_reply(&smsg, nodeid);
12201+ break;
12202+
12203+ case SMSG_RECOVER:
12204+ process_recover_msg(&smsg, nodeid);
12205+ break;
12206+
12207+ default:
12208+ log_print("process_message: unknown type %u nodeid %u",
12209+ smsg.ms_type, nodeid);
12210+ }
12211+}
12212+
12213+/*
12214+ * Always called from sm context.
12215+ */
12216+
12217+void process_messages(void)
12218+{
12219+ rq_entry_t *re;
12220+
12221+ while (1) {
12222+ re = NULL;
12223+
12224+ spin_lock(&message_lock);
12225+ if (!list_empty(&messages)) {
12226+ re = list_entry(messages.next, rq_entry_t, list);
12227+ list_del(&re->list);
12228+ }
12229+ spin_unlock(&message_lock);
12230+
12231+ if (!re)
12232+ break;
12233+ process_message(re->msg, re->nodeid);
12234+ kfree(re->msg);
12235+ kfree(re);
12236+ schedule();
12237+ }
12238+}
12239+
12240+/*
12241+ * Context: cnxman and sm
12242+ */
12243+
12244+static int add_to_recvqueue(char *msg, int len, uint32_t nodeid)
12245+{
12246+ rq_entry_t *re;
12247+
12248+ SM_RETRY(re = (rq_entry_t *) kmalloc(sizeof(rq_entry_t), GFP_KERNEL),
12249+ re);
12250+ SM_RETRY(re->msg = (char *) kmalloc(len, GFP_KERNEL), re->msg);
12251+
12252+ memcpy(re->msg, msg, len);
12253+ re->len = len;
12254+ re->nodeid = nodeid;
12255+
12256+ spin_lock(&message_lock);
12257+ list_add_tail(&re->list, &messages);
12258+ spin_unlock(&message_lock);
12259+
12260+ wake_serviced(DO_MESSAGES);
12261+ return 0;
12262+}
12263+
12264+/*
12265+ * Context: cnxman
12266+ * Called by cnxman when a service manager message arrives.
12267+ */
12268+
12269+int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12270+ unsigned int node_id)
12271+{
12272+ struct kcl_cluster_node kclnode;
12273+ uint32_t nodeid = 0;
12274+ int error = 0;
12275+
12276+ if (!node_id) {
12277+ error = kcl_get_node_by_addr(addr, addr_len, &kclnode);
12278+ if (error)
12279+ return error;
12280+ nodeid = kclnode.node_id;
12281+ } else
12282+ nodeid = node_id;
12283+
12284+ return add_to_recvqueue(msg, len, nodeid);
12285+}
12286+
12287+/*
12288+ * These send routines are used by sm and are always called from sm context.
12289+ */
12290+
12291+int send_nodeid_message(char *msg, int len, uint32_t nodeid)
12292+{
12293+ int error = 0;
12294+ struct sockaddr_cl saddr;
12295+
12296+ if (nodeid == sm_our_nodeid) {
12297+ add_to_recvqueue(msg, len, nodeid);
12298+ goto out;
12299+ }
12300+
12301+ saddr.scl_family = AF_CLUSTER;
12302+ saddr.scl_port = CLUSTER_PORT_SERVICES;
12303+ saddr.scl_nodeid = nodeid;
12304+ error = kcl_sendmsg(sm_socket, msg, len, &saddr,
12305+ sizeof(saddr), 0);
12306+ if (error > 0)
12307+ error = 0;
12308+
12309+ if (error)
12310+ log_print("send_nodeid_message error %d to %u", error, nodeid);
12311+ out:
12312+ return error;
12313+}
12314+
12315+int send_broadcast_message(char *msg, int len)
12316+{
12317+ int error;
12318+
12319+ error = kcl_sendmsg(sm_socket, msg, len, NULL, 0, 0);
12320+ if (error > 0)
12321+ error = 0;
12322+
12323+ add_to_recvqueue(msg, len, sm_our_nodeid);
12324+
12325+ if (error)
12326+ log_print("send_broadcast_message error %d", error);
12327+
12328+ return error;
12329+}
12330+
12331+int send_members_message(sm_group_t *sg, char *msg, int len)
12332+{
12333+ sm_node_t *node;
12334+ int error = 0;
12335+
12336+ list_for_each_entry(node, &sg->memb, list) {
12337+ error = send_nodeid_message(msg, len, node->id);
12338+ if (error < 0)
12339+ break;
12340+ }
12341+ return error;
12342+}
12343+
12344+int send_members_message_sev(sm_group_t *sg, char *msg, int len,
12345+ sm_sevent_t * sev)
12346+{
12347+ int error;
12348+ sm_msg_t *smsg = (sm_msg_t *) msg;
12349+
12350+ set_allowed_msgtype(sev, smsg->ms_type);
12351+ sev->se_reply_count = 0;
12352+
12353+ error = send_members_message(sg, msg, len);
12354+ if (error < 0)
12355+ clear_allowed_msgtype(sev, smsg->ms_type);
12356+
12357+ return error;
12358+}
12359+
12360+int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev)
12361+{
12362+ int error;
12363+ sm_msg_t *smsg = (sm_msg_t *) msg;
12364+
12365+ set_allowed_msgtype(sev, smsg->ms_type);
12366+ sev->se_reply_count = 0;
12367+
12368+ error = send_broadcast_message(msg, len);
12369+ if (error < 0)
12370+ clear_allowed_msgtype(sev, smsg->ms_type);
12371+
12372+ return error;
12373+}
12374diff -urN linux-orig/cluster/cman/sm_message.h linux-patched/cluster/cman/sm_message.h
12375--- linux-orig/cluster/cman/sm_message.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12376+++ linux-patched/cluster/cman/sm_message.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 12377@@ -0,0 +1,34 @@
12378+/******************************************************************************
12379+*******************************************************************************
12380+**
12381+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12382+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12383+**
12384+** This copyrighted material is made available to anyone wishing to use,
12385+** modify, copy, or redistribute it subject to the terms and conditions
12386+** of the GNU General Public License v.2.
12387+**
12388+*******************************************************************************
12389+******************************************************************************/
12390+
12391+#ifndef __SM_MESSAGE_DOT_H__
12392+#define __SM_MESSAGE_DOT_H__
12393+
12394+void init_messages(void);
12395+uint32_t sm_new_global_id(int level);
12396+void smsg_bswap_out(sm_msg_t * smsg);
12397+char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
12398+ sm_sevent_t *sev);
12399+void process_messages(void);
12400+int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12401+ unsigned int node_id);
12402+int send_nodeid_message(char *msg, int len, uint32_t nodeid);
12403+int send_broadcast_message(char *msg, int len);
12404+int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev);
12405+int send_members_message(sm_group_t *sg, char *msg, int len);
12406+int send_members_message_sev(sm_group_t *sg, char *msg, int len,
12407+ sm_sevent_t * sev);
12408+int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12409+ unsigned int node_id);
12410+
12411+#endif
12412diff -urN linux-orig/cluster/cman/sm_misc.c linux-patched/cluster/cman/sm_misc.c
12413--- linux-orig/cluster/cman/sm_misc.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12414+++ linux-patched/cluster/cman/sm_misc.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 12415@@ -0,0 +1,369 @@
12416+/******************************************************************************
12417+*******************************************************************************
12418+**
12419+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12420+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12421+**
12422+** This copyrighted material is made available to anyone wishing to use,
12423+** modify, copy, or redistribute it subject to the terms and conditions
12424+** of the GNU General Public License v.2.
12425+**
12426+*******************************************************************************
12427+******************************************************************************/
12428+
12429+#include "sm.h"
12430+#include "config.h"
12431+
12432+#define MAX_DEBUG_MSG_LEN (40)
12433+
12434+extern struct list_head sm_members;
12435+static uint32_t local_ids;
12436+static uint32_t event_id;
12437+static spinlock_t event_id_lock;
12438+static char * debug_buf;
12439+static unsigned int debug_size;
12440+static unsigned int debug_point;
12441+static int debug_wrap;
12442+static spinlock_t debug_lock;
12443+
12444+
12445+void init_sm_misc(void)
12446+{
12447+ local_ids = 1;
12448+ event_id = 1;
12449+ spin_lock_init(&event_id_lock);
12450+ debug_buf = NULL;
12451+ debug_size = 0;
12452+ debug_point = 0;
12453+ debug_wrap = 0;
12454+ spin_lock_init(&debug_lock);
12455+
12456+ sm_debug_setup(cman_config.sm_debug_size);
12457+}
12458+
12459+sm_node_t *sm_new_node(uint32_t nodeid)
12460+{
12461+ struct kcl_cluster_node kclnode;
12462+ sm_node_t *node;
12463+ int error;
12464+
12465+ error = kcl_get_node_by_nodeid(nodeid, &kclnode);
12466+ SM_ASSERT(!error,);
12467+
12468+ SM_RETRY(node = (sm_node_t *) kmalloc(sizeof(sm_node_t), GFP_KERNEL),
12469+ node);
12470+
12471+ memset(node, 0, sizeof(sm_node_t));
12472+ node->id = nodeid;
12473+ node->incarnation = kclnode.incarnation;
12474+ return node;
12475+}
12476+
12477+sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid)
12478+{
12479+ sm_node_t *node;
12480+
12481+ list_for_each_entry(node, &sg->joining, list) {
12482+ if (node->id == nodeid)
12483+ return node;
12484+ }
12485+ return NULL;
12486+}
12487+
12488+sm_node_t *sm_find_member(uint32_t nodeid)
12489+{
12490+ sm_node_t *node;
12491+
12492+ list_for_each_entry(node, &sm_members, list) {
12493+ if (node->id == nodeid)
12494+ return node;
12495+ }
12496+ return NULL;
12497+}
12498+
12499+uint32_t sm_new_local_id(int level)
12500+{
12501+ uint32_t id = local_ids++;
12502+ uint8_t l = (uint8_t) level;
12503+
12504+ if (level > 0xFF)
12505+ return 0;
12506+
12507+ if (id > 0x00FFFFFF)
12508+ return 0;
12509+
12510+ id |= (l << 24);
12511+ return id;
12512+}
12513+
12514+int sm_id_to_level(uint32_t id)
12515+{
12516+ uint8_t l = (id & 0xFF000000) >> 24;
12517+
12518+ return (int) l;
12519+}
12520+
12521+void sm_set_event_id(int *id)
12522+{
12523+ spin_lock(&event_id_lock);
12524+ *id = event_id++;
12525+ spin_unlock(&event_id_lock);
12526+}
12527+
12528+sm_group_t *sm_local_id_to_sg(int id)
12529+{
12530+ sm_group_t *sg;
12531+ int level = sm_id_to_level(id);
12532+ int found = FALSE;
12533+
12534+ down(&sm_sglock);
12535+
12536+ list_for_each_entry(sg, &sm_sg[level], list) {
12537+ if (sg->local_id == id) {
12538+ found = TRUE;
12539+ break;
12540+ }
12541+ }
12542+ up(&sm_sglock);
12543+ if (!found)
12544+ sg = NULL;
12545+ return sg;
12546+}
12547+
12548+sm_group_t *sm_global_id_to_sg(int id)
12549+{
12550+ sm_group_t *sg;
12551+ int level = sm_id_to_level(id);
12552+ int found = FALSE;
12553+
12554+ down(&sm_sglock);
12555+
12556+ list_for_each_entry(sg, &sm_sg[level], list) {
12557+ if (sg->global_id == id) {
12558+ found = TRUE;
12559+ break;
12560+ }
12561+ }
12562+ up(&sm_sglock);
12563+ if (!found)
12564+ sg = NULL;
12565+ return sg;
12566+}
12567+
12568+void sm_debug_log(sm_group_t *sg, const char *fmt, ...)
12569+{
12570+ va_list va;
12571+ int i, n, size, len;
12572+ char buf[MAX_DEBUG_MSG_LEN+1];
12573+
12574+ spin_lock(&debug_lock);
12575+
12576+ if (!debug_buf)
12577+ goto out;
12578+
12579+ size = MAX_DEBUG_MSG_LEN;
12580+ memset(buf, 0, size+1);
12581+
12582+ n = snprintf(buf, size, "%08x ", sg->global_id);
12583+ size -= n;
12584+
12585+ va_start(va, fmt);
12586+ vsnprintf(buf+n, size, fmt, va);
12587+ va_end(va);
12588+
12589+ len = strlen(buf);
12590+ if (len > MAX_DEBUG_MSG_LEN-1)
12591+ len = MAX_DEBUG_MSG_LEN-1;
12592+ buf[len] = '\n';
12593+ buf[len+1] = '\0';
12594+
12595+ for (i = 0; i < strlen(buf); i++) {
12596+ debug_buf[debug_point++] = buf[i];
12597+
12598+ if (debug_point == debug_size) {
12599+ debug_point = 0;
12600+ debug_wrap = 1;
12601+ }
12602+ }
12603+ out:
12604+ spin_unlock(&debug_lock);
12605+}
12606+
12607+void sm_debug_setup(int size)
12608+{
12609+ char *b = kmalloc(size, GFP_KERNEL);
12610+
12611+ spin_lock(&debug_lock);
12612+ if (debug_buf)
12613+ kfree(debug_buf);
12614+
12615+ if (size > PAGE_SIZE)
12616+ size = PAGE_SIZE;
12617+ debug_size = size;
12618+ debug_point = 0;
12619+ debug_wrap = 0;
12620+ debug_buf = b;
12621+ memset(debug_buf, 0, debug_size);
12622+ spin_unlock(&debug_lock);
12623+}
12624+
12625+#ifdef CONFIG_PROC_FS
12626+
12627+int sm_debug_info(char *b, char **start, off_t offset, int length)
12628+{
12629+ int i, n = 0;
12630+
12631+ spin_lock(&debug_lock);
12632+
12633+ if (debug_wrap) {
12634+ for (i = debug_point; i < debug_size; i++)
12635+ n += sprintf(b + n, "%c", debug_buf[i]);
12636+ }
12637+ for (i = 0; i < debug_point; i++)
12638+ n += sprintf(b + n, "%c", debug_buf[i]);
12639+
12640+ spin_unlock(&debug_lock);
12641+
12642+ return n;
12643+}
12644+
12645+int sm_procdata(char *b, char **start, off_t offset, int length)
12646+{
12647+ sm_group_t *sg;
12648+ sm_node_t *node;
12649+ int n = 0, level, i;
12650+
12651+ n += sprintf(b + n, "\n");
12652+
12653+ /*
12654+ * Header
12655+ */
12656+
12657+ n += sprintf(b + n,
12658+ "Service Name GID LID State Code\n");
12659+
12660+ down(&sm_sglock);
12661+
12662+ for (level = 0; level < SG_LEVELS; level++) {
12663+ list_for_each_entry(sg, &sm_sg[level], list) {
12664+
12665+ /*
12666+ * Cluster Service
12667+ */
12668+
12669+ switch (level) {
12670+ case SERVICE_LEVEL_FENCE:
12671+ n += sprintf(b + n, "Fence Domain: ");
12672+ break;
12673+ case SERVICE_LEVEL_GDLM:
12674+ n += sprintf(b + n, "DLM Lock Space: ");
12675+ break;
12676+ case SERVICE_LEVEL_GFS:
12677+ n += sprintf(b + n, "GFS Mount Group: ");
12678+ break;
12679+ case SERVICE_LEVEL_USER:
12680+ n += sprintf(b + n, "User: ");
12681+ break;
12682+ }
12683+
12684+ /*
12685+ * Name
12686+ */
12687+
12688+ n += sprintf(b + n, "\"");
12689+ for (i = 0; i < sg->namelen; i++)
12690+ n += sprintf(b + n, "%c", sg->name[i]);
12691+ n += sprintf(b + n, "\"");
12692+
12693+ for (; i < MAX_SERVICE_NAME_LEN-1; i++)
12694+ n += sprintf(b + n, " ");
12695+
12696+ /*
12697+ * GID LID (sans level from top byte)
12698+ */
12699+
12700+ n += sprintf(b + n, "%3u %3u ",
12701+ (sg->global_id & 0x00FFFFFF),
12702+ (sg->local_id & 0x00FFFFFF));
12703+
12704+ /*
12705+ * State
12706+ */
12707+
12708+ switch (sg->state) {
12709+ case SGST_NONE:
12710+ n += sprintf(b + n, "none ");
12711+ break;
12712+ case SGST_JOIN:
12713+ n += sprintf(b + n, "join ");
12714+ break;
12715+ case SGST_RUN:
12716+ n += sprintf(b + n, "run ");
12717+ break;
12718+ case SGST_RECOVER:
12719+ n += sprintf(b + n, "recover %u ",
12720+ sg->recover_state);
12721+ break;
12722+ case SGST_UEVENT:
12723+ n += sprintf(b + n, "update ");
12724+ break;
12725+ }
12726+
12727+ /*
12728+ * Code
12729+ */
12730+
12731+ if (test_bit(SGFL_SEVENT, &sg->flags))
12732+ n += sprintf(b + n, "S");
12733+ if (test_bit(SGFL_UEVENT, &sg->flags))
12734+ n += sprintf(b + n, "U");
12735+ if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
12736+ n += sprintf(b + n, "N");
12737+
12738+ n += sprintf(b + n, "-");
12739+
12740+ if (test_bit(SGFL_SEVENT, &sg->flags)
12741+ && sg->sevent) {
12742+ n += sprintf(b + n, "%u,%lx,%u",
12743+ sg->sevent->se_state,
12744+ sg->sevent->se_flags,
12745+ sg->sevent->se_reply_count);
12746+ }
12747+
12748+ if (test_bit(SGFL_UEVENT, &sg->flags)) {
12749+ n += sprintf(b + n, "%u,%lx,%u",
12750+ sg->uevent.ue_state,
12751+ sg->uevent.ue_flags,
12752+ sg->uevent.ue_nodeid);
12753+ }
12754+
12755+ n += sprintf(b + n, "\n");
12756+
12757+ /*
12758+ * node list
12759+ */
12760+
12761+ i = 0;
12762+
12763+ n += sprintf(b + n, "[");
12764+
12765+ list_for_each_entry(node, &sg->memb, list) {
12766+ if (i && !(i % 24))
12767+ n += sprintf(b + n, "\n");
12768+
12769+ if (i)
12770+ n += sprintf(b + n, " ");
12771+
12772+ n += sprintf(b + n, "%u", node->id);
12773+ i++;
12774+ }
12775+
12776+ n += sprintf(b + n, "]\n\n");
12777+ }
12778+ }
12779+
12780+ up(&sm_sglock);
12781+
12782+ return n;
12783+}
12784+#endif
12785diff -urN linux-orig/cluster/cman/sm_misc.h linux-patched/cluster/cman/sm_misc.h
12786--- linux-orig/cluster/cman/sm_misc.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12787+++ linux-patched/cluster/cman/sm_misc.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 12788@@ -0,0 +1,29 @@
12789+/******************************************************************************
12790+*******************************************************************************
12791+**
12792+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12793+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12794+**
12795+** This copyrighted material is made available to anyone wishing to use,
12796+** modify, copy, or redistribute it subject to the terms and conditions
12797+** of the GNU General Public License v.2.
12798+**
12799+*******************************************************************************
12800+******************************************************************************/
12801+
12802+#ifndef __SM_MISC_DOT_H__
12803+#define __SM_MISC_DOT_H__
12804+
12805+void init_sm_misc(void);
12806+sm_node_t *sm_new_node(uint32_t nodeid);
12807+sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid);
12808+sm_node_t *sm_find_member(uint32_t nodeid);
12809+uint32_t sm_new_local_id(int level);
12810+int sm_id_to_level(uint32_t id);
12811+void sm_set_event_id(int *id);
12812+sm_group_t *sm_local_id_to_sg(int id);
12813+sm_group_t *sm_global_id_to_sg(int id);
12814+void sm_debug_log(sm_group_t *sg, const char *fmt, ...);
12815+void sm_debug_setup(int size);
12816+
12817+#endif
12818diff -urN linux-orig/cluster/cman/sm_recover.c linux-patched/cluster/cman/sm_recover.c
12819--- linux-orig/cluster/cman/sm_recover.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12820+++ linux-patched/cluster/cman/sm_recover.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 12821@@ -0,0 +1,522 @@
12822+/******************************************************************************
12823+*******************************************************************************
12824+**
12825+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12826+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12827+**
12828+** This copyrighted material is made available to anyone wishing to use,
12829+** modify, copy, or redistribute it subject to the terms and conditions
12830+** of the GNU General Public License v.2.
12831+**
12832+*******************************************************************************
12833+******************************************************************************/
12834+
12835+#include "sm.h"
12836+#include "config.h"
12837+
12838+/*
12839+ * A collection of sg's which need to be recovered due to a failed member.
12840+ * These sg's are recovered in order of level. An sg subject to cascading
12841+ * failures is moved from one of these structs to a newer one.
12842+ */
12843+
12844+struct recover {
12845+ struct list_head list; /* list of current re's */
12846+ struct list_head sgs[SG_LEVELS]; /* lists of sg's by level */
12847+ int event_id; /* event id */
12848+ int cur_level;
12849+};
12850+typedef struct recover recover_t;
12851+
12852+
12853+extern uint32_t * sm_new_nodeids;
12854+extern int sm_quorum, sm_quorum_next;
12855+extern uint32_t sm_our_nodeid;
12856+extern struct list_head sm_members;
12857+extern int sm_member_count;
12858+static struct list_head recoveries;
12859+
12860+
12861+void init_recovery(void)
12862+{
12863+ INIT_LIST_HEAD(&recoveries);
12864+}
12865+
12866+/*
12867+ * This is the first thing called when a change is announced in cluster
12868+ * membership. Nodes are marked as being a CLUSTER_MEMBER or not. SM adds new
12869+ * nodes to its sm_members list which it's not seen before. Nodes which were
12870+ * alive but are now gone are marked as "need recovery".
12871+ *
12872+ * The "need recovery" status of nodes is propagated to the node's SG's in
12873+ * mark_effected_sgs. The effected SG's are themselves marked as needing
12874+ * recovery and in new_recovery the dead nodes are removed from the SG's
12875+ * individual member lists. The "need recovery" status of nodes is cleared in
12876+ * adjust_members_done().
12877+ */
12878+
12879+static int adjust_members(void)
12880+{
12881+ sm_node_t *node;
12882+ struct kcl_cluster_node knode;
12883+ int i, error, num_nodes, sub = 0, add = 0, found;
12884+
12885+ /*
12886+ * Get list of current members from cnxman
12887+ */
12888+
12889+ memset(sm_new_nodeids, 0, cman_config.max_nodes * sizeof(uint32_t));
12890+ num_nodes = kcl_get_member_ids(sm_new_nodeids, cman_config.max_nodes);
12891+
12892+ /*
12893+ * Determine who's gone
12894+ */
12895+
12896+ list_for_each_entry(node, &sm_members, list) {
12897+ found = FALSE;
12898+ for (i = 0; i < num_nodes; i++) {
12899+ if (node->id == sm_new_nodeids[i]) {
12900+ found = TRUE;
12901+ sm_new_nodeids[i] = 0;
12902+ break;
12903+ }
12904+ }
12905+
12906+ if (found) {
12907+ error = kcl_get_node_by_nodeid(node->id, &knode);
12908+ SM_ASSERT(!error, printk("error=%d\n", error););
12909+
12910+ if (!test_bit(SNFL_CLUSTER_MEMBER, &node->flags)) {
12911+ /* former member is back */
12912+ set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
12913+ node->incarnation = knode.incarnation;
12914+ add++;
12915+ } else {
12916+ /* current member is still alive - if the
12917+ * incarnation number is different it died and
12918+ * returned between checks */
12919+ if (node->incarnation != knode.incarnation) {
12920+ set_bit(SNFL_NEED_RECOVERY,
12921+ &node->flags);
12922+ node->incarnation = knode.incarnation;
12923+ sub++;
12924+ }
12925+ }
12926+ } else {
12927+ /* current member has died */
12928+ if (test_and_clear_bit(SNFL_CLUSTER_MEMBER,
12929+ &node->flags)) {
12930+ set_bit(SNFL_NEED_RECOVERY, &node->flags);
12931+ sub++;
12932+ }
12933+ }
12934+ }
12935+
12936+ /*
12937+ * Look for new nodes
12938+ */
12939+
12940+ for (i = 0; i < num_nodes; i++) {
12941+ if (sm_new_nodeids[i]) {
12942+ node = sm_new_node(sm_new_nodeids[i]);
12943+ set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
12944+ add++;
12945+ list_add_tail(&node->list, &sm_members);
12946+ sm_member_count++;
12947+ }
12948+ }
12949+
12950+ /*
12951+ * Get our own nodeid
12952+ */
12953+
12954+ if (!sm_our_nodeid) {
12955+ list_for_each_entry(node, &sm_members, list) {
12956+ error = kcl_get_node_by_nodeid(node->id, &knode);
12957+ SM_ASSERT(!error, printk("error=%d\n", error););
12958+
12959+ if (knode.us) {
12960+ sm_our_nodeid = knode.node_id;
12961+ break;
12962+ }
12963+ }
12964+ }
12965+
12966+ return sub;
12967+}
12968+
12969+/*
12970+ * Given some number of dead nodes, flag SG's the dead nodes were part of.
12971+ * This requires a number of loops because each node structure does not keep a
12972+ * list of SG's it's in.
12973+ */
12974+
12975+static int mark_effected_sgs(void)
12976+{
12977+ sm_group_t *sg;
12978+ sm_node_t *node, *sgnode;
12979+ uint32_t dead_id;
12980+ int i, effected = 0;
12981+
12982+ down(&sm_sglock);
12983+
12984+ list_for_each_entry(node, &sm_members, list) {
12985+ if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
12986+ continue;
12987+
12988+ dead_id = node->id;
12989+
12990+ for (i = 0; i < SG_LEVELS; i++) {
12991+ list_for_each_entry(sg, &sm_sg[i], list) {
12992+ /* check if dead node is among sg's members */
12993+ list_for_each_entry(sgnode, &sg->memb, list) {
12994+ if (sgnode->id == dead_id) {
12995+ set_bit(SGFL_NEED_RECOVERY,
12996+ &sg->flags);
12997+ effected++;
12998+ break;
12999+ }
13000+ }
13001+ }
13002+ }
13003+ }
13004+ up(&sm_sglock);
13005+
13006+ return effected;
13007+}
13008+
13009+static recover_t *alloc_recover(void)
13010+{
13011+ recover_t *rev;
13012+ int i;
13013+
13014+ SM_RETRY(rev = kmalloc(sizeof(recover_t), GFP_KERNEL), rev);
13015+
13016+ memset(rev, 0, sizeof(recover_t));
13017+
13018+ sm_set_event_id(&rev->event_id);
13019+
13020+ for (i = 0; i < SG_LEVELS; i++) {
13021+ INIT_LIST_HEAD(&rev->sgs[i]);
13022+ }
13023+
13024+ return rev;
13025+}
13026+
13027+/*
13028+ * An in-progress revent re-start for an SG is interrupted by another node
13029+ * failure in the SG. Cancel an outstanding barrier if there is one. The SG
13030+ * will be moved to the new revent and re-started as part of that.
13031+ */
13032+
13033+static void cancel_prev_recovery(sm_group_t *sg)
13034+{
13035+ int error;
13036+
13037+ if (sg->recover_state == RECOVER_BARRIERWAIT) {
13038+ error = kcl_barrier_cancel(sg->recover_barrier);
13039+ if (error)
13040+ log_error(sg, "cancel_prev_recovery: error %d", error);
13041+ }
13042+}
13043+
13044+static void pre_recover_sg(sm_group_t *sg, recover_t *rev)
13045+{
13046+ if (sg->state == SGST_RECOVER) {
13047+ cancel_prev_recovery(sg);
13048+ list_del(&sg->recover_list);
13049+ }
13050+
13051+ sg->ops->stop(sg->service_data);
13052+ sg->state = SGST_RECOVER;
13053+ sg->recover_state = RECOVER_NONE;
13054+ sg->recover_data = rev;
13055+ list_add(&sg->recover_list, &rev->sgs[sg->level]);
13056+}
13057+
13058+/*
13059+ * When adjust_members finds that some nodes are dead and mark_effected_sgs
13060+ * finds that some SG's are effected by departed nodes, this is called to
13061+ * collect together the SG's which need to be recovered. An revent (recovery
13062+ * event) is the group of effected SG's.
13063+ */
13064+
13065+static int new_recovery(void)
13066+{
13067+ sm_group_t *sg;
13068+ recover_t *rev;
13069+ sm_node_t *node, *sgnode, *safe;
13070+ int i;
13071+
13072+ rev = alloc_recover();
13073+ list_add_tail(&rev->list, &recoveries);
13074+
13075+ down(&sm_sglock);
13076+
13077+ /*
13078+ * Stop effected SG's and add them to the rev
13079+ */
13080+
13081+ for (i = 0; i < SG_LEVELS; i++) {
13082+ list_for_each_entry(sg, &sm_sg[i], list) {
13083+ if (test_and_clear_bit(SGFL_NEED_RECOVERY, &sg->flags)){
13084+ if (sg->state == SGST_JOIN)
13085+ continue;
13086+ pre_recover_sg(sg, rev);
13087+ }
13088+ }
13089+ }
13090+
13091+ /*
13092+ * For an SG needing recovery, remove dead nodes from sg->memb list
13093+ */
13094+
13095+ for (i = 0; i < SG_LEVELS; i++) {
13096+ list_for_each_entry(sg, &rev->sgs[i], recover_list) {
13097+
13098+ /* Remove dead members from SG's member list */
13099+ list_for_each_entry_safe(sgnode, safe, &sg->memb, list){
13100+
13101+ node = sm_find_member(sgnode->id);
13102+ SM_ASSERT(node, printk("id %u\n", sgnode->id););
13103+
13104+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags)){
13105+ list_del(&sgnode->list);
13106+ kfree(sgnode);
13107+ sg->memb_count--;
13108+ log_debug(sg, "remove node %u count %d",
13109+ sgnode->id, sg->memb_count);
13110+ }
13111+ }
13112+ }
13113+ }
13114+
13115+ up(&sm_sglock);
13116+ rev->cur_level = 0;
13117+ return 0;
13118+}
13119+
13120+/*
13121+ * The NEED_RECOVERY bit on MML nodes is set in adjust_members() and is used in
13122+ * mark_effected_sgs() and add_revent(). After that, we're done using the bit
13123+ * and we clear it here.
13124+ */
13125+
13126+static void adjust_members_done(void)
13127+{
13128+ sm_node_t *node;
13129+
13130+ list_for_each_entry(node, &sm_members, list)
13131+ clear_bit(SNFL_NEED_RECOVERY, &node->flags);
13132+}
13133+
13134+/*
13135+ * Start the service of the given SG. The service must be given an array of
13136+ * nodeids specifying the new sg membership. The service is responsible to
13137+ * free this chunk of memory when done with it.
13138+ */
13139+
13140+static void start_sg(sm_group_t *sg, uint32_t event_id)
13141+{
13142+ sm_node_t *node;
13143+ uint32_t *memb;
13144+ int count = 0;
13145+
13146+ SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
13147+ memb);
13148+
13149+ list_for_each_entry(node, &sg->memb, list)
13150+ memb[count++] = node->id;
13151+
13152+ sg->ops->start(sg->service_data, memb, count, event_id,
13153+ SERVICE_NODE_FAILED);
13154+}
13155+
13156+static void recovery_barrier(sm_group_t *sg)
13157+{
13158+ char bname[MAX_BARRIER_NAME_LEN];
13159+ int error, len;
13160+
13161+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
13162+
13163+ /* bypass the barrier if we're the only member */
13164+ if (sg->memb_count == 1) {
13165+ process_recovery_barrier(sg, 0);
13166+ return;
13167+ }
13168+
13169+ len = snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.RECOV.%u",
13170+ sg->global_id, sg->recover_stop, sg->memb_count);
13171+
13172+ /* We save this barrier name so we can cancel it if needed. */
13173+ memset(sg->recover_barrier, 0, MAX_BARRIER_NAME_LEN);
13174+ memcpy(sg->recover_barrier, bname, len);
13175+
13176+ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_RECOVERY);
13177+ if (error)
13178+ log_error(sg, "recovery_barrier error %d: %s", error, bname);
13179+}
13180+
13181+static void recover_sg(sm_group_t *sg, int event_id)
13182+{
13183+ log_debug(sg, "recover state %d", sg->recover_state);
13184+
13185+ switch (sg->recover_state) {
13186+
13187+ case RECOVER_NONE:
13188+ /* must wait for recovery to stop sg on all nodes */
13189+ sg->recover_state = RECOVER_BARRIERWAIT;
13190+ sg->recover_stop = 0;
13191+ recovery_barrier(sg);
13192+ break;
13193+
13194+ case RECOVER_BARRIERWAIT:
13195+ break;
13196+
13197+ case RECOVER_STOP:
13198+ /* barrier callback sets state STOP */
13199+ sg->recover_stop = 1;
13200+ sg->recover_state = RECOVER_START;
13201+ start_sg(sg, event_id);
13202+ break;
13203+
13204+ case RECOVER_START:
13205+ break;
13206+
13207+ case RECOVER_STARTDONE:
13208+ /* service callback sets state STARTDONE */
13209+ sg->recover_state = RECOVER_BARRIERWAIT;
13210+ recovery_barrier(sg);
13211+ break;
13212+
13213+ case RECOVER_BARRIERDONE:
13214+ /* barrier callback sets state BARRIERDONE */
13215+ sg->ops->finish(sg->service_data, event_id);
13216+ list_del(&sg->recover_list);
13217+ sg->recover_state = RECOVER_NONE;
13218+ sg->state = SGST_RUN;
13219+
13220+ /* Continue a previous, interrupted attempt to leave the sg */
13221+ if (sg->sevent) {
13222+ clear_bit(SEFL_DELAY, &sg->sevent->se_flags);
13223+ set_bit(SEFL_CHECK, &sg->sevent->se_flags);
13224+ wake_serviced(DO_JOINLEAVE);
13225+ }
13226+ break;
13227+
13228+ default:
13229+ log_error(sg, "invalid recover_state %u", sg->recover_state);
13230+ }
13231+}
13232+
13233+static void recover_level(recover_t *rev, int level)
13234+{
13235+ sm_group_t *sg, *safe;
13236+
13237+ list_for_each_entry_safe(sg, safe, &rev->sgs[level], recover_list)
13238+ recover_sg(sg, rev->event_id);
13239+}
13240+
13241+static void recover_levels(recover_t *rev)
13242+{
13243+ for (;;) {
13244+ recover_level(rev, rev->cur_level);
13245+
13246+ if (list_empty(&rev->sgs[rev->cur_level])) {
13247+ if (rev->cur_level == SG_LEVELS - 1) {
13248+ list_del(&rev->list);
13249+ kfree(rev);
13250+ return;
13251+ }
13252+ rev->cur_level++;
13253+ continue;
13254+ }
13255+ break;
13256+ }
13257+}
13258+
13259+/*
13260+ * Called by SM thread when the cluster is quorate. It restarts
13261+ * SG's that were stopped in new_recovery() due to a member death.
13262+ * It waits for all SG's at level N to complete restart before
13263+ * restarting SG's at level N+1.
13264+ */
13265+
13266+void process_recoveries(void)
13267+{
13268+ recover_t *rev, *safe;
13269+
13270+ down(&sm_sglock);
13271+ list_for_each_entry_safe(rev, safe, &recoveries, list)
13272+ recover_levels(rev);
13273+ up(&sm_sglock);
13274+}
13275+
13276+/*
13277+ * The cnxman membership has changed. Check if there's still quorum and
13278+ * whether any nodes have died. If nodes have died, initiate recovery on any
13279+ * SG's they were in. This begins immediately if the cluster remains quorate;
13280+ * if not this waits until the cluster regains quorum.
13281+ */
13282+
13283+void process_nodechange(void)
13284+{
13285+ int gone, effected;
13286+
13287+ if ((sm_quorum = sm_quorum_next))
13288+ wake_serviced(DO_RUN);
13289+
13290+ gone = adjust_members();
13291+ if (gone > 0) {
13292+ effected = mark_effected_sgs();
13293+
13294+ backout_sevents();
13295+ cancel_uevents(&effected);
13296+
13297+ if (effected > 0) {
13298+ new_recovery();
13299+ wake_serviced(DO_RECOVERIES);
13300+ }
13301+ }
13302+ adjust_members_done();
13303+}
13304+
13305+int check_recovery(sm_group_t *sg, int event_id)
13306+{
13307+ if (sg->state == SGST_RECOVER) {
13308+ recover_t *rev = (recover_t *) sg->recover_data;
13309+ if (rev && rev->event_id == event_id)
13310+ return 1;
13311+ }
13312+ return 0;
13313+}
13314+
13315+void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid)
13316+{
13317+ sm_group_t *sg;
13318+ recover_t *rev;
13319+
13320+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
13321+ if (!sg) {
13322+ log_print("process_recover_msg: unknown sg id %x",
13323+ smsg->ms_global_sgid);
13324+ return;
13325+ }
13326+
13327+ /* we already know about the recovery and can ignore the msg */
13328+ if (sg->state == SGST_RECOVER)
13329+ return;
13330+
13331+ if (test_bit(SGFL_UEVENT, &sg->flags)) {
13332+ /* we will initiate recovery on our own if we know about the
13333+ uevent so we can ignore this */
13334+ log_debug(sg, "process_recover_msg: ignore from %u", nodeid);
13335+ return;
13336+ }
13337+
13338+ log_debug(sg, "recovery initiated by msg from %u", nodeid);
13339+ rev = alloc_recover();
13340+ list_add_tail(&rev->list, &recoveries);
13341+ pre_recover_sg(sg, rev);
13342+ wake_serviced(DO_RECOVERIES);
13343+}
13344diff -urN linux-orig/cluster/cman/sm_recover.h linux-patched/cluster/cman/sm_recover.h
13345--- linux-orig/cluster/cman/sm_recover.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13346+++ linux-patched/cluster/cman/sm_recover.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 13347@@ -0,0 +1,23 @@
13348+/******************************************************************************
13349+*******************************************************************************
13350+**
13351+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13352+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13353+**
13354+** This copyrighted material is made available to anyone wishing to use,
13355+** modify, copy, or redistribute it subject to the terms and conditions
13356+** of the GNU General Public License v.2.
13357+**
13358+*******************************************************************************
13359+******************************************************************************/
13360+
13361+#ifndef __SM_RECOVER_DOT_H__
13362+#define __SM_RECOVER_DOT_H__
13363+
13364+void init_recovery(void);
13365+void process_recoveries(void);
13366+void process_nodechange(void);
13367+int check_recovery(sm_group_t *sg, int event_id);
13368+void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid);
13369+
13370+#endif
13371diff -urN linux-orig/cluster/cman/sm_services.c linux-patched/cluster/cman/sm_services.c
13372--- linux-orig/cluster/cman/sm_services.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13373+++ linux-patched/cluster/cman/sm_services.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 13374@@ -0,0 +1,418 @@
13375+/******************************************************************************
13376+*******************************************************************************
13377+**
13378+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13379+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13380+**
13381+** This copyrighted material is made available to anyone wishing to use,
13382+** modify, copy, or redistribute it subject to the terms and conditions
13383+** of the GNU General Public License v.2.
13384+**
13385+*******************************************************************************
13386+******************************************************************************/
13387+
13388+#include "sm.h"
13389+
13390+static struct list_head callbacks;
13391+static spinlock_t callback_lock;
13392+static struct list_head sg_registered[SG_LEVELS];
13393+
13394+/*
13395+ * These are the functions to register, join, leave, unregister, callback
13396+ * with/to the sm.
13397+ */
13398+
13399+struct sc_entry {
13400+ struct list_head list;
13401+ uint32_t local_id;
13402+ int event_id;
13403+};
13404+typedef struct sc_entry sc_entry_t;
13405+
13406+void init_services(void)
13407+{
13408+ int i;
13409+
13410+ INIT_LIST_HEAD(&callbacks);
13411+ spin_lock_init(&callback_lock);
13412+
13413+ for (i = 0; i < SG_LEVELS; i++) {
13414+ INIT_LIST_HEAD(&sm_sg[i]);
13415+ INIT_LIST_HEAD(&sg_registered[i]);
13416+ }
13417+ init_MUTEX(&sm_sglock);
13418+}
13419+
13420+/* Context: service */
13421+
13422+int kcl_register_service(char *name, int namelen, int level,
13423+ struct kcl_service_ops *ops, int unique,
13424+ void *servicedata, uint32_t *service_id)
13425+{
13426+ sm_group_t *sg;
13427+ int found = FALSE;
13428+ int error = -EINVAL;
13429+
13430+ if (level > SG_LEVELS - 1)
13431+ goto fail;
13432+
13433+ if (namelen > MAX_SERVICE_NAME_LEN)
13434+ goto fail;
13435+
13436+ error = kcl_addref_cluster();
13437+ if (error)
13438+ goto fail;
13439+
13440+ down(&sm_sglock);
13441+
13442+ list_for_each_entry(sg, &sm_sg[level], list) {
13443+ if ((sg->namelen == namelen) &&
13444+ (!strncmp(sg->name, name, namelen))) {
13445+ found = TRUE;
13446+ goto next;
13447+ }
13448+ }
13449+
13450+ list_for_each_entry(sg, &sg_registered[level], list) {
13451+ if ((sg->namelen == namelen) &&
13452+ (!strncmp(sg->name, name, namelen))) {
13453+ found = TRUE;
13454+ goto next;
13455+ }
13456+ }
13457+
13458+ next:
13459+
13460+ if (found && unique) {
13461+ error = -EEXIST;
13462+ goto fail_unlock;
13463+ }
13464+
13465+ if (found) {
13466+ sg->refcount++;
13467+ goto out;
13468+ }
13469+
13470+ sg = (sm_group_t *) kmalloc(sizeof(sm_group_t) + namelen, GFP_KERNEL);
13471+ if (!sg) {
13472+ error = -ENOMEM;
13473+ goto fail_unlock;
13474+ }
13475+ memset(sg, 0, sizeof(sm_group_t) + namelen);
13476+
13477+ sg->refcount = 1;
13478+ sg->service_data = servicedata;
13479+ sg->ops = ops;
13480+ sg->level = level;
13481+ sg->namelen = namelen;
13482+ memcpy(sg->name, name, namelen);
13483+ sg->local_id = sm_new_local_id(level);
13484+ sg->state = SGST_NONE;
13485+ INIT_LIST_HEAD(&sg->memb);
13486+ INIT_LIST_HEAD(&sg->joining);
13487+ init_completion(&sg->event_comp);
13488+
13489+ list_add_tail(&sg->list, &sg_registered[level]);
13490+
13491+ out:
13492+ *service_id = sg->local_id;
13493+ up(&sm_sglock);
13494+ return 0;
13495+
13496+ fail_unlock:
13497+ up(&sm_sglock);
13498+ kcl_releaseref_cluster();
13499+ fail:
13500+ return error;
13501+}
13502+
13503+/* Context: service */
13504+
13505+void kcl_unregister_service(uint32_t local_id)
13506+{
13507+ sm_group_t *sg;
13508+ int level = sm_id_to_level(local_id);
13509+
13510+ down(&sm_sglock);
13511+
13512+ list_for_each_entry(sg, &sg_registered[level], list) {
13513+ if (sg->local_id == local_id) {
13514+ SM_ASSERT(sg->refcount,);
13515+ sg->refcount--;
13516+
13517+ if (!sg->refcount) {
13518+ list_del(&sg->list);
13519+ kfree(sg);
13520+ }
13521+ kcl_releaseref_cluster();
13522+ break;
13523+ }
13524+ }
13525+ up(&sm_sglock);
13526+}
13527+
13528+/* Context: service */
13529+
13530+int kcl_join_service(uint32_t local_id)
13531+{
13532+ sm_group_t *sg;
13533+ sm_sevent_t *sev;
13534+ int level = sm_id_to_level(local_id);
13535+ int error, found = FALSE;
13536+
13537+ down(&sm_sglock);
13538+
13539+ list_for_each_entry(sg, &sg_registered[level], list) {
13540+ if (sg->local_id == local_id) {
13541+ found = TRUE;
13542+ break;
13543+ }
13544+ }
13545+
13546+ if (!found) {
13547+ up(&sm_sglock);
13548+ error = -ENOENT;
13549+ goto out;
13550+ }
13551+
13552+ if (sg->state != SGST_NONE) {
13553+ up(&sm_sglock);
13554+ error = -EINVAL;
13555+ goto out;
13556+ }
13557+
13558+ sg->state = SGST_JOIN;
13559+ set_bit(SGFL_SEVENT, &sg->flags);
13560+ list_del(&sg->list);
13561+ list_add_tail(&sg->list, &sm_sg[sg->level]);
13562+
13563+ up(&sm_sglock);
13564+
13565+ /*
13566+ * The join is a service event which will be processed asynchronously.
13567+ */
13568+
13569+ sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
13570+ if (!sev) {
13571+ error = -ENOMEM;
13572+ goto out;
13573+ }
13574+
13575+ memset(sev, 0, sizeof (sm_sevent_t));
13576+ sev->se_state = SEST_JOIN_BEGIN;
13577+ sev->se_sg = sg;
13578+ sg->sevent = sev;
13579+ sm_set_event_id(&sev->se_id);
13580+
13581+ new_joinleave(sev);
13582+ wait_for_completion(&sg->event_comp);
13583+ error = 0;
13584+
13585+ out:
13586+ return error;
13587+}
13588+
13589+/* Context: service */
13590+
13591+int kcl_leave_service(uint32_t local_id)
13592+{
13593+ sm_group_t *sg = NULL;
13594+ sm_sevent_t *sev;
13595+ int error;
13596+
13597+ error = -ENOENT;
13598+ sg = sm_local_id_to_sg(local_id);
13599+ if (!sg)
13600+ goto out;
13601+
13602+ /* sg was never joined */
13603+ error = -EINVAL;
13604+ if (sg->state == SGST_NONE)
13605+ goto out;
13606+
13607+ /* may still be joining */
13608+ error = -EBUSY;
13609+ if (test_and_set_bit(SGFL_SEVENT, &sg->flags))
13610+ goto out;
13611+
13612+ error = -ENOMEM;
13613+ sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
13614+ if (!sev)
13615+ goto out;
13616+
13617+ memset(sev, 0, sizeof (sm_sevent_t));
13618+ sev->se_state = SEST_LEAVE_BEGIN;
13619+ set_bit(SEFL_LEAVE, &sev->se_flags);
13620+ sev->se_sg = sg;
13621+ sg->sevent = sev;
13622+ sm_set_event_id(&sev->se_id);
13623+
13624+ new_joinleave(sev);
13625+ wait_for_completion(&sg->event_comp);
13626+ error = 0;
13627+
13628+ down(&sm_sglock);
13629+ list_del(&sg->list);
13630+ list_add_tail(&sg->list, &sg_registered[sg->level]);
13631+ up(&sm_sglock);
13632+
13633+ out:
13634+ return error;
13635+}
13636+
13637+static void process_callback(uint32_t local_id, int event_id)
13638+{
13639+ sm_group_t *sg;
13640+ sm_sevent_t *sev;
13641+ sm_uevent_t *uev;
13642+
13643+ sg = sm_local_id_to_sg(local_id);
13644+ if (!sg)
13645+ return;
13646+
13647+ if (sg->state == SGST_RECOVER) {
13648+ if (!check_recovery(sg, event_id)) {
13649+ log_error(sg, "process_callback invalid recover "
13650+ "event id %d", event_id);
13651+ return;
13652+ }
13653+
13654+ if (sg->recover_state == RECOVER_START)
13655+ sg->recover_state = RECOVER_STARTDONE;
13656+ else
13657+ log_error(sg, "process_callback recover state %u",
13658+ sg->recover_state);
13659+ wake_serviced(DO_RECOVERIES);
13660+ }
13661+
13662+ else if (test_bit(SGFL_SEVENT, &sg->flags) && sg->sevent &&
13663+ (sg->sevent->se_id == event_id)) {
13664+ sev = sg->sevent;
13665+
13666+ if (test_and_clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags) &&
13667+ (sev->se_state == SEST_JSTART_SERVICEWAIT))
13668+ sev->se_state = SEST_JSTART_SERVICEDONE;
13669+
13670+ set_bit(SEFL_CHECK, &sev->se_flags);
13671+ wake_serviced(DO_JOINLEAVE);
13672+ }
13673+
13674+ else if (test_bit(SGFL_UEVENT, &sg->flags) &&
13675+ (sg->uevent.ue_id == event_id)) {
13676+ uev = &sg->uevent;
13677+
13678+ if (test_and_clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags)) {
13679+ if (uev->ue_state == UEST_JSTART_SERVICEWAIT)
13680+ uev->ue_state = UEST_JSTART_SERVICEDONE;
13681+ else if (uev->ue_state == UEST_LSTART_SERVICEWAIT)
13682+ uev->ue_state = UEST_LSTART_SERVICEDONE;
13683+ }
13684+ set_bit(UEFL_CHECK, &uev->ue_flags);
13685+ wake_serviced(DO_MEMBERSHIP);
13686+ }
13687+
13688+ else
13689+ log_error(sg, "ignoring service callback id=%x event=%u",
13690+ local_id, event_id);
13691+}
13692+
13693+void process_callbacks(void)
13694+{
13695+ sc_entry_t *se;
13696+
13697+ while (1) {
13698+ se = NULL;
13699+
13700+ spin_lock(&callback_lock);
13701+ if (!list_empty(&callbacks)) {
13702+ se = list_entry(callbacks.next, sc_entry_t, list);
13703+ list_del(&se->list);
13704+ }
13705+ spin_unlock(&callback_lock);
13706+
13707+ if (!se)
13708+ break;
13709+ process_callback(se->local_id, se->event_id);
13710+ kfree(se);
13711+ schedule();
13712+ }
13713+}
13714+
13715+/* Context: service */
13716+
13717+void kcl_start_done(uint32_t local_id, int event_id)
13718+{
13719+ sc_entry_t *se;
13720+
13721+ SM_RETRY(se = kmalloc(sizeof(sc_entry_t), GFP_KERNEL), se);
13722+
13723+ se->local_id = local_id;
13724+ se->event_id = event_id;
13725+
13726+ spin_lock(&callback_lock);
13727+ list_add_tail(&se->list, &callbacks);
13728+ spin_unlock(&callback_lock);
13729+
13730+ wake_serviced(DO_CALLBACKS);
13731+}
13732+
13733+/* Context: service */
13734+
13735+void kcl_global_service_id(uint32_t local_id, uint32_t *global_id)
13736+{
13737+ sm_group_t *sg = sm_local_id_to_sg(local_id);
13738+
13739+ if (!sg)
13740+ log_print("kcl_global_service_id: can't find %x", local_id);
13741+ else
13742+ *global_id = sg->global_id;
13743+}
13744+
13745+static void copy_to_service(sm_group_t *sg, struct kcl_service *s)
13746+{
13747+ s->level = sg->level;
13748+ s->local_id = sg->local_id;
13749+ s->global_id = sg->global_id;
13750+ s->node_count = sg->memb_count;
13751+ strcpy(s->name, sg->name);
13752+}
13753+
13754+int kcl_get_services(struct list_head *head, int level)
13755+{
13756+ sm_group_t *sg;
13757+ struct kcl_service *s;
13758+ int error = -ENOMEM, count = 0;
13759+
13760+ down(&sm_sglock);
13761+
13762+ list_for_each_entry(sg, &sg_registered[level], list) {
13763+ if (head) {
13764+ s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
13765+ if (!s)
13766+ goto out;
13767+ copy_to_service(sg, s);
13768+ list_add(&s->list, head);
13769+ }
13770+ count++;
13771+ }
13772+
13773+ list_for_each_entry(sg, &sm_sg[level], list) {
13774+ if (head) {
13775+ s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
13776+ if (!s)
13777+ goto out;
13778+ copy_to_service(sg, s);
13779+ list_add(&s->list, head);
13780+ }
13781+ count++;
13782+ }
13783+
13784+ error = count;
13785+ out:
13786+ up(&sm_sglock);
13787+ return error;
13788+}
13789+
13790+/* These three global variables listed in extern form in sm.h. */
13791+struct list_head sm_sg[SG_LEVELS];
13792+struct semaphore sm_sglock;
13793diff -urN linux-orig/cluster/cman/sm_services.h linux-patched/cluster/cman/sm_services.h
13794--- linux-orig/cluster/cman/sm_services.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13795+++ linux-patched/cluster/cman/sm_services.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 13796@@ -0,0 +1,20 @@
13797+/******************************************************************************
13798+*******************************************************************************
13799+**
13800+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13801+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13802+**
13803+** This copyrighted material is made available to anyone wishing to use,
13804+** modify, copy, or redistribute it subject to the terms and conditions
13805+** of the GNU General Public License v.2.
13806+**
13807+*******************************************************************************
13808+******************************************************************************/
13809+
13810+#ifndef __SM_SERVICES_DOT_H__
13811+#define __SM_SERVICES_DOT_H__
13812+
13813+void init_services(void);
13814+void process_callbacks(void);
13815+
13816+#endif
13817diff -urN linux-orig/cluster/cman/sm_user.c linux-patched/cluster/cman/sm_user.c
13818--- linux-orig/cluster/cman/sm_user.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b
AM
13819+++ linux-patched/cluster/cman/sm_user.c 2004-06-29 20:07:51.000000000 +0800
13820@@ -0,0 +1,569 @@
4bf12011 13821+/******************************************************************************
13822+*******************************************************************************
13823+**
13824+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13825+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13826+**
13827+** This copyrighted material is made available to anyone wishing to use,
13828+** modify, copy, or redistribute it subject to the terms and conditions
13829+** of the GNU General Public License v.2.
13830+**
13831+*******************************************************************************
13832+******************************************************************************/
13833+
13834+#include "sm.h"
13835+#include "cnxman-private.h"
13836+
13837+void copy_to_usernode(struct cluster_node *node, struct cl_cluster_node *unode);
13838+
13839+#define UST_REGISTER 1
13840+#define UST_UNREGISTER 2
13841+#define UST_JOIN 3
13842+#define UST_LEAVE 4
13843+#define UST_JOINED 5
13844+
13845+struct event {
13846+ struct list_head list;
13847+ service_event_t type;
13848+ service_start_t start_type;
13849+ unsigned int event_id;
13850+ unsigned int last_stop;
13851+ unsigned int last_start;
13852+ unsigned int last_finish;
13853+ unsigned int node_count;
13854+ uint32_t * nodeids;
13855+};
13856+typedef struct event event_t;
13857+
13858+struct user_service {
13859+ uint32_t local_id;
13860+ pid_t pid;
13861+ int signal;
13862+ struct socket * sock;
13863+ uint8_t state;
13864+ uint8_t async;
13865+ struct semaphore lock;
13866+ struct list_head events;
13867+ spinlock_t event_lock;
13868+ unsigned int last_stop;
13869+ unsigned int last_start;
13870+ unsigned int last_finish;
13871+ unsigned int need_startdone;
13872+ unsigned int node_count;
13873+ uint32_t * nodeids;
13874+ int name_len;
13875+ char name[MAX_SERVICE_NAME_LEN];
13876+};
13877+typedef struct user_service user_service_t;
13878+
13879+
13880+static void add_event(user_service_t *us, event_t *ev)
13881+{
13882+ spin_lock(&us->event_lock);
13883+ list_add_tail(&ev->list, &us->events);
13884+
13885+ switch(ev->type) {
13886+ case SERVICE_EVENT_STOP:
13887+ us->last_stop = us->last_start;
13888+ break;
13889+ case SERVICE_EVENT_START:
13890+ us->last_start = ev->event_id;
13891+ break;
13892+ case SERVICE_EVENT_FINISH:
13893+ us->last_finish = ev->event_id;
13894+ break;
13895+ case SERVICE_EVENT_LEAVEDONE:
13896+ break;
13897+ }
13898+ spin_unlock(&us->event_lock);
13899+}
13900+
13901+static event_t *get_event(user_service_t *us)
13902+{
13903+ event_t *ev = NULL;
13904+
13905+ spin_lock(&us->event_lock);
13906+ if (!list_empty(&us->events)) {
13907+ ev = list_entry(us->events.next, event_t, list);
13908+ ev->last_stop = us->last_stop;
13909+ ev->last_start = us->last_start;
13910+ ev->last_finish = us->last_finish;
13911+ }
13912+ spin_unlock(&us->event_lock);
13913+ return ev;
13914+}
13915+
13916+static void del_event(user_service_t *us, event_t *ev)
13917+{
13918+ spin_lock(&us->event_lock);
13919+ list_del(&ev->list);
13920+ spin_unlock(&us->event_lock);
13921+}
13922+
13923+static event_t *alloc_event(void)
13924+{
13925+ event_t *ev;
13926+ SM_RETRY(ev = (event_t *) kmalloc(sizeof(event_t), GFP_KERNEL), ev);
13927+ memset(ev, 0, sizeof(event_t));
13928+ return ev;
13929+}
13930+
13931+/* us->lock must be held before calling */
13932+static void user_notify(user_service_t *us)
13933+{
13934+ if (us->sock)
13935+ queue_oob_skb(us->sock, CLUSTER_OOB_MSG_SERVICEEVENT);
13936+ if (us->pid && us->signal)
13937+ kill_proc(us->pid, us->signal, 0);
13938+}
13939+
13940+static service_start_t start_type(int type)
13941+{
13942+ switch (type) {
13943+ case SERVICE_NODE_FAILED:
13944+ return SERVICE_START_FAILED;
13945+ case SERVICE_NODE_JOIN:
13946+ return SERVICE_START_JOIN;
13947+ case SERVICE_NODE_LEAVE:
13948+ return SERVICE_START_LEAVE;
13949+ }
13950+ return 0;
13951+}
13952+
13953+static int user_stop(void *servicedata)
13954+{
13955+ user_service_t *us = (user_service_t *) servicedata;
13956+ event_t *ev;
13957+
13958+ down(&us->lock);
13959+ if (!us->sock)
13960+ goto out;
13961+
13962+ ev = alloc_event();
13963+ ev->type = SERVICE_EVENT_STOP;
13964+
13965+ add_event(us, ev);
13966+ user_notify(us);
13967+ out:
13968+ up(&us->lock);
13969+ return 0;
13970+}
13971+
13972+static int user_start(void *servicedata, uint32_t *nodeids, int count,
13973+ int event_id, int type)
13974+{
13975+ user_service_t *us = (user_service_t *) servicedata;
13976+ event_t *ev;
13977+
13978+ down(&us->lock);
13979+ if (!us->sock) {
13980+ kcl_start_done(us->local_id, event_id);
13981+ goto out;
13982+ }
13983+
13984+ us->need_startdone = event_id;
13985+
13986+ ev = alloc_event();
13987+ ev->type = SERVICE_EVENT_START;
13988+ ev->node_count = count;
13989+ ev->start_type = start_type(type);
13990+ ev->event_id = event_id;
13991+ ev->nodeids = nodeids;
13992+
13993+ add_event(us, ev);
13994+ user_notify(us);
13995+ out:
13996+ up(&us->lock);
13997+ return 0;
13998+}
13999+
14000+static void user_finish(void *servicedata, int event_id)
14001+{
14002+ user_service_t *us = (user_service_t *) servicedata;
14003+ event_t *ev;
14004+
14005+ down(&us->lock);
14006+ if (!us->sock)
14007+ goto out;
14008+
14009+ ev = alloc_event();
14010+ ev->type = SERVICE_EVENT_FINISH;
14011+ ev->event_id = event_id;
14012+
14013+ add_event(us, ev);
14014+ user_notify(us);
14015+ out:
14016+ up(&us->lock);
14017+}
14018+
14019+struct kcl_service_ops user_service_ops = {
14020+ .stop = user_stop,
14021+ .start = user_start,
14022+ .finish = user_finish
14023+};
14024+
5cdbd17b 14025+static int user_register(char *u_name, user_service_t **us_data)
4bf12011 14026+{
14027+ user_service_t *us;
5cdbd17b
AM
14028+ char name[MAX_SERVICE_NAME_LEN+1];
14029+ int len, error;
14030+
14031+ memset(name, 0, MAX_SERVICE_NAME_LEN+1);
14032+
14033+ if (copy_from_user(&name, u_name, MAX_SERVICE_NAME_LEN))
14034+ return -EFAULT;
4bf12011 14035+
5cdbd17b
AM
14036+ len = strlen(name);
14037+ if (len > MAX_SERVICE_NAME_LEN)
4bf12011 14038+ return -ENAMETOOLONG;
14039+ if (!len)
14040+ return -EINVAL;
14041+
14042+ us = kmalloc(sizeof(user_service_t), GFP_KERNEL);
14043+ if (!us)
14044+ return -ENOMEM;
14045+ memset(us, 0, sizeof(user_service_t));
14046+ us->nodeids = NULL;
14047+ INIT_LIST_HEAD(&us->events);
14048+ spin_lock_init(&us->event_lock);
14049+ init_MUTEX(&us->lock);
14050+ us->name_len = len;
14051+ memcpy(us->name, name, len);
14052+
14053+ error = kcl_register_service(name, len, SERVICE_LEVEL_USER,
14054+ &user_service_ops, TRUE, (void *) us,
14055+ &us->local_id);
14056+ if (error) {
14057+ kfree(us);
14058+ us = NULL;
14059+ }
14060+ *us_data = us;
14061+ return error;
14062+}
14063+
14064+static void user_unregister(user_service_t *us)
14065+{
14066+ event_t *ev;
14067+
14068+ kcl_unregister_service(us->local_id);
14069+
14070+ if (us->nodeids)
14071+ kfree(us->nodeids);
14072+
14073+ while ((ev = get_event(us))) {
14074+ del_event(us, ev);
14075+ if (ev->nodeids)
14076+ kfree(ev->nodeids);
14077+ kfree(ev);
14078+ }
14079+}
14080+
14081+static int user_join_async(void *arg)
14082+{
14083+ user_service_t *us = arg;
14084+ int user_gone = 0;
14085+
14086+ daemonize("cman_userjoin");
14087+
14088+ kcl_join_service(us->local_id);
14089+
14090+ down(&us->lock);
14091+ us->state = UST_JOINED;
14092+ us->async = 0;
14093+ if (!us->sock) {
14094+ if (us->need_startdone)
14095+ kcl_start_done(us->local_id, us->need_startdone);
14096+ user_gone = 1;
14097+ }
14098+ up(&us->lock);
14099+
14100+ if (user_gone) {
14101+ kcl_leave_service(us->local_id);
14102+ user_unregister(us);
14103+ kfree(us);
14104+ }
14105+ return 0;
14106+}
14107+
14108+static int user_leave_async(void *arg)
14109+{
14110+ user_service_t *us = arg;
14111+
14112+ daemonize("cman_userleave");
14113+
14114+ kcl_leave_service(us->local_id);
14115+
14116+ down(&us->lock);
14117+ us->async = 0;
14118+ if (!us->sock) {
14119+ user_unregister(us);
14120+ kfree(us);
14121+ } else {
14122+ event_t *ev = alloc_event();
14123+ ev->type = SERVICE_EVENT_LEAVEDONE;
14124+ add_event(us, ev);
14125+ user_notify(us);
14126+ up(&us->lock);
14127+ }
14128+
14129+ return 0;
14130+}
14131+
14132+static int user_join(user_service_t *us, int wait)
14133+{
14134+ int error = 0;
14135+
14136+ if (wait) {
14137+ error = kcl_join_service(us->local_id);
14138+ us->state = UST_JOINED;
14139+ }
14140+ else {
14141+ us->async = 1;
14142+ kernel_thread(user_join_async, us, 0);
14143+ }
14144+
14145+ return error;
14146+}
14147+
14148+static void user_leave(user_service_t *us, int wait)
14149+{
14150+ if (wait)
14151+ kcl_leave_service(us->local_id);
14152+ else {
14153+ us->async = 1;
14154+ kernel_thread(user_leave_async, us, 0);
14155+ }
14156+}
14157+
14158+static int user_start_done(user_service_t *us, unsigned int event_id)
14159+{
14160+ if (!us->need_startdone)
14161+ return -EINVAL;
14162+ if (us->need_startdone == event_id)
14163+ us->need_startdone = 0;
14164+ kcl_start_done(us->local_id, event_id);
14165+ return 0;
14166+}
14167+
14168+static void user_set_signal(user_service_t *us, int signal)
14169+{
14170+ us->pid = current->pid;
14171+ us->signal = signal;
14172+}
14173+
14174+static int user_get_event(user_service_t *us,
14175+ struct cl_service_event *user_event)
14176+{
14177+ event_t *ev;
14178+ struct cl_service_event event;
14179+
14180+ ev = get_event(us);
14181+ if (!ev)
14182+ return 0;
14183+
14184+ event.type = ev->type;
14185+ event.start_type = ev->start_type;
14186+ event.event_id = ev->event_id;
14187+ event.last_stop = ev->last_stop;
14188+ event.last_start = ev->last_start;
14189+ event.last_finish = ev->last_finish;
14190+ event.node_count = ev->node_count;
14191+
14192+ if (copy_to_user(user_event, &event, sizeof(struct cl_service_event)))
14193+ return -EFAULT;
14194+
14195+ del_event(us, ev);
14196+
14197+ if (ev->type == SERVICE_EVENT_START) {
14198+ if (us->nodeids)
14199+ kfree(us->nodeids);
14200+ us->nodeids = ev->nodeids;
14201+ us->node_count = ev->node_count;
14202+ }
14203+
14204+ kfree(ev);
14205+ return 1;
14206+}
14207+
14208+static int user_get_members(user_service_t *us,
14209+ struct cl_cluster_nodelist *u_nodelist)
14210+{
14211+ struct cl_cluster_nodelist user_nodelist;
14212+ struct cl_cluster_node user_node, *u_node;
14213+ struct cluster_node *node;
14214+ unsigned int i;
14215+ int num_nodes = 0;
14216+
14217+ if (!u_nodelist)
14218+ return us->node_count;
14219+
14220+ if (copy_from_user(&user_nodelist, (void __user *) u_nodelist,
14221+ sizeof(struct cl_cluster_nodelist)))
14222+ return -EFAULT;
14223+
14224+ if (user_nodelist.max_members < us->node_count)
14225+ return -E2BIG;
14226+
14227+ u_node = user_nodelist.nodes;
14228+
14229+ for (i = 0; i < us->node_count; i++) {
14230+ node = find_node_by_nodeid(us->nodeids[i]);
14231+ if (!node)
14232+ continue;
14233+
14234+ copy_to_usernode(node, &user_node);
14235+ if (copy_to_user(u_node, &user_node,
14236+ sizeof(struct cl_cluster_node)))
14237+ return -EFAULT;
14238+
14239+ u_node++;
14240+ num_nodes++;
14241+ }
14242+ return num_nodes;
14243+}
14244+
14245+static int user_global_id(user_service_t *us, uint32_t *id)
14246+{
14247+ uint32_t gid = 0;
14248+
14249+ if (us->state != UST_JOINED)
14250+ return -EINVAL;
14251+
14252+ kcl_global_service_id(us->local_id, &gid);
14253+
14254+ if (copy_to_user(id, &gid, sizeof(uint32_t)))
14255+ return -EFAULT;
14256+ return 0;
14257+}
14258+
14259+static int user_set_level(user_service_t *us, int level)
14260+{
14261+ int prev_id = us->local_id;
14262+ int error;
14263+
14264+ if (us->state != UST_REGISTER)
14265+ return -EINVAL;
14266+
14267+ error = kcl_register_service(us->name, us->name_len, level,
14268+ &user_service_ops, TRUE, (void *) us,
14269+ &us->local_id);
14270+ if (error)
14271+ return error;
14272+
14273+ kcl_unregister_service(prev_id);
14274+ return 0;
14275+}
14276+
14277+int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
14278+{
14279+ struct cluster_sock *c = cluster_sk(sock->sk);
14280+ user_service_t *us = c->service_data;
14281+ int error = 0;
14282+
14283+ if (!us && cmd != SIOCCLUSTER_SERVICE_REGISTER)
14284+ return -EINVAL;
14285+
14286+ switch (cmd) {
14287+ case SIOCCLUSTER_SERVICE_REGISTER:
14288+ error = user_register((char *) arg, &us);
14289+ if (!error) {
14290+ us->state = UST_REGISTER;
14291+ us->sock = sock;
14292+ c->service_data = us;
14293+ }
14294+ break;
14295+
14296+ case SIOCCLUSTER_SERVICE_UNREGISTER:
14297+ down(&us->lock);
14298+ us->state = UST_UNREGISTER;
14299+ user_unregister(us);
14300+ up(&us->lock);
14301+ break;
14302+
14303+ case SIOCCLUSTER_SERVICE_JOIN:
14304+ us->state = UST_JOIN;
14305+ user_join(us, 0);
14306+ break;
14307+
14308+ case SIOCCLUSTER_SERVICE_LEAVE:
14309+ down(&us->lock);
14310+ if (us->state != UST_JOINED) {
14311+ error = -EBUSY;
14312+ up(&us->lock);
14313+ } else {
14314+ us->state = UST_LEAVE;
14315+ up(&us->lock);
14316+ user_leave(us, 0);
14317+ }
14318+ break;
14319+
14320+ case SIOCCLUSTER_SERVICE_SETSIGNAL:
14321+ user_set_signal(us, (int) arg);
14322+ break;
14323+
14324+ case SIOCCLUSTER_SERVICE_STARTDONE:
14325+ error = user_start_done(us, (unsigned int) arg);
14326+ break;
14327+
14328+ case SIOCCLUSTER_SERVICE_GETEVENT:
14329+ error = user_get_event(us, (struct cl_service_event *) arg);
14330+ break;
14331+
14332+ case SIOCCLUSTER_SERVICE_GETMEMBERS:
14333+ error = user_get_members(us, (struct cl_cluster_nodelist *)arg);
14334+ break;
14335+
14336+ case SIOCCLUSTER_SERVICE_GLOBALID:
14337+ error = user_global_id(us, (uint32_t *) arg);
14338+ break;
14339+
14340+ case SIOCCLUSTER_SERVICE_SETLEVEL:
14341+ error = user_set_level(us, (int) arg);
14342+ break;
14343+
14344+ default:
14345+ error = -EINVAL;
14346+ }
14347+
14348+ return error;
14349+}
14350+
14351+void sm_sock_release(struct socket *sock)
14352+{
14353+ struct cluster_sock *c = cluster_sk(sock->sk);
14354+ user_service_t *us = c->service_data;
14355+ int state;
14356+
14357+ if (!us)
14358+ return;
14359+
14360+ down(&us->lock);
14361+ us->sock = NULL;
14362+ c->service_data = NULL;
14363+
14364+ if (us->need_startdone)
14365+ kcl_start_done(us->local_id, us->need_startdone);
14366+
14367+ if (us->async) {
14368+ /* async thread will clean up before exiting */
14369+ up(&us->lock);
14370+ return;
14371+ }
14372+ state = us->state;
14373+ up(&us->lock);
14374+
14375+ switch (state) {
14376+ case UST_JOIN:
14377+ break;
14378+ case UST_JOINED:
14379+ user_leave(us, 1);
14380+ /* fall through */
14381+ case UST_LEAVE:
14382+ case UST_REGISTER:
14383+ user_unregister(us);
14384+ /* fall through */
14385+ case UST_UNREGISTER:
14386+ kfree(us);
14387+ break;
14388+ }
14389+}
14390diff -urN linux-orig/cluster/cman/sm_user.h linux-patched/cluster/cman/sm_user.h
14391--- linux-orig/cluster/cman/sm_user.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 14392+++ linux-patched/cluster/cman/sm_user.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 14393@@ -0,0 +1,21 @@
14394+/******************************************************************************
14395+*******************************************************************************
14396+**
14397+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14398+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14399+**
14400+** This copyrighted material is made available to anyone wishing to use,
14401+** modify, copy, or redistribute it subject to the terms and conditions
14402+** of the GNU General Public License v.2.
14403+**
14404+*******************************************************************************
14405+******************************************************************************/
14406+
14407+#ifndef __SM_USER_DOT_H__
14408+#define __SM_USER_DOT_H__
14409+
14410+int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
14411+void sm_sock_release(struct socket *sock);
14412+void sm_sock_bind(struct socket *sock);
14413+
14414+#endif
14415diff -urN linux-orig/include/cluster/cnxman-socket.h linux-patched/include/cluster/cnxman-socket.h
14416--- linux-orig/include/cluster/cnxman-socket.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 14417+++ linux-patched/include/cluster/cnxman-socket.h 2004-06-29 20:07:50.000000000 +0800
4bf12011 14418@@ -0,0 +1,226 @@
14419+/******************************************************************************
14420+*******************************************************************************
14421+**
14422+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14423+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14424+**
14425+** This copyrighted material is made available to anyone wishing to use,
14426+** modify, copy, or redistribute it subject to the terms and conditions
14427+** of the GNU General Public License v.2.
14428+**
14429+*******************************************************************************
14430+******************************************************************************/
14431+
14432+/* CMAN socket interface header,
14433+ may be include by user or kernel code */
14434+
14435+#ifndef __CNXMAN_SOCKET_H
14436+#define __CNXMAN_SOCKET_H
14437+
14438+/* Just made these up but the address family must be less than 32 (NPROTO) */
14439+#define AF_CLUSTER 31
14440+#define PF_CLUSTER AF_CLUSTER
14441+
14442+/* Protocol(socket) types */
14443+#define CLPROTO_MASTER 2
14444+#define CLPROTO_CLIENT 3
14445+
14446+/* Setsockopt -- maybe should be ioctls?? */
14447+#define CLU_SET_MULTICAST 100
14448+#define CLU_JOIN_CLUSTER 101
14449+#define CLU_LEAVE_CLUSTER 102
14450+#define CLU_SET_RCVONLY 103
14451+#define CLU_SET_UNICAST 104
14452+#define KCL_SET_MULTICAST 105
14453+#define KCL_SET_RCVONLY 106
14454+#define KCL_SET_UNICAST 107
14455+#define KCL_SET_NODENAME 108
14456+#define CLU_SET_NODENAME 109
14457+
14458+/* ioctls -- should register these properly */
14459+#define SIOCCLUSTER_NOTIFY _IOW('x', 0x01, int)
14460+#define SIOCCLUSTER_REMOVENOTIFY _IO( 'x', 0x02)
14461+#define SIOCCLUSTER_GETMEMBERS _IOR('x', 0x03, struct cl_cluster_nodelist)
14462+#define SIOCCLUSTER_SETEXPECTED_VOTES _IOW('x', 0x04, int)
14463+#define SIOCCLUSTER_ISQUORATE _IO( 'x', 0x05)
14464+#define SIOCCLUSTER_ISLISTENING _IOW('x', 0x06, struct cl_listen_request)
14465+#define SIOCCLUSTER_GETALLMEMBERS _IOR('x', 0x07, struct cl_cluster_nodelist)
14466+#define SIOCCLUSTER_SET_VOTES _IOW('x', 0x08, int)
14467+#define SIOCCLUSTER_GET_VERSION _IOR('x', 0x09, struct cl_version)
14468+#define SIOCCLUSTER_SET_VERSION _IOW('x', 0x0a, struct cl_version)
14469+#define SIOCCLUSTER_ISACTIVE _IO( 'x', 0x0b)
14470+#define SIOCCLUSTER_KILLNODE _IOW('x', 0x0c, int)
14471+#define SIOCCLUSTER_GET_JOINCOUNT _IO( 'x', 0x0d)
14472+#define SIOCCLUSTER_SERVICE_REGISTER _IOW('x', 0x0e, char)
14473+#define SIOCCLUSTER_SERVICE_UNREGISTER _IO('x', 0x0f)
14474+#define SIOCCLUSTER_SERVICE_JOIN _IO( 'x', 0x10)
14475+#define SIOCCLUSTER_SERVICE_LEAVE _IO( 'x', 0x20)
14476+#define SIOCCLUSTER_SERVICE_SETSIGNAL _IOW('x', 0x30, int)
14477+#define SIOCCLUSTER_SERVICE_STARTDONE _IOW('x', 0x40, unsigned int)
14478+#define SIOCCLUSTER_SERVICE_GETEVENT _IOR('x', 0x50, struct cl_service_event)
14479+#define SIOCCLUSTER_SERVICE_GETMEMBERS _IOR('x', 0x60, struct cl_cluster_nodelist)
14480+#define SIOCCLUSTER_SERVICE_GLOBALID _IOR('x', 0x70, uint32_t)
14481+#define SIOCCLUSTER_SERVICE_SETLEVEL _IOR('x', 0x80, int)
14482+#define SIOCCLUSTER_GETNODE _IOWR('x', 0x90, struct cl_cluster_node)
14483+#define SIOCCLUSTER_BARRIER _IOW('x', 0x0a0, struct cl_barrier_info)
14484+
14485+/* Maximum size of a cluster message */
14486+#define MAX_CLUSTER_MESSAGE 1500
14487+#define MAX_CLUSTER_MEMBER_NAME_LEN 255
14488+#define MAX_BARRIER_NAME_LEN 33
14489+#define MAX_SA_ADDR_LEN 12
14490+#define MAX_CLUSTER_NAME_LEN 16
14491+
14492+/* Well-known cluster port numbers */
14493+#define CLUSTER_PORT_MEMBERSHIP 1 /* Mustn't block during cluster
14494+ * transitions! */
14495+#define CLUSTER_PORT_SERVICES 2
14496+#define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */
14497+#define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */
14498+#define CLUSTER_PORT_SLM 12 /* LVM SLM (simple lock manager) */
14499+
14500+/* Port numbers above this will be blocked when the cluster is inquorate or in
14501+ * transition */
14502+#define HIGH_PROTECTED_PORT 9
14503+
14504+/* Reasons for leaving the cluster */
14505+#define CLUSTER_LEAVEFLAG_DOWN 0 /* Normal shutdown */
14506+#define CLUSTER_LEAVEFLAG_KILLED 1
14507+#define CLUSTER_LEAVEFLAG_PANIC 2
14508+#define CLUSTER_LEAVEFLAG_REMOVED 3 /* This one can reduce quorum */
14509+#define CLUSTER_LEAVEFLAG_REJECTED 4 /* Not allowed into the cluster in the
14510+ * first place */
14511+#define CLUSTER_LEAVEFLAG_INCONSISTENT 5 /* Our view of the cluster is
14512+ * in a minority */
14513+#define CLUSTER_LEAVEFLAG_DEAD 6 /* Discovered to be dead */
14514+#define CLUSTER_LEAVEFLAG_FORCE 0x10 /* Forced by command-line */
14515+
14516+/* OOB messages sent to a local socket */
14517+#define CLUSTER_OOB_MSG_PORTCLOSED 1
14518+#define CLUSTER_OOB_MSG_STATECHANGE 2
14519+#define CLUSTER_OOB_MSG_SERVICEEVENT 3
14520+
14521+/* Sendmsg flags, these are above the normal sendmsg flags so they don't
14522+ * interfere */
14523+#define MSG_NOACK 0x010000 /* Don't need an ACK for this message */
14524+#define MSG_QUEUE 0x020000 /* Queue the message for sending later */
14525+#define MSG_MULTICAST 0x080000 /* Message was sent to all nodes in the cluster
14526+ */
14527+#define MSG_ALLINT 0x100000 /* Send out of all interfaces */
14528+
14529+typedef enum { NODESTATE_REMOTEMEMBER, NODESTATE_JOINING, NODESTATE_MEMBER,
14530+ NODESTATE_DEAD } nodestate_t;
14531+
14532+
14533+struct sockaddr_cl {
14534+ unsigned short scl_family;
14535+ unsigned char scl_flags;
14536+ unsigned char scl_port;
14537+ int scl_nodeid;
14538+};
14539+
14540+/* This is how we pass the multicast socket into kernel space. addr is the
14541+ * multicast address to use in the address family of the socket (eg for UDP it
14542+ * might be 255.255.255.0) */
14543+struct cl_multicast_sock {
14544+ int fd; /* FD of master socket to do multicast on */
14545+ int number; /* Socket number, to match up recvonly & bcast
14546+ * sockets */
14547+};
14548+
14549+/* Cluster configuration info passed when we join the cluster */
14550+struct cl_join_cluster_info {
14551+ unsigned char votes;
14552+ unsigned int expected_votes;
14553+ unsigned int two_node;
14554+ unsigned int config_version;
14555+
14556+ char cluster_name[17];
14557+};
14558+
14559+
14560+/* This is the structure, per node, returned from the membership ioctl */
14561+struct cl_cluster_node {
14562+ unsigned int size;
14563+ unsigned int node_id;
14564+ unsigned int us;
14565+ unsigned int leave_reason;
14566+ unsigned int incarnation;
14567+ nodestate_t state;
14568+ char name[MAX_CLUSTER_MEMBER_NAME_LEN];
14569+ unsigned char votes;
14570+};
14571+
14572+/* The struct passed to the membership ioctls */
14573+struct cl_cluster_nodelist {
14574+ uint32_t max_members;
14575+ struct cl_cluster_node *nodes;
14576+};
14577+
14578+/* Structure passed to SIOCCLUSTER_ISLISTENING */
14579+struct cl_listen_request {
14580+ unsigned char port;
14581+ int nodeid;
14582+};
14583+
14584+/* A Cluster PORTCLOSED message - received by a local user as an OOB message */
14585+struct cl_portclosed_oob {
14586+ unsigned char cmd; /* CLUSTER_OOB_MSG_PORTCLOSED */
14587+ unsigned char port;
14588+};
14589+
14590+/* Get all version numbers or set the config version */
14591+struct cl_version {
14592+ unsigned int major;
14593+ unsigned int minor;
14594+ unsigned int patch;
14595+ unsigned int config;
14596+};
14597+
14598+/* structure passed to barrier ioctls */
14599+struct cl_barrier_info {
14600+ char cmd;
14601+ char name[MAX_BARRIER_NAME_LEN];
14602+ unsigned int flags;
14603+ unsigned long arg;
14604+};
14605+
14606+typedef enum { SERVICE_EVENT_STOP, SERVICE_EVENT_START, SERVICE_EVENT_FINISH,
14607+ SERVICE_EVENT_LEAVEDONE } service_event_t;
14608+
14609+typedef enum { SERVICE_START_FAILED, SERVICE_START_JOIN, SERVICE_START_LEAVE }
14610+ service_start_t;
14611+
14612+struct cl_service_event {
14613+ service_event_t type;
14614+ service_start_t start_type;
14615+ unsigned int event_id;
14616+ unsigned int last_stop;
14617+ unsigned int last_start;
14618+ unsigned int last_finish;
14619+ unsigned int node_count;
14620+};
14621+
14622+
14623+/* Commands to the barrier ioctl */
14624+#define BARRIER_IOCTL_REGISTER 1
14625+#define BARRIER_IOCTL_CHANGE 2
14626+#define BARRIER_IOCTL_DELETE 3
14627+#define BARRIER_IOCTL_WAIT 4
14628+
14629+/* Attributes of a barrier - bitmask */
14630+#define BARRIER_ATTR_AUTODELETE 1
14631+#define BARRIER_ATTR_MULTISTEP 2
14632+#define BARRIER_ATTR_MANUAL 4
14633+#define BARRIER_ATTR_ENABLED 8
14634+#define BARRIER_ATTR_CALLBACK 16
14635+
14636+/* Attribute setting commands */
14637+#define BARRIER_SETATTR_AUTODELETE 1
14638+#define BARRIER_SETATTR_MULTISTEP 2
14639+#define BARRIER_SETATTR_ENABLED 3
14640+#define BARRIER_SETATTR_NODES 4
14641+#define BARRIER_SETATTR_CALLBACK 5
14642+#define BARRIER_SETATTR_TIMEOUT 6
14643+
14644+#endif
14645diff -urN linux-orig/include/cluster/cnxman.h linux-patched/include/cluster/cnxman.h
14646--- linux-orig/include/cluster/cnxman.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 14647+++ linux-patched/include/cluster/cnxman.h 2004-06-29 20:07:50.000000000 +0800
4bf12011 14648@@ -0,0 +1,87 @@
14649+/******************************************************************************
14650+*******************************************************************************
14651+**
14652+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14653+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14654+**
14655+** This copyrighted material is made available to anyone wishing to use,
14656+** modify, copy, or redistribute it subject to the terms and conditions
14657+** of the GNU General Public License v.2.
14658+**
14659+*******************************************************************************
14660+******************************************************************************/
14661+
14662+#ifndef __CNXMAN_H
14663+#define __CNXMAN_H
14664+
14665+#include "linux/in6.h"
14666+#include "cluster/cnxman-socket.h"
14667+
14668+/* In-kernel API */
14669+
14670+/* This is the structure, per node, returned from the membership request */
14671+struct kcl_cluster_node {
14672+ unsigned int size;
14673+ unsigned int node_id;
14674+ unsigned int us;
14675+ unsigned int leave_reason;
14676+ unsigned int incarnation;
14677+ nodestate_t state;
14678+ struct list_head list;
14679+ char name[MAX_CLUSTER_MEMBER_NAME_LEN];
14680+ unsigned char votes;
14681+};
14682+
14683+struct cluster_node_addr {
14684+ struct list_head list;
14685+ unsigned char addr[sizeof(struct sockaddr_in6)];/* A large sockaddr */
14686+ int addr_len;
14687+};
14688+
14689+
14690+/* Reasons for a kernel membership callback */
14691+typedef enum { CLUSTER_RECONFIG, DIED, LEAVING, NEWNODE } kcl_callback_reason;
14692+
14693+/* Kernel version of above, the void *sock is a struct socket */
14694+struct kcl_multicast_sock {
14695+ void *sock;
14696+ int number; /* Socket number, to match up recvonly & bcast
14697+ * sockets */
14698+};
14699+
14700+extern int kcl_sendmsg(struct socket *sock, void *buf, int size,
14701+ struct sockaddr_cl *caddr, int addr_len,
14702+ unsigned int flags);
14703+extern int kcl_register_read_callback(struct socket *sock,
14704+ int (*routine) (char *, int, char *, int,
14705+ unsigned int));
14706+extern int kcl_add_callback(void (*callback) (kcl_callback_reason, long));
14707+extern int kcl_remove_callback(void (*callback) (kcl_callback_reason, long));
14708+extern int kcl_get_members(struct list_head *list);
14709+extern int kcl_get_member_ids(uint32_t * idbuf, int size);
14710+extern int kcl_get_all_members(struct list_head *list);
14711+extern int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
14712+ struct kcl_cluster_node *n);
14713+extern int kcl_get_node_by_name(unsigned char *name,
14714+ struct kcl_cluster_node *n);
14715+extern int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n);
14716+extern int kcl_is_quorate(void);
14717+extern int kcl_addref_cluster(void);
14718+extern int kcl_releaseref_cluster(void);
14719+extern int kcl_cluster_name(char **cname);
14720+extern int kcl_get_current_interface(void);
14721+extern struct list_head *kcl_get_node_addresses(int nodeid);
14722+
14723+extern int kcl_barrier_register(char *name, unsigned int flags,
14724+ unsigned int nodes);
14725+extern int kcl_barrier_setattr(char *name, unsigned int attr,
14726+ unsigned long arg);
14727+extern int kcl_barrier_delete(char *name);
14728+extern int kcl_barrier_wait(char *name);
14729+extern int kcl_barrier_cancel(char *name);
14730+
14731+extern int kcl_register_quorum_device(char *name, int votes);
14732+extern int kcl_unregister_quorum_device(void);
14733+extern int kcl_quorum_device_available(int yesno);
14734+
14735+#endif
14736diff -urN linux-orig/include/cluster/service.h linux-patched/include/cluster/service.h
14737--- linux-orig/include/cluster/service.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 14738+++ linux-patched/include/cluster/service.h 2004-06-29 20:07:50.000000000 +0800
4bf12011 14739@@ -0,0 +1,102 @@
14740+/******************************************************************************
14741+*******************************************************************************
14742+**
14743+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14744+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14745+**
14746+** This copyrighted material is made available to anyone wishing to use,
14747+** modify, copy, or redistribute it subject to the terms and conditions
14748+** of the GNU General Public License v.2.
14749+**
14750+*******************************************************************************
14751+******************************************************************************/
14752+
14753+#ifndef __SERVICE_DOT_H__
14754+#define __SERVICE_DOT_H__
14755+
14756+/*
14757+ * Interface between service manager and services
14758+ */
14759+
14760+/*
14761+ * Service levels are started in order from lowest, so level 0 is started on
14762+ * all nodes before level 1 is started.
14763+ */
14764+
14765+#define SERVICE_LEVEL_FENCE (0)
14766+#define SERVICE_LEVEL_GDLM (1)
14767+#define SERVICE_LEVEL_GFS (2)
14768+#define SERVICE_LEVEL_USER (3)
14769+
14770+#define MAX_SERVICE_NAME_LEN (33)
14771+
14772+/*
14773+ * The type of start a service receives. The start (and preceding stop) may be
14774+ * due to a node joining or leaving the SG or due to a node having failed.
14775+ */
14776+
14777+#define SERVICE_NODE_FAILED (1)
14778+#define SERVICE_NODE_JOIN (2)
14779+#define SERVICE_NODE_LEAVE (3)
14780+
14781+
14782+struct kcl_service {
14783+ struct list_head list;
14784+ uint16_t level;
14785+ uint32_t local_id;
14786+ uint32_t global_id;
14787+ int node_count;
14788+ char name[MAX_SERVICE_NAME_LEN];
14789+};
14790+
14791+int kcl_get_services(struct list_head *list, int level);
14792+
14793+
14794+/*
14795+ * These routines which run in CMAN context must return quickly and cannot
14796+ * block.
14797+ */
14798+
14799+struct kcl_service_ops {
14800+ int (*stop) (void *servicedata);
14801+ int (*start) (void *servicedata, uint32_t *nodeids, int count,
14802+ int event_id, int type);
14803+ void (*finish) (void *servicedata, int event_id);
14804+};
14805+
14806+/*
14807+ * Register will cause CMAN to create a Service Group (SG) for the named
14808+ * instance of the service. A local ID is returned which is used to join,
14809+ * leave and unregister the service.
14810+ */
14811+
14812+int kcl_register_service(char *name, int namelen, int level,
14813+ struct kcl_service_ops *ops, int unique,
14814+ void *servicedata, uint32_t *local_id);
14815+
14816+void kcl_unregister_service(uint32_t local_id);
14817+
14818+/*
14819+ * Once a service is joined it will be managed by CMAN and receive start, stop,
14820+ * and finish calls. After leave is called the service is no longer managed by
14821+ * CMAN. The first start for a service may arrive before kcl_join_service()
14822+ * returns.
14823+ */
14824+
14825+int kcl_join_service(uint32_t local_id);
14826+int kcl_leave_service(uint32_t local_id);
14827+
14828+/*
14829+ * After a service is started, it can ask for its cluster-wide unique ID.
14830+ */
14831+
14832+void kcl_global_service_id(uint32_t local_id, uint32_t * global_id);
14833+
14834+/*
14835+ * Called by a service when it's done with a start(). Cannot be called from
14836+ * the start function.
14837+ */
14838+
14839+void kcl_start_done(uint32_t local_id, int event_id);
14840+
14841+#endif
This page took 1.910045 seconds and 4 git commands to generate.