4bf12011 1# Add CMAN to build system
2diff -urN -p linux-2.6.7/Makefile linux/Makefile
3--- linux-2.6.7/Makefile 2004-06-16 13:19:37.000000000 +0800
4+++ linux/Makefile 2004-06-17 14:55:06.000000000 +0800
5@@ -418,7 +418,7 @@ all: vmlinux
6
7 # Objects we will link into vmlinux / subdirs we need to visit
8 init-y := init/
9-drivers-y := drivers/ sound/
10+drivers-y := drivers/ sound/ cluster/
11 net-y := net/
12 libs-y := lib/
13 core-y := usr/
14diff -urN -p linux-2.6.7/arch/alpha/Kconfig linux/arch/alpha/Kconfig
15--- linux-2.6.7/arch/alpha/Kconfig 2004-06-16 13:19:44.000000000 +0800
16+++ linux/arch/alpha/Kconfig 2004-06-17 14:55:06.000000000 +0800
17@@ -698,3 +698,4 @@ source "crypto/Kconfig"
18
19 source "lib/Kconfig"
20
21+source "cluster/Kconfig"
22diff -urN -p linux-2.6.7/arch/i386/Kconfig linux/arch/i386/Kconfig
23--- linux-2.6.7/arch/i386/Kconfig 2004-06-16 13:18:59.000000000 +0800
24+++ linux/arch/i386/Kconfig 2004-06-17 14:55:06.000000000 +0800
25@@ -1315,6 +1315,8 @@ source "crypto/Kconfig"
26
27 source "lib/Kconfig"
28
29+source "cluster/Kconfig"
30+
31 config X86_SMP
32 bool
33 depends on SMP && !X86_VOYAGER
34diff -urN -p linux-2.6.7/arch/parisc/Kconfig linux/arch/parisc/Kconfig
35--- linux-2.6.7/arch/parisc/Kconfig 2004-06-16 13:19:36.000000000 +0800
36+++ linux/arch/parisc/Kconfig 2004-06-17 14:55:06.000000000 +0800
37@@ -229,3 +229,4 @@ source "crypto/Kconfig"
38
39 source "lib/Kconfig"
40
41+source "cluster/Kconfig"
42diff -urN -p linux-2.6.7/arch/sparc64/Kconfig linux/arch/sparc64/Kconfig
43--- linux-2.6.7/arch/sparc64/Kconfig 2004-06-16 13:19:52.000000000 +0800
44+++ linux/arch/sparc64/Kconfig 2004-06-17 14:55:06.000000000 +0800
45@@ -713,3 +713,4 @@ source "crypto/Kconfig"
46
47 source "lib/Kconfig"
48
49+source "cluster/Kconfig"
50diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig
51--- linux-2.6.7/cluster/Kconfig 1970-01-01 07:30:00.000000000 +0730
52+++ linux/cluster/Kconfig 2004-06-17 14:55:06.000000000 +0800
53@@ -0,0 +1,13 @@
54+menu "Cluster Support"
55+
56+config CLUSTER
57+ tristate "Cluster support"
58+ ---help---
59+ Enable clustering support. This is not the high-performance clustering
60+ made famous by Beowulf. It is a high-availability cluster often using
61+ shared storage.
62+ The cluster manager is the heart(beat) of the cluster system. It is
63+ needed by all the other components. It provides membership services
64+ for those other subsystems.
65+
66+endmenu
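Since CLUSTER is a tristate, dependent code can probe it at compile time in the usual kbuild way. A minimal sketch, assuming the standard CONFIG_*/CONFIG_*_MODULE macros that kbuild generates (illustrative only, not part of the patch):

/* HAVE_CLUSTER is a hypothetical helper: true for both built-in (=y)
 * and modular (=m) builds of cluster support. */
#if defined(CONFIG_CLUSTER) || defined(CONFIG_CLUSTER_MODULE)
#define HAVE_CLUSTER 1
#else
#define HAVE_CLUSTER 0
#endif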
67diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile
68--- linux-2.6.7/cluster/Makefile 1970-01-01 07:30:00.000000000 +0730
69+++ linux/cluster/Makefile 2004-06-17 14:55:06.000000000 +0800
70@@ -0,0 +1,3 @@
71+obj-y := nocluster.o
72+
73+obj-$(CONFIG_CLUSTER) += cman/
74diff -urN -p linux-2.6.7/cluster/cman/Makefile linux/cluster/cman/Makefile
75--- linux-2.6.7/cluster/cman/Makefile 1970-01-01 07:30:00.000000000 +0730
76+++ linux/cluster/cman/Makefile 2004-06-17 14:55:06.000000000 +0800
77@@ -0,0 +1,6 @@
78+cman-objs := cnxman.o config.o membership.o proc.o\
79+ sm_barrier.o sm_control.o sm_daemon.o sm_joinleave.o\
80+ sm_membership.o sm_message.o sm_misc.o sm_recover.o sm_services.o \
81+ sm_user.o
82+
83+obj-$(CONFIG_CLUSTER) := cman.o
84diff -urN -p linux-2.6.7/cluster/nocluster.c linux/cluster/nocluster.c
85--- linux-2.6.7/cluster/nocluster.c 1970-01-01 07:30:00.000000000 +0730
86+++ linux/cluster/nocluster.c 2004-06-17 14:55:06.000000000 +0800
87@@ -0,0 +1,20 @@
88+/*
89+ * cluster/nocluster.c
90+ *
91+ * Copied from net/nonet.c
92+ * Dummy functions to allow us to configure cluster support entirely
93+ * out of the kernel.
94+ *
95+ * Distributed under the terms of the GNU GPL version 2.
96+ * Copyright (c) Matthew Wilcox 2003
97+ */
98+
99+#include <linux/module.h>
100+#include <linux/errno.h>
101+#include <linux/fs.h>
102+#include <linux/init.h>
103+#include <linux/kernel.h>
104+
105+void __init nocluster_init(void)
106+{
107+}
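Because cluster/Makefile lists nocluster.o under obj-y, this stub is linked even when CONFIG_CLUSTER is off, mirroring net/nonet.c. A minimal caller sketch, assuming a hypothetical init-time hook (the real call site is outside this patch):

/* Hypothetical caller: safe to invoke unconditionally, since the
 * nocluster.c version above compiles to a no-op. */
extern void nocluster_init(void);

static int __init example_init(void)
{
	nocluster_init();	/* no-op when cluster support is configured out */
	return 0;
}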
108diff -urN linux-orig/cluster/cman/cnxman-private.h linux-patched/cluster/cman/cnxman-private.h
109--- linux-orig/cluster/cman/cnxman-private.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 110+++ linux-patched/cluster/cman/cnxman-private.h 2004-06-29 20:07:50.000000000 +0800
4bf12011 111@@ -0,0 +1,427 @@
112+/******************************************************************************
113+*******************************************************************************
114+**
115+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
116+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
117+**
118+** This copyrighted material is made available to anyone wishing to use,
119+** modify, copy, or redistribute it subject to the terms and conditions
120+** of the GNU General Public License v.2.
121+**
122+*******************************************************************************
123+******************************************************************************/
124+
125+#ifndef __CNXMAN_PRIVATE_H
126+#define __CNXMAN_PRIVATE_H
127+
128+/* Version triplet */
129+#define CNXMAN_MAJOR_VERSION 2
130+#define CNXMAN_MINOR_VERSION 0
131+#define CNXMAN_PATCH_VERSION 1
132+
133+#define MAX_RETRIES 3 /* Maximum number of send retries */
134+#define CAP_CLUSTER CAP_SYS_ADMIN /* Capability needed to manage the
135+ * cluster */
136+#ifdef __KERNEL__
137+
138+/* How we announce ourselves in console events */
139+#define CMAN_NAME "CMAN"
140+
141+/* One of these per AF_CLUSTER socket */
142+struct cluster_sock {
143+ /* WARNING: sk has to be the first member */
144+ struct sock sk;
145+
146+ unsigned char port; /* Bound port or zero */
147+ int (*kernel_callback) (char *, int, char *, int, unsigned int);
148+ void *service_data;
149+};
150+
151+#define cluster_sk(__sk) ((struct cluster_sock *)__sk)
152+
153+/* We have one of these for each socket we use for communications */
154+struct cl_comms_socket {
155+ struct socket *sock;
156+ int broadcast; /* This is a broadcast socket */
157+ int recv_only; /* This is the unicast receive end of a
158+ * multicast socket */
159+ struct sockaddr_in6 saddr; /* Socket address, contains the sockaddr for
160+ * the remote end(s) */
161+ int addr_len; /* Length of above */
162+ int number; /* Internal socket number, used to cycle around
163+ * sockets in case of network errors */
164+ struct file *file; /* file pointer for user-passed in sockets */
165+
166+ wait_queue_t wait;
167+
168+ /* The socket list */
169+ struct list_head list;
170+
171+ /* On here when it has something to say */
172+ struct list_head active_list;
173+ unsigned long active;
174+};
175+
176+/* A client socket. We keep a list of these so we can notify clients of cluster
177+ * events */
178+struct cl_client_socket {
179+ struct socket *sock;
180+ struct list_head list;
181+};
182+
183+/* This structure is tacked onto the start of a cluster message packet for our
184+ * own nefarious purposes. */
185+struct cl_protheader {
186+ unsigned char port;
187+ unsigned char flags;
188+ unsigned short cluster; /* Our cluster number, little-endian */
189+ unsigned short seq; /* Packet sequence number, little-endian */
190+ int srcid; /* Node ID of the sender */
191+ int tgtid; /* Node ID of the target or 0 for multicast
192+ * messages */
193+};
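+/* Note how 'flags' is used on receive: it is a single byte holding the
+ * upper bits (16-23) of the 32-bit user-visible MSG_* flags, which is why
+ * the receive paths in cnxman.c test e.g.
+ * (header->flags & (MSG_NOACK >> 16)) rather than MSG_NOACK itself. */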
194+
195+/* A cluster internal protocol message - port number 0 */
196+struct cl_protmsg {
197+ struct cl_protheader header;
198+ unsigned char cmd;
199+};
200+
201+/* A Cluster ACK message */
202+struct cl_ackmsg {
203+ struct cl_protheader header;
204+ unsigned char cmd; /* Always CLUSTER_CMD_ACK */
205+ unsigned char remport; /* Remote port number the original message was
206+ * for */
207+ unsigned char aflags; /* ACK flags 0=OK, 1=No listener */
208+ unsigned char pad;
209+ unsigned short seq; /* Sequence number we are acking */
210+};
211+
212+/* A Cluster LISTENREQ/LISTENRESP message */
213+struct cl_listenmsg {
214+ unsigned char cmd; /* CLUSTER_CMD_LISTENRESP/REQ */
215+ unsigned char target_port; /* Port to probe */
216+ unsigned char listening; /* Always 0 for LISTENREQ */
217+ unsigned char pad;
218+ unsigned short tag; /* PID of remote waiting process */
219+};
220+
221+/* A Cluster PORTCLOSED message */
222+struct cl_closemsg {
223+ unsigned char cmd; /* CLUSTER_CMD_PORTCLOSED */
224+ unsigned char port;
225+};
226+
227+/* Structure of a newly dead node, passed from cnxman to kmembershipd */
228+struct cl_new_dead_node {
229+ struct list_head list;
230+ struct cluster_node *node;
231+};
232+
233+/* Subcommands for BARRIER message */
234+#define BARRIER_REGISTER 1
235+#define BARRIER_CHANGE 2
236+#define BARRIER_WAIT 4
237+#define BARRIER_COMPLETE 5
238+
239+/* A Cluster BARRIER message */
240+struct cl_barriermsg {
241+ unsigned char cmd; /* CLUSTER_CMD_BARRIER */
242+ unsigned char subcmd; /* BARRIER sub command */
243+ unsigned short pad;
244+ unsigned int flags;
245+ unsigned int nodes;
246+ char name[MAX_BARRIER_NAME_LEN];
247+};
248+
249+/* Membership services messages; the cl_protheader is added transparently */
250+struct cl_mem_hello_msg {
251+ unsigned char cmd;
252+ unsigned char flags;
253+ unsigned short members; /* Number of nodes in the cluster,
254+ * little-endian */
255+ unsigned int generation; /* Current cluster generation number */
256+};
257+
258+struct cl_mem_endtrans_msg {
259+ unsigned char cmd;
260+ unsigned char pad1;
261+ unsigned short pad2;
262+ unsigned int quorum;
263+ unsigned int total_votes;
264+ unsigned int generation; /* Current cluster generation number */
265+ unsigned int new_node_id; /* If reason is a new node joining */
266+};
267+
268+/* ACK types for JOINACK message */
269+#define JOINACK_TYPE_OK 1 /* You can join */
270+#define JOINACK_TYPE_NAK 2 /* You can NOT join */
271+#define JOINACK_TYPE_WAIT 3 /* Wait a bit longer - cluster is in transition
272+ * already */
273+
274+struct cl_mem_joinack_msg {
275+ unsigned char cmd;
276+ unsigned char acktype;
277+};
278+
279+/* This is used by JOINREQ message */
280+struct cl_mem_join_msg {
281+ unsigned char cmd;
282+ unsigned char votes;
283+ unsigned short num_addr; /* Number of addresses for this node */
284+ unsigned int expected_votes;
285+ unsigned int members; /* Number of nodes in the cluster,
286+ * little-endian */
287+ unsigned int major_version; /* Not backwards compatible */
288+ unsigned int minor_version; /* Backwards compatible */
289+ unsigned int patch_version; /* Backwards/forwards compatible */
290+ unsigned int config_version;
291+ unsigned int addr_len; /* length of node addresses */
292+ char clustername[16];
293+ /* Followed by <num_addr> addresses of `address_length` bytes and a
294+ * NUL-terminated node name */
295+};
296+
297+/* State transition start reasons: */
298+#define TRANS_NEWNODE 1 /* A new node is joining the cluster */
299+#define TRANS_REMNODE 2 /* a node has left the cluster */
300+#define TRANS_ANOTHERREMNODE 3 /* A node left the cluster while we were in
301+ * transition */
302+#define TRANS_NEWMASTER 4 /* We have had an election and I am the new
303+ * master */
304+#define TRANS_CHECK 5 /* A consistency check was called for */
305+#define TRANS_RESTART 6 /* Transition restarted because of a previous
306+ * timeout */
307+#define TRANS_DEADMASTER 7 /* The master died during transition and I have
308+ * taken over */
309+
310+/* This is used to start a state transition */
311+struct cl_mem_starttrans_msg {
312+ unsigned char cmd;
313+ unsigned char reason; /* Why a start transition was started - see
314+ * above */
315+ unsigned char flags;
316+ unsigned char votes;
317+ unsigned int expected_votes;
318+ unsigned int generation; /* Incremented for each STARTTRANS sent
319+ */
320+ int nodeid; /* Node to be removed */
321+ unsigned short num_addrs;
322+ /* If reason == TRANS_NEWNODE: Followed by <num_addr> addresses of
323+ * `address_length` bytes and a NUL-terminated node name */
324+};
325+
326+struct cl_mem_startack_msg {
327+ unsigned char cmd;
328+ unsigned char reason;
329+ unsigned short pad;
330+ unsigned int generation;
331+ unsigned int node_id; /* node_id we think new node should have */
332+ unsigned int highest_node_id; /* highest node_id on this system */
333+};
334+
335+/* Reconfigure a cluster parameter */
336+struct cl_mem_reconfig_msg {
337+ unsigned char cmd;
338+ unsigned char param;
339+ unsigned short pad;
340+ unsigned int value;
341+};
342+
343+/* Structure containing information about an outstanding listen request */
344+struct cl_waiting_listen_request {
345+ wait_queue_head_t waitq;
346+ int result;
347+ int waiting;
348+ unsigned short tag;
349+ int nodeid;
350+ struct list_head list;
351+};
352+
353+/* Messages from membership services */
354+#define CLUSTER_MEM_JOINCONF 1
355+#define CLUSTER_MEM_JOINREQ 2
356+#define CLUSTER_MEM_LEAVE 3
357+#define CLUSTER_MEM_HELLO 4
358+#define CLUSTER_MEM_KILL 5
359+#define CLUSTER_MEM_JOINACK 6
360+#define CLUSTER_MEM_ENDTRANS 7
361+#define CLUSTER_MEM_RECONFIG 8
362+#define CLUSTER_MEM_MASTERVIEW 9
363+#define CLUSTER_MEM_STARTTRANS 10
364+#define CLUSTER_MEM_JOINREJ 11
365+#define CLUSTER_MEM_VIEWACK 12
366+#define CLUSTER_MEM_STARTACK 13
367+#define CLUSTER_MEM_TRANSITION 14
368+#define CLUSTER_MEM_NEWCLUSTER 15
369+#define CLUSTER_MEM_CONFACK 16
370+#define CLUSTER_MEM_NOMINATE 17
371+
372+/* Parameters for RECONFIG command */
373+#define RECONFIG_PARAM_EXPECTED_VOTES 1
374+#define RECONFIG_PARAM_NODE_VOTES 2
375+#define RECONFIG_PARAM_CONFIG_VERSION 3
376+
377+/* Data associated with an outgoing socket */
378+struct cl_socket {
379+ struct file *file; /* The real file */
380+ struct socket *socket; /* The real sock */
381+ struct cl_multicast_sock multicast_info;
382+ int num_nodes; /* On this link */
383+ int retransmit_count;
384+};
385+
386+/* There's one of these for each node in the cluster */
387+struct cluster_node {
388+ struct list_head list;
389+ char *name; /* Node/host name of node */
390+ struct list_head addr_list;
391+ int us; /* This node is us */
392+ unsigned int node_id; /* Unique node ID */
393+ nodestate_t state;
394+ unsigned short last_seq_recv;
395+ unsigned short last_seq_acked;
396+ unsigned short last_seq_sent;
397+ unsigned int votes;
398+ unsigned int expected_votes;
399+ unsigned int leave_reason;
400+ unsigned int incarnation; /* Incremented each time a node joins
401+ * the cluster */
402+ unsigned long last_hello; /* Jiffies */
403+};
404+
405+/* This is how we keep a list of user processes that are listening for cluster
406+ * membership events */
407+struct notify_struct {
408+ struct list_head list;
409+ pid_t pid;
410+ int signal;
411+};
412+
413+/* This is how we keep a list of kernel callbacks that are registered for
414+ * cluster membership events */
415+struct kernel_notify_struct {
416+ struct list_head list;
417+ void (*callback) (kcl_callback_reason, long arg);
418+};
419+
420+/* A message waiting to be sent */
421+struct queued_message {
422+ struct list_head list;
423+
424+ struct socket *socket;
425+ struct sockaddr_cl addr;
426+ int addr_len;
427+ int msg_len;
428+ unsigned char port;
429+ unsigned int flags;
430+ char msg_buffer[MAX_CLUSTER_MESSAGE];
431+};
432+
433+/* A barrier */
434+struct cl_barrier {
435+ struct list_head list;
436+
437+ char name[MAX_BARRIER_NAME_LEN];
438+ unsigned int flags;
439+ enum { BARRIER_STATE_WAITING, BARRIER_STATE_INACTIVE,
440+ BARRIER_STATE_COMPLETE } state;
441+ unsigned int expected_nodes;
442+ unsigned int registered_nodes;
443+ atomic_t got_nodes;
444+ atomic_t completed_nodes;
445+ unsigned int inuse;
446+ unsigned int waitsent;
447+ unsigned int phase; /* Completion phase */
448+ unsigned int endreason; /* Reason we were woken, usually 0 */
449+ unsigned long timeout; /* In seconds */
450+
451+ void (*callback) (char *name, int status);
452+ wait_queue_head_t waitq;
453+ struct semaphore lock; /* To synch with cnxman messages */
454+ spinlock_t phase2_spinlock; /* Need to synchronise with timer
455+ * interrupts */
456+ struct timer_list timer;
457+};
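+/* Lifecycle sketch (illustrative; function names taken from the
+ * BARRIER_IOCTL_* dispatch in cnxman.c, blocking semantics assumed from
+ * the state enum above):
+ *
+ * kcl_barrier_register(name, flags, nodes); -> barrier created (INACTIVE)
+ * kcl_barrier_wait(name); -> WAITING until the expected
+ * nodes arrive, then COMPLETE
+ * kcl_barrier_delete(name); -> barrier removed
+ */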
458+
459+/* Cluster protocol commands sent to port 0 */
460+#define CLUSTER_CMD_ACK 1
461+#define CLUSTER_CMD_LISTENREQ 2
462+#define CLUSTER_CMD_LISTENRESP 3
463+#define CLUSTER_CMD_PORTCLOSED 4
464+#define CLUSTER_CMD_BARRIER 5
465+
466+extern struct cluster_node *find_node_by_addr(unsigned char *addr,
467+ int addr_len);
468+extern struct cluster_node *find_node_by_nodeid(unsigned int id);
469+extern struct cluster_node *find_node_by_name(char *name);
470+extern void set_quorate(int);
471+extern void notify_kernel_listeners(kcl_callback_reason reason, long arg);
472+extern void notify_listeners(void);
473+extern void free_nodeid_array(void);
474+extern int send_reconfigure(int param, unsigned int value);
475+extern int calculate_quorum(int, int, int *);
476+extern void recalculate_quorum(int);
477+extern int send_leave(unsigned char);
478+extern int get_quorum(void);
479+extern void set_votes(int, int);
480+extern void kcl_wait_for_all_acks(void);
481+extern char *membership_state(char *, int);
482+extern void a_node_just_died(struct cluster_node *node);
483+extern void check_barrier_returns(void);
484+extern int in_transition(void);
485+extern void get_local_addresses(struct cluster_node *node);
486+extern int add_node_address(struct cluster_node *node, unsigned char *addr, int len);
487+extern void create_proc_entries(void);
488+extern void cleanup_proc_entries(void);
489+extern unsigned int get_highest_nodeid(void);
490+extern int allocate_nodeid_array(void);
491+extern void queue_oob_skb(struct socket *sock, int cmd);
492+extern int new_temp_nodeid(char *addr, int addrlen);
493+extern int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen);
494+extern void remove_temp_nodeid(int nodeid);
495+extern inline char *print_addr(unsigned char *addr, int len, char *buf)
496+{
497+ int i;
498+ int ptr = 0;
499+
500+ for (i = 0; i < len; i++)
501+ ptr += sprintf(buf + ptr, "%02x ", addr[i]);
502+
503+ return buf;
504+}
505+
506+#define MAX_ADDR_PRINTED_LEN (address_length*3 + 1)
507+
508+/* Debug enabling macros. Sorry about the C++ comments but they're easier to
509+ * get rid of than C ones... */
510+
511+// #define DEBUG_MEMB
512+// #define DEBUG_COMMS
513+// #define DEBUG_BARRIER
514+
515+/* Debug macros */
516+#ifdef DEBUG_COMMS
517+#define P_COMMS(fmt, args...) printk(KERN_DEBUG "cman comms: " fmt, ## args)
518+#else
519+#define P_COMMS(fmt, args...)
520+#endif
521+
522+#ifdef DEBUG_BARRIER
523+#define P_BARRIER(fmt, args...) printk(KERN_DEBUG "cman barrier: " fmt, ## args)
524+#else
525+#define P_BARRIER(fmt, args...)
526+#endif
527+
528+#ifdef DEBUG_MEMB
529+#define P_MEMB(fmt, args...) printk(KERN_DEBUG "cman memb: " fmt, ## args)
530+#define C_MEMB(fmt, args...) printk(fmt, ## args)
531+#else
532+#define P_MEMB(fmt, args...)
533+#define C_MEMB(fmt, args...)
534+#endif
535+
536+#endif /* __KERNEL__ */
537+
538+#endif
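The cluster_sk() cast above works because struct sock is the first member of struct cluster_sock, the usual kernel pattern for per-protocol socket state; cl_alloc_sock() in cnxman.c below allocates sizeof(struct cluster_sock), so the extra fields really exist behind every AF_CLUSTER sock. A minimal sketch of using the cast (example_port_of is hypothetical):

/* Recover per-cluster state from a plain struct sock, as the receive
 * path does with port_array[] entries. The cast is valid only because
 * 'sk' is the first member of struct cluster_sock. */
static unsigned char example_port_of(struct sock *sk)
{
	struct cluster_sock *c = cluster_sk(sk);

	return c->port;		/* bound port, or zero */
}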
539diff -urN linux-orig/cluster/cman/cnxman.c linux-patched/cluster/cman/cnxman.c
540--- linux-orig/cluster/cman/cnxman.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 541+++ linux-patched/cluster/cman/cnxman.c 2004-06-29 20:07:50.000000000 +0800
4bf12011 542@@ -0,0 +1,4080 @@
543+/******************************************************************************
544+*******************************************************************************
545+**
546+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
547+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
548+**
549+** This copyrighted material is made available to anyone wishing to use,
550+** modify, copy, or redistribute it subject to the terms and conditions
551+** of the GNU General Public License v.2.
552+**
553+*******************************************************************************
554+******************************************************************************/
555+
556+#define EXPORT_SYMTAB
557+#include <linux/init.h>
558+#include <linux/socket.h>
559+#include <linux/kernel.h>
560+#include <linux/sched.h>
561+#include <linux/file.h>
562+#include <linux/utsname.h>
563+#include <net/sock.h>
564+#include <linux/proc_fs.h>
565+#include <linux/poll.h>
566+#include <linux/module.h>
567+#include <linux/list.h>
568+#include <cluster/cnxman.h>
569+#include <cluster/service.h>
570+
571+#include "cnxman-private.h"
572+#include "sm_control.h"
573+#include "sm_user.h"
574+#include "config.h"
575+
576+#define CMAN_RELEASE_NAME "<CVS>"
577+
578+static int __cl_setsockopt(struct socket *sock, int level, int optname,
579+ char *optval, int optlen, int flags);
580+static int __cl_getsockopt(struct socket *sock, int level, int optname,
581+ char *optval, int *optlen, int flags);
582+static void send_to_userport(struct cl_comms_socket *csock, char *data, int len,
583+ char *addr, int addrlen);
584+static int cl_sendack(struct cl_comms_socket *sock, unsigned short seq,
585+ int addr_len, char *addr, unsigned char remport,
586+ unsigned char flag);
587+static void send_listen_request(int nodeid, unsigned char port);
588+static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
589+ unsigned char port, unsigned short tag);
590+static void resend_last_message(void);
591+static void start_ack_timer(void);
592+static int send_queued_message(struct queued_message *qmsg);
593+static void send_port_close_oob(unsigned char port);
594+static void post_close_oob(unsigned char port, int nodeid);
595+static void process_barrier_msg(struct cl_barriermsg *msg,
596+ struct cluster_node *node);
597+static struct cl_barrier *find_barrier(char *name);
598+static void node_shutdown(void);
599+static void node_cleanup(void);
600+static int send_or_queue_message(void *buf, int len, struct sockaddr_cl *caddr,
601+ unsigned char port);
602+static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur);
603+static void check_for_unacked_nodes(void);
604+static void free_cluster_sockets(void);
605+static uint16_t generate_cluster_id(char *name);
606+
607+static int is_valid_temp_nodeid(int nodeid);
608+
609+extern int start_membership_services(pid_t);
610+extern int kcl_leave_cluster(int remove);
611+extern int send_kill(int nodeid);
612+
613+static struct proto_ops cl_proto_ops;
614+static struct sock *master_sock;
615+static kmem_cache_t *cluster_sk_cachep;
616+
617+/* Pointer to the pseudo node that maintains quorum in a 2node system */
618+struct cluster_node *quorum_device = NULL;
619+
620+/* Array of "ports" allocated. This is just a list of pointers to the sock that
621+ * has this port bound. Speed is a major issue here so 1-2K of allocated
622+ * storage is worth sacrificing. Port 0 is reserved for protocol messages */
623+static struct sock *port_array[256];
624+static struct semaphore port_array_lock;
625+
626+/* Our cluster name & number */
627+unsigned short cluster_id;
628+char cluster_name[MAX_CLUSTER_NAME_LEN+1];
629+
630+/* Two-node mode: causes cluster to remain quorate if one of two nodes fails.
631+ * No more than two nodes are permitted to join the cluster. */
632+unsigned short two_node;
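+/* Worked example (illustrative): two nodes with one vote each give
+ * total_votes = 2 and a normal quorum of 2 (a strict majority), so losing
+ * either node would lose quorum; two_node mode special-cases this so the
+ * surviving node remains quorate. */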
633+
634+/* Cluster configuration version that must be the same among members. */
635+unsigned int config_version;
636+
637+/* Reference counting for cluster applications */
638+atomic_t use_count;
639+
640+/* Length of sockaddr address for our comms protocol */
641+unsigned int address_length;
642+
643+/* Message sending */
644+static unsigned short cur_seq; /* Last message sent */
645+static unsigned int ack_count; /* Number of acks received for message
646+ * 'cur_seq' */
647+static unsigned int acks_expected; /* Number of acks we expect to receive */
648+static struct semaphore send_lock;
649+static struct timer_list ack_timer;
650+
651+/* Saved packet information in case we need to resend it */
652+static char saved_msg_buffer[MAX_CLUSTER_MESSAGE];
653+static int saved_msg_len;
654+static int retry_count;
655+
656+/* Task variables */
657+static pid_t kcluster_pid;
658+static pid_t membership_pid;
659+extern int quit_threads;
660+
661+wait_queue_head_t cnxman_waitq;
662+
663+/* Variables owned by membership services */
664+extern int cluster_members;
665+extern struct list_head cluster_members_list;
666+extern struct semaphore cluster_members_lock;
667+extern int we_are_a_cluster_member;
668+extern int cluster_is_quorate;
669+extern struct cluster_node *us;
670+extern struct list_head new_dead_node_list;
671+extern struct semaphore new_dead_node_lock;
672+extern char nodename[];
673+
674+/* A list of processes listening for membership events */
675+static struct list_head event_listener_list;
676+static struct semaphore event_listener_lock;
677+
678+/* A list of kernel callbacks listening for membership events */
679+static struct list_head kernel_listener_list;
680+static struct semaphore kernel_listener_lock;
681+
682+/* A list of sockets we are listening on (and can transmit on...later) */
683+static struct list_head socket_list;
684+
685+/* A list of all open cluster client sockets */
686+static struct list_head client_socket_list;
687+static struct semaphore client_socket_lock;
688+
689+/* A list of all current barriers */
690+static struct list_head barrier_list;
691+static struct semaphore barrier_list_lock;
692+
693+/* When a socket is ready for reading it goes on this queue */
694+static spinlock_t active_socket_lock;
695+static struct list_head active_socket_list;
696+
697+/* If the cnxman process is running and available for work */
698+atomic_t cnxman_running;
699+
700+/* Flags set by timers etc for the mainloop to detect and act upon */
701+static unsigned long mainloop_flags;
702+
703+#define ACK_TIMEOUT 1
704+#define RESEND_NEEDED 2
705+
706+/* A queue of messages waiting to be sent. If kcl_sendmsg is called outside of
707+ * process context then the messages get put in here */
708+static struct list_head messages_list;
709+static struct semaphore messages_list_lock;
710+
711+static struct semaphore start_thread_sem;
712+
713+/* List of outstanding ISLISTENING requests */
714+static struct list_head listenreq_list;
715+static struct semaphore listenreq_lock;
716+
717+/* Any sending requests wait on this queue if necessary (eg inquorate, waiting
718+ * ACK) */
719+static DECLARE_WAIT_QUEUE_HEAD(socket_waitq);
720+
721+/* Wait for thread to exit properly */
722+struct completion cluster_thread_comp;
723+struct completion member_thread_comp;
724+
725+/* The resend delay to use. We increase this geometrically each time a
726+ * send is delayed. In seconds (start_short_timer multiplies by HZ) */
727+static int resend_delay = 1;
728+
729+/* Highest numbered interface and the current default */
730+static int num_interfaces = 0;
731+static struct cl_comms_socket *current_interface = NULL;
732+
733+struct temp_node
734+{
735+ int nodeid;
736+ char addr[sizeof(struct sockaddr_in6)];
737+ int addrlen;
738+ struct list_head list;
739+};
740+static struct list_head tempnode_list;
741+static struct semaphore tempnode_lock;
742+
743+/* Wake up any processes that are waiting to send. This is usually called when
744+ * all the ACKs have been gathered up or when a node has left the cluster
745+ * unexpectedly and we reckon there are no more acks to collect */
746+static void unjam(void)
747+{
748+ wake_up_interruptible(&socket_waitq);
749+ wake_up_interruptible(&cnxman_waitq);
750+}
751+
752+/* Used by the data_ready routine to locate a connection given the socket */
753+static inline struct cl_comms_socket *find_comms_by_sock(struct sock *sk)
754+{
755+ struct list_head *conlist;
756+
757+ list_for_each(conlist, &socket_list) {
758+ struct cl_comms_socket *clsock =
759+ list_entry(conlist, struct cl_comms_socket, list);
760+ if (clsock->sock->sk == sk) {
761+ return clsock;
762+ }
763+ }
764+ return NULL;
765+}
766+
767+/* Data available on socket */
768+static void cnxman_data_ready(struct sock *sk, int count_unused)
769+{
770+ struct cl_comms_socket *clsock = find_comms_by_sock(sk);
771+
772+ if (clsock == NULL) /* ASSERT ?? */
773+ return;
774+
775+ /* If we're already on the list then don't do it again */
776+ if (test_and_set_bit(1, &clsock->active))
777+ return;
778+
779+ spin_lock_irq(&active_socket_lock);
780+ list_add(&clsock->active_list, &active_socket_list);
781+ spin_unlock_irq(&active_socket_lock);
782+
783+ wake_up_interruptible(&cnxman_waitq);
784+}
785+
786+static int receive_message(struct cl_comms_socket *csock, char *iobuf)
787+{
788+ struct msghdr msg;
789+ struct iovec iov;
790+ struct sockaddr_in6 sin;
791+ int len;
792+ mm_segment_t fs;
793+
794+ memset(&sin, 0, sizeof (sin));
795+
796+ msg.msg_control = NULL;
797+ msg.msg_controllen = 0;
798+ msg.msg_iovlen = 1;
799+ msg.msg_iov = &iov;
800+ msg.msg_name = &sin;
801+ msg.msg_namelen = sizeof (sin);
802+ msg.msg_flags = 0;
803+
804+ iov.iov_len = MAX_CLUSTER_MESSAGE;
805+ iov.iov_base = iobuf;
806+
807+ fs = get_fs();
808+ set_fs(get_ds());
809+
810+ len = sock_recvmsg(csock->sock, &msg, MAX_CLUSTER_MESSAGE, MSG_DONTWAIT);
811+ set_fs(fs);
812+
813+ if (len > 0) {
814+ if (len > MAX_CLUSTER_MESSAGE) {
815+ printk(KERN_CRIT CMAN_NAME
816+ ": %d byte message far too big\n", len);
817+ return 0;
818+ }
819+ send_to_userport(csock, iobuf, len, msg.msg_name, msg.msg_namelen);
820+ }
821+ else {
822+ if (len != -EAGAIN)
823+ printk(KERN_CRIT CMAN_NAME ": recvmsg failed: %d\n",
824+ len);
825+ }
826+ return len;
827+}
828+
829+static int cluster_kthread(void *unused)
830+{
831+ int len;
832+ char *iobuf;
833+ struct list_head *socklist;
834+ struct cl_comms_socket *csock;
835+ wait_queue_t cnxman_waitq_head;
836+ sigset_t tmpsig;
837+
838+ daemonize("cman_comms");
839+
840+ /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
841+ siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
842+ sigprocmask(SIG_BLOCK, &tmpsig, NULL);
843+
844+ /* This is the waitq we can wake the process up with */
845+ init_waitqueue_head(&cnxman_waitq);
846+ init_waitqueue_entry(&cnxman_waitq_head, current);
847+ add_wait_queue(&cnxman_waitq, &cnxman_waitq_head);
848+
849+ set_user_nice(current, -6);
850+
851+ /* Allow the sockets to start receiving */
852+ list_for_each(socklist, &socket_list) {
853+ csock = list_entry(socklist, struct cl_comms_socket, list);
854+
855+ clear_bit(1, &csock->active);
856+ }
857+
858+ iobuf = kmalloc(MAX_CLUSTER_MESSAGE, GFP_KERNEL);
859+ if (!iobuf) {
860+ printk(KERN_CRIT CMAN_NAME
861+ ": Cannot allocate receive buffer for cluster comms\n");
862+ return -1;
863+ }
864+
865+ complete(&cluster_thread_comp);
866+
867+ for (;;) {
868+ struct list_head *temp;
869+
870+ /* Wait for activity on any of the sockets */
871+ set_task_state(current, TASK_INTERRUPTIBLE);
872+
873+ if (list_empty(&active_socket_list))
874+ schedule();
875+ set_task_state(current, TASK_RUNNING);
876+
877+ if (quit_threads)
878+ break;
879+
880+ if (test_and_clear_bit(ACK_TIMEOUT, &mainloop_flags)) {
881+ check_for_unacked_nodes();
882+ }
883+
884+ /* Now receive any messages waiting for us */
885+ spin_lock_irq(&active_socket_lock);
886+ list_for_each_safe(socklist, temp, &active_socket_list) {
887+ csock =
888+ list_entry(socklist, struct cl_comms_socket,
889+ active_list);
890+
891+ list_del(&csock->active_list);
892+ clear_bit(1, &csock->active);
893+
894+ spin_unlock_irq(&active_socket_lock);
895+
896+ do {
897+ len = receive_message(csock, iobuf);
898+ }
899+ while (len > 0);
900+
901+ spin_lock_irq(&active_socket_lock);
902+
903+ if (len == 0)
904+ break; /* EOF on socket */
905+ }
906+ spin_unlock_irq(&active_socket_lock);
907+
908+ /* Resend any unacked messages */
909+ if (test_and_clear_bit(RESEND_NEEDED, &mainloop_flags)
910+ && acks_expected) {
911+ resend_last_message();
912+ }
913+
914+ /* Send any queued messages */
915+ if (acks_expected == 0) {
916+ struct list_head *temp;
917+ struct list_head *msglist;
918+
919+ down(&messages_list_lock);
920+ list_for_each_safe(msglist, temp, &messages_list) {
921+ struct queued_message *qmsg =
922+ list_entry(msglist, struct queued_message,
923+ list);
924+ int status = send_queued_message(qmsg);
925+
926+ if (status >= 0) {
927+ /* Succeeded, remove it from the queue */
928+ list_del(&qmsg->list);
929+ kfree(qmsg);
930+ }
931+ /* Did it fail horribly ?? */
932+ if (status < 0 && status != -EAGAIN) {
933+ printk(KERN_INFO CMAN_NAME
934+ ": send_queued_message failed, error %d\n",
935+ status);
936+ list_del(&qmsg->list);
937+ kfree(qmsg);
938+ }
939+ break; /* Only send one message at a time */
940+ }
941+ up(&messages_list_lock);
942+ }
943+
944+ if (signal_pending(current))
945+ break;
946+ }
947+ P_COMMS("closing down\n");
948+
949+ if (we_are_a_cluster_member)
950+ send_leave(us->leave_reason);
951+
952+ kfree(iobuf);
953+ quit_threads = 1; /* force other thread to die too */
954+ node_shutdown();
955+
956+ if (timer_pending(&ack_timer))
957+ del_timer(&ack_timer);
958+
959+ /* Wait for membership thread to die */
960+ wait_for_completion(&member_thread_comp);
961+
962+ node_cleanup();
963+
964+ complete(&cluster_thread_comp);
965+ return 0;
966+}
967+
968+void notify_kernel_listeners(kcl_callback_reason reason, long arg)
969+{
970+ struct kernel_notify_struct *knotify;
971+ struct list_head *proclist;
972+
973+ down(&kernel_listener_lock);
974+ list_for_each(proclist, &kernel_listener_list) {
975+ knotify =
976+ list_entry(proclist, struct kernel_notify_struct, list);
977+ knotify->callback(reason, arg);
978+ }
979+ up(&kernel_listener_lock);
980+}
981+
982+static void check_for_unacked_nodes(void)
983+{
984+ struct list_head *nodelist;
985+ struct cluster_node *node;
986+
987+ clear_bit(RESEND_NEEDED, &mainloop_flags);
988+ retry_count = 0;
989+
990+ P_COMMS("Retry count exceeded -- looking for dead node\n");
991+
992+ /* Node did not ACK a message after <n> tries, remove it from the
993+ * cluster */
994+ down(&cluster_members_lock);
995+ list_for_each(nodelist, &cluster_members_list) {
996+ node = list_entry(nodelist, struct cluster_node, list);
997+
998+ P_COMMS
999+ ("checking node %s: last_acked = %d, last_seq_sent = %d\n",
1000+ node->name, node->last_seq_acked, node->last_seq_sent);
1001+ if (node->state != NODESTATE_DEAD
1002+ && node->last_seq_acked != node->last_seq_sent && !node->us) {
1003+ printk(KERN_WARNING CMAN_NAME
1004+ ": node %s is not responding - removing from the cluster\n",
1005+ node->name);
1006+
1007+ /* Start a state transition */
1008+ a_node_just_died(node);
1009+ }
1010+ }
1011+ up(&cluster_members_lock);
1012+ acks_expected = ack_count = 0;
1013+ unjam();
1014+ return;
1015+}
1016+
1017+static void ack_timer_fn(unsigned long arg)
1018+{
1019+ P_COMMS("%ld: ack_timer fired, retries=%d\n", jiffies, retry_count);
1020+
1021+ /* Too many retries ? */
1022+ if (++retry_count > MAX_RETRIES) {
1023+ set_bit(ACK_TIMEOUT, &mainloop_flags);
1024+ wake_up_interruptible(&cnxman_waitq);
1025+ }
1026+ else {
1027+ /* Resend last message */
1028+ set_bit(RESEND_NEEDED, &mainloop_flags);
1029+ wake_up_interruptible(&cnxman_waitq);
1030+ }
1031+}
1032+
1033+/* Called to resend a packet if sock_sendmsg was busy */
1034+static void short_timer_fn(unsigned long arg)
1035+{
1036+ P_COMMS("short_timer fired\n");
1037+
1038+ /* Resend last message */
1039+ resend_delay <<= 1;
1040+ set_bit(RESEND_NEEDED, &mainloop_flags);
1041+ wake_up_interruptible(&cnxman_waitq);
1042+}
1043+
1044+static void start_ack_timer(void)
1045+{
1046+ ack_timer.function = ack_timer_fn;
1047+ ack_timer.data = 0L;
1048+ mod_timer(&ack_timer, jiffies + HZ);
1049+}
1050+
1051+static void start_short_timer(void)
1052+{
1053+ ack_timer.function = short_timer_fn;
1054+ ack_timer.data = 0L;
1055+ mod_timer(&ack_timer, jiffies + (resend_delay * HZ));
1056+}
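+/* Timing sketch, derived from the code above: start_ack_timer() arms a
+ * one-second timer, and each expiry without a complete set of ACKs
+ * triggers a resend, so after MAX_RETRIES (3) resends -- roughly four
+ * seconds -- ACK_TIMEOUT is set and check_for_unacked_nodes() removes the
+ * silent node. start_short_timer() instead doubles resend_delay on each
+ * use, backing off geometrically. */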
1057+
1058+
1059+static struct cl_waiting_listen_request *find_listen_request(unsigned short tag)
1060+{
1061+ struct list_head *llist;
1062+ struct cl_waiting_listen_request *listener;
1063+
1064+ down(&listenreq_lock);
1065+ list_for_each(llist, &listenreq_list) {
1066+ listener =
1067+ list_entry(llist, struct cl_waiting_listen_request, list);
1068+ if (listener->tag == tag) {
1069+ up(&listenreq_lock);
1070+ return listener;
1071+ }
1072+ }
1073+ up(&listenreq_lock);
1074+ return NULL;
1075+}
1076+
1077+static void process_cnxman_message(struct cl_comms_socket *csock, char *data,
1078+ int len, char *addr, int addrlen,
1079+ struct cluster_node *rem_node)
1080+{
1081+ struct cl_protmsg *msg = (struct cl_protmsg *) data;
1082+ struct cl_protheader *header = (struct cl_protheader *) data;
1083+ struct cl_ackmsg *ackmsg;
1084+ struct cl_listenmsg *listenmsg;
1085+ struct cl_closemsg *closemsg;
1086+ struct cl_barriermsg *barriermsg;
1087+ struct cl_waiting_listen_request *listen_request;
1088+
1089+ P_COMMS("Message on port 0 is %d\n", msg->cmd);
1090+ switch (msg->cmd) {
1091+ case CLUSTER_CMD_ACK:
1092+ ackmsg = (struct cl_ackmsg *) data;
1093+
1094+ if (ackmsg->aflags & 1) {
1095+ if (net_ratelimit())
1096+ printk(KERN_INFO CMAN_NAME
1097+ ": WARNING no listener for port %d on node %s\n",
1098+ ackmsg->remport, rem_node ? rem_node->name : "Unknown");
1099+ }
1100+ P_COMMS("Got ACK from %s. seq=%d (cur=%d)\n",
1101+ rem_node ? rem_node->name : "Unknown",
1102+ le16_to_cpu(ackmsg->seq), cur_seq);
1103+
1104+ if (rem_node && rem_node->state != NODESTATE_DEAD) {
1105+ /* This copes with duplicate acks from a multipathed
1106+ * host */
1107+ if (rem_node->last_seq_acked !=
1108+ le16_to_cpu(ackmsg->seq)) {
1109+ rem_node->last_seq_acked =
1110+ le16_to_cpu(ackmsg->seq);
1111+
1112+ /* Got em all */
1113+ if (++ack_count >= acks_expected) {
1114+
1115+ /* Cancel the timer */
1116+ del_timer(&ack_timer);
1117+ acks_expected = 0;
1118+ unjam();
1119+ }
1120+ }
1121+ }
1122+ else {
1123+ if (cluster_members) {
1124+#ifdef DEBUG_COMMS
1125+ char buf[MAX_ADDR_PRINTED_LEN];
1126+
1127+ printk(KERN_INFO CMAN_NAME
1128+ ": got ack from unknown or dead node: %s\n",
1129+ print_addr(addr, addrlen, buf));
1130+#endif
1131+ }
1132+ }
1133+ break;
1134+
1135+ /* Return 1 if we have a listener on this port, 0 if not */
1136+ case CLUSTER_CMD_LISTENREQ:
1137+ listenmsg =
1138+ (struct cl_listenmsg *) (data +
1139+ sizeof (struct cl_protheader));
1140+ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
1141+ send_listen_response(csock, le32_to_cpu(header->srcid),
1142+ listenmsg->target_port, listenmsg->tag);
1143+ break;
1144+
1145+ case CLUSTER_CMD_LISTENRESP:
1146+ /* Wake up process waiting for listen response */
1147+ listenmsg =
1148+ (struct cl_listenmsg *) (data +
1149+ sizeof (struct cl_protheader));
1150+ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
1151+ listen_request = find_listen_request(listenmsg->tag);
1152+ if (listen_request) {
1153+ listen_request->result = listenmsg->listening;
1154+ listen_request->waiting = 0;
1155+ wake_up_interruptible(&listen_request->waitq);
1156+ }
1157+ break;
1158+
1159+ case CLUSTER_CMD_PORTCLOSED:
1160+ closemsg =
1161+ (struct cl_closemsg *) (data +
1162+ sizeof (struct cl_protheader));
1163+ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
1164+ post_close_oob(closemsg->port, le32_to_cpu(header->srcid));
1165+ break;
1166+
1167+ case CLUSTER_CMD_BARRIER:
1168+ barriermsg =
1169+ (struct cl_barriermsg *) (data +
1170+ sizeof (struct cl_protheader));
1171+ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
1172+ process_barrier_msg(barriermsg, rem_node);
1173+ break;
1174+
1175+ default:
1176+ printk(KERN_ERR CMAN_NAME
1177+ ": Unknown protocol message %d received\n", msg->cmd);
1178+ break;
1179+
1180+ }
1181+ return;
1182+}
1183+
1184+static void send_to_userport(struct cl_comms_socket *csock, char *data, int len,
1185+ char *addr, int addrlen)
1186+{
1187+ int err;
1188+ struct cl_protheader *header = (struct cl_protheader *) data;
1189+ struct cluster_node *rem_node =
1190+ find_node_by_nodeid(le32_to_cpu(header->srcid));
1191+ struct sk_buff *skb = NULL;
1192+
1193+ P_COMMS
1194+ ("seen message, from %d for %d, sequence num = %d, rem_node=%p, state=%d\n",
1195+ le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
1196+ le16_to_cpu(header->seq), rem_node,
1197+ rem_node ? rem_node->state : -1);
1198+
1199+ /* If the remote end is being coy about its node ID then look it up by
1200+ * address */
1201+ if (!rem_node && header->srcid == 0) {
1202+ rem_node = find_node_by_addr(addr, addrlen);
1203+ }
1204+
1205+ /* If this node is an ex-member then treat it as unknown */
1206+ if (rem_node && rem_node->state != NODESTATE_MEMBER
1207+ && rem_node->state != NODESTATE_JOINING)
1208+ rem_node = NULL;
1209+
1210+ /* Ignore messages not for our cluster */
1211+ if (le16_to_cpu(header->cluster) != cluster_id) {
1212+ P_COMMS("Dumping message - wrong cluster ID (us=%d, msg=%d)\n",
1213+ cluster_id, header->cluster);
1214+ goto userport_finish;
1215+ }
1216+
1217+ /* If the message is from us then just dump it */
1218+ if (rem_node && rem_node->us)
1219+ goto userport_finish;
1220+
1221+ /* If we can't find the nodeid then check for our own messages the hard
1222+ * way - this only happens during joining */
1223+ if (!rem_node) {
1224+ struct list_head *socklist;
1225+ struct cl_comms_socket *clsock;
1226+
1227+ list_for_each(socklist, &socket_list) {
1228+ clsock =
1229+ list_entry(socklist, struct cl_comms_socket, list);
1230+
1231+ if (clsock->recv_only) {
1232+
1233+ if (memcmp(addr, &clsock->saddr, address_length) == 0) {
1234+ goto userport_finish;
1235+ }
1236+ }
1237+ }
1238+
1239+ }
1240+
1241+ /* Ignore messages not for us */
1242+ if (le32_to_cpu(header->tgtid) > 0 && us
1243+ && le32_to_cpu(header->tgtid) != us->node_id) {
1244+ goto userport_finish;
1245+ }
1246+
1247+ P_COMMS("got message, from %d for %d, sequence num = %d\n",
1248+ le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
1249+ le16_to_cpu(header->seq));
1250+
1251+ /* Have we received this message before ? If so just ignore it, it's a
1252+ * resend for someone else's benefit */
1253+ if (!(header->flags & (MSG_NOACK >> 16)) &&
1254+ rem_node && le16_to_cpu(header->seq) == rem_node->last_seq_recv) {
1255+ P_COMMS
1256+ ("Discarding message - Already seen this sequence number %d\n",
1257+ rem_node->last_seq_recv);
1258+ /* Still need to ACK it though, in case it was the ACK that got
1259+ * lost */
1260+ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
1261+ goto userport_finish;
1262+ }
1263+
1264+ /* If it's a new node then assign it a temporary node ID */
1265+ if (!rem_node)
1266+ header->srcid = cpu_to_le32(new_temp_nodeid(addr, addrlen));
1267+
1268+ P_COMMS("Got message: flags = %x, port = %d, we_are_a_member = %d\n",
1269+ header->flags, header->port, we_are_a_cluster_member);
1270+
1271+
1272+ /* If we are not part of the cluster then ignore multicast messages
1273+ * that need an ACK as we will confuse the sender who is only expecting
1274+ * ACKS from bona fide members */
1275+ if (header->flags & (MSG_MULTICAST >> 16) &&
1276+ !(header->flags & (MSG_NOACK >> 16)) && !we_are_a_cluster_member) {
1277+ P_COMMS
1278+ ("Discarding message - multicast and we are not a cluster member. port=%d flags=%x\n",
1279+ header->port, header->flags);
1280+ goto userport_finish;
1281+ }
1282+
1283+ /* Save the sequence number of this message so we can ignore duplicates
1284+ * (above) */
1285+ if (!(header->flags & (MSG_NOACK >> 16)) && rem_node) {
1286+ P_COMMS("Saving seq %d for node %s\n", le16_to_cpu(header->seq),
1287+ rem_node->name);
1288+ rem_node->last_seq_recv = le16_to_cpu(header->seq);
1289+ }
1290+
1291+ /* Is it a protocol message? */
1292+ if (header->port == 0) {
1293+ process_cnxman_message(csock, data, len, addr, addrlen,
1294+ rem_node);
1295+ goto userport_finish;
1296+ }
1297+
1298+ /* Skip past the header to the data */
1299+ data += sizeof (struct cl_protheader);
1300+ len -= sizeof (struct cl_protheader);
1301+
1302+ /* Get the port number and look for a listener */
1303+ down(&port_array_lock);
1304+ if (port_array[header->port]) {
1305+ int native_srcid;
1306+ struct cluster_sock *c = cluster_sk(port_array[header->port]);
1307+
1308+ /* ACK it */
1309+ if (!(header->flags & (MSG_NOACK >> 16)))
1310+ cl_sendack(csock, header->seq, addrlen, addr,
1311+ header->port, 0);
1312+
1313+ /* Call a callback if there is one */
1314+ if (c->kernel_callback) {
1315+ up(&port_array_lock);
1316+ c->kernel_callback(data, len, addr, addrlen,
1317+ le32_to_cpu(header->srcid));
1318+ goto userport_finish;
1319+ }
1320+
1321+ /* Otherwise put it into an SKB and pass it onto the recvmsg
1322+ * mechanism */
1323+ skb = alloc_skb(len, GFP_KERNEL);
1324+ if (!skb) {
1325+ up(&port_array_lock);
1326+ printk(KERN_INFO CMAN_NAME
1327+ ": Failed to allocate skb\n");
1328+ return;
1329+ }
1330+
1331+ skb_put(skb, len);
1332+ memcpy(skb->data, data, len);
1333+
1334+ /* Put the nodeid into cb so we can pass it to the clients */
1335+ skb->cb[0] = 0; /* Clear flags */
1336+ native_srcid = le32_to_cpu(header->srcid);
1337+ memcpy(skb->cb + 1, &native_srcid, sizeof(int));
1338+
1339+ if ((err =
1340+ sock_queue_rcv_skb(port_array[header->port], skb)) < 0) {
1341+
1342+ printk(KERN_INFO CMAN_NAME
1343+ ": Error queueing request to port %d: %d\n",
1344+ header->port, err);
1345+ kfree_skb(skb);
1346+
1347+ /* If the port was MEMBERSHIP then we have to die */
1348+ if (header->port == CLUSTER_PORT_MEMBERSHIP) {
1349+ up(&port_array_lock);
1350+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
1351+ panic("membership stopped responding");
1352+ }
1353+ }
1354+ up(&port_array_lock);
1355+
1356+ }
1357+ else {
1358+ /* ACK it, but set the flag bit so remote end knows no-one
1359+ * caught it */
1360+ if (!(header->flags & (MSG_NOACK >> 16)))
1361+ cl_sendack(csock, header->seq, addrlen, addr,
1362+ header->port, 1);
1363+
1364+ /* Nobody listening, drop it */
1365+ up(&port_array_lock);
1366+ }
1367+
1368+ userport_finish:
1369+ return;
1370+}
1371+
1372+static struct sock *cl_alloc_sock(struct socket *sock, int gfp)
1373+{
1374+ struct sock *sk;
1375+ struct cluster_sock *c;
1376+
1377+ if ((sk =
1378+ sk_alloc(AF_CLUSTER, gfp, sizeof (struct cluster_sock),
1379+ cluster_sk_cachep)) == NULL)
1380+ goto no_sock;
1381+
1382+ if (sock) {
1383+ sock->ops = &cl_proto_ops;
1384+ }
1385+ sock_init_data(sock, sk);
1386+
1387+ sk->sk_destruct = NULL;
1388+ sk->sk_no_check = 1;
1389+ sk->sk_family = PF_CLUSTER;
1390+ sk->sk_allocation = gfp;
1391+
1392+ c = cluster_sk(sk);
1393+ c->port = 0;
1394+ c->service_data = NULL;
1395+
1396+ return sk;
1397+ no_sock:
1398+ return NULL;
1399+}
1400+
1401+static int cl_release(struct socket *sock)
1402+{
1403+ struct sock *sk = sock->sk;
1404+ struct cl_client_socket *csock;
1405+ struct list_head *socklist;
1406+ struct list_head *tmp;
1407+
1408+ down(&client_socket_lock);
1409+ if (sk) {
1410+ /* Remove port allocations if it's a bound socket */
1411+ struct cluster_sock *c = cluster_sk(sk);
1412+
1413+ down(&port_array_lock);
1414+ if (c->port) {
1415+ port_array[c->port] = NULL;
1416+ }
1417+ up(&port_array_lock);
1418+
1419+ /* Tell other nodes in the cluster that this listener is going
1420+ * away */
1421+ if (atomic_read(&cnxman_running) && c->port)
1422+ send_port_close_oob(c->port);
1423+
1424+ if (c->service_data)
1425+ sm_sock_release(sock);
1426+
1427+ /* Master socket released ? */
1428+ if (sk->sk_protocol == CLPROTO_MASTER) {
1429+ master_sock = NULL;
1430+
1431+ /* If this socket is being freed and cnxman is not
1432+ * started then free all the comms sockets as either
1433+ * the userland "join" process has crashed or the
1434+ * join failed.
1435+ */
1436+ if (!atomic_read(&cnxman_running)) {
1437+ quit_threads = 1;
1438+ free_cluster_sockets();
1439+ }
1440+ }
1441+
1442+ sock_orphan(sk);
1443+ sock_hold(sk);
1444+ lock_sock(sk);
1445+ release_sock(sk);
1446+ sock_put(sk);
1447+ sock_put(sk);
1448+ sock->sk = NULL;
1449+ }
1450+
1451+ /* Remove it from the list of clients */
1452+ list_for_each_safe(socklist, tmp, &client_socket_list) {
1453+ csock = list_entry(socklist, struct cl_client_socket, list);
1454+
1455+ if (csock->sock == sock) {
1456+ list_del(&csock->list);
1457+ kfree(csock);
1458+ break;
1459+ }
1460+ }
1461+ up(&client_socket_lock);
1462+
1463+ return 0;
1464+}
1465+
1466+static int cl_create(struct socket *sock, int protocol)
1467+{
1468+ struct sock *sk;
1469+
1470+ /* All are datagrams */
1471+ if (sock->type != SOCK_DGRAM)
1472+ return -ESOCKTNOSUPPORT;
1473+
1474+ if (protocol == CLPROTO_MASTER && !capable(CAP_CLUSTER))
1475+ return -EPERM;
1476+
1477+ /* Can only have one master socket */
1478+ if (master_sock && protocol == CLPROTO_MASTER)
1479+ return -EBUSY;
1480+
1481+ /* cnxman not running and a client was requested */
1482+ if (!atomic_read(&cnxman_running) && protocol != CLPROTO_MASTER)
1483+ return -ENETDOWN;
1484+
1485+ if ((sk = cl_alloc_sock(sock, GFP_KERNEL)) == NULL)
1486+ return -ENOBUFS;
1487+
1488+ sk->sk_protocol = protocol;
1489+
1490+ if (protocol == CLPROTO_MASTER)
1491+ master_sock = sk;
1492+
1493+ /* Add client sockets to the list */
1494+ if (protocol == CLPROTO_CLIENT) {
1495+ struct cl_client_socket *clsock =
1496+ kmalloc(sizeof (struct cl_client_socket), GFP_KERNEL);
1497+ if (!clsock) {
1498+ cl_release(sock);
1499+ return -ENOMEM;
1500+ }
1501+ clsock->sock = sock;
1502+ down(&client_socket_lock);
1503+ list_add(&clsock->list, &client_socket_list);
1504+ up(&client_socket_lock);
1505+ }
1506+
1507+ return 0;
1508+}
1509+
1510+static int cl_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1511+{
1512+ struct sock *sk = sock->sk;
1513+ struct sockaddr_cl *saddr = (struct sockaddr_cl *) uaddr;
1514+ struct cluster_sock *c = cluster_sk(sk);
1515+
1516+ if (!capable(CAP_NET_BIND_SERVICE))
1517+ return -EPERM;
1518+
1519+ if (sk->sk_zapped == 0)
1520+ return -EINVAL;
1521+
1522+ if (addr_len != sizeof (struct sockaddr_cl))
1523+ return -EINVAL;
1524+
1525+ if (saddr->scl_family != AF_CLUSTER)
1526+ return -EINVAL;
1527+
1528+ if (saddr->scl_port == 0)
1529+ return -EINVAL; /* Port 0 is reserved for protocol messages */
1530+
1531+ down(&port_array_lock);
1532+
1533+ if (port_array[saddr->scl_port]) {
1534+ up(&port_array_lock);
1535+ return -EADDRINUSE;
1536+ }
1537+
1538+ port_array[saddr->scl_port] = sk;
1539+
1540+ up(&port_array_lock);
1541+
1542+ c->port = saddr->scl_port;
1543+ sk->sk_zapped = 0;
1544+
1545+ /* If we are not a cluster member yet then make the client wait until
1546+ * we are, this allows nodes to start cluster clients at the same time
1547+ * as cluster services but they will wait until membership is achieved.
1548+ * This looks odd in bind() (open would seem more obvious) but we need
1549+ * to know which port number is being used so that things like
1550+ * membership services don't get blocked
1551+ */
1552+
1553+ if (saddr->scl_port > HIGH_PROTECTED_PORT)
1554+ while (!we_are_a_cluster_member || !cluster_is_quorate
1555+ || in_transition()) {
1556+ DECLARE_WAITQUEUE(wq, current);
1557+ struct task_struct *tsk = current;
1558+
1559+ set_task_state(tsk, TASK_INTERRUPTIBLE);
1560+ add_wait_queue(&socket_waitq, &wq);
1561+
1562+ if (!we_are_a_cluster_member || !cluster_is_quorate
1563+ || in_transition())
1564+ schedule();
1565+
1566+ set_task_state(tsk, TASK_RUNNING);
1567+ remove_wait_queue(&socket_waitq, &wq);
1568+
1569+ /* We were woken up because the cluster is going down,
1570+ * ...and we never got a chance to do any work! (sob) */
1571+ if (atomic_read(&cnxman_running) == 0 || quit_threads) {
1572+ return -ENOTCONN;
1573+ }
1574+ }
1575+
1576+ return 0;
1577+}
1578+
1579+static int cl_getname(struct socket *sock, struct sockaddr *uaddr,
1580+ int *uaddr_len, int peer)
1581+{
1582+ struct sockaddr_cl *sa = (struct sockaddr_cl *) uaddr;
1583+ struct sock *sk = sock->sk;
1584+ struct cluster_sock *c = cluster_sk(sk);
1585+
1586+ *uaddr_len = sizeof (struct sockaddr_cl);
1587+
1588+ lock_sock(sk);
1589+
1590+ sa->scl_port = c->port;
1591+ sa->scl_flags = 0;
1592+ sa->scl_family = AF_CLUSTER;
1593+
1594+ release_sock(sk);
1595+
1596+ return 0;
1597+}
1598+
1599+static unsigned int cl_poll(struct file *file, struct socket *sock,
1600+ poll_table * wait)
1601+{
1602+ return datagram_poll(file, sock, wait);
1603+}
1604+
1605+/* Copy internal node format to userland format */
1606+void copy_to_usernode(struct cluster_node *node,
1607+ struct cl_cluster_node *unode)
1608+{
1609+ strcpy(unode->name, node->name);
1610+ unode->size = sizeof (struct cl_cluster_node);
1611+ unode->votes = node->votes;
1612+ unode->state = node->state;
1613+ unode->us = node->us;
1614+ unode->node_id = node->node_id;
1615+ unode->leave_reason = node->leave_reason;
1616+ unode->incarnation = node->incarnation;
1617+}
1618+
1619+/* ioctl processing functions */
1620+
1621+static int do_ioctl_set_version(unsigned long arg)
1622+{
1623+ struct cl_version version, *u_version;
1624+
1625+ if (!capable(CAP_CLUSTER))
1626+ return -EPERM;
1627+ if (arg == 0)
1628+ return -EINVAL;
1629+
1630+ u_version = (struct cl_version *) arg;
1631+
1632+ if (copy_from_user(&version, u_version, sizeof(struct cl_version)))
1633+ return -EFAULT;
1634+
1635+ if (version.major != CNXMAN_MAJOR_VERSION ||
1636+ version.minor != CNXMAN_MINOR_VERSION ||
1637+ version.patch != CNXMAN_PATCH_VERSION)
1638+ return -EINVAL;
1639+
1640+ if (config_version == version.config)
1641+ return 0;
1642+
1643+ config_version = version.config;
1644+ send_reconfigure(RECONFIG_PARAM_CONFIG_VERSION, config_version);
1645+ return 0;
1646+}
1647+
1648+static int do_ioctl_get_members(unsigned long arg)
1649+{
1650+ struct cluster_node *node;
1651+ /* Kernel copies */
1652+ struct cl_cluster_node user_format_node;
1653+ struct cl_cluster_nodelist user_format_nodelist;
1654+ /* User space array ptr */
1655+ struct cl_cluster_node *user_node;
1656+ struct list_head *nodelist;
1657+ int num_nodes = 0;
1658+
1659+ if (arg == 0)
1660+ return cluster_members;
1661+
1662+ if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
1663+ return -EFAULT;
1664+
1665+ down(&cluster_members_lock);
1666+
1667+ if (user_format_nodelist.max_members < cluster_members) {
1668+ up(&cluster_members_lock);
1669+ return -E2BIG;
1670+ }
1671+
1672+ user_node = user_format_nodelist.nodes;
1673+
1674+ list_for_each(nodelist, &cluster_members_list) {
1675+ node = list_entry(nodelist, struct cluster_node, list);
1676+ if (node->state == NODESTATE_MEMBER) {
1677+ copy_to_usernode(node, &user_format_node);
1678+ if (copy_to_user(user_node, &user_format_node,
1679+ sizeof (struct cl_cluster_node))) {
1680+ up(&cluster_members_lock);
1681+ return -EFAULT;
1682+ }
1683+ user_node++;
1684+ num_nodes++;
1685+ }
1686+ }
1687+ up(&cluster_members_lock);
1688+
1689+ return num_nodes;
1690+}
1691+
1692+static int do_ioctl_get_all_members(unsigned long arg)
1693+{
1694+ struct cluster_node *node;
1695+ /* Kernel copies */
1696+ struct cl_cluster_node user_format_node;
1697+ struct cl_cluster_nodelist user_format_nodelist;
1698+ /* User space array ptr */
1699+ struct cl_cluster_node *user_node;
1700+ struct list_head *nodelist;
1701+ int num_nodes = 0;
1702+
1703+ if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
1704+ return -EFAULT;
1705+
1706+ down(&cluster_members_lock);
1707+
1708+ user_node = user_format_nodelist.nodes;
1709+
1710+ list_for_each(nodelist, &cluster_members_list) {
1711+ node = list_entry(nodelist, struct cluster_node, list);
1712+ if (arg) {
1713+ copy_to_usernode(node,
1714+ &user_format_node);
1715+
1716+ if (copy_to_user(user_node, &user_format_node,
1717+ sizeof (struct cl_cluster_node))) {
1718+ up(&cluster_members_lock);
1719+ return -EFAULT;
1720+ }
1721+ user_node++;
1722+ if (--user_format_nodelist.max_members < 0) {
1723+ num_nodes = -EFAULT;
1724+ goto err_exit;
1725+ }
1726+
1727+ }
1728+ num_nodes++;
1729+ }
1730+ err_exit:
1731+ up(&cluster_members_lock);
1732+
1733+ return num_nodes;
1734+}
1735+
1736+static int do_ioctl_get_node(unsigned long arg)
1737+{
1738+ struct cluster_node *node;
1739+ struct cl_cluster_node k_node, *u_node;
1740+
1741+ u_node = (struct cl_cluster_node *) arg;
1742+
1743+ if (copy_from_user(&k_node, u_node, sizeof(struct cl_cluster_node)))
1744+ return -EFAULT;
1745+
1746+ if (k_node.node_id)
1747+ node = find_node_by_nodeid(k_node.node_id);
1748+ else
1749+ node = find_node_by_name(k_node.name);
1750+
1751+ if (!node)
1752+ return -ENOENT;
1753+
1754+ copy_to_usernode(node, &k_node);
1755+
1756+ if (copy_to_user(u_node, &k_node, sizeof(struct cl_cluster_node)))
1757+ return -EFAULT;
1758+
1759+ return 0;
1760+}
1761+
1762+static int do_ioctl_set_expected(unsigned long arg)
1763+{
1764+ struct list_head *nodelist;
1765+ struct cluster_node *node;
1766+ unsigned int total_votes;
1767+ unsigned int newquorum;
1768+
1769+ if (!capable(CAP_CLUSTER))
1770+ return -EPERM;
1771+ if (arg == 0)
1772+ return -EINVAL;
1773+
1774+ newquorum = calculate_quorum(1, arg, &total_votes);
1775+
1776+ if (newquorum < total_votes / 2
1777+ || newquorum > total_votes) {
1778+ return -EINVAL;
1779+ }
1780+
1781+ /* Now do it */
1782+ down(&cluster_members_lock);
1783+ list_for_each(nodelist, &cluster_members_list) {
1784+ node = list_entry(nodelist, struct cluster_node, list);
1785+ if (node->state == NODESTATE_MEMBER
1786+ && node->expected_votes > arg) {
1787+ node->expected_votes = arg;
1788+ }
1789+ }
1790+ up(&cluster_members_lock);
1791+
1792+ recalculate_quorum(1);
1793+
1794+ send_reconfigure(RECONFIG_PARAM_EXPECTED_VOTES, arg);
1795+ sm_member_update(cluster_is_quorate);
1796+
1797+ return 0;
1798+}
1799+
1800+static int do_ioctl_kill_node(unsigned long arg)
1801+{
1802+ struct cluster_node *node;
1803+
1804+ if (!capable(CAP_CLUSTER))
1805+ return -EPERM;
1806+
1808+ if ((node = find_node_by_nodeid(arg)) == NULL)
1809+ return -EINVAL;
1810+
1811+ /* Can't kill us */
1812+ if (node->us)
1813+ return -EINVAL;
1814+
1815+ if (node->state != NODESTATE_MEMBER)
1816+ return -EINVAL;
1817+
1818+ /* Just in case it is alive, send a KILL message */
1819+ send_kill(arg);
1820+
1821+ node->leave_reason = CLUSTER_LEAVEFLAG_KILLED;
1822+ a_node_just_died(node);
1823+
1824+ return 0;
1825+}
1826+
1827+static int do_ioctl_barrier(unsigned long arg)
1828+{
1829+ struct cl_barrier_info info;
1830+
1831+ if (!capable(CAP_CLUSTER))
1832+ return -EPERM;
1833+
1834+ if (copy_from_user(&info, (void *)arg, sizeof(info)) != 0)
1835+ return -EFAULT;
1836+
1837+ switch (info.cmd) {
1838+ case BARRIER_IOCTL_REGISTER:
1839+ return kcl_barrier_register(info.name,
1840+ info.flags,
1841+ info.arg);
1842+ case BARRIER_IOCTL_CHANGE:
1843+ return kcl_barrier_setattr(info.name,
1844+ info.flags,
1845+ info.arg);
1846+ case BARRIER_IOCTL_WAIT:
1847+ return kcl_barrier_wait(info.name);
1848+ case BARRIER_IOCTL_DELETE:
1849+ return kcl_barrier_delete(info.name);
1850+ default:
1851+ return -EINVAL;
1852+ }
1853+}
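
For comparison with the kernel-side dispatch above, a userspace caller drives the same four subcommands through SIOCCLUSTER_BARRIER. A minimal sketch, assuming the cl_barrier_info layout used above and a cluster socket in fd; whether a BARRIER_IOCTL_CHANGE/enable step is needed before the wait depends on kcl_barrier_wait(), which is outside this hunk.

/* Hypothetical sketch - not part of the patch */
#include <string.h>
#include <sys/ioctl.h>

static int sync_on_barrier(int fd, const char *name, int nodes)
{
	struct cl_barrier_info info;

	memset(&info, 0, sizeof(info));
	strncpy(info.name, name, sizeof(info.name) - 1);

	info.cmd = BARRIER_IOCTL_REGISTER;
	info.arg = nodes;		/* expected number of waiters */
	if (ioctl(fd, SIOCCLUSTER_BARRIER, (unsigned long) &info) < 0)
		return -1;

	info.cmd = BARRIER_IOCTL_WAIT;	/* blocks until all nodes arrive */
	if (ioctl(fd, SIOCCLUSTER_BARRIER, (unsigned long) &info) < 0)
		return -1;

	info.cmd = BARRIER_IOCTL_DELETE;
	return ioctl(fd, SIOCCLUSTER_BARRIER, (unsigned long) &info);
}
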
1854+
1855+static int do_ioctl_islistening(unsigned long arg)
1856+{
1857+ DECLARE_WAITQUEUE(wq, current);
1858+ struct cl_listen_request rq;
1859+ struct cluster_node *rem_node;
1860+ int nodeid;
1861+ int result;
1862+ struct cl_waiting_listen_request *listen_request;
1863+
1864+ if (!arg)
1865+ return -EINVAL;
1866+
1867+ if (copy_from_user(&rq, (void *) arg, sizeof (rq)) != 0)
1868+ return -EFAULT;
1869+
1870+ nodeid = rq.nodeid;
1871+
1872+ rem_node = find_node_by_nodeid(nodeid);
1873+
1874+ /* Node not in the cluster */
1875+ if (!rem_node)
1876+ return -ENOENT;
1877+
1878+ if (rem_node->state != NODESTATE_MEMBER)
1879+ return -ENOTCONN;
1880+
1881+ /* If the request is for us then just look in the ports
1882+ * array */
1883+ if (nodeid == us->node_id)
1884+ return (port_array[rq.port] != 0) ? 1 : 0;
1885+
1886+ /* For a remote node we need to send a request out */
1887+
1888+ /* If we are in transition then wait until we are not */
1889+ while (in_transition()) {
1890+ set_task_state(current, TASK_INTERRUPTIBLE);
1891+ add_wait_queue(&socket_waitq, &wq);
1892+
1893+ if (in_transition())
1894+ schedule();
1895+
1896+ set_task_state(current, TASK_RUNNING);
1897+ remove_wait_queue(&socket_waitq, &wq);
1898+
1899+ if (signal_pending(current))
1900+ return -EINTR;
1901+ }
1902+
1903+ /* Were we shut down before it completed ? */
1904+ if (!atomic_read(&cnxman_running))
1905+ return -ENOTCONN;
1906+
1907+ listen_request =
1908+ kmalloc(sizeof (struct cl_waiting_listen_request),
1909+ GFP_KERNEL);
1910+ if (!listen_request)
1911+ return -ENOMEM;
1912+
1913+ /* Build the request */
1914+ listen_request->waiting = 1;
1915+ listen_request->result = 0;
1916+ listen_request->tag = current->pid;
1917+ listen_request->nodeid = nodeid;
1918+ init_waitqueue_head(&listen_request->waitq);
1919+
1920+ down(&listenreq_lock);
1921+ list_add(&listen_request->list, &listenreq_list);
1922+ up(&listenreq_lock);
1923+
1924+ /* Now wait for the response to come back */
1925+ send_listen_request(rq.nodeid, rq.port);
1926+
1927+ while (listen_request->waiting) {
1928+ set_task_state(current, TASK_INTERRUPTIBLE);
1929+ add_wait_queue(&listen_request->waitq, &wq);
1930+
1931+ if (listen_request->waiting)
1932+ schedule();
1933+
1934+ set_task_state(current, TASK_RUNNING);
1935+ remove_wait_queue(&listen_request->waitq, &wq);
1936+
1937+		if (signal_pending(current)) {
1938+			down(&listenreq_lock);
1939+			list_del(&listen_request->list);
1940+			up(&listenreq_lock);
1941+			kfree(listen_request);
1942+			return -ERESTARTSYS;
1943+		}
1944+	}
1945+	result = listen_request->result;
1946+
1947+	down(&listenreq_lock);
1948+	list_del(&listen_request->list);
1949+	up(&listenreq_lock);
1950+	kfree(listen_request);
1951+	return result;
1947+}
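
From userspace the same query is a single ioctl. A sketch, assuming the cl_listen_request layout used above:

/* Hypothetical sketch - not part of the patch */
static int node_is_listening(int fd, int nodeid, unsigned char port)
{
	struct cl_listen_request rq;

	rq.nodeid = nodeid;
	rq.port = port;

	/* 1 = listening, 0 = not, -1 with errno on error
	 * (e.g. ENOENT for an unknown node, ENOTCONN for a dead one) */
	return ioctl(fd, SIOCCLUSTER_ISLISTENING, (unsigned long) &rq);
}
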
1948+
1949+static int do_ioctl_set_votes(unsigned long arg)
1950+{
1951+ unsigned int total_votes;
1952+ unsigned int newquorum;
1953+ int saved_votes;
1954+
1955+ if (!capable(CAP_CLUSTER))
1956+ return -EPERM;
1957+
1958+ /* Check votes is valid */
1959+ saved_votes = us->votes;
1960+ us->votes = arg;
1961+
1962+ newquorum = calculate_quorum(1, 0, &total_votes);
1963+
1964+ if (newquorum < total_votes / 2 || newquorum > total_votes) {
1965+ us->votes = saved_votes;
1966+ return -EINVAL;
1967+ }
1968+
1969+ recalculate_quorum(1);
1970+
1971+ send_reconfigure(RECONFIG_PARAM_NODE_VOTES, arg);
1972+
1973+ return 0;
1974+}
1975+
1976+static int cl_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1977+{
1978+ int err = -EOPNOTSUPP;
1979+ struct list_head *proclist;
1980+ struct list_head *tmp;
1981+ struct notify_struct *notify;
1982+ struct cl_version cnxman_version;
1983+
1984+ switch (cmd) {
1985+ /* Process requests notification of cluster events */
1986+ case SIOCCLUSTER_NOTIFY:
1987+ notify = kmalloc(sizeof (struct notify_struct), GFP_KERNEL);
1988+ if (!notify)
1989+ return -ENOMEM;
1990+ notify->pid = current->pid;
1991+ notify->signal = arg;
1992+ down(&event_listener_lock);
1993+ list_add(&notify->list, &event_listener_list);
1994+ up(&event_listener_lock);
1995+ err = 0;
1996+ break;
1997+
1998+	/* Process is no longer interested in cluster events */
1999+	case SIOCCLUSTER_REMOVENOTIFY:
2000+		err = -EINVAL;
2001+
2002+ down(&event_listener_lock);
2003+ list_for_each_safe(proclist, tmp, &event_listener_list) {
2004+ notify =
2005+ list_entry(proclist, struct notify_struct, list);
2006+ if (notify->pid == current->pid) {
2007+ list_del(&notify->list);
2008+ kfree(notify);
2009+ err = 0;
2010+ }
2011+ }
2012+ up(&event_listener_lock);
2013+ break;
2014+
2015+ /* Return the cnxman version number */
2016+ case SIOCCLUSTER_GET_VERSION:
2017+ if (!arg)
2018+ return -EINVAL;
2019+ err = 0;
2020+ cnxman_version.major = CNXMAN_MAJOR_VERSION;
2021+ cnxman_version.minor = CNXMAN_MINOR_VERSION;
2022+ cnxman_version.patch = CNXMAN_PATCH_VERSION;
2023+ if (copy_to_user((void *) arg, &cnxman_version,
2024+ sizeof (struct cl_version))) {
2025+ return -EFAULT;
2026+ }
2027+ break;
2028+
2029+ /* Set the cnxman config version number */
2030+ case SIOCCLUSTER_SET_VERSION:
2031+ err = do_ioctl_set_version(arg);
2032+ break;
2033+
2034+ /* Return the active membership list */
2035+ case SIOCCLUSTER_GETMEMBERS:
2036+ err = do_ioctl_get_members(arg);
2037+ break;
2038+
2039+	/* Return the full membership list, including dead nodes */
2040+ case SIOCCLUSTER_GETALLMEMBERS:
2041+ err = do_ioctl_get_all_members(arg);
2042+ break;
2043+
2044+ case SIOCCLUSTER_GETNODE:
2045+ err = do_ioctl_get_node(arg);
2046+ break;
2047+
2048+ case SIOCCLUSTER_ISQUORATE:
2049+ return cluster_is_quorate;
2050+
2051+ case SIOCCLUSTER_ISACTIVE:
2052+ return atomic_read(&cnxman_running);
2053+
2054+ case SIOCCLUSTER_SETEXPECTED_VOTES:
2055+ err = do_ioctl_set_expected(arg);
2056+ break;
2057+
2058+ /* Change the number of votes for this node */
2059+ case SIOCCLUSTER_SET_VOTES:
2060+ err = do_ioctl_set_votes(arg);
2061+ break;
2062+
2063+ /* Return 1 if the specified node is listening on a given port */
2064+ case SIOCCLUSTER_ISLISTENING:
2065+ err = do_ioctl_islistening(arg);
2066+ break;
2067+
2068+ /* Forcibly kill a node */
2069+ case SIOCCLUSTER_KILLNODE:
2070+ err = do_ioctl_kill_node(arg);
2071+ break;
2072+
2073+ case SIOCCLUSTER_GET_JOINCOUNT:
2074+ if (!capable(CAP_CLUSTER))
2075+ return -EPERM;
2076+ else
2077+ return atomic_read(&use_count);
2078+
2079+ /* ioctl interface to the barrier system */
2080+ case SIOCCLUSTER_BARRIER:
2081+ err = do_ioctl_barrier(arg);
2082+ break;
2083+
2084+ default:
2085+ err = sm_ioctl(sock, cmd, arg);
2086+ }
2087+ return err;
2088+}
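
Tying several of these commands together, a monitoring process might register for signal notification and then poll quorum state. A sketch, assuming an open cluster socket in fd; the choice of SIGUSR1 is arbitrary.

/* Hypothetical sketch - not part of the patch */
#include <signal.h>
#include <sys/ioctl.h>

static volatile sig_atomic_t cluster_changed;

static void on_cluster_event(int sig)
{
	cluster_changed = 1;	/* main loop re-reads membership */
}

static int watch_cluster(int fd)
{
	signal(SIGUSR1, on_cluster_event);

	/* Ask cnxman to send us SIGUSR1 on every cluster event */
	if (ioctl(fd, SIOCCLUSTER_NOTIFY, SIGUSR1) < 0)
		return -1;

	if (!ioctl(fd, SIOCCLUSTER_ISACTIVE, 0))
		return -1;	/* cnxman is not running */

	/* Both of these return their answer directly, not via a pointer */
	return ioctl(fd, SIOCCLUSTER_ISQUORATE, 0);
}
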
2089+
2090+static int cl_shutdown(struct socket *sock, int how)
2091+{
2092+ struct sock *sk = sock->sk;
2093+ int err = -ENOTCONN;
2094+
2095+ lock_sock(sk);
2096+
2097+ if (sock->state == SS_UNCONNECTED)
2098+ goto out;
2099+
2100+ err = 0;
2101+ if (sock->state == SS_DISCONNECTING)
2102+ goto out;
2103+
2104+ err = -EINVAL;
2105+
2106+ if (how != SHUTDOWN_MASK)
2107+ goto out;
2108+
2109+ sk->sk_shutdown = how;
2110+ err = 0;
2111+
2112+ out:
2113+ release_sock(sk);
2114+
2115+ return err;
2116+}
2117+
2118+static int cl_setsockopt(struct socket *sock, int level, int optname,
2119+ char *optval, int optlen)
2120+{
2121+ struct sock *sk = sock->sk;
2122+ int err;
2123+
2124+ if (sk != master_sock)
2125+ return -EPERM;
2126+
2127+ lock_sock(sk);
2128+ err = __cl_setsockopt(sock, level, optname, optval, optlen, 0);
2129+ release_sock(sk);
2130+
2131+ return err;
2132+}
2133+
2134+static int add_clsock(int broadcast, int number, struct socket *sock,
2135+ struct file *file)
2136+{
2137+ struct cl_comms_socket *newsock =
2138+ kmalloc(sizeof (struct cl_comms_socket), GFP_KERNEL);
2139+ if (!newsock)
2140+ return -ENOMEM;
2141+
2142+ memset(newsock, 0, sizeof (*newsock));
2143+ newsock->number = number;
2144+ newsock->sock = sock;
2145+ if (broadcast) {
2146+ newsock->broadcast = 1;
2147+ newsock->recv_only = 0;
2148+ }
2149+ else {
2150+ newsock->broadcast = 0;
2151+ newsock->recv_only = 1;
2152+ }
2153+
2154+ newsock->file = file;
2155+ newsock->addr_len = sizeof(struct sockaddr_in6);
2156+
2157+	/* Mark it busy so it is not used until the cnxman thread is running
2158+	 * and ready to process messages */
2159+ set_bit(1, &newsock->active);
2160+
2161+ /* Find out what it's bound to */
2162+ newsock->sock->ops->getname(newsock->sock,
2163+ (struct sockaddr *)&newsock->saddr,
2164+ &newsock->addr_len, 0);
2165+
2166+ num_interfaces = max(num_interfaces, newsock->number);
2167+ if (!current_interface && newsock->broadcast)
2168+ current_interface = newsock;
2169+
2170+ /* Hook data_ready */
2171+ newsock->sock->sk->sk_data_ready = cnxman_data_ready;
2172+
2173+ /* Make an attempt to keep them in order */
2174+ list_add_tail(&newsock->list, &socket_list);
2175+
2176+ address_length = newsock->addr_len;
2177+ return 0;
2178+}
2179+
2180+static int __cl_setsockopt(struct socket *sock, int level, int optname,
2181+ char *optval, int optlen, int flags)
2182+{
2183+ struct file *file;
2184+ struct cl_join_cluster_info join_info;
2185+ int error;
2186+ int leave_flags;
2187+ struct cl_multicast_sock multicast_info;
2188+
2189+ if (optlen && !optval)
2190+ return -EINVAL;
2191+
2192+ switch (optname) {
2193+ case CLU_SET_MULTICAST:
2194+ case CLU_SET_RCVONLY:
2195+ if (!capable(CAP_CLUSTER))
2196+ return -EPERM;
2197+
2198+ if (optlen != sizeof (struct cl_multicast_sock))
2199+ return -EINVAL;
2200+
2201+ if (atomic_read(&cnxman_running))
2202+ return -EINVAL;
2203+
2204+ error = -EBADF;
2205+
2206+ if (copy_from_user(&multicast_info, optval, optlen))
2207+ return -EFAULT;
2208+
2209+ file = fget(multicast_info.fd);
2210+ if (file) {
2211+ struct inode *inode = file->f_dentry->d_inode;
2212+
2213+ error =
2214+ add_clsock(optname == CLU_SET_MULTICAST,
2215+ multicast_info.number, SOCKET_I(inode),
2216+ file);
2217+ if (error)
2218+ fput(file);
2219+ }
2220+ return error;
2221+
2222+ case CLU_SET_NODENAME:
2223+ if (!capable(CAP_CLUSTER))
2224+ return -EPERM;
2225+
2226+ if (atomic_read(&cnxman_running))
2227+ return -EINVAL;
2228+
2229+ if (optlen > MAX_CLUSTER_MEMBER_NAME_LEN)
2230+ return -EINVAL;
2231+
2232+ if (copy_from_user(nodename, optval, optlen))
2233+ return -EFAULT;
2234+ break;
2235+
2236+ case CLU_JOIN_CLUSTER:
2237+ if (!capable(CAP_CLUSTER))
2238+ return -EPERM;
2239+
2240+ if (atomic_read(&cnxman_running))
2241+ return -EALREADY;
2242+
2243+ if (optlen != sizeof (struct cl_join_cluster_info))
2244+ return -EINVAL;
2245+
2246+ if (copy_from_user(&join_info, optval, optlen))
2247+ return -EFAULT;
2248+
2249+ if (strlen(join_info.cluster_name) > MAX_CLUSTER_NAME_LEN)
2250+ return -EINVAL;
2251+
2252+ if (list_empty(&socket_list))
2253+ return -ENOTCONN;
2254+
2255+ set_votes(join_info.votes, join_info.expected_votes);
2256+ cluster_id = generate_cluster_id(join_info.cluster_name);
2257+ strncpy(cluster_name, join_info.cluster_name, MAX_CLUSTER_NAME_LEN);
2258+ two_node = join_info.two_node;
2259+ config_version = join_info.config_version;
2260+
2261+ quit_threads = 0;
2262+ acks_expected = 0;
2263+ init_completion(&cluster_thread_comp);
2264+ init_completion(&member_thread_comp);
2265+ if (allocate_nodeid_array())
2266+ return -ENOMEM;
2267+
2268+ kcluster_pid = kernel_thread(cluster_kthread, NULL, 0);
2269+ if (kcluster_pid < 0)
2270+ return kcluster_pid;
2271+
2272+ wait_for_completion(&cluster_thread_comp);
2273+ init_completion(&cluster_thread_comp);
2274+
2275+ atomic_set(&cnxman_running, 1);
2276+
2277+ /* Make sure we have a node name */
2278+ if (nodename[0] == '\0')
2279+ strcpy(nodename, system_utsname.nodename);
2280+
2281+ membership_pid = start_membership_services(kcluster_pid);
2282+ if (membership_pid < 0) {
2283+ quit_threads = 1;
2284+ wait_for_completion(&cluster_thread_comp);
2285+ init_completion(&member_thread_comp);
2286+ return membership_pid;
2287+ }
2288+
2289+ sm_start();
2290+ break;
2291+
2292+ case CLU_LEAVE_CLUSTER:
2293+ if (!capable(CAP_CLUSTER))
2294+ return -EPERM;
2295+
2296+ if (optlen != sizeof (int))
2297+ return -EINVAL;
2298+
2299+ if (copy_from_user(&leave_flags, optval, optlen))
2300+ return -EFAULT;
2301+
2302+ if (!atomic_read(&cnxman_running))
2303+ return -ENOTCONN;
2304+
2305+ if (in_transition())
2306+ return -EBUSY;
2307+
2308+ /* Ignore the use count if FORCE is set */
2309+ if (!(leave_flags & CLUSTER_LEAVEFLAG_FORCE)) {
2310+ if (atomic_read(&use_count))
2311+ return -ENOTCONN;
2312+ }
2313+
2314+ us->leave_reason = leave_flags;
2315+ quit_threads = 1;
2316+ wake_up_interruptible(&cnxman_waitq);
2317+
2318+ wait_for_completion(&cluster_thread_comp);
2319+ break;
2320+
2321+ default:
2322+ return -ENOPROTOOPT;
2323+ }
2324+
2325+ return 0;
2326+}
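
Read bottom-up, the option handlers above imply a join sequence for a cman_tool-style utility: hand the bound UDP sockets to the kernel, set the node name, then CLU_JOIN_CLUSTER. A sketch of that order; SOL_CLUSTER is an assumed level constant (the handler above dispatches on optname only), 'master' must be the CLPROTO_MASTER socket (anything else gets -EPERM in cl_setsockopt), and the UDP socket setup is not shown in this hunk.

/* Hypothetical sketch - not part of the patch */
#include <string.h>
#include <sys/socket.h>

static int join_cluster(int master, int mcast_fd, int recv_fd)
{
	struct cl_multicast_sock ms;
	struct cl_join_cluster_info ji;
	const char *nodename = "node1";		/* example name */

	ms.fd = mcast_fd;
	ms.number = 1;				/* interface number, 1-based */
	if (setsockopt(master, SOL_CLUSTER, CLU_SET_MULTICAST, &ms, sizeof(ms)))
		return -1;

	ms.fd = recv_fd;
	if (setsockopt(master, SOL_CLUSTER, CLU_SET_RCVONLY, &ms, sizeof(ms)))
		return -1;

	if (setsockopt(master, SOL_CLUSTER, CLU_SET_NODENAME,
		       nodename, strlen(nodename) + 1))
		return -1;

	memset(&ji, 0, sizeof(ji));
	strcpy(ji.cluster_name, "example");	/* <= MAX_CLUSTER_NAME_LEN */
	ji.votes = 1;
	ji.expected_votes = 3;
	ji.config_version = 1;
	return setsockopt(master, SOL_CLUSTER, CLU_JOIN_CLUSTER,
			  &ji, sizeof(ji));
}
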
2327+
2328+static int cl_getsockopt(struct socket *sock, int level, int optname,
2329+ char *optval, int *optlen)
2330+{
2331+ struct sock *sk = sock->sk;
2332+ int err;
2333+
2334+ lock_sock(sk);
2335+ err = __cl_getsockopt(sock, level, optname, optval, optlen, 0);
2336+ release_sock(sk);
2337+
2338+ return err;
2339+}
2340+
2341+static int __cl_getsockopt(struct socket *sock, int level, int optname,
2342+ char *optval, int *optlen, int flags)
2343+{
2344+
2345+ switch (optname) {
2346+ default:
2347+ return -ENOPROTOOPT;
2348+ }
2349+
2350+ return 0;
2351+}
2352+
2353+/* We'll be giving out reward points next... */
2354+/* Send the packet and save a copy in case someone loses theirs. Must be
2355+ * called with the send semaphore (send_lock) held */
2356+static int __send_and_save(struct cl_comms_socket *csock, struct msghdr *msg,
2357+ int size, int needack)
2358+{
2359+ mm_segment_t fs;
2360+ int result;
2361+ struct iovec save_vectors[msg->msg_iovlen];
2362+
2363+ /* Save a copy of the IO vectors as send_msg mucks around with them and
2364+ * we may want to send the same stuff out more than once (for different
2365+ * interfaces)
2366+ */
2367+ memcpy(save_vectors, msg->msg_iov,
2368+ sizeof (struct iovec) * msg->msg_iovlen);
2369+
2370+ fs = get_fs();
2371+ set_fs(get_ds());
2372+
2373+ result = sock_sendmsg(csock->sock, msg, size);
2374+
2375+ set_fs(fs);
2376+
2377+ if (result >= 0 && acks_expected && needack) {
2378+
2379+ /* Start retransmit timer if it didn't go */
2380+ if (result == 0) {
2381+ start_short_timer();
2382+ }
2383+ else {
2384+ resend_delay = 1;
2385+ }
2386+ }
2387+
2388+ /* Restore IOVs */
2389+ memcpy(msg->msg_iov, save_vectors,
2390+ sizeof (struct iovec) * msg->msg_iovlen);
2391+
2392+ return result;
2393+}
2394+
2395+static void resend_last_message(void)
2396+{
2397+ struct msghdr msg;
2398+ struct iovec vec[1];
2399+ mm_segment_t fs;
2400+ int result;
2401+
2402+ P_COMMS("%ld resending last message: %d bytes: port=%d, cmd=%d\n",
2403+ jiffies, saved_msg_len, saved_msg_buffer[0],
2404+ saved_msg_buffer[6]);
2405+
2406+ /* Assume there is something wrong with the last interface */
2407+ current_interface = get_next_interface(current_interface);
2408+ if (num_interfaces > 1)
2409+ printk(KERN_WARNING CMAN_NAME ": Now using interface %d\n",
2410+ current_interface->number);
2411+
2412+ vec[0].iov_base = saved_msg_buffer;
2413+ vec[0].iov_len = saved_msg_len;
2414+
2415+ memset(&msg, 0, sizeof (msg));
2416+ msg.msg_name = &current_interface->saddr;
2417+ msg.msg_namelen = current_interface->addr_len;
2418+ msg.msg_iovlen = 1;
2419+ msg.msg_iov = vec;
2420+
2421+ fs = get_fs();
2422+ set_fs(get_ds());
2423+
2424+ result = sock_sendmsg(current_interface->sock, &msg, saved_msg_len);
2425+
2426+ set_fs(fs);
2427+
2428+ if (result < 0)
2429+ printk(KERN_ERR CMAN_NAME ": resend failed: %d\n", result);
2430+
2431+ /* Try indefinitely to send this, the backlog must die down eventually
2432+ * !? */
2433+ if (result == 0)
2434+ start_short_timer();
2435+
2436+ /* Send succeeded, continue waiting for ACKS */
2437+ if (result > 0)
2438+ start_ack_timer();
2439+
2440+}
2441+
2442+static int cl_recvmsg(struct kiocb *iocb, struct socket *sock,
2443+ struct msghdr *msg, size_t size, int flags)
2444+{
2445+ struct sock *sk = sock->sk;
2446+ struct sockaddr_cl *sin = (struct sockaddr_cl *) msg->msg_name;
2447+ struct cluster_sock *c = cluster_sk(sk);
2448+ struct sk_buff *skb;
2449+ int copied, err = 0;
2450+ int isoob = 0;
2451+
2452+ /* Socket was notified of shutdown, remove any pending skbs and return
2453+ * EOF */
2454+ if (!atomic_read(&cnxman_running)) {
2455+ while ((skb = skb_recv_datagram(sk, flags, MSG_DONTWAIT, &err)))
2456+ skb_free_datagram(sk, skb);
2457+ return 0; /* cnxman has left the building */
2458+ }
2459+
2460+ /* Generic datagram code does most of the work. If the user is not
2461+ * interested in OOB messages then ignore them */
2462+ do {
2463+ skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
2464+ if (!skb)
2465+ goto out;
2466+
2467+ /* Is it OOB */
2468+ if (skb->cb[0] & 0x80)
2469+ isoob = 1;
2470+ else
2471+ isoob = 0;
2472+
2473+ /* If it is and the user doesn't want it, then throw it away. */
2474+ if (isoob && !(flags & MSG_OOB)) {
2475+ skb_free_datagram(sk, skb);
2476+
2477+			/* If we peeked at an OOB message but the user doesn't
2478+			   want it, we need to discard it or we'll loop forever */
2479+ if (flags & MSG_PEEK) {
2480+ skb = skb_recv_datagram(sk, flags & ~MSG_PEEK,
2481+ MSG_DONTWAIT, &err);
2482+ if (skb)
2483+ skb_free_datagram(sk, skb);
2484+ }
2485+ }
2486+ }
2487+ while (isoob && !(flags & MSG_OOB));
2488+
2489+ copied = skb->len;
2490+ if (copied > size) {
2491+ copied = size;
2492+ msg->msg_flags |= MSG_TRUNC;
2493+ }
2494+ err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
2495+
2496+ if (err)
2497+ goto out_free;
2498+
2499+ if (msg->msg_name && msg->msg_namelen) {
2500+ memset(msg->msg_name, 0, msg->msg_namelen);
2501+
2502+ if (msg->msg_namelen >= sizeof (struct sockaddr_cl)) {
2503+
2504+ /* Nodeid is in native byte order - anything else is just
2505+ * perverse */
2506+ memcpy(&sin->scl_nodeid, skb->cb + 1, sizeof(int));
2507+ }
2508+ msg->msg_namelen = sizeof (struct sockaddr_cl);
2509+ sin->scl_port = c->port;
2510+ }
2511+
2512+ /* Top bit set in cb[0] means this is an OOB message */
2513+ if (skb->cb[0] & 0x80) {
2514+ msg->msg_flags |= MSG_OOB;
2515+ }
2516+
2517+ sock_recv_timestamp(msg, sk, skb);
2518+
2519+ err = copied;
2520+
2521+ out_free:
2522+ skb_free_datagram(sk, skb);
2523+
2524+ out:
2525+ return err;
2526+}
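
The receive path above defines the conventions a client sees: a zero-length read after shutdown, MSG_OOB opt-in for out-of-band events, and the sender's nodeid in msg_name. A userspace sketch of a loop honouring all three:

/* Hypothetical sketch - not part of the patch */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static void recv_loop(int fd)
{
	char buf[256];
	struct sockaddr_cl saddr;
	struct msghdr msg;
	struct iovec iov;
	int len;

	for (;;) {
		iov.iov_base = buf;
		iov.iov_len = sizeof(buf);
		memset(&msg, 0, sizeof(msg));
		msg.msg_name = &saddr;
		msg.msg_namelen = sizeof(saddr);
		msg.msg_iov = &iov;
		msg.msg_iovlen = 1;

		/* MSG_OOB opts in to port-closed/state-change events */
		len = recvmsg(fd, &msg, MSG_OOB);
		if (len == 0)
			break;		/* cnxman has shut down */
		if (len < 0)
			continue;

		if (msg.msg_flags & MSG_OOB)
			printf("OOB event from node %d\n", saddr.scl_nodeid);
		else
			printf("%d bytes from node %d\n", len, saddr.scl_nodeid);
	}
}
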
2527+
2528+/* Send a message out on all interfaces */
2529+static int send_to_all_ints(int nodeid, struct msghdr *our_msg, int size, int flags)
2530+{
2531+ struct sockaddr_in6 daddr;
2532+ struct cl_comms_socket *clsock;
2533+ int result = 0;
2534+
2535+ our_msg->msg_name = &daddr;
2536+
2537+ list_for_each_entry(clsock, &socket_list, list) {
2538+
2539+		/* Don't send on a recv-only socket */
2540+ if (!clsock->recv_only) {
2541+
2542+ /* For temporary node IDs send to the node's real IP address */
2543+ if (nodeid < 0) {
2544+ get_addr_from_temp_nodeid(nodeid, (char *)&daddr, &our_msg->msg_namelen);
2545+ }
2546+ else {
2547+ memcpy(&daddr, &clsock->saddr, clsock->addr_len);
2548+ our_msg->msg_namelen = clsock->addr_len;
2549+ }
2550+
2551+ result = __send_and_save(clsock, our_msg,
2552+ size + sizeof (struct cl_protheader),
2553+ !(flags & MSG_NOACK));
2554+ }
2555+ }
2556+ return result;
2557+}
2558+
2559+
2560+/* Internal common send message routine */
2561+static int __sendmsg(struct socket *sock, struct msghdr *msg, int size,
2562+ unsigned char port)
2563+{
2564+ int result = 0, i;
2565+ int flags = msg->msg_flags;
2566+ struct msghdr our_msg;
2567+ struct sockaddr_cl *caddr = msg->msg_name;
2568+ struct cl_protheader header;
2569+ struct iovec vectors[msg->msg_iovlen + 1];
2570+ int nodeid = 0;
2571+
2572+ if (size > MAX_CLUSTER_MESSAGE)
2573+ return -EINVAL;
2574+ if (!atomic_read(&cnxman_running))
2575+ return -ENOTCONN;
2576+
2577+ if (caddr)
2578+ nodeid = caddr->scl_nodeid;
2579+
2580+ /* Check that the node id (if present) is valid */
2581+ if (msg->msg_namelen && (!find_node_by_nodeid(nodeid) &&
2582+ !is_valid_temp_nodeid(nodeid))) {
2583+ return -ENOTCONN;
2584+ }
2585+
2586+ /* We can only have one send outstanding at a time so we might as well
2587+ * lock the whole send mechanism */
2588+ down(&send_lock);
2589+
2590+ while ((port > HIGH_PROTECTED_PORT
2591+ && (!cluster_is_quorate || in_transition()))
2592+ || (acks_expected > 0 && !(msg->msg_flags & MSG_NOACK))) {
2593+
2594+ DECLARE_WAITQUEUE(wq, current);
2595+ struct task_struct *tsk = current;
2596+
2597+ if (flags & MSG_DONTWAIT) {
2598+ up(&send_lock);
2599+ return -EAGAIN;
2600+ }
2601+
2602+ if (current->pid == kcluster_pid) {
2603+ P_COMMS
2604+ ("Tried to make kclusterd wait, port=%d, acks_count=%d, expected=%d\n",
2605+ port, ack_count, acks_expected);
2606+ up(&send_lock);
2607+ return -EAGAIN;
2608+ }
2609+
2610+ P_COMMS("%s process waiting. acks=%d, expected=%d\n", tsk->comm,
2611+ ack_count, acks_expected);
2612+
2613+ set_task_state(tsk, TASK_INTERRUPTIBLE);
2614+ add_wait_queue(&socket_waitq, &wq);
2615+
2616+ if ((port > HIGH_PROTECTED_PORT
2617+ && (!cluster_is_quorate || in_transition()))
2618+ || (acks_expected > 0)) {
2619+
2620+ up(&send_lock);
2621+ schedule();
2622+ down(&send_lock);
2623+ }
2624+
2625+		set_task_state(tsk, TASK_RUNNING);
2626+		remove_wait_queue(&socket_waitq, &wq);
2627+
2628+		/* Going down? Check only now that we are off the wait queue */
2629+		if (quit_threads) {
2630+			up(&send_lock);
2631+			return -ENOTCONN;
2632+		}
2633+
2634+		if (signal_pending(current)) {
2635+			up(&send_lock);
2636+			return -ERESTARTSYS;
2637+		}
2638+
2639+ /* Were we shut down in the meantime ? */
2640+ if (!atomic_read(&cnxman_running)) {
2641+ up(&send_lock);
2642+ return -ENOTCONN;
2643+ }
2644+
2645+ }
2646+
2647+ memset(&our_msg, 0, sizeof (our_msg));
2648+
2649+ /* Build the header */
2650+ header.port = port;
2651+ header.flags = msg->msg_flags >> 16;
2652+ header.cluster = cpu_to_le16(cluster_id);
2653+ header.srcid = us ? cpu_to_le32(us->node_id) : 0;
2654+ header.tgtid = caddr ? cpu_to_le32(nodeid) : 0;
2655+
2656+ ++cur_seq;
2657+ header.seq = cpu_to_le16(cur_seq);
2658+
2659+ /* Set the MULTICAST flag on messages with no particular destination */
2660+ if (!msg->msg_namelen) {
2661+ header.flags |= MSG_MULTICAST >> 16;
2662+ header.tgtid = 0;
2663+ }
2664+
2665+ /* Copy the existing iovecs into our array and add the header on at the
2666+ * beginning */
2667+ vectors[0].iov_base = &header;
2668+ vectors[0].iov_len = sizeof (header);
2669+ for (i = 0; i < msg->msg_iovlen; i++) {
2670+ vectors[i + 1] = msg->msg_iov[i];
2671+ }
2672+
2673+ our_msg.msg_iovlen = msg->msg_iovlen + 1;
2674+ our_msg.msg_iov = vectors;
2675+
2676+ /* Work out how many ACKS are wanted - *don't* reset acks_expected to
2677+ * zero if no acks are required as an ACK-needed message may still be
2678+ * outstanding */
2679+ if (!(msg->msg_flags & MSG_NOACK)) {
2680+ if (msg->msg_namelen)
2681+ acks_expected = 1; /* Unicast */
2682+ else
2683+ acks_expected = max(cluster_members - 1, 0);
2684+
2685+ }
2686+
2687+ P_COMMS
2688+ ("Sending message - tgt=%d port %d required %d acks, seq=%d, flags=%x\n",
2689+ nodeid, header.port,
2690+ (msg->msg_flags & MSG_NOACK) ? 0 : acks_expected,
2691+ le16_to_cpu(header.seq), header.flags);
2692+
2693+ /* Don't include temp nodeids in the message itself */
2694+	if (nodeid < 0)
2695+		header.tgtid = 0;
2696+
2697+ /* For non-member sends we use all the interfaces */
2698+ if ((nodeid < 0) || (flags & MSG_ALLINT)) {
2699+
2700+ result = send_to_all_ints(nodeid, &our_msg, size, msg->msg_flags);
2701+ }
2702+ else {
2703+ /* Send to only the current socket - resends will use the
2704+ * others if necessary */
2705+ our_msg.msg_name = &current_interface->saddr;
2706+ our_msg.msg_namelen = current_interface->addr_len;
2707+
2708+ result =
2709+ __send_and_save(current_interface, &our_msg,
2710+ size + sizeof (header),
2711+ !(msg->msg_flags & MSG_NOACK));
2712+ }
2713+
2714+	/* Make a note in each node's structure that it has been sent a message
2715+	 * so we can see which ones went astray */
2716+ if (!(flags & MSG_NOACK) && nodeid >= 0) {
2717+ if (msg->msg_namelen) {
2718+ struct cluster_node *node;
2719+
2720+ node = find_node_by_nodeid(le32_to_cpu(header.tgtid));
2721+ if (node)
2722+ node->last_seq_sent = cur_seq;
2723+ }
2724+ else {
2725+ struct cluster_node *node;
2726+ struct list_head *nodelist;
2727+
2728+ list_for_each(nodelist, &cluster_members_list) {
2729+ node =
2730+ list_entry(nodelist, struct cluster_node,
2731+ list);
2732+ if (node->state == NODESTATE_MEMBER) {
2733+ node->last_seq_sent = cur_seq;
2734+ }
2735+ }
2736+ }
2737+ }
2738+
2739+ /* Save a copy of the message if we're expecting an ACK */
2740+ if (!(flags & MSG_NOACK) && acks_expected) {
2741+ mm_segment_t fs;
2742+
2743+ fs = get_fs();
2744+ set_fs(get_ds());
2745+
2746+ memcpy_fromiovec(saved_msg_buffer, our_msg.msg_iov,
2747+ size + sizeof (header));
2748+ set_fs(fs);
2749+
2750+ saved_msg_len = size + sizeof (header);
2751+ retry_count = ack_count = 0;
2752+ clear_bit(RESEND_NEEDED, &mainloop_flags);
2753+
2754+ start_ack_timer();
2755+ }
2756+
2757+ up(&send_lock);
2758+ return result;
2759+}
2760+
2761+static int queue_message(void *buf, int len, struct sockaddr_cl *caddr,
2762+ unsigned char port, int flags)
2763+{
2764+ struct queued_message *qmsg;
2765+
2766+ qmsg = kmalloc(sizeof (struct queued_message),
2767+ (in_atomic()
2768+ || irqs_disabled())? GFP_ATOMIC : GFP_KERNEL);
2769+ if (qmsg == NULL)
2770+		return -ENOMEM;
2771+
2772+ memcpy(qmsg->msg_buffer, buf, len);
2773+ qmsg->msg_len = len;
2774+ if (caddr) {
2775+ memcpy(&qmsg->addr, caddr, sizeof (struct sockaddr_cl));
2776+ qmsg->addr_len = sizeof (struct sockaddr_cl);
2777+ }
2778+ else {
2779+ qmsg->addr_len = 0;
2780+ }
2781+ qmsg->flags = flags;
2782+ qmsg->port = port;
2783+ qmsg->socket = NULL;
2784+
2785+ down(&messages_list_lock);
2786+ list_add_tail(&qmsg->list, &messages_list);
2787+ up(&messages_list_lock);
2788+
2789+ wake_up_interruptible(&cnxman_waitq);
2790+
2791+ return 0;
2792+}
2793+
2794+static int cl_sendmsg(struct kiocb *iocb, struct socket *sock,
2795+ struct msghdr *msg, size_t size)
2796+{
2797+ struct cluster_sock *c = cluster_sk(sock->sk);
2798+ char *buffer;
2799+ int status;
2800+ int saved_iovlen;
2801+ uint8_t port;
2802+ struct iovec iov;
2803+ struct iovec *saved_iov;
2804+ struct sockaddr_cl *caddr = msg->msg_name;
2805+
2806+ if (sock->sk->sk_protocol == CLPROTO_MASTER)
2807+ return -EOPNOTSUPP;
2808+
2809+ port = c->port;
2810+
2811+ /* Only capable users can override the port number */
2812+ if (caddr && capable(CAP_CLUSTER) && caddr->scl_port)
2813+ port = caddr->scl_port;
2814+
2815+ if (port == 0)
2816+ return -EDESTADDRREQ;
2817+
2818+ /* Hmmm. On machines with segmented user/kernel space (sparc64, hppa &
2819+ * m68k AFAICT) we can't mix user and kernel space addresses in the
2820+ * IOV. This stymies __sendmsg a little as it tries to add a header to
2821+ * what could possibly be a userspace iov. So, here (where all the
2822+ * userspace sends come) we copy it to a kernel space buffer first. If
2823+ * performance is a big problem here then I might #ifdef it for the
2824+ * affected architectures but for now I think it will probably be OK */
2825+ buffer = kmalloc(size, GFP_KERNEL);
2826+ if (!buffer)
2827+ return -ENOMEM;
2828+
2829+ memcpy_fromiovec(buffer, msg->msg_iov, size);
2830+ iov.iov_len = size;
2831+ iov.iov_base = buffer;
2832+
2833+ saved_iov = msg->msg_iov;
2834+ saved_iovlen = msg->msg_iovlen;
2835+ msg->msg_iov = &iov;
2836+ msg->msg_iovlen = 1;
2837+
2838+ status = __sendmsg(sock, msg, size, port);
2839+ msg->msg_iov = saved_iov;
2840+ msg->msg_iovlen = saved_iovlen;
2841+
2842+ kfree(buffer);
2843+
2844+ return status;
2845+}
2846+
2847+/* Kernel call to sendmsg */
2848+int kcl_sendmsg(struct socket *sock, void *buf, int size,
2849+ struct sockaddr_cl *caddr, int addr_len, unsigned int flags)
2850+{
2851+ struct iovec iovecs[1];
2852+ struct msghdr msg;
2853+ struct cluster_sock *c = cluster_sk(sock->sk);
2854+ unsigned char port;
2855+
2856+ if (size > MAX_CLUSTER_MESSAGE)
2857+ return -EINVAL;
2858+ if (!atomic_read(&cnxman_running))
2859+ return -ENOTCONN;
2860+
2861+ port = c->port;
2862+ if (caddr && caddr->scl_port)
2863+ port = caddr->scl_port;
2864+
2865+ if (port == 0)
2866+ return -EDESTADDRREQ;
2867+
2868+ /* If we have no process context then queue it up for kclusterd to
2869+ * send. */
2870+ if (in_interrupt() || flags & MSG_QUEUE) {
2871+ return queue_message(buf, size, caddr, port,
2872+ flags & ~MSG_QUEUE);
2873+ }
2874+
2875+ iovecs[0].iov_base = buf;
2876+ iovecs[0].iov_len = size;
2877+
2878+ memset(&msg, 0, sizeof (msg));
2879+ msg.msg_name = caddr;
2880+ msg.msg_namelen = addr_len;
2881+ msg.msg_iovlen = 1;
2882+ msg.msg_iov = iovecs;
2883+ msg.msg_flags = flags;
2884+
2885+ return __sendmsg(sock, &msg, size, port);
2886+}
2887+
2888+static int send_queued_message(struct queued_message *qmsg)
2889+{
2890+ struct iovec iovecs[1];
2891+ struct msghdr msg;
2892+
2893+ /* Don't send blocked messages */
2894+ if (qmsg->port > HIGH_PROTECTED_PORT
2895+ && (!cluster_is_quorate || in_transition()))
2896+ return -EAGAIN;
2897+
2898+ iovecs[0].iov_base = qmsg->msg_buffer;
2899+ iovecs[0].iov_len = qmsg->msg_len;
2900+
2901+ memset(&msg, 0, sizeof (msg));
2902+ msg.msg_name = qmsg->addr_len ? &qmsg->addr : NULL;
2903+ msg.msg_namelen = qmsg->addr_len;
2904+ msg.msg_iovlen = 1;
2905+ msg.msg_iov = iovecs;
2906+ msg.msg_flags = qmsg->flags;
2907+
2908+ return __sendmsg(qmsg->socket, &msg, qmsg->msg_len, qmsg->port);
2909+}
2910+
2911+int kcl_register_read_callback(struct socket *sock,
2912+ int (*routine) (char *, int, char *, int,
2913+ unsigned int))
2914+{
2915+ struct cluster_sock *c = cluster_sk(sock->sk);
2916+
2917+ c->kernel_callback = routine;
2918+
2919+ return 0;
2920+}
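
A kernel service built on this API typically pairs kcl_register_read_callback() with kcl_sendmsg(). A minimal sketch; MY_PORT is an invented example port, the cluster socket setup and cnxman includes are outside this hunk, and the callback argument meanings (payload, length, source address and length, source nodeid) are inferred from the prototype above.

/* Hypothetical in-kernel sketch - not part of the patch */
#define MY_PORT 12	/* example port number */

static int my_rx(char *data, int len, char *addr, int addr_len,
		 unsigned int nodeid)
{
	printk(KERN_INFO "my_svc: %d bytes from node %u\n", len, nodeid);
	return 0;
}

static int my_send(struct socket *sock, int nodeid, void *buf, int len)
{
	struct sockaddr_cl caddr;

	kcl_register_read_callback(sock, my_rx);

	memset(&caddr, 0, sizeof(caddr));
	caddr.scl_family = AF_CLUSTER;
	caddr.scl_port = MY_PORT;
	caddr.scl_nodeid = nodeid;

	/* Pass MSG_QUEUE (or call from interrupt context) to have
	 * kclusterd perform the send instead */
	return kcl_sendmsg(sock, buf, len, &caddr, sizeof(caddr), 0);
}
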
2921+
2922+/* Used where we are in kclusterd context and we can't allow the task to wait,
2923+ * as we are also responsible for processing the ACKs that do the wake-up. Try
2924+ * to send the message immediately and queue it if that's not possible */
2925+static int send_or_queue_message(void *buf, int len, struct sockaddr_cl *caddr,
2926+ unsigned char port)
2927+{
2928+ struct iovec iovecs[1];
2929+ struct msghdr msg;
2930+
2931+ int status;
2932+
2933+ /* Don't send blocked messages */
2934+ if (port > HIGH_PROTECTED_PORT
2935+ && (!cluster_is_quorate || in_transition())) {
2936+ return queue_message(buf, len, caddr, port, 0);
2937+ }
2938+
2939+ iovecs[0].iov_base = buf;
2940+ iovecs[0].iov_len = len;
2941+
2942+ memset(&msg, 0, sizeof (msg));
2943+ msg.msg_name = caddr;
2944+ msg.msg_namelen = caddr ? sizeof (struct sockaddr_cl) : 0;
2945+ msg.msg_iovlen = 1;
2946+ msg.msg_iov = iovecs;
2947+ msg.msg_flags = MSG_DONTWAIT;
2948+
2949+ status = __sendmsg(NULL, &msg, len, port);
2950+
2951+ /* Did it work ? */
2952+ if (status > 0) {
2953+ return 0;
2954+ }
2955+
2956+ /* Failure other than EAGAIN is fatal */
2957+ if (status != -EAGAIN) {
2958+ return status;
2959+ }
2960+
2961+ return queue_message(buf, len, caddr, port, 0);
2962+}
2963+
2964+/* Send a listen request to a node */
2965+static void send_listen_request(int nodeid, unsigned char port)
2966+{
2967+ struct cl_listenmsg listenmsg;
2968+ struct sockaddr_cl caddr;
2969+
2970+ memset(&caddr, 0, sizeof (caddr));
2971+
2972+ /* Build the header */
2973+ listenmsg.cmd = CLUSTER_CMD_LISTENREQ;
2974+ listenmsg.target_port = port;
2975+ listenmsg.listening = 0;
2976+ listenmsg.tag = current->pid;
2977+
2978+ caddr.scl_family = AF_CLUSTER;
2979+ caddr.scl_port = 0;
2980+ caddr.scl_nodeid = nodeid;
2981+
2982+ send_or_queue_message(&listenmsg, sizeof(listenmsg), &caddr, 0);
2983+ return;
2984+}
2985+
2986+/* Return 1 or 0 to indicate if we have a listener on the requested port */
2987+static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
2988+ unsigned char port, unsigned short tag)
2989+{
2990+ struct cl_listenmsg listenmsg;
2991+ struct sockaddr_cl caddr;
2992+ int status;
2993+
2994+ memset(&caddr, 0, sizeof (caddr));
2995+
2996+ /* Build the message */
2997+ listenmsg.cmd = CLUSTER_CMD_LISTENRESP;
2998+ listenmsg.target_port = port;
2999+ listenmsg.tag = tag;
3000+ listenmsg.listening = (port_array[port] != 0) ? 1 : 0;
3001+
3002+ caddr.scl_family = AF_CLUSTER;
3003+ caddr.scl_port = 0;
3004+ caddr.scl_nodeid = nodeid;
3005+
3006+ status = send_or_queue_message(&listenmsg,
3007+ sizeof (listenmsg),
3008+ &caddr, 0);
3009+
3010+ return;
3011+}
3012+
3013+/* Send an ACK */
3014+static int cl_sendack(struct cl_comms_socket *csock, unsigned short seq,
3015+ int addr_len, char *addr, unsigned char remport,
3016+ unsigned char flag)
3017+{
3018+ mm_segment_t fs;
3019+ struct iovec vec;
3020+ struct cl_ackmsg ackmsg;
3021+ struct msghdr msg;
3022+ struct sockaddr_in6 daddr;
3023+ int result;
3024+
3025+#ifdef DEBUG_COMMS
3026+ char buf[MAX_ADDR_PRINTED_LEN];
3027+
3028+ P_COMMS("Sending ACK to %s, seq=%d\n",
3029+ print_addr(addr, address_length, buf), le16_to_cpu(seq));
3030+#endif
3031+
3032+ if (addr) {
3033+ memcpy(&daddr, addr, addr_len);
3034+ }
3035+ else {
3036+ memcpy(&daddr, &csock->saddr, csock->addr_len);
3037+ addr_len = csock->addr_len;
3038+ }
3039+
3040+ /* Build the header */
3041+ ackmsg.header.port = 0; /* Protocol port */
3042+ ackmsg.header.seq = 0;
3043+ ackmsg.header.flags = MSG_NOACK >> 16;
3044+ ackmsg.header.cluster = cpu_to_le16(cluster_id);
3045+ ackmsg.header.srcid = us ? cpu_to_le32(us->node_id) : 0;
3046+ ackmsg.header.tgtid = 0; /* ACKS are unicast so we don't bother
3047+ * to look this up */
3048+ ackmsg.cmd = CLUSTER_CMD_ACK;
3049+ ackmsg.remport = remport;
3050+ ackmsg.aflags = flag;
3051+ ackmsg.seq = seq; /* Already in LE order */
3052+ vec.iov_base = &ackmsg;
3053+ vec.iov_len = sizeof (ackmsg);
3054+
3055+ memset(&msg, 0, sizeof (msg));
3056+ msg.msg_name = &daddr;
3057+ msg.msg_namelen = addr_len;
3058+ msg.msg_iovlen = 1;
3059+ msg.msg_iov = &vec;
3060+
3061+ fs = get_fs();
3062+ set_fs(get_ds());
3063+
3064+ result = sock_sendmsg(csock->sock, &msg, sizeof (ackmsg));
3065+
3066+ set_fs(fs);
3067+
3068+ if (result < 0)
3069+ printk(KERN_CRIT CMAN_NAME ": error sending ACK: %d\n", result);
3070+
3071+ return result;
3072+
3073+}
3074+
3075+/* Wait for all ACKS to be gathered */
3076+void kcl_wait_for_all_acks(void)
3077+{
3078+ while (ack_count < acks_expected) {
3079+
3080+ DECLARE_WAITQUEUE(wq, current);
3081+ struct task_struct *tsk = current;
3082+
3083+ set_task_state(tsk, TASK_INTERRUPTIBLE);
3084+ add_wait_queue(&socket_waitq, &wq);
3085+
3086+ if (ack_count < acks_expected) {
3087+ schedule();
3088+ }
3089+
3090+ set_task_state(tsk, TASK_RUNNING);
3091+ remove_wait_queue(&socket_waitq, &wq);
3092+ }
3093+}
3094+
3095+/* Send a closedown OOB message to all cluster nodes - this tells them that a
3096+ * port listener has gone away */
3097+static void send_port_close_oob(unsigned char port)
3098+{
3099+ struct cl_closemsg closemsg;
3100+
3101+ /* Build the header */
3102+ closemsg.cmd = CLUSTER_CMD_PORTCLOSED;
3103+ closemsg.port = port;
3104+
3105+ send_or_queue_message(&closemsg, sizeof (closemsg), NULL, 0);
3106+ return;
3107+}
3108+
3109+/* A remote port has been closed - post an OOB message to the local listen on
3110+ * that port (if there is one) */
3111+static void post_close_oob(unsigned char port, int nodeid)
3112+{
3113+ struct cl_portclosed_oob *oobmsg;
3114+ struct sk_buff *skb;
3115+ struct sock *sock = port_array[port];
3116+
3117+ if (!sock) {
3118+ return; /* No-one listening */
3119+ }
3120+
3121+ skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
3122+ if (!skb)
3123+ return;
3124+
3125+ skb_put(skb, sizeof (*oobmsg));
3126+ oobmsg = (struct cl_portclosed_oob *) skb->data;
3127+ oobmsg->port = port;
3128+ oobmsg->cmd = CLUSTER_OOB_MSG_PORTCLOSED;
3129+ skb->cb[0] = 0x80;
3130+ memcpy(skb->cb + 1, &nodeid, sizeof(int));
3131+
3132+ sock_queue_rcv_skb(sock, skb);
3133+
3134+}
3135+
3136+/* Leave the cluster */
3137+static void node_shutdown(void)
3138+{
3139+ struct cl_barrier *barrier;
3140+ struct list_head *blist;
3141+ struct list_head *temp;
3142+ struct list_head *socklist;
3143+ struct cl_client_socket *csock;
3144+ struct sk_buff *null_skb;
3145+
3146+ printk(KERN_INFO CMAN_NAME ": we are leaving the cluster\n");
3147+
3148+ atomic_set(&cnxman_running, 0);
3149+ unjam();
3150+
3151+ /* Notify kernel listeners first */
3152+ notify_kernel_listeners(LEAVING, 0);
3153+
3154+ /* Notify client sockets */
3155+ down(&client_socket_lock);
3156+ list_for_each_safe(socklist, temp, &client_socket_list) {
3157+ csock = list_entry(socklist, struct cl_client_socket, list);
3158+
3159+ null_skb = alloc_skb(0, GFP_KERNEL);
3160+ if (null_skb)
3161+ sock_queue_rcv_skb(csock->sock->sk, null_skb);
3162+ list_del(&csock->list);
3163+ kfree(csock);
3164+ }
3165+ up(&client_socket_lock);
3166+ we_are_a_cluster_member = 0;
3167+
3168+ sm_stop(1);
3169+
3170+ /* Wake up any processes waiting for barriers */
3171+ down(&barrier_list_lock);
3172+ list_for_each(blist, &barrier_list) {
3173+ barrier = list_entry(blist, struct cl_barrier, list);
3174+
3175+ /* Cancel any timers */
3176+ if (timer_pending(&barrier->timer))
3177+ del_timer(&barrier->timer);
3178+
3179+ /* Force it to be auto-delete so it discards itself */
3180+ if (barrier->state == BARRIER_STATE_WAITING) {
3181+ barrier->flags |= BARRIER_ATTR_AUTODELETE;
3182+ wake_up_interruptible(&barrier->waitq);
3183+ }
3184+ else {
3185+ if (barrier->callback) {
3186+ barrier->callback(barrier->name, -ENOTCONN);
3187+ barrier->callback = NULL;
3188+ }
3189+ }
3190+ }
3191+ up(&barrier_list_lock);
3192+
3193+ /* Wake up any processes waiting for ISLISTENING requests */
3194+ down(&listenreq_lock);
3195+ list_for_each(blist, &listenreq_list) {
3196+ struct cl_waiting_listen_request *lrequest =
3197+ list_entry(blist, struct cl_waiting_listen_request, list);
3198+
3199+ if (lrequest->waiting)
3200+ wake_up_interruptible(&lrequest->waitq);
3201+ }
3202+ up(&listenreq_lock);
3203+}
3204+
3205+static void free_cluster_sockets(void)
3206+{
3207+ struct list_head *socklist;
3208+ struct cl_comms_socket *sock;
3209+ struct list_head *temp;
3210+
3211+ list_for_each_safe(socklist, temp, &socket_list) {
3212+ sock = list_entry(socklist, struct cl_comms_socket, list);
3213+
3214+ list_del(&sock->list);
3215+ fput(sock->file);
3216+ kfree(sock);
3217+ }
3218+ num_interfaces = 0;
3219+ current_interface = NULL;
3220+}
3221+
3222+/* Tidy up after all the rest of the cluster bits have shut down */
3223+static void node_cleanup(void)
3224+{
3225+ struct list_head *nodelist;
3226+ struct list_head *proclist;
3227+ struct list_head *temp;
3228+ struct list_head *socklist;
3229+ struct list_head *blist;
3230+ struct cl_comms_socket *sock;
3231+ struct kernel_notify_struct *knotify;
3232+
3233+ /* Free list of kernel listeners */
3234+ list_for_each_safe(proclist, temp, &kernel_listener_list) {
3235+ knotify =
3236+ list_entry(proclist, struct kernel_notify_struct, list);
3237+ list_del(&knotify->list);
3238+ kfree(knotify);
3239+ }
3240+
3241+ /* Mark the sockets as busy so they don't get added to the active
3242+ * sockets list in the next few lines of code before we free them */
3243+ list_for_each_safe(socklist, temp, &socket_list) {
3244+ sock = list_entry(socklist, struct cl_comms_socket, list);
3245+
3246+ set_bit(1, &sock->active);
3247+ }
3248+
3249+ /* Tidy the active sockets list */
3250+ list_for_each_safe(socklist, temp, &active_socket_list) {
3251+ sock =
3252+ list_entry(socklist, struct cl_comms_socket, active_list);
3253+ list_del(&sock->active_list);
3254+ }
3255+
3256+ /* Free the memory allocated to cluster nodes */
3257+ free_nodeid_array();
3258+ down(&cluster_members_lock);
3259+ us = NULL;
3260+ list_for_each_safe(nodelist, temp, &cluster_members_list) {
3261+
3262+ struct list_head *addrlist;
3263+ struct list_head *addrtemp;
3264+ struct cluster_node *node;
3265+ struct cluster_node_addr *nodeaddr;
3266+
3267+ node = list_entry(nodelist, struct cluster_node, list);
3268+
3269+ list_for_each_safe(addrlist, addrtemp, &node->addr_list) {
3270+ nodeaddr =
3271+ list_entry(addrlist, struct cluster_node_addr,
3272+ list);
3273+
3274+ list_del(&nodeaddr->list);
3275+ kfree(nodeaddr);
3276+ }
3277+ list_del(&node->list);
3278+ kfree(node->name);
3279+ kfree(node);
3280+ }
3281+ cluster_members = 0;
3282+ up(&cluster_members_lock);
3283+
3284+ /* Free the memory allocated to the outgoing sockets */
3285+ free_cluster_sockets();
3286+
3287+ /* Make sure that all the barriers are deleted */
3288+ down(&barrier_list_lock);
3289+ list_for_each_safe(blist, temp, &barrier_list) {
3290+ struct cl_barrier *barrier =
3291+ list_entry(blist, struct cl_barrier, list);
3292+
3293+ list_del(&barrier->list);
3294+ kfree(barrier);
3295+ }
3296+ up(&barrier_list_lock);
3297+
3298+ kcluster_pid = 0;
3299+ clear_bit(RESEND_NEEDED, &mainloop_flags);
3300+ acks_expected = 0;
3301+}
3302+
3303+/* If "cluster_is_quorate" is 0 then all activity apart from protected ports is
3304+ * blocked. */
3305+void set_quorate(int total_votes)
3306+{
3307+ int quorate;
3308+
3309+ if (get_quorum() > total_votes) {
3310+ quorate = 0;
3311+ }
3312+ else {
3313+ quorate = 1;
3314+ }
3315+
3316+ /* Hide messages during startup state transition */
3317+ if (we_are_a_cluster_member) {
3318+ if (cluster_is_quorate && !quorate)
3319+ printk(KERN_CRIT CMAN_NAME
3320+ ": quorum lost, blocking activity\n");
3321+ if (!cluster_is_quorate && quorate)
3322+ printk(KERN_CRIT CMAN_NAME
3323+ ": quorum regained, resuming activity\n");
3324+ }
3325+ cluster_is_quorate = quorate;
3326+
3327+ /* Wake up any sleeping processes */
3328+ if (cluster_is_quorate) {
3329+ unjam();
3330+ }
3331+
3332+}
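
A worked example of the test above: with five single-vote nodes and the usual simple-majority quorum of three (the exact formula lives in calculate_quorum(), outside this hunk, so the value here is an assumption), losing three members leaves total_votes = 2 < quorum and activity on ports above HIGH_PROTECTED_PORT blocks until two nodes return.

/* Illustrative sketch only - not part of the patch */
#include <stdio.h>

int main(void)
{
	int quorum = 3;		/* assumed: majority of 5 expected votes */
	int total_votes;

	for (total_votes = 5; total_votes >= 1; total_votes--)
		printf("votes=%d -> %s\n", total_votes,
		       quorum > total_votes ? "inquorate" : "quorate");
	return 0;
}
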
3333+
3334+void queue_oob_skb(struct socket *sock, int cmd)
3335+{
3336+ struct sk_buff *skb;
3337+ struct cl_portclosed_oob *oobmsg;
3338+
3339+ skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
3340+ if (!skb)
3341+ return;
3342+
3343+ skb_put(skb, sizeof (*oobmsg));
3344+ oobmsg = (struct cl_portclosed_oob *) skb->data;
3345+ oobmsg->port = 0;
3346+ oobmsg->cmd = cmd;
3347+
3348+	/* There is no remote node associated with this so clear out the
3349+	   whole nodeid field (cb[1]..cb[4]) to avoid any accidents */
3350+	memset(skb->cb, 0, sizeof(int) + 1);
3351+ skb->cb[0] = 0x80;
3352+
3353+ sock_queue_rcv_skb(sock->sk, skb);
3354+}
3355+
3356+/* Notify interested parties that the cluster configuration has changed */
3357+void notify_listeners(void)
3358+{
3359+ struct notify_struct *notify;
3360+ struct list_head *proclist;
3361+ struct list_head *socklist;
3362+ struct list_head *temp;
3363+
3364+ /* Do kernel listeners first */
3365+ notify_kernel_listeners(CLUSTER_RECONFIG, 0);
3366+
3367+ /* Now we deign to tell userspace */
3368+ down(&event_listener_lock);
3369+ list_for_each_safe(proclist, temp, &event_listener_list) {
3370+ notify = list_entry(proclist, struct notify_struct, list);
3371+
3372+ /* If the kill fails then remove the process from the list */
3373+ if (kill_proc(notify->pid, notify->signal, 0) == -ESRCH) {
3374+ list_del(&notify->list);
3375+ kfree(notify);
3376+ }
3377+ }
3378+ up(&event_listener_lock);
3379+
3380+ /* Tell userspace processes which want OOB messages */
3381+ down(&client_socket_lock);
3382+ list_for_each(socklist, &client_socket_list) {
3383+ struct cl_client_socket *csock;
3384+ csock = list_entry(socklist, struct cl_client_socket, list);
3385+ queue_oob_skb(csock->sock, CLUSTER_OOB_MSG_STATECHANGE);
3386+ }
3387+ up(&client_socket_lock);
3388+}
3389+
3390+/* This fills in the list of all addresses for the local node */
3391+void get_local_addresses(struct cluster_node *node)
3392+{
3393+ struct list_head *socklist;
3394+ struct cl_comms_socket *sock;
3395+
3396+ list_for_each(socklist, &socket_list) {
3397+ sock = list_entry(socklist, struct cl_comms_socket, list);
3398+
3399+ if (sock->recv_only) {
3400+ add_node_address(node, (char *) &sock->saddr, address_length);
3401+ }
3402+ }
3403+}
3404+
3405+
3406+static uint16_t generate_cluster_id(char *name)
3407+{
3408+	int i;
3409+	int value = 0;
3410+	int len = strlen(name);
3411+	for (i = 0; i < len; i++) {
3412+ value <<= 1;
3413+ value += name[i];
3414+ }
3415+ return value & 0xFFFF;
3416+}
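
A worked example of the hash: for the name "ab" the id is (0 << 1) + 'a' = 97, then (97 << 1) + 'b' = 292, masked to 292. Equivalently each byte is weighted by a power of two, so long names alias once the high bits shift past bit 15 - acceptable for a discriminator, not a unique key. A check of the arithmetic:

/* Illustrative check only - not part of the patch */
#include <assert.h>

int main(void)
{
	int value = 0;

	value = (value << 1) + 'a';	/* 97 */
	value = (value << 1) + 'b';	/* 292 */
	assert((value & 0xFFFF) == 292);
	return 0;
}
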
3417+
3418+/* Return the next comms socket we can use. */
3419+static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur)
3420+{
3421+ int next;
3422+ struct list_head *socklist;
3423+
3424+ /* Fast path for single interface systems */
3425+ if (num_interfaces <= 1)
3426+ return cur;
3427+
3428+ /* Next number */
3429+ next = cur->number + 1;
3430+ if (next > num_interfaces)
3431+ next = 1;
3432+
3433+ /* Find the socket with this number, I could optimise this by starting
3434+ * at the current i/f but most systems are going to have a small number
3435+ * of them anyway */
3436+ list_for_each(socklist, &socket_list) {
3437+ struct cl_comms_socket *sock;
3438+ sock = list_entry(socklist, struct cl_comms_socket, list);
3439+
3440+ if (!sock->recv_only && sock->number == next)
3441+ return sock;
3442+ }
3443+
3444+ BUG();
3445+ return NULL;
3446+}
3447+
3448+/* MUST be called with the barrier list lock held */
3449+static struct cl_barrier *find_barrier(char *name)
3450+{
3451+ struct list_head *blist;
3452+ struct cl_barrier *bar;
3453+
3454+ list_for_each(blist, &barrier_list) {
3455+ bar = list_entry(blist, struct cl_barrier, list);
3456+
3457+ if (strcmp(name, bar->name) == 0)
3458+ return bar;
3459+ }
3460+ return NULL;
3461+}
3462+
3463+/* Do the stuff we need to do when the barrier has completed phase 1 */
3464+static void check_barrier_complete_phase1(struct cl_barrier *barrier)
3465+{
3466+ if (atomic_read(&barrier->got_nodes) == ((barrier->expected_nodes != 0)
3467+ ? barrier->expected_nodes :
3468+ cluster_members)) {
3469+
3470+ struct cl_barriermsg bmsg;
3471+
3472+ atomic_inc(&barrier->completed_nodes); /* We have completed */
3473+ barrier->phase = 2; /* Wait for complete phase II */
3474+
3475+ /* Send completion message, remember: we are in cnxman context
3476+ * and must not block */
3477+ bmsg.cmd = CLUSTER_CMD_BARRIER;
3478+ bmsg.subcmd = BARRIER_COMPLETE;
3479+ bmsg.flags = 0;
3480+ strcpy(bmsg.name, barrier->name);
3481+
3482+ P_BARRIER("Sending COMPLETE for %s\n", barrier->name);
3483+ queue_message((char *) &bmsg, sizeof (bmsg), NULL, 0, 0);
3484+ }
3485+}
3486+
3487+/* Do the stuff we need to do when the barrier has been reached */
3488+/* Return 1 if we deleted the barrier */
3489+static int check_barrier_complete_phase2(struct cl_barrier *barrier, int status)
3490+{
3491+ spin_lock_irq(&barrier->phase2_spinlock);
3492+
3493+ if (barrier->state != BARRIER_STATE_COMPLETE &&
3494+ (status == -ETIMEDOUT ||
3495+ atomic_read(&barrier->completed_nodes) ==
3496+ ((barrier->expected_nodes != 0)
3497+ ? barrier->expected_nodes : cluster_members))) {
3498+
3499+ if (status == 0 && barrier->timeout)
3500+ del_timer(&barrier->timer);
3501+ barrier->endreason = status;
3502+
3503+ /* Wake up listener */
3504+ if (barrier->state == BARRIER_STATE_WAITING) {
3505+ wake_up_interruptible(&barrier->waitq);
3506+ }
3507+ else {
3508+ /* Additional tasks we have to do if the user was not
3509+ * waiting... */
3510+ /* Call the callback */
3511+ if (barrier->callback) {
3512+ barrier->callback(barrier->name, 0);
3513+ barrier->callback = NULL;
3514+ }
3515+ /* Remove it if it's AUTO-DELETE */
3516+ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
3517+ list_del(&barrier->list);
3518+ spin_unlock_irq(&barrier->phase2_spinlock);
3519+ kfree(barrier);
3520+ return 1;
3521+ }
3522+ }
3523+ barrier->state = BARRIER_STATE_COMPLETE;
3524+ }
3525+ spin_unlock_irq(&barrier->phase2_spinlock);
3526+ return 0;
3527+}
3528+
3529+/* Called if a barrier timeout happens */
3530+static void barrier_timer_fn(unsigned long arg)
3531+{
3532+ struct cl_barrier *barrier = (struct cl_barrier *) arg;
3533+
3534+	/* Ignore any further messages, they are too late. */
3535+ barrier->phase = 0;
3536+
3537+ /* and cause it to timeout */
3538+ check_barrier_complete_phase2(barrier, -ETIMEDOUT);
3539+}
3540+
3541+/* Process BARRIER messages from other nodes */
3542+static void process_barrier_msg(struct cl_barriermsg *msg,
3543+ struct cluster_node *node)
3544+{
3545+ struct cl_barrier *barrier;
3546+
3547+ down(&barrier_list_lock);
3548+ barrier = find_barrier(msg->name);
3549+ up(&barrier_list_lock);
3550+
3551+	/* Ignore other people's messages; in_transition() is needed here so
3552+	 * that joining nodes will see their barrier messages before
3553+	 * we_are_a_cluster_member is set */
3554+ if (!we_are_a_cluster_member && !in_transition())
3555+ return;
3556+ if (!barrier)
3557+ return;
3558+
3559+ P_BARRIER("Got %d for %s, from node %s\n", msg->subcmd, msg->name,
3560+ node ? node->name : "unknown");
3561+
3562+ switch (msg->subcmd) {
3563+ case BARRIER_WAIT:
3564+ down(&barrier->lock);
3565+ if (barrier->phase == 0)
3566+ barrier->phase = 1;
3567+
3568+ if (barrier->phase == 1) {
3569+ atomic_inc(&barrier->got_nodes);
3570+ check_barrier_complete_phase1(barrier);
3571+ }
3572+ else {
3573+			printk(KERN_WARNING CMAN_NAME
3574+			       ": got WAIT for barrier %s not in phase 1 (%d)\n",
3575+			       msg->name, barrier->phase);
3576+		}
3578+ up(&barrier->lock);
3579+ break;
3580+
3581+ case BARRIER_COMPLETE:
3582+ down(&barrier->lock);
3583+ atomic_inc(&barrier->completed_nodes);
3584+
3585+ /* First node to get all the WAIT messages sends COMPLETE, so
3586+ * we all complete */
3587+ if (barrier->phase == 1) {
3588+ atomic_set(&barrier->got_nodes,
3589+ barrier->expected_nodes);
3590+ check_barrier_complete_phase1(barrier);
3591+ }
3592+
3593+ if (barrier->phase == 2) {
3594+ /* If it was deleted (ret==1) then no need to unlock
3595+ * the mutex */
3596+ if (check_barrier_complete_phase2(barrier, 0) == 1)
3597+ return;
3598+ }
3599+ up(&barrier->lock);
3600+ break;
3601+ }
3602+}
3603+
3604+/* In-kernel membership API */
3605+int kcl_add_callback(void (*callback) (kcl_callback_reason, long arg))
3606+{
3607+ struct kernel_notify_struct *notify;
3608+
3609+ notify = kmalloc(sizeof (struct kernel_notify_struct), GFP_KERNEL);
3610+ if (!notify)
3611+ return -ENOMEM;
3612+ notify->callback = callback;
3613+
3614+ down(&kernel_listener_lock);
3615+ list_add(&notify->list, &kernel_listener_list);
3616+ up(&kernel_listener_lock);
3617+
3618+ return 0;
3619+}
3620+
3621+int kcl_remove_callback(void (*callback) (kcl_callback_reason, long arg))
3622+{
3623+ struct list_head *calllist;
3624+ struct list_head *temp;
3625+ struct kernel_notify_struct *notify;
3626+
3627+ down(&kernel_listener_lock);
3628+ list_for_each_safe(calllist, temp, &kernel_listener_list) {
3629+ notify = list_entry(calllist, struct kernel_notify_struct, list);
3630+ if (notify->callback == callback){
3631+ list_del(&notify->list);
3632+ kfree(notify);
3633+ up(&kernel_listener_lock);
3634+ return 0;
3635+ }
3636+ }
3637+ up(&kernel_listener_lock);
3638+ return -EINVAL;
3639+}
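
A kernel user of this callback API registers at module init and removes at exit. A sketch; the reason codes LEAVING and CLUSTER_RECONFIG are the ones passed to notify_kernel_listeners() in this file, and any others are assumptions.

/* Hypothetical in-kernel sketch - not part of the patch */
static void my_cluster_event(kcl_callback_reason reason, long arg)
{
	switch (reason) {
	case CLUSTER_RECONFIG:
		printk(KERN_INFO "my_svc: cluster configuration changed\n");
		break;
	case LEAVING:
		printk(KERN_INFO "my_svc: this node is leaving the cluster\n");
		break;
	default:
		break;
	}
}

static int __init my_init(void)
{
	return kcl_add_callback(my_cluster_event);
}

static void __exit my_exit(void)
{
	kcl_remove_callback(my_cluster_event);
}
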
3640+
3641+/* Return quorate status */
3642+int kcl_is_quorate(void)
3643+{
3644+ return cluster_is_quorate;
3645+}
3646+
3647+/* Return the address list for a node */
3648+struct list_head *kcl_get_node_addresses(int nodeid)
3649+{
3650+ struct cluster_node *node = find_node_by_nodeid(nodeid);
3651+
3652+ if (node)
3653+ return &node->addr_list;
3654+ else
3655+ return NULL;
3656+}
3657+
3658+static void copy_to_kclnode(struct cluster_node *node,
3659+ struct kcl_cluster_node *knode)
3660+{
3661+ strcpy(knode->name, node->name);
3662+ knode->size = sizeof (struct kcl_cluster_node);
3663+ knode->votes = node->votes;
3664+ knode->state = node->state;
3665+ knode->node_id = node->node_id;
3666+ knode->us = node->us;
3667+ knode->leave_reason = node->leave_reason;
3668+ knode->incarnation = node->incarnation;
3669+}
3670+
3671+/* Return the info for a node given its address. If addr is NULL then return
3672+ * OUR info */
3673+int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
3674+ struct kcl_cluster_node *n)
3675+{
3676+ struct cluster_node *node;
3677+
3678+ /* They want us */
3679+ if (addr == NULL) {
3680+ node = us;
3681+ }
3682+ else {
3683+ node = find_node_by_addr(addr, addr_len);
3684+ if (!node)
3685+ return -1;
3686+ }
3687+
3688+ /* Copy to user's buffer */
3689+ copy_to_kclnode(node, n);
3690+ return 0;
3691+}
3692+
3693+int kcl_get_node_by_name(unsigned char *name, struct kcl_cluster_node *n)
3694+{
3695+ struct cluster_node *node;
3696+
3697+ /* They want us */
3698+ if (name == NULL) {
3699+ node = us;
3700+ if (node == NULL)
3701+ return -1;
3702+ }
3703+ else {
3704+ node = find_node_by_name(name);
3705+ if (!node)
3706+ return -1;
3707+ }
3708+
3709+ /* Copy to user's buffer */
3710+ copy_to_kclnode(node, n);
3711+ return 0;
3712+}
3713+
3714+/* As above but by node id. MUCH faster */
3715+int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n)
3716+{
3717+ struct cluster_node *node;
3718+
3719+ /* They want us */
3720+ if (nodeid == 0) {
3721+ node = us;
3722+ if (node == NULL)
3723+ return -1;
3724+ }
3725+ else {
3726+ node = find_node_by_nodeid(nodeid);
3727+ if (!node)
3728+ return -1;
3729+ }
3730+
3731+ /* Copy to user's buffer */
3732+ copy_to_kclnode(node, n);
3733+ return 0;
3734+}
3735+
3736+/* Return a list of all cluster members ever */
3737+int kcl_get_all_members(struct list_head *list)
3738+{
3739+ struct list_head *nodelist;
3740+ struct cluster_node *node;
3741+ struct kcl_cluster_node *newnode;
3742+ int num_nodes = 0;
3743+
3744+ down(&cluster_members_lock);
3745+ list_for_each(nodelist, &cluster_members_list) {
3746+ if (list) {
3747+ node = list_entry(nodelist, struct cluster_node, list);
3748+ newnode =
3749+ kmalloc(sizeof (struct kcl_cluster_node),
3750+ GFP_KERNEL);
3751+ if (newnode) {
3752+ copy_to_kclnode(node, newnode);
3753+ list_add(&newnode->list, list);
3754+ num_nodes++;
3755+ }
3756+ }
3757+ else {
3758+ num_nodes++;
3759+ }
3760+ }
3761+ up(&cluster_members_lock);
3762+
3763+ return num_nodes;
3764+}
3765+
3766+/* Return a list of cluster members */
3767+int kcl_get_members(struct list_head *list)
3768+{
3769+ struct list_head *nodelist;
3770+ struct cluster_node *node;
3771+ struct kcl_cluster_node *newnode;
3772+ int num_nodes = 0;
3773+
3774+ down(&cluster_members_lock);
3775+ list_for_each(nodelist, &cluster_members_list) {
3776+ node = list_entry(nodelist, struct cluster_node, list);
3777+
3778+ if (node->state == NODESTATE_MEMBER) {
3779+ if (list) {
3780+ newnode =
3781+ kmalloc(sizeof (struct kcl_cluster_node),
3782+ GFP_KERNEL);
3783+ if (newnode) {
3784+ copy_to_kclnode(node, newnode);
3785+ list_add(&newnode->list, list);
3786+ num_nodes++;
3787+ }
3788+ }
3789+ else {
3790+ num_nodes++;
3791+ }
3792+ }
3793+ }
3794+ up(&cluster_members_lock);
3795+
3796+ return num_nodes;
3797+}
3798+
3799+/* Copy current members' nodeids into buffer */
3800+int kcl_get_member_ids(uint32_t *idbuf, int size)
3801+{
3802+ struct list_head *nodelist;
3803+ struct cluster_node *node;
3804+ int num_nodes = 0;
3805+
3806+ down(&cluster_members_lock);
3807+ list_for_each(nodelist, &cluster_members_list) {
3808+ node = list_entry(nodelist, struct cluster_node, list);
3809+
3810+ if (node->state == NODESTATE_MEMBER) {
3811+ if (idbuf && size) {
3812+ idbuf[num_nodes] = node->node_id;
3813+ num_nodes++;
3814+ size--;
3815+ }
3816+ else {
3817+ num_nodes++;
3818+ }
3819+ }
3820+ }
3821+ up(&cluster_members_lock);
3822+
3823+ return num_nodes;
3824+}
3825+
3826+/* Barrier API */
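+/*
+ * A barrier is registered with an expected node count, optionally tuned
+ * with kcl_barrier_setattr(), then waited on with kcl_barrier_wait(),
+ * which blocks until every expected node arrives, the timeout fires or
+ * the cluster membership changes. A completed barrier is removed with
+ * kcl_barrier_delete(), or automatically if BARRIER_ATTR_AUTODELETE is
+ * set.
+ */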
3827+int kcl_barrier_register(char *name, unsigned int flags, unsigned int nodes)
3828+{
3829+ struct cl_barrier *barrier;
3830+
3831+ /* We are not joined to a cluster */
3832+ if (!we_are_a_cluster_member)
3833+ return -ENOTCONN;
3834+
3835+ /* Must have a valid name */
3836+ if (name == NULL || strlen(name) > MAX_BARRIER_NAME_LEN - 1)
3837+ return -EINVAL;
3838+
3839+ /* We don't do this yet */
3840+ if (flags & BARRIER_ATTR_MULTISTEP)
3841+ return -ENOTSUPP;
3842+
3843+ down(&barrier_list_lock);
3844+
3845+ /* See if it already exists */
3846+ if ((barrier = find_barrier(name))) {
3847+ up(&barrier_list_lock);
3848+ if (nodes != barrier->expected_nodes) {
3849+ printk(KERN_WARNING CMAN_NAME
3850+ ": Barrier registration failed for '%s', expected nodes=%d, requested=%d\n",
3851+ name, barrier->expected_nodes, nodes);
3853+ return -EINVAL;
3854+ }
3855+ else
3856+ return 0;
3857+ }
3858+
3859+ /* Build a new struct and add it to the list */
3860+ barrier = kmalloc(sizeof (struct cl_barrier), GFP_KERNEL);
3861+ if (barrier == NULL) {
3862+ up(&barrier_list_lock);
3863+ return -ENOMEM;
3864+ }
3865+ memset(barrier, 0, sizeof (*barrier));
3866+
3867+ strcpy(barrier->name, name);
3868+ barrier->flags = flags;
3869+ barrier->expected_nodes = nodes;
3870+ atomic_set(&barrier->got_nodes, 0);
3871+ atomic_set(&barrier->completed_nodes, 0);
3872+ barrier->endreason = 0;
3873+ barrier->registered_nodes = 1;
3874+	spin_lock_init(&barrier->phase2_spinlock);
3875+	barrier->state = BARRIER_STATE_INACTIVE;
3876+	init_MUTEX(&barrier->lock);
+	init_waitqueue_head(&barrier->waitq);
3877+
3878+ list_add(&barrier->list, &barrier_list);
3879+ up(&barrier_list_lock);
3880+
3881+ return 0;
3882+}
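+
+/* A sketch of typical in-kernel usage (cf wait_for_completion_barrier()
+ * in membership.c; the name and counts here are only illustrative):
+ *
+ *	kcl_barrier_register("TRANSITION.42", 0, cluster_members);
+ *	kcl_barrier_setattr("TRANSITION.42", BARRIER_SETATTR_TIMEOUT, 30);
+ *	status = kcl_barrier_wait("TRANSITION.42");
+ *	kcl_barrier_delete("TRANSITION.42");
+ */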
3883+
3884+static int barrier_setattr_enabled(struct cl_barrier *barrier,
3885+ unsigned int attr, unsigned long arg)
3886+{
3887+ int status;
3888+
3889+ /* Can't disable a barrier */
3890+ if (!arg) {
3891+ up(&barrier->lock);
3892+ return -EINVAL;
3893+ }
3894+
3895+ /* We need to send WAIT now because the user may not
3896+ * actually call kcl_barrier_wait() */
3897+ if (!barrier->waitsent) {
3898+ struct cl_barriermsg bmsg;
3899+
3900+ /* Send it to the rest of the cluster */
3901+ bmsg.cmd = CLUSTER_CMD_BARRIER;
3902+ bmsg.subcmd = BARRIER_WAIT;
3903+ strcpy(bmsg.name, barrier->name);
3904+
3905+ barrier->waitsent = 1;
3906+ barrier->phase = 1;
3907+
3908+ atomic_inc(&barrier->got_nodes);
3909+
3910+ /* Start the timer if one was wanted */
3911+ if (barrier->timeout) {
3912+ init_timer(&barrier->timer);
3913+ barrier->timer.function = barrier_timer_fn;
3914+ barrier->timer.data = (long) barrier;
3915+ mod_timer(&barrier->timer, jiffies + (barrier->timeout * HZ));
3916+ }
3917+
3918+ /* Barrier WAIT and COMPLETE messages are
3919+ * always queued - that way they always get
3920+ * sent out in the right order. If we don't do
3921+ * this then one can get sent out in the
3922+ * context of the user process and the other in
3923+ * cnxman and COMPLETE may /just/ slide in
3924+			 * before WAIT if it's in the queue
3925+ */
3926+		P_BARRIER("Sending WAIT for %s\n", barrier->name);
3927+ status = queue_message(&bmsg, sizeof (bmsg), NULL, 0, 0);
3928+ if (status < 0) {
3929+ up(&barrier->lock);
3930+ return status;
3931+ }
3932+
3933+ /* It might have been reached now */
3934+ if (barrier
3935+ && barrier->state != BARRIER_STATE_COMPLETE
3936+ && barrier->phase == 1)
3937+ check_barrier_complete_phase1(barrier);
3938+ }
3939+ if (barrier && barrier->state == BARRIER_STATE_COMPLETE) {
3940+ up(&barrier->lock);
3941+ return barrier->endreason;
3942+ }
3943+ up(&barrier->lock);
3944+	return 0;	/* Nothing to propagate */
3945+}
3946+
3947+int kcl_barrier_setattr(char *name, unsigned int attr, unsigned long arg)
3948+{
3949+ struct cl_barrier *barrier;
3950+
3951+ /* See if it already exists */
3952+ down(&barrier_list_lock);
3953+ if (!(barrier = find_barrier(name))) {
3954+ up(&barrier_list_lock);
3955+ return -ENOENT;
3956+ }
3957+ up(&barrier_list_lock);
3958+
3959+ down(&barrier->lock);
3960+ if (barrier->state == BARRIER_STATE_COMPLETE) {
3961+ up(&barrier->lock);
3962+ return 0;
3963+ }
3964+
3965+ switch (attr) {
3966+ case BARRIER_SETATTR_AUTODELETE:
3967+ if (arg)
3968+ barrier->flags |= BARRIER_ATTR_AUTODELETE;
3969+ else
3970+ barrier->flags &= ~BARRIER_ATTR_AUTODELETE;
3971+ up(&barrier->lock);
3972+		return 0;
3974+
3975+ case BARRIER_SETATTR_TIMEOUT:
3976+		/* Can only change the timeout of an inactive barrier */
3977+ if (barrier->state == BARRIER_STATE_WAITING
3978+ || barrier->waitsent) {
3979+ up(&barrier->lock);
3980+ return -EINVAL;
3981+ }
3982+ barrier->timeout = arg;
3983+ up(&barrier->lock);
3984+ return 0;
3985+
3986+ case BARRIER_SETATTR_MULTISTEP:
3987+ up(&barrier->lock);
3988+ return -ENOTSUPP;
3989+
3990+ case BARRIER_SETATTR_ENABLED:
3991+ return barrier_setattr_enabled(barrier, attr, arg);
3992+
3993+ case BARRIER_SETATTR_NODES:
3994+ /* Can only change the expected node count of an inactive
3995+ * barrier */
3996+		if (barrier->state == BARRIER_STATE_WAITING
3997+		    || barrier->waitsent) {
+			up(&barrier->lock);
3998+			return -EINVAL;
+		}
3999+ barrier->expected_nodes = arg;
4000+ break;
4001+
4002+ case BARRIER_SETATTR_CALLBACK:
4003+		if (barrier->state == BARRIER_STATE_WAITING
4004+		    || barrier->waitsent) {
+			up(&barrier->lock);
4005+			return -EINVAL;
+		}
4006+ barrier->callback = (void (*)(char *, int)) arg;
4007+ up(&barrier->lock);
4008+		return 0;	/* Don't propagate this to other nodes */
4009+ }
4010+
4011+ up(&barrier->lock);
4012+ return 0;
4013+}
4014+
4015+int kcl_barrier_delete(char *name)
4016+{
4017+ struct cl_barrier *barrier;
4018+
4019+ down(&barrier_list_lock);
4020+ /* See if it exists */
4021+ if (!(barrier = find_barrier(name))) {
4022+ up(&barrier_list_lock);
4023+ return -ENOENT;
4024+ }
4025+
4026+ /* Delete it */
4027+ list_del(&barrier->list);
4028+ kfree(barrier);
4029+
4030+ up(&barrier_list_lock);
4031+
4032+ return 0;
4033+}
4034+
4035+int kcl_barrier_cancel(char *name)
4036+{
4037+ struct cl_barrier *barrier;
4038+
4039+ /* See if it exists */
4040+ down(&barrier_list_lock);
4041+ if (!(barrier = find_barrier(name))) {
4042+ up(&barrier_list_lock);
4043+ return -ENOENT;
4044+ }
4045+ down(&barrier->lock);
4046+
4047+ barrier->endreason = -ENOTCONN;
4048+
4049+ if (barrier->callback) {
4050+ barrier->callback(barrier->name, -ECONNRESET);
4051+ barrier->callback = NULL;
4052+ }
4053+
4054+ if (barrier->timeout)
4055+ del_timer(&barrier->timer);
4056+
4057+ /* Remove it if it's AUTO-DELETE */
4058+ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
4059+ list_del(&barrier->list);
4060+ up(&barrier->lock);
4061+ kfree(barrier);
4062+ up(&barrier_list_lock);
4063+ return 0;
4064+ }
4065+
4066+ if (barrier->state == BARRIER_STATE_WAITING)
4067+ wake_up_interruptible(&barrier->waitq);
4068+
4069+ up(&barrier->lock);
4070+ up(&barrier_list_lock);
4071+ return 0;
4072+}
4073+
4074+int kcl_barrier_wait(char *name)
4075+{
4076+ struct cl_barrier *barrier;
4077+ int ret;
4078+
4079+ if (!atomic_read(&cnxman_running))
4080+ return -ENOTCONN;
4081+
4082+ /* Enable it */
4083+ kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, 1L);
4084+
4085+ down(&barrier_list_lock);
4086+
4087+ /* See if it still exists - enable may have deleted it! */
4088+ if (!(barrier = find_barrier(name))) {
4089+ up(&barrier_list_lock);
4090+ return -ENOENT;
4091+ }
4092+
4093+ down(&barrier->lock);
4094+
4095+ up(&barrier_list_lock);
4096+
4097+ /* If it has already completed then return the status */
4098+ if (barrier->state == BARRIER_STATE_COMPLETE) {
4099+ up(&barrier->lock);
4100+ return barrier->endreason;
4101+ }
4102+
4103+ barrier->state = BARRIER_STATE_WAITING;
4104+
4105+ /* Have we all reached the barrier? */
4106+ while (atomic_read(&barrier->completed_nodes) !=
4107+ ((barrier->expected_nodes == 0)
4108+ ? cluster_members : barrier->expected_nodes)
4109+ && barrier->endreason == 0) {
4110+
4111+ wait_queue_t wq;
4112+
4113+ init_waitqueue_entry(&wq, current);
4115+
4116+		/* Wait for them all */
4117+ set_task_state(current, TASK_INTERRUPTIBLE);
4118+ add_wait_queue(&barrier->waitq, &wq);
4119+
4120+		if (atomic_read(&barrier->completed_nodes) !=
4121+		    ((barrier->expected_nodes == 0)
4122+		     ? cluster_members : barrier->expected_nodes)
4123+		    && barrier->endreason == 0) {
4124+ up(&barrier->lock);
4125+ schedule();
4126+ down(&barrier->lock);
4127+ }
4128+
4129+ remove_wait_queue(&barrier->waitq, &wq);
4130+ set_task_state(current, TASK_RUNNING);
4131+
4132+ if (signal_pending(current)) {
4133+ barrier->endreason = -EINTR;
4134+ break;
4135+ }
4136+ }
4137+ barrier->state = BARRIER_STATE_INACTIVE;
4138+
4139+ if (barrier->timeout)
4140+ del_timer(&barrier->timer);
4141+
4142+ /* Barrier has been reached on all nodes, call the callback */
4143+ if (barrier->callback) {
4144+ barrier->callback(barrier->name, barrier->endreason);
4145+ barrier->callback = NULL;
4146+ }
4147+
4148+ atomic_set(&barrier->got_nodes, 0);
4149+
4150+ /* Return the reason we were woken */
4151+ ret = barrier->endreason;
4152+
4153+ /* Remove it if it's AUTO-DELETE */
4154+ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
4155+ down(&barrier_list_lock);
4156+ list_del(&barrier->list);
4157+ up(&barrier_list_lock);
4158+ up(&barrier->lock);
4159+ kfree(barrier);
4160+ }
4161+ else {
4162+ up(&barrier->lock);
4163+ }
4164+
4165+	/* Were we woken because this node left the cluster? */
4166+ if (!atomic_read(&cnxman_running))
4167+ ret = -ENOTCONN;
4168+
4169+ return ret;
4170+}
4171+
4172+/* This is called from membership services when a node has left the cluster -
4173+ * we signal all waiting barriers with -ESRCH so they know to do something
4174+ * else. If the expected number of nodes was left at 0 then we compare the
4175+ * number of nodes registered at the barrier with the new cluster membership
4176+ * and return 0 (success) when they match */
4177+void check_barrier_returns()
4178+{
4179+ struct list_head *blist;
4180+ struct list_head *llist;
4181+ struct cl_barrier *barrier;
4182+ int status = 0;
4183+
4184+ down(&barrier_list_lock);
4185+ list_for_each(blist, &barrier_list) {
4186+ barrier = list_entry(blist, struct cl_barrier, list);
4187+
4188+ if (barrier->waitsent) {
4189+ int wakeit = 0;
4190+
4191+ /* Check for a dynamic member barrier */
4192+ if (barrier->expected_nodes == 0) {
4193+ if (barrier->registered_nodes ==
4194+ cluster_members) {
4195+ status = 0;
4196+ wakeit = 1;
4197+ }
4198+ }
4199+ else {
4200+ status = -ESRCH;
4201+ wakeit = 1;
4202+ }
4203+
4204+ /* Do we need to tell the barrier? */
4205+ if (wakeit) {
4206+ if (barrier->state == BARRIER_STATE_WAITING) {
4207+ barrier->endreason = status;
4208+ wake_up_interruptible(&barrier->waitq);
4209+ }
4210+ else {
4211+ if (barrier->callback) {
4212+ barrier->callback(barrier->name,
4213+ status);
4214+ }
4215+ }
4216+ }
4217+ }
4218+ }
4219+ up(&barrier_list_lock);
4220+
4221+ /* Part 2 check for outstanding listen requests for dead nodes and
4222+ * cancel them */
4223+ down(&listenreq_lock);
4224+ list_for_each(llist, &listenreq_list) {
4225+ struct cl_waiting_listen_request *lrequest =
4226+ list_entry(llist, struct cl_waiting_listen_request, list);
4227+ struct cluster_node *node =
4228+ find_node_by_nodeid(lrequest->nodeid);
4229+
4230+ if (node && node->state != NODESTATE_MEMBER) {
4231+ lrequest->result = -ENOTCONN;
4232+ lrequest->waiting = 0;
4233+ wake_up_interruptible(&lrequest->waitq);
4234+ }
4235+ }
4236+ up(&listenreq_lock);
4237+}
4238+
4239+int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen)
4240+{
4241+ struct temp_node *tn;
4242+ int err = 1; /* true */
4243+#ifdef DEBUG_COMMS
4244+ char buf[MAX_ADDR_PRINTED_LEN];
4245+#endif
4246+
4247+ down(&tempnode_lock);
4248+
4249+ list_for_each_entry(tn, &tempnode_list, list) {
4250+ if (tn->nodeid == nodeid) {
4251+ memcpy(addr, tn->addr, tn->addrlen);
4252+ *addrlen = tn->addrlen;
4253+			P_COMMS("get_addr_from_temp_nodeid. id %d:\n: %s\n",
4254+ tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
4255+
4256+ goto out;
4257+ }
4258+ }
4259+ err = 0;
4260+
4261+ out:
4262+ up(&tempnode_lock);
4263+ return err;
4264+}
4265+
4266+/* Create a new temporary node ID. This list will only ever be very small
4267+ (usually only 1 item) but I can't take the risk that someone won't try to
4268+ boot 128 nodes all at exactly the same time. */
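+/* Temporary IDs count down from -1, so they can never collide with real
+ node IDs, which are always positive. */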
4269+int new_temp_nodeid(char *addr, int addrlen)
4270+{
4271+ struct temp_node *tn;
4272+ int err = -1;
4273+ int try_nodeid = 0;
4274+#ifdef DEBUG_COMMS
4275+ char buf[MAX_ADDR_PRINTED_LEN];
4276+#endif
4277+
4278+ P_COMMS("new_temp_nodeid needed for\n: %s\n",
4279+ print_addr(addr, addrlen, buf));
4280+
4281+ down(&tempnode_lock);
4282+
4283+ /* First see if we already know about this node */
4284+ list_for_each_entry(tn, &tempnode_list, list) {
4285+
4286+ P_COMMS("new_temp_nodeid list. id %d:\n: %s\n",
4287+ tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
4288+
4289+ /* We're already in here... */
4290+ if (tn->addrlen == addrlen &&
4291+ memcmp(tn->addr, addr, addrlen) == 0) {
4292+ P_COMMS("reused temp node ID %d\n", tn->nodeid);
4293+ err = tn->nodeid;
4294+ goto out;
4295+ }
4296+ }
4297+
4298+ /* Nope, OK, invent a suitable number */
4299+ retry:
4300+ try_nodeid -= 1;
4301+ list_for_each_entry(tn, &tempnode_list, list) {
4302+
4303+ if (tn->nodeid == try_nodeid)
4304+ goto retry;
4305+ }
4306+
4307+ tn = kmalloc(sizeof(struct temp_node), GFP_KERNEL);
4308+ if (!tn)
4309+ goto out;
4310+
4311+ memcpy(tn->addr, addr, addrlen);
4312+ tn->addrlen = addrlen;
4313+ tn->nodeid = try_nodeid;
4314+ list_add_tail(&tn->list, &tempnode_list);
4315+ err = try_nodeid;
4316+ P_COMMS("new temp nodeid = %d\n", try_nodeid);
4317+ out:
4318+ up(&tempnode_lock);
4319+ return err;
4320+}
4321+
4322+static int is_valid_temp_nodeid(int nodeid)
4323+{
4324+ struct temp_node *tn;
4325+ int err = 1; /* true */
4326+
4327+ down(&tempnode_lock);
4328+
4329+ list_for_each_entry(tn, &tempnode_list, list) {
4330+ if (tn->nodeid == nodeid)
4331+ goto out;
4332+ }
4333+ err = 0;
4334+
4335+ out:
4336+ P_COMMS("is_valid_temp_nodeid. %d = %d\n", nodeid, err);
4337+ up(&tempnode_lock);
4338+ return err;
4339+}
4340+
4341+/* TODO: This needs to clean the list more fully of
4342+ nodes that are now full members but we did not master the transition */
4343+void remove_temp_nodeid(int nodeid)
4344+{
4345+ struct temp_node *tn;
4346+ struct temp_node *tmp;
4347+
4348+ down(&tempnode_lock);
4349+
4350+ list_for_each_entry_safe(tn, tmp, &tempnode_list, list) {
4351+ if (nodeid == tn->nodeid) {
4352+ list_del(&tn->list);
4353+ kfree(tn);
4354+ up(&tempnode_lock);
4355+ return;
4356+ }
4357+ }
4358+
4359+ up(&tempnode_lock);
4360+}
4361+
4362+/* Quorum device functions */
4363+int kcl_register_quorum_device(char *name, int votes)
4364+{
4365+ if (quorum_device)
4366+ return -EBUSY;
4367+
4368+ if (find_node_by_name(name))
4369+ return -EINVAL;
4370+
4371+ quorum_device = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
4372+ if (!quorum_device)
4373+ return -ENOMEM;
4374+ memset(quorum_device, 0, sizeof (struct cluster_node));
4375+
4376+ quorum_device->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
4377+ if (!quorum_device->name) {
4378+ kfree(quorum_device);
4379+ quorum_device = NULL;
4380+ return -ENOMEM;
4381+ }
4382+
4383+ strcpy(quorum_device->name, name);
4384+ quorum_device->votes = votes;
4385+ quorum_device->state = NODESTATE_DEAD;
4386+
4387+ /* Keep this list valid so it doesn't confuse other code */
4388+ INIT_LIST_HEAD(&quorum_device->addr_list);
4389+
4390+ return 0;
4391+}
4392+
4393+int kcl_unregister_quorum_device(void)
4394+{
4395+ if (!quorum_device)
4396+ return -EINVAL;
4397+ if (quorum_device->state == NODESTATE_MEMBER)
4398+ return -EINVAL;
4399+
4400+	kfree(quorum_device->name);
+	kfree(quorum_device);
+	quorum_device = NULL;
4401+
4402+ return 0;
4403+}
4404+
4405+int kcl_quorum_device_available(int yesno)
4406+{
4407+ if (!quorum_device)
4408+ return -EINVAL;
4409+
4410+ if (yesno) {
4411+ quorum_device->last_hello = jiffies;
4412+ if (quorum_device->state == NODESTATE_DEAD) {
4413+ quorum_device->state = NODESTATE_MEMBER;
4414+ recalculate_quorum(0);
4415+ }
4416+ }
4417+ else {
4418+ if (quorum_device->state == NODESTATE_MEMBER) {
4419+ quorum_device->state = NODESTATE_DEAD;
4420+ recalculate_quorum(0);
4421+ }
4422+ }
4423+
4424+ return 0;
4425+}
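+
+/* A sketch of how a quorum-device driver might drive this API (the device
+ * polling and poll_my_device() below are hypothetical, not part of cman):
+ *
+ *	kcl_register_quorum_device("qdisk", 1);
+ *	for (;;) {
+ *		int alive = poll_my_device();
+ *		kcl_quorum_device_available(alive);
+ *		...
+ *	}
+ *	kcl_unregister_quorum_device();
+ */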
4426+
4427+/* APIs for cluster ref counting. */
4428+int kcl_addref_cluster()
4429+{
4430+ int ret = -ENOTCONN;
4431+
4432+ if (!atomic_read(&cnxman_running))
4433+ goto addref_ret;
4434+
4435+ if (try_module_get(THIS_MODULE)) {
4436+ atomic_inc(&use_count);
4437+ ret = 0;
4438+ }
4439+
4440+ addref_ret:
4441+ return ret;
4442+}
4443+
4444+int kcl_releaseref_cluster()
4445+{
4446+ if (!atomic_read(&cnxman_running))
4447+ return -ENOTCONN;
4448+ atomic_dec(&use_count);
4449+ module_put(THIS_MODULE);
4450+ return 0;
4451+}
4452+
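+/* Note: *cname is kmalloc'd here; the caller owns it and must kfree() it. */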
4453+int kcl_cluster_name(char **cname)
4454+{
4455+ char *name;
4456+
4457+ name = kmalloc(strlen(cluster_name) + 1, GFP_KERNEL);
4458+ if (!name)
4459+ return -ENOMEM;
4460+
4461+ strncpy(name, cluster_name, strlen(cluster_name)+1);
4462+ *cname = name;
4463+ return 0;
4464+}
4465+
4466+int kcl_get_current_interface(void)
4467+{
4468+ return current_interface->number;
4469+}
4470+
4471+/* Socket registration stuff */
4472+static struct net_proto_family cl_family_ops = {
4473+ .family = AF_CLUSTER,
4474+ .create = cl_create
4475+};
4476+
4477+static struct proto_ops cl_proto_ops = {
4478+ .family = AF_CLUSTER,
4479+
4480+ .release = cl_release,
4481+ .bind = cl_bind,
4482+ .connect = sock_no_connect,
4483+ .socketpair = sock_no_socketpair,
4484+ .accept = sock_no_accept,
4485+ .getname = cl_getname,
4486+ .poll = cl_poll,
4487+ .ioctl = cl_ioctl,
4488+ .listen = sock_no_listen,
4489+ .shutdown = cl_shutdown,
4490+ .setsockopt = cl_setsockopt,
4491+ .getsockopt = cl_getsockopt,
4492+ .sendmsg = cl_sendmsg,
4493+ .recvmsg = cl_recvmsg,
4494+ .mmap = sock_no_mmap,
4495+ .sendpage = sock_no_sendpage,
4496+};
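+
+/* AF_CLUSTER is datagram-only: every connection-oriented operation is
+ * wired to the sock_no_* stubs above. */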
4497+
4498+#ifdef MODULE
4499+MODULE_DESCRIPTION("Cluster Connection and Service Manager");
4500+MODULE_AUTHOR("Red Hat, Inc");
4501+MODULE_LICENSE("GPL");
4502+#endif
4503+
4504+static int __init cluster_init(void)
4505+{
4506+	printk(KERN_INFO "CMAN %s (built %s %s) installed\n",
4507+ CMAN_RELEASE_NAME, __DATE__, __TIME__);
4508+
4509+ /* allocate our sock slab cache */
4510+ cluster_sk_cachep = kmem_cache_create("cluster_sock",
4511+ sizeof (struct cluster_sock), 0,
4512+ SLAB_HWCACHE_ALIGN, 0, 0);
4513+ if (!cluster_sk_cachep) {
4514+ printk(KERN_CRIT
4515+ "cluster_init: Cannot create cluster_sock SLAB cache\n");
4516+		return -1;
4518+	}
4519+
4520+ if (sock_register(&cl_family_ops)) {
4521+ printk(KERN_INFO "Unable to register cluster socket type\n");
4522+ kmem_cache_destroy(cluster_sk_cachep);
4523+ return -1;
4524+ }
4525+
4526+
4527+#ifdef CONFIG_PROC_FS
4528+ create_proc_entries();
4529+#endif
4530+
4531+ init_MUTEX(&start_thread_sem);
4532+ init_MUTEX(&send_lock);
4533+ init_MUTEX(&barrier_list_lock);
4534+ init_MUTEX(&cluster_members_lock);
4535+ init_MUTEX(&port_array_lock);
4536+ init_MUTEX(&messages_list_lock);
4537+ init_MUTEX(&listenreq_lock);
4538+ init_MUTEX(&client_socket_lock);
4539+ init_MUTEX(&new_dead_node_lock);
4540+ init_MUTEX(&event_listener_lock);
4541+ init_MUTEX(&kernel_listener_lock);
4542+ init_MUTEX(&tempnode_lock);
4543+ spin_lock_init(&active_socket_lock);
4544+ init_timer(&ack_timer);
4545+
4546+ INIT_LIST_HEAD(&event_listener_list);
4547+ INIT_LIST_HEAD(&kernel_listener_list);
4548+ INIT_LIST_HEAD(&socket_list);
4549+ INIT_LIST_HEAD(&client_socket_list);
4550+ INIT_LIST_HEAD(&active_socket_list);
4551+ INIT_LIST_HEAD(&barrier_list);
4552+ INIT_LIST_HEAD(&messages_list);
4553+ INIT_LIST_HEAD(&listenreq_list);
4554+ INIT_LIST_HEAD(&cluster_members_list);
4555+ INIT_LIST_HEAD(&new_dead_node_list);
4556+ INIT_LIST_HEAD(&tempnode_list);
4557+
4558+ atomic_set(&cnxman_running, 0);
4559+
4560+ sm_init();
4561+
4562+ return 0;
4563+}
4564+
4565+static void __exit cluster_exit(void)
4566+{
4567+#ifdef CONFIG_PROC_FS
4568+ cleanup_proc_entries();
4569+#endif
4570+
4571+ sock_unregister(AF_CLUSTER);
4572+ kmem_cache_destroy(cluster_sk_cachep);
4573+}
4574+
4575+module_init(cluster_init);
4576+module_exit(cluster_exit);
4577+
4578+EXPORT_SYMBOL(kcl_sendmsg);
4579+EXPORT_SYMBOL(kcl_register_read_callback);
4580+EXPORT_SYMBOL(kcl_add_callback);
4581+EXPORT_SYMBOL(kcl_remove_callback);
4582+EXPORT_SYMBOL(kcl_get_members);
4583+EXPORT_SYMBOL(kcl_get_member_ids);
4584+EXPORT_SYMBOL(kcl_get_all_members);
4585+EXPORT_SYMBOL(kcl_is_quorate);
4586+EXPORT_SYMBOL(kcl_get_node_by_addr);
4587+EXPORT_SYMBOL(kcl_get_node_by_name);
4588+EXPORT_SYMBOL(kcl_get_node_by_nodeid);
4589+EXPORT_SYMBOL(kcl_get_node_addresses);
4590+EXPORT_SYMBOL(kcl_addref_cluster);
4591+EXPORT_SYMBOL(kcl_releaseref_cluster);
4592+EXPORT_SYMBOL(kcl_cluster_name);
4593+
4594+EXPORT_SYMBOL(kcl_barrier_register);
4595+EXPORT_SYMBOL(kcl_barrier_setattr);
4596+EXPORT_SYMBOL(kcl_barrier_delete);
4597+EXPORT_SYMBOL(kcl_barrier_wait);
4598+EXPORT_SYMBOL(kcl_barrier_cancel);
4599+
4600+EXPORT_SYMBOL(kcl_register_quorum_device);
4601+EXPORT_SYMBOL(kcl_unregister_quorum_device);
4602+EXPORT_SYMBOL(kcl_quorum_device_available);
4603+
4604+EXPORT_SYMBOL(kcl_register_service);
4605+EXPORT_SYMBOL(kcl_unregister_service);
4606+EXPORT_SYMBOL(kcl_join_service);
4607+EXPORT_SYMBOL(kcl_leave_service);
4608+EXPORT_SYMBOL(kcl_global_service_id);
4609+EXPORT_SYMBOL(kcl_start_done);
4610+EXPORT_SYMBOL(kcl_get_services);
4611+EXPORT_SYMBOL(kcl_get_current_interface);
4612+
4613+/*
4614+ * Overrides for Emacs so that we follow Linus's tabbing style.
4615+ * Emacs will notice this stuff at the end of the file and automatically
4616+ * adjust the settings for this buffer only. This must remain at the end
4617+ * of the file.
4618+ * ---------------------------------------------------------------------------
4619+ * Local variables:
4620+ * c-file-style: "linux"
4621+ * End:
4622+ */
4623diff -urN linux-orig/cluster/cman/config.c linux-patched/cluster/cman/config.c
4624--- linux-orig/cluster/cman/config.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 4625+++ linux-patched/cluster/cman/config.c 2004-06-29 20:07:50.000000000 +0800
4bf12011 4626@@ -0,0 +1,46 @@
4627+/******************************************************************************
4628+*******************************************************************************
4629+**
4630+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4631+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4632+**
4633+** This copyrighted material is made available to anyone wishing to use,
4634+** modify, copy, or redistribute it subject to the terms and conditions
4635+** of the GNU General Public License v.2.
4636+**
4637+*******************************************************************************
4638+******************************************************************************/
4639+
4640+#include "config.h"
4641+
4642+/* Config file defaults */
4643+
4644+#define DEFAULT_JOIN_WAIT_TIME 11 /* Time to wait while sending JOINREQ
4645+ * messages. Should be at least twice
4646+ * the HELLO timer */
4647+#define DEFAULT_JOIN_TIMEOUT 30 /* How long we wait after getting a
4648+ * JOINACK before regarding the join
4649+ * as failed */
4650+#define DEFAULT_HELLO_TIMER 5 /* Period between HELLO messages */
4651+#define DEFAULT_DEADNODE_TIMER 21 /* If we don't get a message from a
4652+ * node in this period, kill it */
4653+#define DEFAULT_TRANSITION_TIMER 15 /* Maximum time a state transition
4654+ * should take */
4655+#define DEFAULT_JOINCONF_TIMER 5 /* Time allowed to a node to respond to
4656+ * a JOINCONF message */
4657+#define DEFAULT_MAX_NODES 128 /* Max allowed nodes */
4658+#define DEFAULT_TRANSITION_RESTARTS 10 /* Maximum number of transition
4659+ * restarts before we die */
4660+#define DEFAULT_SM_DEBUG_SIZE 256 /* Size in bytes of SM debug buffer */
4661+
4662+struct config_info cman_config = {
4663+ .joinwait_timeout = DEFAULT_JOIN_WAIT_TIME,
4664+ .joinconf_timeout = DEFAULT_JOINCONF_TIMER,
4665+ .join_timeout = DEFAULT_JOIN_TIMEOUT,
4666+ .hello_timer = DEFAULT_HELLO_TIMER,
4667+ .deadnode_timeout = DEFAULT_DEADNODE_TIMER,
4668+ .transition_timeout = DEFAULT_TRANSITION_TIMER,
4669+ .transition_restarts = DEFAULT_TRANSITION_RESTARTS,
4670+ .max_nodes = DEFAULT_MAX_NODES,
4671+ .sm_debug_size = DEFAULT_SM_DEBUG_SIZE,
4672+};
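+
+/* The timer/timeout values above are plain seconds; consumers scale them
+ * by HZ, e.g. jiffies + cman_config.hello_timer * HZ. */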
4673diff -urN linux-orig/cluster/cman/config.h linux-patched/cluster/cman/config.h
4674--- linux-orig/cluster/cman/config.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 4675+++ linux-patched/cluster/cman/config.h 2004-06-29 20:07:50.000000000 +0800
4bf12011 4676@@ -0,0 +1,31 @@
4677+/******************************************************************************
4678+*******************************************************************************
4679+**
4680+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4681+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4682+**
4683+** This copyrighted material is made available to anyone wishing to use,
4684+** modify, copy, or redistribute it subject to the terms and conditions
4685+** of the GNU General Public License v.2.
4686+**
4687+*******************************************************************************
4688+******************************************************************************/
4689+
4690+#ifndef __CONFIG_DOT_H__
4691+#define __CONFIG_DOT_H__
4692+
4693+struct config_info {
4694+ int joinwait_timeout;
4695+ int joinconf_timeout;
4696+ int join_timeout;
4697+ int hello_timer;
4698+ int deadnode_timeout;
4699+ int transition_timeout;
4700+ int transition_restarts;
4701+ int max_nodes;
4702+ int sm_debug_size;
4703+};
4704+
4705+extern struct config_info cman_config;
4706+
4707+#endif /* __CONFIG_DOT_H__ */
4708diff -urN linux-orig/cluster/cman/kjoin.c linux-patched/cluster/cman/kjoin.c
4709--- linux-orig/cluster/cman/kjoin.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 4710+++ linux-patched/cluster/cman/kjoin.c 2004-06-29 20:07:50.000000000 +0800
4bf12011 4711@@ -0,0 +1,238 @@
4712+/******************************************************************************
4713+*******************************************************************************
4714+**
4715+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4716+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4717+**
4718+** This copyrighted material is made available to anyone wishing to use,
4719+** modify, copy, or redistribute it subject to the terms and conditions
4720+** of the GNU General Public License v.2.
4721+**
4722+*******************************************************************************
4723+******************************************************************************/
4724+
4725+#include <linux/socket.h>
4726+#include <net/sock.h>
4727+#include <linux/list.h>
4728+#include <cluster/cnxman.h>
4729+#include <linux/in.h>
4730+
4731+#include "cnxman-private.h"
4732+
4733+static struct socket *mcast_sock;
4734+static struct socket *recv_sock;
4735+static struct socket *cluster_sock;
4736+
4737+extern short cluster_id;
4738+extern int join_count;
4739+extern struct semaphore join_count_lock;
4740+extern atomic_t cnxman_running;
4741+
4742+int kcl_join_cluster(struct cl_join_cluster_info *join_info)
4743+{
4744+ int result;
4745+ int one = 1, error;
4746+ unsigned int ipaddr = join_info->ipaddr, brdaddr = join_info->brdaddr;
4747+ unsigned short port = join_info->port;
4748+ mm_segment_t fs;
4749+ struct sockaddr_in saddr;
4750+ struct kcl_multicast_sock mcast_info;
4751+
4752+ down(&join_count_lock);
4753+ if (atomic_read(&cnxman_running))
4754+ {
4755+ error = 0;
4756+ if (join_info->cluster_id == cluster_id)
4757+ join_count++;
4758+ else
4759+ error = -EINVAL;
4760+ up(&join_count_lock);
4761+ return error;
4762+ }
4763+ up(&join_count_lock);
4764+
4765+ result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &mcast_sock);
4766+ if (result < 0)
4767+ {
4768+ printk(KERN_ERR CMAN_NAME ": Can't create Multicast socket\n");
4769+ return result;
4770+ }
4771+
4772+ result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &recv_sock);
4773+ if (result < 0)
4774+ {
4775+		printk(KERN_ERR CMAN_NAME ": Can't create Receive socket\n");
+		sock_release(mcast_sock);
4776+		return result;
4777+	}
4778+
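+	/* sock_setsockopt() expects a user-space pointer, so temporarily
+	 * widen the address limit to let it accept our kernel buffer */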
4779+ fs = get_fs();
4780+ set_fs(get_ds());
4781+
4782+ if ((error = sock_setsockopt(mcast_sock, SOL_SOCKET, SO_BROADCAST,
4783+ (void *) &one, sizeof (int))))
4784+ {
4785+ set_fs(fs);
4786+ printk("Error %d Setting master socket to SO_BROADCAST\n",
4787+ error);
4788+		sock_release(mcast_sock);
+		sock_release(recv_sock);
4789+		return -1;
4790+ }
4791+ set_fs(fs);
4792+
4793+ /* Bind the multicast socket */
4794+ saddr.sin_family = AF_INET;
4795+ saddr.sin_port = htons(port);
4796+ saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
4797+ result =
4798+ mcast_sock->ops->bind(mcast_sock, (struct sockaddr *) &saddr,
4799+ sizeof (saddr));
4800+ if (result < 0)
4801+ {
4802+ printk(KERN_ERR CMAN_NAME ": Can't bind multicast socket\n");
4803+ sock_release(mcast_sock);
4804+ sock_release(recv_sock);
4805+ return result;
4806+ }
4807+
4808+ /* Bind the receive socket to our IP address */
4809+ saddr.sin_family = AF_INET;
4810+ saddr.sin_port = htons(port);
4811+ saddr.sin_addr.s_addr = cpu_to_be32(ipaddr);
4812+ result =
4813+ recv_sock->ops->bind(recv_sock, (struct sockaddr *) &saddr,
4814+ sizeof (saddr));
4815+ if (result < 0)
4816+ {
4817+ printk(KERN_ERR CMAN_NAME ": Can't bind receive socket\n");
4818+ sock_release(mcast_sock);
4819+ sock_release(recv_sock);
4820+ return result;
4821+ }
4822+
4823+ /* Create the cluster master socket */
4824+ result =
4825+ sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER, &cluster_sock);
4826+ if (result < 0)
4827+ {
4828+ printk(KERN_ERR CMAN_NAME
4829+ ": Can't create cluster master socket\n");
4830+ sock_release(mcast_sock);
4831+ sock_release(recv_sock);
4832+ return result;
4833+ }
4834+
4835+ /* This is the broadcast transmit address */
4836+ saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
4837+
4838+ /* Pass the multicast socket to kernel space */
4839+ mcast_info.sock = mcast_sock;
4840+ mcast_info.number = 1;
4841+
4842+ fs = get_fs();
4843+ set_fs(get_ds());
4844+
4845+ if ((error = cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
4846+ KCL_SET_MULTICAST,
4847+ (void *) &mcast_info,
4848+ sizeof (mcast_info))))
4849+ {
4850+ set_fs(fs);
4851+ printk(CMAN_NAME
4852+ ": Unable to pass multicast socket to cnxman, %d\n",
4853+ error);
4854+ sock_release(mcast_sock);
4855+ sock_release(recv_sock);
4856+ sock_release(cluster_sock);
4857+ return -1;
4858+ }
4859+
4860+ mcast_info.sock = recv_sock;
4861+ if ((error =
4862+ cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
4863+ KCL_SET_RCVONLY,
4864+ (void *) &mcast_info,
4865+ sizeof (mcast_info))))
4866+ {
4867+ set_fs(fs);
4868+ printk(CMAN_NAME
4869+ ": Unable to pass receive socket to cnxman, %d\n",
4870+ error);
4871+ sock_release(mcast_sock);
4872+ sock_release(recv_sock);
4873+ sock_release(cluster_sock);
4874+ return -1;
4875+ }
4876+
4877+ /* This setsockopt expects usermode variables */
4878+
4879+ if (cluster_sock->ops->
4880+ setsockopt(cluster_sock, CLPROTO_MASTER, CLU_JOIN_CLUSTER,
4881+ (void *) join_info,
4882+ sizeof (struct cl_join_cluster_info)))
4883+
4884+ {
4885+ set_fs(fs);
4886+ printk(CMAN_NAME ": Unable to join cluster\n");
4887+ sock_release(mcast_sock);
4888+ sock_release(recv_sock);
4889+ sock_release(cluster_sock);
4890+ return -1;
4891+ }
4892+ set_fs(fs);
4893+
4894+ return 0;
4895+}
4896+
4897+int kcl_leave_cluster(int remove)
4898+{
4899+ mm_segment_t fs;
4900+ int rem = remove;
4901+ int ret = 0;
4902+ struct socket *shutdown_sock = cluster_sock;
4903+
4904+ cluster_sock = NULL;
4905+
4906+ if (!shutdown_sock)
4907+ {
4908+ /* Create the cluster master socket */
4909+ int result =
4910+ sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER,
4911+ &shutdown_sock);
4912+ if (result < 0)
4913+ {
4914+ printk(KERN_ERR CMAN_NAME
4915+ ": Can't create cluster master socket\n");
4916+ sock_release(mcast_sock);
4917+ sock_release(recv_sock);
4918+ return result;
4919+ }
4920+ }
4921+
4922+ fs = get_fs();
4923+ set_fs(get_ds());
4924+
4925+ if ((ret =
4926+ shutdown_sock->ops->setsockopt(shutdown_sock, CLPROTO_MASTER,
4927+ CLU_LEAVE_CLUSTER, (void *) &rem,
4928+ sizeof (int))))
4929+ {
4930+ printk(KERN_ERR CMAN_NAME ": Unable to leave cluster, %d\n",
4931+ ret);
4932+ }
4933+ set_fs(fs);
4934+
4935+ sock_release(shutdown_sock);
4936+
4937+ return ret;
4938+}
4939+
4940+/*
4941+ * Overrides for Emacs so that we follow Linus's tabbing style.
4942+ * Emacs will notice this stuff at the end of the file and automatically
4943+ * adjust the settings for this buffer only. This must remain at the end
4944+ * of the file.
4945+ * ---------------------------------------------------------------------------
4946+ * Local variables:
4947+ * c-file-style: "linux"
4948+ * End:
4949+ */
4950diff -urN linux-orig/cluster/cman/membership.c linux-patched/cluster/cman/membership.c
4951--- linux-orig/cluster/cman/membership.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 4952+++ linux-patched/cluster/cman/membership.c 2004-06-29 20:07:50.000000000 +0800
4bf12011 4953@@ -0,0 +1,3069 @@
4954+/******************************************************************************
4955+*******************************************************************************
4956+**
4957+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4958+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4959+**
4960+** This copyrighted material is made available to anyone wishing to use,
4961+** modify, copy, or redistribute it subject to the terms and conditions
4962+** of the GNU General Public License v.2.
4963+**
4964+*******************************************************************************
4965+******************************************************************************/
4966+
4967+#include <linux/socket.h>
4968+#include <net/sock.h>
4969+#include <linux/slab.h>
4970+#include <linux/spinlock.h>
4971+#include <linux/vmalloc.h>
4972+#include <asm/uaccess.h>
4973+#include <linux/list.h>
4974+#include <cluster/cnxman.h>
4975+
4976+#include "cnxman-private.h"
4977+#include "config.h"
4978+#include "sm_control.h"
4979+
4980+#ifndef TRUE
4981+#define TRUE 1
4982+#endif
4983+
4984+/* Barrier name for membership transitions. %d is the cluster generation number
4985+ */
4986+#define MEMBERSHIP_BARRIER_NAME "TRANSITION.%d"
4987+
4988+/* Variables also used by connection manager */
4989+struct list_head cluster_members_list;
4990+struct semaphore cluster_members_lock;
4991+int cluster_members; /* Number of ACTIVE members, not a count of
4992+ * nodes in the list */
4993+int we_are_a_cluster_member = 0;
4994+int cluster_is_quorate;
4995+int quit_threads = 0;
4996+struct task_struct *membership_task;
4997+struct cluster_node *us;
4998+
4999+static struct task_struct *hello_task;
5000+static struct semaphore hello_task_lock;
5001+
5002+/* Variables that belong to the connection manager */
5003+extern wait_queue_head_t cnxman_waitq;
5004+extern struct completion member_thread_comp;
5005+extern struct cluster_node *quorum_device;
5006+extern unsigned short two_node;
5007+extern char cluster_name[];
5008+extern unsigned int config_version;
5009+extern unsigned int address_length;
5010+
5011+static struct socket *mem_socket;
5012+static pid_t kcluster_pid;
5013+
5014+static char iobuf[MAX_CLUSTER_MESSAGE];
5015+static char scratchbuf[MAX_CLUSTER_MESSAGE + 100];
5016+
5017+/* Our node name, usually system_utsname.nodename, but can be overridden */
5018+char nodename[MAX_CLUSTER_MEMBER_NAME_LEN + 1];
5019+
5020+static spinlock_t members_by_nodeid_lock;
5021+static int sizeof_members_array = 0; /* Can dynamically increase (vmalloc
5022+ * permitting) */
5023+static struct cluster_node **members_by_nodeid;
5024+
5025+#define MEMBER_INCREMENT_SIZE 10
5026+
5027+static int votes = 1; /* Votes this node has */
5028+static int expected_votes = 1; /* Total expected votes in the cluster */
5029+static unsigned int quorum; /* Quorum, fewer votes than this and we stop
5030+ * work */
5031+static int leavereason; /* Saved for the duration of a state transition */
5032+static int transitionreason; /* Reason this transition was initiated */
5033+static unsigned int highest_nodeid; /* Highest node ID known to the cluster */
5034+static struct timer_list transition_timer; /* Kicks in if the transition
5035+ * doesn't complete in a
5036+ * reasonable time */
5037+static struct timer_list hello_timer; /* Timer to send HELLOs on */
5038+static unsigned long join_time; /* The time that we got our JOIN-ACK */
5039+static unsigned long start_time; /* The time that we were started */
5040+static int joinconf_count; /* Number of JOINCONF messages we have sent to
5041+ * a new node */
5042+static unsigned long wake_flags;/* Reason we were woken */
5043+
5044+/* Flags in above */
5045+#define WAKE_FLAG_DEADNODE 1
5046+#define WAKE_FLAG_TRANSTIMER 2
5047+
5048+/* The time the transition finished */
5049+static unsigned long transition_end_time;
5050+
5051+/* A list of nodes that cnxman tells us are dead. I hope this never has more
5052+ * than one element in it but I can't take that chance. Only non-static so it
5053+ * can be initialised at module load. */
5054+struct list_head new_dead_node_list;
5055+struct semaphore new_dead_node_lock;
5056+
5057+static int do_membership_packet(struct msghdr *msg, int len);
5058+static int do_process_joinreq(struct msghdr *msg, int len);
5059+static int do_process_joinack(struct msghdr *msg, int len);
5060+static int do_process_joinconf(struct msghdr *msg, int len);
5061+static int do_process_leave(struct msghdr *msg, int len);
5062+static int do_process_hello(struct msghdr *msg, int len);
5063+static int do_process_kill(struct msghdr *msg, int len);
5064+static int do_process_reconfig(struct msghdr *msg, int len);
5065+static int do_process_starttrans(struct msghdr *msg, int len);
5066+static int do_process_masterview(struct msghdr *msg, int len);
5067+static int do_process_endtrans(struct msghdr *msg, int len);
5068+static int do_process_viewack(struct msghdr *msg, int len);
5069+static int do_process_startack(struct msghdr *msg, int len);
5070+static int do_process_newcluster(struct msghdr *msg, int len);
5071+static int do_process_nominate(struct msghdr *msg, int len);
5072+static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
5073+ unsigned int flags);
5074+static int send_joinreq(struct sockaddr_cl *addr, int addr_len);
5075+static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id);
5076+static int send_hello(void);
5077+static int send_master_hello(void);
5078+static int send_newcluster(void);
5079+static int end_transition(void);
5080+static int dispatch_messages(struct socket *mem_socket);
5081+static void check_for_dead_nodes(void);
5082+static void confirm_joiner(void);
5083+static void reset_hello_time(void);
5084+static int add_us(void);
5085+static int send_joinconf(void);
5086+static int init_membership_services(void);
5087+static int elect_master(struct cluster_node **);
5088+static void trans_timer_expired(unsigned long arg);
5089+static void hello_timer_expired(unsigned long arg);
5090+static void join_or_form_cluster(void);
5091+static int do_timer_wakeup(void);
5092+static int start_transition(unsigned char reason, struct cluster_node *node);
5093+int send_leave(unsigned char);
5094+int send_reconfigure(int, unsigned int);
5095+
5096+#ifdef DEBUG_MEMB
5097+static char *msgname(int msg);
5098+static int debug_sendmsg(struct socket *sock, void *buf, int size,
5099+ struct sockaddr_cl *caddr, int addr_len,
5100+ unsigned int flags)
5101+{
5102+ P_MEMB("%ld: sending %s, len=%d\n", jiffies, msgname(((char *) buf)[0]),
5103+ size);
5104+ return kcl_sendmsg(sock, buf, size, caddr, addr_len, flags);
5105+}
5106+
5107+#define kcl_sendmsg debug_sendmsg
5108+#endif
5109+
5110+/* State of the node */
5111+static enum { STARTING, JOINING, JOINWAIT, JOINACK, TRANSITION,
5112+ TRANSITION_COMPLETE, MEMBER, REJECTED, LEFT_CLUSTER, MASTER
5113+} node_state = STARTING;
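+
+/* Rough lifecycle: a node starts in STARTING and either forms a cluster
+ * itself (straight to MEMBER) or negotiates through the JOIN* states with
+ * an existing one; TRANSITION and TRANSITION_COMPLETE cover membership
+ * changes, MASTER marks the node driving a transition (with the sub-states
+ * below), and REJECTED/LEFT_CLUSTER are terminal. */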
5114+
5115+/* Sub-state when we are MASTER */
5116+static enum { MASTER_START, MASTER_COLLECT, MASTER_CONFIRM,
5117+ MASTER_COMPLETE } master_state;
5118+
5119+/* Number of responses collected while we are the master controlling a state transition */
5120+static int responses_collected;
5121+static int responses_expected;
5122+
5123+/* Current cluster generation number */
5124+static int cluster_generation = 1;
5125+
5126+/* When another node initiates a transition, store its pointer in here so
5127+ * we can check for other nodes trying to spoof us */
5128+static struct cluster_node *master_node = NULL;
5129+
5130+/* The node currently wanting to join us */
5131+static struct cluster_node *joining_node = NULL;
5132+static int joining_temp_nodeid = 0;
5133+
5134+/* Last time a HELLO message was sent */
5135+unsigned long last_hello = 0;
5136+
5137+/* When we got our JOINWAIT or NEWCLUSTER */
5138+unsigned long joinwait_time = 0;
5139+
5140+/* Number of times a transition has restarted when we were master */
5141+int transition_restarts = 0;
5142+
5143+/* Variables used by the master to collect cluster status during a transition */
5144+static int agreeing_nodes = 0;
5145+static int dissenting_nodes = 0;
5146+static uint8_t *node_opinion = NULL;
5147+#define OPINION_AGREE 1
5148+#define OPINION_DISAGREE 2
5149+
5150+/* Set node id of a node, also add it to the members array and expand the array
5151+ * if necessary */
5152+static inline void set_nodeid(struct cluster_node *node, int nodeid)
5153+{
5154+ if (!nodeid)
5155+ return;
5156+
5157+ node->node_id = nodeid;
5158+	if (nodeid >= sizeof_members_array) {
5159+		int new_size = nodeid + MEMBER_INCREMENT_SIZE;
5160+		struct cluster_node **new_array =
5161+		    vmalloc(new_size * sizeof (struct cluster_node *));
5162+		if (new_array) {
5163+			spin_lock(&members_by_nodeid_lock);
5164+			memcpy(new_array, members_by_nodeid,
5165+			       sizeof_members_array *
5166+			       sizeof (struct cluster_node *));
5167+			memset(&new_array[sizeof_members_array], 0,
5168+			       (new_size - sizeof_members_array) *
5169+			       sizeof (struct cluster_node *));
5170+ vfree(members_by_nodeid);
5171+ members_by_nodeid = new_array;
5172+ sizeof_members_array = new_size;
5173+ spin_unlock(&members_by_nodeid_lock);
5174+ }
5175+ else {
5176+ panic("No memory for more nodes");
5177+ }
5178+ }
5179+ notify_kernel_listeners(NEWNODE, (long) nodeid);
5180+
5181+ spin_lock(&members_by_nodeid_lock);
5182+ members_by_nodeid[nodeid] = node;
5183+ spin_unlock(&members_by_nodeid_lock);
5184+}
5185+
5186+static int hello_kthread(void *unused)
5187+{
5188+ struct task_struct *tsk = current;
5189+ sigset_t tmpsig;
5190+
5191+ daemonize("cman_hbeat");
5192+
5193+ /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
5194+ siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
5195+ sigprocmask(SIG_BLOCK, &tmpsig, NULL);
5196+
5197+ down(&hello_task_lock);
5198+ hello_task = tsk;
5199+ up(&hello_task_lock);
5200+
5201+ set_user_nice(current, -6);
5202+
5203+ while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
5204+ send_hello();
5205+
5206+ /* Scan the nodes list for dead nodes */
5207+ if (node_state == MEMBER)
5208+ check_for_dead_nodes();
5209+
5210+ set_task_state(current, TASK_INTERRUPTIBLE);
5211+ schedule();
5212+ set_task_state(current, TASK_RUNNING);
5213+ }
5214+ down(&hello_task_lock);
5215+ hello_task = NULL;
5216+ up(&hello_task_lock);
5217+ P_MEMB("heartbeat closing down\n");
5218+ return 0;
5219+}
5220+
5221+/* This is the membership "daemon". A client of cnxman (but symbiotic with it)
5222+ * that keeps track of and controls cluster membership. */
5223+static int membership_kthread(void *unused)
5224+{
5225+ struct task_struct *tsk = current;
5226+ struct socket *tmp_socket;
5227+ sigset_t tmpsig;
5228+
5229+ daemonize("cman_memb");
5230+
5231+ /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
5232+ siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
5233+ sigprocmask(SIG_BLOCK, &tmpsig, NULL);
5234+
5235+ membership_task = tsk;
5236+ set_user_nice(current, -5);
5237+
5238+ /* Open the socket */
5239+ if (init_membership_services())
5240+ return -1;
5241+
5242+ add_us();
5243+ joining_node = us;
5244+
5245+ init_timer(&hello_timer);
5246+ hello_timer.function = hello_timer_expired;
5247+ hello_timer.data = 0L;
5248+
5249+ /* Do joining stuff */
5250+ join_or_form_cluster();
5251+
5252+ transition_end_time = jiffies;
5253+
5254+ /* Main loop */
5255+ while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
5256+
5257+ struct task_struct *tsk = current;
5258+
5259+ DECLARE_WAITQUEUE(wait, tsk);
5260+
5261+ tsk->state = TASK_INTERRUPTIBLE;
5262+ add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5263+
5264+ if (!skb_peek(&mem_socket->sk->sk_receive_queue) &&
5265+ wake_flags == 0) {
5266+ if (node_state == JOINACK ||
5267+ node_state == JOINWAIT)
5268+ schedule_timeout(HZ);
5269+ else
5270+ schedule();
5271+ }
5272+
5273+ tsk->state = TASK_RUNNING;
5274+ remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5275+
5276+ /* Are we being shut down? */
5277+ if (node_state == LEFT_CLUSTER || quit_threads ||
5278+ signal_pending(current))
5279+ break;
5280+
5281+		/* Were we woken by a dead node passed down from cnxman? */
5282+ if (test_and_clear_bit(WAKE_FLAG_DEADNODE, &wake_flags)) {
5283+ struct list_head *nodelist, *tmp;
5284+ struct cl_new_dead_node *deadnode;
5285+
5286+ down(&new_dead_node_lock);
5287+ list_for_each_safe(nodelist, tmp, &new_dead_node_list) {
5288+ deadnode =
5289+ list_entry(nodelist,
5290+ struct cl_new_dead_node, list);
5291+
5292+ if (deadnode->node->state == NODESTATE_MEMBER)
5293+ a_node_just_died(deadnode->node);
5294+ list_del(&deadnode->list);
5295+ kfree(deadnode);
5296+ }
5297+ up(&new_dead_node_lock);
5298+ }
5299+
5300+ /* Process received messages. If dispatch_message() returns an
5301+ * error then we shut down */
5302+ if (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5303+ if (dispatch_messages(mem_socket) < 0)
5304+ goto leave_cluster;
5305+
5306+ }
5307+
5308+ /* Were we woken by the transition timer firing ? */
5309+		/* Were we woken by the transition timer firing? */
5310+ switch (do_timer_wakeup()) {
5311+ case -1:
5312+ continue;
5313+ case 0:
5314+ break;
5315+ case +1:
5316+ goto leave_cluster;
5317+ }
5318+ }
5319+
5320+ /* Got a JOINACK but no JOIN-CONF, start waiting for HELLO
5321+ * messages again */
5322+ if (node_state == JOINACK
5323+ && time_after(jiffies,
5324+ join_time + cman_config.join_timeout * HZ)) {
5325+ P_MEMB
5326+ ("Waited a long time for a join-conf, going back to JOINWAIT state\n");
5327+ node_state = JOINWAIT;
5328+ joinwait_time = jiffies;
5329+ }
5330+
5331+ /* Have we been in joinwait for too long... */
5332+ if (node_state == JOINWAIT
5333+ && time_after(jiffies, joinwait_time +
5334+ cman_config.join_timeout * HZ)) {
5335+ printk(CMAN_NAME
5336+ ": Been in JOINWAIT for too long - giving up\n");
5337+ goto leave_cluster;
5338+ }
5339+ }
5340+
5341+ leave_cluster:
5342+
5343+ /* Wake up the heartbeat thread so it can exit */
5344+ down(&hello_task_lock);
5345+ if (hello_task)
5346+ wake_up_process(hello_task);
5347+ up(&hello_task_lock);
5348+
5349+ if (timer_pending(&hello_timer))
5350+ del_timer(&hello_timer);
5351+
5352+ if (timer_pending(&transition_timer))
5353+ del_timer(&transition_timer);
5354+
5355+ node_state = LEFT_CLUSTER;
5356+ P_MEMB("closing down\n");
5357+ quit_threads = 1; /* force other thread to exit too */
5358+
5359+ /* Close the socket, NULL the pointer first so it doesn't get used
5360+ * by send_leave()
5361+ */
5362+ tmp_socket = mem_socket;
5363+ mem_socket = NULL;
5364+ sock_release(tmp_socket);
5365+ highest_nodeid = 0;
5366+ complete(&member_thread_comp);
5367+ return 0;
5368+}
5369+
5370+/* Things to do in the main thread when the transition timer has woken us.
5371+ * Usually this happens when a transition is taking too long and we need to
5372+ * take remedial action.
5373+ *
5374+ * returns: -1 continue; 0 carry on processing +1 leave cluster; */
5375+static int do_timer_wakeup()
5376+{
5377+ P_MEMB("Timer wakeup - checking for dead master node %ld\n", jiffies);
5378+
5379+ /* Resend JOINCONF if it got lost on the wire */
5380+ if (node_state == MASTER && master_state == MASTER_CONFIRM) {
5381+ mod_timer(&transition_timer,
5382+ jiffies + cman_config.joinconf_timeout * HZ);
5383+ if (++joinconf_count < MAX_RETRIES) {
5384+ P_MEMB("Resending JOINCONF\n");
5385+ send_joinconf();
5386+ }
5387+ else {
5388+ P_MEMB("JOINCONF not acked, cancelling transition\n");
5389+ end_transition();
5390+ }
5391+ return -1;
5392+ }
5393+
5394+ /* A joining node probably died */
5395+ if (cluster_members == 1) {
5396+ end_transition();
5397+ return -1;
5398+ }
5399+
5400+ /* See if the master is still there */
5401+ if (node_state == TRANSITION || node_state == TRANSITION_COMPLETE) {
5402+
5403+ /* If we are in transition and master_node is NULL then we are
5404+ * waiting for ENDTRANS after JOIN-CONF */
5405+ if (!master_node) {
5406+			/* Hmmm. Master died after sending JOINCONF, we'll have
5407+			 * to die as we are in mid-transition */
5408+ printk(KERN_INFO CMAN_NAME
5409+ ": Master died after JOINCONF, we must leave the cluster\n");
5410+ quit_threads = 1;
5411+ return +1;
5412+ }
5413+
5414+		/* No messages from the master - see if it's still there */
5415+ if (master_node->state == NODESTATE_MEMBER) {
5416+ send_master_hello();
5417+ mod_timer(&transition_timer,
5418+ jiffies +
5419+ cman_config.transition_timeout * HZ);
5420+ }
5421+
5422+ /* If the master is dead then elect a new one */
5423+ if (master_node->state == NODESTATE_DEAD) {
5424+
5425+ struct cluster_node *node;
5426+
5427+ P_MEMB("Master node is dead...Election!\n");
5428+ if (elect_master(&node)) {
5429+
5430+ /* We are master now, all kneel */
5431+ start_transition(TRANS_DEADMASTER, master_node);
5432+ }
5433+ else {
5434+ /* Leave the job to someone on more pay */
5435+ master_node = node;
5436+ mod_timer(&transition_timer,
5437+ jiffies +
5438+ cman_config.transition_timeout * HZ);
5439+ }
5440+ }
5441+ }
5442+
5443+ /* If we are the master node then restart the transition */
5444+ if (node_state == MASTER) {
5445+ start_transition(TRANS_RESTART, us);
5446+ }
5447+
5448+ return 0;
5449+}
5450+
5451+static void form_cluster(void)
5452+{
5453+ printk(KERN_INFO CMAN_NAME ": forming a new cluster\n");
5454+ node_state = MEMBER;
5455+ we_are_a_cluster_member = TRUE;
5456+ us->node_id = 1;
5457+ us->state = NODESTATE_MEMBER;
5458+ set_nodeid(us, 1);
5459+ recalculate_quorum(0);
5460+ sm_member_update(cluster_is_quorate);
5461+ send_hello();
5462+ kernel_thread(hello_kthread, NULL, 0);
5463+ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
5464+}
5465+
5466+/* This does the initial JOIN part of the membership process. Actually most of
5467+ * it is done in the message processing routines but this is the main loop that
5468+ * controls it. The side-effect of this routine is "node_state" which tells the
5469+ * real main loop (in the kernel thread routine) what to do next */
5470+static void join_or_form_cluster()
5471+{
5473+
5474+ printk(KERN_INFO CMAN_NAME
5475+ ": Waiting to join or form a Linux-cluster\n");
5476+ join_time = 0;
5477+ start_time = jiffies;
5478+ joinwait_time = jiffies;
5479+ last_hello = 0;
5480+ send_newcluster();
5481+
5482+ /* Listen for a reply */
5483+ do {
5484+ DECLARE_WAITQUEUE(wait, current);
5485+ set_task_state(current, TASK_INTERRUPTIBLE);
5486+ add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5487+
5488+ if (!skb_peek(&mem_socket->sk->sk_receive_queue))
5489+ schedule_timeout((cman_config.joinwait_timeout * HZ) /
5490+ 5);
5491+
5492+ set_task_state(current, TASK_RUNNING);
5493+ remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5494+
5495+ while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5496+ dispatch_messages(mem_socket);
5497+ }
5498+ if (quit_threads)
5499+ node_state = LEFT_CLUSTER;
5500+
5501+ }
5502+ while (time_before(jiffies, start_time + cman_config.joinwait_timeout * HZ) &&
5503+ node_state == STARTING);
5504+
5505+ /* If we didn't hear any HELLO messages then form a new cluster */
5506+ if (node_state == STARTING) {
5507+ form_cluster();
5508+ }
5509+ else
5510+ last_hello = jiffies;
5511+
5512+}
5513+
5514+int start_membership_services(pid_t cluster_pid)
5515+{
5516+ kcluster_pid = cluster_pid;
5517+
5518+ init_timer(&transition_timer);
5519+ transition_timer.function = trans_timer_expired;
5520+ transition_timer.data = 0L;
5521+
5522+ /* Start the thread */
5523+ return kernel_thread(membership_kthread, NULL, 0);
5524+}
5525+
5526+static int init_membership_services()
5527+{
5528+ int result;
5529+ struct sockaddr_cl saddr;
5530+ struct socket *sock;
5531+
5532+ init_MUTEX(&hello_task_lock);
5533+ /* Create a socket to communicate with */
5534+ result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
5535+ if (result < 0) {
5536+ printk(KERN_ERR CMAN_NAME
5537+ ": Can't create cluster socket for membership services\n");
5538+ return result;
5539+ }
5540+ mem_socket = sock;
5541+
5542+ /* Bind to our port */
5543+ saddr.scl_family = AF_CLUSTER;
5544+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5545+ result =
5546+ sock->ops->bind(sock, (struct sockaddr *) &saddr, sizeof (saddr));
5547+ if (result < 0) {
5548+ printk(KERN_ERR CMAN_NAME
5549+ ": Can't bind to cluster membership services port\n");
5550+ sock_release(sock);
5551+ return result;
5552+ }
5553+
5554+ node_state = STARTING;
5555+ return 0;
5556+}
5557+
5558+static int send_joinconf()
5559+{
5560+ struct sockaddr_cl saddr;
5561+ int status;
5562+
5563+ if (joining_temp_nodeid == 0) {
5564+ BUG();
5565+ }
5566+
5567+ master_state = MASTER_CONFIRM;
5568+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5569+ saddr.scl_family = AF_CLUSTER;
5570+ saddr.scl_nodeid = joining_temp_nodeid;
5571+ status = send_cluster_view(CLUSTER_MEM_JOINCONF, &saddr,
5572+ MSG_NOACK);
5573+
5574+ if (status < 0) {
5575+ printk("Error %d sending JOINCONF, aborting transition\n", status);
5576+ end_transition();
5577+ }
5578+ return status;
5579+}
5580+
5581+static int send_joinreq(struct sockaddr_cl *addr, int addr_len)
5582+{
5583+ char *msgbuf = scratchbuf;
5584+ struct list_head *addrlist;
5585+ int ptr = sizeof (struct cl_mem_join_msg);
5586+ unsigned short num_addr = 0;
5587+ struct cluster_node_addr *nodeaddr;
5588+ struct cl_mem_join_msg *msg = (struct cl_mem_join_msg *) msgbuf;
5589+
5590+ msg->cmd = CLUSTER_MEM_JOINREQ;
5591+ msg->votes = votes;
5592+ msg->expected_votes = cpu_to_le32(expected_votes);
5593+ msg->major_version = cpu_to_le32(CNXMAN_MAJOR_VERSION);
5594+ msg->minor_version = cpu_to_le32(CNXMAN_MINOR_VERSION);
5595+ msg->patch_version = cpu_to_le32(CNXMAN_PATCH_VERSION);
5596+ msg->config_version = cpu_to_le32(config_version);
5597+ msg->addr_len = cpu_to_le32(address_length);
5598+ strcpy(msg->clustername, cluster_name);
5599+
5600+ /* Add our addresses */
5601+ list_for_each(addrlist, &us->addr_list) {
5602+ nodeaddr = list_entry(addrlist, struct cluster_node_addr, list);
5603+
5604+ memcpy(msgbuf + ptr, nodeaddr->addr, address_length);
5605+ ptr += address_length;
5606+ num_addr++;
5607+ }
5608+ msg->num_addr = cpu_to_le16(num_addr);
5609+
5610+ /* And our name */
5611+ strcpy(msgbuf + ptr, nodename);
5612+ ptr += strlen(nodename) + 1;
5613+
5614+ return kcl_sendmsg(mem_socket, msgbuf, ptr,
5615+ addr, addr_len, MSG_NOACK);
5616+}
5617+
5618+static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id)
5619+{
5620+ struct cl_mem_startack_msg msg;
5621+
5622+ msg.cmd = CLUSTER_MEM_STARTACK;
5623+ msg.generation = cpu_to_le32(cluster_generation);
5624+ msg.node_id = cpu_to_le32(node_id);
5625+ msg.highest_node_id = cpu_to_le32(get_highest_nodeid());
5626+
5627+ return kcl_sendmsg(mem_socket, &msg, sizeof (msg), addr, addr_len, 0);
5628+}
5629+
5630+static int send_newcluster()
5631+{
5632+ char buf[1];
5633+
5634+ buf[0] = CLUSTER_MEM_NEWCLUSTER;
5635+
5636+ return kcl_sendmsg(mem_socket, buf, 1, NULL, 0,
5637+ MSG_NOACK);
5638+}
5639+
5640+static int send_hello()
5641+{
5642+ struct cl_mem_hello_msg hello_msg;
5643+ int status;
5644+
5645+ hello_msg.cmd = CLUSTER_MEM_HELLO;
5646+ hello_msg.members = cpu_to_le16(cluster_members);
5647+ hello_msg.flags = 0;
5648+ hello_msg.generation = cpu_to_le32(cluster_generation);
5649+
5650+ status =
5651+ kcl_sendmsg(mem_socket, &hello_msg, sizeof (hello_msg), NULL, 0,
5652+ MSG_NOACK | MSG_ALLINT);
5653+
5654+ last_hello = jiffies;
5655+
5656+ return status;
5657+}
5658+
5659+/* This is a special HELLO message that requires an ACK. Clients in transition
5660+ * send these to the master to check it is still alive. If it does not ACK then
5661+ * cnxman will signal it dead and we can restart the transition */
5662+static int send_master_hello(void)
5663+{
5664+ struct cl_mem_hello_msg hello_msg;
5665+ int status;
5666+ struct sockaddr_cl saddr;
5667+
5668+ hello_msg.cmd = CLUSTER_MEM_HELLO;
5669+ hello_msg.members = cpu_to_le16(cluster_members);
5670+ hello_msg.flags = 1;
5671+ hello_msg.generation = cpu_to_le32(cluster_generation);
5672+
5673+ saddr.scl_family = AF_CLUSTER;
5674+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5675+ saddr.scl_nodeid = master_node->node_id;
5676+ status =
5677+ kcl_sendmsg(mem_socket, &hello_msg, sizeof (hello_msg),
5678+ &saddr, sizeof (saddr), 0);
5679+
5680+ last_hello = jiffies;
5681+
5682+ return status;
5683+}
5684+
5685+/* Called when the transition timer has expired, meaning we sent a transition
5686+ * message that was not ACKed */
5687+static void trans_timer_expired(unsigned long arg)
5688+{
5689+ P_MEMB("Transition timer fired %ld\n", jiffies);
5690+
5691+ set_bit(WAKE_FLAG_TRANSTIMER, &wake_flags);
5692+ wake_up_process(membership_task);
5693+}
5694+
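+/* Periodic timer that re-arms itself and prods the hello thread to send
+ * the next heartbeat */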
5695+static void hello_timer_expired(unsigned long arg)
5696+{
5697+ P_MEMB("Hello timer fired %ld\n", jiffies);
5698+
5699+ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
5700+
5701+ if (node_state >= TRANSITION) {
5702+ wake_up_process(hello_task);
5703+ }
5704+}
5705+
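+/* Register and wait on a barrier named after the cluster generation so that
+ * all members finish the state transition together */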
5706+static int wait_for_completion_barrier(void)
5707+{
5708+ int status;
5709+ char barriername[MAX_BARRIER_NAME_LEN];
5710+
5711+ sprintf(barriername, MEMBERSHIP_BARRIER_NAME, cluster_generation);
5712+
5713+ /* Make sure we all complete together */
5714+ P_MEMB("Waiting for completion barrier: %d members\n", cluster_members);
5715+ if ((status =
5716+ kcl_barrier_register(barriername, 0, cluster_members)) < 0) {
5717+		printk(KERN_ERR CMAN_NAME ": Error registering barrier: %d\n", status);
5718+ return -1;
5719+ }
5720+ kcl_barrier_setattr(barriername, BARRIER_SETATTR_TIMEOUT,
5721+ cman_config.transition_timeout);
5722+ status = kcl_barrier_wait(barriername);
5723+ kcl_barrier_delete(barriername);
5724+
5725+ P_MEMB("Completion barrier reached : status = %d\n", status);
5726+ return status;
5727+}
5728+
5729+/* Called at the end of a state transition when we are the master */
5730+static int end_transition(void)
5731+{
5732+ struct cl_mem_endtrans_msg msg;
5733+ int total_votes;
5734+ int status;
5735+
5736+ /* Cancel the timer */
5737+ del_timer(&transition_timer);
5738+
5739+ confirm_joiner();
5740+
5741+ quorum = calculate_quorum(leavereason, 0, &total_votes);
5742+
5743+ msg.cmd = CLUSTER_MEM_ENDTRANS;
5744+ msg.quorum = cpu_to_le32(quorum);
5745+ msg.generation = cpu_to_le32(++cluster_generation);
5746+ msg.total_votes = cpu_to_le32(total_votes);
5747+ if (joining_node && transitionreason == TRANS_NEWNODE) {
5748+ msg.new_node_id = cpu_to_le32(joining_node->node_id);
5749+ }
5750+ else {
5751+ msg.new_node_id = 0;
5752+ }
5753+ status = kcl_sendmsg(mem_socket, &msg, sizeof (msg), NULL, 0, 0);
5754+
5755+ /* When that's all settled down, do the transition completion barrier */
5756+ kcl_wait_for_all_acks();
5757+
5758+ if (wait_for_completion_barrier() != 0) {
5759+ P_MEMB("Barrier timed out - restart\n");
5760+ start_transition(TRANS_RESTART, us);
5761+ return 0;
5762+ }
5763+
5764+ set_quorate(total_votes);
5765+
5766+ notify_listeners();
5767+ reset_hello_time();
5768+
5769+ /* Tell any waiting barriers that we had a transition */
5770+ check_barrier_returns();
5771+
5772+ leavereason = 0;
5773+ node_state = MEMBER;
5774+ transition_end_time = jiffies;
5775+
5776+ sm_member_update(cluster_is_quorate);
5777+
5778+ return 0;
5779+}
5780+
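+/* Broadcast a changed cluster parameter (eg expected votes) to all members */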
5781+int send_reconfigure(int param, unsigned int value)
5782+{
5783+ char msgbuf[66];
5784+ struct cl_mem_reconfig_msg *msg =
5785+ (struct cl_mem_reconfig_msg *) &msgbuf;
5786+
5787+ if (param == RECONFIG_PARAM_EXPECTED_VOTES && expected_votes > value)
5788+ expected_votes = value;
5789+
5790+ msg->cmd = CLUSTER_MEM_RECONFIG;
5791+ msg->param = param;
5792+ msg->value = cpu_to_le32(value);
5793+
5794+	return kcl_sendmsg(mem_socket, msgbuf, sizeof (*msg), NULL, 0, 0);
5795+}
5796+
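+/* Reply to a join request: acktype is OK, NAK or WAIT */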
5797+static int send_joinack(char *addr, int addr_len, unsigned char acktype)
5798+{
5799+ struct cl_mem_joinack_msg msg;
5800+
5801+ msg.cmd = CLUSTER_MEM_JOINACK;
5802+ msg.acktype = acktype;
5803+
5804+ return kcl_sendmsg(mem_socket, &msg, sizeof (msg),
5805+ (struct sockaddr_cl *)addr, addr_len, MSG_NOACK);
5806+}
5807+
5808+/* Only send a leave message to one node in the cluster so that it can master
5809+ * the state transition, otherwise we get a "thundering herd" of potential
5810+ * masters fighting it out */
5811+int send_leave(unsigned char flags)
5812+{
5813+ unsigned char msg[2];
5814+ struct sockaddr_cl saddr;
5815+ struct cluster_node *node = NULL;
5816+ int status;
5817+
5818+ if (!mem_socket)
5819+ return 0;
5820+
5821+ saddr.scl_family = AF_CLUSTER;
5822+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5823+
5824+ /* If we are in transition then use the current master */
5825+ if (node_state == TRANSITION) {
5826+ node = master_node;
5827+ }
5828+ if (!node) {
5829+ /* If we are the master or not in transition then pick a node
5830+ * almost at random */
5831+ struct list_head *nodelist;
5832+
5833+ down(&cluster_members_lock);
5834+ list_for_each(nodelist, &cluster_members_list) {
5835+ node = list_entry(nodelist, struct cluster_node, list);
5836+
5837+ if (node->state == NODESTATE_MEMBER && !node->us)
5838+ break;
5839+ }
5840+ up(&cluster_members_lock);
5841+ }
5842+
5843+	/* If we are the only member of the cluster there is no-one to tell */
5844+ if (node && !node->us) {
5845+ saddr.scl_nodeid = node->node_id;
5846+
5847+ P_MEMB("Sending LEAVE to %s\n", node->name);
5848+ msg[0] = CLUSTER_MEM_LEAVE;
5849+ msg[1] = flags;
5850+ status =
5851+ kcl_sendmsg(mem_socket, msg, 2,
5852+ &saddr, sizeof (saddr),
5853+ MSG_NOACK);
5854+
5855+ if (status < 0)
5856+ return status;
5857+ }
5858+
5859+ /* And exit */
5860+ node_state = LEFT_CLUSTER;
5861+ wake_up_process(membership_task);
5862+ return 0;
5863+}
5864+
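+/* Tell a node to remove itself from the cluster */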
5865+int send_kill(int nodeid)
5866+{
5867+ char killmsg;
5868+ struct sockaddr_cl saddr;
5869+
5870+ killmsg = CLUSTER_MEM_KILL;
5871+
5872+ saddr.scl_family = AF_CLUSTER;
5873+ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5874+ saddr.scl_nodeid = nodeid;
5875+ return kcl_sendmsg(mem_socket, &killmsg, 1, &saddr,
5876+ sizeof (struct sockaddr_cl), MSG_NOACK);
5877+}
5878+
5879+/* Process a message */
5880+static int do_membership_packet(struct msghdr *msg, int len)
5881+{
5882+ int result = -1;
5883+ unsigned char *buf = msg->msg_iov->iov_base;
5884+ struct sockaddr_cl *saddr = msg->msg_name;
5885+ struct cluster_node *node;
5886+
5887+ node = find_node_by_nodeid(saddr->scl_nodeid);
5888+
5889+ P_MEMB("got membership message : %s, from (%d) %s, len = %d\n",
5890+ msgname(*buf), saddr->scl_nodeid, node ? node->name : "unknown", len);
5891+
5892+ switch (*buf) {
5893+ case CLUSTER_MEM_JOINREQ:
5894+ result = do_process_joinreq(msg, len);
5895+ break;
5896+
5897+ case CLUSTER_MEM_LEAVE:
5898+ if (we_are_a_cluster_member)
5899+ result = do_process_leave(msg, len);
5900+ break;
5901+
5902+ case CLUSTER_MEM_HELLO:
5903+ result = do_process_hello(msg, len);
5904+ break;
5905+
5906+ case CLUSTER_MEM_KILL:
5907+ if (we_are_a_cluster_member)
5908+ result = do_process_kill(msg, len);
5909+ break;
5910+
5911+ case CLUSTER_MEM_JOINCONF:
5912+ if (node_state == JOINACK) {
5913+ do_process_joinconf(msg, len);
5914+ }
5915+ break;
5916+
5917+ case CLUSTER_MEM_CONFACK:
5918+ if (node_state == MASTER && master_state == MASTER_CONFIRM) {
5919+ end_transition();
5920+ }
5921+ break;
5922+
5923+ case CLUSTER_MEM_MASTERVIEW:
5924+ if (node_state == TRANSITION)
5925+ do_process_masterview(msg, len);
5926+ break;
5927+
5928+ case CLUSTER_MEM_JOINACK:
5929+ if (node_state == JOINING || node_state == JOINWAIT) {
5930+ do_process_joinack(msg, len);
5931+ }
5932+ break;
5933+ case CLUSTER_MEM_RECONFIG:
5934+ if (we_are_a_cluster_member) {
5935+ do_process_reconfig(msg, len);
5936+ }
5937+ break;
5938+
5939+ case CLUSTER_MEM_STARTTRANS:
5940+ result = do_process_starttrans(msg, len);
5941+ break;
5942+
5943+ case CLUSTER_MEM_ENDTRANS:
5944+ result = do_process_endtrans(msg, len);
5945+ break;
5946+
5947+ case CLUSTER_MEM_VIEWACK:
5948+ result = do_process_viewack(msg, len);
5949+ break;
5950+
5951+ case CLUSTER_MEM_STARTACK:
5952+ if (node_state == MASTER)
5953+ result = do_process_startack(msg, len);
5954+ break;
5955+
5956+ case CLUSTER_MEM_NEWCLUSTER:
5957+ result = do_process_newcluster(msg, len);
5958+ break;
5959+
5960+ case CLUSTER_MEM_NOMINATE:
5961+ if (node_state != MASTER)
5962+ result = do_process_nominate(msg, len);
5963+ break;
5964+
5965+ default:
5966+ printk(KERN_ERR CMAN_NAME
5967+ ": Unknown membership services message %d received\n",
5968+ *buf);
5969+ break;
5970+
5971+ }
5972+ return result;
5973+}
5974+
5975+/* Returns -ve to reject membership of the cluster, 0 to accept membership, +ve
5976+ * to ignore the request (node already joining) */
5977+static int check_duplicate_node(char *name, struct msghdr *msg, int len)
5978+{
5979+ struct cluster_node *node;
5980+ struct sockaddr_cl *saddr = (struct sockaddr_cl *)msg->msg_name;
5981+ char addr[address_length];
5982+ int addrlen;
5983+
5984+ if (strlen(name) >= MAX_CLUSTER_MEMBER_NAME_LEN)
5985+ return -3;
5986+
5987+ /* See if we already have a cluster member with that name... */
5988+ node = find_node_by_name(name);
5989+ if (node && node->state != NODESTATE_DEAD) {
5990+
5991+ if ((node->state == NODESTATE_JOINING ||
5992+ node->state == NODESTATE_REMOTEMEMBER))
5993+ return +1;
5994+
5995+ printk(KERN_WARNING CMAN_NAME
5996+ ": Rejecting cluster membership application from %s - already have a node with that name\n",
5997+ name);
5998+ return -1;
5999+
6000+ }
6001+
6002+ /* Need to check the node's address too */
6003+ if (get_addr_from_temp_nodeid(saddr->scl_nodeid, addr, &addrlen) &&
6004+ (node = find_node_by_addr(addr, addrlen)) &&
6005+ node->state != NODESTATE_DEAD) {
6006+
6007+ if ((node->state == NODESTATE_JOINING ||
6008+ node->state == NODESTATE_REMOTEMEMBER))
6009+ return +1;
6010+
6011+ printk(KERN_WARNING CMAN_NAME
6012+ ": Rejecting cluster membership application from %s - already have a node with that address\n",
6013+ name);
6014+ return -1;
6015+ }
6016+ return 0;
6017+}
6018+
6019+/* Start the state transition */
6020+static int start_transition(unsigned char reason, struct cluster_node *node)
6021+{
6022+ char *startbuf = scratchbuf;
6023+ struct cl_mem_starttrans_msg *msg =
6024+ (struct cl_mem_starttrans_msg *) startbuf;
6025+
6026+ P_MEMB("Start transition - reason = %d\n", reason);
6027+
6028+ /* If this is a restart then zero the counters */
6029+ if (reason == TRANS_RESTART) {
6030+ agreeing_nodes = 0;
6031+ dissenting_nodes = 0;
6032+ if (node_opinion) {
6033+ kfree(node_opinion);
6034+ node_opinion = NULL;
6035+ }
6036+ responses_collected = 0;
6037+ }
6038+
6039+ /* If we have timed out too many times then just die */
6040+ if (reason == TRANS_RESTART
6041+ && ++transition_restarts > cman_config.transition_restarts) {
6042+ printk(KERN_WARNING CMAN_NAME
6043+ ": too many transition restarts - will die\n");
6044+ send_leave(CLUSTER_LEAVEFLAG_INCONSISTENT);
6045+ node_state = LEFT_CLUSTER;
6046+ quit_threads = 1;
6047+ wake_up_process(membership_task);
6048+ wake_up_interruptible(&cnxman_waitq);
6049+ return 0;
6050+ }
6051+ if (reason != TRANS_RESTART)
6052+ transition_restarts = 0;
6053+
6054+ /* Only keep the original state transition reason in the global
6055+ * variable. */
6056+ if (reason != TRANS_ANOTHERREMNODE && reason != TRANS_NEWMASTER &&
6057+ reason != TRANS_RESTART && reason != TRANS_DEADMASTER)
6058+ transitionreason = reason;
6059+
6060+ /* Save the info of the requesting node */
6061+ if (reason == TRANS_NEWNODE)
6062+ joining_node = node;
6063+
6064+ node_state = MASTER;
6065+ master_state = MASTER_START;
6066+ responses_collected = 0;
6067+ responses_expected = cluster_members - 1;
6068+
6069+ /* If we are on our own then just do it */
6070+ if (responses_expected == 0) {
6071+ P_MEMB("We are on our own...lonely here\n");
6072+ responses_collected--;
6073+ do_process_startack(NULL, 0);
6074+ }
6075+ else {
6076+ int ptr = sizeof (struct cl_mem_starttrans_msg);
6077+ struct list_head *addrlist;
6078+ unsigned short num_addrs = 0;
6079+ int flags = 0;
6080+
6081+ /* Send the STARTTRANS message */
6082+ msg->cmd = CLUSTER_MEM_STARTTRANS;
6083+ msg->reason = reason;
6084+ msg->votes = node->votes;
6085+ msg->expected_votes = cpu_to_le32(node->expected_votes);
6086+ msg->generation = cpu_to_le32(++cluster_generation);
6087+ msg->nodeid = cpu_to_le32(node->node_id);
6088+
6089+ if (reason == TRANS_NEWNODE) {
6090+ /* Add the addresses */
6091+ list_for_each(addrlist, &node->addr_list) {
6092+ struct cluster_node_addr *nodeaddr =
6093+ list_entry(addrlist,
6094+ struct cluster_node_addr, list);
6095+
6096+ memcpy(startbuf + ptr, nodeaddr->addr,
6097+ address_length);
6098+ ptr += address_length;
6099+ num_addrs++;
6100+ }
6101+
6102+ /* And the name */
6103+ strcpy(startbuf + ptr, node->name);
6104+ ptr += strlen(node->name) + 1;
6105+ }
6106+
6107+ /* If another node died then we must queue the STARTTRANS
6108+	 * messages so that kmembershipd can carry on processing the
6109+ * other replies */
6110+ if (reason == TRANS_ANOTHERREMNODE)
6111+ flags |= MSG_QUEUE;
6112+
6113+ msg->num_addrs = cpu_to_le16(num_addrs);
6114+ kcl_sendmsg(mem_socket, msg, ptr, NULL, 0, flags);
6115+ }
6116+ /* Set a timer in case we don't get 'em all back */
6117+ mod_timer(&transition_timer,
6118+ jiffies + cman_config.transition_timeout * HZ);
6119+ return 0;
6120+}
6121+
6122+/* A node has died - decide what to do */
6123+void a_node_just_died(struct cluster_node *node)
6124+{
6125+ /* If we are not in the context of kmembershipd then stick it on the
6126+ * list and wake it */
6127+ if (current != membership_task) {
6128+ struct cl_new_dead_node *newnode =
6129+ kmalloc(sizeof (struct cl_new_dead_node), GFP_KERNEL);
6130+ if (!newnode)
6131+ return;
6132+ newnode->node = node;
6133+ down(&new_dead_node_lock);
6134+ list_add_tail(&newnode->list, &new_dead_node_list);
6135+ set_bit(WAKE_FLAG_DEADNODE, &wake_flags);
6136+ up(&new_dead_node_lock);
6137+ wake_up_process(membership_task);
6138+ P_MEMB("Passing dead node %s onto kmembershipd\n", node->name);
6139+ return;
6140+ }
6141+
6142+ /* Remove it */
6143+ down(&cluster_members_lock);
6144+ if (node->state == NODESTATE_MEMBER)
6145+ cluster_members--;
6146+ node->state = NODESTATE_DEAD;
6147+ up(&cluster_members_lock);
6148+
6149+ /* Notify listeners */
6150+ notify_kernel_listeners(DIED, (long) node->node_id);
6151+
6152+ /* If we are in normal operation then become master and initiate a
6153+ * state-transition */
6154+ if (node_state == MEMBER) {
6155+ start_transition(TRANS_REMNODE, node);
6156+ return;
6157+ }
6158+
6159+ /* If we are a slave in transition then see if it's the master that has
6160+ * failed. If not then ignore it. If it /is/ the master then elect a
6161+ * new one */
6162+ if (node_state == TRANSITION) {
6163+ if (master_node == node) {
6164+ if (elect_master(&node)) {
6165+ del_timer(&transition_timer);
6166+ node_state = MASTER;
6167+
6168+ start_transition(TRANS_DEADMASTER, master_node);
6169+ }
6170+ else {
6171+ /* Someone else can be in charge - phew! */
6172+ }
6173+ }
6174+ return;
6175+ }
6176+
6177+ /* If we are the master then we need to start the transition all over
6178+ * again */
6179+ if (node_state == MASTER) {
6180+ /* Cancel timer */
6181+ del_timer(&transition_timer);
6182+
6183+ /* Restart the transition */
6184+ start_transition(TRANS_ANOTHERREMNODE, node);
6185+ transition_restarts = 0;
6186+ return;
6187+ }
6188+}
6189+
6190+/*
6191+ * Build up and send a set of messages consisting of the whole cluster view.
6192+ * The first byte is the command (cmd as passed in), the second is a flag byte:
6193+ * bit 0 is set in the first message, bit 1 in the last (NOTE: both may be set
6194+ * if this is the only message sent). The rest is a set of packed node entries,
6195+ * which are NOT split over packets. */
6196+static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
6197+ unsigned int flags)
6198+{
6199+ int ptr = 2;
6200+ int len;
6201+ int status = 0;
6202+ int last_node_start = 2;
6203+ unsigned char first_packet_flag = 1;
6204+ struct list_head *nodelist;
6205+ struct list_head *temp;
6206+ struct cluster_node *node;
6207+ char *message = scratchbuf;
6208+
6209+ message[0] = cmd;
6210+
6211+ down(&cluster_members_lock);
6212+ list_for_each_safe(nodelist, temp, &cluster_members_list) {
6213+ node = list_entry(nodelist, struct cluster_node, list);
6214+
6215+ if (node->state == NODESTATE_MEMBER) {
6216+ unsigned int evotes;
6217+ unsigned int node_id;
6218+ unsigned short num_addrs = 0;
6219+ unsigned short num_addrs_le;
6220+ struct list_head *addrlist;
6221+
6222+ last_node_start = ptr;
6223+
6224+ message[ptr++] = len = strlen(node->name);
6225+ strcpy(&message[ptr], node->name);
6226+ ptr += len;
6227+
6228+ /* Count the number of addresses this node has */
6229+ list_for_each(addrlist, &node->addr_list) {
6230+ num_addrs++;
6231+ }
6232+
6233+ num_addrs_le = cpu_to_le16(num_addrs);
6234+ memcpy(&message[ptr], &num_addrs_le, sizeof (short));
6235+ ptr += sizeof (short);
6236+
6237+ /* Pack em in */
6238+ list_for_each(addrlist, &node->addr_list) {
6239+
6240+ struct cluster_node_addr *nodeaddr =
6241+ list_entry(addrlist,
6242+ struct cluster_node_addr, list);
6243+
6244+ memcpy(&message[ptr], nodeaddr->addr,
6245+ address_length);
6246+ ptr += address_length;
6247+ }
6248+
6249+ message[ptr++] = node->votes;
6250+
6251+ evotes = cpu_to_le32(node->expected_votes);
6252+ memcpy(&message[ptr], &evotes, sizeof (int));
6253+ ptr += sizeof (int);
6254+
6255+ node_id = cpu_to_le32(node->node_id);
6256+ memcpy(&message[ptr], &node_id, sizeof (int));
6257+ ptr += sizeof (int);
6258+
6259+ /* If the block is full then send it */
6260+ if (ptr > MAX_CLUSTER_MESSAGE) {
6261+ message[1] = first_packet_flag;
6262+
6263+ up(&cluster_members_lock);
6264+ status =
6265+ kcl_sendmsg(mem_socket, message,
6266+ last_node_start, saddr,
6267+ saddr ? sizeof (struct sockaddr_cl) : 0,
6268+ flags);
6269+
6270+ if (status < 0)
6271+ goto send_fail;
6272+
6273+ down(&cluster_members_lock);
6274+
6275+ first_packet_flag = 0;
6276+ /* Copy the overflow back to the start of the
6277+ * buffer for the next send */
6278+ memcpy(&message[2], &message[last_node_start],
6279+ ptr - last_node_start);
6280+ ptr = ptr - last_node_start + 2;
6281+ }
6282+ }
6283+ }
6284+
6285+ up(&cluster_members_lock);
6286+
6287+ message[1] = first_packet_flag | 2; /* The last may also be first */
6288+ status = kcl_sendmsg(mem_socket, message, ptr,
6289+ saddr, saddr ? sizeof (struct sockaddr_cl) : 0,
6290+ flags);
6291+ send_fail:
6292+
6293+ return status;
6294+}
6295+
6296+/* Make the JOINING node into a MEMBER */
6297+static void confirm_joiner(void)
6298+{
6299+ if (joining_node && joining_node->state == NODESTATE_JOINING) {
6300+ down(&cluster_members_lock);
6301+ joining_node->state = NODESTATE_MEMBER;
6302+ cluster_members++;
6303+ up(&cluster_members_lock);
6304+ }
6305+ remove_temp_nodeid(joining_temp_nodeid);
6306+ joining_temp_nodeid = 0;
6307+}
6308+
6309+/* Reset HELLO timers for all nodes. We do this after a state transition as
6310+ * HELLOs have been disabled during the transition; if we don't do this the
6311+ * nodes will go on an uncontrolled culling-spree afterwards */
6312+static void reset_hello_time(void)
6313+{
6314+ struct list_head *nodelist;
6315+ struct cluster_node *node;
6316+
6317+ down(&cluster_members_lock);
6318+ list_for_each(nodelist, &cluster_members_list) {
6319+ node = list_entry(nodelist, struct cluster_node, list);
6320+
6321+ if (node->state == NODESTATE_MEMBER) {
6322+ node->last_hello = jiffies;
6323+ }
6324+
6325+ }
6326+ up(&cluster_members_lock);
6327+}
6328+
6329+/* Calculate the new quorum and return the value. do *not* set it in here as
6330+ * cnxman calls this to check if a new expected_votes value is valid. It
6331+ * (optionally) returns the total number of votes in the cluster */
6332+int calculate_quorum(int allow_decrease, int max_expected, int *ret_total_votes)
6333+{
6334+ struct list_head *nodelist;
6335+ struct cluster_node *node;
6336+ unsigned int total_votes = 0;
6337+ unsigned int highest_expected = 0;
6338+ unsigned int newquorum, q1, q2;
6339+
6340+ down(&cluster_members_lock);
6341+ list_for_each(nodelist, &cluster_members_list) {
6342+ node = list_entry(nodelist, struct cluster_node, list);
6343+
6344+ if (node->state == NODESTATE_MEMBER) {
6345+ highest_expected =
6346+ max(highest_expected, node->expected_votes);
6347+ total_votes += node->votes;
6348+ }
6349+ }
6350+ up(&cluster_members_lock);
6351+ if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
6352+ total_votes += quorum_device->votes;
6353+
6354+ if (max_expected > 0)
6355+ highest_expected = max_expected;
6356+
6357+ /* This quorum calculation is taken from the OpenVMS Cluster Systems
6358+ * manual, but, then, you guessed that didn't you */
6359+ q1 = (highest_expected + 2) / 2;
6360+ q2 = (total_votes + 2) / 2;
6361+ newquorum = max(q1, q2);
6362+
6363+ /* Normally quorum never decreases but the system administrator can
6364+ * force it down by setting expected votes to a maximum value */
6365+ if (!allow_decrease)
6366+ newquorum = max(quorum, newquorum);
6367+
6368+ /* The special two_node mode allows each of the two nodes to retain
6369+ * quorum if the other fails. Only one of the two should live past
6370+ * fencing (as both nodes try to fence each other in split-brain.) */
6371+ if (two_node)
6372+ newquorum = 1;
6373+
6374+ if (ret_total_votes)
6375+ *ret_total_votes = total_votes;
6376+ return newquorum;
6377+}
6378+
6379+/* Recalculate cluster quorum, set quorate and notify changes */
6380+void recalculate_quorum(int allow_decrease)
6381+{
6382+ int total_votes;
6383+
6384+ quorum = calculate_quorum(allow_decrease, 0, &total_votes);
6385+ set_quorate(total_votes);
6386+ notify_listeners();
6387+}
6388+
6389+/* Add new node address to an existing node */
6390+int add_node_address(struct cluster_node *node, unsigned char *addr, int len)
6391+{
6392+ struct cluster_node_addr *newaddr;
6393+
6394+ newaddr = kmalloc(sizeof (struct cluster_node_addr), GFP_KERNEL);
6395+ if (!newaddr)
6396+ return -1;
6397+
6398+ memcpy(newaddr->addr, addr, len);
6399+ newaddr->addr_len = len;
6400+ list_add_tail(&newaddr->list, &node->addr_list);
6401+
6402+ return 0;
6403+}
6404+
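+/* Create a node entry and add it to the members list, or revive a dead
+ * entry of the same name, bumping its incarnation number */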
6405+static struct cluster_node *add_new_node(char *name, unsigned char votes,
6406+ unsigned int expected_votes,
6407+ int node_id, int state)
6408+{
6409+ struct cluster_node *newnode;
6410+
6411+ /* Look for a dead node with this name */
6412+ newnode = find_node_by_name(name);
6413+
6414+	/* Is it already joining? */
6415+ if (newnode && newnode->state == NODESTATE_JOINING)
6416+ return NULL;
6417+
6418+ /* Update existing information */
6419+ if (newnode && newnode->state == NODESTATE_DEAD) {
6420+ newnode->last_hello = jiffies;
6421+ newnode->votes = votes;
6422+ newnode->expected_votes = expected_votes;
6423+ newnode->state = state;
6424+ newnode->us = 0;
6425+ newnode->leave_reason = 0;
6426+ newnode->last_seq_recv = 0;
6427+ newnode->last_seq_acked = 0;
6428+ newnode->last_seq_sent = 0;
6429+ newnode->incarnation++;
6430+ /* Don't overwrite the node ID */
6431+
6432+ if (state == NODESTATE_MEMBER) {
6433+ down(&cluster_members_lock);
6434+ cluster_members++;
6435+ up(&cluster_members_lock);
6436+ }
6437+
6438+ printk(KERN_INFO CMAN_NAME ": node %s rejoining\n", name);
6439+ return newnode;
6440+ }
6441+
6442+ newnode = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
6443+ if (!newnode)
6444+ goto alloc_err;
6445+
6446+ memset(newnode, 0, sizeof (struct cluster_node));
6447+ newnode->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
6448+ if (!newnode->name)
6449+ goto alloc_err1;
6450+
6451+ strcpy(newnode->name, name);
6452+ newnode->last_hello = jiffies;
6453+ newnode->votes = votes;
6454+ newnode->expected_votes = expected_votes;
6455+ newnode->state = state;
6456+ newnode->node_id = node_id;
6457+ newnode->us = 0;
6458+ newnode->leave_reason = 0;
6459+ newnode->last_seq_recv = 0;
6460+ newnode->last_seq_acked = 0;
6461+ newnode->last_seq_sent = 0;
6462+ newnode->incarnation = 0;
6463+ INIT_LIST_HEAD(&newnode->addr_list);
6464+ set_nodeid(newnode, node_id);
6465+
6466+ /* Add the new node to the list */
6467+ down(&cluster_members_lock);
6468+ list_add(&newnode->list, &cluster_members_list);
6469+ if (state == NODESTATE_MEMBER)
6470+ cluster_members++;
6471+ up(&cluster_members_lock);
6472+
6473+ printk(KERN_INFO CMAN_NAME ": got node %s\n", name);
6474+ return newnode;
6475+
6476+ alloc_err1:
6477+ kfree(newnode);
6478+ alloc_err:
6479+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
6480+
6481+ printk(KERN_CRIT CMAN_NAME
6482+ ": Cannot allocate memory for new cluster node %s\n", name);
6483+
6484+ panic("cluster memory allocation failed");
6485+
6486+ return NULL;
6487+}
6488+
6489+/* Remove node from a STARTTRANS message */
6490+static struct cluster_node *remove_node(int nodeid)
6491+{
6492+ struct cluster_node *node = find_node_by_nodeid(nodeid);
6493+
6494+ if (node && node->state == NODESTATE_MEMBER) {
6495+ P_MEMB("starttrans removes node %s\n", node->name);
6496+ down(&cluster_members_lock);
6497+ node->state = NODESTATE_DEAD;
6498+ cluster_members--;
6499+ up(&cluster_members_lock);
6500+
6501+ notify_kernel_listeners(DIED, (long) nodeid);
6502+
6503+ /* If this node is us then go quietly */
6504+ if (node->us) {
6505+ printk(KERN_INFO CMAN_NAME
6506+ ": killed by STARTTRANS or NOMINATE\n");
6507+ quit_threads = 1;
6508+ wake_up_process(membership_task);
6509+ wake_up_interruptible(&cnxman_waitq);
6510+ }
6511+ }
6512+ return node;
6513+}
6514+
6515+/* Add a node from a STARTTRANS or NOMINATE message */
6516+static void add_node_from_starttrans(struct msghdr *msg, int len)
6517+{
6518+ /* Add the new node but don't fill in the ID until the master has
6519+ * confirmed it */
6520+ struct cl_mem_starttrans_msg *startmsg =
6521+ (struct cl_mem_starttrans_msg *) msg->msg_iov->iov_base;
6522+ char *msgbuf = (char *) msg->msg_iov->iov_base;
6523+ int ptr = sizeof (struct cl_mem_starttrans_msg);
6524+ char *name =
6525+ msgbuf + ptr + le16_to_cpu(startmsg->num_addrs) * address_length;
6526+ int i;
6527+
6528+ joining_node = add_new_node(name, startmsg->votes,
6529+ le32_to_cpu(startmsg->expected_votes),
6530+ 0, NODESTATE_JOINING);
6531+
6532+ /* add_new_node returns NULL if the node already exists */
6533+ if (!joining_node)
6534+ joining_node = find_node_by_name(name);
6535+
6536+ /* Add the node's addresses */
6537+ if (list_empty(&joining_node->addr_list)) {
6538+ for (i = 0; i < le16_to_cpu(startmsg->num_addrs); i++) {
6539+ add_node_address(joining_node, msgbuf + ptr, address_length);
6540+ ptr += address_length;
6541+ }
6542+ }
6543+}
6544+
6545+/* We have been nominated as master for a transition */
6546+static int do_process_nominate(struct msghdr *msg, int len)
6547+{
6548+ struct cl_mem_starttrans_msg *startmsg =
6549+ (struct cl_mem_starttrans_msg *)msg->msg_iov->iov_base;
6550+ struct cluster_node *node = NULL;
6551+ char *nodeaddr = msg->msg_iov->iov_base + sizeof(struct cl_mem_starttrans_msg);
6552+
6553+ P_MEMB("nominate reason is %d\n", startmsg->reason);
6554+
6555+ if (startmsg->reason == TRANS_REMNODE) {
6556+ node = remove_node(le32_to_cpu(startmsg->nodeid));
6557+ }
6558+
6559+ if (startmsg->reason == TRANS_NEWNODE) {
6560+ add_node_from_starttrans(msg, len);
6561+ node = joining_node;
6562+ /* Make sure we have a temp nodeid for the new node */
6563+ joining_temp_nodeid = new_temp_nodeid(nodeaddr,
6564+ address_length);
6565+ }
6566+
6567+ /* This should be a TRANS_CHECK but start_transition needs some node
6568+ * info */
6569+ if (node == NULL)
6570+ node = us;
6571+ start_transition(startmsg->reason, node);
6572+ return 0;
6573+}
6574+
6575+/* Got a STARTACK response from a node */
6576+static int do_process_startack(struct msghdr *msg, int len)
6577+{
6578+	if (node_state != MASTER || master_state != MASTER_START) {
6579+ P_MEMB("Got StartACK when not in MASTER_STARTING substate\n");
6580+ return 0;
6581+ }
6582+
6583+ /* msg is NULL if we are called directly from start_transition */
6584+ if (msg) {
6585+ struct cl_mem_startack_msg *ackmsg = msg->msg_iov->iov_base;
6586+
6587+		/* Ignore any messages with old generation numbers in them */
6588+ if (le32_to_cpu(ackmsg->generation) != cluster_generation) {
6589+ P_MEMB("Got old generation START-ACK msg - ignoring\n");
6590+ return 0;
6591+ }
6592+ }
6593+
6594+ /* If the node_id is non-zero then use it. */
6595+ if (transitionreason == TRANS_NEWNODE && joining_node && msg) {
6596+ struct cl_mem_startack_msg *ackmsg = msg->msg_iov->iov_base;
6597+
6598+ if (ackmsg->node_id) {
6599+ set_nodeid(joining_node, le32_to_cpu(ackmsg->node_id));
6600+ }
6601+ highest_nodeid =
6602+ max(highest_nodeid, le32_to_cpu(ackmsg->highest_node_id));
6603+ P_MEMB("Node id = %d, highest node id = %d\n",
6604+ le32_to_cpu(ackmsg->node_id),
6605+ le32_to_cpu(ackmsg->highest_node_id));
6606+ }
6607+
6608+ /* If we have all the responses in then move to the next stage */
6609+ if (++responses_collected == responses_expected) {
6610+
6611+		/* If the new node has no node_id (i.e. nobody in the cluster has
6612+ * heard of it before) then assign it a new one */
6613+ if (transitionreason == TRANS_NEWNODE && joining_node) {
6614+ highest_nodeid =
6615+ max(highest_nodeid, get_highest_nodeid());
6616+ if (joining_node->node_id == 0) {
6617+ set_nodeid(joining_node, ++highest_nodeid);
6618+ }
6619+ P_MEMB("nodeIDs: new node: %d, highest: %d\n",
6620+ joining_node->node_id, highest_nodeid);
6621+ }
6622+
6623+ /* Behave a little differently if we are on our own */
6624+ if (cluster_members == 1) {
6625+ if (transitionreason == TRANS_NEWNODE) {
6626+ /* If the cluster is just us then confirm at
6627+ * once */
6628+ joinconf_count = 0;
6629+ mod_timer(&transition_timer,
6630+ jiffies +
6631+ cman_config.joinconf_timeout * HZ);
6632+ send_joinconf();
6633+ return 0;
6634+ }
6635+ else { /* Node leaving the cluster */
6636+ recalculate_quorum(leavereason);
6637+ leavereason = 0;
6638+ node_state = MEMBER;
6639+ }
6640+ }
6641+ else {
6642+ master_state = MASTER_COLLECT;
6643+ responses_collected = 0;
6644+ responses_expected = cluster_members - 1;
6645+ P_MEMB("Sending MASTERVIEW: expecting %d responses\n",
6646+ responses_expected);
6647+
6648+ send_cluster_view(CLUSTER_MEM_MASTERVIEW, NULL, 0);
6649+
6650+ /* Set a timer in case we don't get 'em all back */
6651+ mod_timer(&transition_timer,
6652+ jiffies +
6653+ cman_config.transition_timeout * HZ);
6654+ }
6655+ }
6656+ return 0;
6657+}
6658+
6659+/* Got a VIEWACK response from a node */
6660+static int do_process_viewack(struct msghdr *msg, int len)
6661+{
6662+ char *reply = msg->msg_iov->iov_base;
6663+ struct sockaddr_cl *saddr = msg->msg_name;
6664+
6665+ if (master_state != MASTER_COLLECT) {
6666+ printk(KERN_INFO CMAN_NAME
6667+ ": got VIEWACK while not in state transition\n");
6668+ return 0;
6669+ }
6670+
6671+ if (node_opinion == NULL) {
6672+ node_opinion =
6673+ kmalloc((1 + highest_nodeid) * sizeof (uint8_t), GFP_KERNEL);
6674+ if (!node_opinion) {
6675+			panic(CMAN_NAME ": malloc agree/dissent failed\n");
6676+ }
6677+ memset(node_opinion, 0, (1 + highest_nodeid) * sizeof (uint8_t));
6678+ }
6679+
6680+ /* Keep a list of agreeing and dissenting nodes */
6681+ if (reply[1] == 1) {
6682+ /* ACK - remote node agrees with me */
6683+ P_MEMB("Node agrees\n");
6684+ node_opinion[saddr->scl_nodeid] = OPINION_AGREE;
6685+ agreeing_nodes++;
6686+ }
6687+ else {
6688+ /* Remote node disagrees */
6689+ P_MEMB("Node disagrees\n");
6690+ node_opinion[saddr->scl_nodeid] = OPINION_DISAGREE;
6691+ dissenting_nodes++;
6692+ }
6693+
6694+ P_MEMB("got %d responses, expected %d\n", responses_collected + 1,
6695+ responses_expected);
6696+
6697+ /* Are all the results in yet ? */
6698+ if (++responses_collected == responses_expected) {
6699+ del_timer(&transition_timer);
6700+
6701+ P_MEMB("The results are in: %d agree, %d dissent\n",
6702+ agreeing_nodes, dissenting_nodes);
6703+
6704+ if (agreeing_nodes > dissenting_nodes) {
6705+ /* Kill dissenting nodes */
6706+ int i;
6707+
6708+			for (i = 1; i <= highest_nodeid; i++) {
6709+ if (node_opinion[i] == OPINION_DISAGREE)
6710+ send_kill(i);
6711+ }
6712+ }
6713+ else {
6714+ /* We must leave the cluster as we are in a minority,
6715+ * the rest of them can fight it out amongst
6716+ * themselves. */
6717+ send_leave(CLUSTER_LEAVEFLAG_INCONSISTENT);
6718+
6719+ agreeing_nodes = 0;
6720+ dissenting_nodes = 0;
6721+ kfree(node_opinion);
6722+ node_opinion = NULL;
6723+ node_state = LEFT_CLUSTER;
6724+ quit_threads = 1;
6725+ wake_up_process(membership_task);
6726+ wake_up_interruptible(&cnxman_waitq);
6727+ return -1;
6728+ }
6729+
6730+ /* Reset counters */
6731+ agreeing_nodes = 0;
6732+ dissenting_nodes = 0;
6733+ kfree(node_opinion);
6734+ node_opinion = NULL;
6735+
6736+ /* Confirm new node */
6737+ if (transitionreason == TRANS_NEWNODE) {
6738+ mod_timer(&transition_timer,
6739+ jiffies + cman_config.joinconf_timeout * HZ);
6740+ joinconf_count = 0;
6741+ send_joinconf();
6742+ return 0;
6743+ }
6744+
6745+ master_state = MASTER_COMPLETE;
6746+
6747+ end_transition();
6748+ }
6749+
6750+ return 0;
6751+}
6752+
6753+/* Got an ENDTRANS message */
6754+static int do_process_endtrans(struct msghdr *msg, int len)
6755+{
6756+ struct cl_mem_endtrans_msg *endmsg =
6757+ (struct cl_mem_endtrans_msg *) msg->msg_iov->iov_base;
6758+ struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
6759+
6760+ /* Someone else's state transition */
6761+ if (node_state != TRANSITION && node_state != JOINACK)
6762+ return 0;
6763+
6764+ /* Check we got it from the MASTER node */
6765+ if (master_node && master_node->node_id != saddr->scl_nodeid) {
6766+		printk(KERN_INFO CMAN_NAME
6767+		       ": Got ENDTRANS from a node not the master: master: %d, sender: %d\n",
6768+ master_node->node_id, saddr->scl_nodeid);
6769+ return 0;
6770+ }
6771+
6772+ del_timer(&transition_timer);
6773+
6774+ /* Set node ID on new node */
6775+ if (endmsg->new_node_id) {
6776+ set_nodeid(joining_node, le32_to_cpu(endmsg->new_node_id));
6777+ P_MEMB("new node %s has ID %d\n", joining_node->name,
6778+ joining_node->node_id);
6779+ }
6780+
6781+ node_state = TRANSITION_COMPLETE;
6782+
6783+ /* Need to set this here or the barrier code will reject us if we've
6784+ * just joined */
6785+ we_are_a_cluster_member = TRUE;
6786+
6787+ confirm_joiner();
6788+ cluster_generation = le32_to_cpu(endmsg->generation);
6789+
6790+ if (wait_for_completion_barrier() != 0) {
6791+ P_MEMB("Barrier timed out - restart\n");
6792+ node_state = TRANSITION;
6793+ mod_timer(&transition_timer,
6794+ jiffies + cman_config.transition_timeout * HZ);
6795+ return 0;
6796+ }
6797+
6798+ quorum = le32_to_cpu(endmsg->quorum);
6799+ set_quorate(le32_to_cpu(endmsg->total_votes));
6800+
6801+ /* Tell any waiting barriers that we had a transition */
6802+ check_barrier_returns();
6803+
6804+ /* Clear the master node */
6805+ master_node = NULL;
6806+
6807+ node_state = MEMBER;
6808+
6809+ /* Notify other listeners that transition has completed */
6810+ notify_listeners();
6811+ reset_hello_time();
6812+ transition_end_time = jiffies;
6813+
6814+ sm_member_update(cluster_is_quorate);
6815+ return 0;
6816+}
6817+
6818+/* Turn a STARTTRANS message into NOMINATE and send it to the new master */
6819+static int send_nominate(struct cl_mem_starttrans_msg *startmsg, int msglen,
6820+ int nodeid)
6821+{
6822+ struct sockaddr_cl maddr;
6823+
6824+ maddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
6825+ maddr.scl_family = AF_CLUSTER;
6826+ maddr.scl_nodeid = nodeid;
6827+
6828+ startmsg->cmd = CLUSTER_MEM_NOMINATE;
6829+ return kcl_sendmsg(mem_socket, startmsg, msglen,
6830+ &maddr, sizeof (maddr), 0);
6831+}
6832+
6833+/* Got a STARTTRANS message */
6834+static int do_process_starttrans(struct msghdr *msg, int len)
6835+{
6836+ struct cl_mem_starttrans_msg *startmsg =
6837+ (struct cl_mem_starttrans_msg *) msg->msg_iov->iov_base;
6838+ struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
6839+ struct cluster_node *node;
6840+ unsigned int newgen = le32_to_cpu(startmsg->generation);
6841+
6842+ /* Got a WHAT from WHOM? */
6843+ node = find_node_by_nodeid(saddr->scl_nodeid);
6844+ if (!node || node->state != NODESTATE_MEMBER)
6845+ return 0;
6846+
6847+ /* Someone else's state transition */
6848+ if (node_state != MEMBER &&
6849+ node_state != TRANSITION && node_state != MASTER)
6850+ return 0;
6851+
6852+ /* Ignore old generation STARTTRANS messages */
6853+ if ((newgen < cluster_generation) ||
6854+ (newgen == 0xFFFFFFFF && cluster_generation == 0)) {
6855+ P_MEMB("Ignoring STARTTRANS with old generation number\n");
6856+ return 0;
6857+ }
6858+
6859+ P_MEMB("Got starttrans: newgen = %d, oldgen = %d, reason = %d\n",
6860+ newgen, cluster_generation, startmsg->reason);
6861+
6862+ /* Up the generation number */
6863+ cluster_generation = newgen;
6864+
6865+ /* If we are also a master then decide between us */
6866+ if (node_state == MASTER) {
6867+
6868+ /* See if we really want the responsibility of being master */
6869+ if (elect_master(&node)) {
6870+
6871+ /* I reluctantly accept this position of responsibility
6872+ */
6873+ P_MEMB("I elected myself master\n");
6874+
6875+ /* start_transition will re-establish this */
6876+ del_timer(&transition_timer);
6877+
6878+ start_transition(TRANS_NEWMASTER, node);
6879+ return 0;
6880+ }
6881+ else {
6882+ /* Back down */
6883+ P_MEMB("Backing down from MASTER status\n");
6884+ master_node = node;
6885+ node_state = MEMBER;
6886+
6887+ /* If we were bringing a new node into the cluster then
6888+ * we will have to abandon that now and tell the new
6889+ * node to try again later */
6890+ if (transitionreason == TRANS_NEWNODE && joining_node) {
6891+ struct cluster_node_addr *first_addr =
6892+ (struct cluster_node_addr *) joining_node->
6893+ addr_list.next;
6894+
6895+ P_MEMB("Postponing membership of node %s\n",
6896+ joining_node->name);
6897+ send_joinack(first_addr->addr, address_length,
6898+ JOINACK_TYPE_WAIT);
6899+
6900+ /* Not dead, just sleeping */
6901+ joining_node->state = NODESTATE_DEAD;
6902+ joining_node = NULL;
6903+ }
6904+
6905+ /* If the new master is not us OR the node we just got
6906+ * the STARTTRANS from then make sure it knows it has
6907+ * to be master */
6908+ if (saddr->scl_nodeid != node->node_id) {
6909+ send_nominate(startmsg, len, node->node_id);
6910+ return 0;
6911+ }
6912+
6913+ /* Fall through into MEMBER code below if we are
6914+ * obeying the STARTTRANS we just received */
6915+ }
6916+ }
6917+
6918+ /* Do non-MASTER STARTTRANS bits */
6919+ if (node_state == MEMBER) {
6920+ int ptr = sizeof (struct cl_mem_starttrans_msg);
6921+ int node_id = 0;
6922+
6923+ P_MEMB("Normal transition start\n");
6924+
6925+ /* If the master is adding a new node and we know it's node ID
6926+ * then ACK with it. */
6927+ if (startmsg->reason == TRANS_NEWNODE) {
6928+ struct cluster_node *node =
6929+ find_node_by_addr((char *) startmsg + ptr,
6930+ address_length);
6931+ if (node)
6932+ node_id = node->node_id;
6933+ }
6934+
6935+ /* Save the master info */
6936+ master_node = find_node_by_nodeid(saddr->scl_nodeid);
6937+ node_state = TRANSITION;
6938+
6939+ if (startmsg->reason == TRANS_NEWNODE) {
6940+ add_node_from_starttrans(msg, len);
6941+ }
6942+
6943+ if (startmsg->reason == TRANS_REMNODE ||
6944+ startmsg->reason == TRANS_ANOTHERREMNODE) {
6945+ remove_node(le32_to_cpu(startmsg->nodeid));
6946+ }
6947+
6948+ send_startack(saddr, msg->msg_namelen,
6949+ node_id);
6950+
6951+ /* Establish timer in case the master dies */
6952+ mod_timer(&transition_timer,
6953+ jiffies + cman_config.transition_timeout * HZ);
6954+
6955+ return 0;
6956+ }
6957+
6958+ /* We are in transition but this may be a restart */
6959+ if (node_state == TRANSITION) {
6960+
6961+ master_node = find_node_by_nodeid(saddr->scl_nodeid);
6962+ send_startack(saddr, msg->msg_namelen, 0);
6963+
6964+ /* Is it a new joining node ? This happens if a master is
6965+ * usurped */
6966+ if (startmsg->reason == TRANS_NEWNODE) {
6967+ struct cluster_node *oldjoin = joining_node;
6968+
6969+ add_node_from_starttrans(msg, len);
6970+
6971+ /* If this is a different node joining than the one we
6972+ * were previously joining (probably cos the master is
6973+ * a nominated one) then mark our "old" joiner as DEAD.
6974+ * The original master will already have told the node
6975+ * to go back into JOINWAIT state */
6976+ if (oldjoin && oldjoin != joining_node
6977+ && oldjoin->state == NODESTATE_JOINING)
6978+ oldjoin->state = NODESTATE_DEAD;
6979+ }
6980+
6981+ /* Is it a new master node? */
6982+ if (startmsg->reason == TRANS_NEWMASTER ||
6983+ startmsg->reason == TRANS_DEADMASTER) {
6984+ P_MEMB("starttrans %s, node=%d\n",
6985+ startmsg->reason ==
6986+ TRANS_NEWMASTER ? "NEWMASTER" : "DEADMASTER",
6987+ le32_to_cpu(startmsg->nodeid));
6988+
6989+ /* If the old master has died then remove it */
6990+ node =
6991+ find_node_by_nodeid(le32_to_cpu(startmsg->nodeid));
6992+
6993+ if (startmsg->reason == TRANS_DEADMASTER &&
6994+ node && node->state == NODESTATE_MEMBER) {
6995+ down(&cluster_members_lock);
6996+ node->state = NODESTATE_DEAD;
6997+ cluster_members--;
6998+ up(&cluster_members_lock);
6999+ }
7000+
7001+ /* Store new master */
7002+ master_node = find_node_by_nodeid(saddr->scl_nodeid);
7003+ }
7004+
7005+ /* Another node has died (or been killed) */
7006+ if (startmsg->reason == TRANS_ANOTHERREMNODE) {
7007+ /* Remove new dead node */
7008+ node =
7009+ find_node_by_nodeid(le32_to_cpu(startmsg->nodeid));
7010+ if (node && node->state == NODESTATE_MEMBER) {
7011+ down(&cluster_members_lock);
7012+ node->state = NODESTATE_DEAD;
7013+ cluster_members--;
7014+ up(&cluster_members_lock);
7015+ }
7016+ }
7017+ /* Restart the timer */
7018+ del_timer(&transition_timer);
7019+ mod_timer(&transition_timer,
7020+ jiffies + cman_config.transition_timeout * HZ);
7021+ }
7022+
7023+ return 0;
7024+}
7025+
7026+/* Change a cluster parameter */
7027+static int do_process_reconfig(struct msghdr *msg, int len)
7028+{
7029+ struct cl_mem_reconfig_msg *confmsg;
7030+ struct sockaddr_cl *saddr = msg->msg_name;
7031+ struct cluster_node *node;
7032+ unsigned int val;
7033+
7034+ if (len < sizeof(struct cl_mem_reconfig_msg))
7035+ return -1;
7036+
7037+ confmsg = (struct cl_mem_reconfig_msg *) msg->msg_iov->iov_base;
7038+ val = le32_to_cpu(confmsg->value);
7039+
7040+ switch (confmsg->param) {
7041+
7042+ case RECONFIG_PARAM_EXPECTED_VOTES:
7043+ /* Set any nodes with expected_votes higher than the new value
7044+ * down */
7045+ if (val > 0) {
7046+ struct cluster_node *node;
7047+
7048+ down(&cluster_members_lock);
7049+ list_for_each_entry(node, &cluster_members_list, list) {
7050+ if (node->state == NODESTATE_MEMBER &&
7051+ node->expected_votes > val) {
7052+ node->expected_votes = val;
7053+ }
7054+ }
7055+ up(&cluster_members_lock);
7056+ if (expected_votes > val)
7057+ expected_votes = val;
7058+ }
7059+ recalculate_quorum(1); /* Allow decrease */
7060+ sm_member_update(cluster_is_quorate);
7061+ break;
7062+
7063+ case RECONFIG_PARAM_NODE_VOTES:
7064+ node = find_node_by_nodeid(saddr->scl_nodeid);
7065+ node->votes = val;
7066+ recalculate_quorum(1); /* Allow decrease */
7067+ sm_member_update(cluster_is_quorate);
7068+ break;
7069+
7070+ case RECONFIG_PARAM_CONFIG_VERSION:
7071+ config_version = val;
7072+ break;
7073+
7074+ default:
7075+ printk(KERN_INFO CMAN_NAME
7076+ ": got unknown parameter in reconfigure message. %d\n",
7077+ confmsg->param);
7078+ break;
7079+ }
7080+ return 0;
7081+}
7082+
7083+/* Response from master node */
7084+static int do_process_joinack(struct msghdr *msg, int len)
7085+{
7086+ struct cl_mem_joinack_msg *ackmsg = msg->msg_iov->iov_base;
7087+
7088+ join_time = jiffies;
7089+ if (ackmsg->acktype == JOINACK_TYPE_OK) {
7090+ node_state = JOINACK;
7091+ }
7092+
7093+ if (ackmsg->acktype == JOINACK_TYPE_NAK) {
7094+ printk(KERN_WARNING CMAN_NAME
7095+ ": Cluster membership rejected\n");
7096+ P_MEMB("Got JOINACK NACK\n");
7097+ node_state = REJECTED;
7098+ }
7099+
7100+ if (ackmsg->acktype == JOINACK_TYPE_WAIT) {
7101+ P_MEMB("Got JOINACK WAIT\n");
7102+ node_state = JOINWAIT;
7103+ joinwait_time = jiffies;
7104+ }
7105+
7106+ return 0;
7107+}
7108+
7109+/* Request to join the cluster. This makes us the master for this state
7110+ * transition */
7111+static int do_process_joinreq(struct msghdr *msg, int len)
7112+{
7113+ int status;
7114+ static unsigned long last_joinreq = 0;
7115+ static char last_name[MAX_CLUSTER_MEMBER_NAME_LEN];
7116+ struct cl_mem_join_msg *joinmsg = msg->msg_iov->iov_base;
7117+ struct cluster_node *node;
7118+
7119+ /* If we are in a state transition then tell the new node to wait a bit
7120+ * longer */
7121+ if (node_state != MEMBER) {
7122+ if (node_state == MASTER || node_state == TRANSITION) {
7123+ send_joinack(msg->msg_name, msg->msg_namelen,
7124+ JOINACK_TYPE_WAIT);
7125+ }
7126+ return 0;
7127+ }
7128+
7129+ /* Check version number */
7130+ if (le32_to_cpu(joinmsg->major_version) == CNXMAN_MAJOR_VERSION) {
7131+ char *ptr = (char *) joinmsg;
7132+ char *name;
7133+
7134+		/* Sanity-check the num_addr field, otherwise we could oops */
7135+ if (le16_to_cpu(joinmsg->num_addr) * address_length > len) {
7136+ printk(KERN_WARNING CMAN_NAME
7137+ ": num_addr in JOIN-REQ message is rubbish: %d\n",
7138+ le16_to_cpu(joinmsg->num_addr));
7139+ return 0;
7140+ }
7141+
7142+ /* Check the cluster name matches */
7143+ if (strcmp(cluster_name, joinmsg->clustername)) {
7144+ printk(KERN_WARNING CMAN_NAME
7145+ ": attempt to join with cluster name '%s' refused\n",
7146+ joinmsg->clustername);
7147+ send_joinack(msg->msg_name, msg->msg_namelen,
7148+ JOINACK_TYPE_NAK);
7149+ return 0;
7150+ }
7151+
7152+ ptr += sizeof (*joinmsg);
7153+ name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
7154+
7155+ /* Check we are not exceeding the maximum number of nodes */
7156+		if (cluster_members >= cman_config.max_nodes) {
7157+ printk(KERN_WARNING CMAN_NAME
7158+ ": Join request from %s rejected, exceeds maximum number of nodes\n",
7159+ name);
7160+ send_joinack(msg->msg_name, msg->msg_namelen,
7161+ JOINACK_TYPE_NAK);
7162+ return 0;
7163+ }
7164+
7165+ /* Check that we don't exceed the two_node limit */
7166+ if (two_node && cluster_members == 2) {
7167+ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7168+ "rejected, exceeds two node limit\n", name);
7169+ send_joinack(msg->msg_name, msg->msg_namelen,
7170+ JOINACK_TYPE_NAK);
7171+ return 0;
7172+ }
7173+
7174+		if (le32_to_cpu(joinmsg->config_version) != config_version) {
7175+ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7176+ "rejected, config version local %u remote %u\n",
7177+ name, config_version,
7178+			       le32_to_cpu(joinmsg->config_version));
7179+ send_joinack(msg->msg_name, msg->msg_namelen,
7180+ JOINACK_TYPE_NAK);
7181+ return 0;
7182+ }
7183+
7184+ /* If these don't match then I don't know how the message
7185+ arrived! However, I can't take the chance */
7186+ if (le32_to_cpu(joinmsg->addr_len) != address_length) {
7187+ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7188+ "rejected, address length local: %u remote %u\n",
7189+ name, address_length,
7190+ le32_to_cpu(joinmsg->addr_len));
7191+ send_joinack(msg->msg_name, msg->msg_namelen,
7192+ JOINACK_TYPE_NAK);
7193+ return 0;
7194+ }
7195+
7196+ /* Duplicate checking: Because joining messages do not have
7197+ * sequence numbers we may get as many JOINREQ messages as we
7198+ * have interfaces. This bit of code here just checks for
7199+ * JOINREQ messages that come in from the same node in a small
7200+ * period of time and removes the duplicates */
7201+ if (time_before(jiffies, last_joinreq + 10 * HZ)
7202+ && strcmp(name, last_name) == 0) {
7203+ return 0;
7204+ }
7205+
7206+ /* Do we already know about this node? */
7207+ status = check_duplicate_node(name, msg, len);
7208+
7209+ if (status < 0) {
7210+ send_joinack(msg->msg_name, msg->msg_namelen,
7211+ JOINACK_TYPE_NAK);
7212+ return 0;
7213+ }
7214+
7215+ /* OK, you can be in my gang */
7216+ if (status == 0) {
7217+ int i;
7218+ struct sockaddr_cl *addr = msg->msg_name;
7219+
7220+ last_joinreq = jiffies;
7221+ strcpy(last_name, name);
7222+
7223+ node =
7224+ add_new_node(name, joinmsg->votes,
7225+ le32_to_cpu(joinmsg->expected_votes),
7226+ 0, NODESTATE_JOINING);
7227+
7228+ /* Add the node's addresses */
7229+ if (list_empty(&node->addr_list)) {
7230+ for (i = 0; i < le16_to_cpu(joinmsg->num_addr);
7231+ i++) {
7232+ add_node_address(node, ptr, address_length);
7233+ ptr += address_length;
7234+ }
7235+ }
7236+
7237+ send_joinack(msg->msg_name, msg->msg_namelen,
7238+ JOINACK_TYPE_OK);
7239+ joining_node = node;
7240+ joining_temp_nodeid = addr->scl_nodeid;
7241+
7242+ /* Start the state transition */
7243+ start_transition(TRANS_NEWNODE, node);
7244+ }
7245+ }
7246+ else {
7247+ /* Version number mismatch, don't use any part of the message
7248+ * other than the version numbers as things may have moved */
7249+ char buf[MAX_ADDR_PRINTED_LEN];
7250+
7251+ printk(KERN_INFO CMAN_NAME
7252+ ": Got join message from node running incompatible software. (us: %d.%d.%d, them: %d.%d.%d) addr: %s\n",
7253+ CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
7254+ CNXMAN_PATCH_VERSION,
7255+ le32_to_cpu(joinmsg->major_version),
7256+ le32_to_cpu(joinmsg->minor_version),
7257+ le32_to_cpu(joinmsg->patch_version),
7258+ print_addr(msg->msg_name, msg->msg_namelen, buf));
7259+
7260+ send_joinack(msg->msg_name, msg->msg_namelen,
7261+ JOINACK_TYPE_NAK);
7262+ return 0;
7263+ }
7264+
7265+ return 0;
7266+}
7267+
7268+/* A simple function to invent a small number based
7269+ on the node name */
7270+static int node_hash(void)
7271+{
7272+ int i;
7273+ int value = 0;
7274+
7275+ for (i=0; i<strlen(nodename); i++) {
7276+ value += nodename[i];
7277+ }
7278+ return value & 0xF;
7279+}
7280+
7281+/* A new node has stated its intent to form a new cluster. We may have
7282+ * something to say about that... */
7283+static int do_process_newcluster(struct msghdr *msg, int len)
7284+{
7285+ /* If we are also in STARTING state then back down for a random period
7286+ * of time */
7287+ if (node_state == STARTING) {
7288+ P_MEMB("got NEWCLUSTER, backing down for %d seconds\n", node_hash());
7289+ start_time = jiffies + node_hash() * HZ;
7290+ }
7291+
7292+ return 0;
7293+}
7294+
7295+/* Called for each node by the node-message unpacker. Returns -1 if there is a
7296+ * mismatch and the caller will stop processing */
7297+static int check_node(struct cluster_node *newnode, char *addrs,
7298+ unsigned short num_addr)
7299+{
7300+ struct cluster_node *node = find_node_by_name(newnode->name);
7301+
7302+ P_MEMB("check_node: %s", newnode->name);
7303+
7304+ if (!node) {
7305+ C_MEMB(" - not found\n");
7306+ return -1;
7307+ }
7308+
7309+ if (node->votes != newnode->votes ||
7310+ node->node_id != newnode->node_id ||
7311+ node->state != NODESTATE_MEMBER) {
7312+ C_MEMB
7313+ (" - wrong info: votes=%d(exp: %d) id=%d(exp: %d) state = %d\n",
7314+ node->votes, newnode->votes, node->node_id,
7315+ newnode->node_id, node->state);
7316+ return -1;
7317+ }
7318+ C_MEMB(" - OK\n");
7319+ return 0;
7320+}
7321+
7322+/* Called for each new node found in a JOINCONF message. Create a new node
7323+ * entry */
7324+static int add_node(struct cluster_node *node, char *addrs,
7325+ unsigned short num_addr)
7326+{
7327+ P_MEMB("add_node: %s, v:%d, e:%d, i:%d\n", node->name, node->votes,
7328+ node->expected_votes, node->node_id);
7329+
7330+ if (!find_node_by_name(node->name)) {
7331+ struct cluster_node *newnode;
7332+ int i;
7333+
7334+ if ((newnode =
7335+ add_new_node(node->name, node->votes, node->expected_votes,
7336+ node->node_id, NODESTATE_MEMBER)) == NULL) {
7337+ P_MEMB("Error adding node\n");
7338+ return -1;
7339+ }
7340+ if (list_empty(&newnode->addr_list)) {
7341+ for (i = 0; i < num_addr; i++) {
7342+ add_node_address(newnode,
7343+ addrs + i * address_length, address_length);
7344+ }
7345+ }
7346+ return 0;
7347+ }
7348+ else {
7349+ P_MEMB("Already got node with name %s\n", node->name);
7350+ return -1;
7351+ }
7352+}
7353+
7354+/* Call a specified routine for each node unpacked from the message. Return
7355+ * either the number of nodes found or -1 for an error */
7356+static int unpack_nodes(unsigned char *buf, int len,
7357+ int (*routine) (struct cluster_node *, char *,
7358+ unsigned short))
7359+{
7360+ int ptr = 0;
7361+ int num_nodes = 0;
7362+ char nodename[MAX_CLUSTER_MEMBER_NAME_LEN];
7363+ struct cluster_node node;
7364+
7365+ node.name = nodename;
7366+
7367+ while (ptr < len) {
7368+ int namelen = buf[ptr++];
7369+ unsigned int evotes;
7370+ unsigned int node_id;
7371+ unsigned short num_addr;
7372+ unsigned char *addrs;
7373+
7374+ memcpy(nodename, &buf[ptr], namelen);
7375+ nodename[namelen] = '\0';
7376+ ptr += namelen;
7377+
7378+ memcpy(&num_addr, &buf[ptr], sizeof (short));
7379+ num_addr = le16_to_cpu(num_addr);
7380+ ptr += sizeof (short);
7381+
7382+ /* Just make a note of the addrs "array" */
7383+ addrs = &buf[ptr];
7384+ ptr += num_addr * address_length;
7385+
7386+ node.votes = buf[ptr++];
7387+
7388+ memcpy(&evotes, &buf[ptr], sizeof (int));
7389+ node.expected_votes = le32_to_cpu(evotes);
7390+ ptr += sizeof (int);
7391+
7392+ memcpy(&node_id, &buf[ptr], sizeof (int));
7393+ node.node_id = le32_to_cpu(node_id);
7394+ ptr += sizeof (int);
7395+
7396+ /* Call the callback routine */
7397+ if (routine(&node, addrs, num_addr) < 0)
7398+ return -1;
7399+ num_nodes++;
7400+ }
7401+ return num_nodes;
7402+}
7403+
7404+/* Got join confirmation from a master node. This message contains a list of
7405+ * cluster nodes which we unpack and build into our cluster nodes list. When we
7406+ * have the last message we can go into TRANSITION state */
7407+static int do_process_joinconf(struct msghdr *msg, int len)
7408+{
7409+ char *message = msg->msg_iov->iov_base;
7410+
7411+ if (unpack_nodes(message + 2, len - 2, add_node) < 0) {
7412+		printk(KERN_ERR CMAN_NAME
7413+		       ": Error processing joinconf message - giving up on cluster join\n");
7414+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
7415+ return -1;
7416+ }
7417+
7418+ /* Last message in the list? */
7419+ if (message[1] & 2) {
7420+ char ackmsg;
7421+ struct sockaddr_cl *addr = msg->msg_name;
7422+
7423+ us->state = NODESTATE_MEMBER;
7424+ node_state = TRANSITION;
7425+ we_are_a_cluster_member = TRUE;
7426+
7427+ ackmsg = CLUSTER_MEM_CONFACK;
7428+ kcl_sendmsg(mem_socket, &ackmsg, 1, addr,
7429+ sizeof (struct sockaddr_cl),
7430+ MSG_NOACK);
7431+ kernel_thread(hello_kthread, NULL, 0);
7432+ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
7433+ }
7434+ return 0;
7435+}
7436+
7437+/* Got the master's view of the cluster - compare it with ours and tell it the
7438+ * result */
7439+static int do_process_masterview(struct msghdr *msg, int len)
7440+{
7441+ char reply[2] = { CLUSTER_MEM_VIEWACK, 0 };
7442+ char *message = msg->msg_iov->iov_base;
7443+ static int num_nodes;
7444+
7445+ /* Someone else's state transition */
7446+ if (node_state != MEMBER &&
7447+ node_state != TRANSITION && node_state != MASTER)
7448+ return 0;
7449+
7450+ /* First message, zero the counter */
7451+ if (message[1] & 1)
7452+ num_nodes = 0;
7453+
7454+ num_nodes +=
7455+ unpack_nodes(msg->msg_iov->iov_base + 2, len - 2, check_node);
7456+
7457+ /* Last message, check the count and reply */
7458+ if (message[1] & 2) {
7459+ if (num_nodes == cluster_members) {
7460+ /* Send ACK */
7461+ reply[1] = 1;
7462+ }
7463+ else {
7464+ P_MEMB
7465+ ("Got %d nodes in MASTERVIEW message, we think there s/b %d\n",
7466+ num_nodes, cluster_members);
7467+ /* Send NAK */
7468+ reply[1] = 0;
7469+ }
7470+ kcl_sendmsg(mem_socket, reply, 2, msg->msg_name,
7471+ msg->msg_namelen, 0);
7472+ }
7473+ return 0;
7474+}
7475+
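+/* A node has sent us a LEAVE message: record why, mark it dead and, if it
+ * was the master, take over the transition ourselves */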
7476+static int do_process_leave(struct msghdr *msg, int len)
7477+{
7478+ struct cluster_node *node;
7479+ struct sockaddr_cl *saddr = msg->msg_name;
7480+ unsigned char *leavemsg = (unsigned char *) msg->msg_iov->iov_base;
7481+
7482+ if ((node = find_node_by_nodeid(saddr->scl_nodeid))) {
7483+ unsigned char reason = leavemsg[1];
7484+
7485+ if (node->state != NODESTATE_DEAD) {
7486+ printk(KERN_INFO CMAN_NAME
7487+ ": Node %s is leaving the cluster, reason %d\n",
7488+ node->name, reason);
7489+
7490+ node->leave_reason = reason;
7491+ }
7492+ leavereason = (reason == CLUSTER_LEAVEFLAG_REMOVED ? 1 : 0);
7493+
7494+ a_node_just_died(node);
7495+
7496+ /* If it was the master node, then we have been nominated as
7497+		 * the successor */
7498+ if (node == master_node) {
7499+ start_transition(TRANS_DEADMASTER, master_node);
7500+ }
7501+
7502+ }
7503+ return 0;
7504+}
7505+
7506+static int do_process_hello(struct msghdr *msg, int len)
7507+{
7508+ struct cluster_node *node;
7509+ struct cl_mem_hello_msg *hellomsg =
7510+ (struct cl_mem_hello_msg *) msg->msg_iov->iov_base;
7511+ struct sockaddr_cl *saddr = msg->msg_name;
7512+
7513+ /* We are starting up. Send a join message to the node whose HELLO we
7514+ * just received */
7515+ if (node_state == STARTING || node_state == JOINWAIT) {
7516+ struct sockaddr_cl *addr = msg->msg_name;
7517+
7518+ printk(KERN_INFO CMAN_NAME ": sending membership request\n");
7519+
7520+ send_joinreq(addr, msg->msg_namelen);
7521+ join_time = jiffies;
7522+ node_state = JOINING;
7523+ return 0;
7524+ }
7525+
7526+ /* Only process HELLOs if we are not in transition */
7527+ if (node_state == MEMBER) {
7528+ if (len < sizeof (struct cl_mem_hello_msg)) {
7529+ printk(KERN_ERR CMAN_NAME
7530+ ": short hello message from node %d\n",
7531+ saddr->scl_nodeid);
7532+ return -1;
7533+ }
7534+
7535+ node = find_node_by_nodeid(saddr->scl_nodeid);
7536+ if (node && node->state != NODESTATE_DEAD) {
7537+
7538+ /* Check the cluster generation in the HELLO message.
7539+			 * NOTE: it may legitimately differ if the message
7540+			 * crossed on the wire with an END-TRANS, so we allow
7541+			 * a grace period during which a mismatch is tolerated */
7542+ if (cluster_generation !=
7543+ le32_to_cpu(hellomsg->generation)
7544+ && node_state == MEMBER
7545+ && time_after(jiffies,
7546+ cman_config.hello_timer * HZ +
7547+ transition_end_time)) {
7548+ char killmsg;
7549+
7550+ printk(KERN_INFO CMAN_NAME
7551+ ": bad generation number %d in HELLO message, expected %d\n",
7552+ le32_to_cpu(hellomsg->generation),
7553+ cluster_generation);
7554+
7555+ notify_kernel_listeners(DIED,
7556+ (long) node->node_id);
7557+
7558+ killmsg = CLUSTER_MEM_KILL;
7559+ kcl_sendmsg(mem_socket, &killmsg, 1,
7560+ saddr, sizeof (struct sockaddr_cl),
7561+ MSG_NOACK);
7562+ return 0;
7563+ }
7564+
7565+ if (cluster_members != le16_to_cpu(hellomsg->members)
7566+ && node_state == MEMBER) {
7567+ printk(KERN_INFO CMAN_NAME
7568+ ": nmembers in HELLO message does not match our view\n");
7569+ start_transition(TRANS_CHECK, node);
7570+ return 0;
7571+ }
7572+ /* The message is OK - save the time */
7573+ node->last_hello = jiffies;
7574+
7575+ }
7576+ else {
7577+ struct sockaddr_cl *addr = msg->msg_name;
7578+
7579+ /* This node is a danger to our valid cluster */
7580+ if (cluster_is_quorate) {
7581+ char killmsg;
7582+
7583+ killmsg = CLUSTER_MEM_KILL;
7584+ kcl_sendmsg(mem_socket, &killmsg, 1, addr,
7585+ sizeof (struct sockaddr_cl),
7586+ MSG_NOACK);
7587+ }
7588+
7589+ }
7590+ }
7591+
7592+ return 0;
7593+
7594+}
7595+
7596+static int do_process_kill(struct msghdr *msg, int len)
7597+{
7598+ struct sockaddr_cl *saddr = msg->msg_name;
7599+ struct cluster_node *node;
7600+
7601+ node = find_node_by_nodeid(saddr->scl_nodeid);
7602+ if (node && node->state == NODESTATE_MEMBER) {
7603+
7604+ printk(KERN_INFO CMAN_NAME
7605+ ": Being told to leave the cluster by node %d\n",
7606+ saddr->scl_nodeid);
7607+
7608+ node_state = LEFT_CLUSTER;
7609+ quit_threads = 1;
7610+ wake_up_process(membership_task);
7611+ wake_up_interruptible(&cnxman_waitq);
7612+ }
7613+ else {
7614+ P_MEMB("Asked to leave the cluster by a non-member. What a nerve!\n");
7615+ }
7616+ return 0;
7617+}
7618+
7619+/* Some cluster membership utility functions */
7620+struct cluster_node *find_node_by_name(char *name)
7621+{
7622+ struct list_head *nodelist;
7623+ struct cluster_node *node;
7624+
7625+ down(&cluster_members_lock);
7626+ list_for_each(nodelist, &cluster_members_list) {
7627+ node = list_entry(nodelist, struct cluster_node, list);
7628+
7629+ if (strcmp(node->name, name) == 0) {
7630+ up(&cluster_members_lock);
7631+ return node;
7632+ }
7633+ }
7634+ up(&cluster_members_lock);
7635+ return NULL;
7636+}
7637+
7638+/* Try to avoid using this as it's slow and holds the members lock */
7639+struct cluster_node *find_node_by_addr(unsigned char *addr, int addr_len)
7640+{
7641+ struct list_head *nodelist;
7642+ struct list_head *addrlist;
7643+ struct cluster_node *node;
7644+ struct cluster_node_addr *nodeaddr;
7645+
7646+ down(&cluster_members_lock);
7647+
7648+ list_for_each(nodelist, &cluster_members_list) {
7649+ node = list_entry(nodelist, struct cluster_node, list);
7650+
7651+ list_for_each(addrlist, &node->addr_list) {
7652+ nodeaddr =
7653+ list_entry(addrlist, struct cluster_node_addr,
7654+ list);
7655+
7656+ if (memcmp(nodeaddr->addr, addr, address_length) == 0) {
7657+ up(&cluster_members_lock);
7658+ return node;
7659+ }
7660+ }
7661+ }
7662+
7663+ up(&cluster_members_lock);
7664+ return NULL;
7665+}
7666+
7667+/* This is the quick way to find a node */
7668+struct cluster_node *find_node_by_nodeid(unsigned int id)
7669+{
7670+ struct cluster_node *node;
7671+
7672+	if (id >= sizeof_members_array)
7673+ return NULL;
7674+
7675+ spin_lock(&members_by_nodeid_lock);
7676+ node = members_by_nodeid[id];
7677+ spin_unlock(&members_by_nodeid_lock);
7678+ return node;
7679+}
7680+
7681+static int dispatch_messages(struct socket *mem_socket)
7682+{
7683+ int err = 0;
7684+
7685+ while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
7686+ struct msghdr msg;
7687+ struct iovec iov;
7688+ struct sockaddr_cl sin;
7689+ int len;
7690+ mm_segment_t fs;
7691+
7692+ memset(&sin, 0, sizeof (sin));
7693+
7694+ msg.msg_control = NULL;
7695+ msg.msg_controllen = 0;
7696+ msg.msg_iovlen = 1;
7697+ msg.msg_iov = &iov;
7698+ msg.msg_name = &sin;
7699+ msg.msg_namelen = sizeof (sin);
7700+ msg.msg_flags = 0;
7701+
7702+ iov.iov_len = MAX_CLUSTER_MESSAGE;
7703+ iov.iov_base = iobuf;
7704+
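+		/* sock_recvmsg() checks buffers against the current address
+		 * limit, so switch to the kernel segment while handing it
+		 * kernel-space iovecs */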
7705+ fs = get_fs();
7706+ set_fs(get_ds());
7707+
7708+ len =
7709+ sock_recvmsg(mem_socket, &msg, MAX_CLUSTER_MESSAGE,
7710+ MSG_DONTWAIT);
7711+ set_fs(fs);
7712+ if (len > 0) {
7713+ iov.iov_base = iobuf; /* Reinstate pointer */
7714+ msg.msg_name = &sin;
7715+ do_membership_packet(&msg, len);
7716+ }
7717+ else {
7718+ if (len == -EAGAIN)
7719+ err = 0;
7720+ else
7721+ err = -1;
7722+ break;
7723+ }
7724+ }
7725+ return err;
7726+}
7727+
7728+/* Scan the nodes list for dead nodes */
7729+static void check_for_dead_nodes(void)
7730+{
7731+ struct list_head *nodelist;
7732+ struct cluster_node *node;
7733+
7734+ down(&cluster_members_lock);
7735+ list_for_each(nodelist, &cluster_members_list) {
7736+ node = list_entry(nodelist, struct cluster_node, list);
7737+
7738+ if (node->state != NODESTATE_DEAD &&
7739+ time_after(jiffies,
7740+ node->last_hello +
7741+ cman_config.deadnode_timeout * HZ) && !node->us) {
7742+
7743+ up(&cluster_members_lock);
7744+
7745+ printk(KERN_WARNING CMAN_NAME
7746+ ": no HELLO from %s, removing from the cluster\n",
7747+ node->name);
7748+
7749+ P_MEMB("last hello was %ld, current time is %ld\n",
7750+ node->last_hello, jiffies);
7751+
7752+ node->leave_reason = CLUSTER_LEAVEFLAG_DEAD;
7753+ leavereason = 0;
7754+
7755+ /* This is unlikely to work but it's worth a try! */
7756+ send_kill(node->node_id);
7757+
7758+ /* Start state transition */
7759+ a_node_just_died(node);
7760+ return;
7761+ }
7762+ }
7763+ up(&cluster_members_lock);
7764+
7765+ /* Also check for a dead quorum device */
7766+ if (quorum_device) {
7767+ if (quorum_device->state == NODESTATE_MEMBER &&
7768+ time_after(jiffies,
7769+ quorum_device->last_hello +
7770+ cman_config.deadnode_timeout * HZ)) {
7771+ quorum_device->state = NODESTATE_DEAD;
7772+ printk(KERN_WARNING CMAN_NAME
7773+ ": Quorum device %s timed out\n",
7774+ quorum_device->name);
7775+ recalculate_quorum(0);
7776+ }
7777+ }
7778+
7779+ return;
7780+}
7781+
7782+/* add "us" as a node in the cluster */
7783+static int add_us(void)
7784+{
7785+ struct cluster_node *newnode =
7786+ kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
7787+
7788+ if (!newnode) {
7789+ /* Oh shit, we have to commit hara kiri here for the greater
7790+ * good of the cluster */
7791+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
7792+
7793+ printk(KERN_CRIT CMAN_NAME
7794+ ": Cannot allocate memory for our node structure\n");
7795+ panic("Must die");
7796+
7797+ return -1;
7798+ }
7799+
7800+ memset(newnode, 0, sizeof (struct cluster_node));
7801+ newnode->name = kmalloc(strlen(nodename) + 1, GFP_KERNEL);
7802+ if (!newnode->name) {
7803+ send_leave(CLUSTER_LEAVEFLAG_PANIC);
7804+
7805+ printk(KERN_CRIT CMAN_NAME
7806+ ": Cannot allocate memory for node name\n");
7807+ kfree(newnode);
7808+
7809+ panic("Must die");
7810+
7811+ return -1;
7812+ }
7813+
7814+ strcpy(newnode->name, nodename);
7815+ newnode->last_hello = jiffies;
7816+ newnode->votes = votes;
7817+ newnode->expected_votes = expected_votes;
7818+ newnode->state = NODESTATE_JOINING;
7819+ newnode->node_id = 0; /* Will get filled in by ENDTRANS message */
7820+ newnode->us = 1;
7821+ newnode->leave_reason = 0;
7822+ INIT_LIST_HEAD(&newnode->addr_list);
7823+ get_local_addresses(newnode); /* Get from cnxman socket info */
7824+
7825+ /* Add the new node to the list */
7826+ down(&cluster_members_lock);
7827+ list_add(&newnode->list, &cluster_members_list);
7828+ cluster_members++;
7829+ up(&cluster_members_lock);
7830+ us = newnode;
7831+
7832+ return 0;
7833+}
7834+
7835+/* Return the highest known node_id */
7836+unsigned int get_highest_nodeid(void)
7837+{
7838+ struct list_head *nodelist;
7839+ struct cluster_node *node = NULL;
7840+ unsigned int highest = 0;
7841+
7842+ down(&cluster_members_lock);
7843+ list_for_each(nodelist, &cluster_members_list) {
7844+ node = list_entry(nodelist, struct cluster_node, list);
7845+
7846+ if (node->node_id > highest)
7847+ highest = node->node_id;
7848+ }
7849+ up(&cluster_members_lock);
7850+
7851+ return highest;
7852+}
7853+
7854+/* Elect a new master if there is a clash. Returns 1 if we are the new master,
7855+ * the master's struct will also be returned. This, rather primitively, uses
7856+ * the lowest node ID */
7857+static int elect_master(struct cluster_node **master_node)
7858+{
7859+ int i;
7860+
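+	/* Node IDs start at 1, so slot 0 is skipped; the first occupied
+	 * member slot holds the lowest node ID and wins the election */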
7861+ for (i = 1; i < sizeof_members_array; i++) {
7862+ if (members_by_nodeid[i]
7863+ && members_by_nodeid[i]->state == NODESTATE_MEMBER) {
7864+ *master_node = members_by_nodeid[i];
7865+ P_MEMB("Elected master is %s\n", (*master_node)->name);
7866+ return (*master_node)->us;
7867+ }
7868+ }
7869+ BUG();
7870+ return 0;
7871+}
7872+
7873+/* Called by node_cleanup in cnxman when we have left the cluster */
7874+void free_nodeid_array(void)
7875+{
7876+ vfree(members_by_nodeid);
7877+ members_by_nodeid = NULL;
7878+ sizeof_members_array = 0;
7879+}
7880+
7881+int allocate_nodeid_array(void)
7882+{
7883+ /* Allocate space for the nodeid lookup array */
7884+ if (!members_by_nodeid) {
7885+ spin_lock_init(&members_by_nodeid_lock);
7886+ members_by_nodeid =
7887+ vmalloc(cman_config.max_nodes *
7888+ sizeof (struct cluster_member *));
7889+ }
7890+
7891+ if (!members_by_nodeid) {
7892+ printk(KERN_WARNING
7893+ "Unable to allocate members array for %d members\n",
7894+ cman_config.max_nodes);
7895+ return -ENOMEM;
7896+ }
7897+ memset(members_by_nodeid, 0,
7898+ cman_config.max_nodes * sizeof (struct cluster_member *));
7899+ sizeof_members_array = cman_config.max_nodes;
7900+
7901+ return 0;
7902+}
7903+
7904+/* Set the votes & expected_votes variables */
7905+void set_votes(int v, int e)
7906+{
7907+ votes = v;
7908+ expected_votes = e;
7909+}
7910+
7911+int get_quorum(void)
7912+{
7913+ return quorum;
7914+}
7915+
7916+/* Called by cnxman to see if activity should be blocked because we are in a
7917+ * state transition */
7918+int in_transition(void)
7919+{
7920+ return node_state == TRANSITION ||
7921+ node_state == TRANSITION_COMPLETE || node_state == MASTER;
7922+}
7923+
7924+/* Return the current membership state as a string for the main line to put
7925+ * into /proc. I really should be using snprintf rather than sprintf but it's
7926+ * not exported... */
7927+char *membership_state(char *buf, int buflen)
7928+{
7929+ switch (node_state) {
7930+ case STARTING:
7931+ strncpy(buf, "Starting", buflen);
7932+ break;
7933+ case JOINING:
7934+ strncpy(buf, "Joining", buflen);
7935+ break;
7936+ case JOINWAIT:
7937+ strncpy(buf, "Join-Wait", buflen);
7938+ break;
7939+ case JOINACK:
7940+ strncpy(buf, "Join-Ack", buflen);
7941+ break;
7942+ case TRANSITION:
7943+ sprintf(buf, "State-Transition: Master is %s",
7944+ master_node ? master_node->name : "Unknown");
7945+ break;
7946+ case MEMBER:
7947+ strncpy(buf, "Cluster-Member", buflen);
7948+ break;
7949+ case REJECTED:
7950+ strncpy(buf, "Rejected", buflen);
7951+ break;
7952+ case LEFT_CLUSTER:
7953+ strncpy(buf, "Left-Cluster", buflen);
7954+ break;
7955+ case TRANSITION_COMPLETE:
7956+ strncpy(buf, "Transition-Complete", buflen);
7957+ break;
7958+ case MASTER:
7959+ strncpy(buf, "Transition-Master", buflen);
7960+ break;
7961+ default:
7962+ sprintf(buf, "Unknown: code=%d", node_state);
7963+ break;
7964+ }
7965+
7966+ return buf;
7967+}
7968+
7969+#ifdef DEBUG_MEMB
7970+static char *msgname(int msg)
7971+{
7972+ switch (msg) {
7973+ case CLUSTER_MEM_JOINCONF:
7974+ return "JOINCONF";
7975+ case CLUSTER_MEM_JOINREQ:
7976+ return "JOINREQ";
7977+ case CLUSTER_MEM_LEAVE:
7978+ return "LEAVE";
7979+ case CLUSTER_MEM_HELLO:
7980+ return "HELLO";
7981+ case CLUSTER_MEM_KILL:
7982+ return "KILL";
7983+ case CLUSTER_MEM_JOINACK:
7984+ return "JOINACK";
7985+ case CLUSTER_MEM_ENDTRANS:
7986+ return "ENDTRANS";
7987+ case CLUSTER_MEM_RECONFIG:
7988+ return "RECONFIG";
7989+ case CLUSTER_MEM_MASTERVIEW:
7990+ return "MASTERVIEW";
7991+ case CLUSTER_MEM_STARTTRANS:
7992+ return "STARTTRANS";
7993+ case CLUSTER_MEM_JOINREJ:
7994+ return "JOINREJ";
7995+ case CLUSTER_MEM_VIEWACK:
7996+ return "VIEWACK";
7997+ case CLUSTER_MEM_STARTACK:
7998+ return "STARTACK";
7999+ case CLUSTER_MEM_NEWCLUSTER:
8000+ return "NEWCLUSTER";
8001+ case CLUSTER_MEM_CONFACK:
8002+ return "CONFACK";
8003+ case CLUSTER_MEM_NOMINATE:
8004+ return "NOMINATE";
8005+
8006+ default:
8007+ return "??UNKNOWN??";
8008+ }
8009+}
8010+
8011+#endif
8012+
8013+/*
8014+ * Overrides for Emacs so that we follow Linus's tabbing style.
8015+ * Emacs will notice this stuff at the end of the file and automatically
8016+ * adjust the settings for this buffer only. This must remain at the end
8017+ * of the file.
8018+ * ---------------------------------------------------------------------------
8019+ * Local variables:
8020+ * c-file-style: "linux"
8021+ * End:
8022+ */
8023diff -urN linux-orig/cluster/cman/proc.c linux-patched/cluster/cman/proc.c
8024--- linux-orig/cluster/cman/proc.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8025+++ linux-patched/cluster/cman/proc.c 2004-06-29 20:07:50.000000000 +0800
4bf12011 8026@@ -0,0 +1,364 @@
8027+/******************************************************************************
8028+*******************************************************************************
8029+**
8030+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8031+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8032+**
8033+** This copyrighted material is made available to anyone wishing to use,
8034+** modify, copy, or redistribute it subject to the terms and conditions
8035+** of the GNU General Public License v.2.
8036+**
8037+*******************************************************************************
8038+******************************************************************************/
8039+
8040+#include <linux/init.h>
8041+#include <linux/socket.h>
8042+#include <linux/kernel.h>
8043+#include <linux/sched.h>
8044+#include <linux/file.h>
8045+#include <linux/proc_fs.h>
8046+#include <linux/seq_file.h>
8047+#include <linux/list.h>
8048+#include <linux/in.h>
8049+#include <net/sock.h>
+#include <asm/uaccess.h>
8050+#include <cluster/cnxman.h>
8051+#include <cluster/service.h>
8052+
8053+#include "cnxman-private.h"
8054+#include "config.h"
8055+
8056+extern int cluster_members;
8057+extern struct list_head cluster_members_list;
8058+extern struct semaphore cluster_members_lock;
8059+extern struct cluster_node *quorum_device;
8060+extern int we_are_a_cluster_member;
8061+extern int cluster_is_quorate;
8062+extern unsigned short cluster_id;
8063+extern atomic_t use_count;
8064+extern unsigned int address_length;
8065+extern unsigned int config_version;
8066+extern char cluster_name[];
8067+extern struct cluster_node *us;
8068+static struct seq_operations cluster_info_op;
8069+
8070+int sm_procdata(char *b, char **start, off_t offset, int length);
8071+int sm_debug_info(char *b, char **start, off_t offset, int length);
8072+
8073+/* /proc interface to the configuration struct */
8074+static struct config_proc_info {
8075+ char *name;
8076+ int *value;
8077+} config_proc[] = {
8078+ {
8079+ .name = "joinwait_timeout",
8080+ .value = &cman_config.joinwait_timeout,
8081+ },
8082+ {
8083+ .name = "joinconf_timeout",
8084+ .value = &cman_config.joinconf_timeout,
8085+ },
8086+ {
8087+ .name = "join_timeout",
8088+ .value = &cman_config.join_timeout,
8089+ },
8090+ {
8091+ .name = "hello_timer",
8092+ .value = &cman_config.hello_timer,
8093+ },
8094+ {
8095+ .name = "deadnode_timeout",
8096+ .value = &cman_config.deadnode_timeout,
8097+ },
8098+ {
8099+ .name = "transition_timeout",
8100+ .value = &cman_config.transition_timeout,
8101+ },
8102+ {
8103+ .name = "transition_restarts",
8104+ .value = &cman_config.transition_restarts,
8105+ },
8106+ {
8107+ .name = "max_nodes",
8108+ .value = &cman_config.max_nodes,
8109+ },
8110+ {
8111+ .name = "sm_debug_size",
8112+ .value = &cman_config.sm_debug_size,
8113+ },
8114+};
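+
+/* Each entry above is exposed as a read/write file of the same name
+ * under /proc/cluster/config/cman (see create_proc_entries below) */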
8115+
8116+
8117+static int proc_cluster_status(char *b, char **start, off_t offset, int length)
8118+{
8119+ struct list_head *nodelist;
8120+ struct cluster_node *node;
8121+ struct cluster_node_addr *node_addr;
8122+ unsigned int total_votes = 0;
8123+ unsigned int max_expected = 0;
8124+ int c = 0;
8125+ char node_buf[MAX_CLUSTER_MEMBER_NAME_LEN];
8126+
8127+ if (!we_are_a_cluster_member) {
8128+ c += sprintf(b+c, "Not a cluster member. State: %s\n",
8129+ membership_state(node_buf,
8130+ sizeof (node_buf)));
8131+ return c;
8132+ }
8133+
8134+ /* Total the votes */
8135+ down(&cluster_members_lock);
8136+ list_for_each(nodelist, &cluster_members_list) {
8137+ node = list_entry(nodelist, struct cluster_node, list);
8138+ if (node->state == NODESTATE_MEMBER) {
8139+ total_votes += node->votes;
8140+ max_expected =
8141+ max(max_expected, node->expected_votes);
8142+ }
8143+ }
8144+ up(&cluster_members_lock);
8145+
8146+ if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
8147+ total_votes += quorum_device->votes;
8148+
8149+ c += sprintf(b+c,
8150+ "Version: %d.%d.%d\nConfig version: %d\nCluster name: %s\nCluster ID: %d\nMembership state: %s\n",
8151+ CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
8152+ CNXMAN_PATCH_VERSION,
8153+ config_version,
8154+ cluster_name, cluster_id,
8155+ membership_state(node_buf, sizeof (node_buf)));
8156+ c += sprintf(b+c,
8157+ "Nodes: %d\nExpected_votes: %d\nTotal_votes: %d\nQuorum: %d %s\n",
8158+ cluster_members, max_expected, total_votes,
8159+ get_quorum(),
8160+ cluster_is_quorate ? " " : "Activity blocked");
8161+ c += sprintf(b+c, "Active subsystems: %d\n",
8162+ atomic_read(&use_count));
8163+
8164+
8165+ c += sprintf(b+c, "Node addresses: ");
8166+ list_for_each_entry(node_addr, &us->addr_list, list) {
8167+ struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)node_addr->addr;
8168+ if (saddr->sin6_family == AF_INET6) {
8169+ c += sprintf(b+c, "%x:%x:%x:%x:%x:%x:%x:%x ",
8170+ be16_to_cpu(saddr->sin6_addr.s6_addr16[0]),
8171+ be16_to_cpu(saddr->sin6_addr.s6_addr16[1]),
8172+ be16_to_cpu(saddr->sin6_addr.s6_addr16[2]),
8173+ be16_to_cpu(saddr->sin6_addr.s6_addr16[3]),
8174+ be16_to_cpu(saddr->sin6_addr.s6_addr16[4]),
8175+ be16_to_cpu(saddr->sin6_addr.s6_addr16[5]),
8176+ be16_to_cpu(saddr->sin6_addr.s6_addr16[6]),
8177+ be16_to_cpu(saddr->sin6_addr.s6_addr16[7]));
8178+ }
8179+ else {
8180+ struct sockaddr_in *saddr4 = (struct sockaddr_in *)saddr;
8181+ uint8_t *addr = (uint8_t *)&saddr4->sin_addr;
8182+ c+= sprintf(b+c, "%u.%u.%u.%u ",
8183+ addr[0], addr[1], addr[2], addr[3]);
8184+ }
8185+ }
8186+ c += sprintf(b+c, "\n\n");
8187+ return c;
8188+}
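+
+/*
+ * Illustrative /proc/cluster/status output (field values are examples
+ * only; they follow the format strings above):
+ *
+ *   Version: 2.0.1
+ *   Config version: 1
+ *   Cluster name: mycluster
+ *   Cluster ID: 1
+ *   Membership state: Cluster-Member
+ *   Nodes: 2
+ *   Expected_votes: 2
+ *   Total_votes: 2
+ *   Quorum: 2
+ *   Active subsystems: 1
+ *   Node addresses: 10.0.0.1
+ */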
8189+
8190+
8191+/* Allocate one of these for /proc/cluster/nodes so we can keep track of where
8192+ * we are */
8193+struct cluster_seq_info {
8194+ int nodeid;
8195+ int highest_nodeid;
8196+};
8197+
8198+static int cluster_open(struct inode *inode, struct file *file)
8199+{
8200+ return seq_open(file, &cluster_info_op);
8201+}
8202+
8203+static void *cluster_seq_start(struct seq_file *m, loff_t * pos)
8204+{
8205+ struct cluster_seq_info *csi =
8206+ kmalloc(sizeof (struct cluster_seq_info), GFP_KERNEL);
8207+
8208+ if (!csi)
8209+ return NULL;
8210+
8211+ /* Keep highest_nodeid here so we don't need to keep traversing the
8212+ * list to find it */
8213+ csi->nodeid = *pos;
8214+ csi->highest_nodeid = get_highest_nodeid();
8215+
8216+ /* Print the header */
8217+ if (*pos == 0) {
8218+ seq_printf(m,
8219+ "Node Votes Exp Sts Name\n");
8220+ return csi;
8221+ }
8222+ return csi;
8223+}
8224+
8225+static void *cluster_seq_next(struct seq_file *m, void *p, loff_t * pos)
8226+{
8227+ struct cluster_seq_info *csi = p;
8228+
8229+ *pos = ++csi->nodeid;
8230+ if (csi->nodeid > csi->highest_nodeid)
8231+ return NULL;
8232+
8233+ return csi;
8234+}
8235+
8236+static int cluster_seq_show(struct seq_file *m, void *p)
8237+{
8238+ char state = '?';
8239+ struct cluster_node *node;
8240+ struct cluster_seq_info *csi = p;
8241+
8242+ /*
8243+ * If we have "0" here then display the quorum device if
8244+ * there is one.
8245+ */
8246+ if (csi->nodeid == 0)
8247+ node = quorum_device;
8248+ else
8249+ node = find_node_by_nodeid(csi->nodeid);
8250+
8251+ if (!node)
8252+ return 0;
8253+
8254+ /* Make state printable */
8255+ switch (node->state) {
8256+ case NODESTATE_MEMBER:
8257+ state = 'M';
8258+ break;
8259+ case NODESTATE_JOINING:
8260+ state = 'J';
8261+ break;
8262+ case NODESTATE_REMOTEMEMBER:
8263+ state = 'R';
8264+ break;
8265+ case NODESTATE_DEAD:
8266+ state = 'X';
8267+ break;
8268+ }
8269+ seq_printf(m, " %3d %3d %3d %c %s\n",
8270+ node->node_id,
8271+ node->votes,
8272+ node->expected_votes,
8273+ state,
8274+ node->name);
8275+
8276+ return 0;
8277+}
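+
+/*
+ * Illustrative /proc/cluster/nodes output (values are examples only;
+ * column spacing approximate):
+ *
+ *   Node  Votes Exp Sts  Name
+ *      1    1    2   M   nodea
+ *      2    1    2   M   nodeb
+ */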
8278+
8279+static void cluster_seq_stop(struct seq_file *m, void *p)
8280+{
8281+ kfree(p);
8282+}
8283+
8284+static struct seq_operations cluster_info_op = {
8285+ .start = cluster_seq_start,
8286+ .next = cluster_seq_next,
8287+ .stop = cluster_seq_stop,
8288+ .show = cluster_seq_show
8289+};
8290+
8291+static struct file_operations cluster_fops = {
8292+ .open = cluster_open,
8293+ .read = seq_read,
8294+ .llseek = seq_lseek,
8295+ .release = seq_release,
8296+};
8297+
8298+static int cman_config_read_proc(char *page, char **start, off_t off, int count,
8299+ int *eof, void *data)
8300+{
8301+ struct config_proc_info *cinfo = data;
8302+
8303+ return snprintf(page, count, "%d\n", *cinfo->value);
8304+}
8305+
8306+static int cman_config_write_proc(struct file *file, const char *buffer,
8307+ unsigned long count, void *data)
8308+{
8309+ struct config_proc_info *cinfo = data;
8310+	int value;
8311+	char *end;
+	char kbuf[16];
8312+
+	/* buffer points to user space: copy and terminate it before
+	 * parsing */
+	if (count > sizeof(kbuf) - 1)
+		count = sizeof(kbuf) - 1;
+	if (copy_from_user(kbuf, buffer, count))
+		return -EFAULT;
+	kbuf[count] = '\0';
+
8313+	value = simple_strtoul(kbuf, &end, 10);
+	/* only store the value if at least one digit was parsed */
8314+	if (end != kbuf)
8315+		*cinfo->value = value;
8317+	return count;
8318+}
8319+
8320+/* Base of the config directory for cman */
8321+static struct proc_dir_entry *proc_cman_config;
8322+void create_proc_entries(void)
8323+{
8324+ struct proc_dir_entry *procentry;
8325+ struct proc_dir_entry *proc_cluster;
8326+ int i;
8327+
8328+ proc_cluster = proc_mkdir("cluster", 0);
8329+ if (!proc_cluster)
8330+ return;
8331+ proc_cluster->owner = THIS_MODULE;
8332+
8333+ /* Config dir filled in by us and others */
8334+ if (!proc_mkdir("cluster/config", 0))
8335+ return;
8336+
8337+ /* Don't much care if this fails, it's hardly vital */
8338+ procentry = create_proc_entry("cluster/nodes", S_IRUGO, NULL);
8339+ if (procentry)
8340+ procentry->proc_fops = &cluster_fops;
8341+
8342+ procentry = create_proc_entry("cluster/status", S_IRUGO, NULL);
8343+ if (procentry)
8344+ procentry->get_info = proc_cluster_status;
8345+
8346+ procentry = create_proc_entry("cluster/services", S_IRUGO, NULL);
8347+ if (procentry)
8348+ procentry->get_info = sm_procdata;
8349+
8350+ /* Config entries */
8351+ proc_cman_config = proc_mkdir("cluster/config/cman", 0);
8352+ if (!proc_cman_config)
8353+ return;
8354+
8355+ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
8356+ procentry = create_proc_entry(config_proc[i].name, 0660,
8357+ proc_cman_config);
8358+ if (procentry) {
8359+ procentry->data = &config_proc[i];
8360+ procentry->write_proc = cman_config_write_proc;
8361+ procentry->read_proc = cman_config_read_proc;
8362+ }
8363+ }
8364+
8365+ procentry = create_proc_entry("cluster/sm_debug", S_IRUGO, NULL);
8366+ if (procentry)
8367+ procentry->get_info = sm_debug_info;
8368+}
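+
+/*
+ * The resulting /proc tree:
+ *   /proc/cluster/nodes
+ *   /proc/cluster/status
+ *   /proc/cluster/services
+ *   /proc/cluster/sm_debug
+ *   /proc/cluster/config/cman/<tunable>  (one file per config_proc[]
+ *   entry above)
+ */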
8369+
8370+void cleanup_proc_entries(void)
8371+{
8372+ int i, config_count;
8373+
8374+ remove_proc_entry("cluster/sm_debug", NULL);
8375+
8376+ config_count = sizeof(config_proc) / sizeof(struct config_proc_info);
8377+
8378+ if (proc_cman_config) {
8379+ for (i=0; i<config_count; i++)
8380+ remove_proc_entry(config_proc[i].name, proc_cman_config);
8381+ }
8382+ remove_proc_entry("cluster/config/cman", NULL);
8384+
8385+ remove_proc_entry("cluster/nodes", NULL);
8386+ remove_proc_entry("cluster/status", NULL);
8387+ remove_proc_entry("cluster/services", NULL);
8388+ remove_proc_entry("cluster/config", NULL);
8389+ remove_proc_entry("cluster", NULL);
8390+}
8391diff -urN linux-orig/cluster/cman/sm.h linux-patched/cluster/cman/sm.h
8392--- linux-orig/cluster/cman/sm.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8393+++ linux-patched/cluster/cman/sm.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 8394@@ -0,0 +1,108 @@
8395+/******************************************************************************
8396+*******************************************************************************
8397+**
8398+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8399+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8400+**
8401+** This copyrighted material is made available to anyone wishing to use,
8402+** modify, copy, or redistribute it subject to the terms and conditions
8403+** of the GNU General Public License v.2.
8404+**
8405+*******************************************************************************
8406+******************************************************************************/
8407+
8408+#ifndef __SM_DOT_H__
8409+#define __SM_DOT_H__
8410+
8411+/*
8412+ * This is the main header file to be included in each Service Manager source
8413+ * file.
8414+ */
8415+
8416+#include <linux/list.h>
8417+#include <linux/socket.h>
8418+#include <linux/kernel.h>
8419+#include <linux/sched.h>
8420+#include <linux/file.h>
8421+#include <net/sock.h>
8422+
8423+#include <cluster/cnxman.h>
8424+#include <cluster/service.h>
8425+
8426+#define SG_LEVELS (4)
8427+
8428+#include "sm_internal.h"
8429+#include "sm_barrier.h"
8430+#include "sm_control.h"
8431+#include "sm_daemon.h"
8432+#include "sm_joinleave.h"
8433+#include "sm_membership.h"
8434+#include "sm_message.h"
8435+#include "sm_misc.h"
8436+#include "sm_recover.h"
8437+#include "sm_services.h"
8438+
8439+extern struct list_head sm_sg[SG_LEVELS];
8440+extern struct semaphore sm_sglock;
8441+
8442+#ifndef TRUE
8443+#define TRUE (1)
8444+#endif
8445+
8446+#ifndef FALSE
8447+#define FALSE (0)
8448+#endif
8449+
8450+#define SM_ASSERT(x, do) \
8451+{ \
8452+ if (!(x)) \
8453+ { \
8454+ printk("\nSM: Assertion failed on line %d of file %s\n" \
8455+ "SM: assertion: \"%s\"\n" \
8456+ "SM: time = %lu\n", \
8457+ __LINE__, __FILE__, #x, jiffies); \
8458+ {do} \
8459+ printk("\n"); \
8460+ panic("SM: Record message above and reboot.\n"); \
8461+ } \
8462+}
8463+
8464+#define SM_RETRY(do_this, until_this) \
8465+for (;;) \
8466+{ \
8467+ do { do_this; } while (0); \
8468+ if (until_this) \
8469+ break; \
8470+ printk("SM: out of memory: %s, %u\n", __FILE__, __LINE__); \
8471+ schedule();\
8472+}
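+
+/*
+ * Typical usage (from sm_barrier.c): SM_RETRY() repeats do_this until
+ * until_this evaluates true, e.g.
+ *
+ *   SM_RETRY(be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC), be);
+ *
+ * loops until the allocation succeeds.
+ */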
8473+
8474+
8475+#define log_print(fmt, args...) printk("SM: "fmt"\n", ##args)
8476+
8477+#define log_error(sg, fmt, args...) \
8478+ printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
8479+
8480+
8481+#define SM_DEBUG_LOG
8482+
8483+#ifdef SM_DEBUG_CONSOLE
8484+#define log_debug(sg, fmt, args...) \
8485+ printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
8486+#endif
8487+
8488+#ifdef SM_DEBUG_LOG
8489+#define log_debug(sg, fmt, args...) sm_debug_log(sg, fmt, ##args);
8490+#endif
8491+
8492+#ifdef SM_DEBUG_ALL
8493+#define log_debug(sg, fmt, args...) \
8494+do \
8495+{ \
8496+ printk("SM: %08x "fmt"\n", (sg)->global_id, ##args); \
8497+ sm_debug_log(sg, fmt, ##args); \
8498+} \
8499+while (0)
8500+#endif
8501+
8502+#endif /* __SM_DOT_H__ */
8503diff -urN linux-orig/cluster/cman/sm_barrier.c linux-patched/cluster/cman/sm_barrier.c
8504--- linux-orig/cluster/cman/sm_barrier.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8505+++ linux-patched/cluster/cman/sm_barrier.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 8506@@ -0,0 +1,232 @@
8507+/******************************************************************************
8508+*******************************************************************************
8509+**
8510+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8511+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8512+**
8513+** This copyrighted material is made available to anyone wishing to use,
8514+** modify, copy, or redistribute it subject to the terms and conditions
8515+** of the GNU General Public License v.2.
8516+**
8517+*******************************************************************************
8518+******************************************************************************/
8519+
8520+#include "sm.h"
8521+
8522+static struct list_head barriers;
8523+static spinlock_t barriers_lock;
8524+
8525+struct bc_entry {
8526+ struct list_head list;
8527+ uint32_t gid;
8528+ int status;
8529+ char type;
8530+};
8531+typedef struct bc_entry bc_entry_t;
8532+
8533+void init_barriers(void)
8534+{
8535+ INIT_LIST_HEAD(&barriers);
8536+ spin_lock_init(&barriers_lock);
8537+}
8538+
8539+static int atoi(char *c)
8540+{
8541+ int x = 0;
8542+
8543+ while ('0' <= *c && *c <= '9') {
8544+ x = x * 10 + (*c - '0');
8545+ c++;
8546+ }
8547+ return x;
8548+}
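+
+/* atoi() parses a leading run of decimal digits, stopping at the first
+ * non-digit; add_barrier_callback() below uses it to pull the SG global
+ * id out of barrier names of the form "sm.<gid>..." */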
8549+
8550+static void add_barrier_callback(char *name, int status, int type)
8551+{
8552+ char *p;
8553+ uint32_t gid;
8554+ bc_entry_t *be;
8555+
8556+ /* an ESRCH callback just means there was a cnxman transition */
8557+ if (status == -ESRCH)
8558+ return;
8559+
8560+ /* extract global id of SG from barrier name */
8561+ p = strstr(name, "sm.");
8562+
8563+ SM_ASSERT(p, printk("name=\"%s\" status=%d\n", name, status););
8564+
8565+ p += strlen("sm.");
8566+ gid = atoi(p);
8567+
8568+ SM_RETRY(be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC), be);
8569+
8570+ be->gid = gid;
8571+ be->status = status;
8572+ be->type = type;
8573+
8574+ spin_lock(&barriers_lock);
8575+ list_add_tail(&be->list, &barriers);
8576+ spin_unlock(&barriers_lock);
8577+
8578+ wake_serviced(DO_BARRIERS);
8579+}
8580+
8581+static void callback_recovery_barrier(char *name, int status)
8582+{
8583+ add_barrier_callback(name, status, SM_BARRIER_RECOVERY);
8584+}
8585+
8586+static void callback_startdone_barrier_new(char *name, int status)
8587+{
8588+ add_barrier_callback(name, status, SM_BARRIER_STARTDONE_NEW);
8589+}
8590+
8591+static void callback_startdone_barrier(char *name, int status)
8592+{
8593+ add_barrier_callback(name, status, SM_BARRIER_STARTDONE);
8594+}
8595+
8596+int sm_barrier(char *name, int count, int type)
8597+{
8598+ int error;
8599+ unsigned long fn = 0;
8600+
8601+ switch (type) {
8602+ case SM_BARRIER_STARTDONE:
8603+ fn = (unsigned long) callback_startdone_barrier;
8604+ break;
8605+ case SM_BARRIER_STARTDONE_NEW:
8606+ fn = (unsigned long) callback_startdone_barrier_new;
8607+ break;
8608+ case SM_BARRIER_RECOVERY:
8609+ fn = (unsigned long) callback_recovery_barrier;
8610+ break;
8611+ }
8612+
8613+ error = kcl_barrier_register(name, 0, count);
8614+ if (error) {
8615+ log_print("barrier register error %d", error);
8616+ goto fail;
8617+ }
8618+
8619+ error = kcl_barrier_setattr(name, BARRIER_SETATTR_AUTODELETE, TRUE);
8620+ if (error) {
8621+ log_print("barrier setattr autodel error %d", error);
8622+ goto fail_bar;
8623+ }
8624+
8625+ error = kcl_barrier_setattr(name, BARRIER_SETATTR_CALLBACK, fn);
8626+ if (error) {
8627+ log_print("barrier setattr cb error %d", error);
8628+ goto fail_bar;
8629+ }
8630+
8631+ error = kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, TRUE);
8632+ if (error) {
8633+ log_print("barrier setattr enabled error %d", error);
8634+ goto fail_bar;
8635+ }
8636+
8637+ return 0;
8638+
8639+ fail_bar:
8640+ kcl_barrier_delete(name);
8641+ fail:
8642+ return error;
8643+}
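+
+/* sm_barrier() arms a barrier in four steps: register it with the
+ * expected node count, mark it autodelete, attach the completion
+ * callback matching the barrier type, then enable it */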
8644+
8645+void process_startdone_barrier_new(sm_group_t *sg, int status)
8646+{
8647+ sm_sevent_t *sev = sg->sevent;
8648+
8649+ if (!test_and_clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags)) {
8650+ log_debug(sev->se_sg, "ignore barrier cb status %d", status);
8651+ return;
8652+ }
8653+
8654+ sev->se_barrier_status = status;
8655+ sev->se_state = SEST_BARRIER_DONE;
8656+ set_bit(SEFL_CHECK, &sev->se_flags);
8657+ wake_serviced(DO_JOINLEAVE);
8658+}
8659+
8660+void process_startdone_barrier(sm_group_t *sg, int status)
8661+{
8662+ sm_uevent_t *uev = &sg->uevent;
8663+
8664+ if (!test_and_clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags)) {
8665+ log_debug(sg, "ignore barrier cb status %d", status);
8666+ return;
8667+ }
8668+
8669+ uev->ue_barrier_status = status;
8670+ uev->ue_state = UEST_BARRIER_DONE;
8671+ set_bit(UEFL_CHECK, &uev->ue_flags);
8672+ wake_serviced(DO_MEMBERSHIP);
8673+}
8674+
8675+void process_recovery_barrier(sm_group_t *sg, int status)
8676+{
8677+ if (status) {
8678+ log_error(sg, "process_recovery_barrier status=%d", status);
8679+ return;
8680+ }
8681+
8682+ if (sg->state != SGST_RECOVER ||
8683+ sg->recover_state != RECOVER_BARRIERWAIT) {
8684+ log_error(sg, "process_recovery_barrier state %d recover %d",
8685+ sg->state, sg->recover_state);
8686+ return;
8687+ }
8688+
8689+ if (!sg->recover_stop)
8690+ sg->recover_state = RECOVER_STOP;
8691+ else
8692+ sg->recover_state = RECOVER_BARRIERDONE;
8693+
8694+ wake_serviced(DO_RECOVERIES);
8695+}
8696+
8697+void process_barriers(void)
8698+{
8699+ sm_group_t *sg;
8700+ bc_entry_t *be;
8701+
8702+ while (1) {
8703+ be = NULL;
8704+
8705+ spin_lock(&barriers_lock);
8706+ if (!list_empty(&barriers)) {
8707+ be = list_entry(barriers.next, bc_entry_t, list);
8708+ list_del(&be->list);
8709+ }
8710+ spin_unlock(&barriers_lock);
8711+
8712+ if (!be)
8713+ break;
8714+
8715+ sg = sm_global_id_to_sg(be->gid);
8716+ if (!sg) {
8717+			log_print("process_barriers: no sg %08x", be->gid);
+			kfree(be);
8718+			break;
8719+ }
8720+
8721+ switch (be->type) {
8722+ case SM_BARRIER_STARTDONE_NEW:
8723+ process_startdone_barrier_new(sg, be->status);
8724+ break;
8725+
8726+ case SM_BARRIER_STARTDONE:
8727+ process_startdone_barrier(sg, be->status);
8728+ break;
8729+
8730+ case SM_BARRIER_RECOVERY:
8731+ process_recovery_barrier(sg, be->status);
8732+ break;
8733+ }
8734+
8735+ kfree(be);
8736+ schedule();
8737+ }
8738+}
8739diff -urN linux-orig/cluster/cman/sm_barrier.h linux-patched/cluster/cman/sm_barrier.h
8740--- linux-orig/cluster/cman/sm_barrier.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8741+++ linux-patched/cluster/cman/sm_barrier.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 8742@@ -0,0 +1,29 @@
8743+/******************************************************************************
8744+*******************************************************************************
8745+**
8746+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8747+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8748+**
8749+** This copyrighted material is made available to anyone wishing to use,
8750+** modify, copy, or redistribute it subject to the terms and conditions
8751+** of the GNU General Public License v.2.
8752+**
8753+*******************************************************************************
8754+******************************************************************************/
8755+
8756+#ifndef __SM_BARRIER_DOT_H__
8757+#define __SM_BARRIER_DOT_H__
8758+
8759+#define SM_BARRIER_STARTDONE (0)
8760+#define SM_BARRIER_STARTDONE_NEW (1)
8761+#define SM_BARRIER_RECOVERY (2)
8762+#define SM_BARRIER_RESET (3)
8763+
8764+void init_barriers(void);
8765+void process_barriers(void);
8766+int sm_barrier(char *name, int count, int type);
8767+void process_startdone_barrier(sm_group_t *sg, int status);
8768+void process_startdone_barrier_new(sm_group_t *sg, int status);
8769+void process_recovery_barrier(sm_group_t *sg, int status);
8770+
8771+#endif
8772diff -urN linux-orig/cluster/cman/sm_control.c linux-patched/cluster/cman/sm_control.c
8773--- linux-orig/cluster/cman/sm_control.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8774+++ linux-patched/cluster/cman/sm_control.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 8775@@ -0,0 +1,156 @@
8776+/******************************************************************************
8777+*******************************************************************************
8778+**
8779+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8780+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8781+**
8782+** This copyrighted material is made available to anyone wishing to use,
8783+** modify, copy, or redistribute it subject to the terms and conditions
8784+** of the GNU General Public License v.2.
8785+**
8786+*******************************************************************************
8787+******************************************************************************/
8788+
8789+#include "sm.h"
8790+#include "config.h"
8791+
8792+struct socket * sm_socket;
8793+uint32_t * sm_new_nodeids;
8794+uint32_t sm_our_nodeid;
8795+int sm_quorum, sm_quorum_next;
8796+struct list_head sm_members;
8797+int sm_member_count;
8798+
8799+
8800+/*
8801+ * Context: cnxman
8802+ * Called by cnxman when it has a new member list.
8803+ */
8804+
8805+void sm_member_update(int quorate)
8806+{
8807+ sm_quorum_next = quorate;
8808+ wake_serviced(DO_START_RECOVERY);
8809+}
8810+
8811+/*
8812+ * Context: cnxman
8813+ * Called when module is loaded.
8814+ */
8815+
8816+void sm_init(void)
8817+{
8818+ sm_socket = NULL;
8819+ sm_new_nodeids = NULL;
8820+ sm_quorum = 0;
8821+ sm_quorum_next = 0;
8822+ sm_our_nodeid = 0;
8823+ INIT_LIST_HEAD(&sm_members);
8824+ sm_member_count = 0;
8825+
8826+ init_services();
8827+ init_messages();
8828+ init_barriers();
8829+ init_serviced();
8830+ init_recovery();
8831+ init_joinleave();
8832+ init_sm_misc();
8833+}
8834+
8835+/*
8836+ * Context: cnxman
8837+ * Called at beginning of cluster join procedure.
8838+ */
8839+
8840+void sm_start(void)
8841+{
8842+ struct sockaddr_cl saddr;
8843+ struct socket *sock;
8844+ int result;
8845+
8846+ /* Create a communication channel among service managers */
8847+
8848+ result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
8849+ if (result < 0) {
8850+ log_print("can't create socket %d", result);
8851+ goto fail;
8852+ }
8853+
8854+ sm_socket = sock;
8855+
8856+ saddr.scl_family = AF_CLUSTER;
8857+ saddr.scl_port = CLUSTER_PORT_SERVICES;
8858+
8859+ result = sock->ops->bind(sock, (struct sockaddr *) &saddr,
8860+ sizeof(saddr));
8861+ if (result < 0) {
8862+ log_print("can't bind socket %d", result);
8863+ goto fail_release;
8864+ }
8865+
8866+ result = kcl_register_read_callback(sm_socket, sm_cluster_message);
8867+ if (result < 0) {
8868+ log_print("can't register read callback %d", result);
8869+ goto fail_release;
8870+ }
8871+
8872+	sm_new_nodeids = (uint32_t *) kmalloc(cman_config.max_nodes *
8873+					      sizeof(uint32_t),
8874+					      GFP_KERNEL);
+	if (!sm_new_nodeids) {
+		log_print("can't allocate nodeid array");
+		goto fail_release;
+	}
+
8875+ start_serviced();
8876+
8877+ /* cnxman should call sm_member_update() once we've joined - then we
8878+ * can get our first list of members and our own nodeid */
8879+
8880+ return;
8881+
8882+ fail_release:
8883+ sock_release(sm_socket);
8884+ sm_socket = NULL;
8885+
8886+ fail:
8887+ return;
8888+}
8889+
8890+/*
8891+ * Context: cnxman
8892+ * Called before cnxman leaves the cluster. If this returns an error to cman,
8893+ * cman should not leave the cluster but return EBUSY.
8894+ * If force is set we go away anyway; cman knows best in this case.
8895+ */
8896+
8897+int sm_stop(int force)
8898+{
8899+ struct list_head *head;
8900+ sm_group_t *sg;
8901+ sm_node_t *node;
8902+ int i, busy = FALSE, error = -EBUSY;
8903+
8904+ for (i = 0; i < SG_LEVELS; i++) {
8905+ if (!list_empty(&sm_sg[i])) {
8906+ sg = list_entry(sm_sg[i].next, sm_group_t, list);
8907+ log_error(sg, "sm_stop: SG still joined");
8908+ busy = TRUE;
8909+ }
8910+ }
8911+
8912+ if (!busy || force) {
8913+ stop_serviced();
8914+
8915+ if (sm_socket)
8916+ sock_release(sm_socket);
8917+
8918+ head = &sm_members;
8919+ while (!list_empty(head)) {
8920+ node = list_entry(head->next, sm_node_t, list);
8921+ list_del(&node->list);
8922+ sm_member_count--;
8923+ kfree(node);
8924+ }
8925+
8926+ kfree(sm_new_nodeids);
8927+ sm_init();
8928+ error = 0;
8929+ }
8930+ return error;
8931+}
8932diff -urN linux-orig/cluster/cman/sm_control.h linux-patched/cluster/cman/sm_control.h
8933--- linux-orig/cluster/cman/sm_control.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8934+++ linux-patched/cluster/cman/sm_control.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 8935@@ -0,0 +1,22 @@
8936+/******************************************************************************
8937+*******************************************************************************
8938+**
8939+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8940+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8941+**
8942+** This copyrighted material is made available to anyone wishing to use,
8943+** modify, copy, or redistribute it subject to the terms and conditions
8944+** of the GNU General Public License v.2.
8945+**
8946+*******************************************************************************
8947+******************************************************************************/
8948+
8949+#ifndef __SM_CONTROL_DOT_H__
8950+#define __SM_CONTROL_DOT_H__
8951+
8952+void sm_init(void);
8953+void sm_start(void);
8954+int sm_stop(int force);
8955+void sm_member_update(int quorate);
8956+
8957+#endif
8958diff -urN linux-orig/cluster/cman/sm_daemon.c linux-patched/cluster/cman/sm_daemon.c
8959--- linux-orig/cluster/cman/sm_daemon.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 8960+++ linux-patched/cluster/cman/sm_daemon.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 8961@@ -0,0 +1,120 @@
8962+/******************************************************************************
8963+*******************************************************************************
8964+**
8965+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8966+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8967+**
8968+** This copyrighted material is made available to anyone wishing to use,
8969+** modify, copy, or redistribute it subject to the terms and conditions
8970+** of the GNU General Public License v.2.
8971+**
8972+*******************************************************************************
8973+******************************************************************************/
8974+
8975+#include "sm.h"
8976+
8977+static unsigned long daemon_flags;
8978+static struct task_struct * daemon_task;
8979+static struct completion daemon_done;
8980+static wait_queue_head_t daemon_wait;
8981+extern int sm_quorum;
8982+
8983+void init_serviced(void)
8984+{
8985+ daemon_flags = 0;
8986+ daemon_task = NULL;
8987+ init_completion(&daemon_done);
8988+ init_waitqueue_head(&daemon_wait);
8989+}
8990+
8991+void wake_serviced(int do_flag)
8992+{
8993+ set_bit(do_flag, &daemon_flags);
8994+ wake_up(&daemon_wait);
8995+}
8996+
8997+static inline int got_work(void)
8998+{
8999+ int rv = 0;
9000+
9001+ rv = (test_bit(DO_START_RECOVERY, &daemon_flags) ||
9002+ test_bit(DO_MESSAGES, &daemon_flags) ||
9003+ test_bit(DO_BARRIERS, &daemon_flags) ||
9004+ test_bit(DO_CALLBACKS, &daemon_flags));
9005+
9006+ if (sm_quorum && !rv)
9007+ rv = (test_bit(DO_JOINLEAVE, &daemon_flags) ||
9008+ test_bit(DO_RECOVERIES, &daemon_flags) ||
9009+ test_bit(DO_MEMBERSHIP, &daemon_flags));
9010+ return rv;
9011+}
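+
+/* Recovery kickoff, messages, barriers and callbacks are always
+ * serviced; join/leave, recovery and membership work is deferred until
+ * the cluster has quorum (sm_quorum) */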
9012+
9013+static int serviced(void *arg)
9014+{
9015+ DECLARE_WAITQUEUE(wait, current);
9016+
9017+ daemonize("cman_serviced");
9018+ daemon_task = current;
9019+ set_bit(DO_RUN, &daemon_flags);
9020+ complete(&daemon_done);
9021+
9022+ for (;;) {
9023+ if (test_and_clear_bit(DO_START_RECOVERY, &daemon_flags))
9024+ process_nodechange();
9025+
9026+ if (test_and_clear_bit(DO_MESSAGES, &daemon_flags))
9027+ process_messages();
9028+
9029+ if (test_and_clear_bit(DO_BARRIERS, &daemon_flags))
9030+ process_barriers();
9031+
9032+ if (test_and_clear_bit(DO_CALLBACKS, &daemon_flags))
9033+ process_callbacks();
9034+
9035+ if (sm_quorum) {
9036+ if (test_and_clear_bit(DO_RECOVERIES, &daemon_flags))
9037+ process_recoveries();
9038+
9039+ if (test_and_clear_bit(DO_JOINLEAVE, &daemon_flags))
9040+ process_joinleave();
9041+
9042+ if (test_and_clear_bit(DO_MEMBERSHIP, &daemon_flags))
9043+ process_membership();
9044+ }
9045+
9046+ if (!test_bit(DO_RUN, &daemon_flags))
9047+ break;
9048+
9049+ current->state = TASK_INTERRUPTIBLE;
9050+ add_wait_queue(&daemon_wait, &wait);
9051+ if (!got_work() && test_bit(DO_RUN, &daemon_flags))
9052+ schedule();
9053+ remove_wait_queue(&daemon_wait, &wait);
9054+ current->state = TASK_RUNNING;
9055+ }
9056+
9057+ complete(&daemon_done);
9058+ return 0;
9059+}
9060+
9061+int start_serviced(void)
9062+{
9063+ int error;
9064+
9065+ error = kernel_thread(serviced, NULL, 0);
9066+ if (error < 0)
9067+ goto out;
9068+
9069+ error = 0;
9070+ wait_for_completion(&daemon_done);
9071+
9072+ out:
9073+ return error;
9074+}
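+
+/* daemon_done marks both ends of the thread's life: start_serviced()
+ * returns once serviced() signals startup, and stop_serviced() waits on
+ * it again for shutdown */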
9075+
9076+void stop_serviced(void)
9077+{
9078+ clear_bit(DO_RUN, &daemon_flags);
9079+ wake_up(&daemon_wait);
9080+ wait_for_completion(&daemon_done);
9081+}
9082diff -urN linux-orig/cluster/cman/sm_daemon.h linux-patched/cluster/cman/sm_daemon.h
9083--- linux-orig/cluster/cman/sm_daemon.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9084+++ linux-patched/cluster/cman/sm_daemon.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 9085@@ -0,0 +1,32 @@
9086+/******************************************************************************
9087+*******************************************************************************
9088+**
9089+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9090+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9091+**
9092+** This copyrighted material is made available to anyone wishing to use,
9093+** modify, copy, or redistribute it subject to the terms and conditions
9094+** of the GNU General Public License v.2.
9095+**
9096+*******************************************************************************
9097+******************************************************************************/
9098+
9099+#ifndef __SM_DAEMON_DOT_H__
9100+#define __SM_DAEMON_DOT_H__
9101+
9102+#define DO_RUN (0)
9103+#define DO_START_RECOVERY (1)
9104+#define DO_MESSAGES (2)
9105+#define DO_BARRIERS (3)
9106+#define DO_CALLBACKS (4)
9107+#define DO_JOINLEAVE (5)
9108+#define DO_RECOVERIES (6)
9109+#define DO_MEMBERSHIP (7)
9110+#define DO_RESET (8)
9111+
9112+void init_serviced(void);
9113+void wake_serviced(int do_flag);
9114+void stop_serviced(void);
9115+int start_serviced(void);
9116+
9117+#endif
9118diff -urN linux-orig/cluster/cman/sm_internal.h linux-patched/cluster/cman/sm_internal.h
9119--- linux-orig/cluster/cman/sm_internal.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9120+++ linux-patched/cluster/cman/sm_internal.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 9121@@ -0,0 +1,230 @@
9122+/******************************************************************************
9123+*******************************************************************************
9124+**
9125+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9126+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9127+**
9128+** This copyrighted material is made available to anyone wishing to use,
9129+** modify, copy, or redistribute it subject to the terms and conditions
9130+** of the GNU General Public License v.2.
9131+**
9132+*******************************************************************************
9133+******************************************************************************/
9134+
9135+#ifndef __SM_INTERNAL_DOT_H__
9136+#define __SM_INTERNAL_DOT_H__
9137+
9138+/*
9139+ * Any header files needed by this file should be included before it in sm.h.
9140+ * This file should only be included by sm.h.
9141+ */
9142+
9143+struct sm_group;
9144+struct sm_sevent;
9145+struct sm_uevent;
9146+struct sm_node;
9147+struct sm_msg;
9148+
9149+typedef struct sm_group sm_group_t;
9150+typedef struct sm_sevent sm_sevent_t;
9151+typedef struct sm_uevent sm_uevent_t;
9152+typedef struct sm_node sm_node_t;
9153+typedef struct sm_msg sm_msg_t;
9154+
9155+
9156+/*
9157+ * Number of seconds to wait before trying again to join or leave an SG
9158+ */
9159+#define RETRY_DELAY (2)
9160+
9161+
9162+/*
9163+ * Service Event - what a node uses to join or leave an sg
9164+ */
9165+
9166+/* SE Flags */
9167+#define SEFL_CHECK (0)
9168+#define SEFL_ALLOW_JOIN (1)
9169+#define SEFL_ALLOW_JSTOP (2)
9170+#define SEFL_ALLOW_LEAVE (3)
9171+#define SEFL_ALLOW_LSTOP (4)
9172+#define SEFL_ALLOW_STARTDONE (5)
9173+#define SEFL_ALLOW_BARRIER (6)
9174+#define SEFL_DELAY (7)
9175+#define SEFL_LEAVE (8)
9176+#define SEFL_CANCEL (9)
9177+
9178+/* SE States */
9179+#define SEST_JOIN_BEGIN (1)
9180+#define SEST_JOIN_ACKWAIT (2)
9181+#define SEST_JOIN_ACKED (3)
9182+#define SEST_JSTOP_ACKWAIT (4)
9183+#define SEST_JSTOP_ACKED (5)
9184+#define SEST_JSTART_SERVICEWAIT (6)
9185+#define SEST_JSTART_SERVICEDONE (7)
9186+#define SEST_BARRIER_WAIT (8)
9187+#define SEST_BARRIER_DONE (9)
9188+#define SEST_LEAVE_BEGIN (10)
9189+#define SEST_LEAVE_ACKWAIT (11)
9190+#define SEST_LEAVE_ACKED (12)
9191+#define SEST_LSTOP_ACKWAIT (13)
9192+#define SEST_LSTOP_ACKED (14)
9193+#define SEST_LSTART_WAITREMOTE (15)
9194+#define SEST_LSTART_REMOTEDONE (16)
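+
+/* A joining node advances through the SEST_JOIN, SEST_JSTOP, SEST_JSTART
+ * and SEST_BARRIER states in order; a leaving node through SEST_LEAVE,
+ * SEST_LSTOP and SEST_LSTART */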
9195+
9196+struct sm_sevent {
9197+ struct list_head se_list;
9198+ unsigned int se_id;
9199+ sm_group_t * se_sg;
9200+ unsigned long se_flags;
9201+ unsigned int se_state;
9202+
9203+ int se_node_count;
9204+ int se_memb_count;
9205+ int se_reply_count;
9206+
9207+ uint32_t * se_node_ids;
9208+ char * se_node_status;
9209+ int se_len_ids; /* length of node_ids */
9210+ int se_len_status; /* length of node_status */
9211+
9212+ int se_barrier_status;
9213+ struct timer_list se_restart_timer;
9214+};
9215+
9216+/*
9217+ * Update Event - what an sg member uses to respond to an sevent
9218+ */
9219+
9220+/* UE Flags */
9221+#define UEFL_ALLOW_STARTDONE (0)
9222+#define UEFL_ALLOW_BARRIER (1)
9223+#define UEFL_CANCEL (2)
9224+#define UEFL_LEAVE (3)
9225+#define UEFL_CHECK (4)
9226+
9227+/* UE States */
9228+#define UEST_JSTOP (1)
9229+#define UEST_JSTART_WAITCMD (2)
9230+#define UEST_JSTART (3)
9231+#define UEST_JSTART_SERVICEWAIT (4)
9232+#define UEST_JSTART_SERVICEDONE (5)
9233+#define UEST_BARRIER_WAIT (6)
9234+#define UEST_BARRIER_DONE (7)
9235+#define UEST_LSTOP (8)
9236+#define UEST_LSTART_WAITCMD (9)
9237+#define UEST_LSTART (10)
9238+#define UEST_LSTART_SERVICEWAIT (11)
9239+#define UEST_LSTART_SERVICEDONE (12)
9240+
9241+struct sm_uevent {
9242+ unsigned int ue_state;
9243+ unsigned long ue_flags;
9244+ uint32_t ue_id;
9245+ uint32_t ue_nodeid;
9246+ int ue_num_nodes;
9247+ int ue_barrier_status;
9248+ uint16_t ue_remote_seid;
9249+};
9250+
9251+/*
9252+ * Service Group
9253+ */
9254+
9255+#define RECOVER_NONE (0)
9256+#define RECOVER_STOP (1)
9257+#define RECOVER_START (2)
9258+#define RECOVER_STARTDONE (3)
9259+#define RECOVER_BARRIERWAIT (4)
9260+#define RECOVER_BARRIERDONE (5)
9261+
9262+/* SG Flags */
9263+#define SGFL_SEVENT (1)
9264+#define SGFL_UEVENT (2)
9265+#define SGFL_NEED_RECOVERY (3)
9266+
9267+/* SG States */
9268+#define SGST_NONE (0)
9269+#define SGST_JOIN (1)
9270+#define SGST_RUN (2)
9271+#define SGST_RECOVER (3)
9272+#define SGST_UEVENT (4)
9273+
9274+struct sm_group {
9275+ struct list_head list; /* list of sg's */
9276+ uint16_t level;
9277+ uint32_t local_id;
9278+ uint32_t global_id;
9279+ unsigned long flags;
9280+ int state;
9281+ int refcount; /* references from reg/unreg */
9282+ void * service_data; /* data from the service */
9283+ struct kcl_service_ops *ops; /* ops from the service */
9284+ struct completion event_comp;
9285+
9286+ struct list_head memb; /* Membership List for RC */
9287+ int memb_count; /* number of nodes in memb */
9288+ struct list_head joining; /* nodes joining the sg */
9289+ sm_sevent_t * sevent;
9290+ sm_uevent_t uevent;
9291+
9292+ int recover_state;
9293+ int recover_stop;
9294+ struct list_head recover_list; /* recovery event list */
9295+ void * recover_data;
9296+ char recover_barrier[MAX_BARRIER_NAME_LEN];
9297+
9298+ int namelen;
9299+ char name[1]; /* must be last field */
9300+};
9301+
9302+/*
9303+ * Service Message
9304+ */
9305+
9306+/* SMSG Type */
9307+#define SMSG_JOIN_REQ (1)
9308+#define SMSG_JOIN_REP (2)
9309+#define SMSG_JSTOP_REQ (3)
9310+#define SMSG_JSTOP_REP (4)
9311+#define SMSG_JSTART_CMD (5)
9312+#define SMSG_LEAVE_REQ (6)
9313+#define SMSG_LEAVE_REP (7)
9314+#define SMSG_LSTOP_REQ (8)
9315+#define SMSG_LSTOP_REP (9)
9316+#define SMSG_LSTART_CMD (10)
9317+#define SMSG_LSTART_DONE (11)
9318+#define SMSG_RECOVER (12)
9319+
9320+/* SMSG Status */
9321+#define STATUS_POS (1)
9322+#define STATUS_NEG (2)
9323+#define STATUS_WAIT (3)
9324+
9325+struct sm_msg {
9326+ uint8_t ms_type;
9327+ uint8_t ms_status;
9328+ uint16_t ms_sevent_id;
9329+ uint32_t ms_global_sgid;
9330+ uint32_t ms_global_lastid;
9331+ uint16_t ms_sglevel;
9332+ uint16_t ms_length;
9333+ /* buf of ms_length bytes follows */
9334+};
9335+
9336+/*
9337+ * Node structure
9338+ */
9339+
9340+#define SNFL_NEED_RECOVERY (0)
9341+#define SNFL_CLUSTER_MEMBER (1)
9342+#define SNFL_LEAVING (2)
9343+
9344+struct sm_node {
9345+ struct list_head list;
9346+ uint32_t id; /* node id from cnxman */
9347+ unsigned long flags;
9348+ int incarnation; /* node incarnation number */
9349+};
9350+
9351+#endif /* __SM_INTERNAL_DOT_H__ */
9352diff -urN linux-orig/cluster/cman/sm_joinleave.c linux-patched/cluster/cman/sm_joinleave.c
9353--- linux-orig/cluster/cman/sm_joinleave.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 9354+++ linux-patched/cluster/cman/sm_joinleave.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 9355@@ -0,0 +1,1286 @@
9356+/******************************************************************************
9357+*******************************************************************************
9358+**
9359+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9360+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9361+**
9362+** This copyrighted material is made available to anyone wishing to use,
9363+** modify, copy, or redistribute it subject to the terms and conditions
9364+** of the GNU General Public License v.2.
9365+**
9366+*******************************************************************************
9367+******************************************************************************/
9368+
9369+#include "sm.h"
9370+
9371+/*
9372+ * Routines used by nodes that are joining or leaving an SG. These "sevent"
9373+ * routines initiate membership changes to an SG. Existing SG members respond
9374+ * using the "uevent" membership update routines.
9375+ */
9376+
9377+extern uint32_t sm_our_nodeid;
9378+extern struct list_head sm_members;
9379+static struct list_head new_event;
9380+static spinlock_t new_event_lock;
9381+static struct list_head joinleave_events;
9382+
9383+void init_joinleave(void)
9384+{
9385+ INIT_LIST_HEAD(&new_event);
9386+ spin_lock_init(&new_event_lock);
9387+ INIT_LIST_HEAD(&joinleave_events);
9388+}
9389+
9390+void new_joinleave(sm_sevent_t *sev)
9391+{
9392+ spin_lock(&new_event_lock);
9393+ list_add_tail(&sev->se_list, &new_event);
9394+ spin_unlock(&new_event_lock);
9395+ wake_serviced(DO_JOINLEAVE);
9396+}
9397+
9398+sm_sevent_t *find_sevent(unsigned int id)
9399+{
9400+ sm_sevent_t *sev;
9401+
9402+ list_for_each_entry(sev, &joinleave_events, se_list) {
9403+ if (sev->se_id == id)
9404+ return sev;
9405+ }
9406+ return NULL;
9407+}
9408+
9409+static void release_sevent(sm_sevent_t *sev)
9410+{
9411+ if (sev->se_len_ids) {
9412+ kfree(sev->se_node_ids);
9413+ sev->se_node_ids = NULL;
9414+ }
9415+
9416+ if (sev->se_len_status) {
9417+ kfree(sev->se_node_status);
9418+ sev->se_node_status = NULL;
9419+ }
9420+
9421+ sev->se_node_count = 0;
9422+ sev->se_memb_count = 0;
9423+ sev->se_reply_count = 0;
9424+}
9425+
9426+static int init_sevent(sm_sevent_t *sev)
9427+{
9428+ sm_node_t *node;
9429+ int len1, len2, count, cluster_members = 0;
9430+
9431+ /* clear state from any previous attempt */
9432+ release_sevent(sev);
9433+
9434+ list_for_each_entry(node, &sm_members, list) {
9435+ if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
9436+ cluster_members++;
9437+ }
9438+
9439+ sev->se_node_count = cluster_members;
9440+ sev->se_memb_count = sev->se_sg->memb_count;
9441+
9442+ /*
9443+ * When joining, we need a node array the size of the entire cluster
9444+ * member list because we get responses from all nodes. When leaving,
9445+ * we only get responses from SG members, so the node array need only
9446+ * be that large.
9447+ */
9448+
9449+ if (sev->se_state < SEST_LEAVE_BEGIN)
9450+ count = sev->se_node_count;
9451+ else
9452+ count = sev->se_memb_count;
9453+
9454+ len1 = count * sizeof(uint32_t);
9455+ sev->se_len_ids = len1;
9456+
9457+ sev->se_node_ids = (uint32_t *) kmalloc(len1, GFP_KERNEL);
9458+ if (!sev->se_node_ids)
9459+ goto fail;
9460+
9461+ len2 = count * sizeof (char);
9462+ sev->se_len_status = len2;
9463+
9464+ sev->se_node_status = (char *) kmalloc(len2, GFP_KERNEL);
9465+ if (!sev->se_node_status)
9466+ goto fail_free;
9467+
9468+ memset(sev->se_node_status, 0, len2);
9469+ memset(sev->se_node_ids, 0, len1);
9470+
9471+ return 0;
9472+
9473+ fail_free:
9474+ kfree(sev->se_node_ids);
9475+ sev->se_node_ids = NULL;
9476+ sev->se_len_ids = 0;
9477+
9478+ fail:
9479+ return -ENOMEM;
9480+}
9481+
9482+/* Context: timer */
9483+
9484+static void sev_restart(unsigned long data)
9485+{
9486+ sm_sevent_t *sev = (sm_sevent_t *) data;
9487+
9488+ clear_bit(SEFL_DELAY, &sev->se_flags);
9489+ set_bit(SEFL_CHECK, &sev->se_flags);
9490+ wake_serviced(DO_JOINLEAVE);
9491+}
9492+
9493+static void schedule_sev_restart(sm_sevent_t *sev)
9494+{
9495+ init_timer(&sev->se_restart_timer);
9496+ sev->se_restart_timer.function = sev_restart;
9497+ sev->se_restart_timer.data = (long) sev;
9498+ mod_timer(&sev->se_restart_timer, jiffies + (RETRY_DELAY * HZ));
9499+}
9500+
9501+void free_sg_memb(sm_group_t *sg)
9502+{
9503+ sm_node_t *node;
9504+
9505+ while (!list_empty(&sg->memb)) {
9506+ node = list_entry(sg->memb.next, sm_node_t, list);
9507+ list_del(&node->list);
9508+ kfree(node);
9509+ }
9510+ sg->memb_count = 0;
9511+}
9512+
9513+/*
9514+ * 1. First step in joining a SG - send a message to all nodes in the cluster
9515+ * asking to join the named SG. If any nodes are members they will reply with
9516+ * a POS, or a WAIT (wait means try again, only one node can join at a time).
9517+ * If no one knows about this SG, they all send NEG replies which means we form
9518+ * the SG with just ourself as a member.
9519+ */
9520+
9521+static int send_join_notice(sm_sevent_t *sev)
9522+{
9523+ sm_group_t *sg = sev->se_sg;
9524+ sm_node_t *node;
9525+ char *msg;
9526+ int i = 0, error, namelen, len = 0;
9527+
9528+ /*
9529+ * Create node array from member list in which to collect responses.
9530+ */
9531+
9532+ error = init_sevent(sev);
9533+ if (error)
9534+ goto out;
9535+
9536+ list_for_each_entry(node, &sm_members, list) {
9537+ if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
9538+ sev->se_node_ids[i++] = node->id;
9539+ }
9540+
9541+ /*
9542+ * Create and send a join request message.
9543+ *
9544+ * Other nodes then run process_join_request and reply to us; we
9545+ * collect the responses in process_reply and check them in
9546+ * check_join_notice.
9547+ */
9548+
9549+ namelen = sg->namelen;
9550+ msg = create_smsg(sg, SMSG_JOIN_REQ, namelen, &len, sev);
9551+ memcpy(msg + sizeof(sm_msg_t), sg->name, namelen);
9552+
9553+ error = send_broadcast_message_sev(msg, len, sev);
9554+
9555+ out:
9556+ return error;
9557+}
9558+
9559+/*
9560+ * 2. Second step in joining a SG - after we collect all replies to our join
9561+ * request, we look at them. If anyone told us to wait, we'll wait a while, go
9562+ * back and start at step 1 again.
9563+ */
9564+
9565+static int check_join_notice(sm_sevent_t *sev)
9566+{
9567+ int pos = 0, wait = 0, neg = 0, restart = 0, i, error = 0;
9568+
9569+ for (i = 0; i < sev->se_node_count; i++) {
9570+ switch (sev->se_node_status[i]) {
9571+ case STATUS_POS:
9572+ /* this node is in the SG and will be in new proposed
9573+ * memb list */
9574+ pos++;
9575+ break;
9576+
9577+ case STATUS_WAIT:
9578+ /* this node is in the SG but something else is
9579+ * happening with it at the moment. */
9580+ wait++;
9581+ break;
9582+
9583+ case STATUS_NEG:
9584+ /* this node has no record of the SG we're interested
9585+ * in */
9586+ neg++;
9587+
9588+ if (sev->se_node_ids[i] == sm_our_nodeid)
9589+ sev->se_node_status[i] = STATUS_POS;
9590+ break;
9591+
9592+ default:
9593+ /* we didn't get a valid response from this node,
9594+ * restart the entire sev. */
9595+ restart++;
9596+ break;
9597+ }
9598+ }
9599+
9600+ if (pos && !wait && !restart) {
9601+ /* all current members of this sg pos'ed our entry */
9602+ } else if (!pos && !wait && !restart && neg) {
9603+ /* we're the first in the cluster to join this sg */
9604+ sev->se_sg->global_id = sm_new_global_id(sev->se_sg->level);
9605+ } else
9606+ error = -1;
9607+
9608+ return error;
9609+}
9610+
9611+/*
9612+ * 3. Third step in joining the SG - tell the nodes that are already members
9613+ * to "stop" the service. We stop them so that everyone can restart with the
9614+ * new member (us!) added.
9615+ */
9616+
9617+static int send_join_stop(sm_sevent_t *sev)
9618+{
9619+ sm_group_t *sg = sev->se_sg;
9620+ sm_node_t *node;
9621+ char *msg;
9622+ uint32_t be_count;
9623+ int i, len = 0, error = 0;
9624+
9625+ /*
9626+ * Form the SG memb list with us in it.
9627+ */
9628+
9629+ for (i = 0; i < sev->se_node_count; i++) {
9630+ if (sev->se_node_status[i] != STATUS_POS)
9631+ continue;
9632+
9633+ node = sm_new_node(sev->se_node_ids[i]);
9634+ if (!node)
9635+ goto fail;
9636+
9637+ list_add_tail(&node->list, &sg->memb);
9638+ sg->memb_count++;
9639+ }
9640+
9641+ /*
9642+ * Re-init the node vector in which to collect responses again.
9643+ */
9644+
9645+ sev->se_memb_count = sg->memb_count;
9646+
9647+ memset(sev->se_node_status, 0, sev->se_len_status);
9648+ memset(sev->se_node_ids, 0, sev->se_len_ids);
9649+ i = 0;
9650+
9651+ list_for_each_entry(node, &sg->memb, list)
9652+ sev->se_node_ids[i++] = node->id;
9653+
9654+ /*
9655+ * Create and send a stop message.
9656+ *
9657+ * Other nodes then run process_stop_request and process_join_stop and
9658+ * reply to us. They stop the sg we're trying to join if they agree.
9659+ * We collect responses in process_reply and check them in
9660+ * check_join_stop.
9661+ */
9662+
9663+ msg = create_smsg(sg, SMSG_JSTOP_REQ, sizeof(uint32_t), &len, sev);
9664+ be_count = cpu_to_be32(sg->memb_count);
9665+ memcpy(msg + sizeof(sm_msg_t), &be_count, sizeof(uint32_t));
9666+
9667+ error = send_members_message_sev(sg, msg, len, sev);
9668+ if (error < 0)
9669+ goto fail;
9670+
9671+ return 0;
9672+
9673+ fail:
9674+ free_sg_memb(sg);
9675+ return error;
9676+}
9677+
9678+/*
9679+ * 4. Fourth step in joining the SG - after we collect replies to our stop
9680+ * request, we look at them. Everyone sending POS agrees with us joining and
9681+ * has stopped their SG. If some nodes sent NEG, something is wrong and we
9682+ * don't have a good way to address that yet since some nodes may have sent
9683+ * POS.
9684+ *
9685+ * FIXME: even nodes replying with NEG should stop their SG so we can send an
9686+ * abort and have everyone at the same place to start from again.
9687+ */
9688+
9689+static int check_join_stop(sm_sevent_t *sev)
9690+{
9691+ sm_group_t *sg = sev->se_sg;
9692+ int i, pos = 0, neg = 0;
9693+
9694+ for (i = 0; i < sev->se_memb_count; i++) {
9695+ switch (sev->se_node_status[i]) {
9696+ case STATUS_POS:
9697+ pos++;
9698+ break;
9699+
9700+ case STATUS_NEG:
9701+ log_error(sg, "check_join_stop: neg from nodeid %u "
9702+ "(%d, %d, %u)", sev->se_node_ids[i],
9703+ pos, neg, sev->se_memb_count);
9704+ neg++;
9705+ break;
9706+
9707+ default:
9708+ log_error(sg, "check_join_stop: unknown status=%u "
9709+ "nodeid=%u", sev->se_node_status[i],
9710+ sev->se_node_ids[i]);
9711+ neg++;
9712+ break;
9713+ }
9714+ }
9715+
9716+ if (pos == sg->memb_count)
9717+ return 0;
9718+
9719+ free_sg_memb(sg);
9720+ return -1;
9721+}
9722+
9723+/*
9724+ * 5. Fifth step in joining the SG - everyone has stopped their service and we
9725+ * all now start the service with us, the new member, added to the SG member
9726+ * list. We send start to our own service here and send a message to the other
9727+ * members that they should also start their service.
9728+ */
9729+
9730+static int send_join_start(sm_sevent_t *sev)
9731+{
9732+ sm_group_t *sg = sev->se_sg;
9733+ sm_node_t *node;
9734+ uint32_t *memb;
9735+ char *msg;
9736+ int error, count = 0, len = 0;
9737+
9738+ /*
9739+ * Create a start message and send it.
9740+ */
9741+
9742+ msg = create_smsg(sg, SMSG_JSTART_CMD, 0, &len, sev);
9743+
9744+ error = send_members_message(sg, msg, len);
9745+ if (error < 0)
9746+ goto fail;
9747+
9748+ /*
9749+ * Start the service ourself. The chunk of memory with the member ids
9750+ * must be freed by the service when it is done with it.
9751+ */
9752+
9753+ SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
9754+ memb);
9755+
9756+ list_for_each_entry(node, &sg->memb, list)
9757+ memb[count++] = node->id;
9758+
9759+ set_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
9760+
9761+ sg->ops->start(sg->service_data, memb, count, sev->se_id,
9762+ SERVICE_NODE_JOIN);
9763+ return 0;
9764+
9765+ fail:
9766+ free_sg_memb(sg);
9767+ return error;
9768+}
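
SM_RETRY() above keeps retrying the allocation until it succeeds rather
than failing in the middle of the join protocol. Its real definition
lives in sm.h elsewhere in this patch; a plausible sketch of the idea
(the macro name and the yield strategy here are assumptions):

#define SM_RETRY_SKETCH(do_this, until_this)	\
do {						\
	do_this;				\
	while (!(until_this)) {			\
		schedule();	/* assumed: yield, then try again */ \
		do_this;				\
	}					\
} while (0)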
9769+
9770+/*
9771+ * 6. Sixth step in joining the SG - once the service has completed its start,
9772+ * it does a kcl_start_done() to signal us that it's done. That gets us here
9773+ * and we do a barrier with all other members which join the barrier when their
9774+ * service is done starting.
9775+ */
9776+
9777+static int startdone_barrier_new(sm_sevent_t *sev)
9778+{
9779+ sm_group_t *sg = sev->se_sg;
9780+ char bname[MAX_BARRIER_NAME_LEN];
9781+ int error;
9782+
9783+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
9784+ sev->se_barrier_status = -1;
9785+
9786+ set_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
9787+
9788+ /* If we're the only member, skip the barrier */
9789+ if (sg->memb_count == 1) {
9790+ process_startdone_barrier_new(sg, 0);
9791+ return 0;
9792+ }
9793+
9794+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
9795+ sg->global_id, sm_our_nodeid, sev->se_id, sg->memb_count);
9796+
9797+ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE_NEW);
9798+ if (error)
9799+ goto fail;
9800+
9801+ return 0;
9802+
9803+ fail:
9804+ clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
9805+ sg->ops->stop(sg->service_data);
9806+ free_sg_memb(sg);
9807+ return error;
9808+}
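
Every member must compute an identical barrier name for the barrier to
match up, which is why the name is always built from the joiner's node
id and sevent id (sm_our_nodeid and sev->se_id here; ue_nodeid and
ue_remote_seid on the existing members' side). For illustration, with
assumed values global_id=42, joining nodeid=3, sevent id=7 and 4
members, a standalone demo yields:

#include <stdio.h>

int main(void)
{
	char bname[64];

	/* example values only, not taken from a real cluster */
	snprintf(bname, sizeof(bname), "sm.%u.%u.%u.%u", 42u, 3u, 7u, 4u);
	printf("%s\n", bname);	/* prints: sm.42.3.7.4 */
	return 0;
}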
9809+
9810+/*
9811+ * 7. Seventh step in joining the SG - check that the barrier we joined with
9812+ * all other members returned with a successful status.
9813+ */
9814+
9815+static int check_startdone_barrier_new(sm_sevent_t *sev)
9816+{
9817+ sm_group_t *sg = sev->se_sg;
9818+ int error = sev->se_barrier_status;
9819+
9820+ if (error) {
9821+ sg->ops->stop(sg->service_data);
9822+ free_sg_memb(sg);
9823+ }
9824+ return error;
9825+}
9826+
9827+/*
9828+ * 8. Eighth step in joining the SG - send the service a "finish" indicating
9829+ * that all members have successfully started the service.
9830+ */
9831+
9832+static void do_finish_new(sm_sevent_t *sev)
9833+{
9834+ sm_group_t *sg = sev->se_sg;
9835+
9836+ sg->state = SGST_RUN;
9837+ sg->sevent = NULL;
9838+ clear_bit(SGFL_SEVENT, &sg->flags);
9839+
9840+ sg->ops->finish(sg->service_data, sev->se_id);
9841+}
9842+
9843+/*
9844+ * 9. Ninth step in joining the SG - it's done so get rid of the sevent stuff
9845+ * and tell the process which initiated the join that it's done.
9846+ */
9847+
9848+static void sevent_done(sm_sevent_t *sev)
9849+{
9850+ sm_group_t *sg = sev->se_sg;
9851+
9852+ list_del(&sev->se_list);
9853+ release_sevent(sev);
9854+ kfree(sev);
9855+ complete(&sg->event_comp);
9856+}
9857+
9858+/*
9859+ * Move through the steps of a join. Summary:
9860+ *
9861+ * 1. Send a join notice to all cluster members.
9862+ * 2. Collect and check replies to the join notice.
9863+ * 3. Send a stop message to all SG members.
9864+ * 4. Collect and check replies to the stop message.
9865+ * 5. Send a start message to all SG members and start service ourself.
9866+ * 6. Use barrier to wait for all nodes to complete the start.
9867+ * 7. Check that all SG members joined the barrier.
9868+ * 8. Send finish to the service indicating that all nodes started it.
9869+ * 9. Clean up sevent and signal completion to the process that started the join
9870+ */
9871+
9872+static void process_join_sevent(sm_sevent_t *sev)
9873+{
9874+ int error = 0;
9875+
9876+ /*
9877+ * We may cancel the current join attempt if another node is also
9878+ * attempting to join or leave. (Only a single node can join or leave
9879+ * at once.) If cancelled, our join attempt will be restarted later.
9880+ */
9881+
9882+ if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
9883+ error = -1;
9884+ goto cancel;
9885+ }
9886+
9887+ log_debug(sev->se_sg, "sevent state %u", sev->se_state);
9888+
9889+ switch (sev->se_state) {
9890+
9891+ /*
9892+ * An sevent is created in kcl_join_service with a state of
9893+ * JOIN_BEGIN.
9894+ */
9895+
9896+ case SEST_JOIN_BEGIN:
9897+ sev->se_state = SEST_JOIN_ACKWAIT;
9898+ error = send_join_notice(sev);
9899+ break;
9900+
9901+ /*
9902+ * se_state is changed from JOIN_ACKWAIT to JOIN_ACKED in
9903+ * process_reply (when all the replies have been received)
9904+ */
9905+
9906+ case SEST_JOIN_ACKED:
9907+ error = check_join_notice(sev);
9908+ if (error)
9909+ break;
9910+
9911+ sev->se_state = SEST_JSTOP_ACKWAIT;
9912+ error = send_join_stop(sev);
9913+ break;
9914+
9915+ /*
9916+ * se_state is changed from JSTOP_ACKWAIT to JSTOP_ACKED in
9917+ * process_reply (when all the replies have been received)
9918+ */
9919+
9920+ case SEST_JSTOP_ACKED:
9921+ error = check_join_stop(sev);
9922+ if (error)
9923+ break;
9924+
9925+ sev->se_state = SEST_JSTART_SERVICEWAIT;
9926+ error = send_join_start(sev);
9927+ break;
9928+
9929+ /*
9930+ * se_state is changed from JSTART_SERVICEWAIT to
9931+ * JSTART_SERVICEDONE in kcl_start_done
9932+ */
9933+
9934+ case SEST_JSTART_SERVICEDONE:
9935+ sev->se_state = SEST_BARRIER_WAIT;
9936+ error = startdone_barrier_new(sev);
9937+ break;
9938+
9939+ /*
9940+ * se_state is changed from BARRIER_WAIT to BARRIER_DONE in
9941+ * process_startdone_barrier_new
9942+ */
9943+
9944+ case SEST_BARRIER_DONE:
9945+ error = check_startdone_barrier_new(sev);
9946+ if (error)
9947+ break;
9948+
9949+ do_finish_new(sev);
9950+ sevent_done(sev);
9951+ break;
9952+
9953+ default:
9954+ log_error(sev->se_sg, "no join processing for state %u",
9955+ sev->se_state);
9956+ }
9957+
9958+ cancel:
9959+ if (error) {
9960+ /* restart the sevent from the beginning */
9961+ sev->se_state = SEST_JOIN_BEGIN;
9962+ sev->se_sg->global_id = 0;
9963+ set_bit(SEFL_DELAY, &sev->se_flags);
9964+ schedule_sev_restart(sev);
9965+ }
9966+}
9967+
9968+/*
9969+ * 1. First step in leaving an SG - send a message to other SG members asking
9970+ * to leave the SG. Nodes that don't have another active sevent or uevent for
9971+ * this SG will return POS.
9972+ */
9973+
9974+static int send_leave_notice(sm_sevent_t *sev)
9975+{
9976+ sm_group_t *sg = sev->se_sg;
9977+ sm_node_t *node;
9978+ char *msg;
9979+ int i = 0, error = -1, len = 0;
9980+
9981+ /*
9982+ * Create a node array from member list in which to collect responses.
9983+ */
9984+
9985+ error = init_sevent(sev);
9986+ if (error)
9987+ goto out;
9988+
9989+ list_for_each_entry(node, &sg->memb, list)
9990+ sev->se_node_ids[i++] = node->id;
9991+
9992+ /*
9993+ * Create and send a leave request message.
9994+ */
9995+
9996+ msg = create_smsg(sg, SMSG_LEAVE_REQ, 0, &len, sev);
9997+
9998+ error = send_members_message_sev(sg, msg, len, sev);
9999+
10000+ out:
10001+ return error;
10002+}
10003+
10004+/*
10005+ * 2. Second step in leaving an SG - after we collect all replies to our leave
10006+ * request, we look at them. If anyone replied with WAIT, we abort our attempt
10007+ * at leaving and try again in a bit.
10008+ */
10009+
10010+static int check_leave_notice(sm_sevent_t *sev)
10011+{
10012+ int pos = 0, wait = 0, neg = 0, restart = 0, i;
10013+
10014+ for (i = 0; i < sev->se_memb_count; i++) {
10015+ switch (sev->se_node_status[i]) {
10016+ case STATUS_POS:
10017+ pos++;
10018+ break;
10019+
10020+ case STATUS_WAIT:
10021+ wait++;
10022+ break;
10023+
10024+ case STATUS_NEG:
10025+ neg++;
10026+ break;
10027+
10028+ default:
10029+ /* we didn't get a valid response from this node,
10030+ * restart the entire sev. */
10031+ restart++;
10032+ break;
10033+ }
10034+ }
10035+
10036+ /* all members approve */
10037+ if (pos && !wait && !restart)
10038+ return 0;
10039+
10040+ return -1;
10041+}
10042+
10043+/*
10044+ * 3. Third step in leaving the SG - tell the member nodes to "stop" the SG.
10045+ * They must be stopped in order to restart without us as a member.
10046+ */
10047+
10048+static int send_leave_stop(sm_sevent_t *sev)
10049+{
10050+ sm_group_t *sg = sev->se_sg;
10051+ char *msg;
10052+ int error, len = 0;
10053+
10054+ /*
10055+ * Re-init the status vector in which to collect responses.
10056+ */
10057+
10058+ memset(sev->se_node_status, 0, sev->se_len_status);
10059+
10060+ /*
10061+ * Create and send a stop message.
10062+ */
10063+
10064+ msg = create_smsg(sg, SMSG_LSTOP_REQ, 0, &len, sev);
10065+
10066+ error = send_members_message_sev(sg, msg, len, sev);
10067+ if (error < 0)
10068+ goto out;
10069+
10070+ /*
10071+ * we and all others stop the SG now
10072+ */
10073+
10074+ sg->ops->stop(sg->service_data);
10075+
10076+ out:
10077+ return error;
10078+}
10079+
10080+/*
10081+ * 4. Fourth step in leaving the SG - check the replies to our stop request.
10082+ * Same problem with getting different replies as check_join_stop.
10083+ */
10084+
10085+static int check_leave_stop(sm_sevent_t *sev)
10086+{
10087+ sm_group_t *sg = sev->se_sg;
10088+ int i, pos = 0, neg = 0;
10089+
10090+ for (i = 0; i < sev->se_memb_count; i++) {
10091+ switch (sev->se_node_status[i]) {
10092+ case STATUS_POS:
10093+ pos++;
10094+ break;
10095+
10096+ case STATUS_NEG:
10097+ log_error(sg, "check_leave_stop: fail from nodeid %u "
10098+ "(%d, %d, %u)", sev->se_node_ids[i],
10099+ pos, neg, sev->se_memb_count);
10100+ neg++;
10101+ break;
10102+
10103+ default:
10104+ log_error(sg, "check_leave_stop: status %u nodeid %u",
10105+ sev->se_node_status[i], sev->se_node_ids[i]);
10106+ neg++;
10107+ break;
10108+ }
10109+ }
10110+
10111+ if (pos == sg->memb_count)
10112+ return 0;
10113+
10114+ return -1;
10115+}
10116+
10117+/*
10118+ * 5. Fifth step in leaving the SG - tell the other SG members to restart the
10119+ * service without us. We, of course, don't start our own stopped service. If
10120+ * we're the last SG member and leaving, we jump right to the next step.
10121+ */
10122+
10123+static int send_leave_start(sm_sevent_t *sev)
10124+{
10125+ sm_group_t *sg = sev->se_sg;
10126+ char *msg;
10127+ int error = 0, len = 0;
10128+
10129+ if (sg->memb_count == 1) {
10130+ sev->se_state = SEST_LSTART_REMOTEDONE;
10131+ set_bit(SEFL_CHECK, &sev->se_flags);
10132+ wake_serviced(DO_JOINLEAVE);
10133+ } else {
10134+ msg = create_smsg(sg, SMSG_LSTART_CMD, 0, &len, sev);
10135+ error = send_members_message(sg, msg, len);
10136+ }
10137+ return error;
10138+}
10139+
10140+/*
10141+ * Move through the steps of a leave. Summary:
10142+ *
10143+ * 1. Send a leave notice to all SG members.
10144+ * 2. Collect and check replies to the leave notice.
10145+ * 3. Send a stop message to all SG members and stop our own SG.
10146+ * 4. Collect and check replies to the stop message.
10147+ * 5. Send a start message to SG members.
10148+ * 6. Clean up sevent and signal completion to the process that
10149+ * started the leave.
10150+ */
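
Step 6's completion handshake works through sg->event_comp: sevent_done()
calls complete(), releasing the process that initiated the leave (or
join). A sketch of the initiator's side under that assumption
(leave_and_wait is hypothetical; the real kcl_leave_service() body lives
elsewhere in this patch):

static void leave_and_wait(sm_group_t *sg, sm_sevent_t *sev)
{
	init_completion(&sg->event_comp);
	new_joinleave(sev);			/* hand sev to the SM thread */
	wait_for_completion(&sg->event_comp);	/* released by sevent_done() */
}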
10151+
10152+static void process_leave_sevent(sm_sevent_t *sev)
10153+{
10154+ int error = 0;
10155+
10156+ /*
10157+ * We may cancel the current leave attempt if another node is also
10158+ * attempting to join or leave. (Only a single node can join or leave
10159+ * at once.) Our leave attempt will be restarted after being
10160+ * cancelled.
10161+ */
10162+
10163+ if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
10164+ error = 1;
10165+ goto cancel;
10166+ }
10167+
10168+ if (test_bit(SGFL_UEVENT, &sev->se_sg->flags)) {
10169+ error = 2;
10170+ goto cancel;
10171+ }
10172+
10173+ if (!list_empty(&sev->se_sg->joining)) {
10174+ error = 3;
10175+ goto cancel;
10176+ }
10177+
10178+ log_debug(sev->se_sg, "sevent state %u", sev->se_state);
10179+
10180+ switch (sev->se_state) {
10181+
10182+ /*
10183+ * An sevent is created in kcl_leave_service with a state of
10184+ * LEAVE_BEGIN.
10185+ */
10186+
10187+ case SEST_LEAVE_BEGIN:
10188+ sev->se_state = SEST_LEAVE_ACKWAIT;
10189+ error = send_leave_notice(sev);
10190+ break;
10191+
10192+ /*
10193+ * se_state is changed from LEAVE_ACKWAIT to LEAVE_ACKED in
10194+ * process_reply (when all the replies have been received)
10195+ */
10196+
10197+ case SEST_LEAVE_ACKED:
10198+ error = check_leave_notice(sev);
10199+ if (error)
10200+ break;
10201+
10202+ sev->se_state = SEST_LSTOP_ACKWAIT;
10203+ error = send_leave_stop(sev);
10204+ break;
10205+
10206+ /*
10207+ * se_state is changed from LSTOP_ACKWAIT to LSTOP_ACKED in
10208+ * process_reply
10209+ */
10210+
10211+ case SEST_LSTOP_ACKED:
10212+ error = check_leave_stop(sev);
10213+ if (error)
10214+ break;
10215+
10216+ sev->se_state = SEST_LSTART_WAITREMOTE;
10217+ error = send_leave_start(sev);
10218+ break;
10219+
10220+ /*
10221+ * se_state is changed from LSTART_WAITREMOTE to
10222+ * LSTART_REMOTEDONE in process_leave_done
10223+ */
10224+
10225+ case SEST_LSTART_REMOTEDONE:
10226+ sevent_done(sev);
10227+ break;
10228+
10229+ default:
10230+ log_error(sev->se_sg, "process_leave_sevent state=%u\n",
10231+ sev->se_state);
10232+ }
10233+
10234+ cancel:
10235+ if (error) {
10236+ /* restart the sevent from the beginning */
10237+ sev->se_state = SEST_LEAVE_BEGIN;
10238+ set_bit(SEFL_DELAY, &sev->se_flags);
10239+ schedule_sev_restart(sev);
10240+ }
10241+}
10242+
10243+/*
10244+ * Sevent backout code. Take appropriate steps when a recovery occurs while
10245+ * we're in the midst of an sevent. The recovery may or may not affect the
10246+ * sevent. If it does, it usually means cancelling the sevent and restarting
10247+ * it from the beginning once the recovery processing is done.
10248+ */
10249+
10250+/*
10251+ * If any of the nodes that replied with POS is dead, we give up on the current
10252+ * join attempt and restart. Otherwise, this sevent can continue.
10253+ */
10254+
10255+static int backout_join_acked(sm_sevent_t *sev)
10256+{
10257+ sm_node_t *node;
10258+ int i;
10259+
10260+ for (i = 0; i < sev->se_node_count; i++) {
10261+ if (sev->se_node_status[i] != STATUS_POS)
10262+ continue;
10263+
10264+ list_for_each_entry(node, &sm_members, list) {
10265+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags) &&
10266+ (node->id == sev->se_node_ids[i]))
10267+ return TRUE;
10268+ }
10269+ }
10270+ return FALSE;
10271+}
10272+
10273+/*
10274+ * In this state our sg member list exists and mark_affected_sgs() will have
10275+ * set NEED_RECOVERY if any of the nodes in the sg we're joining is dead. We
10276+ * restart the join process if this is the case, otherwise this sevent can
10277+ * continue.
10278+ */
10279+
10280+static int backout_jstop_ackwait(sm_sevent_t *sev)
10281+{
10282+ sm_group_t *sg = sev->se_sg;
10283+
10284+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10285+ return FALSE;
10286+
10287+ clear_bit(SEFL_ALLOW_JSTOP, &sev->se_flags);
10288+ free_sg_memb(sg);
10289+ return TRUE;
10290+}
10291+
10292+/*
10293+ * Same as previous.
10294+ */
10295+
10296+static int backout_jstop_acked(sm_sevent_t *sev)
10297+{
10298+ return backout_jstop_ackwait(sev);
10299+}
10300+
10301+/*
10302+ * If NEED_RECOVERY is set, a member of the sg we're joining died while we were
10303+ * starting our service. The recovery process will restart the service on all
10304+ * the prior sg members (not including those that died or us). We will
10305+ * reattempt our join which should be accepted once the nodes are done with
10306+ * recovery.
10307+ */
10308+
10309+static int backout_jstart_servicewait(sm_sevent_t *sev)
10310+{
10311+ sm_group_t *sg = sev->se_sg;
10312+
10313+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10314+ return FALSE;
10315+
10316+ clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
10317+ sg->ops->stop(sg->service_data);
10318+ free_sg_memb(sg);
10319+ return TRUE;
10320+}
10321+
10322+/*
10323+ * Same as previous.
10324+ */
10325+
10326+static int backout_jstart_servicedone(sm_sevent_t *sev)
10327+{
10328+ return backout_jstart_servicewait(sev);
10329+}
10330+
10331+/*
10332+ * If NEED_RECOVERY is set, a member of the sg we're joining died while we were
10333+ * waiting on the "all done" barrier. Stop our service that we just started
10334+ * and cancel the barrier. The recovery process will restart the service on
10335+ * all the prior sg members (not including those that died or us). We will
10336+ * reattempt our join which should be accepted once the nodes are done with
10337+ * recovery.
10338+ */
10339+
10340+static int backout_barrier_wait(sm_sevent_t *sev)
10341+{
10342+ sm_group_t *sg = sev->se_sg;
10343+ char bname[MAX_BARRIER_NAME_LEN];
10344+
10345+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10346+ return FALSE;
10347+
10348+ clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
10349+
10350+ sg->ops->stop(sg->service_data);
10351+
10352+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
10353+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
10354+ sg->global_id, sm_our_nodeid, sev->se_id,
10355+ sg->memb_count);
10356+ kcl_barrier_cancel(bname);
10357+
10358+ free_sg_memb(sg);
10359+ return TRUE;
10360+}
10361+
10362+/*
10363+ * If NEED_RECOVERY is set, a member of the sg we just joined has failed. The
10364+ * recovery began after the barrier callback. If the result in the callback is
10365+ * "success" then we are joined, this sevent is finished and we'll process the
10366+ * sg within the forthcoming recovery with the other members.
10367+ *
10368+ * We rely upon cnxman to guarantee that once all nodes have joined a barrier,
10369+ * all nodes will receive the corresponding barrier callback *before any*
10370+ * receive an sm_member_update() due to one of those nodes failing just after
10371+ * joining the barrier. If some nodes receive the sm_member_update() before
10372+ * the barrier callback and others receive the barrier callback before the
10373+ * sm_member_update() then they will disagree as to whether the node joining/
10374+ * leaving is in/out of the sg.
10375+ */
10376+
10377+static int backout_barrier_done(sm_sevent_t *sev)
10378+{
10379+ sm_group_t *sg = sev->se_sg;
10380+
10381+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10382+ return FALSE;
10383+
10384+ if (!sev->se_barrier_status) {
10385+ do_finish_new(sev);
10386+ sevent_done(sev);
10387+ return FALSE;
10388+ } else {
10389+ sg->ops->stop(sg->service_data);
10390+ free_sg_memb(sg);
10391+ return TRUE;
10392+ }
10393+}
10394+
10395+/*
10396+ * We've done nothing yet, just restart when recovery is done (if sg is flagged
10397+ * with recovery.)
10398+ */
10399+
10400+static int backout_leave_begin(sm_sevent_t *sev)
10401+{
10402+ sm_group_t *sg = sev->se_sg;
10403+
10404+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10405+ return FALSE;
10406+
10407+ return TRUE;
10408+}
10409+
10410+/*
10411+ * Ignore any replies to our leave notice and restart when recovery is done (if
10412+ * sg is flagged with recovery.)
10413+ */
10414+
10415+static int backout_leave_ackwait(sm_sevent_t *sev)
10416+{
10417+ sm_group_t *sg = sev->se_sg;
10418+
10419+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10420+ return FALSE;
10421+
10422+ clear_bit(SEFL_ALLOW_LEAVE, &sev->se_flags);
10423+
10424+ return TRUE;
10425+}
10426+
10427+/*
10428+ * Same as previous.
10429+ */
10430+
10431+static int backout_leave_acked(sm_sevent_t *sev)
10432+{
10433+ return backout_leave_ackwait(sev);
10434+}
10435+
10436+/*
10437+ * Ignore any stop replies. All the members will be stopped anyway to do the
10438+ * recovery. Let that happen and restart our leave when done.
10439+ */
10440+
10441+static int backout_lstop_ackwait(sm_sevent_t *sev)
10442+{
10443+ sm_group_t *sg = sev->se_sg;
10444+
10445+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10446+ return FALSE;
10447+
10448+ clear_bit(SEFL_ALLOW_LSTOP, &sev->se_flags);
10449+
10450+ return TRUE;
10451+}
10452+
10453+/*
10454+ * Same as previous.
10455+ */
10456+
10457+static int backout_lstop_acked(sm_sevent_t *sev)
10458+{
10459+ return backout_lstop_ackwait(sev);
10460+}
10461+
10462+/*
10463+ * All members will be stopped due to recovery and restarted by recovery
10464+ * processing. That includes us, we have to retry the leave once the recovery
10465+ * is done.
10466+ */
10467+
10468+static int backout_lstart_waitremote(sm_sevent_t *sev)
10469+{
10470+ sm_group_t *sg = sev->se_sg;
10471+
10472+ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10473+ return FALSE;
10474+
10475+ return TRUE;
10476+}
10477+
10478+/*
10479+ * Reset an sevent to its beginning so it can be restarted. This is necessary
10480+ * when recovery affects an SG while we're trying to join or leave (i.e. a node
10481+ * in the SG fails).
10482+ */
10483+
10484+void backout_sevents(void)
10485+{
10486+ sm_sevent_t *sev, *safe;
10487+ int delay;
10488+
10489+ list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
10490+
10491+ delay = FALSE;
10492+
10493+ log_debug(sev->se_sg, "backout sevent state %u", sev->se_state);
10494+
10495+ switch (sev->se_state) {
10496+
10497+ /* backout after kcl_join_service and before
10498+ * send_join_notice */
10499+ case SEST_JOIN_BEGIN:
10500+ break;
10501+
10502+ /* backout after send_join_notice and before final
10503+ * process_reply */
10504+ case SEST_JOIN_ACKWAIT:
10505+ clear_bit(SEFL_ALLOW_JOIN, &sev->se_flags);
10506+ sev->se_state = SEST_JOIN_BEGIN;
10507+ schedule_sev_restart(sev);
10508+ break;
10509+
10510+ /* backout after final process_reply and before
10511+ * check_join_notice */
10512+ case SEST_JOIN_ACKED:
10513+ delay = backout_join_acked(sev);
10514+ break;
10515+
10516+ /* backout after send_join_stop and before final
10517+ * process_reply */
10518+ case SEST_JSTOP_ACKWAIT:
10519+ delay = backout_jstop_ackwait(sev);
10520+ break;
10521+
10522+ /* backout after final process_reply and before
10523+ * check_join_stop */
10524+ case SEST_JSTOP_ACKED:
10525+ delay = backout_jstop_acked(sev);
10526+ break;
10527+
10528+ /* backout after send_join_start and before
10529+ * kcl_start_done */
10530+ case SEST_JSTART_SERVICEWAIT:
10531+ delay = backout_jstart_servicewait(sev);
10532+ break;
10533+
10534+ /* backout after kcl_start_done and before
10535+ * startdone_barrier_new */
10536+ case SEST_JSTART_SERVICEDONE:
10537+ delay = backout_jstart_servicedone(sev);
10538+ break;
10539+
10540+ /* backout after startdone_barrier_new and before
10541+ * callback_startdone_barrier_new */
10542+ case SEST_BARRIER_WAIT:
10543+ delay = backout_barrier_wait(sev);
10544+ break;
10545+
10546+ /* backout after callback_startdone_barrier_new and
10547+ * before check_startdone_barrier_new */
10548+ case SEST_BARRIER_DONE:
10549+ delay = backout_barrier_done(sev);
10550+ break;
10551+
10552+ /* backout after kcl_leave_service and before
10553+ * send_leave_notice */
10554+ case SEST_LEAVE_BEGIN:
10555+ delay = backout_leave_begin(sev);
10556+ break;
10557+
10558+ /* backout after send_leave_notice and before final
10559+ * process_reply */
10560+ case SEST_LEAVE_ACKWAIT:
10561+ delay = backout_leave_ackwait(sev);
10562+ break;
10563+
10564+ /* backout after final process_reply and before
10565+ * check_leave_notice */
10566+ case SEST_LEAVE_ACKED:
10567+ delay = backout_leave_acked(sev);
10568+ break;
10569+
10570+ /* backout after send_leave_stop and before final
10571+ * process_reply */
10572+ case SEST_LSTOP_ACKWAIT:
10573+ delay = backout_lstop_ackwait(sev);
10574+ break;
10575+
10576+ /* backout after final process_reply and before
10577+ * check_leave_stop */
10578+ case SEST_LSTOP_ACKED:
10579+ delay = backout_lstop_acked(sev);
10580+ break;
10581+
10582+ /* backout after send_leave_start and before
10583+ * process_lstart_done */
10584+ case SEST_LSTART_WAITREMOTE:
10585+ delay = backout_lstart_waitremote(sev);
10586+ break;
10587+
10588+ /* backout after process_lstart_done and before
10589+ * process_leave_sevent */
10590+ case SEST_LSTART_REMOTEDONE:
10591+ sevent_done(sev);
10592+ delay = FALSE;
10593+ break;
10594+
10595+ default:
10596+ log_error(sev->se_sg, "backout_sevents: bad state %d",
10597+ sev->se_state);
10598+ }
10599+
10600+ if (delay) {
10601+ set_bit(SEFL_DELAY, &sev->se_flags);
10602+
10603+ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
10604+ sev->se_state = SEST_LEAVE_BEGIN;
10605+ /* The DELAY flag will be cleared once recovery
10606+				 * is done, allowing the leave to be retried. */
10607+ } else {
10608+ sev->se_state = SEST_JOIN_BEGIN;
10609+ /* restart timer function will clear DELAY */
10610+ schedule_sev_restart(sev);
10611+ }
10612+ }
10613+ }
10614+}
10615+
10616+void process_joinleave(void)
10617+{
10618+ sm_sevent_t *sev = NULL, *safe;
10619+
10620+ spin_lock(&new_event_lock);
10621+ if (!list_empty(&new_event)) {
10622+ sev = list_entry(new_event.next, sm_sevent_t, se_list);
10623+ list_del(&sev->se_list);
10624+ list_add_tail(&sev->se_list, &joinleave_events);
10625+ set_bit(SEFL_CHECK, &sev->se_flags);
10626+ }
10627+ spin_unlock(&new_event_lock);
10628+
10629+ list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
10630+ if (!test_and_clear_bit(SEFL_CHECK, &sev->se_flags))
10631+ continue;
10632+
10633+ if (test_bit(SEFL_DELAY, &sev->se_flags))
10634+ continue;
10635+
10636+ if (sev->se_state < SEST_LEAVE_BEGIN)
10637+ process_join_sevent(sev);
10638+ else
10639+ process_leave_sevent(sev);
10640+ }
10641+}
10642diff -urN linux-orig/cluster/cman/sm_joinleave.h linux-patched/cluster/cman/sm_joinleave.h
10643--- linux-orig/cluster/cman/sm_joinleave.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 10644+++ linux-patched/cluster/cman/sm_joinleave.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 10645@@ -0,0 +1,23 @@
10646+/******************************************************************************
10647+*******************************************************************************
10648+**
10649+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10650+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10651+**
10652+** This copyrighted material is made available to anyone wishing to use,
10653+** modify, copy, or redistribute it subject to the terms and conditions
10654+** of the GNU General Public License v.2.
10655+**
10656+*******************************************************************************
10657+******************************************************************************/
10658+
10659+#ifndef __SM_JOINLEAVE_DOT_H__
10660+#define __SM_JOINLEAVE_DOT_H__
10661+
10662+void init_joinleave(void);
10663+void new_joinleave(sm_sevent_t *sev);
10664+void process_joinleave(void);
10665+void backout_sevents(void);
10666+sm_sevent_t *find_sevent(unsigned int id);
10667+
10668+#endif
10669diff -urN linux-orig/cluster/cman/sm_membership.c linux-patched/cluster/cman/sm_membership.c
10670--- linux-orig/cluster/cman/sm_membership.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 10671+++ linux-patched/cluster/cman/sm_membership.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 10672@@ -0,0 +1,696 @@
10673+/******************************************************************************
10674+*******************************************************************************
10675+**
10676+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10677+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10678+**
10679+** This copyrighted material is made available to anyone wishing to use,
10680+** modify, copy, or redistribute it subject to the terms and conditions
10681+** of the GNU General Public License v.2.
10682+**
10683+*******************************************************************************
10684+******************************************************************************/
10685+
10686+#include "sm.h"
10687+
10688+extern struct list_head sm_members;
10689+
10690+/*
10691+ * Routines for SG members to handle other nodes joining or leaving the SG.
10692+ * These "uevent" membership update routines are the response to an "sevent" on
10693+ * a joining/leaving node.
10694+ */
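
For orientation, the sevent and uevent sides pair up over the SMSG types
from sm_internal.h; a purely illustrative map, derived from the handlers
in this file and in sm_joinleave.c:

static const char *smsg_pairing_sketch[] = {
	"SMSG_JSTOP_REQ  (joiner) -> SMSG_JSTOP_REP   (process_join_stop)",
	"SMSG_JSTART_CMD (joiner) -> start + barrier  (process_join_start)",
	"SMSG_LSTOP_REQ  (leaver) -> SMSG_LSTOP_REP   (process_leave_stop)",
	"SMSG_LSTART_CMD (leaver) -> SMSG_LSTART_DONE (uevent_done)",
};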
10695+
10696+static void del_memb_node(sm_group_t *sg, uint32_t nodeid)
10697+{
10698+ sm_node_t *node;
10699+
10700+ list_for_each_entry(node, &sg->memb, list) {
10701+ if (node->id != nodeid)
10702+ continue;
10703+ list_del(&node->list);
10704+ kfree(node);
10705+ sg->memb_count--;
10706+ log_debug(sg, "del node %u count %d", nodeid, sg->memb_count);
10707+ break;
10708+ }
10709+}
10710+
10711+static void add_memb_node(sm_group_t *sg, sm_node_t *node)
10712+{
10713+ list_add_tail(&node->list, &sg->memb);
10714+ sg->memb_count++;
10715+ log_debug(sg, "add node %u count %d", node->id, sg->memb_count);
10716+}
10717+
10718+/*
10719+ * Join 1. The receive end of send_join_stop() from a node requesting to join
10720+ * the SG. We stop the service so it can be restarted with the new node.
10721+ */
10722+
10723+static int process_join_stop(sm_group_t *sg)
10724+{
10725+ sm_uevent_t *uev = &sg->uevent;
10726+ sm_node_t *node;
10727+ sm_msg_t reply;
10728+ int error;
10729+
10730+ if (uev->ue_num_nodes != sg->memb_count + 1) {
10731+ log_error(sg, "process_join_stop: bad num nodes %u %u",
10732+ uev->ue_num_nodes, sg->memb_count);
10733+ return -1;
10734+ }
10735+
10736+ sm_set_event_id(&uev->ue_id);
10737+
10738+ node = sm_find_joiner(sg, uev->ue_nodeid);
10739+ SM_ASSERT(node,);
10740+
10741+ sg->state = SGST_UEVENT;
10742+ sg->ops->stop(sg->service_data);
10743+
10744+ reply.ms_type = SMSG_JSTOP_REP;
10745+ reply.ms_status = STATUS_POS;
10746+ reply.ms_sevent_id = uev->ue_remote_seid;
10747+ smsg_bswap_out(&reply);
10748+
10749+ error = send_nodeid_message((char *) &reply, sizeof(reply),
10750+ uev->ue_nodeid);
10751+ if (error < 0)
10752+ return error;
10753+ return 0;
10754+}
10755+
10756+/*
10757+ * Join 2. The receive end of send_join_start() from a node joining the SG.
10758+ * We are re-starting the service with the new member added.
10759+ */
10760+
10761+static int process_join_start(sm_group_t *sg)
10762+{
10763+ sm_uevent_t *uev = &sg->uevent;
10764+ sm_node_t *node;
10765+ uint32_t *memb;
10766+ int count = 0;
10767+
10768+ /* this memory is passed to the service which must free it */
10769+ SM_RETRY(memb =
10770+ kmalloc((sg->memb_count + 1) * sizeof(uint32_t), GFP_KERNEL),
10771+ memb);
10772+
10773+ /* transfer joining node from joining list to member list */
10774+ node = sm_find_joiner(sg, uev->ue_nodeid);
10775+ SM_ASSERT(node, printk("nodeid=%u\n", uev->ue_nodeid););
10776+ list_del(&node->list);
10777+ add_memb_node(sg, node);
10778+
10779+ /* the new member list for the service */
10780+ list_for_each_entry(node, &sg->memb, list)
10781+ memb[count++] = node->id;
10782+
10783+ set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
10784+
10785+ sg->ops->start(sg->service_data, memb, count, uev->ue_id,
10786+ SERVICE_NODE_JOIN);
10787+ return 0;
10788+}
10789+
10790+/*
10791+ * Join 3. When done starting their local service, every previous SG member
10792+ * calls startdone_barrier() and the new/joining member calls
10793+ * startdone_barrier_new(). The barrier returns when everyone has started
10794+ * their service and joined the barrier.
10795+ */
10796+
10797+static int startdone_barrier(sm_group_t *sg)
10798+{
10799+ sm_uevent_t *uev = &sg->uevent;
10800+ char bname[MAX_BARRIER_NAME_LEN];
10801+ int error;
10802+
10803+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
10804+ uev->ue_barrier_status = -1;
10805+
10806+ set_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
10807+
10808+ /* If we're the only member, skip the barrier */
10809+ if (sg->memb_count == 1) {
10810+ process_startdone_barrier(sg, 0);
10811+ return 0;
10812+ }
10813+
10814+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
10815+ sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
10816+ sg->memb_count);
10817+
10818+ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE);
10819+
10820+ return error;
10821+}
10822+
10823+/*
10824+ * Join 4. Check that the "all started" barrier returned a successful status.
10825+ * The newly joined member calls check_startdone_barrier_new().
10826+ */
10827+
10828+static int check_startdone_barrier(sm_group_t *sg)
10829+{
10830+ int error = sg->uevent.ue_barrier_status;
10831+ return error;
10832+}
10833+
10834+/*
10835+ * Join 5. Send the service a "finish" indicating that all members have
10836+ * successfully started. The newly joined member calls do_finish_new().
10837+ */
10838+
10839+static void do_finish(sm_group_t *sg)
10840+{
10841+ sg->state = SGST_RUN;
10842+ clear_bit(SGFL_UEVENT, &sg->flags);
10843+ sg->ops->finish(sg->service_data, sg->uevent.ue_id);
10844+}
10845+
10846+/*
10847+ * Join 6. The uevent is done. If this was a uevent for a node leaving the
10848+ * SG, then send a final message to the departed node signalling that the
10849+ * remaining nodes have restarted since it left.
10850+ */
10851+
10852+static void uevent_done(sm_group_t *sg)
10853+{
10854+ sm_uevent_t *uev = &sg->uevent;
10855+ sm_msg_t reply;
10856+
10857+ if (test_bit(UEFL_LEAVE, &uev->ue_flags)) {
10858+ reply.ms_type = SMSG_LSTART_DONE;
10859+ reply.ms_status = STATUS_POS;
10860+ reply.ms_sevent_id = uev->ue_remote_seid;
10861+ smsg_bswap_out(&reply);
10862+ send_nodeid_message((char *) &reply, sizeof(reply),
10863+ uev->ue_nodeid);
10864+ }
10865+ memset(&sg->uevent, 0, sizeof(sm_uevent_t));
10866+}
10867+
10868+/*
10869+ * Leave 1. The receive end of send_leave_stop() from a node leaving the SG.
10870+ */
10871+
10872+static int process_leave_stop(sm_group_t *sg)
10873+{
10874+ sm_uevent_t *uev = &sg->uevent;
10875+ sm_msg_t reply;
10876+ int error;
10877+
10878+ sm_set_event_id(&uev->ue_id);
10879+
10880+ sg->state = SGST_UEVENT;
10881+ sg->ops->stop(sg->service_data);
10882+
10883+ reply.ms_type = SMSG_LSTOP_REP;
10884+ reply.ms_status = STATUS_POS;
10885+ reply.ms_sevent_id = uev->ue_remote_seid;
10886+ smsg_bswap_out(&reply);
10887+
10888+ error = send_nodeid_message((char *) &reply, sizeof(reply),
10889+ uev->ue_nodeid);
10890+ if (error < 0)
10891+ return error;
10892+ return 0;
10893+}
10894+
10895+/*
10896+ * Leave 2. The receive end of send_leave_start() from a node leaving the SG.
10897+ * We are re-starting the service (naturally, without the node that has left).
10898+ */
10899+
10900+static int process_leave_start(sm_group_t *sg)
10901+{
10902+ sm_uevent_t *uev = &sg->uevent;
10903+ sm_node_t *node;
10904+ uint32_t *memb;
10905+ int count = 0;
10906+
10907+ SM_ASSERT(sg->memb_count > 1,
10908+ printk("memb_count=%u\n", sg->memb_count););
10909+
10910+ /* this memory is passed to the service which must free it */
10911+ SM_RETRY(memb =
10912+ kmalloc((sg->memb_count - 1) * sizeof(uint32_t), GFP_KERNEL),
10913+ memb);
10914+
10915+ /* remove departed member from sg member list */
10916+ del_memb_node(sg, uev->ue_nodeid);
10917+
10918+ /* build member list to pass to service */
10919+ list_for_each_entry(node, &sg->memb, list)
10920+ memb[count++] = node->id;
10921+
10922+ /* allow us to accept the start_done callback for this start */
10923+ set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
10924+
10925+ sg->ops->start(sg->service_data, memb, count, uev->ue_id,
10926+ SERVICE_NODE_LEAVE);
10927+ return 0;
10928+}
10929+
10930+/*
10931+ * Move through the steps of another node joining or leaving the SG.
10932+ */
10933+
10934+static void process_one_uevent(sm_group_t *sg)
10935+{
10936+ sm_uevent_t *uev = &sg->uevent;
10937+ int error = 0;
10938+
10939+ log_debug(sg, "uevent state %u node %u", uev->ue_state, uev->ue_nodeid);
10940+
10941+ switch (uev->ue_state) {
10942+
10943+ /*
10944+ * a uevent is initialized with state JSTOP in
10945+ * process_stop_request
10946+ */
10947+
10948+ case UEST_JSTOP:
10949+ uev->ue_state = UEST_JSTART_WAITCMD;
10950+ error = process_join_stop(sg);
10951+ break;
10952+
10953+ /*
10954+ * ue_state is changed from JSTART_WAITCMD to JSTART in
10955+ * process_start_request
10956+ */
10957+
10958+ case UEST_JSTART:
10959+ uev->ue_state = UEST_JSTART_SERVICEWAIT;
10960+ error = process_join_start(sg);
10961+ break;
10962+
10963+ /*
10964+ * ue_state is changed from JSTART_SERVICEWAIT to
10965+ * JSTART_SERVICEDONE in kcl_start_done
10966+ */
10967+
10968+ case UEST_JSTART_SERVICEDONE:
10969+ uev->ue_state = UEST_BARRIER_WAIT;
10970+ error = startdone_barrier(sg);
10971+ break;
10972+
10973+ /*
10974+ * ue_state is changed from BARRIER_WAIT to BARRIER_DONE in
10975+ * process_startdone_barrier
10976+ */
10977+
10978+ case UEST_BARRIER_DONE:
10979+ error = check_startdone_barrier(sg);
10980+ if (error)
10981+ break;
10982+
10983+ do_finish(sg);
10984+ uevent_done(sg);
10985+ break;
10986+
10987+ /*
10988+ * a uevent is initialized with state LSTOP in
10989+ * process_stop_request
10990+ */
10991+
10992+ case UEST_LSTOP:
10993+ uev->ue_state = UEST_LSTART_WAITCMD;
10994+ error = process_leave_stop(sg);
10995+ break;
10996+
10997+ /*
10998+ * a uevent is changed from LSTART_WAITCMD to LSTART in
10999+ * process_start_request
11000+ */
11001+
11002+ case UEST_LSTART:
11003+ uev->ue_state = UEST_LSTART_SERVICEWAIT;
11004+ error = process_leave_start(sg);
11005+ break;
11006+
11007+ /*
11008+ * a uevent is changed from LSTART_SERVICEWAIT to
11009+ * LSTART_SERVICEDONE in kcl_start_done
11010+ */
11011+
11012+ case UEST_LSTART_SERVICEDONE:
11013+ uev->ue_state = UEST_BARRIER_WAIT;
11014+ error = startdone_barrier(sg);
11015+ break;
11016+
11017+ default:
11018+ error = -1;
11019+ }
11020+
11021+ /* If we encounter an error during these routines, we do nothing,
11022+ expecting that a node failure related to this sg will cause a
11023+ recovery event to arrive and call cancel_one_uevent(). */
11024+
11025+ if (error)
11026+ log_error(sg, "process_one_uevent error %d state %u",
11027+ error, uev->ue_state);
11028+}
11029+
11030+static sm_node_t *failed_memb(sm_group_t *sg, int *count)
11031+{
11032+ sm_node_t *node, *sm_node, *failed_uev_node = NULL;
11033+
11034+ list_for_each_entry(node, &sg->memb, list) {
11035+
11036+ sm_node = sm_find_member(node->id);
11037+ SM_ASSERT(sm_node, );
11038+
11039+ if (test_bit(SNFL_NEED_RECOVERY, &sm_node->flags)) {
11040+ (*count)++;
11041+ if (node->id == sg->uevent.ue_nodeid)
11042+ failed_uev_node = sm_node;
11043+ }
11044+ }
11045+ return failed_uev_node;
11046+}
11047+
11048+static void send_recover_msg(sm_group_t *sg)
11049+{
11050+ char *msg;
11051+ int len = 0;
11052+ msg = create_smsg(sg, SMSG_RECOVER, 0, &len, NULL);
11053+ send_members_message(sg, msg, len);
11054+}
11055+
11056+static void cancel_barrier(sm_group_t *sg)
11057+{
11058+ sm_uevent_t *uev = &sg->uevent;
11059+ char bname[MAX_BARRIER_NAME_LEN];
11060+
11061+ clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
11062+
11063+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
11064+ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
11065+ sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
11066+ sg->memb_count);
11067+ kcl_barrier_cancel(bname);
11068+}
11069+
11070+static void cancel_one_uevent(sm_group_t *sg, int *effected)
11071+{
11072+ sm_uevent_t *uev = &sg->uevent;
11073+ int failed_count;
11074+ sm_node_t *node, *failed_joiner, *failed_leaver;
11075+
11076+ log_debug(sg, "cancel uevent state %u node %u", uev->ue_state,
11077+ uev->ue_nodeid);
11078+
11079+ switch (uev->ue_state) {
11080+
11081+ case UEST_JSTOP:
11082+ case UEST_JSTART_WAITCMD:
11083+ case UEST_JSTART:
11084+
11085+ sg->ops->stop(sg->service_data);
11086+
11087+ failed_count = 0;
11088+ failed_joiner = failed_memb(sg, &failed_count);
11089+ SM_ASSERT(!failed_joiner, );
11090+
11091+ node = sm_find_member(uev->ue_nodeid);
11092+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11093+ failed_joiner = node;
11094+
11095+ if (!failed_count) {
11096+ /* only joining node failed */
11097+ SM_ASSERT(failed_joiner, );
11098+ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11099+ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11100+ (*effected)++;
11101+ /* some nodes may not have gotten a JSTOP message
11102+ in which case this will tell them to begin
11103+ recovery for this sg. */
11104+ send_recover_msg(sg);
11105+
11106+ } else {
11107+ /* a member node failed (and possibly joining node, it
11108+ doesn't matter) */
11109+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11110+ }
11111+
11112+ clear_bit(SGFL_UEVENT, &sg->flags);
11113+ memset(uev, 0, sizeof(sm_uevent_t));
11114+ break;
11115+
11116+
11117+ case UEST_JSTART_SERVICEWAIT:
11118+ case UEST_JSTART_SERVICEDONE:
11119+
11120+ clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11121+ sg->ops->stop(sg->service_data);
11122+
11123+ failed_count = 0;
11124+ failed_joiner = failed_memb(sg, &failed_count);
11125+ SM_ASSERT(failed_count, );
11126+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11127+
11128+ if (failed_count == 1 && failed_joiner) {
11129+ /* only joining node failed */
11130+
11131+ } else if (failed_count && failed_joiner) {
11132+ /* joining node and another member failed */
11133+
11134+ } else {
11135+ /* other member failed, joining node still alive */
11136+ SM_ASSERT(!failed_joiner, );
11137+ del_memb_node(sg, uev->ue_nodeid);
11138+ }
11139+
11140+ clear_bit(SGFL_UEVENT, &sg->flags);
11141+ memset(uev, 0, sizeof(sm_uevent_t));
11142+ break;
11143+
11144+
11145+ case UEST_LSTOP:
11146+ case UEST_LSTART_WAITCMD:
11147+ case UEST_LSTART:
11148+
11149+ sg->ops->stop(sg->service_data);
11150+
11151+ failed_count = 0;
11152+ failed_leaver = failed_memb(sg, &failed_count);
11153+ SM_ASSERT(failed_count, );
11154+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11155+
11156+ if (failed_count == 1 && failed_leaver) {
11157+ /* only leaving node failed */
11158+
11159+ } else if (failed_count && failed_leaver) {
11160+ /* leaving node and another member failed */
11161+
11162+ } else {
11163+ /* other member failed, leaving node still alive */
11164+ SM_ASSERT(!failed_leaver, );
11165+ }
11166+
11167+ clear_bit(SGFL_UEVENT, &sg->flags);
11168+ memset(uev, 0, sizeof(sm_uevent_t));
11169+ break;
11170+
11171+
11172+ case UEST_LSTART_SERVICEWAIT:
11173+ case UEST_LSTART_SERVICEDONE:
11174+
11175+ clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11176+ sg->ops->stop(sg->service_data);
11177+
11178+ failed_count = 0;
11179+ failed_leaver = failed_memb(sg, &failed_count);
11180+ SM_ASSERT(!failed_leaver, );
11181+
11182+ node = sm_find_member(uev->ue_nodeid);
11183+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11184+ failed_leaver = node;
11185+
11186+ if (!failed_count) {
11187+ /* only leaving node failed */
11188+ SM_ASSERT(failed_leaver, );
11189+ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11190+ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11191+ (*effected)++;
11192+
11193+ } else if (failed_count && failed_leaver) {
11194+ /* leaving node and another member failed */
11195+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11196+
11197+ } else {
11198+ /* other member failed, leaving node still alive */
11199+ SM_ASSERT(failed_count, );
11200+ SM_ASSERT(!failed_leaver, );
11201+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11202+ node = sm_new_node(sg->uevent.ue_nodeid);
11203+ add_memb_node(sg, node);
11204+ }
11205+
11206+ clear_bit(SGFL_UEVENT, &sg->flags);
11207+ memset(uev, 0, sizeof(sm_uevent_t));
11208+ break;
11209+
11210+
11211+ case UEST_BARRIER_WAIT:
11212+
11213+ if (test_bit(UEFL_LEAVE, &uev->ue_flags))
11214+ goto barrier_wait_leave;
11215+
11216+ sg->ops->stop(sg->service_data);
11217+ cancel_barrier(sg);
11218+
11219+ barrier_wait_join:
11220+
11221+ failed_count = 0;
11222+ failed_joiner = failed_memb(sg, &failed_count);
11223+ SM_ASSERT(failed_count, );
11224+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11225+
11226+ if (failed_count == 1 && failed_joiner) {
11227+ /* only joining node failed */
11228+
11229+ } else if (failed_count && failed_joiner) {
11230+ /* joining node and another member failed */
11231+
11232+ } else {
11233+ /* other member failed, joining node still alive */
11234+ SM_ASSERT(!failed_joiner, );
11235+ del_memb_node(sg, uev->ue_nodeid);
11236+ }
11237+
11238+ clear_bit(SGFL_UEVENT, &sg->flags);
11239+ memset(uev, 0, sizeof(sm_uevent_t));
11240+ break;
11241+
11242+ barrier_wait_leave:
11243+
11244+ failed_count = 0;
11245+ failed_leaver = failed_memb(sg, &failed_count);
11246+ SM_ASSERT(!failed_leaver, );
11247+
11248+ node = sm_find_member(uev->ue_nodeid);
11249+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11250+ failed_leaver = node;
11251+
11252+ if (!failed_count) {
11253+ /* only leaving node failed */
11254+ SM_ASSERT(failed_leaver, );
11255+ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11256+ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11257+ (*effected)++;
11258+
11259+ } else if (failed_count && failed_leaver) {
11260+ /* leaving node and another member failed */
11261+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11262+
11263+ } else {
11264+ /* other member failed, leaving node still alive */
11265+ SM_ASSERT(failed_count, );
11266+ SM_ASSERT(!failed_leaver, );
11267+ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11268+ node = sm_new_node(sg->uevent.ue_nodeid);
11269+ add_memb_node(sg, node);
11270+ }
11271+
11272+ clear_bit(SGFL_UEVENT, &sg->flags);
11273+ memset(uev, 0, sizeof(sm_uevent_t));
11274+ break;
11275+
11276+
11277+ case UEST_BARRIER_DONE:
11278+
11279+ if (!uev->ue_barrier_status) {
11280+ do_finish(sg);
11281+ uevent_done(sg);
11282+ break;
11283+ }
11284+
11285+ if (test_bit(UEFL_LEAVE, &uev->ue_flags))
11286+ goto barrier_wait_leave;
11287+ else
11288+ goto barrier_wait_join;
11289+
11290+
11291+ default:
11292+ log_error(sg, "cancel_one_uevent: state %d", uev->ue_state);
11293+ }
11294+}
11295+
11296+void cancel_uevents(int *effected)
11297+{
11298+ sm_group_t *sg;
11299+ sm_node_t *node, *sgnode;
11300+ int i;
11301+
11302+ list_for_each_entry(node, &sm_members, list) {
11303+ if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
11304+ continue;
11305+
11306+ /*
11307+ * Clear this dead node from the "interested in joining" list
11308+ * of any SG. The node is added to this list before the uevent
11309+ * begins.
11310+ */
11311+
11312+ for (i = 0; i < SG_LEVELS; i++) {
11313+ list_for_each_entry(sg, &sm_sg[i], list) {
11314+ sgnode = sm_find_joiner(sg, node->id);
11315+ if (sgnode) {
11316+ log_debug(sg, "clear joining node %u",
11317+ sgnode->id);
11318+ list_del(&sgnode->list);
11319+ kfree(sgnode);
11320+ }
11321+ }
11322+ }
11323+ }
11324+
11325+	/* Adjust any uevents in sgs affected by the failed node(s) */
11326+
11327+ for (i = 0; i < SG_LEVELS; i++) {
11328+ list_for_each_entry(sg, &sm_sg[i], list) {
11329+ if (!test_bit(SGFL_UEVENT, &sg->flags))
11330+ continue;
11331+
11332+ /* We may have some cancelling to do if this sg is
11333+ flagged as having a failed member, or if a joining
11334+ or leaving node has died. */
11335+
11336+ if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
11337+ cancel_one_uevent(sg, effected);
11338+ else if (sg->uevent.ue_nodeid) {
11339+ node = sm_find_member(sg->uevent.ue_nodeid);
11340+ SM_ASSERT(node, );
11341+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11342+ cancel_one_uevent(sg, effected);
11343+ }
11344+ }
11345+ }
11346+}
11347+
11348+void process_membership(void)
11349+{
11350+ sm_group_t *sg;
11351+ int i;
11352+
11353+ down(&sm_sglock);
11354+
11355+ for (i = 0; i < SG_LEVELS; i++) {
11356+ list_for_each_entry(sg, &sm_sg[i], list) {
11357+ if (!test_bit(SGFL_UEVENT, &sg->flags))
11358+ continue;
11359+
11360+ if (!test_and_clear_bit(UEFL_CHECK,
11361+ &sg->uevent.ue_flags))
11362+ continue;
11363+
11364+ process_one_uevent(sg);
11365+ }
11366+ }
11367+ up(&sm_sglock);
11368+}
11369diff -urN linux-orig/cluster/cman/sm_membership.h linux-patched/cluster/cman/sm_membership.h
11370--- linux-orig/cluster/cman/sm_membership.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11371+++ linux-patched/cluster/cman/sm_membership.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 11372@@ -0,0 +1,20 @@
11373+/******************************************************************************
11374+*******************************************************************************
11375+**
11376+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11377+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11378+**
11379+** This copyrighted material is made available to anyone wishing to use,
11380+** modify, copy, or redistribute it subject to the terms and conditions
11381+** of the GNU General Public License v.2.
11382+**
11383+*******************************************************************************
11384+******************************************************************************/
11385+
11386+#ifndef __SM_MEMBERSHIP_DOT_H__
11387+#define __SM_MEMBERSHIP_DOT_H__
11388+
11389+void process_membership(void);
11390+void cancel_uevents(int *effected);
11391+
11392+#endif
11393diff -urN linux-orig/cluster/cman/sm_message.c linux-patched/cluster/cman/sm_message.c
11394--- linux-orig/cluster/cman/sm_message.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 11395+++ linux-patched/cluster/cman/sm_message.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 11396@@ -0,0 +1,867 @@
11397+/******************************************************************************
11398+*******************************************************************************
11399+**
11400+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11401+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11402+**
11403+** This copyrighted material is made available to anyone wishing to use,
11404+** modify, copy, or redistribute it subject to the terms and conditions
11405+** of the GNU General Public License v.2.
11406+**
11407+*******************************************************************************
11408+******************************************************************************/
11409+
11410+#include "sm.h"
11411+
11412+#define SMSG_BUF_SIZE (sizeof(sm_msg_t) + MAX_SERVICE_NAME_LEN + 1)
11413+
11414+extern struct socket * sm_socket;
11415+extern uint32_t sm_our_nodeid;
11416+static uint32_t global_last_id;
11417+static struct list_head messages;
11418+static spinlock_t message_lock;
11419+static char smsg_buf[SMSG_BUF_SIZE];
11420+
11421+int send_nodeid_message(char *msg, int len, uint32_t nodeid);
11422+
11423+struct rq_entry {
11424+ struct list_head list;
11425+ char *msg;
11426+ int len;
11427+ uint32_t nodeid;
11428+};
11429+typedef struct rq_entry rq_entry_t;
11430+
11431+void init_messages(void)
11432+{
11433+ global_last_id = 1;
11434+ INIT_LIST_HEAD(&messages);
11435+ spin_lock_init(&message_lock);
11436+}
11437+
11438+uint32_t sm_new_global_id(int level)
11439+{
11440+ uint32_t id = global_last_id++;
11441+ uint8_t l = (uint8_t) level;
11442+
11443+ if (level > 255)
11444+ return 0;
11445+
11446+ if (id > 0x00FFFFFF)
11447+ return 0;
11448+
11449+ id |= (l << 24);
11450+ return id;
11451+}
11452+
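/*
 * Illustrative sketch (editorial, userspace): sm_new_global_id() packs
 * an 8-bit service level into the top byte of a 24-bit counter, and
 * sm_id_to_level() in sm_misc.c unpacks it.  A minimal model of the
 * round trip:
 */
#include <assert.h>
#include <stdint.h>

static uint32_t pack_id(uint32_t counter, int level)
{
	if (level > 0xFF || counter > 0x00FFFFFF)
		return 0;			/* out of range, as above */
	return counter | ((uint32_t) level << 24);
}

static int unpack_level(uint32_t id)
{
	return (id & 0xFF000000) >> 24;		/* mirrors sm_id_to_level() */
}

int main(void)
{
	uint32_t id = pack_id(42, 3);
	assert(unpack_level(id) == 3 && (id & 0x00FFFFFF) == 42);
	return 0;
}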
11453+static void smsg_copy_in(char *msg, sm_msg_t *smsg)
11454+{
11455+ sm_msg_t *in = (sm_msg_t *) msg;
11456+
11457+ smsg->ms_type = in->ms_type;
11458+ smsg->ms_status = in->ms_status;
11459+ smsg->ms_sevent_id = le16_to_cpu(in->ms_sevent_id);
11460+ smsg->ms_global_sgid = le32_to_cpu(in->ms_global_sgid);
11461+ smsg->ms_global_lastid = le32_to_cpu(in->ms_global_lastid);
11462+ smsg->ms_sglevel = le16_to_cpu(in->ms_sglevel);
11463+ smsg->ms_length = le16_to_cpu(in->ms_length);
11464+}
11465+
11466+/* swapping bytes in place is an easy source of errors - be careful not to
11467+ * access the fields after calling this */
11468+
11469+void smsg_bswap_out(sm_msg_t *smsg)
11470+{
11471+ smsg->ms_sevent_id = cpu_to_le16(smsg->ms_sevent_id);
11472+ smsg->ms_global_sgid = cpu_to_le32(smsg->ms_global_sgid);
11473+ smsg->ms_global_lastid = cpu_to_le32(smsg->ms_global_lastid);
11474+ smsg->ms_sglevel = cpu_to_le16(smsg->ms_sglevel);
11475+ smsg->ms_length = cpu_to_le16(smsg->ms_length);
11476+}
11477+
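/*
 * Illustrative sketch (editorial, userspace): the wire format above is
 * little-endian -- cpu_to_le16/32 on the way out, le16/32_to_cpu on the
 * way in.  A portable analogue for one 32-bit field, assuming the
 * kernel byte-order helpers are unavailable:
 */
#include <stdint.h>

static void put_le32(uint8_t *p, uint32_t v)
{
	p[0] = v;
	p[1] = v >> 8;
	p[2] = v >> 16;
	p[3] = v >> 24;
}

static uint32_t get_le32(const uint8_t *p)
{
	return (uint32_t) p[0] | ((uint32_t) p[1] << 8) |
	       ((uint32_t) p[2] << 16) | ((uint32_t) p[3] << 24);
}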
11478+char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
11479+ sm_sevent_t *sev)
11480+{
11481+ char *msg;
11482+ sm_msg_t *smsg;
11483+ int fulllen = sizeof(sm_msg_t) + datalen;
11484+
11485+ msg = smsg_buf;
11486+ memset(smsg_buf, 0, SMSG_BUF_SIZE);
11487+ SM_ASSERT(fulllen <= SMSG_BUF_SIZE,);
11488+
11489+ smsg = (sm_msg_t *) msg;
11490+ smsg->ms_type = type;
11491+ smsg->ms_global_sgid = sg->global_id;
11492+ smsg->ms_sglevel = sg->level;
11493+ smsg->ms_length = datalen;
11494+ smsg->ms_sevent_id = sev ? sev->se_id : 0;
11495+
11496+ smsg_bswap_out(smsg);
11497+ *msglen = fulllen;
11498+ return msg;
11499+}
11500+
11501+static unsigned int msgtype_to_flag(int type)
11502+{
11503+ unsigned int flag;
11504+
11505+ switch (type) {
11506+ case SMSG_JOIN_REP:
11507+ case SMSG_JOIN_REQ:
11508+ flag = SEFL_ALLOW_JOIN;
11509+ break;
11510+
11511+ case SMSG_JSTOP_REP:
11512+ case SMSG_JSTOP_REQ:
11513+ flag = SEFL_ALLOW_JSTOP;
11514+ break;
11515+
11516+ case SMSG_LEAVE_REP:
11517+ case SMSG_LEAVE_REQ:
11518+ flag = SEFL_ALLOW_LEAVE;
11519+ break;
11520+
11521+ case SMSG_LSTOP_REP:
11522+ case SMSG_LSTOP_REQ:
11523+ flag = SEFL_ALLOW_LSTOP;
11524+ break;
11525+
11526+ default:
11527+ SM_ASSERT(0, printk("msgtype_to_flag bad type %d\n", type););
11528+ }
11529+ return flag;
11530+}
11531+
11532+static int test_allowed_msgtype(sm_sevent_t * sev, int type)
11533+{
11534+ unsigned int flag = msgtype_to_flag(type);
11535+
11536+ return test_bit(flag, &sev->se_flags);
11537+}
11538+
11539+static void clear_allowed_msgtype(sm_sevent_t * sev, int type)
11540+{
11541+ unsigned int flag = msgtype_to_flag(type);
11542+
11543+ clear_bit(flag, &sev->se_flags);
11544+}
11545+
11546+static void set_allowed_msgtype(sm_sevent_t * sev, int type)
11547+{
11548+ unsigned int flag = msgtype_to_flag(type);
11549+
11550+ set_bit(flag, &sev->se_flags);
11551+}
11552+
11553+static int save_global_id(sm_sevent_t * sev, sm_msg_t * smsg)
11554+{
11555+ sm_group_t *sg = sev->se_sg;
11556+
11557+ if (!smsg->ms_global_sgid) {
11558+ log_error(sg, "save_global_id: zero sg id");
11559+ return -1;
11560+ }
11561+
11562+ if (!sg->global_id)
11563+ sg->global_id = smsg->ms_global_sgid;
11564+
11565+ if (sg->global_id != smsg->ms_global_sgid) {
11566+ log_error(sg, "save_global_id: id %x", smsg->ms_global_sgid);
11567+ return -1;
11568+ }
11569+ return 0;
11570+}
11571+
11572+static void save_lastid(sm_msg_t * smsg)
11573+{
11574+ uint32_t gid = smsg->ms_global_lastid & 0x00FFFFFF;
11575+
11576+ /*
11577+ * Keep track of the highest SG id which has been used
11578+ * in the cluster in case we need to choose a new SG id.
11579+ */
11580+
11581+ if (gid > global_last_id)
11582+ global_last_id = gid;
11583+}
11584+
11585+static int next_sev_state(int msg_type, int cur_state)
11586+{
11587+ int next = 0;
11588+
11589+ switch (msg_type) {
11590+ case SMSG_JOIN_REP:
11591+ SM_ASSERT(cur_state == SEST_JOIN_ACKWAIT,);
11592+ next = SEST_JOIN_ACKED;
11593+ break;
11594+
11595+ case SMSG_JSTOP_REP:
11596+ SM_ASSERT(cur_state == SEST_JSTOP_ACKWAIT,);
11597+ next = SEST_JSTOP_ACKED;
11598+ break;
11599+
11600+ case SMSG_LEAVE_REP:
11601+ SM_ASSERT(cur_state == SEST_LEAVE_ACKWAIT,);
11602+ next = SEST_LEAVE_ACKED;
11603+ break;
11604+
11605+ case SMSG_LSTOP_REP:
11606+ SM_ASSERT(cur_state == SEST_LSTOP_ACKWAIT,);
11607+ next = SEST_LSTOP_ACKED;
11608+ break;
11609+ }
11610+ return next;
11611+}
11612+
11613+/*
11614+ * Functions in sevent.c send messages to other nodes and then expect replies.
11615+ * This function collects the replies for the sevent messages and moves the
11616+ * sevent to the next stage when all the expected replies have been received.
11617+ */
11618+
11619+static void process_reply(sm_msg_t * smsg, uint32_t nodeid)
11620+{
11621+ sm_sevent_t *sev;
11622+ int i, expected, type = smsg->ms_type;
11623+
11624+ /*
11625+ * Find the relevant sevent.
11626+ */
11627+
11628+ sev = find_sevent(smsg->ms_sevent_id);
11629+ if (!sev) {
11630+ log_print("process_reply invalid id=%u nodeid=%u",
11631+ smsg->ms_sevent_id, nodeid);
11632+ goto out;
11633+ }
11634+
11635+ /*
11636+ * Check if this message type is what this sevent is waiting for.
11637+ */
11638+
11639+ if (!test_allowed_msgtype(sev, type)) {
11640+ log_debug(sev->se_sg, "process_reply ignored type=%u nodeid=%u id=%u", type, nodeid, sev->se_id);
11641+ goto out;
11642+ }
11643+
11644+ expected =
11645+ (type == SMSG_JOIN_REP) ? sev->se_node_count : sev->se_memb_count;
11646+
11647+ SM_ASSERT(expected * sizeof(uint32_t) <= sev->se_len_ids,
11648+ printk("type=%d expected=%d len_ids=%d node_count=%d "
11649+ "memb_count=%d\n", type, expected, sev->se_len_ids,
11650+ sev->se_node_count, sev->se_memb_count););
11651+
11652+ SM_ASSERT(expected * sizeof(char) <= sev->se_len_status,
11653+ printk("type=%d expected=%d len_status=%d node_count=%d "
11654+ "memb_count=%d\n", type, expected, sev->se_len_status,
11655+ sev->se_node_count, sev->se_memb_count););
11656+
11657+ for (i = 0; i < expected; i++) {
11658+ if (sev->se_node_ids[i] == nodeid) {
11659+ /*
11660+ * Save the status from the replying node
11661+ */
11662+
11663+ if (!sev->se_node_status[i])
11664+ sev->se_node_status[i] = smsg->ms_status;
11665+ else {
11666+ log_error(sev->se_sg, "process_reply duplicate"
11667+ "id=%u nodeid=%u %u/%u",
11668+ sev->se_id, nodeid,
11669+ sev->se_node_status[i],
11670+ smsg->ms_status);
11671+ goto out;
11672+ }
11673+
11674+ if (type == SMSG_JOIN_REP) {
11675+ save_lastid(smsg);
11676+
11677+ if (smsg->ms_status == STATUS_POS)
11678+ save_global_id(sev, smsg);
11679+ }
11680+
11681+ /*
11682+ * Signal sm if we have all replies
11683+ */
11684+
11685+ if (++sev->se_reply_count == expected) {
11686+ clear_allowed_msgtype(sev, type);
11687+ sev->se_state = next_sev_state(type,
11688+ sev->se_state);
11689+ set_bit(SEFL_CHECK, &sev->se_flags);
11690+ wake_serviced(DO_JOINLEAVE);
11691+ }
11692+
11693+ break;
11694+ }
11695+ }
11696+
11697+ out:
11698+ return;
11699+}
11700+
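/*
 * Illustrative sketch (editorial, hypothetical types): process_reply()
 * above keeps one status slot per expected responder, fills each slot
 * at most once, and advances the sevent only when the last outstanding
 * reply arrives.  The invariant in isolation:
 */
#include <stdbool.h>
#include <stdint.h>

struct reply_set {
	int expected, received;
	uint32_t ids[8];	/* nodes the request was sent to */
	uint8_t status[8];	/* 0 = no reply recorded yet */
};

/* returns true when the final expected reply has been collected */
static bool collect_reply(struct reply_set *rs, uint32_t nodeid, uint8_t st)
{
	int i;

	for (i = 0; i < rs->expected; i++) {
		if (rs->ids[i] != nodeid)
			continue;
		if (rs->status[i])
			return false;	/* duplicate reply: ignored */
		rs->status[i] = st;
		return ++rs->received == rs->expected;
	}
	return false;			/* reply from an unexpected node */
}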
11701+/*
11702+ * A node wants to join an SG and has run send_join_notice. If we know nothing
11703+ * about the SG, then we have no objection - send back STATUS_POS. If we're a
11704+ * member of the SG, then send back STATUS_POS (go ahead and join) if there's
11705+ * no sevent or uevent of higher priority in progress (only a single join or
11706+ * leave is permitted for the SG at once). If there happens to be a higher
11707+ * priority sevent/uevent in progress, send back STATUS_WAIT to defer the
11708+ * requested join for a bit.
11709+ */
11710+
11711+static void process_join_request(sm_msg_t *smsg, uint32_t nodeid, char *name)
11712+{
11713+ sm_group_t *sg = NULL;
11714+ sm_sevent_t *sev = NULL;
11715+ sm_node_t *node;
11716+ int found = FALSE;
11717+ int level = smsg->ms_sglevel;
11718+ sm_msg_t reply;
11719+
11720+ memset(&reply, 0, sizeof(reply));
11721+
11722+ down(&sm_sglock);
11723+
11724+ if (nodeid == sm_our_nodeid)
11725+ goto next;
11726+
11727+ /*
11728+ * search SG list for an SG with given name/len
11729+ */
11730+
11731+ list_for_each_entry(sg, &sm_sg[level], list) {
11732+ if ((sg->namelen != smsg->ms_length) ||
11733+ memcmp(sg->name, name, sg->namelen))
11734+ continue;
11735+ found = TRUE;
11736+ break;
11737+ }
11738+
11739+ /*
11740+ * build reply message
11741+ */
11742+
11743+ next:
11744+
11745+ if (!found) {
11746+ reply.ms_type = SMSG_JOIN_REP;
11747+ reply.ms_status = STATUS_NEG;
11748+ reply.ms_global_lastid = global_last_id;
11749+ reply.ms_sevent_id = smsg->ms_sevent_id;
11750+ } else {
11751+ reply.ms_type = SMSG_JOIN_REP;
11752+ reply.ms_status = STATUS_POS;
11753+ reply.ms_sevent_id = smsg->ms_sevent_id;
11754+ reply.ms_global_sgid = sg->global_id;
11755+ reply.ms_global_lastid = global_last_id;
11756+
11757+ /*
11758+ * The node trying to join should wait and try again until
11759+ * we're done with recovery.
11760+ */
11761+
11762+ if (sg->state == SGST_RECOVER) {
11763+ reply.ms_status = STATUS_WAIT;
11764+ goto send;
11765+ }
11766+
11767+ /*
11768+ * An sevent node trying to join may have gotten as far as
11769+ * creating a uevent with us and then backed out. That node
11770+ * will retry joining from the beginning so we should not turn
11771+ * them away. If we're handling a uevent for another node,
11772+ * tell the joining node to wait.
11773+ */
11774+
11775+ if (test_bit(SGFL_UEVENT, &sg->flags)) {
11776+ if (sg->uevent.ue_nodeid != nodeid)
11777+ reply.ms_status = STATUS_WAIT;
11778+ goto send;
11779+ }
11780+
11781+ /*
11782+ * We're trying to join or leave the SG at the moment.
11783+ */
11784+
11785+ if (test_bit(SGFL_SEVENT, &sg->flags)) {
11786+ sev = sg->sevent;
11787+
11788+ /*
11789+ * We're trying to leave. Make the join wait until
11790+ * we've left if we're beyond LEAVE_ACKWAIT.
11791+ */
11792+
11793+ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
11794+ if (sev->se_state > SEST_LEAVE_ACKED)
11795+ reply.ms_status = STATUS_WAIT;
11796+ else {
11797+ reply.ms_status = STATUS_POS;
11798+ clear_bit(SEFL_ALLOW_LEAVE,
11799+ &sev->se_flags);
11800+ set_bit(SEFL_CANCEL, &sev->se_flags);
11801+ }
11802+ }
11803+
11804+ /*
11805+ * We're trying to join. Make the other join wait
11806+ * until we're joined if we're beyond JOIN_ACKWAIT or
11807+ * if we have a lower id. (Send NEG to allow the other
11808+ * node to go ahead because we're not in the SG.)
11809+ */
11810+
11811+ else {
11812+ if (sev->se_state > SEST_JOIN_ACKED)
11813+ reply.ms_status = STATUS_WAIT;
11814+ else if (sm_our_nodeid < nodeid)
11815+ reply.ms_status = STATUS_WAIT;
11816+ else {
11817+ reply.ms_status = STATUS_NEG;
11818+ clear_bit(SEFL_ALLOW_JOIN,
11819+ &sev->se_flags);
11820+ set_bit(SEFL_CANCEL, &sev->se_flags);
11821+ }
11822+ }
11823+
11824+ if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
11825+ set_bit(SEFL_CHECK, &sev->se_flags);
11826+ wake_serviced(DO_JOINLEAVE);
11827+ }
11828+ goto send;
11829+ }
11830+
11831+ /* no r,u,s event, stick with STATUS_POS */
11832+ }
11833+
11834+ send:
11835+
11836+ if (reply.ms_status == STATUS_POS) {
11837+ node = sm_find_joiner(sg, nodeid);
11838+ if (!node) {
11839+ node = sm_new_node(nodeid);
11840+ list_add_tail(&node->list, &sg->joining);
11841+ }
11842+ }
11843+
11844+ up(&sm_sglock);
11845+ smsg_bswap_out(&reply);
11846+ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
11847+}
11848+
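/*
 * Illustrative sketch (editorial, hypothetical names): the arbitration
 * in process_join_request() reduced to a pure decision function.  The
 * real code also cancels its own sevent when it yields, and replies
 * STATUS_NEG with global_last_id when the SG is unknown.
 */
#include <stdbool.h>
#include <stdint.h>

enum { ST_POS, ST_NEG, ST_WAIT };

static int join_verdict(bool sg_known, bool in_recovery,
			bool uevent_for_other, bool own_sevent,
			bool we_are_leaving, bool past_ackwait,
			uint32_t our_id, uint32_t joiner_id)
{
	if (!sg_known)
		return ST_NEG;		/* no objection: we're not involved */
	if (in_recovery || uevent_for_other)
		return ST_WAIT;		/* joiner retries later */
	if (!own_sevent)
		return ST_POS;		/* nothing in progress: go ahead */
	if (we_are_leaving)		/* our leave yields unless it is
					   already past the ACKWAIT stage */
		return past_ackwait ? ST_WAIT : ST_POS;
	if (past_ackwait || our_id < joiner_id)
		return ST_WAIT;		/* our own join goes first */
	return ST_NEG;			/* cancel our join and yield */
}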
11849+/*
11850+ * Another node wants us to stop a service so it can join or leave the SG. We
11851+ * do this by saving the request info in a uevent and having the sm thread do
11852+ * the processing and then replying.
11853+ */
11854+
11855+static void process_stop_request(sm_msg_t * smsg, uint32_t nodeid,
11856+ uint32_t * msgbuf)
11857+{
11858+ sm_group_t *sg;
11859+ sm_uevent_t *uev;
11860+ sm_msg_t reply = { 0 };	/* zero the fields not set on the agree path */
11861+ int type = smsg->ms_type;
11862+
11863+ if (nodeid == sm_our_nodeid)
11864+ goto agree;
11865+
11866+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
11867+ if (!sg) {
11868+ log_print("process_stop_request: unknown sg id %x",
11869+ smsg->ms_global_sgid);
11870+ return;
11871+ }
11872+
11873+ /*
11874+ * We shouldn't get here with uevent already set.
11875+ */
11876+
11877+ if (test_and_set_bit(SGFL_UEVENT, &sg->flags)) {
11878+ log_error(sg, "process_stop_request: uevent already set");
11879+ return;
11880+ }
11881+
11882+ uev = &sg->uevent;
11883+ uev->ue_nodeid = nodeid;
11884+ uev->ue_remote_seid = smsg->ms_sevent_id;
11885+ uev->ue_state = (type == SMSG_JSTOP_REQ) ? UEST_JSTOP : UEST_LSTOP;
11886+
11887+ if (type == SMSG_JSTOP_REQ)
11888+ uev->ue_num_nodes = be32_to_cpu(*msgbuf);
11889+ else
11890+ set_bit(UEFL_LEAVE, &uev->ue_flags);
11891+
11892+ /*
11893+ * Do process_join_stop() or process_leave_stop().
11894+ */
11895+
11896+ set_bit(UEFL_CHECK, &uev->ue_flags);
11897+ wake_serviced(DO_MEMBERSHIP);
11898+ return;
11899+
11900+ agree:
11901+ reply.ms_status = STATUS_POS;
11902+ reply.ms_type =
11903+ (type == SMSG_JSTOP_REQ) ? SMSG_JSTOP_REP : SMSG_LSTOP_REP;
11904+ reply.ms_sevent_id = smsg->ms_sevent_id;
11905+ smsg_bswap_out(&reply);
11906+ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
11907+}
11908+
11909+static void process_start_request(sm_msg_t * smsg, uint32_t nodeid)
11910+{
11911+ sm_group_t *sg;
11912+ sm_uevent_t *uev;
11913+ int type = smsg->ms_type;
11914+
11915+ if (nodeid == sm_our_nodeid)
11916+ return;
11917+
11918+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
11919+ if (!sg) {
11920+ log_print("process_start_request: unknown sg id %x",
11921+ smsg->ms_global_sgid);
11922+ return;
11923+ }
11924+
11925+ if (!test_bit(SGFL_UEVENT, &sg->flags)) {
11926+ log_error(sg, "process_start_request: no uevent");
11927+ return;
11928+ }
11929+
11930+ uev = &sg->uevent;
11931+
11932+ if (type == SMSG_JSTART_CMD)
11933+ uev->ue_state = UEST_JSTART;
11934+ else
11935+ uev->ue_state = UEST_LSTART;
11936+
11937+ set_bit(UEFL_CHECK, &uev->ue_flags);
11938+ wake_serviced(DO_MEMBERSHIP);
11939+}
11940+
11941+static void process_leave_request(sm_msg_t * smsg, uint32_t nodeid)
11942+{
11943+ sm_group_t *sg;
11944+ sm_node_t *node;
11945+ sm_msg_t reply = { 0 };	/* zero the fields left unset below */
11946+ sm_sevent_t *sev;
11947+ int found = FALSE;
11948+
11949+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
11950+ if (sg) {
11951+ if (nodeid == sm_our_nodeid)
11952+ found = TRUE;
11953+ else {
11954+ list_for_each_entry(node, &sg->memb, list) {
11955+ if (node->id != nodeid)
11956+ continue;
11957+ set_bit(SNFL_LEAVING, &node->flags);
11958+ found = TRUE;
11959+ break;
11960+ }
11961+ }
11962+ }
11963+
11964+ if (!found) {
11965+ reply.ms_type = SMSG_LEAVE_REP;
11966+ reply.ms_status = STATUS_NEG;
11967+ reply.ms_sevent_id = smsg->ms_sevent_id;
11968+ } else {
11969+ reply.ms_type = SMSG_LEAVE_REP;
11970+ reply.ms_status = STATUS_POS;
11971+ reply.ms_sevent_id = smsg->ms_sevent_id;
11972+
11973+ if (sg->state == SGST_RECOVER)
11974+ reply.ms_status = STATUS_WAIT;
11975+
11976+ else if (test_bit(SGFL_SEVENT, &sg->flags) &&
11977+ nodeid != sm_our_nodeid) {
11978+ sev = sg->sevent;
11979+
11980+ /*
11981+ * We're trying to join or leave at the moment. If
11982+ * we're past JOIN/LEAVE_ACKWAIT, we make the requestor
11983+ * wait. Otherwise, if joining we'll cancel to let the
11984+ * leave happen first, or if we're leaving allow the
11985+ * lower nodeid to leave first.
11986+ */
11987+
11988+ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
11989+ if (sev->se_state > SEST_LEAVE_ACKWAIT)
11990+ reply.ms_status = STATUS_WAIT;
11991+ else if (sm_our_nodeid < nodeid)
11992+ reply.ms_status = STATUS_WAIT;
11993+ else {
11994+ reply.ms_status = STATUS_POS;
11995+ clear_bit(SEFL_ALLOW_LEAVE,
11996+ &sev->se_flags);
11997+ set_bit(SEFL_CANCEL, &sev->se_flags);
11998+ }
11999+ } else {
12000+ if (sev->se_state > SEST_JOIN_ACKWAIT)
12001+ reply.ms_status = STATUS_WAIT;
12002+ else {
12003+ reply.ms_status = STATUS_NEG;
12004+ clear_bit(SEFL_ALLOW_JOIN,
12005+ &sev->se_flags);
12006+ set_bit(SEFL_CANCEL, &sev->se_flags);
12007+ }
12008+ }
12009+
12010+ if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
12011+ set_bit(SEFL_CHECK, &sev->se_flags);
12012+ wake_serviced(DO_JOINLEAVE);
12013+ }
12014+ }
12015+
12016+ else if (test_bit(SGFL_UEVENT, &sg->flags)) {
12017+ if (sg->uevent.ue_nodeid != nodeid)
12018+ reply.ms_status = STATUS_WAIT;
12019+ }
12020+
12021+ }
12022+
12023+ smsg_bswap_out(&reply);
12024+ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12025+}
12026+
12027+/*
12028+ * Each remaining node will send us a done message. We quit when we get the
12029+ * first. The subsequent done messages for the finished sevent get here and
12030+ * are ignored.
12031+ */
12032+
12033+static void process_lstart_done(sm_msg_t *smsg, uint32_t nodeid)
12034+{
12035+ sm_sevent_t *sev;
12036+
12037+ sev = find_sevent(smsg->ms_sevent_id);
12038+ if (!sev)
12039+ return;
12040+
12041+ if (sev->se_state != SEST_LSTART_WAITREMOTE)
12042+ return;
12043+
12044+ sev->se_state = SEST_LSTART_REMOTEDONE;
12045+ set_bit(SEFL_CHECK, &sev->se_flags);
12046+ wake_serviced(DO_JOINLEAVE);
12047+}
12048+
12049+/*
12050+ * This function and everything it calls always runs in sm context.
12051+ */
12052+
12053+static void process_message(char *msg, uint32_t nodeid)
12054+{
12055+ sm_msg_t smsg;
12056+
12057+ smsg_copy_in(msg, &smsg);
12058+
12059+ switch (smsg.ms_type) {
12060+ case SMSG_JOIN_REQ:
12061+ process_join_request(&smsg, nodeid, msg + sizeof(sm_msg_t));
12062+ break;
12063+
12064+ case SMSG_JSTOP_REQ:
12065+ process_stop_request(&smsg, nodeid,
12066+ (uint32_t *) (msg + sizeof(sm_msg_t)));
12067+ break;
12068+
12069+ case SMSG_LEAVE_REQ:
12070+ process_leave_request(&smsg, nodeid);
12071+ break;
12072+
12073+ case SMSG_LSTOP_REQ:
12074+ process_stop_request(&smsg, nodeid, NULL);
12075+ break;
12076+
12077+ case SMSG_JSTART_CMD:
12078+ case SMSG_LSTART_CMD:
12079+ process_start_request(&smsg, nodeid);
12080+ break;
12081+
12082+ case SMSG_LSTART_DONE:
12083+ process_lstart_done(&smsg, nodeid);
12084+ break;
12085+
12086+ case SMSG_JOIN_REP:
12087+ case SMSG_JSTOP_REP:
12088+ case SMSG_LEAVE_REP:
12089+ case SMSG_LSTOP_REP:
12090+ process_reply(&smsg, nodeid);
12091+ break;
12092+
12093+ case SMSG_RECOVER:
12094+ process_recover_msg(&smsg, nodeid);
12095+ break;
12096+
12097+ default:
12098+ log_print("process_message: unknown type %u nodeid %u",
12099+ smsg.ms_type, nodeid);
12100+ }
12101+}
12102+
12103+/*
12104+ * Always called from sm context.
12105+ */
12106+
12107+void process_messages(void)
12108+{
12109+ rq_entry_t *re;
12110+
12111+ while (1) {
12112+ re = NULL;
12113+
12114+ spin_lock(&message_lock);
12115+ if (!list_empty(&messages)) {
12116+ re = list_entry(messages.next, rq_entry_t, list);
12117+ list_del(&re->list);
12118+ }
12119+ spin_unlock(&message_lock);
12120+
12121+ if (!re)
12122+ break;
12123+ process_message(re->msg, re->nodeid);
12124+ kfree(re->msg);
12125+ kfree(re);
12126+ schedule();
12127+ }
12128+}
12129+
12130+/*
12131+ * Context: cnxman and sm
12132+ */
12133+
12134+static int add_to_recvqueue(char *msg, int len, uint32_t nodeid)
12135+{
12136+ rq_entry_t *re;
12137+
12138+ SM_RETRY(re = (rq_entry_t *) kmalloc(sizeof(rq_entry_t), GFP_KERNEL),
12139+ re);
12140+ SM_RETRY(re->msg = (char *) kmalloc(len, GFP_KERNEL), re->msg);
12141+
12142+ memcpy(re->msg, msg, len);
12143+ re->len = len;
12144+ re->nodeid = nodeid;
12145+
12146+ spin_lock(&message_lock);
12147+ list_add_tail(&re->list, &messages);
12148+ spin_unlock(&message_lock);
12149+
12150+ wake_serviced(DO_MESSAGES);
12151+ return 0;
12152+}
12153+
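/*
 * Illustrative sketch (editorial, userspace): the receive path above is
 * a locked producer/consumer queue -- cnxman context enqueues under the
 * spinlock and wakes the sm thread, which drains one entry at a time.
 * A pthread analogue with error handling omitted:
 */
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct mq_entry {
	struct mq_entry *next;
	char *msg;
	int len;
	uint32_t nodeid;
};

static struct mq_entry *mq_head, **mq_tail = &mq_head;
static pthread_mutex_t mq_lock = PTHREAD_MUTEX_INITIALIZER;

static void mq_enqueue(const char *msg, int len, uint32_t nodeid)
{
	struct mq_entry *re = malloc(sizeof(*re));

	re->msg = malloc(len);
	memcpy(re->msg, msg, len);
	re->len = len;
	re->nodeid = nodeid;
	re->next = NULL;

	pthread_mutex_lock(&mq_lock);
	*mq_tail = re;
	mq_tail = &re->next;
	pthread_mutex_unlock(&mq_lock);
	/* the real code now calls wake_serviced(DO_MESSAGES) */
}

static struct mq_entry *mq_dequeue(void)
{
	struct mq_entry *re;

	pthread_mutex_lock(&mq_lock);
	re = mq_head;
	if (re && !(mq_head = re->next))
		mq_tail = &mq_head;
	pthread_mutex_unlock(&mq_lock);
	return re;			/* caller frees msg and entry */
}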
12154+/*
12155+ * Context: cnxman
12156+ * Called by cnxman when a service manager message arrives.
12157+ */
12158+
12159+int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12160+ unsigned int node_id)
12161+{
12162+ struct kcl_cluster_node kclnode;
12163+ uint32_t nodeid = 0;
12164+ int error = 0;
12165+
12166+ if (!node_id) {
12167+ error = kcl_get_node_by_addr(addr, addr_len, &kclnode);
12168+ if (error)
12169+ return error;
12170+ nodeid = kclnode.node_id;
12171+ } else
12172+ nodeid = node_id;
12173+
12174+ return add_to_recvqueue(msg, len, nodeid);
12175+}
12176+
12177+/*
12178+ * These send routines are used by sm and are always called from sm context.
12179+ */
12180+
12181+int send_nodeid_message(char *msg, int len, uint32_t nodeid)
12182+{
12183+ int error = 0;
12184+ struct sockaddr_cl saddr;
12185+
12186+ if (nodeid == sm_our_nodeid) {
12187+ add_to_recvqueue(msg, len, nodeid);
12188+ goto out;
12189+ }
12190+
12191+ saddr.scl_family = AF_CLUSTER;
12192+ saddr.scl_port = CLUSTER_PORT_SERVICES;
12193+ saddr.scl_nodeid = nodeid;
12194+ error = kcl_sendmsg(sm_socket, msg, len, &saddr,
12195+ sizeof(saddr), 0);
12196+ if (error > 0)
12197+ error = 0;
12198+
12199+ if (error)
12200+ log_print("send_nodeid_message error %d to %u", error, nodeid);
12201+ out:
12202+ return error;
12203+}
12204+
12205+int send_broadcast_message(char *msg, int len)
12206+{
12207+ int error;
12208+
12209+ error = kcl_sendmsg(sm_socket, msg, len, NULL, 0, 0);
12210+ if (error > 0)
12211+ error = 0;
12212+
12213+ add_to_recvqueue(msg, len, sm_our_nodeid);
12214+
12215+ if (error)
12216+ log_print("send_broadcast_message error %d", error);
12217+
12218+ return error;
12219+}
12220+
12221+int send_members_message(sm_group_t *sg, char *msg, int len)
12222+{
12223+ sm_node_t *node;
12224+ int error = 0;
12225+
12226+ list_for_each_entry(node, &sg->memb, list) {
12227+ error = send_nodeid_message(msg, len, node->id);
12228+ if (error < 0)
12229+ break;
12230+ }
12231+ return error;
12232+}
12233+
12234+int send_members_message_sev(sm_group_t *sg, char *msg, int len,
12235+ sm_sevent_t * sev)
12236+{
12237+ int error;
12238+ sm_msg_t *smsg = (sm_msg_t *) msg;
12239+
12240+ set_allowed_msgtype(sev, smsg->ms_type);
12241+ sev->se_reply_count = 0;
12242+
12243+ error = send_members_message(sg, msg, len);
12244+ if (error < 0)
12245+ clear_allowed_msgtype(sev, smsg->ms_type);
12246+
12247+ return error;
12248+}
12249+
12250+int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev)
12251+{
12252+ int error;
12253+ sm_msg_t *smsg = (sm_msg_t *) msg;
12254+
12255+ set_allowed_msgtype(sev, smsg->ms_type);
12256+ sev->se_reply_count = 0;
12257+
12258+ error = send_broadcast_message(msg, len);
12259+ if (error < 0)
12260+ clear_allowed_msgtype(sev, smsg->ms_type);
12261+
12262+ return error;
12263+}
12264diff -urN linux-orig/cluster/cman/sm_message.h linux-patched/cluster/cman/sm_message.h
12265--- linux-orig/cluster/cman/sm_message.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12266+++ linux-patched/cluster/cman/sm_message.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 12267@@ -0,0 +1,32 @@
12268+/******************************************************************************
12269+*******************************************************************************
12270+**
12271+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12272+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12273+**
12274+** This copyrighted material is made available to anyone wishing to use,
12275+** modify, copy, or redistribute it subject to the terms and conditions
12276+** of the GNU General Public License v.2.
12277+**
12278+*******************************************************************************
12279+******************************************************************************/
12280+
12281+#ifndef __SM_MESSAGE_DOT_H__
12282+#define __SM_MESSAGE_DOT_H__
12283+
12284+void init_messages(void);
12285+uint32_t sm_new_global_id(int level);
12286+void smsg_bswap_out(sm_msg_t * smsg);
12287+char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
12288+ sm_sevent_t *sev);
12289+void process_messages(void);
12290+int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12291+ unsigned int node_id);
12292+int send_nodeid_message(char *msg, int len, uint32_t nodeid);
12293+int send_broadcast_message(char *msg, int len);
12294+int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev);
12295+int send_members_message(sm_group_t *sg, char *msg, int len);
12296+int send_members_message_sev(sm_group_t *sg, char *msg, int len,
12297+ sm_sevent_t * sev);
12300+
12301+#endif
12302diff -urN linux-orig/cluster/cman/sm_misc.c linux-patched/cluster/cman/sm_misc.c
12303--- linux-orig/cluster/cman/sm_misc.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12304+++ linux-patched/cluster/cman/sm_misc.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 12305@@ -0,0 +1,369 @@
12306+/******************************************************************************
12307+*******************************************************************************
12308+**
12309+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12310+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12311+**
12312+** This copyrighted material is made available to anyone wishing to use,
12313+** modify, copy, or redistribute it subject to the terms and conditions
12314+** of the GNU General Public License v.2.
12315+**
12316+*******************************************************************************
12317+******************************************************************************/
12318+
12319+#include "sm.h"
12320+#include "config.h"
12321+
12322+#define MAX_DEBUG_MSG_LEN (40)
12323+
12324+extern struct list_head sm_members;
12325+static uint32_t local_ids;
12326+static uint32_t event_id;
12327+static spinlock_t event_id_lock;
12328+static char * debug_buf;
12329+static unsigned int debug_size;
12330+static unsigned int debug_point;
12331+static int debug_wrap;
12332+static spinlock_t debug_lock;
12333+
12334+
12335+void init_sm_misc(void)
12336+{
12337+ local_ids = 1;
12338+ event_id = 1;
12339+ spin_lock_init(&event_id_lock);
12340+ debug_buf = NULL;
12341+ debug_size = 0;
12342+ debug_point = 0;
12343+ debug_wrap = 0;
12344+ spin_lock_init(&debug_lock);
12345+
12346+ sm_debug_setup(cman_config.sm_debug_size);
12347+}
12348+
12349+sm_node_t *sm_new_node(uint32_t nodeid)
12350+{
12351+ struct kcl_cluster_node kclnode;
12352+ sm_node_t *node;
12353+ int error;
12354+
12355+ error = kcl_get_node_by_nodeid(nodeid, &kclnode);
12356+ SM_ASSERT(!error,);
12357+
12358+ SM_RETRY(node = (sm_node_t *) kmalloc(sizeof(sm_node_t), GFP_KERNEL),
12359+ node);
12360+
12361+ memset(node, 0, sizeof(sm_node_t));
12362+ node->id = nodeid;
12363+ node->incarnation = kclnode.incarnation;
12364+ return node;
12365+}
12366+
12367+sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid)
12368+{
12369+ sm_node_t *node;
12370+
12371+ list_for_each_entry(node, &sg->joining, list) {
12372+ if (node->id == nodeid)
12373+ return node;
12374+ }
12375+ return NULL;
12376+}
12377+
12378+sm_node_t *sm_find_member(uint32_t nodeid)
12379+{
12380+ sm_node_t *node;
12381+
12382+ list_for_each_entry(node, &sm_members, list) {
12383+ if (node->id == nodeid)
12384+ return node;
12385+ }
12386+ return NULL;
12387+}
12388+
12389+uint32_t sm_new_local_id(int level)
12390+{
12391+ uint32_t id = local_ids++;
12392+ uint8_t l = (uint8_t) level;
12393+
12394+ if (level > 0xFF)
12395+ return 0;
12396+
12397+ if (id > 0x00FFFFFF)
12398+ return 0;
12399+
12400+ id |= (l << 24);
12401+ return id;
12402+}
12403+
12404+int sm_id_to_level(uint32_t id)
12405+{
12406+ uint8_t l = (id & 0xFF000000) >> 24;
12407+
12408+ return (int) l;
12409+}
12410+
12411+void sm_set_event_id(int *id)
12412+{
12413+ spin_lock(&event_id_lock);
12414+ *id = event_id++;
12415+ spin_unlock(&event_id_lock);
12416+}
12417+
12418+sm_group_t *sm_local_id_to_sg(int id)
12419+{
12420+ sm_group_t *sg;
12421+ int level = sm_id_to_level(id);
12422+ int found = FALSE;
12423+
12424+ down(&sm_sglock);
12425+
12426+ list_for_each_entry(sg, &sm_sg[level], list) {
12427+ if (sg->local_id == id) {
12428+ found = TRUE;
12429+ break;
12430+ }
12431+ }
12432+ up(&sm_sglock);
12433+ if (!found)
12434+ sg = NULL;
12435+ return sg;
12436+}
12437+
12438+sm_group_t *sm_global_id_to_sg(int id)
12439+{
12440+ sm_group_t *sg;
12441+ int level = sm_id_to_level(id);
12442+ int found = FALSE;
12443+
12444+ down(&sm_sglock);
12445+
12446+ list_for_each_entry(sg, &sm_sg[level], list) {
12447+ if (sg->global_id == id) {
12448+ found = TRUE;
12449+ break;
12450+ }
12451+ }
12452+ up(&sm_sglock);
12453+ if (!found)
12454+ sg = NULL;
12455+ return sg;
12456+}
12457+
12458+void sm_debug_log(sm_group_t *sg, const char *fmt, ...)
12459+{
12460+ va_list va;
12461+ int i, n, size, len;
12462+ char buf[MAX_DEBUG_MSG_LEN+1];
12463+
12464+ spin_lock(&debug_lock);
12465+
12466+ if (!debug_buf)
12467+ goto out;
12468+
12469+ size = MAX_DEBUG_MSG_LEN;
12470+ memset(buf, 0, size+1);
12471+
12472+ n = snprintf(buf, size, "%08x ", sg->global_id);
12473+ size -= n;
12474+
12475+ va_start(va, fmt);
12476+ vsnprintf(buf+n, size, fmt, va);
12477+ va_end(va);
12478+
12479+ len = strlen(buf);
12480+ if (len > MAX_DEBUG_MSG_LEN-1)
12481+ len = MAX_DEBUG_MSG_LEN-1;
12482+ buf[len] = '\n';
12483+ buf[len+1] = '\0';
12484+
12485+ for (i = 0; i < strlen(buf); i++) {
12486+ debug_buf[debug_point++] = buf[i];
12487+
12488+ if (debug_point == debug_size) {
12489+ debug_point = 0;
12490+ debug_wrap = 1;
12491+ }
12492+ }
12493+ out:
12494+ spin_unlock(&debug_lock);
12495+}
12496+
12497+void sm_debug_setup(int size)
12498+{
12499+ char *b = kmalloc(size, GFP_KERNEL);
12500+ if (!b) return;	/* keep the old buffer on allocation failure */
12501+ spin_lock(&debug_lock);
12502+ if (debug_buf)
12503+ kfree(debug_buf);
12504+
12505+ if (size > PAGE_SIZE)
12506+ size = PAGE_SIZE;
12507+ debug_size = size;
12508+ debug_point = 0;
12509+ debug_wrap = 0;
12510+ debug_buf = b;
12511+ memset(debug_buf, 0, debug_size);
12512+ spin_unlock(&debug_lock);
12513+}
12514+
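/*
 * Illustrative sketch (editorial, userspace): sm_debug_log() above
 * writes characters into a wrap-around ring, and sm_debug_info() dumps
 * the oldest part (after the write point) first once the buffer has
 * wrapped.  The same mechanism in isolation:
 */
#include <stdio.h>

static char ring[64];
static unsigned int point;
static int wrapped;

static void ring_put(const char *s)
{
	for (; *s; s++) {
		ring[point++] = *s;
		if (point == sizeof(ring)) {
			point = 0;
			wrapped = 1;
		}
	}
}

static void ring_dump(void)
{
	if (wrapped)			/* oldest data sits after 'point' */
		fwrite(ring + point, 1, sizeof(ring) - point, stdout);
	fwrite(ring, 1, point, stdout);
}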
12515+#ifdef CONFIG_PROC_FS
12516+
12517+int sm_debug_info(char *b, char **start, off_t offset, int length)
12518+{
12519+ int i, n = 0;
12520+
12521+ spin_lock(&debug_lock);
12522+
12523+ if (debug_wrap) {
12524+ for (i = debug_point; i < debug_size; i++)
12525+ n += sprintf(b + n, "%c", debug_buf[i]);
12526+ }
12527+ for (i = 0; i < debug_point; i++)
12528+ n += sprintf(b + n, "%c", debug_buf[i]);
12529+
12530+ spin_unlock(&debug_lock);
12531+
12532+ return n;
12533+}
12534+
12535+int sm_procdata(char *b, char **start, off_t offset, int length)
12536+{
12537+ sm_group_t *sg;
12538+ sm_node_t *node;
12539+ int n = 0, level, i;
12540+
12541+ n += sprintf(b + n, "\n");
12542+
12543+ /*
12544+ * Header
12545+ */
12546+
12547+ n += sprintf(b + n,
12548+ "Service Name GID LID State Code\n");
12549+
12550+ down(&sm_sglock);
12551+
12552+ for (level = 0; level < SG_LEVELS; level++) {
12553+ list_for_each_entry(sg, &sm_sg[level], list) {
12554+
12555+ /*
12556+ * Cluster Service
12557+ */
12558+
12559+ switch (level) {
12560+ case SERVICE_LEVEL_FENCE:
12561+ n += sprintf(b + n, "Fence Domain: ");
12562+ break;
12563+ case SERVICE_LEVEL_GDLM:
12564+ n += sprintf(b + n, "DLM Lock Space: ");
12565+ break;
12566+ case SERVICE_LEVEL_GFS:
12567+ n += sprintf(b + n, "GFS Mount Group: ");
12568+ break;
12569+ case SERVICE_LEVEL_USER:
12570+ n += sprintf(b + n, "User: ");
12571+ break;
12572+ }
12573+
12574+ /*
12575+ * Name
12576+ */
12577+
12578+ n += sprintf(b + n, "\"");
12579+ for (i = 0; i < sg->namelen; i++)
12580+ n += sprintf(b + n, "%c", sg->name[i]);
12581+ n += sprintf(b + n, "\"");
12582+
12583+ for (; i < MAX_SERVICE_NAME_LEN-1; i++)
12584+ n += sprintf(b + n, " ");
12585+
12586+ /*
12587+ * GID LID (sans level from top byte)
12588+ */
12589+
12590+ n += sprintf(b + n, "%3u %3u ",
12591+ (sg->global_id & 0x00FFFFFF),
12592+ (sg->local_id & 0x00FFFFFF));
12593+
12594+ /*
12595+ * State
12596+ */
12597+
12598+ switch (sg->state) {
12599+ case SGST_NONE:
12600+ n += sprintf(b + n, "none ");
12601+ break;
12602+ case SGST_JOIN:
12603+ n += sprintf(b + n, "join ");
12604+ break;
12605+ case SGST_RUN:
12606+ n += sprintf(b + n, "run ");
12607+ break;
12608+ case SGST_RECOVER:
12609+ n += sprintf(b + n, "recover %u ",
12610+ sg->recover_state);
12611+ break;
12612+ case SGST_UEVENT:
12613+ n += sprintf(b + n, "update ");
12614+ break;
12615+ }
12616+
12617+ /*
12618+ * Code
12619+ */
12620+
12621+ if (test_bit(SGFL_SEVENT, &sg->flags))
12622+ n += sprintf(b + n, "S");
12623+ if (test_bit(SGFL_UEVENT, &sg->flags))
12624+ n += sprintf(b + n, "U");
12625+ if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
12626+ n += sprintf(b + n, "N");
12627+
12628+ n += sprintf(b + n, "-");
12629+
12630+ if (test_bit(SGFL_SEVENT, &sg->flags)
12631+ && sg->sevent) {
12632+ n += sprintf(b + n, "%u,%lx,%u",
12633+ sg->sevent->se_state,
12634+ sg->sevent->se_flags,
12635+ sg->sevent->se_reply_count);
12636+ }
12637+
12638+ if (test_bit(SGFL_UEVENT, &sg->flags)) {
12639+ n += sprintf(b + n, "%u,%lx,%u",
12640+ sg->uevent.ue_state,
12641+ sg->uevent.ue_flags,
12642+ sg->uevent.ue_nodeid);
12643+ }
12644+
12645+ n += sprintf(b + n, "\n");
12646+
12647+ /*
12648+ * node list
12649+ */
12650+
12651+ i = 0;
12652+
12653+ n += sprintf(b + n, "[");
12654+
12655+ list_for_each_entry(node, &sg->memb, list) {
12656+ if (i && !(i % 24))
12657+ n += sprintf(b + n, "\n");
12658+
12659+ if (i)
12660+ n += sprintf(b + n, " ");
12661+
12662+ n += sprintf(b + n, "%u", node->id);
12663+ i++;
12664+ }
12665+
12666+ n += sprintf(b + n, "]\n\n");
12667+ }
12668+ }
12669+
12670+ up(&sm_sglock);
12671+
12672+ return n;
12673+}
12674+#endif
12675diff -urN linux-orig/cluster/cman/sm_misc.h linux-patched/cluster/cman/sm_misc.h
12676--- linux-orig/cluster/cman/sm_misc.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12677+++ linux-patched/cluster/cman/sm_misc.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 12678@@ -0,0 +1,29 @@
12679+/******************************************************************************
12680+*******************************************************************************
12681+**
12682+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12683+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12684+**
12685+** This copyrighted material is made available to anyone wishing to use,
12686+** modify, copy, or redistribute it subject to the terms and conditions
12687+** of the GNU General Public License v.2.
12688+**
12689+*******************************************************************************
12690+******************************************************************************/
12691+
12692+#ifndef __SM_MISC_DOT_H__
12693+#define __SM_MISC_DOT_H__
12694+
12695+void init_sm_misc(void);
12696+sm_node_t *sm_new_node(uint32_t nodeid);
12697+sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid);
12698+sm_node_t *sm_find_member(uint32_t nodeid);
12699+uint32_t sm_new_local_id(int level);
12700+int sm_id_to_level(uint32_t id);
12701+void sm_set_event_id(int *id);
12702+sm_group_t *sm_local_id_to_sg(int id);
12703+sm_group_t *sm_global_id_to_sg(int id);
12704+void sm_debug_log(sm_group_t *sg, const char *fmt, ...);
12705+void sm_debug_setup(int size);
12706+
12707+#endif
12708diff -urN linux-orig/cluster/cman/sm_recover.c linux-patched/cluster/cman/sm_recover.c
12709--- linux-orig/cluster/cman/sm_recover.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 12710+++ linux-patched/cluster/cman/sm_recover.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 12711@@ -0,0 +1,522 @@
12712+/******************************************************************************
12713+*******************************************************************************
12714+**
12715+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12716+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12717+**
12718+** This copyrighted material is made available to anyone wishing to use,
12719+** modify, copy, or redistribute it subject to the terms and conditions
12720+** of the GNU General Public License v.2.
12721+**
12722+*******************************************************************************
12723+******************************************************************************/
12724+
12725+#include "sm.h"
12726+#include "config.h"
12727+
12728+/*
12729+ * A collection of sg's which need to be recovered due to a failed member.
12730+ * These sg's are recovered in order of level. An sg subject to cascading
12731+ * failures is moved from one of these structs to a newer one.
12732+ */
12733+
12734+struct recover {
12735+ struct list_head list; /* list of current re's */
12736+ struct list_head sgs[SG_LEVELS]; /* lists of sg's by level */
12737+ int event_id; /* event id */
12738+ int cur_level;
12739+};
12740+typedef struct recover recover_t;
12741+
12742+
12743+extern uint32_t * sm_new_nodeids;
12744+extern int sm_quorum, sm_quorum_next;
12745+extern uint32_t sm_our_nodeid;
12746+extern struct list_head sm_members;
12747+extern int sm_member_count;
12748+static struct list_head recoveries;
12749+
12750+
12751+void init_recovery(void)
12752+{
12753+ INIT_LIST_HEAD(&recoveries);
12754+}
12755+
12756+/*
12757+ * This is the first thing called when a change is announced in cluster
12758+ * membership. Nodes are marked as being a CLUSTER_MEMBER or not. SM adds
12759+ * nodes it has not seen before to its sm_members list. Nodes which were
12760+ * alive but are now gone are marked as "need recovery".
12761+ *
12762+ * The "need recovery" status of nodes is propagated to the node's SG's in
12763+ * mark_effected_sgs. The effected SG's are themselves marked as needing
12764+ * recovery and in new_recovery the dead nodes are removed from the SG's
12765+ * individual member lists. The "need recovery" status of nodes is cleared in
12766+ * adjust_members_done().
12767+ */
12768+
12769+static int adjust_members(void)
12770+{
12771+ sm_node_t *node;
12772+ struct kcl_cluster_node knode;
12773+ int i, error, num_nodes, sub = 0, add = 0, found;
12774+
12775+ /*
12776+ * Get list of current members from cnxman
12777+ */
12778+
12779+ memset(sm_new_nodeids, 0, cman_config.max_nodes * sizeof(uint32_t));
12780+ num_nodes = kcl_get_member_ids(sm_new_nodeids, cman_config.max_nodes);
12781+
12782+ /*
12783+ * Determine who's gone
12784+ */
12785+
12786+ list_for_each_entry(node, &sm_members, list) {
12787+ found = FALSE;
12788+ for (i = 0; i < num_nodes; i++) {
12789+ if (node->id == sm_new_nodeids[i]) {
12790+ found = TRUE;
12791+ sm_new_nodeids[i] = 0;
12792+ break;
12793+ }
12794+ }
12795+
12796+ if (found) {
12797+ error = kcl_get_node_by_nodeid(node->id, &knode);
12798+ SM_ASSERT(!error, printk("error=%d\n", error););
12799+
12800+ if (!test_bit(SNFL_CLUSTER_MEMBER, &node->flags)) {
12801+ /* former member is back */
12802+ set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
12803+ node->incarnation = knode.incarnation;
12804+ add++;
12805+ } else {
12806+ /* current member is still alive - if the
12807+ * incarnation number is different it died and
12808+ * returned between checks */
12809+ if (node->incarnation != knode.incarnation) {
12810+ set_bit(SNFL_NEED_RECOVERY,
12811+ &node->flags);
12812+ node->incarnation = knode.incarnation;
12813+ sub++;
12814+ }
12815+ }
12816+ } else {
12817+ /* current member has died */
12818+ if (test_and_clear_bit(SNFL_CLUSTER_MEMBER,
12819+ &node->flags)) {
12820+ set_bit(SNFL_NEED_RECOVERY, &node->flags);
12821+ sub++;
12822+ }
12823+ }
12824+ }
12825+
12826+ /*
12827+ * Look for new nodes
12828+ */
12829+
12830+ for (i = 0; i < num_nodes; i++) {
12831+ if (sm_new_nodeids[i]) {
12832+ node = sm_new_node(sm_new_nodeids[i]);
12833+ set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
12834+ add++;
12835+ list_add_tail(&node->list, &sm_members);
12836+ sm_member_count++;
12837+ }
12838+ }
12839+
12840+ /*
12841+ * Get our own nodeid
12842+ */
12843+
12844+ if (!sm_our_nodeid) {
12845+ list_for_each_entry(node, &sm_members, list) {
12846+ error = kcl_get_node_by_nodeid(node->id, &knode);
12847+ SM_ASSERT(!error, printk("error=%d\n", error););
12848+
12849+ if (knode.us) {
12850+ sm_our_nodeid = knode.node_id;
12851+ break;
12852+ }
12853+ }
12854+ }
12855+
12856+ return sub;
12857+}
12858+
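/*
 * Illustrative sketch (editorial, hypothetical types): the subtle case
 * in adjust_members() is a node that died and rejoined between two
 * membership checks -- it is still listed as a member, but its
 * incarnation number changed, so it must be treated as failed.
 */
#include <stdbool.h>
#include <stdint.h>

struct memb_state {
	bool member;		/* SNFL_CLUSTER_MEMBER in the real code */
	uint32_t incarnation;
};

/* returns true if the node needs recovery */
static bool check_node(struct memb_state *n, bool listed, uint32_t inc)
{
	bool was_member = n->member;

	if (!listed) {			/* gone from cnxman's member list */
		n->member = false;
		return was_member;
	}
	if (was_member && n->incarnation != inc) {
		n->incarnation = inc;
		return true;		/* died and returned between checks */
	}
	n->member = true;		/* new member, or a former one back */
	n->incarnation = inc;
	return false;
}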
12859+/*
12860+ * Given some number of dead nodes, flag SG's the dead nodes were part of.
12861+ * This requires a number of loops because each node structure does not keep a
12862+ * list of SG's it's in.
12863+ */
12864+
12865+static int mark_effected_sgs(void)
12866+{
12867+ sm_group_t *sg;
12868+ sm_node_t *node, *sgnode;
12869+ uint32_t dead_id;
12870+ int i, effected = 0;
12871+
12872+ down(&sm_sglock);
12873+
12874+ list_for_each_entry(node, &sm_members, list) {
12875+ if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
12876+ continue;
12877+
12878+ dead_id = node->id;
12879+
12880+ for (i = 0; i < SG_LEVELS; i++) {
12881+ list_for_each_entry(sg, &sm_sg[i], list) {
12882+ /* check if dead node is among sg's members */
12883+ list_for_each_entry(sgnode, &sg->memb, list) {
12884+ if (sgnode->id == dead_id) {
12885+ set_bit(SGFL_NEED_RECOVERY,
12886+ &sg->flags);
12887+ effected++;
12888+ break;
12889+ }
12890+ }
12891+ }
12892+ }
12893+ }
12894+ up(&sm_sglock);
12895+
12896+ return effected;
12897+}
12898+
12899+static recover_t *alloc_recover(void)
12900+{
12901+ recover_t *rev;
12902+ int i;
12903+
12904+ SM_RETRY(rev = kmalloc(sizeof(recover_t), GFP_KERNEL), rev);
12905+
12906+ memset(rev, 0, sizeof(recover_t));
12907+
12908+ sm_set_event_id(&rev->event_id);
12909+
12910+ for (i = 0; i < SG_LEVELS; i++) {
12911+ INIT_LIST_HEAD(&rev->sgs[i]);
12912+ }
12913+
12914+ return rev;
12915+}
12916+
12917+/*
12918+ * An in-progress revent re-start for an SG is interrupted by another node
12919+ * failure in the SG. Cancel an outstanding barrier if there is one. The SG
12920+ * will be moved to the new revent and re-started as part of that.
12921+ */
12922+
12923+static void cancel_prev_recovery(sm_group_t *sg)
12924+{
12925+ int error;
12926+
12927+ if (sg->recover_state == RECOVER_BARRIERWAIT) {
12928+ error = kcl_barrier_cancel(sg->recover_barrier);
12929+ if (error)
12930+ log_error(sg, "cancel_prev_recovery: error %d", error);
12931+ }
12932+}
12933+
12934+static void pre_recover_sg(sm_group_t *sg, recover_t *rev)
12935+{
12936+ if (sg->state == SGST_RECOVER) {
12937+ cancel_prev_recovery(sg);
12938+ list_del(&sg->recover_list);
12939+ }
12940+
12941+ sg->ops->stop(sg->service_data);
12942+ sg->state = SGST_RECOVER;
12943+ sg->recover_state = RECOVER_NONE;
12944+ sg->recover_data = rev;
12945+ list_add(&sg->recover_list, &rev->sgs[sg->level]);
12946+}
12947+
12948+/*
12949+ * When adjust_members finds that some nodes are dead and mark_effected_sgs
12950+ * finds that some SG's are effected by departed nodes, this is called to
12951+ * collect together the SG's which need to be recovered. An revent (recovery
12952+ * event) is the group of effected SG's.
12953+ */
12954+
12955+static int new_recovery(void)
12956+{
12957+ sm_group_t *sg;
12958+ recover_t *rev;
12959+ sm_node_t *node, *sgnode, *safe;
12960+ int i;
12961+
12962+ rev = alloc_recover();
12963+ list_add_tail(&rev->list, &recoveries);
12964+
12965+ down(&sm_sglock);
12966+
12967+ /*
12968+ * Stop effected SG's and add them to the rev
12969+ */
12970+
12971+ for (i = 0; i < SG_LEVELS; i++) {
12972+ list_for_each_entry(sg, &sm_sg[i], list) {
12973+ if (test_and_clear_bit(SGFL_NEED_RECOVERY, &sg->flags)){
12974+ if (sg->state == SGST_JOIN)
12975+ continue;
12976+ pre_recover_sg(sg, rev);
12977+ }
12978+ }
12979+ }
12980+
12981+ /*
12982+ * For an SG needing recovery, remove dead nodes from sg->memb list
12983+ */
12984+
12985+ for (i = 0; i < SG_LEVELS; i++) {
12986+ list_for_each_entry(sg, &rev->sgs[i], recover_list) {
12987+
12988+ /* Remove dead members from SG's member list */
12989+ list_for_each_entry_safe(sgnode, safe, &sg->memb, list){
12990+
12991+ node = sm_find_member(sgnode->id);
12992+ SM_ASSERT(node, printk("id %u\n", sgnode->id););
12993+
12994+ if (test_bit(SNFL_NEED_RECOVERY, &node->flags)){
12995+ list_del(&sgnode->list);
12996+ kfree(sgnode);
12997+ sg->memb_count--;
12998+ log_debug(sg, "remove node %u count %d",
12999+ sgnode->id, sg->memb_count);
13000+ }
13001+ }
13002+ }
13003+ }
13004+
13005+ up(&sm_sglock);
13006+ rev->cur_level = 0;
13007+ return 0;
13008+}
13009+
13010+/*
13011+ * The NEED_RECOVERY bit on MML nodes is set in adjust_members() and is used in
13012+ * mark_effected_sgs() and new_recovery(). After that, we're done using the bit
13013+ * and we clear it here.
13014+ */
13015+
13016+static void adjust_members_done(void)
13017+{
13018+ sm_node_t *node;
13019+
13020+ list_for_each_entry(node, &sm_members, list)
13021+ clear_bit(SNFL_NEED_RECOVERY, &node->flags);
13022+}
13023+
13024+/*
13025+ * Start the service of the given SG. The service must be given an array of
13026+ * nodeids specifying the new sg membership. The service is responsible for
13027+ * freeing this chunk of memory when done with it.
13028+ */
13029+
13030+static void start_sg(sm_group_t *sg, uint32_t event_id)
13031+{
13032+ sm_node_t *node;
13033+ uint32_t *memb;
13034+ int count = 0;
13035+
13036+ SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
13037+ memb);
13038+
13039+ list_for_each_entry(node, &sg->memb, list)
13040+ memb[count++] = node->id;
13041+
13042+ sg->ops->start(sg->service_data, memb, count, event_id,
13043+ SERVICE_NODE_FAILED);
13044+}
13045+
13046+static void recovery_barrier(sm_group_t *sg)
13047+{
13048+ char bname[MAX_BARRIER_NAME_LEN];
13049+ int error, len;
13050+
13051+ memset(bname, 0, MAX_BARRIER_NAME_LEN);
13052+
13053+ /* bypass the barrier if we're the only member */
13054+ if (sg->memb_count == 1) {
13055+ process_recovery_barrier(sg, 0);
13056+ return;
13057+ }
13058+
13059+ len = snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.RECOV.%u",
13060+ sg->global_id, sg->recover_stop, sg->memb_count);
13061+
13062+ /* We save this barrier name so we can cancel it if needed. */
13063+ memset(sg->recover_barrier, 0, MAX_BARRIER_NAME_LEN);
13064+ memcpy(sg->recover_barrier, bname, len);
13065+
13066+ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_RECOVERY);
13067+ if (error)
13068+ log_error(sg, "recovery_barrier error %d: %s", error, bname);
13069+}
13070+
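/*
 * Illustrative sketch (editorial, hypothetical buffer length): every
 * member derives the same barrier name from shared state in
 * recovery_barrier() above, so the nodes rendezvous without first
 * exchanging a name.
 */
#include <stdint.h>
#include <stdio.h>

#define NAME_LEN 33		/* stand-in for MAX_BARRIER_NAME_LEN */

static int recov_barrier_name(char *buf, uint32_t global_id,
			      unsigned int stop, int memb_count)
{
	return snprintf(buf, NAME_LEN, "sm.%u.%u.RECOV.%u",
			global_id, stop, memb_count);
}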
13071+static void recover_sg(sm_group_t *sg, int event_id)
13072+{
13073+ log_debug(sg, "recover state %d", sg->recover_state);
13074+
13075+ switch (sg->recover_state) {
13076+
13077+ case RECOVER_NONE:
13078+ /* must wait for recovery to stop sg on all nodes */
13079+ sg->recover_state = RECOVER_BARRIERWAIT;
13080+ sg->recover_stop = 0;
13081+ recovery_barrier(sg);
13082+ break;
13083+
13084+ case RECOVER_BARRIERWAIT:
13085+ break;
13086+
13087+ case RECOVER_STOP:
13088+ /* barrier callback sets state STOP */
13089+ sg->recover_stop = 1;
13090+ sg->recover_state = RECOVER_START;
13091+ start_sg(sg, event_id);
13092+ break;
13093+
13094+ case RECOVER_START:
13095+ break;
13096+
13097+ case RECOVER_STARTDONE:
13098+ /* service callback sets state STARTDONE */
13099+ sg->recover_state = RECOVER_BARRIERWAIT;
13100+ recovery_barrier(sg);
13101+ break;
13102+
13103+ case RECOVER_BARRIERDONE:
13104+ /* barrier callback sets state BARRIERDONE */
13105+ sg->ops->finish(sg->service_data, event_id);
13106+ list_del(&sg->recover_list);
13107+ sg->recover_state = RECOVER_NONE;
13108+ sg->state = SGST_RUN;
13109+
13110+ /* Continue a previous, interrupted attempt to leave the sg */
13111+ if (sg->sevent) {
13112+ clear_bit(SEFL_DELAY, &sg->sevent->se_flags);
13113+ set_bit(SEFL_CHECK, &sg->sevent->se_flags);
13114+ wake_serviced(DO_JOINLEAVE);
13115+ }
13116+ break;
13117+
13118+ default:
13119+ log_error(sg, "invalid recover_state %u", sg->recover_state);
13120+ }
13121+}
13122+
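/*
 * Illustrative sketch (editorial): recover_sg() above is a small state
 * machine; the barrier and service callbacks set STOP, STARTDONE and
 * BARRIERDONE, while this reduced transition function covers the steps
 * recover_sg() takes itself.
 */
enum rstate { R_NONE, R_BARRIERWAIT, R_STOP, R_START,
	      R_STARTDONE, R_BARRIERDONE };

static enum rstate recover_next(enum rstate s)
{
	switch (s) {
	case R_NONE:		/* stopped on all nodes: first barrier */
	case R_STARTDONE:	/* restart finished: second barrier */
		return R_BARRIERWAIT;
	case R_STOP:		/* first barrier done: restart service */
		return R_START;
	case R_BARRIERDONE:	/* second barrier done: finish, run */
		return R_NONE;
	default:		/* BARRIERWAIT/START: wait for callback */
		return s;
	}
}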
13123+static void recover_level(recover_t *rev, int level)
13124+{
13125+ sm_group_t *sg, *safe;
13126+
13127+ list_for_each_entry_safe(sg, safe, &rev->sgs[level], recover_list)
13128+ recover_sg(sg, rev->event_id);
13129+}
13130+
13131+static void recover_levels(recover_t *rev)
13132+{
13133+ for (;;) {
13134+ recover_level(rev, rev->cur_level);
13135+
13136+ if (list_empty(&rev->sgs[rev->cur_level])) {
13137+ if (rev->cur_level == SG_LEVELS - 1) {
13138+ list_del(&rev->list);
13139+ kfree(rev);
13140+ return;
13141+ }
13142+ rev->cur_level++;
13143+ continue;
13144+ }
13145+ break;
13146+ }
13147+}
13148+
13149+/*
13150+ * Called by the SM thread when the cluster is quorate. It restarts
13151+ * SG's that were stopped in new_recovery() due to a member death.
13152+ * It waits for all SG's at level N to complete restart before
13153+ * restarting SG's at level N+1.
13154+ */
13155+
13156+void process_recoveries(void)
13157+{
13158+ recover_t *rev, *safe;
13159+
13160+ down(&sm_sglock);
13161+ list_for_each_entry_safe(rev, safe, &recoveries, list)
13162+ recover_levels(rev);
13163+ up(&sm_sglock);
13164+}
13165+
13166+/*
13167+ * The cnxman membership has changed. Check if there's still quorum and
13168+ * whether any nodes have died. If nodes have died, initiate recovery on any
13169+ * SG's they were in. This begins immediately if the cluster remains quorate;
13170+ * if not, this waits until the cluster regains quorum.
13171+ */
13172+
13173+void process_nodechange(void)
13174+{
13175+ int gone, effected;
13176+
13177+ if ((sm_quorum = sm_quorum_next))
13178+ wake_serviced(DO_RUN);
13179+
13180+ gone = adjust_members();
13181+ if (gone > 0) {
13182+ effected = mark_effected_sgs();
13183+
13184+ backout_sevents();
13185+ cancel_uevents(&effected);
13186+
13187+ if (effected > 0) {
13188+ new_recovery();
13189+ wake_serviced(DO_RECOVERIES);
13190+ }
13191+ }
13192+ adjust_members_done();
13193+}
13194+
13195+int check_recovery(sm_group_t *sg, int event_id)
13196+{
13197+ if (sg->state == SGST_RECOVER) {
13198+ recover_t *rev = (recover_t *) sg->recover_data;
13199+ if (rev && rev->event_id == event_id)
13200+ return 1;
13201+ }
13202+ return 0;
13203+}
13204+
13205+void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid)
13206+{
13207+ sm_group_t *sg;
13208+ recover_t *rev;
13209+
13210+ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
13211+ if (!sg) {
13212+ log_print("process_recover_msg: unknown sg id %x",
13213+ smsg->ms_global_sgid);
13214+ return;
13215+ }
13216+
13217+ /* we already know about the recovery and can ignore the msg */
13218+ if (sg->state == SGST_RECOVER)
13219+ return;
13220+
13221+ if (test_bit(SGFL_UEVENT, &sg->flags)) {
13222+ /* we will initiate recovery on our own if we know about the
13223+ uevent so we can ignore this */
13224+ log_debug(sg, "process_recover_msg: ignore from %u", nodeid);
13225+ return;
13226+ }
13227+
13228+ log_debug(sg, "recovery initiated by msg from %u", nodeid);
13229+ rev = alloc_recover();
13230+ list_add_tail(&rev->list, &recoveries);
13231+ pre_recover_sg(sg, rev);
13232+ wake_serviced(DO_RECOVERIES);
13233+}
13234diff -urN linux-orig/cluster/cman/sm_recover.h linux-patched/cluster/cman/sm_recover.h
13235--- linux-orig/cluster/cman/sm_recover.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13236+++ linux-patched/cluster/cman/sm_recover.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 13237@@ -0,0 +1,23 @@
13238+/******************************************************************************
13239+*******************************************************************************
13240+**
13241+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13242+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13243+**
13244+** This copyrighted material is made available to anyone wishing to use,
13245+** modify, copy, or redistribute it subject to the terms and conditions
13246+** of the GNU General Public License v.2.
13247+**
13248+*******************************************************************************
13249+******************************************************************************/
13250+
13251+#ifndef __SM_RECOVER_DOT_H__
13252+#define __SM_RECOVER_DOT_H__
13253+
13254+void init_recovery(void);
13255+void process_recoveries(void);
13256+void process_nodechange(void);
13257+int check_recovery(sm_group_t *sg, int event_id);
13258+void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid);
13259+
13260+#endif
13261diff -urN linux-orig/cluster/cman/sm_services.c linux-patched/cluster/cman/sm_services.c
13262--- linux-orig/cluster/cman/sm_services.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13263+++ linux-patched/cluster/cman/sm_services.c 2004-06-29 20:07:51.000000000 +0800
4bf12011 13264@@ -0,0 +1,418 @@
13265+/******************************************************************************
13266+*******************************************************************************
13267+**
13268+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13269+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13270+**
13271+** This copyrighted material is made available to anyone wishing to use,
13272+** modify, copy, or redistribute it subject to the terms and conditions
13273+** of the GNU General Public License v.2.
13274+**
13275+*******************************************************************************
13276+******************************************************************************/
13277+
13278+#include "sm.h"
13279+
13280+static struct list_head callbacks;
13281+static spinlock_t callback_lock;
13282+static struct list_head sg_registered[SG_LEVELS];
13283+
13284+/*
13285+ * These are the functions services use to register, join, leave and
13286+ * unregister with the SM, and the callback path back from the SM.
13287+ */
13288+
13289+struct sc_entry {
13290+ struct list_head list;
13291+ uint32_t local_id;
13292+ int event_id;
13293+};
13294+typedef struct sc_entry sc_entry_t;
13295+
13296+void init_services(void)
13297+{
13298+ int i;
13299+
13300+ INIT_LIST_HEAD(&callbacks);
13301+ spin_lock_init(&callback_lock);
13302+
13303+ for (i = 0; i < SG_LEVELS; i++) {
13304+ INIT_LIST_HEAD(&sm_sg[i]);
13305+ INIT_LIST_HEAD(&sg_registered[i]);
13306+ }
13307+ init_MUTEX(&sm_sglock);
13308+}
13309+
13310+/* Context: service */
13311+
13312+int kcl_register_service(char *name, int namelen, int level,
13313+ struct kcl_service_ops *ops, int unique,
13314+ void *servicedata, uint32_t *service_id)
13315+{
13316+ sm_group_t *sg;
13317+ int found = FALSE;
13318+ int error = -EINVAL;
13319+
13320+ if (level > SG_LEVELS - 1)
13321+ goto fail;
13322+
13323+ if (namelen > MAX_SERVICE_NAME_LEN)
13324+ goto fail;
13325+
13326+ error = kcl_addref_cluster();
13327+ if (error)
13328+ goto fail;
13329+
13330+ down(&sm_sglock);
13331+
13332+ list_for_each_entry(sg, &sm_sg[level], list) {
13333+ if ((sg->namelen == namelen) &&
13334+ (!strncmp(sg->name, name, namelen))) {
13335+ found = TRUE;
13336+ goto next;
13337+ }
13338+ }
13339+
13340+ list_for_each_entry(sg, &sg_registered[level], list) {
13341+ if ((sg->namelen == namelen) &&
13342+ (!strncmp(sg->name, name, namelen))) {
13343+ found = TRUE;
13344+ goto next;
13345+ }
13346+ }
13347+
13348+ next:
13349+
13350+ if (found && unique) {
13351+ error = -EEXIST;
13352+ goto fail_unlock;
13353+ }
13354+
13355+ if (found) {
13356+ sg->refcount++;
13357+ goto out;
13358+ }
13359+
13360+ sg = (sm_group_t *) kmalloc(sizeof(sm_group_t) + namelen, GFP_KERNEL);
13361+ if (!sg) {
13362+ error = -ENOMEM;
13363+ goto fail_unlock;
13364+ }
13365+ memset(sg, 0, sizeof(sm_group_t) + namelen);
13366+
13367+ sg->refcount = 1;
13368+ sg->service_data = servicedata;
13369+ sg->ops = ops;
13370+ sg->level = level;
13371+ sg->namelen = namelen;
13372+ memcpy(sg->name, name, namelen);
13373+ sg->local_id = sm_new_local_id(level);
13374+ sg->state = SGST_NONE;
13375+ INIT_LIST_HEAD(&sg->memb);
13376+ INIT_LIST_HEAD(&sg->joining);
13377+ init_completion(&sg->event_comp);
13378+
13379+ list_add_tail(&sg->list, &sg_registered[level]);
13380+
13381+ out:
13382+ *service_id = sg->local_id;
13383+ up(&sm_sglock);
13384+ return 0;
13385+
13386+ fail_unlock:
13387+ up(&sm_sglock);
13388+ kcl_releaseref_cluster();
13389+ fail:
13390+ return error;
13391+}
13392+
13393+/* Context: service */
13394+
13395+void kcl_unregister_service(uint32_t local_id)
13396+{
13397+ sm_group_t *sg;
13398+ int level = sm_id_to_level(local_id);
13399+
13400+ down(&sm_sglock);
13401+
13402+ list_for_each_entry(sg, &sg_registered[level], list) {
13403+ if (sg->local_id == local_id) {
13404+ SM_ASSERT(sg->refcount,);
13405+ sg->refcount--;
13406+
13407+ if (!sg->refcount) {
13408+ list_del(&sg->list);
13409+ kfree(sg);
13410+ }
13411+ kcl_releaseref_cluster();
13412+ break;
13413+ }
13414+ }
13415+ up(&sm_sglock);
13416+}
13417+
13418+/* Context: service */
13419+
13420+int kcl_join_service(uint32_t local_id)
13421+{
13422+ sm_group_t *sg;
13423+ sm_sevent_t *sev;
13424+ int level = sm_id_to_level(local_id);
13425+ int error, found = FALSE;
13426+
13427+ down(&sm_sglock);
13428+
13429+ list_for_each_entry(sg, &sg_registered[level], list) {
13430+ if (sg->local_id == local_id) {
13431+ found = TRUE;
13432+ break;
13433+ }
13434+ }
13435+
13436+ if (!found) {
13437+ up(&sm_sglock);
13438+ error = -ENOENT;
13439+ goto out;
13440+ }
13441+
13442+ if (sg->state != SGST_NONE) {
13443+ up(&sm_sglock);
13444+ error = -EINVAL;
13445+ goto out;
13446+ }
13447+
13448+ sg->state = SGST_JOIN;
13449+ set_bit(SGFL_SEVENT, &sg->flags);
13450+ list_del(&sg->list);
13451+ list_add_tail(&sg->list, &sm_sg[sg->level]);
13452+
13453+ up(&sm_sglock);
13454+
13455+ /*
13456+ * The join is a service event which will be processed asynchronously.
13457+ */
13458+
13459+ sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
13460+ if (!sev) {
13461+ error = -ENOMEM;
13462+ goto out;
13463+ }
13464+
13465+ memset(sev, 0, sizeof (sm_sevent_t));
13466+ sev->se_state = SEST_JOIN_BEGIN;
13467+ sev->se_sg = sg;
13468+ sg->sevent = sev;
13469+ sm_set_event_id(&sev->se_id);
13470+
13471+ new_joinleave(sev);
13472+ wait_for_completion(&sg->event_comp);
13473+ error = 0;
13474+
13475+ out:
13476+ return error;
13477+}
13478+
13479+/* Context: service */
13480+
13481+int kcl_leave_service(uint32_t local_id)
13482+{
13483+ sm_group_t *sg = NULL;
13484+ sm_sevent_t *sev;
13485+ int error;
13486+
13487+ error = -ENOENT;
13488+ sg = sm_local_id_to_sg(local_id);
13489+ if (!sg)
13490+ goto out;
13491+
13492+ /* sg was never joined */
13493+ error = -EINVAL;
13494+ if (sg->state == SGST_NONE)
13495+ goto out;
13496+
13497+ /* may still be joining */
13498+ error = -EBUSY;
13499+ if (test_and_set_bit(SGFL_SEVENT, &sg->flags))
13500+ goto out;
13501+
13502+ error = -ENOMEM;
13503+ sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
13504+ if (!sev)
13505+ goto out;
13506+
13507+ memset(sev, 0, sizeof (sm_sevent_t));
13508+ sev->se_state = SEST_LEAVE_BEGIN;
13509+ set_bit(SEFL_LEAVE, &sev->se_flags);
13510+ sev->se_sg = sg;
13511+ sg->sevent = sev;
13512+ sm_set_event_id(&sev->se_id);
13513+
13514+ new_joinleave(sev);
13515+ wait_for_completion(&sg->event_comp);
13516+ error = 0;
13517+
13518+ down(&sm_sglock);
13519+ list_del(&sg->list);
13520+ list_add_tail(&sg->list, &sg_registered[sg->level]);
13521+ up(&sm_sglock);
13522+
13523+ out:
13524+ return error;
13525+}
13526+
13527+static void process_callback(uint32_t local_id, int event_id)
13528+{
13529+ sm_group_t *sg;
13530+ sm_sevent_t *sev;
13531+ sm_uevent_t *uev;
13532+
13533+ sg = sm_local_id_to_sg(local_id);
13534+ if (!sg)
13535+ return;
13536+
13537+ if (sg->state == SGST_RECOVER) {
13538+ if (!check_recovery(sg, event_id)) {
13539+ log_error(sg, "process_callback invalid recover "
13540+ "event id %d", event_id);
13541+ return;
13542+ }
13543+
13544+ if (sg->recover_state == RECOVER_START)
13545+ sg->recover_state = RECOVER_STARTDONE;
13546+ else
13547+ log_error(sg, "process_callback recover state %u",
13548+ sg->recover_state);
13549+ wake_serviced(DO_RECOVERIES);
13550+ }
13551+
13552+ else if (test_bit(SGFL_SEVENT, &sg->flags) && sg->sevent &&
13553+ (sg->sevent->se_id == event_id)) {
13554+ sev = sg->sevent;
13555+
13556+ if (test_and_clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags) &&
13557+ (sev->se_state == SEST_JSTART_SERVICEWAIT))
13558+ sev->se_state = SEST_JSTART_SERVICEDONE;
13559+
13560+ set_bit(SEFL_CHECK, &sev->se_flags);
13561+ wake_serviced(DO_JOINLEAVE);
13562+ }
13563+
13564+ else if (test_bit(SGFL_UEVENT, &sg->flags) &&
13565+ (sg->uevent.ue_id == event_id)) {
13566+ uev = &sg->uevent;
13567+
13568+ if (test_and_clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags)) {
13569+ if (uev->ue_state == UEST_JSTART_SERVICEWAIT)
13570+ uev->ue_state = UEST_JSTART_SERVICEDONE;
13571+ else if (uev->ue_state == UEST_LSTART_SERVICEWAIT)
13572+ uev->ue_state = UEST_LSTART_SERVICEDONE;
13573+ }
13574+ set_bit(UEFL_CHECK, &uev->ue_flags);
13575+ wake_serviced(DO_MEMBERSHIP);
13576+ }
13577+
13578+ else
13579+ log_error(sg, "ignoring service callback id=%x event=%u",
13580+ local_id, event_id);
13581+}
13582+
13583+void process_callbacks(void)
13584+{
13585+ sc_entry_t *se;
13586+
13587+ while (1) {
13588+ se = NULL;
13589+
13590+ spin_lock(&callback_lock);
13591+ if (!list_empty(&callbacks)) {
13592+ se = list_entry(callbacks.next, sc_entry_t, list);
13593+ list_del(&se->list);
13594+ }
13595+ spin_unlock(&callback_lock);
13596+
13597+ if (!se)
13598+ break;
13599+ process_callback(se->local_id, se->event_id);
13600+ kfree(se);
13601+ schedule();
13602+ }
13603+}
13604+
13605+/* Context: service */
13606+
13607+void kcl_start_done(uint32_t local_id, int event_id)
13608+{
13609+ sc_entry_t *se;
13610+
13611+ SM_RETRY(se = kmalloc(sizeof(sc_entry_t), GFP_KERNEL), se);
13612+
13613+ se->local_id = local_id;
13614+ se->event_id = event_id;
13615+
13616+ spin_lock(&callback_lock);
13617+ list_add_tail(&se->list, &callbacks);
13618+ spin_unlock(&callback_lock);
13619+
13620+ wake_serviced(DO_CALLBACKS);
13621+}
13622+
13623+/* Context: service */
13624+
13625+void kcl_global_service_id(uint32_t local_id, uint32_t *global_id)
13626+{
13627+ sm_group_t *sg = sm_local_id_to_sg(local_id);
13628+
13629+ if (!sg)
13630+ log_print("kcl_global_service_id: can't find %x", local_id);
13631+ else
13632+ *global_id = sg->global_id;
13633+}
13634+
13635+static void copy_to_service(sm_group_t *sg, struct kcl_service *s)
13636+{
13637+ s->level = sg->level;
13638+ s->local_id = sg->local_id;
13639+ s->global_id = sg->global_id;
13640+ s->node_count = sg->memb_count;
13641+ strcpy(s->name, sg->name);
13642+}
13643+
13644+int kcl_get_services(struct list_head *head, int level)
13645+{
13646+ sm_group_t *sg;
13647+ struct kcl_service *s;
13648+ int error = -ENOMEM, count = 0;
13649+
13650+ down(&sm_sglock);
13651+
13652+ list_for_each_entry(sg, &sg_registered[level], list) {
13653+ if (head) {
13654+ s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
13655+ if (!s)
13656+ goto out;
13657+ copy_to_service(sg, s);
13658+ list_add(&s->list, head);
13659+ }
13660+ count++;
13661+ }
13662+
13663+ list_for_each_entry(sg, &sm_sg[level], list) {
13664+ if (head) {
13665+ s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
13666+ if (!s)
13667+ goto out;
13668+ copy_to_service(sg, s);
13669+ list_add(&s->list, head);
13670+ }
13671+ count++;
13672+ }
13673+
13674+ error = count;
13675+ out:
13676+ up(&sm_sglock);
13677+ return error;
13678+}
13679+
13680+/* These global variables are declared extern in sm.h. */
13681+struct list_head sm_sg[SG_LEVELS];
13682+struct semaphore sm_sglock;
13683diff -urN linux-orig/cluster/cman/sm_services.h linux-patched/cluster/cman/sm_services.h
13684--- linux-orig/cluster/cman/sm_services.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13685+++ linux-patched/cluster/cman/sm_services.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 13686@@ -0,0 +1,20 @@
13687+/******************************************************************************
13688+*******************************************************************************
13689+**
13690+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13691+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13692+**
13693+** This copyrighted material is made available to anyone wishing to use,
13694+** modify, copy, or redistribute it subject to the terms and conditions
13695+** of the GNU General Public License v.2.
13696+**
13697+*******************************************************************************
13698+******************************************************************************/
13699+
13700+#ifndef __SM_SERVICES_DOT_H__
13701+#define __SM_SERVICES_DOT_H__
13702+
13703+void init_services(void);
13704+void process_callbacks(void);
13705+
13706+#endif
13707diff -urN linux-orig/cluster/cman/sm_user.c linux-patched/cluster/cman/sm_user.c
13708--- linux-orig/cluster/cman/sm_user.c 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 13709+++ linux-patched/cluster/cman/sm_user.c	2004-06-29 20:07:51.000000000 +0800
13710@@ -0,0 +1,569 @@
4bf12011 13711+/******************************************************************************
13712+*******************************************************************************
13713+**
13714+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13715+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13716+**
13717+** This copyrighted material is made available to anyone wishing to use,
13718+** modify, copy, or redistribute it subject to the terms and conditions
13719+** of the GNU General Public License v.2.
13720+**
13721+*******************************************************************************
13722+******************************************************************************/
13723+
13724+#include "sm.h"
13725+#include "cnxman-private.h"
13726+
13727+void copy_to_usernode(struct cluster_node *node, struct cl_cluster_node *unode);
13728+
13729+#define UST_REGISTER 1
13730+#define UST_UNREGISTER 2
13731+#define UST_JOIN 3
13732+#define UST_LEAVE 4
13733+#define UST_JOINED 5
13734+
13735+struct event {
13736+ struct list_head list;
13737+ service_event_t type;
13738+ service_start_t start_type;
13739+ unsigned int event_id;
13740+ unsigned int last_stop;
13741+ unsigned int last_start;
13742+ unsigned int last_finish;
13743+ unsigned int node_count;
13744+ uint32_t * nodeids;
13745+};
13746+typedef struct event event_t;
13747+
13748+struct user_service {
13749+ uint32_t local_id;
13750+ pid_t pid;
13751+ int signal;
13752+ struct socket * sock;
13753+ uint8_t state;
13754+ uint8_t async;
13755+ struct semaphore lock;
13756+ struct list_head events;
13757+ spinlock_t event_lock;
13758+ unsigned int last_stop;
13759+ unsigned int last_start;
13760+ unsigned int last_finish;
13761+ unsigned int need_startdone;
13762+ unsigned int node_count;
13763+ uint32_t * nodeids;
13764+ int name_len;
13765+ char name[MAX_SERVICE_NAME_LEN];
13766+};
13767+typedef struct user_service user_service_t;
13768+
13769+
13770+static void add_event(user_service_t *us, event_t *ev)
13771+{
13772+ spin_lock(&us->event_lock);
13773+ list_add_tail(&ev->list, &us->events);
13774+
13775+ switch(ev->type) {
13776+ case SERVICE_EVENT_STOP:
13777+ us->last_stop = us->last_start;
13778+ break;
13779+ case SERVICE_EVENT_START:
13780+ us->last_start = ev->event_id;
13781+ break;
13782+ case SERVICE_EVENT_FINISH:
13783+ us->last_finish = ev->event_id;
13784+ break;
13785+ case SERVICE_EVENT_LEAVEDONE:
13786+ break;
13787+ }
13788+ spin_unlock(&us->event_lock);
13789+}
13790+
13791+static event_t *get_event(user_service_t *us)
13792+{
13793+ event_t *ev = NULL;
13794+
13795+ spin_lock(&us->event_lock);
13796+ if (!list_empty(&us->events)) {
13797+ ev = list_entry(us->events.next, event_t, list);
13798+ ev->last_stop = us->last_stop;
13799+ ev->last_start = us->last_start;
13800+ ev->last_finish = us->last_finish;
13801+ }
13802+ spin_unlock(&us->event_lock);
13803+ return ev;
13804+}
13805+
13806+static void del_event(user_service_t *us, event_t *ev)
13807+{
13808+ spin_lock(&us->event_lock);
13809+ list_del(&ev->list);
13810+ spin_unlock(&us->event_lock);
13811+}
13812+
13813+static event_t *alloc_event(void)
13814+{
13815+ event_t *ev;
13816+ SM_RETRY(ev = (event_t *) kmalloc(sizeof(event_t), GFP_KERNEL), ev);
13817+ memset(ev, 0, sizeof(event_t));
13818+ return ev;
13819+}
13820+
13821+/* us->lock must be held before calling */
13822+static void user_notify(user_service_t *us)
13823+{
13824+ if (us->sock)
13825+ queue_oob_skb(us->sock, CLUSTER_OOB_MSG_SERVICEEVENT);
13826+ if (us->pid && us->signal)
13827+ kill_proc(us->pid, us->signal, 0);
13828+}
13829+
13830+static service_start_t start_type(int type)
13831+{
13832+ switch (type) {
13833+ case SERVICE_NODE_FAILED:
13834+ return SERVICE_START_FAILED;
13835+ case SERVICE_NODE_JOIN:
13836+ return SERVICE_START_JOIN;
13837+ case SERVICE_NODE_LEAVE:
13838+ return SERVICE_START_LEAVE;
13839+ }
13840+ return 0;
13841+}
13842+
13843+static int user_stop(void *servicedata)
13844+{
13845+ user_service_t *us = (user_service_t *) servicedata;
13846+ event_t *ev;
13847+
13848+ down(&us->lock);
13849+ if (!us->sock)
13850+ goto out;
13851+
13852+ ev = alloc_event();
13853+ ev->type = SERVICE_EVENT_STOP;
13854+
13855+ add_event(us, ev);
13856+ user_notify(us);
13857+ out:
13858+ up(&us->lock);
13859+ return 0;
13860+}
13861+
13862+static int user_start(void *servicedata, uint32_t *nodeids, int count,
13863+ int event_id, int type)
13864+{
13865+ user_service_t *us = (user_service_t *) servicedata;
13866+ event_t *ev;
13867+
13868+ down(&us->lock);
13869+ if (!us->sock) {
13870+ kcl_start_done(us->local_id, event_id);
13871+ goto out;
13872+ }
13873+
13874+ us->need_startdone = event_id;
13875+
13876+ ev = alloc_event();
13877+ ev->type = SERVICE_EVENT_START;
13878+ ev->node_count = count;
13879+ ev->start_type = start_type(type);
13880+ ev->event_id = event_id;
13881+ ev->nodeids = nodeids;
13882+
13883+ add_event(us, ev);
13884+ user_notify(us);
13885+ out:
13886+ up(&us->lock);
13887+ return 0;
13888+}
13889+
13890+static void user_finish(void *servicedata, int event_id)
13891+{
13892+ user_service_t *us = (user_service_t *) servicedata;
13893+ event_t *ev;
13894+
13895+ down(&us->lock);
13896+ if (!us->sock)
13897+ goto out;
13898+
13899+ ev = alloc_event();
13900+ ev->type = SERVICE_EVENT_FINISH;
13901+ ev->event_id = event_id;
13902+
13903+ add_event(us, ev);
13904+ user_notify(us);
13905+ out:
13906+ up(&us->lock);
13907+}
13908+
13909+struct kcl_service_ops user_service_ops = {
13910+ .stop = user_stop,
13911+ .start = user_start,
13912+ .finish = user_finish
13913+};
13914+
5cdbd17b 13915+static int user_register(char *u_name, user_service_t **us_data)
4bf12011 13916+{
13917+ user_service_t *us;
5cdbd17b 13918+	char name[MAX_SERVICE_NAME_LEN+1];
13919+ int len, error;
13920+
13921+ memset(name, 0, MAX_SERVICE_NAME_LEN+1);
13922+
13923+ if (copy_from_user(&name, u_name, MAX_SERVICE_NAME_LEN))
13924+ return -EFAULT;
4bf12011 13925+
5cdbd17b 13926+	len = strlen(name);
13927+ if (len > MAX_SERVICE_NAME_LEN)
4bf12011 13928+ return -ENAMETOOLONG;
13929+ if (!len)
13930+ return -EINVAL;
13931+
13932+ us = kmalloc(sizeof(user_service_t), GFP_KERNEL);
13933+ if (!us)
13934+ return -ENOMEM;
13935+ memset(us, 0, sizeof(user_service_t));
13936+ us->nodeids = NULL;
13937+ INIT_LIST_HEAD(&us->events);
13938+ spin_lock_init(&us->event_lock);
13939+ init_MUTEX(&us->lock);
13940+ us->name_len = len;
13941+ memcpy(us->name, name, len);
13942+
13943+ error = kcl_register_service(name, len, SERVICE_LEVEL_USER,
13944+ &user_service_ops, TRUE, (void *) us,
13945+ &us->local_id);
13946+ if (error) {
13947+ kfree(us);
13948+ us = NULL;
13949+ }
13950+ *us_data = us;
13951+ return error;
13952+}
13953+
13954+static void user_unregister(user_service_t *us)
13955+{
13956+ event_t *ev;
13957+
13958+ kcl_unregister_service(us->local_id);
13959+
13960+ if (us->nodeids)
13961+ kfree(us->nodeids);
13962+
13963+ while ((ev = get_event(us))) {
13964+ del_event(us, ev);
13965+ if (ev->nodeids)
13966+ kfree(ev->nodeids);
13967+ kfree(ev);
13968+ }
13969+}
13970+
13971+static int user_join_async(void *arg)
13972+{
13973+ user_service_t *us = arg;
13974+ int user_gone = 0;
13975+
13976+ daemonize("cman_userjoin");
13977+
13978+ kcl_join_service(us->local_id);
13979+
13980+ down(&us->lock);
13981+ us->state = UST_JOINED;
13982+ us->async = 0;
13983+ if (!us->sock) {
13984+ if (us->need_startdone)
13985+ kcl_start_done(us->local_id, us->need_startdone);
13986+ user_gone = 1;
13987+ }
13988+ up(&us->lock);
13989+
13990+ if (user_gone) {
13991+ kcl_leave_service(us->local_id);
13992+ user_unregister(us);
13993+ kfree(us);
13994+ }
13995+ return 0;
13996+}
13997+
13998+static int user_leave_async(void *arg)
13999+{
14000+ user_service_t *us = arg;
14001+
14002+ daemonize("cman_userleave");
14003+
14004+ kcl_leave_service(us->local_id);
14005+
14006+ down(&us->lock);
14007+ us->async = 0;
14008+ if (!us->sock) {
14009+ user_unregister(us);
14010+ kfree(us);
14011+ } else {
14012+ event_t *ev = alloc_event();
14013+ ev->type = SERVICE_EVENT_LEAVEDONE;
14014+ add_event(us, ev);
14015+ user_notify(us);
14016+ up(&us->lock);
14017+ }
14018+
14019+ return 0;
14020+}
14021+
14022+static int user_join(user_service_t *us, int wait)
14023+{
14024+ int error = 0;
14025+
14026+ if (wait) {
14027+ error = kcl_join_service(us->local_id);
14028+ us->state = UST_JOINED;
14029+ }
14030+ else {
14031+ us->async = 1;
14032+ kernel_thread(user_join_async, us, 0);
14033+ }
14034+
14035+ return error;
14036+}
14037+
14038+static void user_leave(user_service_t *us, int wait)
14039+{
14040+ if (wait)
14041+ kcl_leave_service(us->local_id);
14042+ else {
14043+ us->async = 1;
14044+ kernel_thread(user_leave_async, us, 0);
14045+ }
14046+}
14047+
14048+static int user_start_done(user_service_t *us, unsigned int event_id)
14049+{
14050+ if (!us->need_startdone)
14051+ return -EINVAL;
14052+ if (us->need_startdone == event_id)
14053+ us->need_startdone = 0;
14054+ kcl_start_done(us->local_id, event_id);
14055+ return 0;
14056+}
14057+
14058+static void user_set_signal(user_service_t *us, int signal)
14059+{
14060+ us->pid = current->pid;
14061+ us->signal = signal;
14062+}
14063+
14064+static int user_get_event(user_service_t *us,
14065+ struct cl_service_event *user_event)
14066+{
14067+ event_t *ev;
14068+ struct cl_service_event event;
14069+
14070+ ev = get_event(us);
14071+ if (!ev)
14072+ return 0;
14073+
14074+ event.type = ev->type;
14075+ event.start_type = ev->start_type;
14076+ event.event_id = ev->event_id;
14077+ event.last_stop = ev->last_stop;
14078+ event.last_start = ev->last_start;
14079+ event.last_finish = ev->last_finish;
14080+ event.node_count = ev->node_count;
14081+
14082+ if (copy_to_user(user_event, &event, sizeof(struct cl_service_event)))
14083+ return -EFAULT;
14084+
14085+ del_event(us, ev);
14086+
14087+ if (ev->type == SERVICE_EVENT_START) {
14088+ if (us->nodeids)
14089+ kfree(us->nodeids);
14090+ us->nodeids = ev->nodeids;
14091+ us->node_count = ev->node_count;
14092+ }
14093+
14094+ kfree(ev);
14095+ return 1;
14096+}
14097+
14098+static int user_get_members(user_service_t *us,
14099+ struct cl_cluster_nodelist *u_nodelist)
14100+{
14101+ struct cl_cluster_nodelist user_nodelist;
14102+ struct cl_cluster_node user_node, *u_node;
14103+ struct cluster_node *node;
14104+ unsigned int i;
14105+ int num_nodes = 0;
14106+
14107+ if (!u_nodelist)
14108+ return us->node_count;
14109+
14110+ if (copy_from_user(&user_nodelist, (void __user *) u_nodelist,
14111+ sizeof(struct cl_cluster_nodelist)))
14112+ return -EFAULT;
14113+
14114+ if (user_nodelist.max_members < us->node_count)
14115+ return -E2BIG;
14116+
14117+ u_node = user_nodelist.nodes;
14118+
14119+ for (i = 0; i < us->node_count; i++) {
14120+ node = find_node_by_nodeid(us->nodeids[i]);
14121+ if (!node)
14122+ continue;
14123+
14124+ copy_to_usernode(node, &user_node);
14125+ if (copy_to_user(u_node, &user_node,
14126+ sizeof(struct cl_cluster_node)))
14127+ return -EFAULT;
14128+
14129+ u_node++;
14130+ num_nodes++;
14131+ }
14132+ return num_nodes;
14133+}
14134+
14135+static int user_global_id(user_service_t *us, uint32_t *id)
14136+{
14137+ uint32_t gid = 0;
14138+
14139+ if (us->state != UST_JOINED)
14140+ return -EINVAL;
14141+
14142+ kcl_global_service_id(us->local_id, &gid);
14143+
14144+ if (copy_to_user(id, &gid, sizeof(uint32_t)))
14145+ return -EFAULT;
14146+ return 0;
14147+}
14148+
14149+static int user_set_level(user_service_t *us, int level)
14150+{
14151+ int prev_id = us->local_id;
14152+ int error;
14153+
14154+ if (us->state != UST_REGISTER)
14155+ return -EINVAL;
14156+
14157+ error = kcl_register_service(us->name, us->name_len, level,
14158+ &user_service_ops, TRUE, (void *) us,
14159+ &us->local_id);
14160+ if (error)
14161+ return error;
14162+
14163+ kcl_unregister_service(prev_id);
14164+ return 0;
14165+}
14166+
14167+int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
14168+{
14169+ struct cluster_sock *c = cluster_sk(sock->sk);
14170+ user_service_t *us = c->service_data;
14171+ int error = 0;
14172+
14173+ if (!us && cmd != SIOCCLUSTER_SERVICE_REGISTER)
14174+ return -EINVAL;
14175+
14176+ switch (cmd) {
14177+ case SIOCCLUSTER_SERVICE_REGISTER:
14178+ error = user_register((char *) arg, &us);
14179+ if (!error) {
14180+ us->state = UST_REGISTER;
14181+ us->sock = sock;
14182+ c->service_data = us;
14183+ }
14184+ break;
14185+
14186+ case SIOCCLUSTER_SERVICE_UNREGISTER:
14187+ down(&us->lock);
14188+ us->state = UST_UNREGISTER;
14189+ user_unregister(us);
14190+ up(&us->lock);
14191+ break;
14192+
14193+ case SIOCCLUSTER_SERVICE_JOIN:
14194+ us->state = UST_JOIN;
14195+ user_join(us, 0);
14196+ break;
14197+
14198+ case SIOCCLUSTER_SERVICE_LEAVE:
14199+ down(&us->lock);
14200+ if (us->state != UST_JOINED) {
14201+ error = -EBUSY;
14202+ up(&us->lock);
14203+ } else {
14204+ us->state = UST_LEAVE;
14205+ up(&us->lock);
14206+ user_leave(us, 0);
14207+ }
14208+ break;
14209+
14210+ case SIOCCLUSTER_SERVICE_SETSIGNAL:
14211+ user_set_signal(us, (int) arg);
14212+ break;
14213+
14214+ case SIOCCLUSTER_SERVICE_STARTDONE:
14215+ error = user_start_done(us, (unsigned int) arg);
14216+ break;
14217+
14218+ case SIOCCLUSTER_SERVICE_GETEVENT:
14219+ error = user_get_event(us, (struct cl_service_event *) arg);
14220+ break;
14221+
14222+ case SIOCCLUSTER_SERVICE_GETMEMBERS:
14223+ error = user_get_members(us, (struct cl_cluster_nodelist *)arg);
14224+ break;
14225+
14226+ case SIOCCLUSTER_SERVICE_GLOBALID:
14227+ error = user_global_id(us, (uint32_t *) arg);
14228+ break;
14229+
14230+ case SIOCCLUSTER_SERVICE_SETLEVEL:
14231+ error = user_set_level(us, (int) arg);
14232+ break;
14233+
14234+ default:
14235+ error = -EINVAL;
14236+ }
14237+
14238+ return error;
14239+}
14240+
14241+void sm_sock_release(struct socket *sock)
14242+{
14243+ struct cluster_sock *c = cluster_sk(sock->sk);
14244+ user_service_t *us = c->service_data;
14245+ int state;
14246+
14247+ if (!us)
14248+ return;
14249+
14250+ down(&us->lock);
14251+ us->sock = NULL;
14252+ c->service_data = NULL;
14253+
14254+ if (us->need_startdone)
14255+ kcl_start_done(us->local_id, us->need_startdone);
14256+
14257+ if (us->async) {
14258+ /* async thread will clean up before exiting */
14259+ up(&us->lock);
14260+ return;
14261+ }
14262+ state = us->state;
14263+ up(&us->lock);
14264+
14265+ switch (state) {
14266+ case UST_JOIN:
14267+ break;
14268+ case UST_JOINED:
14269+ user_leave(us, 1);
14270+ /* fall through */
14271+ case UST_LEAVE:
14272+ case UST_REGISTER:
14273+ user_unregister(us);
14274+ /* fall through */
14275+ case UST_UNREGISTER:
14276+ kfree(us);
14277+ break;
14278+ }
14279+}
14280diff -urN linux-orig/cluster/cman/sm_user.h linux-patched/cluster/cman/sm_user.h
14281--- linux-orig/cluster/cman/sm_user.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 14282+++ linux-patched/cluster/cman/sm_user.h 2004-06-29 20:07:51.000000000 +0800
4bf12011 14283@@ -0,0 +1,21 @@
14284+/******************************************************************************
14285+*******************************************************************************
14286+**
14287+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14288+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14289+**
14290+** This copyrighted material is made available to anyone wishing to use,
14291+** modify, copy, or redistribute it subject to the terms and conditions
14292+** of the GNU General Public License v.2.
14293+**
14294+*******************************************************************************
14295+******************************************************************************/
14296+
14297+#ifndef __SM_USER_DOT_H__
14298+#define __SM_USER_DOT_H__
14299+
14300+int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
14301+void sm_sock_release(struct socket *sock);
14302+void sm_sock_bind(struct socket *sock);
14303+
14304+#endif
14305diff -urN linux-orig/include/cluster/cnxman-socket.h linux-patched/include/cluster/cnxman-socket.h
14306--- linux-orig/include/cluster/cnxman-socket.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 14307+++ linux-patched/include/cluster/cnxman-socket.h 2004-06-29 20:07:50.000000000 +0800
4bf12011 14308@@ -0,0 +1,226 @@
14309+/******************************************************************************
14310+*******************************************************************************
14311+**
14312+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14313+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14314+**
14315+** This copyrighted material is made available to anyone wishing to use,
14316+** modify, copy, or redistribute it subject to the terms and conditions
14317+** of the GNU General Public License v.2.
14318+**
14319+*******************************************************************************
14320+******************************************************************************/
14321+
14322+/* CMAN socket interface header,
14323+   may be included by user or kernel code */
14324+
14325+#ifndef __CNXMAN_SOCKET_H
14326+#define __CNXMAN_SOCKET_H
14327+
14328+/* Just made these up but the address family must be less than 32 (NPROTO) */
14329+#define AF_CLUSTER 31
14330+#define PF_CLUSTER AF_CLUSTER
14331+
14332+/* Protocol(socket) types */
14333+#define CLPROTO_MASTER 2
14334+#define CLPROTO_CLIENT 3
14335+
14336+/* Setsockopt -- maybe should be ioctls?? */
14337+#define CLU_SET_MULTICAST 100
14338+#define CLU_JOIN_CLUSTER 101
14339+#define CLU_LEAVE_CLUSTER 102
14340+#define CLU_SET_RCVONLY 103
14341+#define CLU_SET_UNICAST 104
14342+#define KCL_SET_MULTICAST 105
14343+#define KCL_SET_RCVONLY 106
14344+#define KCL_SET_UNICAST 107
14345+#define KCL_SET_NODENAME 108
14346+#define CLU_SET_NODENAME 109
14347+
14348+/* ioctls -- should register these properly */
14349+#define SIOCCLUSTER_NOTIFY _IOW('x', 0x01, int)
14350+#define SIOCCLUSTER_REMOVENOTIFY _IO( 'x', 0x02)
14351+#define SIOCCLUSTER_GETMEMBERS _IOR('x', 0x03, struct cl_cluster_nodelist)
14352+#define SIOCCLUSTER_SETEXPECTED_VOTES _IOW('x', 0x04, int)
14353+#define SIOCCLUSTER_ISQUORATE _IO( 'x', 0x05)
14354+#define SIOCCLUSTER_ISLISTENING _IOW('x', 0x06, struct cl_listen_request)
14355+#define SIOCCLUSTER_GETALLMEMBERS _IOR('x', 0x07, struct cl_cluster_nodelist)
14356+#define SIOCCLUSTER_SET_VOTES _IOW('x', 0x08, int)
14357+#define SIOCCLUSTER_GET_VERSION _IOR('x', 0x09, struct cl_version)
14358+#define SIOCCLUSTER_SET_VERSION _IOW('x', 0x0a, struct cl_version)
14359+#define SIOCCLUSTER_ISACTIVE _IO( 'x', 0x0b)
14360+#define SIOCCLUSTER_KILLNODE _IOW('x', 0x0c, int)
14361+#define SIOCCLUSTER_GET_JOINCOUNT _IO( 'x', 0x0d)
14362+#define SIOCCLUSTER_SERVICE_REGISTER _IOW('x', 0x0e, char)
14363+#define SIOCCLUSTER_SERVICE_UNREGISTER _IO('x', 0x0f)
14364+#define SIOCCLUSTER_SERVICE_JOIN _IO( 'x', 0x10)
14365+#define SIOCCLUSTER_SERVICE_LEAVE _IO( 'x', 0x20)
14366+#define SIOCCLUSTER_SERVICE_SETSIGNAL _IOW('x', 0x30, int)
14367+#define SIOCCLUSTER_SERVICE_STARTDONE _IOW('x', 0x40, unsigned int)
14368+#define SIOCCLUSTER_SERVICE_GETEVENT _IOR('x', 0x50, struct cl_service_event)
14369+#define SIOCCLUSTER_SERVICE_GETMEMBERS _IOR('x', 0x60, struct cl_cluster_nodelist)
14370+#define SIOCCLUSTER_SERVICE_GLOBALID _IOR('x', 0x70, uint32_t)
14371+#define SIOCCLUSTER_SERVICE_SETLEVEL _IOR('x', 0x80, int)
14372+#define SIOCCLUSTER_GETNODE _IOWR('x', 0x90, struct cl_cluster_node)
14373+#define SIOCCLUSTER_BARRIER _IOW('x', 0x0a0, struct cl_barrier_info)
14374+
14375+/* Maximum size of a cluster message */
14376+#define MAX_CLUSTER_MESSAGE 1500
14377+#define MAX_CLUSTER_MEMBER_NAME_LEN 255
14378+#define MAX_BARRIER_NAME_LEN 33
14379+#define MAX_SA_ADDR_LEN 12
14380+#define MAX_CLUSTER_NAME_LEN 16
14381+
14382+/* Well-known cluster port numbers */
14383+#define CLUSTER_PORT_MEMBERSHIP 1 /* Mustn't block during cluster
14384+ * transitions! */
14385+#define CLUSTER_PORT_SERVICES 2
14386+#define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */
14387+#define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */
14388+#define CLUSTER_PORT_SLM 12 /* LVM SLM (simple lock manager) */
14389+
14390+/* Port numbers above this will be blocked when the cluster is inquorate or in
14391+ * transition */
14392+#define HIGH_PROTECTED_PORT 9
14393+
14394+/* Reasons for leaving the cluster */
14395+#define CLUSTER_LEAVEFLAG_DOWN 0 /* Normal shutdown */
14396+#define CLUSTER_LEAVEFLAG_KILLED 1
14397+#define CLUSTER_LEAVEFLAG_PANIC 2
14398+#define CLUSTER_LEAVEFLAG_REMOVED 3 /* This one can reduce quorum */
14399+#define CLUSTER_LEAVEFLAG_REJECTED 4 /* Not allowed into the cluster in the
14400+ * first place */
14401+#define CLUSTER_LEAVEFLAG_INCONSISTENT 5 /* Our view of the cluster is
14402+ * in a minority */
14403+#define CLUSTER_LEAVEFLAG_DEAD 6 /* Discovered to be dead */
14404+#define CLUSTER_LEAVEFLAG_FORCE 0x10 /* Forced by command-line */
14405+
14406+/* OOB messages sent to a local socket */
14407+#define CLUSTER_OOB_MSG_PORTCLOSED 1
14408+#define CLUSTER_OOB_MSG_STATECHANGE 2
14409+#define CLUSTER_OOB_MSG_SERVICEEVENT 3
14410+
14411+/* Sendmsg flags, these are above the normal sendmsg flags so they don't
14412+ * interfere */
14413+#define MSG_NOACK 0x010000 /* Don't need an ACK for this message */
14414+#define MSG_QUEUE 0x020000 /* Queue the message for sending later */
14415+#define MSG_MULTICAST 0x080000 /* Message was sent to all nodes in the cluster
14416+ */
14417+#define MSG_ALLINT 0x100000 /* Send out of all interfaces */
14418+
14419+typedef enum { NODESTATE_REMOTEMEMBER, NODESTATE_JOINING, NODESTATE_MEMBER,
14420+ NODESTATE_DEAD } nodestate_t;
14421+
14422+
14423+struct sockaddr_cl {
14424+ unsigned short scl_family;
14425+ unsigned char scl_flags;
14426+ unsigned char scl_port;
14427+ int scl_nodeid;
14428+};
14429+
14430+/* This is how we pass the multicast socket into kernel space. addr is the
14431+ * multicast address to use in the address family of the socket (eg for UDP it
14432+ * might be 255.255.255.0) */
14433+struct cl_multicast_sock {
14434+ int fd; /* FD of master socket to do multicast on */
14435+ int number; /* Socket number, to match up recvonly & bcast
14436+ * sockets */
14437+};
14438+
14439+/* Cluster configuration info passed when we join the cluster */
14440+struct cl_join_cluster_info {
14441+ unsigned char votes;
14442+ unsigned int expected_votes;
14443+ unsigned int two_node;
14444+ unsigned int config_version;
14445+
14446+ char cluster_name[17];
14447+};
14448+
14449+
14450+/* This is the structure, per node, returned from the membership ioctl */
14451+struct cl_cluster_node {
14452+ unsigned int size;
14453+ unsigned int node_id;
14454+ unsigned int us;
14455+ unsigned int leave_reason;
14456+ unsigned int incarnation;
14457+ nodestate_t state;
14458+ char name[MAX_CLUSTER_MEMBER_NAME_LEN];
14459+ unsigned char votes;
14460+};
14461+
14462+/* The struct passed to the membership ioctls */
14463+struct cl_cluster_nodelist {
14464+ uint32_t max_members;
14465+ struct cl_cluster_node *nodes;
14466+};
14467+
14468+/* Structure passed to SIOCCLUSTER_ISLISTENING */
14469+struct cl_listen_request {
14470+ unsigned char port;
14471+ int nodeid;
14472+};
14473+
14474+/* A Cluster PORTCLOSED message - received by a local user as an OOB message */
14475+struct cl_portclosed_oob {
14476+ unsigned char cmd; /* CLUSTER_OOB_MSG_PORTCLOSED */
14477+ unsigned char port;
14478+};
14479+
14480+/* Get all version numbers or set the config version */
14481+struct cl_version {
14482+ unsigned int major;
14483+ unsigned int minor;
14484+ unsigned int patch;
14485+ unsigned int config;
14486+};
14487+
14488+/* structure passed to barrier ioctls */
14489+struct cl_barrier_info {
14490+ char cmd;
14491+ char name[MAX_BARRIER_NAME_LEN];
14492+ unsigned int flags;
14493+ unsigned long arg;
14494+};
14495+
14496+typedef enum { SERVICE_EVENT_STOP, SERVICE_EVENT_START, SERVICE_EVENT_FINISH,
14497+ SERVICE_EVENT_LEAVEDONE } service_event_t;
14498+
14499+typedef enum { SERVICE_START_FAILED, SERVICE_START_JOIN, SERVICE_START_LEAVE }
14500+ service_start_t;
14501+
14502+struct cl_service_event {
14503+ service_event_t type;
14504+ service_start_t start_type;
14505+ unsigned int event_id;
14506+ unsigned int last_stop;
14507+ unsigned int last_start;
14508+ unsigned int last_finish;
14509+ unsigned int node_count;
14510+};
14511+
14512+
14513+/* Commands to the barrier ioctl */
14514+#define BARRIER_IOCTL_REGISTER 1
14515+#define BARRIER_IOCTL_CHANGE 2
14516+#define BARRIER_IOCTL_DELETE 3
14517+#define BARRIER_IOCTL_WAIT 4
14518+
14519+/* Attributes of a barrier - bitmask */
14520+#define BARRIER_ATTR_AUTODELETE 1
14521+#define BARRIER_ATTR_MULTISTEP 2
14522+#define BARRIER_ATTR_MANUAL 4
14523+#define BARRIER_ATTR_ENABLED 8
14524+#define BARRIER_ATTR_CALLBACK 16
14525+
14526+/* Attribute setting commands */
14527+#define BARRIER_SETATTR_AUTODELETE 1
14528+#define BARRIER_SETATTR_MULTISTEP 2
14529+#define BARRIER_SETATTR_ENABLED 3
14530+#define BARRIER_SETATTR_NODES 4
14531+#define BARRIER_SETATTR_CALLBACK 5
14532+#define BARRIER_SETATTR_TIMEOUT 6
14533+
14534+#endif
14535diff -urN linux-orig/include/cluster/cnxman.h linux-patched/include/cluster/cnxman.h
14536--- linux-orig/include/cluster/cnxman.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 14537+++ linux-patched/include/cluster/cnxman.h 2004-06-29 20:07:50.000000000 +0800
4bf12011 14538@@ -0,0 +1,87 @@
14539+/******************************************************************************
14540+*******************************************************************************
14541+**
14542+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14543+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14544+**
14545+** This copyrighted material is made available to anyone wishing to use,
14546+** modify, copy, or redistribute it subject to the terms and conditions
14547+** of the GNU General Public License v.2.
14548+**
14549+*******************************************************************************
14550+******************************************************************************/
14551+
14552+#ifndef __CNXMAN_H
14553+#define __CNXMAN_H
14554+
14555+#include "linux/in6.h"
14556+#include "cluster/cnxman-socket.h"
14557+
14558+/* In-kernel API */
14559+
14560+/* This is the structure, per node, returned from the membership request */
14561+struct kcl_cluster_node {
14562+ unsigned int size;
14563+ unsigned int node_id;
14564+ unsigned int us;
14565+ unsigned int leave_reason;
14566+ unsigned int incarnation;
14567+ nodestate_t state;
14568+ struct list_head list;
14569+ char name[MAX_CLUSTER_MEMBER_NAME_LEN];
14570+ unsigned char votes;
14571+};
14572+
14573+struct cluster_node_addr {
14574+ struct list_head list;
14575+ unsigned char addr[sizeof(struct sockaddr_in6)];/* A large sockaddr */
14576+ int addr_len;
14577+};
14578+
14579+
14580+/* Reasons for a kernel membership callback */
14581+typedef enum { CLUSTER_RECONFIG, DIED, LEAVING, NEWNODE } kcl_callback_reason;
14582+
14583+/* Kernel version of above, the void *sock is a struct socket */
14584+struct kcl_multicast_sock {
14585+ void *sock;
14586+ int number; /* Socket number, to match up recvonly & bcast
14587+ * sockets */
14588+};
14589+
14590+extern int kcl_sendmsg(struct socket *sock, void *buf, int size,
14591+ struct sockaddr_cl *caddr, int addr_len,
14592+ unsigned int flags);
14593+extern int kcl_register_read_callback(struct socket *sock,
14594+ int (*routine) (char *, int, char *, int,
14595+ unsigned int));
14596+extern int kcl_add_callback(void (*callback) (kcl_callback_reason, long));
14597+extern int kcl_remove_callback(void (*callback) (kcl_callback_reason, long));
14598+extern int kcl_get_members(struct list_head *list);
14599+extern int kcl_get_member_ids(uint32_t * idbuf, int size);
14600+extern int kcl_get_all_members(struct list_head *list);
14601+extern int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
14602+ struct kcl_cluster_node *n);
14603+extern int kcl_get_node_by_name(unsigned char *name,
14604+ struct kcl_cluster_node *n);
14605+extern int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n);
14606+extern int kcl_is_quorate(void);
14607+extern int kcl_addref_cluster(void);
14608+extern int kcl_releaseref_cluster(void);
14609+extern int kcl_cluster_name(char **cname);
14610+extern int kcl_get_current_interface(void);
14611+extern struct list_head *kcl_get_node_addresses(int nodeid);
14612+
14613+extern int kcl_barrier_register(char *name, unsigned int flags,
14614+ unsigned int nodes);
14615+extern int kcl_barrier_setattr(char *name, unsigned int attr,
14616+ unsigned long arg);
14617+extern int kcl_barrier_delete(char *name);
14618+extern int kcl_barrier_wait(char *name);
14619+extern int kcl_barrier_cancel(char *name);
14620+
14621+extern int kcl_register_quorum_device(char *name, int votes);
14622+extern int kcl_unregister_quorum_device(void);
14623+extern int kcl_quorum_device_available(int yesno);
14624+
14625+#endif
14626diff -urN linux-orig/include/cluster/service.h linux-patched/include/cluster/service.h
14627--- linux-orig/include/cluster/service.h 1970-01-01 07:30:00.000000000 +0730
5cdbd17b 14628+++ linux-patched/include/cluster/service.h 2004-06-29 20:07:50.000000000 +0800
4bf12011 14629@@ -0,0 +1,102 @@
14630+/******************************************************************************
14631+*******************************************************************************
14632+**
14633+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14634+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14635+**
14636+** This copyrighted material is made available to anyone wishing to use,
14637+** modify, copy, or redistribute it subject to the terms and conditions
14638+** of the GNU General Public License v.2.
14639+**
14640+*******************************************************************************
14641+******************************************************************************/
14642+
14643+#ifndef __SERVICE_DOT_H__
14644+#define __SERVICE_DOT_H__
14645+
14646+/*
14647+ * Interface between service manager and services
14648+ */
14649+
14650+/*
14651+ * Service levels are started in order from lowest, so level 0 is started on
14652+ * all nodes before level 1 is started.
14653+ */
14654+
14655+#define SERVICE_LEVEL_FENCE (0)
14656+#define SERVICE_LEVEL_GDLM (1)
14657+#define SERVICE_LEVEL_GFS (2)
14658+#define SERVICE_LEVEL_USER (3)
14659+
14660+#define MAX_SERVICE_NAME_LEN (33)
14661+
14662+/*
14663+ * The type of start a service receives. The start (and preceding stop) may be
14664+ * due to a node joining or leaving the SG or due to a node having failed.
14665+ */
14666+
14667+#define SERVICE_NODE_FAILED (1)
14668+#define SERVICE_NODE_JOIN (2)
14669+#define SERVICE_NODE_LEAVE (3)
14670+
14671+
14672+struct kcl_service {
14673+ struct list_head list;
14674+ uint16_t level;
14675+ uint32_t local_id;
14676+ uint32_t global_id;
14677+ int node_count;
14678+ char name[MAX_SERVICE_NAME_LEN];
14679+};
14680+
14681+int kcl_get_services(struct list_head *list, int level);
14682+
14683+
14684+/*
14685+ * These routines which run in CMAN context must return quickly and cannot
14686+ * block.
14687+ */
14688+
14689+struct kcl_service_ops {
14690+ int (*stop) (void *servicedata);
14691+ int (*start) (void *servicedata, uint32_t *nodeids, int count,
14692+ int event_id, int type);
14693+ void (*finish) (void *servicedata, int event_id);
14694+};
14695+
14696+/*
14697+ * Register will cause CMAN to create a Service Group (SG) for the named
14698+ * instance of the service. A local ID is returned which is used to join,
14699+ * leave and unregister the service.
14700+ */
14701+
14702+int kcl_register_service(char *name, int namelen, int level,
14703+ struct kcl_service_ops *ops, int unique,
14704+ void *servicedata, uint32_t *local_id);
14705+
14706+void kcl_unregister_service(uint32_t local_id);
14707+
14708+/*
14709+ * Once a service is joined it will be managed by CMAN and receive start, stop,
14710+ * and finish calls. After leave is called the service is no longer managed by
14711+ * CMAN. The first start for a service may arrive before kcl_join_service()
14712+ * returns.
14713+ */
14714+
14715+int kcl_join_service(uint32_t local_id);
14716+int kcl_leave_service(uint32_t local_id);
14717+
14718+/*
14719+ * After a service is started, it can ask for its cluster-wide unique ID.
14720+ */
14721+
14722+void kcl_global_service_id(uint32_t local_id, uint32_t * global_id);
14723+
14724+/*
14725+ * Called by a service when it's done with a start(). Cannot be called from
14726+ * the start function.
14727+ */
14728+
14729+void kcl_start_done(uint32_t local_id, int event_id);
14730+
14731+#endif