1 diff -urN linux-orig/arch/alpha/Kconfig linux-orig2/arch/alpha/Kconfig
2 --- linux-orig/arch/alpha/Kconfig 2004-10-18 16:55:37.000000000 -0500
3 +++ linux-orig2/arch/alpha/Kconfig 2004-10-22 11:29:33.507218717 -0500
8 +source "cluster/Kconfig"
9 diff -urN linux-orig/arch/arm/Kconfig linux-orig2/arch/arm/Kconfig
10 --- linux-orig/arch/arm/Kconfig 2004-10-18 16:54:31.000000000 -0500
11 +++ linux-orig2/arch/arm/Kconfig 2004-10-22 11:30:56.358918506 -0500
13 source "crypto/Kconfig"
17 +source "cluster/Kconfig"
18 diff -urN linux-orig/arch/arm26/Kconfig linux-orig2/arch/arm26/Kconfig
19 --- linux-orig/arch/arm26/Kconfig 2004-10-18 16:54:32.000000000 -0500
20 +++ linux-orig2/arch/arm26/Kconfig 2004-10-22 11:29:33.531218341 -0500
25 +source "cluster/Kconfig"
26 diff -urN linux-orig/arch/cris/Kconfig linux-orig2/arch/cris/Kconfig
27 --- linux-orig/arch/cris/Kconfig 2004-10-18 16:55:07.000000000 -0500
28 +++ linux-orig2/arch/cris/Kconfig 2004-10-22 11:31:11.965673644 -0500
30 source "crypto/Kconfig"
34 +source "cluster/Kconfig"
35 diff -urN linux-orig/arch/i386/Kconfig linux-orig2/arch/i386/Kconfig
36 --- linux-orig/arch/i386/Kconfig 2004-10-18 16:53:22.000000000 -0500
37 +++ linux-orig2/arch/i386/Kconfig 2004-10-22 11:29:33.533218309 -0500
42 +source "cluster/Kconfig"
46 depends on SMP && !X86_VOYAGER
47 diff -urN linux-orig/arch/ia64/Kconfig linux-orig2/arch/ia64/Kconfig
48 --- linux-orig/arch/ia64/Kconfig 2004-10-18 16:55:27.000000000 -0500
49 +++ linux-orig2/arch/ia64/Kconfig 2004-10-22 11:29:33.534218294 -0500
51 source "security/Kconfig"
53 source "crypto/Kconfig"
55 +source "cluster/Kconfig"
56 diff -urN linux-orig/arch/m68k/Kconfig linux-orig2/arch/m68k/Kconfig
57 --- linux-orig/arch/m68k/Kconfig 2004-10-18 16:54:32.000000000 -0500
58 +++ linux-orig2/arch/m68k/Kconfig 2004-10-22 11:31:38.187262279 -0500
60 source "crypto/Kconfig"
64 +source "cluster/Kconfig"
65 diff -urN linux-orig/arch/mips/Kconfig linux-orig2/arch/mips/Kconfig
66 --- linux-orig/arch/mips/Kconfig 2004-10-18 16:54:08.000000000 -0500
67 +++ linux-orig2/arch/mips/Kconfig 2004-10-22 11:29:33.541218184 -0500
72 +source "cluster/Kconfig"
75 # Use the generic interrupt handling code in kernel/irq/:
77 diff -urN linux-orig/arch/parisc/Kconfig linux-orig2/arch/parisc/Kconfig
78 --- linux-orig/arch/parisc/Kconfig 2004-10-18 16:54:37.000000000 -0500
79 +++ linux-orig2/arch/parisc/Kconfig 2004-10-22 11:31:57.146964867 -0500
81 source "crypto/Kconfig"
85 +source "cluster/Kconfig"
86 diff -urN linux-orig/arch/ppc/Kconfig linux-orig2/arch/ppc/Kconfig
87 --- linux-orig/arch/ppc/Kconfig 2004-10-18 16:55:29.000000000 -0500
88 +++ linux-orig2/arch/ppc/Kconfig 2004-10-22 11:29:33.550218043 -0500
90 source "security/Kconfig"
92 source "crypto/Kconfig"
94 +source "cluster/Kconfig"
95 diff -urN linux-orig/arch/ppc64/Kconfig linux-orig2/arch/ppc64/Kconfig
96 --- linux-orig/arch/ppc64/Kconfig 2004-10-18 16:54:31.000000000 -0500
97 +++ linux-orig2/arch/ppc64/Kconfig 2004-10-22 11:32:11.150745212 -0500
99 source "crypto/Kconfig"
103 +source "cluster/Kconfig"
104 diff -urN linux-orig/arch/s390/Kconfig linux-orig2/arch/s390/Kconfig
105 --- linux-orig/arch/s390/Kconfig 2004-10-18 16:53:51.000000000 -0500
106 +++ linux-orig2/arch/s390/Kconfig 2004-10-22 11:32:31.175431141 -0500
108 source "crypto/Kconfig"
112 +source "cluster/Kconfig"
113 diff -urN linux-orig/arch/sh/Kconfig linux-orig2/arch/sh/Kconfig
114 --- linux-orig/arch/sh/Kconfig 2004-10-18 16:55:29.000000000 -0500
115 +++ linux-orig2/arch/sh/Kconfig 2004-10-22 11:32:47.169180310 -0500
117 source "crypto/Kconfig"
121 +source "cluster/Kconfig"
122 diff -urN linux-orig/arch/sparc/Kconfig linux-orig2/arch/sparc/Kconfig
123 --- linux-orig/arch/sparc/Kconfig 2004-10-18 16:53:05.000000000 -0500
124 +++ linux-orig2/arch/sparc/Kconfig 2004-10-22 11:33:06.891871022 -0500
126 source "crypto/Kconfig"
130 +source "cluster/Kconfig"
131 diff -urN linux-orig/arch/sparc64/Kconfig linux-orig2/arch/sparc64/Kconfig
132 --- linux-orig/arch/sparc64/Kconfig 2004-10-18 16:55:06.000000000 -0500
133 +++ linux-orig2/arch/sparc64/Kconfig 2004-10-22 11:33:19.290676599 -0500
135 source "crypto/Kconfig"
139 +source "cluster/Kconfig"
140 diff -urN linux-orig/arch/um/Kconfig linux-orig2/arch/um/Kconfig
141 --- linux-orig/arch/um/Kconfig 2004-10-18 16:54:08.000000000 -0500
142 +++ linux-orig2/arch/um/Kconfig 2004-10-22 11:29:33.564217823 -0500
147 +source "cluster/Kconfig"
152 diff -urN linux-orig/arch/x86_64/Kconfig linux-orig2/arch/x86_64/Kconfig
153 --- linux-orig/arch/x86_64/Kconfig 2004-10-18 16:54:55.000000000 -0500
154 +++ linux-orig2/arch/x86_64/Kconfig 2004-10-22 11:33:37.130396876 -0500
156 source "crypto/Kconfig"
160 +source "cluster/Kconfig"
161 diff -urN linux-orig/cluster/cman/Makefile linux-orig2/cluster/cman/Makefile
162 --- linux-orig/cluster/cman/Makefile 1969-12-31 18:00:00.000000000 -0600
163 +++ linux-orig2/cluster/cman/Makefile 2004-10-22 11:29:33.566217791 -0500
165 +cman-objs := cnxman.o config.o membership.o proc.o\
166 + sm_barrier.o sm_control.o sm_daemon.o sm_joinleave.o\
167 + sm_membership.o sm_message.o sm_misc.o sm_recover.o sm_services.o \
170 +obj-$(CONFIG_CLUSTER) := cman.o
171 diff -urN linux-orig/cluster/Kconfig linux-orig2/cluster/Kconfig
172 --- linux-orig/cluster/Kconfig 1969-12-31 18:00:00.000000000 -0600
173 +++ linux-orig2/cluster/Kconfig 2004-10-22 11:29:33.565217807 -0500
175 +menu "Cluster Support"
178 + tristate "Cluster support"
180 + Enable clustering support. This is not the high-performance clustering
181 + made famous by Beowulf. It is a high-availability cluster often using
183 + The cluster manager is the heart(beat) of the cluster system. It is
184 + needed by all the other components. It provides membership services
185 + for those other subsystems.
188 diff -urN linux-orig/cluster/Makefile linux-orig2/cluster/Makefile
189 --- linux-orig/cluster/Makefile 1969-12-31 18:00:00.000000000 -0600
190 +++ linux-orig2/cluster/Makefile 2004-10-22 11:29:33.566217791 -0500
192 +obj-y := nocluster.o
194 +obj-$(CONFIG_CLUSTER) += cman/
195 diff -urN linux-orig/cluster/nocluster.c linux-orig2/cluster/nocluster.c
196 --- linux-orig/cluster/nocluster.c 1969-12-31 18:00:00.000000000 -0600
197 +++ linux-orig2/cluster/nocluster.c 2004-10-22 11:29:33.567217776 -0500
200 + * cluster/nocluster.c
202 + * Copy from net/nonet.c
203 + * Dummy functions to allow us to configure cluster support entirely
204 + * out of the kernel.
206 + * Distributed under the terms of the GNU GPL version 2.
207 + * Copyright (c) Matthew Wilcox 2003
210 +#include <linux/module.h>
211 +#include <linux/errno.h>
212 +#include <linux/fs.h>
213 +#include <linux/init.h>
214 +#include <linux/kernel.h>
216 +void __init nocluster_init(void)
219 diff -urN linux-orig/Makefile linux-orig2/Makefile
220 --- linux-orig/Makefile 2004-10-18 16:54:38.000000000 -0500
221 +++ linux-orig2/Makefile 2004-10-22 11:29:33.507218717 -0500
224 # Objects we will link into vmlinux / subdirs we need to visit
226 -drivers-y := drivers/ sound/
227 +drivers-y := drivers/ sound/ cluster/
231 diff -urN linux-orig/cluster/cman/cnxman-private.h linux-patched/cluster/cman/cnxman-private.h
232 --- linux-orig/cluster/cman/cnxman-private.h 1970-01-01 07:30:00.000000000 +0730
233 +++ linux-patched/cluster/cman/cnxman-private.h 2004-11-03 11:37:37.000000000 +0800
235 +/******************************************************************************
236 +*******************************************************************************
238 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
239 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
241 +** This copyrighted material is made available to anyone wishing to use,
242 +** modify, copy, or redistribute it subject to the terms and conditions
243 +** of the GNU General Public License v.2.
245 +*******************************************************************************
246 +******************************************************************************/
248 +#ifndef __CNXMAN_PRIVATE_H
249 +#define __CNXMAN_PRIVATE_H
251 +/* Version triplet */
252 +#define CNXMAN_MAJOR_VERSION 3
253 +#define CNXMAN_MINOR_VERSION 0
254 +#define CNXMAN_PATCH_VERSION 1
256 +#define MAX_RETRIES 3 /* Maximum number of send retries */
257 +#define CAP_CLUSTER CAP_SYS_ADMIN /* Capability needed to manage the
261 +/* How we announce ourself in console events */
262 +#define CMAN_NAME "CMAN"
264 +/* One of these per AF_CLUSTER socket */
265 +struct cluster_sock {
266 + /* WARNING: sk has to be the first member */
269 + unsigned char port; /* Bound port or zero */
270 + int (*kernel_callback) (char *, int, char *, int, unsigned int);
271 + void *service_data;
274 +#define cluster_sk(__sk) ((struct cluster_sock *)__sk)
276 +/* We have one of these for each socket we use for communications */
277 +struct cl_comms_socket {
278 + struct socket *sock;
279 + int broadcast; /* This is a broadcast socket */
280 + int recv_only; /* This is the unicast receive end of a
281 + * multicast socket */
282 + struct sockaddr_in6 saddr; /* Socket address, contains the sockaddr for
283 + * the remote end(s) */
284 + int addr_len; /* Length of above */
285 + int number; /* Internal socket number, used to cycle around
286 + * sockets in case of network errors */
287 + struct file *file; /* file pointer for user-passed in sockets */
291 + /* The socket list */
292 + struct list_head list;
294 + /* On here when it has something to say */
295 + struct list_head active_list;
296 + unsigned long active;
299 +/* A client socket. We keep a list of these so we can notify clients of cluster
301 +struct cl_client_socket {
302 + struct socket *sock;
303 + struct list_head list;
306 +/* This structure is tacked onto the start of a cluster message packet for our
307 + * own nefarious purposes. */
308 +struct cl_protheader {
309 + unsigned char tgtport; /* Target port number */
310 + unsigned char srcport; /* Source (originating) port number */
311 + unsigned short seq; /* Packet sequence number, little-endian */
312 + unsigned short ack; /* Inline ACK */
313 + unsigned short cluster; /* Our cluster number, little-endian */
314 + unsigned int flags;
315 + int srcid; /* Node ID of the sender */
316 + int tgtid; /* Node ID of the target or 0 for multicast
320 +/* A cluster internal protocol message - port number 0 */
322 + struct cl_protheader header;
326 +/* A Cluster ACK message */
328 + struct cl_protheader header;
329 + unsigned char cmd; /* Always CLUSTER_CMD_ACK */
330 + unsigned char remport; /* Remote port number the original message was
332 + unsigned char aflags; /* ACK flags 0=OK, 1=No listener */
336 +/* A Cluster LISTENREQ/LISTENRESP message */
337 +struct cl_listenmsg {
338 + unsigned char cmd; /* CLUSTER_CMD_LISTENRESP/REQ */
339 + unsigned char target_port; /* Port to probe */
340 + unsigned char listening; /* Always 0 for LISTENREQ */
342 + unsigned short tag; /* PID of remote waiting process */
345 +/* A Cluster PORTCLOSED message */
346 +struct cl_closemsg {
347 + unsigned char cmd; /* CLUSTER_CMD_PORTCLOSED */
348 + unsigned char port;
351 +/* Structure of a newly dead node, passed from cnxman to kmembershipd */
352 +struct cl_new_dead_node {
353 + struct list_head list;
354 + struct cluster_node *node;
357 +/* Subcommands for BARRIER message */
358 +#define BARRIER_REGISTER 1
359 +#define BARRIER_CHANGE 2
360 +#define BARRIER_WAIT 4
361 +#define BARRIER_COMPLETE 5
363 +/* A Cluster BARRIER message */
364 +struct cl_barriermsg {
365 + unsigned char cmd; /* CLUSTER_CMD_BARRIER */
366 + unsigned char subcmd; /* BARRIER sub command */
367 + unsigned short pad;
368 + unsigned int flags;
369 + unsigned int nodes;
370 + char name[MAX_BARRIER_NAME_LEN];
373 +/* Membership services messages, the cl_protheader is added transparently */
374 +struct cl_mem_hello_msg {
376 + unsigned char flags;
377 + unsigned short members; /* Number of nodes in the cluster,
379 + unsigned int generation; /* Current cluster generation number */
382 +struct cl_mem_endtrans_msg {
384 + unsigned char pad1;
385 + unsigned short pad2;
386 + unsigned int quorum;
387 + unsigned int total_votes;
388 + unsigned int generation; /* Current cluster generation number */
389 + unsigned int new_node_id; /* If reason is a new node joining */
392 +/* ACK types for JOINACK message */
393 +#define JOINACK_TYPE_OK 1 /* You can join */
394 +#define JOINACK_TYPE_NAK 2 /* You can NOT join */
395 +#define JOINACK_TYPE_WAIT 3 /* Wait a bit longer - cluster is in transition
398 +struct cl_mem_joinack_msg {
400 + unsigned char acktype;
403 +/* This is used by JOINREQ message */
404 +struct cl_mem_join_msg {
406 + unsigned char votes;
407 + unsigned short num_addr; /* Number of addresses for this node */
408 + unsigned int expected_votes;
409 + unsigned int nodeid; /* node ID we want */
410 + unsigned int major_version; /* Not backwards compatible */
411 + unsigned int minor_version; /* Backwards compatible */
412 + unsigned int patch_version; /* Backwards/forwards compatible */
413 + unsigned int config_version;
414 + unsigned int addr_len; /* length of node addresses */
415 + char clustername[16];
416 + /* Followed by <num_addr> addresses of `address_length` bytes and a
417 + * NUL-terminated node name */
420 +/* State transition start reasons: */
421 +#define TRANS_NEWNODE 1 /* A new node is joining the cluster */
422 +#define TRANS_REMNODE 2 /* a node has left the cluster */
423 +#define TRANS_ANOTHERREMNODE 3 /* A node left the cluster while we were in
425 +#define TRANS_NEWMASTER 4 /* We have had an election and I am the new
427 +#define TRANS_CHECK 5 /* A consistency check was called for */
428 +#define TRANS_RESTART 6 /* Transition restarted because of a previous
430 +#define TRANS_DEADMASTER 7 /* The master died during transition and I have
433 +/* This is used to start a state transition */
434 +struct cl_mem_starttrans_msg {
436 + unsigned char reason; /* Why a start transition was started - see
438 + unsigned char flags;
439 + unsigned char votes;
440 + unsigned int expected_votes;
441 + unsigned int generation; /* Incremented for each STARTTRANS sent
443 + int nodeid; /* Node to be removed */
444 + unsigned short num_addrs;
445 + /* If reason == TRANS_NEWNODE: Followed by <num_addr> addresses of
446 + * `address_length` bytes and a NUL-terminated node name */
449 +struct cl_mem_startack_msg {
451 + unsigned char reason;
452 + unsigned short pad;
453 + unsigned int generation;
454 + unsigned int node_id; /* node_id we think new node should have */
455 + unsigned int highest_node_id; /* highest node_id on this system */
458 +/* Reconfigure a cluster parameter */
459 +struct cl_mem_reconfig_msg {
461 + unsigned char param;
462 + unsigned short pad;
463 + unsigned int value;
466 +/* Structure containing information about an outstanding listen request */
467 +struct cl_waiting_listen_request {
468 + wait_queue_head_t waitq;
471 + unsigned short tag;
473 + struct list_head list;
476 +/* Messages from membership services */
477 +#define CLUSTER_MEM_JOINCONF 1
478 +#define CLUSTER_MEM_JOINREQ 2
479 +#define CLUSTER_MEM_LEAVE 3
480 +#define CLUSTER_MEM_HELLO 4
481 +#define CLUSTER_MEM_KILL 5
482 +#define CLUSTER_MEM_JOINACK 6
483 +#define CLUSTER_MEM_ENDTRANS 7
484 +#define CLUSTER_MEM_RECONFIG 8
485 +#define CLUSTER_MEM_MASTERVIEW 9
486 +#define CLUSTER_MEM_STARTTRANS 10
487 +#define CLUSTER_MEM_JOINREJ 11
488 +#define CLUSTER_MEM_VIEWACK 12
489 +#define CLUSTER_MEM_STARTACK 13
490 +#define CLUSTER_MEM_TRANSITION 14
491 +#define CLUSTER_MEM_NEWCLUSTER 15
492 +#define CLUSTER_MEM_CONFACK 16
493 +#define CLUSTER_MEM_NOMINATE 17
495 +/* Flags in the HELLO message */
496 +#define HELLO_FLAG_MASTER 1
497 +#define HELLO_FLAG_QUORATE 2
499 +/* Parameters for RECONFIG command */
500 +#define RECONFIG_PARAM_EXPECTED_VOTES 1
501 +#define RECONFIG_PARAM_NODE_VOTES 2
502 +#define RECONFIG_PARAM_CONFIG_VERSION 3
504 +/* Data associated with an outgoing socket */
506 + struct file *file; /* The real file */
507 + struct socket *socket; /* The real sock */
508 + int num_nodes; /* On this link */
509 + int retransmit_count;
512 +/* There's one of these for each node in the cluster */
513 +struct cluster_node {
514 + struct list_head list;
515 + char *name; /* Node/host name of node */
516 + struct list_head addr_list;
517 + int us; /* This node is us */
518 + unsigned int node_id; /* Unique node ID */
520 + unsigned short last_seq_recv;
521 + unsigned short last_seq_acked;
522 + unsigned short last_seq_sent;
523 + unsigned int votes;
524 + unsigned int expected_votes;
525 + unsigned int leave_reason;
526 + unsigned int incarnation; /* Incremented each time a node joins
528 + unsigned long last_hello; /* Jiffies */
529 + struct timeval join_time;
532 +/* This is how we keep a list of user processes that are listening for cluster
533 + * membership events */
534 +struct notify_struct {
535 + struct list_head list;
540 +/* This is how we keep a list of kernel callbacks that are registered for
541 + * cluster membership events */
542 +struct kernel_notify_struct {
543 + struct list_head list;
544 + void (*callback) (kcl_callback_reason, long arg);
547 +/* A message waiting to be sent */
548 +struct queued_message {
549 + struct list_head list;
551 + struct socket *socket;
552 + struct sockaddr_cl addr;
555 + unsigned char port;
556 + unsigned int flags;
557 + char msg_buffer[MAX_CLUSTER_MESSAGE];
562 + struct list_head list;
564 + char name[MAX_BARRIER_NAME_LEN];
565 + unsigned int flags;
566 + enum { BARRIER_STATE_WAITING, BARRIER_STATE_INACTIVE,
567 + BARRIER_STATE_COMPLETE } state;
568 + unsigned int expected_nodes;
569 + unsigned int registered_nodes;
570 + atomic_t got_nodes;
571 + atomic_t completed_nodes;
572 + unsigned int inuse;
573 + unsigned int waitsent;
574 + unsigned int phase; /* Completion phase */
575 + unsigned int endreason; /* Reason we were woken, usually 0 */
576 + unsigned long timeout; /* In seconds */
578 + void (*callback) (char *name, int status);
579 + wait_queue_head_t waitq;
580 + struct semaphore lock; /* To synch with cnxman messages */
581 + spinlock_t phase2_spinlock; /* Need to synchronise with timer
583 + struct timer_list timer;
586 +/* Cluster protocol commands sent to port 0 */
587 +#define CLUSTER_CMD_ACK 1
588 +#define CLUSTER_CMD_LISTENREQ 2
589 +#define CLUSTER_CMD_LISTENRESP 3
590 +#define CLUSTER_CMD_PORTCLOSED 4
591 +#define CLUSTER_CMD_BARRIER 5
593 +extern struct cluster_node *find_node_by_addr(unsigned char *addr,
595 +extern struct cluster_node *find_node_by_nodeid(unsigned int id);
596 +extern struct cluster_node *find_node_by_name(char *name);
597 +extern void set_quorate(int);
598 +extern void notify_kernel_listeners(kcl_callback_reason reason, long arg);
599 +extern void notify_listeners(void);
600 +extern void free_nodeid_array(void);
601 +extern int send_reconfigure(int param, unsigned int value);
602 +extern int calculate_quorum(int, int, int *);
603 +extern void recalculate_quorum(int);
604 +extern int send_leave(unsigned char);
605 +extern int get_quorum(void);
606 +extern void set_votes(int, int);
607 +extern void kcl_wait_for_all_acks(void);
608 +extern char *membership_state(char *, int);
609 +extern char *leave_string(int reason);
610 +extern void a_node_just_died(struct cluster_node *node);
611 +extern void check_barrier_returns(void);
612 +extern int in_transition(void);
613 +extern void get_local_addresses(struct cluster_node *node);
614 +extern int add_node_address(struct cluster_node *node, unsigned char *addr, int len);
615 +extern void create_proc_entries(void);
616 +extern void cleanup_proc_entries(void);
617 +extern unsigned int get_highest_nodeid(void);
618 +extern int allocate_nodeid_array(void);
619 +extern void queue_oob_skb(struct socket *sock, int cmd);
620 +extern int new_temp_nodeid(char *addr, int addrlen);
621 +extern int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen);
622 +extern void purge_temp_nodeids(void);
623 +extern inline char *print_addr(unsigned char *addr, int len, char *buf)
628 + for (i = 0; i < len; i++)
629 + ptr += sprintf(buf + ptr, "%02x ", addr[i]);
634 +#define MAX_ADDR_PRINTED_LEN (address_length*3 + 1)
636 +/* Debug enabling macros. Sorry about the C++ comments but they're easier to
637 + * get rid of than C ones... */
639 +// #define DEBUG_MEMB
640 +// #define DEBUG_COMMS
641 +// #define DEBUG_BARRIER
645 +#define P_COMMS(fmt, args...) printk(KERN_DEBUG "cman comms: " fmt, ## args)
647 +#define P_COMMS(fmt, args...)
650 +#ifdef DEBUG_BARRIER
651 +#define P_BARRIER(fmt, args...) printk(KERN_DEBUG "cman barrier: " fmt, ## args)
653 +#define P_BARRIER(fmt, args...)
657 +#define P_MEMB(fmt, args...) printk(KERN_DEBUG "cman memb: " fmt, ## args)
658 +#define C_MEMB(fmt, args...) printk(fmt, ## args)
660 +#define P_MEMB(fmt, args...)
661 +#define C_MEMB(fmt, args...)
664 +#endif /* __KERNEL */
667 diff -urN linux-orig/cluster/cman/cnxman.c linux-patched/cluster/cman/cnxman.c
668 --- linux-orig/cluster/cman/cnxman.c 1970-01-01 07:30:00.000000000 +0730
669 +++ linux-patched/cluster/cman/cnxman.c 2004-11-03 11:37:37.000000000 +0800
671 +/******************************************************************************
672 +*******************************************************************************
674 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
675 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
677 +** This copyrighted material is made available to anyone wishing to use,
678 +** modify, copy, or redistribute it subject to the terms and conditions
679 +** of the GNU General Public License v.2.
681 +*******************************************************************************
682 +******************************************************************************/
684 +#define EXPORT_SYMTAB
685 +#include <linux/init.h>
686 +#include <linux/socket.h>
687 +#include <linux/kernel.h>
688 +#include <linux/sched.h>
689 +#include <linux/file.h>
690 +#include <linux/utsname.h>
691 +#include <net/sock.h>
692 +#include <linux/proc_fs.h>
693 +#include <linux/poll.h>
694 +#include <linux/module.h>
695 +#include <linux/list.h>
696 +#include <linux/uio.h>
697 +#include <cluster/cnxman.h>
698 +#include <cluster/service.h>
700 +#include "cnxman-private.h"
701 +#include "sm_control.h"
702 +#include "sm_user.h"
705 +#define CMAN_RELEASE_NAME "<CVS>"
707 +static void process_incoming_packet(struct cl_comms_socket *csock,
708 + struct msghdr *msg, struct kvec *vec, int veclen, int len);
709 +static int cl_sendack(struct cl_comms_socket *sock, unsigned short seq,
710 + int addr_len, char *addr, unsigned char remport,
711 + unsigned char flag);
712 +static void send_listen_request(int nodeid, unsigned char port);
713 +static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
714 + unsigned char port, unsigned short tag);
715 +static void resend_last_message(void);
716 +static void start_ack_timer(void);
717 +static int send_queued_message(struct queued_message *qmsg);
718 +static void send_port_close_oob(unsigned char port);
719 +static void post_close_oob(unsigned char port, int nodeid);
720 +static void process_barrier_msg(struct cl_barriermsg *msg,
721 + struct cluster_node *node);
722 +static struct cl_barrier *find_barrier(char *name);
723 +static void node_shutdown(void);
724 +static void node_cleanup(void);
725 +static int send_or_queue_message(struct socket *sock, void *buf, int len, struct sockaddr_cl *caddr,
726 + unsigned int flags);
727 +static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur);
728 +static void check_for_unacked_nodes(void);
729 +static void free_cluster_sockets(void);
730 +static uint16_t generate_cluster_id(char *name);
731 +static int is_valid_temp_nodeid(int nodeid);
733 +extern int start_membership_services(pid_t);
734 +extern int kcl_leave_cluster(int remove);
735 +extern int send_kill(int nodeid);
737 +static struct proto_ops cl_proto_ops;
738 +static struct sock *master_sock;
739 +static kmem_cache_t *cluster_sk_cachep;
741 +/* Pointer to the pseudo node that maintains quorum in a 2node system */
742 +struct cluster_node *quorum_device = NULL;
744 +/* Array of "ports" allocated. This is just a list of pointers to the sock that
745 + * has this port bound. Speed is a major issue here so 1-2K of allocated
746 + * storage is worth sacrificing. Port 0 is reserved for protocol messages */
747 +static struct sock *port_array[256];
748 +static struct semaphore port_array_lock;
750 +/* Our cluster name & number */
751 +uint16_t cluster_id;
752 +char cluster_name[MAX_CLUSTER_NAME_LEN+1];
754 +/* Two-node mode: causes cluster to remain quorate if one of two nodes fails.
755 + * No more than two nodes are permitted to join the cluster. */
756 +unsigned short two_node;
758 +/* Cluster configuration version that must be the same among members. */
759 +unsigned int config_version;
761 +/* Reference counting for cluster applications */
764 +/* Length of sockaddr address for our comms protocol */
765 +unsigned int address_length;
767 +/* Message sending */
768 +static unsigned short cur_seq; /* Last message sent */
769 +static unsigned int ack_count; /* Number of acks received for message
771 +static unsigned int acks_expected; /* Number of acks we expect to receive */
772 +static struct semaphore send_lock;
773 +static struct timer_list ack_timer;
775 +/* Saved packet information in case we need to resend it */
776 +static char saved_msg_buffer[MAX_CLUSTER_MESSAGE];
777 +static int saved_msg_len;
778 +static int retry_count;
780 +/* Task variables */
781 +static pid_t kcluster_pid;
782 +static pid_t membership_pid;
783 +extern struct task_struct *membership_task;
784 +extern int quit_threads;
786 +wait_queue_head_t cnxman_waitq;
788 +/* Variables owned by membership services */
789 +extern int cluster_members;
790 +extern struct list_head cluster_members_list;
791 +extern struct semaphore cluster_members_lock;
792 +extern int we_are_a_cluster_member;
793 +extern int cluster_is_quorate;
794 +extern struct cluster_node *us;
795 +extern struct list_head new_dead_node_list;
796 +extern struct semaphore new_dead_node_lock;
797 +extern char nodename[];
798 +extern int wanted_nodeid;
800 +/* A list of processes listening for membership events */
801 +static struct list_head event_listener_list;
802 +static struct semaphore event_listener_lock;
804 +/* A list of kernel callbacks listening for membership events */
805 +static struct list_head kernel_listener_list;
806 +static struct semaphore kernel_listener_lock;
808 +/* A list of sockets we are listening on (and can transmit on...later) */
809 +static struct list_head socket_list;
811 +/* A list of all open cluster client sockets */
812 +static struct list_head client_socket_list;
813 +static struct semaphore client_socket_lock;
815 +/* A list of all current barriers */
816 +static struct list_head barrier_list;
817 +static struct semaphore barrier_list_lock;
819 +/* When a socket is ready for reading it goes on this queue */
820 +static spinlock_t active_socket_lock;
821 +static struct list_head active_socket_list;
823 +/* If the cnxman process is running and available for work */
824 +atomic_t cnxman_running;
826 +/* Flags set by timers etc for the mainloop to detect and act upon */
827 +static unsigned long mainloop_flags;
829 +#define ACK_TIMEOUT 1
830 +#define RESEND_NEEDED 2
832 +/* A queue of messages waiting to be sent. If kcl_sendmsg is called outside of
833 + * process context then the messages get put in here */
834 +static struct list_head messages_list;
835 +static struct semaphore messages_list_lock;
837 +static struct semaphore start_thread_sem;
839 +/* List of outstanding ISLISTENING requests */
840 +static struct list_head listenreq_list;
841 +static struct semaphore listenreq_lock;
843 +/* Any sending requests wait on this queue if necessary (eg inquorate, waiting
845 +static DECLARE_WAIT_QUEUE_HEAD(socket_waitq);
847 +/* Wait for thread to exit properly */
848 +struct completion cluster_thread_comp;
849 +struct completion member_thread_comp;
851 +/* The resend delay to use; we increase this geometrically each time a
852 + * send is delayed. In deci-seconds */
853 +static int resend_delay = 1;
855 +/* Highest numbered interface and the current default */
856 +static int num_interfaces;
857 +static struct cl_comms_socket *current_interface = NULL;
862 + char addr[sizeof(struct sockaddr_in6)];
864 + struct list_head list;
866 +static struct list_head tempnode_list;
867 +static struct semaphore tempnode_lock;
870 +/* This is what's squirrelled away in skb->cb */
879 +/* Wake up any processes that are waiting to send. This is usually called when
880 + * all the ACKs have been gathered up or when a node has left the cluster
881 + * unexpectedly and we reckon there are no more acks to collect */
882 +static void unjam(void)
884 + wake_up_interruptible(&socket_waitq);
885 + wake_up_interruptible(&cnxman_waitq);
888 +/* Used by the data_ready routine to locate a connection given the socket */
889 +static inline struct cl_comms_socket *find_comms_by_sock(struct sock *sk)
891 + struct list_head *conlist;
893 + list_for_each(conlist, &socket_list) {
894 + struct cl_comms_socket *clsock =
895 + list_entry(conlist, struct cl_comms_socket, list);
896 + if (clsock->sock->sk == sk) {
903 +/* Data available on socket */
904 +static void cnxman_data_ready(struct sock *sk, int count_unused)
906 + struct cl_comms_socket *clsock = find_comms_by_sock(sk);
908 + if (clsock == NULL) /* ASSERT ?? */
911 + /* If we're already on the list then don't do it again */
912 + if (test_and_set_bit(1, &clsock->active))
915 + spin_lock_irq(&active_socket_lock);
916 + list_add(&clsock->active_list, &active_socket_list);
917 + spin_unlock_irq(&active_socket_lock);
919 + wake_up_interruptible(&cnxman_waitq);
922 +static int receive_message(struct cl_comms_socket *csock, char *iobuf)
926 + struct sockaddr_in6 sin;
929 + memset(&sin, 0, sizeof (sin));
931 + msg.msg_control = NULL;
932 + msg.msg_controllen = 0;
933 + msg.msg_name = &sin;
934 + msg.msg_namelen = sizeof (sin);
937 + vec.iov_len = MAX_CLUSTER_MESSAGE;
938 + vec.iov_base = iobuf;
940 + len = kernel_recvmsg(csock->sock, &msg,
941 + &vec, 1, MAX_CLUSTER_MESSAGE, MSG_DONTWAIT);
943 + vec.iov_base = iobuf;
946 + if (len > MAX_CLUSTER_MESSAGE) {
947 + printk(KERN_CRIT CMAN_NAME
948 + ": %d byte message far too big\n", len);
951 + process_incoming_packet(csock, &msg, &vec, 1, len);
954 + if (len != -EAGAIN)
955 + printk(KERN_CRIT CMAN_NAME ": recvmsg failed: %d\n",
961 +static int cluster_kthread(void *unused)
965 + struct list_head *socklist;
966 + struct cl_comms_socket *csock;
967 + wait_queue_t cnxman_waitq_head;
970 + daemonize("cman_comms");
972 + /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
973 + siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
974 + sigprocmask(SIG_BLOCK, &tmpsig, NULL);
976 + /* This is the waitq we can wake the process up with */
977 + init_waitqueue_head(&cnxman_waitq);
978 + init_waitqueue_entry(&cnxman_waitq_head, current);
979 + add_wait_queue(&cnxman_waitq, &cnxman_waitq_head);
981 + set_user_nice(current, -6);
983 + /* Allow the sockets to start receiving */
984 + list_for_each(socklist, &socket_list) {
985 + csock = list_entry(socklist, struct cl_comms_socket, list);
987 + clear_bit(1, &csock->active);
990 + iobuf = kmalloc(MAX_CLUSTER_MESSAGE, GFP_KERNEL);
992 + printk(KERN_CRIT CMAN_NAME
993 + ": Cannot allocate receive buffer for cluster comms\n");
997 + complete(&cluster_thread_comp);
1000 + struct list_head *temp;
1002 + /* Wait for activity on any of the sockets */
1003 + set_task_state(current, TASK_INTERRUPTIBLE);
1005 + if (list_empty(&active_socket_list))
1007 + set_task_state(current, TASK_RUNNING);
1012 + if (test_and_clear_bit(ACK_TIMEOUT, &mainloop_flags)) {
1013 + check_for_unacked_nodes();
1016 + /* Now receive any messages waiting for us */
1017 + spin_lock_irq(&active_socket_lock);
1018 + list_for_each_safe(socklist, temp, &active_socket_list) {
1020 + list_entry(socklist, struct cl_comms_socket,
1023 + list_del(&csock->active_list);
1024 + clear_bit(1, &csock->active);
1026 + spin_unlock_irq(&active_socket_lock);
1029 + len = receive_message(csock, iobuf);
1033 + spin_lock_irq(&active_socket_lock);
1036 + break; /* EOF on socket */
1038 + spin_unlock_irq(&active_socket_lock);
1040 + /* Resend any unacked messages */
1041 + if (test_and_clear_bit(RESEND_NEEDED, &mainloop_flags)
1042 + && acks_expected) {
1043 + resend_last_message();
1046 + /* Send any queued messages */
1047 + if (acks_expected == 0) {
1048 + struct list_head *temp;
1049 + struct list_head *msglist;
1051 + down(&messages_list_lock);
1052 + list_for_each_safe(msglist, temp, &messages_list) {
1053 + struct queued_message *qmsg =
1054 + list_entry(msglist, struct queued_message,
1056 + int status = send_queued_message(qmsg);
1058 + if (status >= 0) {
1059 + /* Succeeded, remove it from the queue */
1060 + list_del(&qmsg->list);
1063 + /* Did it fail horribly ?? */
1064 + if (status < 0 && status != -EAGAIN) {
1065 + printk(KERN_INFO CMAN_NAME
1066 + ": send_queued_message failed, error %d\n",
1068 + list_del(&qmsg->list);
1071 + break; /* Only send one message at a time */
1073 + up(&messages_list_lock);
1076 + if (signal_pending(current))
1079 + P_COMMS("closing down\n");
1081 + quit_threads = 1; /* force other thread to die too */
1083 + /* Wait for membership thread to finish, that way any
1084 + LEAVE message will get sent. */
1085 + wake_up_process(membership_task);
1086 + wait_for_completion(&member_thread_comp);
1090 + if (timer_pending(&ack_timer))
1091 + del_timer(&ack_timer);
1096 + complete(&cluster_thread_comp);
1100 +void notify_kernel_listeners(kcl_callback_reason reason, long arg)
1102 + struct kernel_notify_struct *knotify;
1103 + struct list_head *proclist;
1105 + down(&kernel_listener_lock);
1106 + list_for_each(proclist, &kernel_listener_list) {
1108 + list_entry(proclist, struct kernel_notify_struct, list);
1109 + knotify->callback(reason, arg);
1111 + up(&kernel_listener_lock);
1114 +static void check_for_unacked_nodes()
1116 + struct list_head *nodelist;
1117 + struct list_head *temp;
1118 + struct cluster_node *node;
1120 + clear_bit(RESEND_NEEDED, &mainloop_flags);
1123 + P_COMMS("Retry count exceeded -- looking for dead node\n");
1125 + /* Node did not ACK a message after <n> tries, remove it from the
1127 + down(&cluster_members_lock);
1128 + list_for_each_safe(nodelist, temp, &cluster_members_list) {
1129 + node = list_entry(nodelist, struct cluster_node, list);
1131 + P_COMMS("checking node %s: last_acked = %d, last_seq_sent = %d\n",
1132 + node->name, node->last_seq_acked, node->last_seq_sent);
1133 + if (node->state != NODESTATE_DEAD &&
1134 + node->last_seq_acked != node->last_seq_sent && !node->us) {
1135 + printk(KERN_WARNING CMAN_NAME
1136 + ": node %s is not responding - removing from the cluster\n",
1139 + /* Drop this lock or we can deadlock with membership */
1140 + up(&cluster_members_lock);
1142 + /* Start a state transition */
1143 + a_node_just_died(node);
1144 + down(&cluster_members_lock);
1147 + up(&cluster_members_lock);
1148 + acks_expected = ack_count = 0;
1153 +static void ack_timer_fn(unsigned long arg)
1155 + P_COMMS("%ld: ack_timer fired, retries=%d\n", jiffies, retry_count);
1157 + /* Too many retries ? */
1158 + if (++retry_count > MAX_RETRIES) {
1159 + set_bit(ACK_TIMEOUT, &mainloop_flags);
1160 + wake_up_interruptible(&cnxman_waitq);
1163 + /* Resend last message */
1164 + set_bit(RESEND_NEEDED, &mainloop_flags);
1165 + wake_up_interruptible(&cnxman_waitq);
1169 +/* Called to resend a packet if sock_sendmsg was busy */
1170 +static void short_timer_fn(unsigned long arg)
1172 + P_COMMS("short_timer fired\n");
1174 + /* Resend last message */
1175 + resend_delay <<= 1;
1176 + set_bit(RESEND_NEEDED, &mainloop_flags);
1177 + wake_up_interruptible(&cnxman_waitq);
1180 +static void start_ack_timer()
1182 + ack_timer.function = ack_timer_fn;
1183 + ack_timer.data = 0L;
1184 + mod_timer(&ack_timer, jiffies + HZ);
1187 +static void start_short_timer(void)
1189 + ack_timer.function = short_timer_fn;
1190 + ack_timer.data = 0L;
1191 + mod_timer(&ack_timer, jiffies + (resend_delay * HZ));
1195 +static struct cl_waiting_listen_request *find_listen_request(unsigned short tag)
1197 + struct list_head *llist;
1198 + struct cl_waiting_listen_request *listener;
1200 + list_for_each(llist, &listenreq_list) {
1201 + listener = list_entry(llist, struct cl_waiting_listen_request,
1203 + if (listener->tag == tag) {
1210 +static void process_ack(struct cluster_node *rem_node, unsigned short seq)
1212 + if (rem_node && rem_node->state != NODESTATE_DEAD) {
1213 + /* This copes with duplicate acks from a multipathed
1215 + if (rem_node->last_seq_acked !=
1216 + le16_to_cpu(seq)) {
1217 + rem_node->last_seq_acked =
1221 + if (++ack_count >= acks_expected) {
1223 + /* Cancel the timer */
1224 + del_timer(&ack_timer);
1225 + acks_expected = 0;
1232 +static void process_cnxman_message(struct cl_comms_socket *csock, char *data,
1233 + int len, char *addr, int addrlen,
1234 + struct cluster_node *rem_node)
1236 + struct cl_protmsg *msg = (struct cl_protmsg *) data;
1237 + struct cl_protheader *header = (struct cl_protheader *) data;
1238 + struct cl_ackmsg *ackmsg;
1239 + struct cl_listenmsg *listenmsg;
1240 + struct cl_closemsg *closemsg;
1241 + struct cl_barriermsg *barriermsg;
1242 + struct cl_waiting_listen_request *listen_request;
1244 + P_COMMS("Message on port 0 is %d\n", msg->cmd);
1245 + switch (msg->cmd) {
1246 + case CLUSTER_CMD_ACK:
1247 + ackmsg = (struct cl_ackmsg *) data;
1249 + if (rem_node && (ackmsg->aflags & 1)) {
1250 + if (net_ratelimit())
1251 + printk(KERN_INFO CMAN_NAME
1252 + ": WARNING no listener for port %d on node %s\n",
1253 + ackmsg->remport, rem_node->name);
1255 + P_COMMS("Got ACK from %s. seq=%d (cur=%d)\n",
1256 + rem_node ? rem_node->name : "Unknown",
1257 + le16_to_cpu(ackmsg->header.ack), cur_seq);
1259 + /* ACK processing has already happened */
1262 + /* Return 1 if we have a listener on this port, 0 if not */
1263 + case CLUSTER_CMD_LISTENREQ:
1265 + (struct cl_listenmsg *) (data +
1266 + sizeof (struct cl_protheader));
1267 + cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1268 + send_listen_response(csock, le32_to_cpu(header->srcid),
1269 + listenmsg->target_port, listenmsg->tag);
1272 + case CLUSTER_CMD_LISTENRESP:
1273 + /* Wake up process waiting for listen response */
1275 + (struct cl_listenmsg *) (data +
1276 + sizeof (struct cl_protheader));
1277 + cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1278 + down(&listenreq_lock);
1279 + listen_request = find_listen_request(listenmsg->tag);
1280 + if (listen_request) {
1281 + listen_request->result = listenmsg->listening;
1282 + listen_request->waiting = 0;
1283 + wake_up_interruptible(&listen_request->waitq);
1285 + up(&listenreq_lock);
1288 + case CLUSTER_CMD_PORTCLOSED:
1290 + (struct cl_closemsg *) (data +
1291 + sizeof (struct cl_protheader));
1292 + cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1293 + post_close_oob(closemsg->port, le32_to_cpu(header->srcid));
1296 + case CLUSTER_CMD_BARRIER:
1298 + (struct cl_barriermsg *) (data +
1299 + sizeof (struct cl_protheader));
1300 + cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1302 + process_barrier_msg(barriermsg, rem_node);
1306 + printk(KERN_ERR CMAN_NAME
1307 + ": Unknown protocol message %d received\n", msg->cmd);
1314 +static int valid_addr_for_node(struct cluster_node *node, char *addr)
1316 + struct list_head *addrlist;
1317 + struct cluster_node_addr *nodeaddr;
1319 + /* We don't compare the first two bytes of the address because it's
1320 + * the Address Family and always in native byte order...so it will
1321 + * not match if we have mixed big & little-endian machines in the cluster
1324 + list_for_each(addrlist, &node->addr_list) {
1325 + nodeaddr = list_entry(addrlist, struct cluster_node_addr, list);
1327 + if (memcmp(nodeaddr->addr+2, addr+2, address_length-2) == 0)
1328 + return 1; /* TRUE */
1330 + return 0; /* FALSE */
1333 +static void memcpy_fromkvec(void *data, struct kvec *vec, int len)
1336 + if (vec->iov_len) {
1337 + int copy = min_t(unsigned int, len, vec->iov_len);
1338 + memcpy(data, vec->iov_base, copy);
1341 + vec->iov_base += copy;
1342 + vec->iov_len -= copy;
1348 +static int send_to_user_port(struct cl_comms_socket *csock,
1349 + struct cl_protheader *header,
1350 + struct msghdr *msg,
1351 + struct kvec *iov, int veclen,
1354 + struct sk_buff *skb;
1355 + struct cb_info *cbinfo;
1358 + /* Get the port number and look for a listener */
1359 + down(&port_array_lock);
1360 + if (port_array[header->tgtport]) {
1361 + struct cluster_sock *c = cluster_sk(port_array[header->tgtport]);
1364 + if (!(header->flags & MSG_NOACK) &&
1365 + !(header->flags & MSG_REPLYEXP)) {
1367 + cl_sendack(csock, header->seq, msg->msg_namelen,
1368 + msg->msg_name, header->tgtport, 0);
1371 + /* Call a callback if there is one */
1372 + if (c->kernel_callback) {
1373 + up(&port_array_lock);
1374 + if (veclen == 1) {
1375 + c->kernel_callback(iov->iov_base,
1377 + msg->msg_name, msg->msg_namelen,
1378 + le32_to_cpu(header->srcid));
1381 + else { /* Unroll iov, this Hardly ever Happens */
1383 + data = kmalloc(len, GFP_KERNEL);
1387 + memcpy_fromkvec(data, iov, len);
1388 + c->kernel_callback(data, len,
1389 + msg->msg_name, msg->msg_namelen,
1390 + le32_to_cpu(header->srcid));
1396 + /* Otherwise put it into an SKB and pass it onto the recvmsg
1398 + skb = alloc_skb(len, GFP_KERNEL);
1400 + up(&port_array_lock);
1401 + printk(KERN_INFO CMAN_NAME
1402 + ": Failed to allocate skb\n");
1406 + skb_put(skb, len);
1407 + memcpy_fromkvec(skb->data, iov, len);
1409 + /* Put metadata into cb[] */
1410 + cbinfo = (struct cb_info *)skb->cb;
1411 + cbinfo->orig_nodeid = le32_to_cpu(header->srcid);
1412 + cbinfo->orig_port = header->srcport;
1416 + sock_queue_rcv_skb(port_array[header->tgtport], skb)) < 0) {
1418 + printk(KERN_INFO CMAN_NAME
1419 + ": Error queueing request to port %d: %d\n",
1420 + header->tgtport, err);
1423 + /* If the port was MEMBERSHIP then we have to die */
1424 + if (header->tgtport == CLUSTER_PORT_MEMBERSHIP) {
1425 + up(&port_array_lock);
1426 + send_leave(CLUSTER_LEAVEFLAG_PANIC);
1427 + panic("membership stopped responding");
1430 + up(&port_array_lock);
1434 + /* ACK it, but set the flag bit so remote end knows no-one
1436 + if (!(header->flags & MSG_NOACK))
1437 + cl_sendack(csock, header->seq,
1438 + msg->msg_namelen, msg->msg_name,
1439 + header->tgtport, 1);
1441 + /* Nobody listening, drop it */
1442 + up(&port_array_lock);
1447 +/* NOTE: This routine knows (assumes!) that there is only one
1448 + iov element passed into it. */
1449 +static void process_incoming_packet(struct cl_comms_socket *csock,
1450 + struct msghdr *msg,
1451 + struct kvec *vec, int veclen, int len)
1453 + char *data = vec->iov_base;
1454 + char *addr = msg->msg_name;
1455 + int addrlen = msg->msg_namelen;
1456 + struct cl_protheader *header = (struct cl_protheader *) data;
1457 + struct cluster_node *rem_node =
1458 + find_node_by_nodeid(le32_to_cpu(header->srcid));
1460 + P_COMMS("seen message, from %d for %d, sequence num = %d, rem_node=%p, state=%d\n",
1461 + le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
1462 + le16_to_cpu(header->seq), rem_node,
1463 + rem_node ? rem_node->state : -1);
1465 + /* If the remote end is being coy about its node ID then look it up by
1467 + if (!rem_node && header->srcid == 0) {
1468 + rem_node = find_node_by_addr(addr, addrlen);
1471 + /* If this node is an ex-member then treat it as unknown */
1472 + if (rem_node && rem_node->state != NODESTATE_MEMBER
1473 + && rem_node->state != NODESTATE_JOINING)
1476 + /* Ignore messages not for our cluster */
1477 + if (le16_to_cpu(header->cluster) != cluster_id) {
1478 + P_COMMS("Dumping message - wrong cluster ID (us=%d, msg=%d)\n",
1479 + cluster_id, header->cluster);
1480 + goto incoming_finish;
1483 + /* If the message is from us then just dump it */
1484 + if (rem_node && rem_node->us)
1485 + goto incoming_finish;
1487 + /* If we can't find the nodeid then check for our own messages the hard
1488 + * way - this only happens during joining */
1490 + struct list_head *socklist;
1491 + struct cl_comms_socket *clsock;
1493 + list_for_each(socklist, &socket_list) {
1495 + list_entry(socklist, struct cl_comms_socket, list);
1497 + if (clsock->recv_only) {
1499 + if (memcmp(addr, &clsock->saddr, address_length) == 0) {
1500 + goto incoming_finish;
1507 + /* Ignore messages not for us */
1508 + if (le32_to_cpu(header->tgtid) > 0 && us
1509 + && le32_to_cpu(header->tgtid) != us->node_id) {
1510 + goto incoming_finish;
1513 + P_COMMS("got message, from %d for %d, sequence num = %d\n",
1514 + le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
1515 + le16_to_cpu(header->seq));
1517 + if (header->ack && rem_node) {
1518 + process_ack(rem_node, header->ack);
1521 + /* Have we received this message before ? If so just ignore it, it's a
1522 + * resend for someone else's benefit */
1523 + if (!(header->flags & MSG_NOACK) &&
1524 + rem_node && le16_to_cpu(header->seq) == rem_node->last_seq_recv) {
1526 + ("Discarding message - Already seen this sequence number %d\n",
1527 + rem_node->last_seq_recv);
1528 + /* Still need to ACK it though, in case it was the ACK that got
1530 + cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1531 + goto incoming_finish;
1534 + /* Check that the message is from the node we think it is from */
1535 + if (rem_node && !valid_addr_for_node(rem_node, addr)) {
1539 + /* If it's a new node then assign it a temporary node ID */
1541 + header->srcid = cpu_to_le32(new_temp_nodeid(addr, addrlen));
1543 + P_COMMS("Got message: flags = %x, port = %d, we_are_a_member = %d\n",
1544 + header->flags, header->tgtport, we_are_a_cluster_member);
1547 + /* If we are not part of the cluster then ignore multicast messages
1548 + * that need an ACK as we will confuse the sender who is only expecting
1549 + * ACKS from bona fide members */
1550 + if ((header->flags & MSG_MULTICAST) &&
1551 + !(header->flags & MSG_NOACK) && !we_are_a_cluster_member) {
1553 + ("Discarding message - multicast and we are not a cluster member. port=%d flags=%x\n",
1554 + header->tgtport, header->flags);
1555 + goto incoming_finish;
1558 + /* Save the sequence number of this message so we can ignore duplicates
1560 + if (!(header->flags & MSG_NOACK) && rem_node) {
1561 + P_COMMS("Saving seq %d for node %s\n", le16_to_cpu(header->seq),
1563 + rem_node->last_seq_recv = le16_to_cpu(header->seq);
1566 + /* Is it a protocol message? */
1567 + if (header->tgtport == 0) {
1568 + process_cnxman_message(csock, data, len, addr, addrlen,
1570 + goto incoming_finish;
1573 + /* Skip past the header to the data */
1574 + vec[0].iov_base = data + sizeof (struct cl_protheader);
1575 + vec[0].iov_len -= sizeof (struct cl_protheader);
1576 + len -= sizeof (struct cl_protheader);
1578 + send_to_user_port(csock, header, msg, vec, veclen, len);
1584 +static struct sock *cl_alloc_sock(struct socket *sock, int gfp)
1587 + struct cluster_sock *c;
1590 + sk_alloc(AF_CLUSTER, gfp, sizeof (struct cluster_sock),
1591 + cluster_sk_cachep)) == NULL)
1595 + sock->ops = &cl_proto_ops;
1597 + sock_init_data(sock, sk);
1599 + sk->sk_destruct = NULL;
1600 + sk->sk_no_check = 1;
1601 + sk->sk_family = PF_CLUSTER;
1602 + sk->sk_allocation = gfp;
1604 + c = cluster_sk(sk);
1606 + c->service_data = NULL;
1613 +static int cl_release(struct socket *sock)
1615 + struct sock *sk = sock->sk;
1616 + struct cl_client_socket *csock;
1617 + struct list_head *socklist;
1618 + struct list_head *tmp;
1620 + down(&client_socket_lock);
1622 + /* Remove port allocations if it's a bound socket */
1623 + struct cluster_sock *c = cluster_sk(sk);
1625 + down(&port_array_lock);
1627 + port_array[c->port] = NULL;
1629 + up(&port_array_lock);
1631 + /* Tell other nodes in the cluster that this listener is going
1633 + if (atomic_read(&cnxman_running) && c->port)
1634 + send_port_close_oob(c->port);
1636 + if (c->service_data)
1637 + sm_sock_release(sock);
1639 + /* Master socket released ? */
1640 + if (sk->sk_protocol == CLPROTO_MASTER) {
1641 + master_sock = NULL;
1643 + /* If this socket is being freed and cnxman is not
1644 + * started then free all the comms sockets as either
1645 + * the userland "join" process has crashed or the
1648 + if (!atomic_read(&cnxman_running)) {
1650 + free_cluster_sockets();
1663 + /* Remove it from the list of clients */
1664 + list_for_each_safe(socklist, tmp, &client_socket_list) {
1665 + csock = list_entry(socklist, struct cl_client_socket, list);
1667 + if (csock->sock == sock) {
1668 + list_del(&csock->list);
1673 + up(&client_socket_lock);
1678 +static int cl_create(struct socket *sock, int protocol)
1682 + /* All are datagrams */
1683 + if (sock->type != SOCK_DGRAM)
1684 + return -ESOCKTNOSUPPORT;
1686 + if (protocol == CLPROTO_MASTER && !capable(CAP_CLUSTER))
1689 + /* Can only have one master socket */
1690 + if (master_sock && protocol == CLPROTO_MASTER)
1693 + /* cnxman not running and a client was requested */
1694 + if (!atomic_read(&cnxman_running) && protocol != CLPROTO_MASTER)
1697 + if ((sk = cl_alloc_sock(sock, GFP_KERNEL)) == NULL)
1700 + sk->sk_protocol = protocol;
1702 + if (protocol == CLPROTO_MASTER)
1705 + /* Add client sockets to the list */
1706 + if (protocol == CLPROTO_CLIENT) {
1707 + struct cl_client_socket *clsock =
1708 + kmalloc(sizeof (struct cl_client_socket), GFP_KERNEL);
1713 + clsock->sock = sock;
1714 + down(&client_socket_lock);
1715 + list_add(&clsock->list, &client_socket_list);
1716 + up(&client_socket_lock);
1722 +static int cl_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1724 + struct sock *sk = sock->sk;
1725 + struct sockaddr_cl *saddr = (struct sockaddr_cl *) uaddr;
1726 + struct cluster_sock *c = cluster_sk(sk);
1728 + if (!capable(CAP_NET_BIND_SERVICE))
1731 + if (sk->sk_zapped == 0)
1734 + if (addr_len != sizeof (struct sockaddr_cl))
1737 + if (saddr->scl_family != AF_CLUSTER)
1740 + if (saddr->scl_port == 0)
1741 + return -EINVAL; /* Port 0 is reserved for protocol messages */
1743 + down(&port_array_lock);
1745 + if (port_array[saddr->scl_port]) {
1746 + up(&port_array_lock);
1747 + return -EADDRINUSE;
1750 + port_array[saddr->scl_port] = sk;
1752 + up(&port_array_lock);
1754 + c->port = saddr->scl_port;
1755 + sk->sk_zapped = 0;
1757 + /* If we are not a cluster member yet then make the client wait until
1758 + * we are, this allows nodes to start cluster clients at the same time
1759 + * as cluster services but they will wait until membership is achieved.
1760 + * This looks odd in bind() (open would seem more obvious) but we need
1761 + * to know which port number is being used so that things like
1762 + * membership services don't get blocked
1765 + if (saddr->scl_port > HIGH_PROTECTED_PORT)
1766 + while (!we_are_a_cluster_member || !cluster_is_quorate
1767 + || in_transition()) {
1768 + DECLARE_WAITQUEUE(wq, current);
1769 + struct task_struct *tsk = current;
1771 + set_task_state(tsk, TASK_INTERRUPTIBLE);
1772 + add_wait_queue(&socket_waitq, &wq);
1774 + if (!we_are_a_cluster_member || !cluster_is_quorate
1775 + || in_transition())
1778 + set_task_state(tsk, TASK_RUNNING);
1779 + remove_wait_queue(&socket_waitq, &wq);
1781 + /* We were woken up because the cluster is going down,
1782 + * ...and we never got a chance to do any work! (sob) */
1783 + if (atomic_read(&cnxman_running) == 0 || quit_threads) {
1791 +static int cl_getname(struct socket *sock, struct sockaddr *uaddr,
1792 + int *uaddr_len, int peer)
1794 + struct sockaddr_cl *sa = (struct sockaddr_cl *) uaddr;
1795 + struct sock *sk = sock->sk;
1796 + struct cluster_sock *c = cluster_sk(sk);
1798 + *uaddr_len = sizeof (struct sockaddr_cl);
1802 + sa->scl_port = c->port;
1803 + sa->scl_flags = 0;
1804 + sa->scl_family = AF_CLUSTER;
1811 +static unsigned int cl_poll(struct file *file, struct socket *sock,
1812 + poll_table * wait)
1814 + return datagram_poll(file, sock, wait);
1817 +/* Copy internal node format to userland format */
1818 +void copy_to_usernode(struct cluster_node *node,
1819 + struct cl_cluster_node *unode)
1821 + strcpy(unode->name, node->name);
1822 + unode->size = sizeof (struct cl_cluster_node);
1823 + unode->votes = node->votes;
1824 + unode->state = node->state;
1825 + unode->us = node->us;
1826 + unode->node_id = node->node_id;
1827 + unode->leave_reason = node->leave_reason;
1828 + unode->incarnation = node->incarnation;
1831 +static int add_clsock(int broadcast, int number, struct socket *sock,
1832 + struct file *file)
1834 + struct cl_comms_socket *newsock =
1835 + kmalloc(sizeof (struct cl_comms_socket), GFP_KERNEL);
1839 + memset(newsock, 0, sizeof (*newsock));
1840 + newsock->number = number;
1841 + newsock->sock = sock;
1843 + newsock->broadcast = 1;
1844 + newsock->recv_only = 0;
1847 + newsock->broadcast = 0;
1848 + newsock->recv_only = 1;
1851 + newsock->file = file;
1852 + newsock->addr_len = sizeof(struct sockaddr_in6);
1854 + /* Mark it active until cnxman thread is running and ready to process
1856 + set_bit(1, &newsock->active);
1858 + /* Find out what it's bound to */
1859 + newsock->sock->ops->getname(newsock->sock,
1860 + (struct sockaddr *)&newsock->saddr,
1861 + &newsock->addr_len, 0);
1863 + num_interfaces = max(num_interfaces, newsock->number);
1864 + if (!current_interface && newsock->broadcast)
1865 + current_interface = newsock;
1867 + /* Hook data_ready */
1868 + newsock->sock->sk->sk_data_ready = cnxman_data_ready;
1870 + /* Make an attempt to keep them in order */
1871 + list_add_tail(&newsock->list, &socket_list);
1873 + address_length = newsock->addr_len;
1877 +/* ioctl processing functions */
1879 +static int do_ioctl_set_version(unsigned long arg)
1881 + struct cl_version version, *u_version;
1883 + if (!capable(CAP_CLUSTER))
1888 + u_version = (struct cl_version *) arg;
1890 + if (copy_from_user(&version, u_version, sizeof(struct cl_version)))
1893 + if (version.major != CNXMAN_MAJOR_VERSION ||
1894 + version.minor != CNXMAN_MINOR_VERSION ||
1895 + version.patch != CNXMAN_PATCH_VERSION)
1898 + if (config_version == version.config)
1901 + config_version = version.config;
1902 + send_reconfigure(RECONFIG_PARAM_CONFIG_VERSION, config_version);
1906 +static int do_ioctl_get_members(unsigned long arg)
1908 + struct cluster_node *node;
1909 + /* Kernel copies */
1910 + struct cl_cluster_node user_format_node;
1911 + struct cl_cluster_nodelist user_format_nodelist;
1912 + /* User space array ptr */
1913 + struct cl_cluster_node *user_node;
1914 + struct list_head *nodelist;
1915 + int num_nodes = 0;
1918 + return cluster_members;
1920 + if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
1923 + down(&cluster_members_lock);
1925 + if (user_format_nodelist.max_members < cluster_members) {
1926 + up(&cluster_members_lock);
1930 + user_node = user_format_nodelist.nodes;
1932 + list_for_each(nodelist, &cluster_members_list) {
1933 + node = list_entry(nodelist, struct cluster_node, list);
1934 + if (node->state == NODESTATE_MEMBER) {
1935 + copy_to_usernode(node, &user_format_node);
1936 + if (copy_to_user(user_node, &user_format_node,
1937 + sizeof (struct cl_cluster_node))) {
1938 + up(&cluster_members_lock);
1945 + up(&cluster_members_lock);
1950 +static int do_ioctl_get_all_members(unsigned long arg)
1952 + struct cluster_node *node;
1953 + /* Kernel copies */
1954 + struct cl_cluster_node user_format_node;
1955 + struct cl_cluster_nodelist user_format_nodelist;
1956 + /* User space array ptr*/
1957 + struct cl_cluster_node *user_node;
1958 + struct list_head *nodelist;
1959 + int num_nodes = 0;
1961 + if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
1964 + down(&cluster_members_lock);
1966 + user_node = user_format_nodelist.nodes;
1968 + list_for_each(nodelist, &cluster_members_list) {
1969 + node = list_entry(nodelist, struct cluster_node, list);
1971 + copy_to_usernode(node,
1972 + &user_format_node);
1974 + if (copy_to_user(user_node, &user_format_node,
1975 + sizeof (struct cl_cluster_node))) {
1976 + up(&cluster_members_lock);
1980 + if (--user_format_nodelist.max_members < 0) {
1981 + num_nodes = -EFAULT;
1989 + up(&cluster_members_lock);
1995 +static int do_ioctl_get_cluster(unsigned long arg)
1997 + struct cl_cluster_info __user *info;
1999 + info = (struct cl_cluster_info *)arg;
2001 + if (copy_to_user(&info->number, &cluster_id, sizeof(cluster_id)))
2004 + if (copy_to_user(&info->name, cluster_name, strlen(cluster_name)+1))
2010 +static int do_ioctl_get_node(unsigned long arg)
2012 + struct cluster_node *node;
2013 + struct cl_cluster_node k_node, *u_node;
2015 + u_node = (struct cl_cluster_node *) arg;
2017 + if (copy_from_user(&k_node, u_node, sizeof(struct cl_cluster_node)))
2020 + if (!k_node.name[0]) {
2021 + if (k_node.node_id == 0)
2022 + k_node.node_id = us->node_id;
2023 + node = find_node_by_nodeid(k_node.node_id);
2026 + node = find_node_by_name(k_node.name);
2031 + copy_to_usernode(node, &k_node);
2033 + if (copy_to_user(u_node, &k_node, sizeof(struct cl_cluster_node)))
2039 +static int do_ioctl_set_expected(unsigned long arg)
2041 + struct list_head *nodelist;
2042 + struct cluster_node *node;
2043 + unsigned int total_votes;
2044 + unsigned int newquorum;
2046 + if (!capable(CAP_CLUSTER))
2051 + newquorum = calculate_quorum(1, arg, &total_votes);
2053 + if (newquorum < total_votes / 2
2054 + || newquorum > total_votes) {
2059 + down(&cluster_members_lock);
2060 + list_for_each(nodelist, &cluster_members_list) {
2061 + node = list_entry(nodelist, struct cluster_node, list);
2062 + if (node->state == NODESTATE_MEMBER
2063 + && node->expected_votes > arg) {
2064 + node->expected_votes = arg;
2067 + up(&cluster_members_lock);
2069 + recalculate_quorum(1);
2071 + send_reconfigure(RECONFIG_PARAM_EXPECTED_VOTES, arg);
2072 + sm_member_update(cluster_is_quorate);
2077 +static int do_ioctl_kill_node(unsigned long arg)
2079 + struct cluster_node *node;
2081 + if (!capable(CAP_CLUSTER))
2085 + if ((node = find_node_by_nodeid(arg)) == NULL)
2088 + /* Can't kill us */
2092 + if (node->state != NODESTATE_MEMBER)
2095 + /* Just in case it is alive, send a KILL message */
2098 + node->leave_reason = CLUSTER_LEAVEFLAG_KILLED;
2099 + a_node_just_died(node);
2104 +static int do_ioctl_barrier(unsigned long arg)
2106 + struct cl_barrier_info info;
2108 + if (!capable(CAP_CLUSTER))
2111 + if (copy_from_user(&info, (void *)arg, sizeof(info)) != 0)
2114 + switch (info.cmd) {
2115 + case BARRIER_IOCTL_REGISTER:
2116 + return kcl_barrier_register(info.name,
2119 + case BARRIER_IOCTL_CHANGE:
2120 + return kcl_barrier_setattr(info.name,
2123 + case BARRIER_IOCTL_WAIT:
2124 + return kcl_barrier_wait(info.name);
2125 + case BARRIER_IOCTL_DELETE:
2126 + return kcl_barrier_delete(info.name);
2132 +static int do_ioctl_islistening(unsigned long arg)
2134 + DECLARE_WAITQUEUE(wq, current);
2135 + struct cl_listen_request rq;
2136 + struct cluster_node *rem_node;
2139 + struct cl_waiting_listen_request *listen_request;
2144 + if (copy_from_user(&rq, (void *) arg, sizeof (rq)) != 0)
2147 + nodeid = rq.nodeid;
2149 + nodeid = us->node_id;
2151 + rem_node = find_node_by_nodeid(nodeid);
2153 + /* Node not in the cluster */
2157 + if (rem_node->state != NODESTATE_MEMBER)
2160 + /* If the request is for us then just look in the ports
2163 + return (port_array[rq.port] != 0) ? 1 : 0;
2165 + /* For a remote node we need to send a request out */
2167 + /* If we are in transition then wait until we are not */
2168 + while (in_transition()) {
2169 + set_task_state(current, TASK_INTERRUPTIBLE);
2170 + add_wait_queue(&socket_waitq, &wq);
2172 + if (in_transition())
2175 + set_task_state(current, TASK_RUNNING);
2176 + remove_wait_queue(&socket_waitq, &wq);
2178 + if (signal_pending(current))
2182 + /* Were we shut down before it completed ? */
2183 + if (!atomic_read(&cnxman_running))
2187 + kmalloc(sizeof (struct cl_waiting_listen_request),
2189 + if (!listen_request)
2192 + /* Build the request */
2193 + listen_request->waiting = 1;
2194 + listen_request->result = 0;
2195 + listen_request->tag = current->pid;
2196 + listen_request->nodeid = nodeid;
2197 + init_waitqueue_head(&listen_request->waitq);
2199 + down(&listenreq_lock);
2200 + list_add(&listen_request->list, &listenreq_list);
2201 + up(&listenreq_lock);
2203 + /* Now wait for the response to come back */
2204 + send_listen_request(rq.nodeid, rq.port);
2206 + while (listen_request->waiting) {
2207 + set_task_state(current, TASK_INTERRUPTIBLE);
2208 + add_wait_queue(&listen_request->waitq, &wq);
2210 + if (listen_request->waiting)
2213 + set_task_state(current, TASK_RUNNING);
2214 + remove_wait_queue(&listen_request->waitq, &wq);
2216 + if (signal_pending(current)) {
2217 + result = -ERESTARTSYS;
2221 + result = listen_request->result;
2224 + down(&listenreq_lock);
2225 + list_del(&listen_request->list);
2226 + kfree(listen_request);
2227 + up(&listenreq_lock);
2231 +static int do_ioctl_set_votes(unsigned long arg)
2233 + unsigned int total_votes;
2234 + unsigned int newquorum;
2237 + if (!capable(CAP_CLUSTER))
2240 + /* Check votes is valid */
2241 + saved_votes = us->votes;
2244 + newquorum = calculate_quorum(1, 0, &total_votes);
2246 + if (newquorum < total_votes / 2 || newquorum > total_votes) {
2247 + us->votes = saved_votes;
2251 + recalculate_quorum(1);
2253 + send_reconfigure(RECONFIG_PARAM_NODE_VOTES, arg);
2258 +static int do_ioctl_pass_socket(unsigned long arg)
2260 + struct cl_passed_sock sock_info;
2261 + struct file *file;
2264 + if (!capable(CAP_CLUSTER))
2267 + if (atomic_read(&cnxman_running))
2272 + if (copy_from_user(&sock_info, (void *)arg, sizeof(sock_info)))
2275 + file = fget(sock_info.fd);
2277 + struct inode *inode = file->f_dentry->d_inode;
2279 + error = add_clsock(sock_info.multicast,
2280 + sock_info.number, SOCKET_I(inode),
2289 +static int do_ioctl_set_nodename(unsigned long arg)
2291 + if (!capable(CAP_CLUSTER))
2293 + if (atomic_read(&cnxman_running))
2295 + if (strncpy_from_user(nodename, (void *)arg, MAX_CLUSTER_MEMBER_NAME_LEN) < 0)
2300 +static int do_ioctl_set_nodeid(unsigned long arg)
2302 + int nodeid = (int)arg;
2304 + if (!capable(CAP_CLUSTER))
2306 + if (atomic_read(&cnxman_running))
2308 + if (nodeid < 0 || nodeid > 4096)
2311 + wanted_nodeid = (int)arg;
2315 +static int do_ioctl_join_cluster(unsigned long arg)
2317 + struct cl_join_cluster_info join_info;
2319 + if (!capable(CAP_CLUSTER))
2322 + if (atomic_read(&cnxman_running))
2325 + if (copy_from_user(&join_info, (void *)arg, sizeof (struct cl_join_cluster_info) ))
2328 + if (strlen(join_info.cluster_name) > MAX_CLUSTER_NAME_LEN)
2331 + if (list_empty(&socket_list))
2334 + set_votes(join_info.votes, join_info.expected_votes);
2335 + cluster_id = generate_cluster_id(join_info.cluster_name);
2336 + strncpy(cluster_name, join_info.cluster_name, MAX_CLUSTER_NAME_LEN);
2337 + two_node = join_info.two_node;
2338 + config_version = join_info.config_version;
2341 + acks_expected = 0;
2342 + init_completion(&cluster_thread_comp);
2343 + init_completion(&member_thread_comp);
2344 + if (allocate_nodeid_array())
2347 + kcluster_pid = kernel_thread(cluster_kthread, NULL, 0);
2348 + if (kcluster_pid < 0)
2349 + return kcluster_pid;
2351 + wait_for_completion(&cluster_thread_comp);
2352 + init_completion(&cluster_thread_comp);
2354 + atomic_set(&cnxman_running, 1);
2356 + /* Make sure we have a node name */
2357 + if (nodename[0] == '\0')
2358 + strcpy(nodename, system_utsname.nodename);
2360 + membership_pid = start_membership_services(kcluster_pid);
2361 + if (membership_pid < 0) {
2363 + wait_for_completion(&cluster_thread_comp);
2364 + init_completion(&member_thread_comp);
2365 + return membership_pid;
2372 +static int do_ioctl_leave_cluster(unsigned long leave_flags)
2374 + if (!capable(CAP_CLUSTER))
2377 + if (!atomic_read(&cnxman_running))
2380 + if (in_transition())
2383 + /* Ignore the use count if FORCE is set */
2384 + if (!(leave_flags & CLUSTER_LEAVEFLAG_FORCE)) {
2385 + if (atomic_read(&use_count))
2389 + us->leave_reason = leave_flags;
2391 + wake_up_interruptible(&cnxman_waitq);
2393 + wait_for_completion(&cluster_thread_comp);
2394 + atomic_set(&use_count, 0);
2398 +static int cl_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2400 + int err = -EOPNOTSUPP;
2401 + struct list_head *proclist;
2402 + struct list_head *tmp;
2403 + struct notify_struct *notify;
2404 + struct cl_version cnxman_version;
2407 + /* Process requests notification of cluster events */
2408 + case SIOCCLUSTER_NOTIFY:
2409 + notify = kmalloc(sizeof (struct notify_struct), GFP_KERNEL);
2412 + notify->pid = current->pid;
2413 + notify->signal = arg;
2414 + down(&event_listener_lock);
2415 +	list_add(&notify->list, &event_listener_list);
2416 + up(&event_listener_lock);
2420 + /* Process is no longer interested cluster events */
2421 + case SIOCCLUSTER_REMOVENOTIFY:
2424 + down(&event_listener_lock);
2425 + list_for_each_safe(proclist, tmp, &event_listener_list) {
2427 + list_entry(proclist, struct notify_struct, list);
2428 + if (notify->pid == current->pid) {
2429 +			list_del(&notify->list);
2434 + up(&event_listener_lock);
2437 + /* Return the cnxman version number */
2438 + case SIOCCLUSTER_GET_VERSION:
2442 + cnxman_version.major = CNXMAN_MAJOR_VERSION;
2443 + cnxman_version.minor = CNXMAN_MINOR_VERSION;
2444 + cnxman_version.patch = CNXMAN_PATCH_VERSION;
2445 + cnxman_version.config = config_version;
2446 + if (copy_to_user((void *) arg, &cnxman_version,
2447 + sizeof (struct cl_version))) {
2452 + /* Set the cnxman config version number */
2453 + case SIOCCLUSTER_SET_VERSION:
2454 + err = do_ioctl_set_version(arg);
2457 + /* Return the active membership list */
2458 + case SIOCCLUSTER_GETMEMBERS:
2459 + err = do_ioctl_get_members(arg);
2462 + /* Return the full membership list include dead nodes */
2463 + case SIOCCLUSTER_GETALLMEMBERS:
2464 + err = do_ioctl_get_all_members(arg);
2467 + case SIOCCLUSTER_GETNODE:
2468 + err = do_ioctl_get_node(arg);
2471 + case SIOCCLUSTER_GETCLUSTER:
2472 + err = do_ioctl_get_cluster(arg);
2475 + case SIOCCLUSTER_ISQUORATE:
2476 + return cluster_is_quorate;
2478 + case SIOCCLUSTER_ISACTIVE:
2479 + return atomic_read(&cnxman_running);
2481 + case SIOCCLUSTER_SETEXPECTED_VOTES:
2482 + err = do_ioctl_set_expected(arg);
2485 + /* Change the number of votes for this node */
2486 + case SIOCCLUSTER_SET_VOTES:
2487 + err = do_ioctl_set_votes(arg);
2490 + /* Return 1 if the specified node is listening on a given port */
2491 + case SIOCCLUSTER_ISLISTENING:
2492 + err = do_ioctl_islistening(arg);
2495 + /* Forcibly kill a node */
2496 + case SIOCCLUSTER_KILLNODE:
2497 + err = do_ioctl_kill_node(arg);
2500 + case SIOCCLUSTER_GET_JOINCOUNT:
2501 + if (!capable(CAP_CLUSTER))
2504 + return atomic_read(&use_count);
2506 + /* ioctl interface to the barrier system */
2507 + case SIOCCLUSTER_BARRIER:
2508 + err = do_ioctl_barrier(arg);
2511 + case SIOCCLUSTER_PASS_SOCKET:
2512 + if (sock->sk->sk_protocol != CLPROTO_MASTER)
2513 + err = -EOPNOTSUPP;
2515 + err = do_ioctl_pass_socket(arg);
2518 + case SIOCCLUSTER_SET_NODENAME:
2519 + if (sock->sk->sk_protocol != CLPROTO_MASTER)
2520 + err = -EOPNOTSUPP;
2522 + err = do_ioctl_set_nodename(arg);
2525 + case SIOCCLUSTER_SET_NODEID:
2526 + if (sock->sk->sk_protocol != CLPROTO_MASTER)
2527 + err = -EOPNOTSUPP;
2529 + err = do_ioctl_set_nodeid(arg);
2532 + case SIOCCLUSTER_JOIN_CLUSTER:
2533 + if (sock->sk->sk_protocol != CLPROTO_MASTER)
2534 + err = -EOPNOTSUPP;
2536 + err = do_ioctl_join_cluster(arg);
2539 + case SIOCCLUSTER_LEAVE_CLUSTER:
2540 + err = do_ioctl_leave_cluster(arg);
2544 + err = sm_ioctl(sock, cmd, arg);
2549 +static int cl_shutdown(struct socket *sock, int how)
2551 + struct sock *sk = sock->sk;
2552 + int err = -ENOTCONN;
2556 + if (sock->state == SS_UNCONNECTED)
2560 + if (sock->state == SS_DISCONNECTING)
2565 + if (how != SHUTDOWN_MASK)
2568 + sk->sk_shutdown = how;
2578 +/* We'll be giving out reward points next... */
2579 +/* Send the packet and save a copy in case someone loses theirs. Should be
2580 + * protected by the send mutexphore */
2581 +static int __send_and_save(struct cl_comms_socket *csock, struct msghdr *msg,
2582 + struct kvec *vec, int veclen,
2583 + int size, int needack)
2586 + struct kvec save_vectors[veclen];
2588 + /* Save a copy of the IO vectors as sendmsg mucks around with them and
2589 + * we might want to send the same stuff out more than once (for different
2592 + memcpy(save_vectors, vec,
2593 + sizeof (struct kvec) * veclen);
2595 + result = kernel_sendmsg(csock->sock, msg, vec, veclen, size);
2597 + if (result >= 0 && acks_expected && needack) {
2599 + /* Start retransmit timer if it didn't go */
2600 + if (result == 0) {
2601 + start_short_timer();
2608 + /* Restore IOVs */
2609 + memcpy(vec, save_vectors,
2610 + sizeof (struct kvec) * veclen);
2615 +static void resend_last_message()
2617 + struct msghdr msg;
2618 + struct kvec vec[1];
2621 + P_COMMS("%ld resending last message: %d bytes: port=%d, cmd=%d\n",
2622 + jiffies, saved_msg_len, saved_msg_buffer[0],
2623 + saved_msg_buffer[6]);
2625 + /* Assume there is something wrong with the last interface */
2626 + current_interface = get_next_interface(current_interface);
2627 + if (num_interfaces > 1)
2628 + printk(KERN_WARNING CMAN_NAME ": Now using interface %d\n",
2629 + current_interface->number);
2631 + vec[0].iov_base = saved_msg_buffer;
2632 + vec[0].iov_len = saved_msg_len;
2634 + memset(&msg, 0, sizeof (msg));
2635 +	msg.msg_name = &current_interface->saddr;
2636 + msg.msg_namelen = current_interface->addr_len;
2638 + result = kernel_sendmsg(current_interface->sock, &msg, vec, 1, saved_msg_len);
2641 + printk(KERN_ERR CMAN_NAME ": resend failed: %d\n", result);
2643 + /* Try indefinitely to send this, the backlog must die down eventually
2646 + start_short_timer();
2648 + /* Send succeeded, continue waiting for ACKS */
2650 + start_ack_timer();
2654 +static int cl_recvmsg(struct kiocb *iocb, struct socket *sock,
2655 + struct msghdr *msg, size_t size, int flags)
2657 + struct sock *sk = sock->sk;
2658 + struct sockaddr_cl *sin = (struct sockaddr_cl *) msg->msg_name;
2659 + struct sk_buff *skb;
2660 + struct cb_info *cbinfo;
2661 + int copied, err = 0;
2663 + /* Socket was notified of shutdown, remove any pending skbs and return
2665 + if (!atomic_read(&cnxman_running)) {
2666 + while ((skb = skb_recv_datagram(sk, flags, MSG_DONTWAIT, &err)))
2667 + skb_free_datagram(sk, skb);
2668 + return 0; /* cnxman has left the building */
2671 + /* Generic datagram code does most of the work. If the user is not
2672 + * interested in OOB messages then ignore them */
2674 + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
2678 + cbinfo = (struct cb_info *)skb->cb;
2680 + /* If it is OOB and the user doesn't want it, then throw it away. */
2681 + if (cbinfo->oob && !(flags & MSG_OOB)) {
2682 + skb_free_datagram(sk, skb);
2684 + /* If we peeked (?) an OOB but the user doesn't want it
2685 + then we need to discard it or we'll loop forever */
2686 + if (flags & MSG_PEEK) {
2687 + skb = skb_recv_datagram(sk, flags & ~MSG_PEEK,
2688 + MSG_DONTWAIT, &err);
2690 + skb_free_datagram(sk, skb);
2696 + while (cbinfo->oob && !(flags & MSG_OOB));
2698 + copied = skb->len;
2699 + if (copied > size) {
2701 + msg->msg_flags |= MSG_TRUNC;
2703 + err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
2708 + if (msg->msg_name && msg->msg_namelen) {
2709 + memset(msg->msg_name, 0, msg->msg_namelen);
2711 + if (msg->msg_namelen >= sizeof (struct sockaddr_cl)) {
2713 + /* Nodeid is in native byte order - anything else is just
2715 + sin->scl_nodeid = cbinfo->orig_nodeid;
2717 + msg->msg_namelen = sizeof (struct sockaddr_cl);
2718 + sin->scl_port = cbinfo->orig_port;
2721 + if (cbinfo->oob) {
2722 + msg->msg_flags |= MSG_OOB;
2725 + sock_recv_timestamp(msg, sk, skb);
2730 + skb_free_datagram(sk, skb);
2736 +/* Send a message out on all interfaces */
2737 +static int send_to_all_ints(int nodeid, struct msghdr *our_msg,
2738 + struct kvec *vec, int veclen, int size, int flags)
2740 + struct sockaddr_in6 daddr;
2741 + struct cl_comms_socket *clsock;
2744 + our_msg->msg_name = &daddr;
2746 + list_for_each_entry(clsock, &socket_list, list) {
2748 + /* Don't send out a recv-only socket */
2749 + if (!clsock->recv_only) {
2751 + /* For temporary node IDs send to the node's real IP address */
2753 + get_addr_from_temp_nodeid(nodeid, (char *)&daddr, &our_msg->msg_namelen);
2756 + memcpy(&daddr, &clsock->saddr, clsock->addr_len);
2757 + our_msg->msg_namelen = clsock->addr_len;
2760 + result = __send_and_save(clsock, our_msg, vec, veclen,
2761 + size + sizeof (struct cl_protheader),
2762 + !(flags & MSG_NOACK));
2769 +/* Internal common send message routine */
2770 +static int __sendmsg(struct socket *sock, struct msghdr *msg,
2771 + struct kvec *vec, int veclen, int size,
2772 + unsigned char port)
2774 + int result = 0, i;
2775 + int flags = msg->msg_flags;
2776 + struct msghdr our_msg;
2777 + struct sockaddr_cl *caddr = msg->msg_name;
2778 + struct cl_protheader header;
2779 + struct kvec vectors[veclen + 1];
2780 + unsigned char srcport;
2783 + if (size > MAX_CLUSTER_MESSAGE)
2785 + if (!atomic_read(&cnxman_running))
2789 + nodeid = caddr->scl_nodeid;
2791 + /* Check that the node id (if present) is valid */
2792 + if (msg->msg_namelen && (!find_node_by_nodeid(nodeid) &&
2793 + !is_valid_temp_nodeid(nodeid))) {
2797 + /* If there's no sending client socket then the source
2798 + port is 0: "us" */
2800 + struct cluster_sock *csock = cluster_sk(sock->sk);
2801 + srcport = csock->port;
2807 + /* We can only have one send outstanding at a time so we might as well
2808 + * lock the whole send mechanism */
2811 + while ((port > HIGH_PROTECTED_PORT
2812 + && (!cluster_is_quorate || in_transition()))
2813 + || (acks_expected > 0 && !(msg->msg_flags & MSG_NOACK))) {
2815 + DECLARE_WAITQUEUE(wq, current);
2816 + struct task_struct *tsk = current;
2818 + if (flags & MSG_DONTWAIT) {
2823 + if (current->pid == kcluster_pid) {
2825 + ("Tried to make kclusterd wait, port=%d, acks_count=%d, expected=%d\n",
2826 + port, ack_count, acks_expected);
2831 + P_COMMS("%s process waiting. acks=%d, expected=%d\n", tsk->comm,
2832 + ack_count, acks_expected);
2834 + set_task_state(tsk, TASK_INTERRUPTIBLE);
2835 + add_wait_queue(&socket_waitq, &wq);
2837 + if ((port > HIGH_PROTECTED_PORT
2838 + && (!cluster_is_quorate || in_transition()))
2839 + || (acks_expected > 0)) {
2846 + set_task_state(tsk, TASK_RUNNING);
2847 + remove_wait_queue(&socket_waitq, &wq);
2850 + if (quit_threads) {
2855 + if (signal_pending(current)) {
2857 + return -ERESTARTSYS;
2860 + /* Were we shut down in the meantime ? */
2861 + if (!atomic_read(&cnxman_running)) {
2868 + memset(&our_msg, 0, sizeof (our_msg));
2870 + /* Build the header */
2871 + header.tgtport = port;
2872 + header.srcport = srcport;
2873 + header.flags = msg->msg_flags;
2874 + header.cluster = cpu_to_le16(cluster_id);
2875 + header.srcid = us ? cpu_to_le32(us->node_id) : 0;
2876 + header.tgtid = caddr ? cpu_to_le32(nodeid) : 0;
2879 + header.seq = cpu_to_le16(cur_seq);
2882 + if (header.tgtid) {
2883 + struct cluster_node *remnode;
2885 + remnode = find_node_by_nodeid(nodeid);
2887 + header.ack = cpu_to_le16(remnode->last_seq_recv);
2891 + /* Set the MULTICAST flag on messages with no particular destination */
2892 + if (!msg->msg_namelen) {
2893 + header.flags |= MSG_MULTICAST;
2897 + /* Loopback shortcut */
2898 + if (nodeid == us->node_id && nodeid != 0) {
2901 + header.flags |= MSG_NOACK; /* Don't ack it! */
2903 + return send_to_user_port(NULL, &header, msg, vec, veclen, size);
2906 + /* Copy the existing kvecs into our array and add the header on at the
2908 + vectors[0].iov_base = &header;
2909 + vectors[0].iov_len = sizeof (header);
2910 + for (i = 0; i < veclen; i++) {
2911 + vectors[i + 1] = vec[i];
2915 + /* Work out how many ACKS are wanted - *don't* reset acks_expected to
2916 + * zero if no acks are required as an ACK-needed message may still be
2918 + if (!(msg->msg_flags & MSG_NOACK)) {
2919 + if (msg->msg_namelen)
2920 + acks_expected = 1; /* Unicast */
2922 + acks_expected = max(cluster_members - 1, 0);
2927 + ("Sending message - tgt=%d port %d required %d acks, seq=%d, flags=%x\n",
2928 + nodeid, header.port,
2929 + (msg->msg_flags & MSG_NOACK) ? 0 : acks_expected,
2930 + le16_to_cpu(header.seq), header.flags);
2932 + /* Don't include temp nodeids in the message itself */
2933 + if (header.tgtid < 0)
2936 + /* For non-member sends we use all the interfaces */
2937 + if ((nodeid < 0) || (flags & MSG_ALLINT)) {
2939 + result = send_to_all_ints(nodeid, &our_msg, vectors, veclen+1,
2940 + size, msg->msg_flags);
2943 + /* Send to only the current socket - resends will use the
2944 + * others if necessary */
2945 +		our_msg.msg_name = &current_interface->saddr;
2946 + our_msg.msg_namelen = current_interface->addr_len;
2949 + __send_and_save(current_interface, &our_msg,
2950 + vectors, veclen+1,
2951 + size + sizeof (header),
2952 + !(msg->msg_flags & MSG_NOACK));
2955 + /* Make a note in each nodes' structure that it has been sent a message
2956 + * so we can see which ones went astray */
2957 + if (!(flags & MSG_NOACK) && nodeid >= 0) {
2958 + if (msg->msg_namelen) {
2959 + struct cluster_node *node;
2961 + node = find_node_by_nodeid(le32_to_cpu(header.tgtid));
2963 + node->last_seq_sent = cur_seq;
2966 + struct cluster_node *node;
2967 + struct list_head *nodelist;
2969 + list_for_each(nodelist, &cluster_members_list) {
2971 + list_entry(nodelist, struct cluster_node,
2973 + if (node->state == NODESTATE_MEMBER) {
2974 + node->last_seq_sent = cur_seq;
2980 + /* if the client wants a broadcast message sending back to itself
2981 + then loop it back */
2982 + if (nodeid == 0 && (flags & MSG_BCASTSELF)) {
2983 + header.flags |= MSG_NOACK; /* Don't ack it! */
2985 + result = send_to_user_port(NULL, &header, msg, vec, veclen, size);
2988 + /* Save a copy of the message if we're expecting an ACK */
2989 + if (!(flags & MSG_NOACK) && acks_expected) {
2990 + struct cl_protheader *savhdr = (struct cl_protheader *) saved_msg_buffer;
2992 + memcpy_fromkvec(saved_msg_buffer, vectors,
2993 + size + sizeof (header));
2995 + saved_msg_len = size + sizeof (header);
2996 + retry_count = ack_count = 0;
2997 + clear_bit(RESEND_NEEDED, &mainloop_flags);
2999 + /* Clear the REPLYEXPected flag so we force a real ACK
3000 + if it's necessary to resend this packet */
3001 + savhdr->flags &= ~MSG_REPLYEXP;
3002 + start_ack_timer();
3009 +static int queue_message(struct socket *sock, void *buf, int len,
3010 + struct sockaddr_cl *caddr,
3011 + unsigned char port, int flags)
3013 + struct queued_message *qmsg;
3015 + qmsg = kmalloc(sizeof (struct queued_message),
3017 + || irqs_disabled())? GFP_ATOMIC : GFP_KERNEL);
3021 + memcpy(qmsg->msg_buffer, buf, len);
3022 + qmsg->msg_len = len;
3024 + memcpy(&qmsg->addr, caddr, sizeof (struct sockaddr_cl));
3025 + qmsg->addr_len = sizeof (struct sockaddr_cl);
3028 + qmsg->addr_len = 0;
3030 + qmsg->flags = flags;
3031 + qmsg->port = port;
3032 + qmsg->socket = sock;
3034 + down(&messages_list_lock);
3035 + list_add_tail(&qmsg->list, &messages_list);
3036 + up(&messages_list_lock);
3038 + wake_up_interruptible(&cnxman_waitq);
3043 +static int cl_sendmsg(struct kiocb *iocb, struct socket *sock,
3044 + struct msghdr *msg, size_t size)
3046 + struct cluster_sock *c = cluster_sk(sock->sk);
3051 + struct sockaddr_cl *caddr = msg->msg_name;
3053 + if (sock->sk->sk_protocol == CLPROTO_MASTER)
3054 + return -EOPNOTSUPP;
3058 + /* Only capable users can override the port number */
3059 + if (caddr && capable(CAP_CLUSTER) && caddr->scl_port)
3060 + port = caddr->scl_port;
3063 + return -EDESTADDRREQ;
3065 + /* Allocate a kernel buffer for the data so we can put it into a kvec */
3066 + buffer = kmalloc(size, GFP_KERNEL);
3070 + if (memcpy_fromiovec(buffer, msg->msg_iov, size)) {
3075 + vec.iov_len = size;
3076 + vec.iov_base = buffer;
3078 + status = __sendmsg(sock, msg, &vec, 1, size, port);
3086 +/* Kernel call to sendmsg */
3087 +int kcl_sendmsg(struct socket *sock, void *buf, int size,
3088 + struct sockaddr_cl *caddr, int addr_len, unsigned int flags)
3090 + struct kvec vecs[1];
3091 + struct msghdr msg;
3092 + struct cluster_sock *c = cluster_sk(sock->sk);
3093 + unsigned char port;
3095 + if (size > MAX_CLUSTER_MESSAGE)
3097 + if (!atomic_read(&cnxman_running))
3101 + if (caddr && caddr->scl_port)
3102 + port = caddr->scl_port;
3105 + return -EDESTADDRREQ;
3107 + /* If we have no process context then queue it up for kclusterd to
3109 + if (in_interrupt() || flags & MSG_QUEUE) {
3110 + return queue_message(sock, buf, size, caddr, port,
3111 + flags & ~MSG_QUEUE);
3114 + vecs[0].iov_base = buf;
3115 + vecs[0].iov_len = size;
3117 + memset(&msg, 0, sizeof (msg));
3118 + msg.msg_name = caddr;
3119 + msg.msg_namelen = addr_len;
3120 + msg.msg_flags = flags;
3122 + return __sendmsg(sock, &msg, vecs, 1, size, port);
3125 +static int send_queued_message(struct queued_message *qmsg)
3127 + struct kvec vecs[1];
3128 + struct msghdr msg;
3130 + /* Don't send blocked messages */
3131 + if (qmsg->port > HIGH_PROTECTED_PORT
3132 + && (!cluster_is_quorate || in_transition()))
3135 + vecs[0].iov_base = qmsg->msg_buffer;
3136 + vecs[0].iov_len = qmsg->msg_len;
3138 + memset(&msg, 0, sizeof (msg));
3139 + msg.msg_name = qmsg->addr_len ? &qmsg->addr : NULL;
3140 + msg.msg_namelen = qmsg->addr_len;
3141 + msg.msg_flags = qmsg->flags;
3143 + return __sendmsg(qmsg->socket, &msg, vecs, 1,
3144 + qmsg->msg_len, qmsg->port);
3147 +int kcl_register_read_callback(struct socket *sock,
3148 + int (*routine) (char *, int, char *, int,
3151 + struct cluster_sock *c = cluster_sk(sock->sk);
3153 + c->kernel_callback = routine;
3158 +/* Used where we are in kclusterd context and we can't allow the task to wait
3159 + * as we are also responsible to processing the ACKs that do the wake up. Try
3160 + * to send the message immediately and queue it if that's not possible */
3161 +static int send_or_queue_message(struct socket *sock, void *buf, int len,
3162 + struct sockaddr_cl *caddr,
3163 + unsigned int flags)
3165 + struct kvec vecs[1];
3166 + struct msghdr msg;
3169 + vecs[0].iov_base = buf;
3170 + vecs[0].iov_len = len;
3172 + memset(&msg, 0, sizeof (msg));
3173 + msg.msg_name = caddr;
3174 + msg.msg_namelen = caddr ? sizeof (struct sockaddr_cl) : 0;
3175 + msg.msg_flags = MSG_DONTWAIT | flags;
3177 + status = __sendmsg(NULL, &msg, vecs, 1, len, 0);
3179 + /* Did it work ? */
3184 + /* Failure other than EAGAIN is fatal */
3185 + if (status != -EAGAIN) {
3189 + return queue_message(sock, buf, len, caddr, 0, flags);
3192 +/* Send a listen request to a node */
3193 +static void send_listen_request(int nodeid, unsigned char port)
3195 + struct cl_listenmsg listenmsg;
3196 + struct sockaddr_cl caddr;
3198 + memset(&caddr, 0, sizeof (caddr));
3200 + /* Build the header */
3201 + listenmsg.cmd = CLUSTER_CMD_LISTENREQ;
3202 + listenmsg.target_port = port;
3203 + listenmsg.listening = 0;
3204 + listenmsg.tag = current->pid;
3206 + caddr.scl_family = AF_CLUSTER;
3207 + caddr.scl_port = 0;
3208 + caddr.scl_nodeid = nodeid;
3210 + send_or_queue_message(NULL, &listenmsg, sizeof(listenmsg), &caddr, MSG_REPLYEXP);
3214 +/* Return 1 or 0 to indicate if we have a listener on the requested port */
3215 +static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
3216 + unsigned char port, unsigned short tag)
3218 + struct cl_listenmsg listenmsg;
3219 + struct sockaddr_cl caddr;
3222 + memset(&caddr, 0, sizeof (caddr));
3224 + /* Build the message */
3225 + listenmsg.cmd = CLUSTER_CMD_LISTENRESP;
3226 + listenmsg.target_port = port;
3227 + listenmsg.tag = tag;
3228 + listenmsg.listening = (port_array[port] != 0) ? 1 : 0;
3230 + caddr.scl_family = AF_CLUSTER;
3231 + caddr.scl_port = 0;
3232 + caddr.scl_nodeid = nodeid;
3234 + status = send_or_queue_message(NULL, &listenmsg,
3235 + sizeof (listenmsg),
3242 +static int cl_sendack(struct cl_comms_socket *csock, unsigned short seq,
3243 + int addr_len, char *addr, unsigned char remport,
3244 + unsigned char flag)
3247 + struct cl_ackmsg ackmsg;
3248 + struct msghdr msg;
3249 + struct sockaddr_in6 daddr;
3253 + char buf[MAX_ADDR_PRINTED_LEN];
3255 + P_COMMS("Sending ACK to %s, seq=%d\n",
3256 + print_addr(addr, address_length, buf), le16_to_cpu(seq));
3260 + memcpy(&daddr, addr, addr_len);
3263 + memcpy(&daddr, &csock->saddr, csock->addr_len);
3264 + addr_len = csock->addr_len;
3267 + /* Build the header */
3268 + ackmsg.header.tgtport = 0; /* Protocol port */
3269 + ackmsg.header.srcport = 0;
3270 + ackmsg.header.seq = 0;
3271 + ackmsg.header.flags = MSG_NOACK;
3272 + ackmsg.header.cluster = cpu_to_le16(cluster_id);
3273 + ackmsg.header.srcid = us ? cpu_to_le32(us->node_id) : 0;
3274 + ackmsg.header.ack = seq; /* already in LE order */
3275 + ackmsg.header.tgtid = 0; /* ACKS are unicast so we don't bother
3276 + * to look this up */
3277 + ackmsg.cmd = CLUSTER_CMD_ACK;
3278 + ackmsg.remport = remport;
3279 + ackmsg.aflags = flag;
3280 + vec.iov_base = &ackmsg;
3281 + vec.iov_len = sizeof (ackmsg);
3283 + memset(&msg, 0, sizeof (msg));
3284 + msg.msg_name = &daddr;
3285 + msg.msg_namelen = addr_len;
3287 + result = kernel_sendmsg(csock->sock, &msg, &vec, 1, sizeof (ackmsg));
3290 + printk(KERN_CRIT CMAN_NAME ": error sending ACK: %d\n", result);
3296 +/* Wait for all ACKS to be gathered */
3297 +void kcl_wait_for_all_acks()
3299 + while (ack_count < acks_expected) {
3301 + DECLARE_WAITQUEUE(wq, current);
3302 + struct task_struct *tsk = current;
3304 + set_task_state(tsk, TASK_INTERRUPTIBLE);
3305 + add_wait_queue(&socket_waitq, &wq);
3307 + if (ack_count < acks_expected) {
3311 + set_task_state(tsk, TASK_RUNNING);
3312 + remove_wait_queue(&socket_waitq, &wq);
3316 +/* Send a closedown OOB message to all cluster nodes - this tells them that a
3317 + * port listener has gone away */
3318 +static void send_port_close_oob(unsigned char port)
3320 + struct cl_closemsg closemsg;
3322 + /* Build the header */
3323 + closemsg.cmd = CLUSTER_CMD_PORTCLOSED;
3324 + closemsg.port = port;
3326 + send_or_queue_message(NULL, &closemsg, sizeof (closemsg), NULL, 0);
3330 +/* A remote port has been closed - post an OOB message to the local listen on
3331 + * that port (if there is one) */
3332 +static void post_close_oob(unsigned char port, int nodeid)
3334 + struct cl_portclosed_oob *oobmsg;
3335 + struct sk_buff *skb;
3336 + struct sock *sock = port_array[port];
3337 + struct cb_info *cbinfo;
3340 + return; /* No-one listening */
3343 + skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
3347 + skb_put(skb, sizeof (*oobmsg));
3348 + oobmsg = (struct cl_portclosed_oob *) skb->data;
3349 + oobmsg->port = port;
3350 + oobmsg->cmd = CLUSTER_OOB_MSG_PORTCLOSED;
3352 + cbinfo = (struct cb_info *)skb->cb;
3354 + cbinfo->orig_nodeid = nodeid;
3355 + cbinfo->orig_port = port;
3357 + sock_queue_rcv_skb(sock, skb);
3361 +/* Leave the cluster */
3362 +static void node_shutdown()
3364 + struct cl_barrier *barrier;
3365 + struct list_head *blist;
3366 + struct list_head *temp;
3367 + struct list_head *socklist;
3368 + struct cl_client_socket *csock;
3369 + struct sk_buff *null_skb;
3371 + if (we_are_a_cluster_member)
3372 + printk(KERN_INFO CMAN_NAME ": we are leaving the cluster. %s\n",
3373 + us->leave_reason?leave_string(us->leave_reason):"");
3375 + atomic_set(&cnxman_running, 0);
3378 + /* Notify kernel listeners first */
3379 + notify_kernel_listeners(LEAVING, 0);
3381 + /* Notify client sockets */
3382 + down(&client_socket_lock);
3383 + list_for_each_safe(socklist, temp, &client_socket_list) {
3384 + csock = list_entry(socklist, struct cl_client_socket, list);
3386 + null_skb = alloc_skb(0, GFP_KERNEL);
3388 + sock_queue_rcv_skb(csock->sock->sk, null_skb);
3389 + list_del(&csock->list);
3392 + up(&client_socket_lock);
3393 + we_are_a_cluster_member = 0;
3394 + cluster_is_quorate = 0;
3398 + /* Wake up any processes waiting for barriers */
3399 + down(&barrier_list_lock);
3400 + list_for_each(blist, &barrier_list) {
3401 + barrier = list_entry(blist, struct cl_barrier, list);
3403 + /* Cancel any timers */
3404 + if (timer_pending(&barrier->timer))
3405 + del_timer(&barrier->timer);
3407 + /* Force it to be auto-delete so it discards itself */
3408 + if (barrier->state == BARRIER_STATE_WAITING) {
3409 + barrier->flags |= BARRIER_ATTR_AUTODELETE;
3410 + wake_up_interruptible(&barrier->waitq);
3413 + if (barrier->callback) {
3414 + barrier->callback(barrier->name, -ENOTCONN);
3415 + barrier->callback = NULL;
3419 + up(&barrier_list_lock);
3421 + /* Wake up any processes waiting for ISLISTENING requests */
3422 + down(&listenreq_lock);
3423 + list_for_each(blist, &listenreq_list) {
3424 + struct cl_waiting_listen_request *lrequest =
3425 + list_entry(blist, struct cl_waiting_listen_request, list);
3427 + if (lrequest->waiting)
3428 + wake_up_interruptible(&lrequest->waitq);
3430 + up(&listenreq_lock);
3433 +static void free_cluster_sockets()
3435 + struct list_head *socklist;
3436 + struct cl_comms_socket *sock;
3437 + struct list_head *temp;
3439 + list_for_each_safe(socklist, temp, &socket_list) {
3440 + sock = list_entry(socklist, struct cl_comms_socket, list);
3442 + list_del(&sock->list);
3446 + num_interfaces = 0;
3447 + current_interface = NULL;
3450 +/* Tidy up after all the rest of the cluster bits have shut down */
3451 +static void node_cleanup()
3453 + struct list_head *nodelist;
3454 + struct list_head *proclist;
3455 + struct list_head *temp;
3456 + struct list_head *socklist;
3457 + struct list_head *blist;
3458 + struct temp_node *tn;
3459 + struct temp_node *tmp;
3460 + struct cl_comms_socket *sock;
3461 + struct kernel_notify_struct *knotify;
3463 + /* Free list of kernel listeners */
3464 + list_for_each_safe(proclist, temp, &kernel_listener_list) {
3466 + list_entry(proclist, struct kernel_notify_struct, list);
3467 + list_del(&knotify->list);
3471 + /* Mark the sockets as busy so they don't get added to the active
3472 + * sockets list in the next few lines of code before we free them */
3473 + list_for_each_safe(socklist, temp, &socket_list) {
3474 + sock = list_entry(socklist, struct cl_comms_socket, list);
3476 + set_bit(1, &sock->active);
3479 + /* Tidy the active sockets list */
3480 + list_for_each_safe(socklist, temp, &active_socket_list) {
3482 + list_entry(socklist, struct cl_comms_socket, active_list);
3483 + list_del(&sock->active_list);
3486 + /* Free the memory allocated to cluster nodes */
3487 + free_nodeid_array();
3488 + down(&cluster_members_lock);
3490 + list_for_each_safe(nodelist, temp, &cluster_members_list) {
3492 + struct list_head *addrlist;
3493 + struct list_head *addrtemp;
3494 + struct cluster_node *node;
3495 + struct cluster_node_addr *nodeaddr;
3497 + node = list_entry(nodelist, struct cluster_node, list);
3499 + list_for_each_safe(addrlist, addrtemp, &node->addr_list) {
3501 + list_entry(addrlist, struct cluster_node_addr,
3504 + list_del(&nodeaddr->list);
3507 + list_del(&node->list);
3508 + kfree(node->name);
3511 + cluster_members = 0;
3512 + up(&cluster_members_lock);
3514 + /* Clean the temp node IDs list. */
3515 + down(&tempnode_lock);
3516 + list_for_each_entry_safe(tn, tmp, &tempnode_list, list) {
3517 + list_del(&tn->list);
3520 + up(&tempnode_lock);
3522 + /* Free the memory allocated to the outgoing sockets */
3523 + free_cluster_sockets();
3525 + /* Make sure that all the barriers are deleted */
3526 + down(&barrier_list_lock);
3527 + list_for_each_safe(blist, temp, &barrier_list) {
3528 + struct cl_barrier *barrier =
3529 + list_entry(blist, struct cl_barrier, list);
3531 + list_del(&barrier->list);
3534 + up(&barrier_list_lock);
3537 + clear_bit(RESEND_NEEDED, &mainloop_flags);
3538 + acks_expected = 0;
3539 + wanted_nodeid = 0;
3542 +/* If "cluster_is_quorate" is 0 then all activity apart from protected ports is
3544 +void set_quorate(int total_votes)
3548 + if (get_quorum() > total_votes) {
3555 + /* Hide messages during startup state transition */
3556 + if (we_are_a_cluster_member) {
3557 + if (cluster_is_quorate && !quorate)
3558 + printk(KERN_CRIT CMAN_NAME
3559 + ": quorum lost, blocking activity\n");
3560 + if (!cluster_is_quorate && quorate)
3561 + printk(KERN_CRIT CMAN_NAME
3562 + ": quorum regained, resuming activity\n");
3564 + cluster_is_quorate = quorate;
3566 + /* Wake up any sleeping processes */
3567 + if (cluster_is_quorate) {
3573 +void queue_oob_skb(struct socket *sock, int cmd)
3575 + struct sk_buff *skb;
3576 + struct cb_info *cbinfo;
3577 + struct cl_portclosed_oob *oobmsg;
3579 + skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
3583 + skb_put(skb, sizeof (*oobmsg));
3584 + oobmsg = (struct cl_portclosed_oob *) skb->data;
3586 + oobmsg->cmd = cmd;
3588 + /* There is no remote node associated with this so
3589 + clear out the field to avoid any accidents */
3590 + cbinfo = (struct cb_info *)skb->cb;
3592 + cbinfo->orig_nodeid = 0;
3593 + cbinfo->orig_port = 0;
3595 + sock_queue_rcv_skb(sock->sk, skb);
3598 +/* Notify interested parties that the cluster configuration has changed */
3599 +void notify_listeners()
3601 + struct notify_struct *notify;
3602 + struct list_head *proclist;
3603 + struct list_head *socklist;
3604 + struct list_head *temp;
3606 + /* Do kernel listeners first */
3607 + notify_kernel_listeners(CLUSTER_RECONFIG, 0);
3609 + /* Now we deign to tell userspace */
3610 + down(&event_listener_lock);
3611 + list_for_each_safe(proclist, temp, &event_listener_list) {
3612 + notify = list_entry(proclist, struct notify_struct, list);
3614 + /* If the kill fails then remove the process from the list */
3615 + if (kill_proc(notify->pid, notify->signal, 0) == -ESRCH) {
3616 +			list_del(&notify->list);
3620 + up(&event_listener_lock);
3622 + /* Tell userspace processes which want OOB messages */
3623 + down(&client_socket_lock);
3624 + list_for_each(socklist, &client_socket_list) {
3625 + struct cl_client_socket *csock;
3626 + csock = list_entry(socklist, struct cl_client_socket, list);
3627 + queue_oob_skb(csock->sock, CLUSTER_OOB_MSG_STATECHANGE);
3629 + up(&client_socket_lock);
3632 +/* This fills in the list of all addresses for the local node */
3633 +void get_local_addresses(struct cluster_node *node)
3635 + struct list_head *socklist;
3636 + struct cl_comms_socket *sock;
3638 + list_for_each(socklist, &socket_list) {
3639 + sock = list_entry(socklist, struct cl_comms_socket, list);
3641 + if (sock->recv_only) {
3642 + add_node_address(node, (char *) &sock->saddr, address_length);
3648 +static uint16_t generate_cluster_id(char *name)
3653 + for (i=0; i<strlen(name); i++) {
3657 + return value & 0xFFFF;
3660 +/* Return the next comms socket we can use. */
3661 +static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur)
3664 + struct list_head *socklist;
3666 + /* Fast path for single interface systems */
3667 + if (num_interfaces <= 1)
3671 + next = cur->number + 1;
3672 + if (next > num_interfaces)
3675 + /* Find the socket with this number, I could optimise this by starting
3676 + * at the current i/f but most systems are going to have a small number
3677 + * of them anyway */
3678 + list_for_each(socklist, &socket_list) {
3679 + struct cl_comms_socket *sock;
3680 + sock = list_entry(socklist, struct cl_comms_socket, list);
3682 + if (!sock->recv_only && sock->number == next)
3690 +/* MUST be called with the barrier list lock held */
3691 +static struct cl_barrier *find_barrier(char *name)
3693 + struct list_head *blist;
3694 + struct cl_barrier *bar;
3696 + list_for_each(blist, &barrier_list) {
3697 + bar = list_entry(blist, struct cl_barrier, list);
3699 + if (strcmp(name, bar->name) == 0)
3705 +/* Do the stuff we need to do when the barrier has completed phase 1 */
3706 +static void check_barrier_complete_phase1(struct cl_barrier *barrier)
3708 + if (atomic_read(&barrier->got_nodes) == ((barrier->expected_nodes != 0)
3709 + ? barrier->expected_nodes :
3710 + cluster_members)) {
3712 + struct cl_barriermsg bmsg;
3714 + atomic_inc(&barrier->completed_nodes); /* We have completed */
3715 + barrier->phase = 2; /* Wait for complete phase II */
3717 + /* Send completion message, remember: we are in cnxman context
3718 + * and must not block */
3719 + bmsg.cmd = CLUSTER_CMD_BARRIER;
3720 + bmsg.subcmd = BARRIER_COMPLETE;
3722 + strcpy(bmsg.name, barrier->name);
3724 + P_BARRIER("Sending COMPLETE for %s\n", barrier->name);
3725 + queue_message(NULL, (char *) &bmsg, sizeof (bmsg), NULL, 0, 0);
3729 +/* Do the stuff we need to do when the barrier has been reached */
3730 +/* Return 1 if we deleted the barrier */
3731 +static int check_barrier_complete_phase2(struct cl_barrier *barrier, int status)
3733 + spin_lock_irq(&barrier->phase2_spinlock);
3735 + if (barrier->state != BARRIER_STATE_COMPLETE &&
3736 + (status == -ETIMEDOUT ||
3737 + atomic_read(&barrier->completed_nodes) ==
3738 + ((barrier->expected_nodes != 0)
3739 + ? barrier->expected_nodes : cluster_members))) {
3741 + if (status == 0 && barrier->timeout)
3742 + del_timer(&barrier->timer);
3743 + barrier->endreason = status;
3745 + /* Wake up listener */
3746 + if (barrier->state == BARRIER_STATE_WAITING) {
3747 + wake_up_interruptible(&barrier->waitq);
3750 + /* Additional tasks we have to do if the user was not
3752 + /* Call the callback */
3753 + if (barrier->callback) {
3754 + barrier->callback(barrier->name, 0);
3755 + barrier->callback = NULL;
3757 + /* Remove it if it's AUTO-DELETE */
3758 + if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
3759 + list_del(&barrier->list);
3760 + spin_unlock_irq(&barrier->phase2_spinlock);
3765 + barrier->state = BARRIER_STATE_COMPLETE;
3767 + spin_unlock_irq(&barrier->phase2_spinlock);
3771 +/* Called if a barrier timeout happens */
3772 +static void barrier_timer_fn(unsigned long arg)
3774 + struct cl_barrier *barrier = (struct cl_barrier *) arg;
3776 + /* Ignore any further messages, they are too late. */
3777 + barrier->phase = 0;
3779 + /* and cause it to timeout */
3780 + check_barrier_complete_phase2(barrier, -ETIMEDOUT);
3783 +/* Process BARRIER messages from other nodes */
3784 +static void process_barrier_msg(struct cl_barriermsg *msg,
3785 + struct cluster_node *node)
3787 + struct cl_barrier *barrier;
3789 + down(&barrier_list_lock);
3790 + barrier = find_barrier(msg->name);
3791 + up(&barrier_list_lock);
3793 + /* Ignore other peoples messages, in_transition() is needed here so
3794 + * that joining nodes will see their barrier messages before the
3795 + * we_are_a_cluster_member is set */
3796 + if (!we_are_a_cluster_member && !in_transition())
3801 + P_BARRIER("Got %d for %s, from node %s\n", msg->subcmd, msg->name,
3802 + node ? node->name : "unknown");
3804 + switch (msg->subcmd) {
3805 + case BARRIER_WAIT:
3806 + down(&barrier->lock);
3807 + if (barrier->phase == 0)
3808 + barrier->phase = 1;
3810 + if (barrier->phase == 1) {
3811 + atomic_inc(&barrier->got_nodes);
3812 + check_barrier_complete_phase1(barrier);
3815 + printk(KERN_WARNING CMAN_NAME
3816 + ": got WAIT barrier not in phase 1 %s (%d)\n",
3817 + msg->name, barrier->phase);
3820 + up(&barrier->lock);
3823 + case BARRIER_COMPLETE:
3824 + down(&barrier->lock);
3825 + atomic_inc(&barrier->completed_nodes);
3827 + /* First node to get all the WAIT messages sends COMPLETE, so
3828 + * we all complete */
3829 + if (barrier->phase == 1) {
3830 + atomic_set(&barrier->got_nodes,
3831 + barrier->expected_nodes);
3832 + check_barrier_complete_phase1(barrier);
3835 + if (barrier->phase == 2) {
3836 + /* If it was deleted (ret==1) then no need to unlock
3838 + if (check_barrier_complete_phase2(barrier, 0) == 1)
3841 + up(&barrier->lock);
3846 +/* In-kernel membership API */
3847 +int kcl_add_callback(void (*callback) (kcl_callback_reason, long arg))
3849 + struct kernel_notify_struct *notify;
3851 + notify = kmalloc(sizeof (struct kernel_notify_struct), GFP_KERNEL);
3854 + notify->callback = callback;
3856 + down(&kernel_listener_lock);
3857 + list_add(&notify->list, &kernel_listener_list);
3858 + up(&kernel_listener_lock);
3863 +int kcl_remove_callback(void (*callback) (kcl_callback_reason, long arg))
3865 + struct list_head *calllist;
3866 + struct list_head *temp;
3867 + struct kernel_notify_struct *notify;
3869 + down(&kernel_listener_lock);
3870 + list_for_each_safe(calllist, temp, &kernel_listener_list) {
3871 + notify = list_entry(calllist, struct kernel_notify_struct, list);
3872 + if (notify->callback == callback){
3873 + list_del(&notify->list);
3875 + up(&kernel_listener_lock);
3879 + up(&kernel_listener_lock);
3883 +/* Return quorate status */
3884 +int kcl_is_quorate()
3886 + return cluster_is_quorate;
3889 +/* Return the address list for a node */
3890 +struct list_head *kcl_get_node_addresses(int nodeid)
3892 + struct cluster_node *node = find_node_by_nodeid(nodeid);
3895 + return &node->addr_list;
3900 +static void copy_to_kclnode(struct cluster_node *node,
3901 + struct kcl_cluster_node *knode)
3903 + strcpy(knode->name, node->name);
3904 + knode->size = sizeof (struct kcl_cluster_node);
3905 + knode->votes = node->votes;
3906 + knode->state = node->state;
3907 + knode->node_id = node->node_id;
3908 + knode->us = node->us;
3909 + knode->leave_reason = node->leave_reason;
3910 + knode->incarnation = node->incarnation;
3913 +/* Return the info for a node given its address. If addr is NULL then return
3915 +int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
3916 + struct kcl_cluster_node *n)
3918 + struct cluster_node *node;
3920 + /* They want us */
3921 + if (addr == NULL) {
3925 + node = find_node_by_addr(addr, addr_len);
3930 + /* Copy to user's buffer */
3931 + copy_to_kclnode(node, n);
3935 +int kcl_get_node_by_name(unsigned char *name, struct kcl_cluster_node *n)
3937 + struct cluster_node *node;
3939 + /* They want us */
3940 + if (name == NULL) {
3946 + node = find_node_by_name(name);
3951 + /* Copy to user's buffer */
3952 + copy_to_kclnode(node, n);
3956 +/* As above but by node id. MUCH faster */
3957 +int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n)
3959 + struct cluster_node *node;
3961 + /* They want us */
3962 + if (nodeid == 0) {
3968 + node = find_node_by_nodeid(nodeid);
3973 + /* Copy to user's buffer */
3974 + copy_to_kclnode(node, n);
3978 +/* Return a list of all cluster members ever */
3979 +int kcl_get_all_members(struct list_head *list)
3981 + struct list_head *nodelist;
3982 + struct cluster_node *node;
3983 + struct kcl_cluster_node *newnode;
3984 + int num_nodes = 0;
3986 + down(&cluster_members_lock);
3987 + list_for_each(nodelist, &cluster_members_list) {
3989 + node = list_entry(nodelist, struct cluster_node, list);
3991 + kmalloc(sizeof (struct kcl_cluster_node),
3994 + copy_to_kclnode(node, newnode);
3995 + list_add(&newnode->list, list);
4003 + up(&cluster_members_lock);
4008 +/* Return a list of cluster members */
4009 +int kcl_get_members(struct list_head *list)
4011 + struct list_head *nodelist;
4012 + struct cluster_node *node;
4013 + struct kcl_cluster_node *newnode;
4014 + int num_nodes = 0;
4016 + down(&cluster_members_lock);
4017 + list_for_each(nodelist, &cluster_members_list) {
4018 + node = list_entry(nodelist, struct cluster_node, list);
4020 + if (node->state == NODESTATE_MEMBER) {
4023 + kmalloc(sizeof (struct kcl_cluster_node),
4026 + copy_to_kclnode(node, newnode);
4027 + list_add(&newnode->list, list);
4036 + up(&cluster_members_lock);
4041 +/* Copy current member's nodeids into buffer */
4042 +int kcl_get_member_ids(uint32_t *idbuf, int size)
4044 + struct list_head *nodelist;
4045 + struct cluster_node *node;
4046 + int num_nodes = 0;
4048 + down(&cluster_members_lock);
4049 + list_for_each(nodelist, &cluster_members_list) {
4050 + node = list_entry(nodelist, struct cluster_node, list);
4052 + if (node->state == NODESTATE_MEMBER) {
4053 + if (idbuf && size) {
4054 + idbuf[num_nodes] = node->node_id;
4063 + up(&cluster_members_lock);
4069 +int kcl_barrier_register(char *name, unsigned int flags, unsigned int nodes)
4071 + struct cl_barrier *barrier;
4073 + /* We are not joined to a cluster */
4074 + if (!we_are_a_cluster_member)
4077 + /* Must have a valid name */
4078 + if (name == NULL || strlen(name) > MAX_BARRIER_NAME_LEN - 1)
4081 + /* We don't do this yet */
4082 + if (flags & BARRIER_ATTR_MULTISTEP)
4085 + down(&barrier_list_lock);
4087 + /* See if it already exists */
4088 + if ((barrier = find_barrier(name))) {
4089 + up(&barrier_list_lock);
4090 + if (nodes != barrier->expected_nodes) {
4091 + printk(KERN_WARNING CMAN_NAME
4092 + ": Barrier registration failed for '%s', expected nodes=%d, requested=%d\n",
4093 + name, barrier->expected_nodes, nodes);
4094 + up(&barrier_list_lock);
4101 + /* Build a new struct and add it to the list */
4102 + barrier = kmalloc(sizeof (struct cl_barrier), GFP_KERNEL);
4103 + if (barrier == NULL) {
4104 + up(&barrier_list_lock);
4107 + memset(barrier, 0, sizeof (*barrier));
4109 + strcpy(barrier->name, name);
4110 + barrier->flags = flags;
4111 + barrier->expected_nodes = nodes;
4112 + atomic_set(&barrier->got_nodes, 0);
4113 + atomic_set(&barrier->completed_nodes, 0);
4114 + barrier->endreason = 0;
4115 + barrier->registered_nodes = 1;
4116 + spin_lock_init(&barrier->phase2_spinlock);
4117 + barrier->state = BARRIER_STATE_INACTIVE;
4118 + init_MUTEX(&barrier->lock);
4120 + list_add(&barrier->list, &barrier_list);
4121 + up(&barrier_list_lock);
4126 +static int barrier_setattr_enabled(struct cl_barrier *barrier,
4127 + unsigned int attr, unsigned long arg)
4131 + /* Can't disable a barrier */
4133 + up(&barrier->lock);
4137 + /* We need to send WAIT now because the user may not
4138 + * actually call kcl_barrier_wait() */
4139 + if (!barrier->waitsent) {
4140 + struct cl_barriermsg bmsg;
4142 + /* Send it to the rest of the cluster */
4143 + bmsg.cmd = CLUSTER_CMD_BARRIER;
4144 + bmsg.subcmd = BARRIER_WAIT;
4145 + strcpy(bmsg.name, barrier->name);
4147 + barrier->waitsent = 1;
4148 + barrier->phase = 1;
4150 + atomic_inc(&barrier->got_nodes);
4152 + /* Start the timer if one was wanted */
4153 + if (barrier->timeout) {
4154 + init_timer(&barrier->timer);
4155 + barrier->timer.function = barrier_timer_fn;
4156 + barrier->timer.data = (long) barrier;
4157 + mod_timer(&barrier->timer, jiffies + (barrier->timeout * HZ));
4160 + /* Barrier WAIT and COMPLETE messages are
4161 + * always queued - that way they always get
4162 + * sent out in the right order. If we don't do
4163 + * this then one can get sent out in the
4164 + * context of the user process and the other in
4165 + * cnxman and COMPLETE may /just/ slide in
4166 + * before WAIT if its in the queue
4168 + P_BARRIER("Sending WAIT for %s\n", barrier->name);
4169 + status = queue_message(NULL, &bmsg, sizeof (bmsg), NULL, 0, 0);
4171 + up(&barrier->lock);
4175 + /* It might have been reached now */
4177 + && barrier->state != BARRIER_STATE_COMPLETE
4178 + && barrier->phase == 1)
4179 + check_barrier_complete_phase1(barrier);
4181 + if (barrier && barrier->state == BARRIER_STATE_COMPLETE) {
4182 + up(&barrier->lock);
4183 + return barrier->endreason;
4185 + up(&barrier->lock);
4186 + return 0; /* Nothing to propagate */
4189 +int kcl_barrier_setattr(char *name, unsigned int attr, unsigned long arg)
4191 + struct cl_barrier *barrier;
4193 + /* See if it already exists */
4194 + down(&barrier_list_lock);
4195 + if (!(barrier = find_barrier(name))) {
4196 + up(&barrier_list_lock);
4199 + up(&barrier_list_lock);
4201 + down(&barrier->lock);
4202 + if (barrier->state == BARRIER_STATE_COMPLETE) {
4203 + up(&barrier->lock);
4208 + case BARRIER_SETATTR_AUTODELETE:
4210 + barrier->flags |= BARRIER_ATTR_AUTODELETE;
4212 + barrier->flags &= ~BARRIER_ATTR_AUTODELETE;
4213 + up(&barrier->lock);
4217 + case BARRIER_SETATTR_TIMEOUT:
4218 + /* Can only change the timeout of an inactive barrier */
4219 + if (barrier->state == BARRIER_STATE_WAITING
4220 + || barrier->waitsent) {
4221 + up(&barrier->lock);
4224 + barrier->timeout = arg;
4225 + up(&barrier->lock);
4228 + case BARRIER_SETATTR_MULTISTEP:
4229 + up(&barrier->lock);
4232 + case BARRIER_SETATTR_ENABLED:
4233 + return barrier_setattr_enabled(barrier, attr, arg);
4235 + case BARRIER_SETATTR_NODES:
4236 + /* Can only change the expected node count of an inactive
4238 + if (barrier->state == BARRIER_STATE_WAITING
4239 + || barrier->waitsent)
4241 + barrier->expected_nodes = arg;
4244 + case BARRIER_SETATTR_CALLBACK:
4245 + if (barrier->state == BARRIER_STATE_WAITING
4246 + || barrier->waitsent)
4248 + barrier->callback = (void (*)(char *, int)) arg;
4249 + up(&barrier->lock);
4250 + return 0; /* Don't propagate this to other nodes */
4253 + up(&barrier->lock);
4257 +int kcl_barrier_delete(char *name)
4259 + struct cl_barrier *barrier;
4261 + down(&barrier_list_lock);
4262 + /* See if it exists */
4263 + if (!(barrier = find_barrier(name))) {
4264 + up(&barrier_list_lock);
4269 + list_del(&barrier->list);
4272 + up(&barrier_list_lock);
4277 +int kcl_barrier_cancel(char *name)
4279 + struct cl_barrier *barrier;
4281 + /* See if it exists */
4282 + down(&barrier_list_lock);
4283 + if (!(barrier = find_barrier(name))) {
4284 + up(&barrier_list_lock);
4287 + down(&barrier->lock);
4289 + barrier->endreason = -ENOTCONN;
4291 + if (barrier->callback) {
4292 + barrier->callback(barrier->name, -ECONNRESET);
4293 + barrier->callback = NULL;
4296 + if (barrier->timeout)
4297 + del_timer(&barrier->timer);
4299 + /* Remove it if it's AUTO-DELETE */
4300 + if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
4301 + list_del(&barrier->list);
4302 + up(&barrier->lock);
4304 + up(&barrier_list_lock);
4308 + if (barrier->state == BARRIER_STATE_WAITING)
4309 + wake_up_interruptible(&barrier->waitq);
4311 + up(&barrier->lock);
4312 + up(&barrier_list_lock);
4316 +int kcl_barrier_wait(char *name)
4318 + struct cl_barrier *barrier;
4321 + if (!atomic_read(&cnxman_running))
4325 + kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, 1L);
4327 + down(&barrier_list_lock);
4329 + /* See if it still exists - enable may have deleted it! */
4330 + if (!(barrier = find_barrier(name))) {
4331 + up(&barrier_list_lock);
4335 + down(&barrier->lock);
4337 + up(&barrier_list_lock);
4339 + /* If it has already completed then return the status */
4340 + if (barrier->state == BARRIER_STATE_COMPLETE) {
4341 + up(&barrier->lock);
4342 + return barrier->endreason;
4345 + barrier->state = BARRIER_STATE_WAITING;
4347 + /* Have we all reached the barrier? */
4348 + while (atomic_read(&barrier->completed_nodes) !=
4349 + ((barrier->expected_nodes == 0)
4350 + ? cluster_members : barrier->expected_nodes)
4351 + && barrier->endreason == 0) {
4355 + init_waitqueue_entry(&wq, current);
4356 + init_waitqueue_head(&barrier->waitq);
4358 + /* Wait for em all */
4359 + set_task_state(current, TASK_INTERRUPTIBLE);
4360 + add_wait_queue(&barrier->waitq, &wq);
4362 + if (atomic_read(&barrier->completed_nodes) !=
4363 + ((barrier->expected_nodes ==
4364 + 0) ? cluster_members : barrier->expected_nodes)
4365 + && barrier->endreason == 0) {
4366 + up(&barrier->lock);
4368 + down(&barrier->lock);
4371 + remove_wait_queue(&barrier->waitq, &wq);
4372 + set_task_state(current, TASK_RUNNING);
4374 + if (signal_pending(current)) {
4375 + barrier->endreason = -EINTR;
4379 + barrier->state = BARRIER_STATE_INACTIVE;
4381 + if (barrier->timeout)
4382 + del_timer(&barrier->timer);
4384 + /* Barrier has been reached on all nodes, call the callback */
4385 + if (barrier->callback) {
4386 + barrier->callback(barrier->name, barrier->endreason);
4387 + barrier->callback = NULL;
4390 + atomic_set(&barrier->got_nodes, 0);
4392 + /* Return the reason we were woken */
4393 + ret = barrier->endreason;
4395 + /* Remove it if it's AUTO-DELETE */
4396 + if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
4397 + down(&barrier_list_lock);
4398 + list_del(&barrier->list);
4399 + up(&barrier_list_lock);
4400 + up(&barrier->lock);
4404 + up(&barrier->lock);
4407 + /* We were woken up because the node left the cluster ? */
4408 + if (!atomic_read(&cnxman_running))
4414 +/* This is called from membership services when a node has left the cluster -
4415 + * we signal all waiting barriers with -ESRCH so they know to do something
4416 + * else, if the number of nodes is left at 0 then we compare the new number of
4417 + * nodes in the cluster with that at the barrier and return 0 (success) in that
4419 +void check_barrier_returns()
4421 + struct list_head *blist;
4422 + struct list_head *llist;
4423 + struct cl_barrier *barrier;
4426 + down(&barrier_list_lock);
4427 + list_for_each(blist, &barrier_list) {
4428 + barrier = list_entry(blist, struct cl_barrier, list);
4430 + if (barrier->waitsent) {
4433 + /* Check for a dynamic member barrier */
4434 + if (barrier->expected_nodes == 0) {
4435 + if (barrier->registered_nodes ==
4436 + cluster_members) {
4446 + /* Do we need to tell the barrier? */
4448 + if (barrier->state == BARRIER_STATE_WAITING) {
4449 + barrier->endreason = status;
4450 + wake_up_interruptible(&barrier->waitq);
4453 + if (barrier->callback) {
4454 + barrier->callback(barrier->name,
4461 + up(&barrier_list_lock);
4463 + /* Part 2 check for outstanding listen requests for dead nodes and
4465 + down(&listenreq_lock);
4466 + list_for_each(llist, &listenreq_list) {
4467 + struct cl_waiting_listen_request *lrequest =
4468 + list_entry(llist, struct cl_waiting_listen_request, list);
4469 + struct cluster_node *node =
4470 + find_node_by_nodeid(lrequest->nodeid);
4472 + if (node && node->state != NODESTATE_MEMBER) {
4473 + lrequest->result = -ENOTCONN;
4474 + lrequest->waiting = 0;
4475 + wake_up_interruptible(&lrequest->waitq);
4478 + up(&listenreq_lock);
4481 +int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen)
4483 + struct temp_node *tn;
4484 + int err = 1; /* true */
4486 + char buf[MAX_ADDR_PRINTED_LEN];
4489 + down(&tempnode_lock);
4491 + list_for_each_entry(tn, &tempnode_list, list) {
4492 + if (tn->nodeid == nodeid) {
4493 + memcpy(addr, tn->addr, tn->addrlen);
4494 + *addrlen = tn->addrlen;
4495 + P_COMMS("get_temp_nodeid. id %d:\n: %s\n",
4496 + tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
4504 + up(&tempnode_lock);
4508 +/* Create a new temporary node ID. This list will only ever be very small
4509 + (usually only 1 item) but I can't take the risk that someone won't try to
4510 + boot 128 nodes all at exactly the same time. */
4511 +int new_temp_nodeid(char *addr, int addrlen)
4513 + struct temp_node *tn;
4515 + int try_nodeid = 0;
4517 + char buf[MAX_ADDR_PRINTED_LEN];
4520 + P_COMMS("new_temp_nodeid needed for\n: %s\n",
4521 + print_addr(addr, addrlen, buf));
4523 + down(&tempnode_lock);
4525 + /* First see if we already know about this node */
4526 + list_for_each_entry(tn, &tempnode_list, list) {
4528 + P_COMMS("new_temp_nodeid list. id %d:\n: %s\n",
4529 + tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
4531 + /* We're already in here... */
4532 + if (tn->addrlen == addrlen &&
4533 + memcmp(tn->addr, addr, addrlen) == 0) {
4534 + P_COMMS("reused temp node ID %d\n", tn->nodeid);
4540 + /* Nope, OK, invent a suitable number */
4543 + list_for_each_entry(tn, &tempnode_list, list) {
4545 + if (tn->nodeid == try_nodeid)
4549 + tn = kmalloc(sizeof(struct temp_node), GFP_KERNEL);
4553 + memcpy(tn->addr, addr, addrlen);
4554 + tn->addrlen = addrlen;
4555 + tn->nodeid = try_nodeid;
4556 + list_add_tail(&tn->list, &tempnode_list);
4558 + P_COMMS("new temp nodeid = %d\n", try_nodeid);
4560 + up(&tempnode_lock);
4564 +static int is_valid_temp_nodeid(int nodeid)
4566 + struct temp_node *tn;
4567 + int err = 1; /* true */
4569 + down(&tempnode_lock);
4571 + list_for_each_entry(tn, &tempnode_list, list) {
4572 + if (tn->nodeid == nodeid)
4578 + P_COMMS("is_valid_temp_nodeid. %d = %d\n", nodeid, err);
4579 + up(&tempnode_lock);
4584 + * Remove any temp nodeIDs that refer to now-valid cluster members.
4586 +void purge_temp_nodeids()
4588 + struct temp_node *tn;
4589 + struct temp_node *tmp;
4590 + struct cluster_node *node;
4591 + struct cluster_node_addr *nodeaddr;
4594 + down(&tempnode_lock);
4595 + down(&cluster_members_lock);
4598 + * The ordering of these nested lists is deliberately
4599 + * arranged for the fewest list traversals overall
4602 + /* For each node... */
4603 + list_for_each_entry(node, &cluster_members_list, list) {
4604 + if (node->state == NODESTATE_MEMBER) {
4605 + /* ...We check the temp node ID list... */
4606 + list_for_each_entry_safe(tn, tmp, &tempnode_list, list) {
4608 + /* ...against that node's address */
4609 + list_for_each_entry(nodeaddr, &node->addr_list, list) {
4611 + if (memcmp(nodeaddr->addr, tn->addr, tn->addrlen) == 0) {
4612 + list_del(&tn->list);
4619 + up(&cluster_members_lock);
4620 + up(&tempnode_lock);
4624 +/* Quorum device functions */
4625 +int kcl_register_quorum_device(char *name, int votes)
4627 + if (quorum_device)
4630 + if (find_node_by_name(name))
4633 + quorum_device = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
4634 + if (!quorum_device)
4636 + memset(quorum_device, 0, sizeof (struct cluster_node));
4638 + quorum_device->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
4639 + if (!quorum_device->name) {
4640 + kfree(quorum_device);
4641 + quorum_device = NULL;
4645 + strcpy(quorum_device->name, name);
4646 + quorum_device->votes = votes;
4647 + quorum_device->state = NODESTATE_DEAD;
4649 + /* Keep this list valid so it doesn't confuse other code */
4650 + INIT_LIST_HEAD(&quorum_device->addr_list);
4655 +int kcl_unregister_quorum_device(void)
4657 + if (!quorum_device)
4659 + if (quorum_device->state == NODESTATE_MEMBER)
4662 + quorum_device = NULL;
4667 +int kcl_quorum_device_available(int yesno)
4669 + if (!quorum_device)
4673 + quorum_device->last_hello = jiffies;
4674 + if (quorum_device->state == NODESTATE_DEAD) {
4675 + quorum_device->state = NODESTATE_MEMBER;
4676 + recalculate_quorum(0);
4680 + if (quorum_device->state == NODESTATE_MEMBER) {
4681 + quorum_device->state = NODESTATE_DEAD;
4682 + recalculate_quorum(0);
4689 +/* APIs for cluster ref counting. */
4690 +int kcl_addref_cluster()
4692 + int ret = -ENOTCONN;
4694 + if (!atomic_read(&cnxman_running))
4697 + if (try_module_get(THIS_MODULE)) {
4698 + atomic_inc(&use_count);
4706 +int kcl_releaseref_cluster()
4708 + if (!atomic_read(&cnxman_running))
4710 + atomic_dec(&use_count);
4711 + module_put(THIS_MODULE);
4715 +int kcl_cluster_name(char **cname)
4719 + name = kmalloc(strlen(cluster_name) + 1, GFP_KERNEL);
4723 + strncpy(name, cluster_name, strlen(cluster_name)+1);
4728 +int kcl_get_current_interface(void)
4730 + return current_interface->number;
4733 +/* Socket registration stuff */
4734 +static struct net_proto_family cl_family_ops = {
4735 + .family = AF_CLUSTER,
4736 + .create = cl_create,
4737 + .owner = THIS_MODULE,
4740 +static struct proto_ops cl_proto_ops = {
4741 + .family = AF_CLUSTER,
4743 + .release = cl_release,
4745 + .connect = sock_no_connect,
4746 + .socketpair = sock_no_socketpair,
4747 + .accept = sock_no_accept,
4748 + .getname = cl_getname,
4750 + .ioctl = cl_ioctl,
4751 + .listen = sock_no_listen,
4752 + .shutdown = cl_shutdown,
4753 + .setsockopt = sock_no_setsockopt,
4754 + .getsockopt = sock_no_getsockopt,
4755 + .sendmsg = cl_sendmsg,
4756 + .recvmsg = cl_recvmsg,
4757 + .mmap = sock_no_mmap,
4758 + .sendpage = sock_no_sendpage,
4759 + .owner = THIS_MODULE,
4763 +MODULE_DESCRIPTION("Cluster Connection and Service Manager");
4764 +MODULE_AUTHOR("Red Hat, Inc");
4765 +MODULE_LICENSE("GPL");
4768 +static int __init cluster_init(void)
4770 + printk("CMAN %s (built %s %s) installed\n",
4771 + CMAN_RELEASE_NAME, __DATE__, __TIME__);
4773 + if (sock_register(&cl_family_ops)) {
4774 + printk(KERN_INFO "Unable to register cluster socket type\n");
4778 + /* allocate our sock slab cache */
4779 + cluster_sk_cachep = kmem_cache_create("cluster_sock",
4780 + sizeof (struct cluster_sock), 0,
4781 + SLAB_HWCACHE_ALIGN, 0, 0);
4782 + if (!cluster_sk_cachep) {
4784 + "cluster_init: Cannot create cluster_sock SLAB cache\n");
4785 + sock_unregister(AF_CLUSTER);
4789 +#ifdef CONFIG_PROC_FS
4790 + create_proc_entries();
4793 + init_MUTEX(&start_thread_sem);
4794 + init_MUTEX(&send_lock);
4795 + init_MUTEX(&barrier_list_lock);
4796 + init_MUTEX(&cluster_members_lock);
4797 + init_MUTEX(&port_array_lock);
4798 + init_MUTEX(&messages_list_lock);
4799 + init_MUTEX(&listenreq_lock);
4800 + init_MUTEX(&client_socket_lock);
4801 + init_MUTEX(&new_dead_node_lock);
4802 + init_MUTEX(&event_listener_lock);
4803 + init_MUTEX(&kernel_listener_lock);
4804 + init_MUTEX(&tempnode_lock);
4805 + spin_lock_init(&active_socket_lock);
4806 + init_timer(&ack_timer);
4808 + INIT_LIST_HEAD(&event_listener_list);
4809 + INIT_LIST_HEAD(&kernel_listener_list);
4810 + INIT_LIST_HEAD(&socket_list);
4811 + INIT_LIST_HEAD(&client_socket_list);
4812 + INIT_LIST_HEAD(&active_socket_list);
4813 + INIT_LIST_HEAD(&barrier_list);
4814 + INIT_LIST_HEAD(&messages_list);
4815 + INIT_LIST_HEAD(&listenreq_list);
4816 + INIT_LIST_HEAD(&cluster_members_list);
4817 + INIT_LIST_HEAD(&new_dead_node_list);
4818 + INIT_LIST_HEAD(&tempnode_list);
4820 + atomic_set(&cnxman_running, 0);
4827 +static void __exit cluster_exit(void)
4829 +#ifdef CONFIG_PROC_FS
4830 + cleanup_proc_entries();
4833 + sock_unregister(AF_CLUSTER);
4834 + kmem_cache_destroy(cluster_sk_cachep);
4837 +module_init(cluster_init);
4838 +module_exit(cluster_exit);
4840 +EXPORT_SYMBOL(kcl_sendmsg);
4841 +EXPORT_SYMBOL(kcl_register_read_callback);
4842 +EXPORT_SYMBOL(kcl_add_callback);
4843 +EXPORT_SYMBOL(kcl_remove_callback);
4844 +EXPORT_SYMBOL(kcl_get_members);
4845 +EXPORT_SYMBOL(kcl_get_member_ids);
4846 +EXPORT_SYMBOL(kcl_get_all_members);
4847 +EXPORT_SYMBOL(kcl_is_quorate);
4848 +EXPORT_SYMBOL(kcl_get_node_by_addr);
4849 +EXPORT_SYMBOL(kcl_get_node_by_name);
4850 +EXPORT_SYMBOL(kcl_get_node_by_nodeid);
4851 +EXPORT_SYMBOL(kcl_get_node_addresses);
4852 +EXPORT_SYMBOL(kcl_addref_cluster);
4853 +EXPORT_SYMBOL(kcl_releaseref_cluster);
4854 +EXPORT_SYMBOL(kcl_cluster_name);
4856 +EXPORT_SYMBOL(kcl_barrier_register);
4857 +EXPORT_SYMBOL(kcl_barrier_setattr);
4858 +EXPORT_SYMBOL(kcl_barrier_delete);
4859 +EXPORT_SYMBOL(kcl_barrier_wait);
4860 +EXPORT_SYMBOL(kcl_barrier_cancel);
4862 +EXPORT_SYMBOL(kcl_register_quorum_device);
4863 +EXPORT_SYMBOL(kcl_unregister_quorum_device);
4864 +EXPORT_SYMBOL(kcl_quorum_device_available);
4866 +EXPORT_SYMBOL(kcl_register_service);
4867 +EXPORT_SYMBOL(kcl_unregister_service);
4868 +EXPORT_SYMBOL(kcl_join_service);
4869 +EXPORT_SYMBOL(kcl_leave_service);
4870 +EXPORT_SYMBOL(kcl_global_service_id);
4871 +EXPORT_SYMBOL(kcl_start_done);
4872 +EXPORT_SYMBOL(kcl_get_services);
4873 +EXPORT_SYMBOL(kcl_get_current_interface);
4876 + * Overrides for Emacs so that we follow Linus's tabbing style.
4877 + * Emacs will notice this stuff at the end of the file and automatically
4878 + * adjust the settings for this buffer only. This must remain at the end
4880 + * ---------------------------------------------------------------------------
4881 + * Local variables:
4882 + * c-file-style: "linux"
4885 diff -urN linux-orig/cluster/cman/config.c linux-patched/cluster/cman/config.c
4886 --- linux-orig/cluster/cman/config.c 1970-01-01 07:30:00.000000000 +0730
4887 +++ linux-patched/cluster/cman/config.c 2004-11-03 11:37:37.000000000 +0800
4889 +/******************************************************************************
4890 +*******************************************************************************
4892 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4893 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4895 +** This copyrighted material is made available to anyone wishing to use,
4896 +** modify, copy, or redistribute it subject to the terms and conditions
4897 +** of the GNU General Public License v.2.
4899 +*******************************************************************************
4900 +******************************************************************************/
4902 +#include "config.h"
4904 +/* Config file defaults */
4906 +#define DEFAULT_JOIN_WAIT_TIME 16 /* Time to wait while sending JOINREQ
4907 + * messages. Should be at least twice
4908 + * the HELLO timer, probably 3x */
4909 +#define DEFAULT_JOIN_TIMEOUT 30 /* How long we wait after getting a
4910 + * JOINACK to regarding that node as
4912 +#define DEFAULT_HELLO_TIMER 5 /* Period between HELLO messages */
4913 +#define DEFAULT_DEADNODE_TIMER 21 /* If we don't get a message from a
4914 + * node in this period kill it */
4915 +#define DEFAULT_TRANSITION_TIMER 15 /* Maximum time a state transition
4917 +#define DEFAULT_JOINCONF_TIMER 5 /* Time allowed to a node to respond to
4918 + * a JOINCONF message */
4919 +#define DEFAULT_MAX_NODES 128 /* Max allowed nodes */
4920 +#define DEFAULT_TRANSITION_RESTARTS 10 /* Maximum number of transition
4921 + * restarts before we die */
4922 +#define DEFAULT_SM_DEBUG_SIZE 256 /* Size in bytes of SM debug buffer */
4924 +#define DEFAULT_NEWCLUSTER_TIMEOUT 16 /* Time to send NEWCLUSTER messages */
4926 +struct config_info cman_config = {
4927 + .joinwait_timeout = DEFAULT_JOIN_WAIT_TIME,
4928 + .joinconf_timeout = DEFAULT_JOINCONF_TIMER,
4929 + .join_timeout = DEFAULT_JOIN_TIMEOUT,
4930 + .hello_timer = DEFAULT_HELLO_TIMER,
4931 + .deadnode_timeout = DEFAULT_DEADNODE_TIMER,
4932 + .transition_timeout = DEFAULT_TRANSITION_TIMER,
4933 + .transition_restarts = DEFAULT_TRANSITION_RESTARTS,
4934 + .max_nodes = DEFAULT_MAX_NODES,
4935 + .sm_debug_size = DEFAULT_SM_DEBUG_SIZE,
4936 + .newcluster_timeout = DEFAULT_NEWCLUSTER_TIMEOUT,
4938 diff -urN linux-orig/cluster/cman/config.h linux-patched/cluster/cman/config.h
4939 --- linux-orig/cluster/cman/config.h 1970-01-01 07:30:00.000000000 +0730
4940 +++ linux-patched/cluster/cman/config.h 2004-11-03 11:37:37.000000000 +0800
4942 +/******************************************************************************
4943 +*******************************************************************************
4945 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4946 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4948 +** This copyrighted material is made available to anyone wishing to use,
4949 +** modify, copy, or redistribute it subject to the terms and conditions
4950 +** of the GNU General Public License v.2.
4952 +*******************************************************************************
4953 +******************************************************************************/
4955 +#ifndef __CONFIG_DOT_H__
4956 +#define __CONFIG_DOT_H__
4958 +struct config_info {
4959 + int joinwait_timeout;
4960 + int joinconf_timeout;
4963 + int deadnode_timeout;
4964 + int transition_timeout;
4965 + int transition_restarts;
4967 + int sm_debug_size;
4968 + int newcluster_timeout;
4971 +extern struct config_info cman_config;
4973 +#endif /* __CONFIG_DOT_H__ */
4974 diff -urN linux-orig/cluster/cman/kjoin.c linux-patched/cluster/cman/kjoin.c
4975 --- linux-orig/cluster/cman/kjoin.c 1970-01-01 07:30:00.000000000 +0730
4976 +++ linux-patched/cluster/cman/kjoin.c 2004-11-03 11:37:37.000000000 +0800
4978 +/******************************************************************************
4979 +*******************************************************************************
4981 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4982 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4984 +** This copyrighted material is made available to anyone wishing to use,
4985 +** modify, copy, or redistribute it subject to the terms and conditions
4986 +** of the GNU General Public License v.2.
4988 +*******************************************************************************
4989 +******************************************************************************/
4991 +#include <linux/socket.h>
4992 +#include <net/sock.h>
4993 +#include <linux/list.h>
4994 +#include <cluster/cnxman.h>
4995 +#include <linux/in.h>
4997 +#include "cnxman-private.h"
4999 +static struct socket *mcast_sock;
5000 +static struct socket *recv_sock;
5001 +static struct socket *cluster_sock;
5003 +extern short cluster_id;
5004 +extern int join_count;
5005 +extern struct semaphore join_count_lock;
5006 +extern atomic_t cnxman_running;
5008 +int kcl_join_cluster(struct cl_join_cluster_info *join_info)
5011 + int one = 1, error;
5012 + unsigned int ipaddr = join_info->ipaddr, brdaddr = join_info->brdaddr;
5013 + unsigned short port = join_info->port;
5015 + struct sockaddr_in saddr;
5016 + struct kcl_multicast_sock mcast_info;
5018 + down(&join_count_lock);
5019 + if (atomic_read(&cnxman_running))
5022 + if (join_info->cluster_id == cluster_id)
5026 + up(&join_count_lock);
5029 + up(&join_count_lock);
5031 + result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &mcast_sock);
5034 + printk(KERN_ERR CMAN_NAME ": Can't create Multicast socket\n");
5038 + result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &recv_sock);
5041 + printk(KERN_ERR CMAN_NAME ": Can't create Receive socket\n");
5048 + if ((error = sock_setsockopt(mcast_sock, SOL_SOCKET, SO_BROADCAST,
5049 + (void *) &one, sizeof (int))))
5052 + printk("Error %d Setting master socket to SO_BROADCAST\n",
5054 + sock_release(mcast_sock);
5059 + /* Bind the multicast socket */
5060 + saddr.sin_family = AF_INET;
5061 + saddr.sin_port = htons(port);
5062 + saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
5064 + mcast_sock->ops->bind(mcast_sock, (struct sockaddr *) &saddr,
5068 + printk(KERN_ERR CMAN_NAME ": Can't bind multicast socket\n");
5069 + sock_release(mcast_sock);
5070 + sock_release(recv_sock);
5074 + /* Bind the receive socket to our IP address */
5075 + saddr.sin_family = AF_INET;
5076 + saddr.sin_port = htons(port);
5077 + saddr.sin_addr.s_addr = cpu_to_be32(ipaddr);
5079 + recv_sock->ops->bind(recv_sock, (struct sockaddr *) &saddr,
5083 + printk(KERN_ERR CMAN_NAME ": Can't bind receive socket\n");
5084 + sock_release(mcast_sock);
5085 + sock_release(recv_sock);
5089 + /* Create the cluster master socket */
5091 + sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER, &cluster_sock);
5094 + printk(KERN_ERR CMAN_NAME
5095 + ": Can't create cluster master socket\n");
5096 + sock_release(mcast_sock);
5097 + sock_release(recv_sock);
5101 + /* This is the broadcast transmit address */
5102 + saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
5104 + /* Pass the multicast socket to kernel space */
5105 + mcast_info.sock = mcast_sock;
5106 + mcast_info.number = 1;
5111 + if ((error = cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
5112 + KCL_SET_MULTICAST,
5113 + (void *) &mcast_info,
5114 + sizeof (mcast_info))))
5118 + ": Unable to pass multicast socket to cnxman, %d\n",
5120 + sock_release(mcast_sock);
5121 + sock_release(recv_sock);
5122 + sock_release(cluster_sock);
5126 + mcast_info.sock = recv_sock;
5128 + cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
5130 + (void *) &mcast_info,
5131 + sizeof (mcast_info))))
5135 + ": Unable to pass receive socket to cnxman, %d\n",
5137 + sock_release(mcast_sock);
5138 + sock_release(recv_sock);
5139 + sock_release(cluster_sock);
5143 + /* This setsockopt expects usermode variables */
5145 + if (cluster_sock->ops->
5146 + setsockopt(cluster_sock, CLPROTO_MASTER, CLU_JOIN_CLUSTER,
5147 + (void *) join_info,
5148 + sizeof (struct cl_join_cluster_info)))
5152 + printk(CMAN_NAME ": Unable to join cluster\n");
5153 + sock_release(mcast_sock);
5154 + sock_release(recv_sock);
5155 + sock_release(cluster_sock);
5163 +int kcl_leave_cluster(int remove)
5168 + struct socket *shutdown_sock = cluster_sock;
5170 + cluster_sock = NULL;
5172 + if (!shutdown_sock)
5174 + /* Create the cluster master socket */
5176 + sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER,
5180 + printk(KERN_ERR CMAN_NAME
5181 + ": Can't create cluster master socket\n");
5182 + sock_release(mcast_sock);
5183 + sock_release(recv_sock);
5192 + shutdown_sock->ops->setsockopt(shutdown_sock, CLPROTO_MASTER,
5193 + CLU_LEAVE_CLUSTER, (void *) &rem,
5196 + printk(KERN_ERR CMAN_NAME ": Unable to leave cluster, %d\n",
5201 + sock_release(shutdown_sock);
5207 + * Overrides for Emacs so that we follow Linus's tabbing style.
5208 + * Emacs will notice this stuff at the end of the file and automatically
5209 + * adjust the settings for this buffer only. This must remain at the end
5211 + * ---------------------------------------------------------------------------
5212 + * Local variables:
5213 + * c-file-style: "linux"
5216 diff -urN linux-orig/cluster/cman/membership.c linux-patched/cluster/cman/membership.c
5217 --- linux-orig/cluster/cman/membership.c 1970-01-01 07:30:00.000000000 +0730
5218 +++ linux-patched/cluster/cman/membership.c 2004-11-03 11:37:37.000000000 +0800
5220 +/******************************************************************************
5221 +*******************************************************************************
5223 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5224 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5226 +** This copyrighted material is made available to anyone wishing to use,
5227 +** modify, copy, or redistribute it subject to the terms and conditions
5228 +** of the GNU General Public License v.2.
5230 +*******************************************************************************
5231 +******************************************************************************/
5233 +#include <linux/socket.h>
5234 +#include <net/sock.h>
5235 +#include <linux/slab.h>
5236 +#include <linux/spinlock.h>
5237 +#include <linux/vmalloc.h>
5238 +#include <asm/uaccess.h>
5239 +#include <linux/list.h>
5240 +#include <cluster/cnxman.h>
5242 +#include "cnxman-private.h"
5243 +#include "config.h"
5244 +#include "sm_control.h"
5250 +/* Barrier name for membership transitions. %d is the cluster generation number
5252 +#define MEMBERSHIP_BARRIER_NAME "TRANSITION.%d"
5254 +/* Variables also used by connection manager */
5255 +struct list_head cluster_members_list;
5256 +struct semaphore cluster_members_lock;
5257 +int cluster_members; /* Number of ACTIVE members, not a count of
5258 + * nodes in the list */
5259 +int we_are_a_cluster_member;
5260 +int cluster_is_quorate;
5262 +struct task_struct *membership_task;
5263 +struct cluster_node *us;
5265 +static struct task_struct *hello_task;
5266 +static struct semaphore hello_task_lock;
5268 +/* Variables that belong to the connection manager */
5269 +extern wait_queue_head_t cnxman_waitq;
5270 +extern struct completion member_thread_comp;
5271 +extern struct cluster_node *quorum_device;
5272 +extern unsigned short two_node;
5273 +extern char cluster_name[];
5274 +extern unsigned int config_version;
5275 +extern unsigned int address_length;
5277 +static struct socket *mem_socket;
5278 +static pid_t kcluster_pid;
5280 +static char iobuf[MAX_CLUSTER_MESSAGE];
5281 +static char scratchbuf[MAX_CLUSTER_MESSAGE + 100];
5283 +/* Our node name, usually system_utsname.nodename, but can be overridden */
5284 +char nodename[MAX_CLUSTER_MEMBER_NAME_LEN + 1];
5286 +/* Node ID that we want. defaults of zero means
5287 + * it will be allocated by the cluster join mechanism
5291 +static spinlock_t members_by_nodeid_lock;
5292 +static int sizeof_members_array; /* Can dynamically increase (vmalloc
5294 +static struct cluster_node **members_by_nodeid;
5296 +#define MEMBER_INCREMENT_SIZE 10
5298 +static int votes = 1; /* Votes this node has */
5299 +static int expected_votes = 1; /* Total expected votes in the cluster */
5300 +static unsigned int quorum; /* Quorum, fewer votes than this and we stop
5302 +static int leavereason; /* Saved for the duration of a state transition */
5303 +static int transitionreason; /* Reason this transition was initiated */
5304 +static unsigned int highest_nodeid; /* Highest node ID known to the cluster */
5305 +static struct timer_list transition_timer; /* Kicks in if the transition
5306 + * doesn't complete in a
5307 + * reasonable time */
5308 +static struct timer_list hello_timer; /* Timer to send HELLOs on */
5309 +static unsigned long join_time; /* The time that we got our JOIN-ACK */
5310 +static unsigned long start_time; /* The time that we were started */
5311 +static int joinconf_count; /* Number of JOINCONF messages we have sent to
5313 +static unsigned long wake_flags;/* Reason we were woken */
5315 +/* Flags in above */
5316 +#define WAKE_FLAG_DEADNODE 1
5317 +#define WAKE_FLAG_TRANSTIMER 2
5319 +/* The time the transition finished */
5320 +static unsigned long transition_end_time;
5322 +/* A list of nodes that cnxman tells us are dead. I hope this never has more
5323 + * than one element in it but I can't take that chance. only non-static so it
5324 + * can be initialised in module_load. */
5325 +struct list_head new_dead_node_list;
5326 +struct semaphore new_dead_node_lock;
5328 +static int do_membership_packet(struct msghdr *msg, char *buf, int len);
5329 +static int do_process_joinreq(struct msghdr *msg, char *buf, int len);
5330 +static int do_process_joinack(struct msghdr *msg, char *buf, int len);
5331 +static int do_process_joinconf(struct msghdr *msg, char *buf, int len);
5332 +static int do_process_leave(struct msghdr *msg, char *buf, int len);
5333 +static int do_process_hello(struct msghdr *msg, char *buf, int len);
5334 +static int do_process_kill(struct msghdr *msg, char *buf, int len);
5335 +static int do_process_reconfig(struct msghdr *msg, char *buf, int len);
5336 +static int do_process_starttrans(struct msghdr *msg, char *buf, int len);
5337 +static int do_process_masterview(struct msghdr *msg, char *buf, int len);
5338 +static int do_process_endtrans(struct msghdr *msg, char *buf, int len);
5339 +static int do_process_viewack(struct msghdr *msg, char *buf, int len);
5340 +static int do_process_startack(struct msghdr *msg, char *buf, int len);
5341 +static int do_process_newcluster(struct msghdr *msg, char *buf, int len);
5342 +static int do_process_nominate(struct msghdr *msg, char *buf, int len);
5343 +static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
5344 + unsigned int flags, unsigned int flags2);
5345 +static int send_joinreq(struct sockaddr_cl *addr, int addr_len);
5346 +static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id);
5347 +static int send_hello(void);
5348 +static int send_master_hello(void);
5349 +static int send_newcluster(void);
5350 +static int end_transition(void);
5351 +static int dispatch_messages(struct socket *mem_socket);
5352 +static void check_for_dead_nodes(void);
5353 +static void confirm_joiner(void);
5354 +static void reset_hello_time(void);
5355 +static int add_us(void);
5356 +static int send_joinconf(void);
5357 +static int init_membership_services(void);
5358 +static int elect_master(struct cluster_node **);
5359 +static void trans_timer_expired(unsigned long arg);
5360 +static void hello_timer_expired(unsigned long arg);
5361 +static void join_or_form_cluster(void);
5362 +static int do_timer_wakeup(void);
5363 +static int start_transition(unsigned char reason, struct cluster_node *node);
5364 +static uint32_t low32_of_ip(void);
5365 +int send_leave(unsigned char);
5366 +int send_reconfigure(int, unsigned int);
5369 +static char *msgname(int msg);
5370 +static int debug_sendmsg(struct socket *sock, void *buf, int size,
5371 + struct sockaddr_cl *caddr, int addr_len,
5372 + unsigned int flags)
5374 + P_MEMB("%ld: sending %s, len=%d\n", jiffies, msgname(((char *) buf)[0]),
5376 + return kcl_sendmsg(sock, buf, size, caddr, addr_len, flags);
5379 +#define kcl_sendmsg debug_sendmsg
5382 +/* State of the node */
5383 +static enum { STARTING, NEWCLUSTER, JOINING, JOINWAIT, JOINACK, TRANSITION,
5384 + TRANSITION_COMPLETE, MEMBER, REJECTED, LEFT_CLUSTER, MASTER
5385 +} node_state = LEFT_CLUSTER;
5387 +/* Sub-state when we are MASTER */
5388 +static enum { MASTER_START, MASTER_COLLECT, MASTER_CONFIRM,
5389 + MASTER_COMPLETE } master_state;
5391 +/* Number of responses collected while a master controlling a state transition */
5392 +static int responses_collected;
5393 +static int responses_expected;
5395 +/* Current cluster generation number */
5396 +int cluster_generation = 1;
5398 +/* When another node initiates a transtion then store it's pointer in here so
5399 + * we can check for other nodes trying to spoof us */
5400 +static struct cluster_node *master_node = NULL;
5402 +/* Struct the node wanting to join us */
5403 +static struct cluster_node *joining_node = NULL;
5404 +static int joining_temp_nodeid;
5406 +/* Last time a HELLO message was sent */
5407 +unsigned long last_hello;
5409 +/* When we got our JOINWAIT or NEWCLUSTER */
5410 +unsigned long joinwait_time;
5412 +/* Number of times a transition has restarted when we were master */
5413 +int transition_restarts;
5415 +/* Variables used by the master to collect cluster status during a transition */
5416 +static int agreeing_nodes;
5417 +static int dissenting_nodes;
5418 +static uint8_t *node_opinion = NULL;
5419 +#define OPINION_AGREE 1
5420 +#define OPINION_DISAGREE 2
5422 +/* Set node id of a node, also add it to the members array and expand the array
5424 +static inline void set_nodeid(struct cluster_node *node, int nodeid)
5429 + node->node_id = nodeid;
5430 + if (nodeid >= sizeof_members_array) {
5431 + int new_size = sizeof_members_array + MEMBER_INCREMENT_SIZE;
5432 + struct cluster_node **new_array;
5434 + if (new_size < nodeid)
5435 + new_size = nodeid + MEMBER_INCREMENT_SIZE;
5437 + new_array = vmalloc((new_size) * sizeof (struct cluster_node *));
5439 + spin_lock(&members_by_nodeid_lock);
5440 + memcpy(new_array, members_by_nodeid,
5441 + sizeof_members_array *
5442 + sizeof (struct cluster_node *));
5443 + memset(&new_array[sizeof_members_array], 0,
5444 + (new_size - sizeof_members_array) *
5445 + sizeof (struct cluster_node *));
5446 + vfree(members_by_nodeid);
5448 + members_by_nodeid = new_array;
5449 + sizeof_members_array = new_size;
5450 + spin_unlock(&members_by_nodeid_lock);
5453 + panic("No memory for more nodes");
5456 + notify_kernel_listeners(NEWNODE, (long) nodeid);
5458 + spin_lock(&members_by_nodeid_lock);
5459 + members_by_nodeid[nodeid] = node;
5460 + spin_unlock(&members_by_nodeid_lock);
5463 +static int hello_kthread(void *unused)
5465 + struct task_struct *tsk = current;
5468 + daemonize("cman_hbeat");
5470 + /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
5471 + siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
5472 + sigprocmask(SIG_BLOCK, &tmpsig, NULL);
5474 + down(&hello_task_lock);
5476 + up(&hello_task_lock);
5478 + set_user_nice(current, -6);
5480 + while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
5482 + /* Scan the nodes list for dead nodes */
5483 + if (node_state == MEMBER)
5484 + check_for_dead_nodes();
5486 + set_task_state(current, TASK_INTERRUPTIBLE);
5488 + set_task_state(current, TASK_RUNNING);
5490 + if (node_state != REJECTED && node_state != LEFT_CLUSTER)
5493 + down(&hello_task_lock);
5494 + hello_task = NULL;
5495 + up(&hello_task_lock);
5496 + P_MEMB("heartbeat closing down\n");
5500 +/* This is the membership "daemon". A client of cnxman (but symbiotic with it)
5501 + * that keeps track of and controls cluster membership. */
5502 +static int membership_kthread(void *unused)
5504 + struct task_struct *tsk = current;
5507 + daemonize("cman_memb");
5509 + /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
5510 + siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
5511 + sigprocmask(SIG_BLOCK, &tmpsig, NULL);
5513 + membership_task = tsk;
5514 + set_user_nice(current, -5);
5516 + /* Open the socket */
5517 + if (init_membership_services())
5521 + joining_node = us;
5523 + init_timer(&hello_timer);
5524 + hello_timer.function = hello_timer_expired;
5525 + hello_timer.data = 0L;
5527 + /* Do joining stuff */
5528 + join_or_form_cluster();
5530 + transition_end_time = jiffies;
5533 + while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
5535 + struct task_struct *tsk = current;
5537 + DECLARE_WAITQUEUE(wait, tsk);
5539 + tsk->state = TASK_INTERRUPTIBLE;
5540 + add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5542 + if (!skb_peek(&mem_socket->sk->sk_receive_queue) &&
5543 + wake_flags == 0) {
5544 + if (node_state == JOINACK ||
5545 + node_state == JOINWAIT)
5546 + schedule_timeout(HZ);
5551 + tsk->state = TASK_RUNNING;
5552 + remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5554 + /* Are we being shut down? */
5555 + if (node_state == LEFT_CLUSTER || quit_threads ||
5556 + signal_pending(current))
5559 + /* Were we woken by a dead node passed down from cnxman ? */
5560 + if (test_and_clear_bit(WAKE_FLAG_DEADNODE, &wake_flags)) {
5561 + struct list_head *nodelist, *tmp;
5562 + struct cl_new_dead_node *deadnode;
5564 + down(&new_dead_node_lock);
5565 + list_for_each_safe(nodelist, tmp, &new_dead_node_list) {
5567 + list_entry(nodelist,
5568 + struct cl_new_dead_node, list);
5570 + if (deadnode->node->state == NODESTATE_MEMBER)
5571 + a_node_just_died(deadnode->node);
5572 + list_del(&deadnode->list);
5575 + up(&new_dead_node_lock);
5578 + /* Process received messages. If dispatch_message() returns an
5579 + * error then we shut down */
5580 + if (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5581 + if (dispatch_messages(mem_socket) < 0)
5582 + goto leave_cluster;
5586 + /* Were we woken by the transition timer firing ? */
5587 + if (test_and_clear_bit(WAKE_FLAG_TRANSTIMER, &wake_flags)) {
5588 + switch (do_timer_wakeup()) {
5594 + goto leave_cluster;
5598 + /* Got a JOINACK but no JOIN-CONF, start waiting for HELLO
5599 + * messages again */
5600 + if (node_state == JOINACK &&
5601 + time_after(jiffies,
5602 + join_time + cman_config.join_timeout * HZ)) {
5604 + ("Waited a long time for a join-conf, going back to JOINWAIT state\n");
5605 + node_state = JOINWAIT;
5606 + joinwait_time = jiffies;
5609 + /* Have we had an ACK for our JOINREQ message ? */
5610 + if (node_state == JOINING &&
5611 + time_after(jiffies,
5612 + join_time + cman_config.join_timeout * HZ)) {
5613 + P_MEMB("didn't get JOINACK, going back to JOINWAIT\n");
5614 + node_state = JOINWAIT;
5615 + joinwait_time = jiffies;
5618 + /* Have we been in joinwait for too long... */
5619 + if (node_state == JOINWAIT &&
5620 + time_after(jiffies,
5621 + joinwait_time + cman_config.joinwait_timeout * HZ)) {
5623 + ": Been in JOINWAIT for too long - giving up\n");
5624 + goto leave_cluster;
5630 + /* Wake up the heartbeat thread so it can exit */
5631 + down(&hello_task_lock);
5633 + wake_up_process(hello_task);
5634 + up(&hello_task_lock);
5636 + if (timer_pending(&hello_timer))
5637 + del_timer(&hello_timer);
5639 + if (timer_pending(&transition_timer))
5640 + del_timer(&transition_timer);
5642 + node_state = LEFT_CLUSTER;
5643 + P_MEMB("closing down\n");
5644 + quit_threads = 1; /* force other thread to exit too */
5646 + send_leave(us->leave_reason);
5647 + sock_release(mem_socket);
5648 + highest_nodeid = 0;
5649 + complete(&member_thread_comp);
5653 +/* Things to do in the main thread when the transition timer has woken us.
5654 + * Usually this happens when a transition is taking too long and we need to
5655 + * take remedial action.
5657 + * returns: -1 continue; 0 carry on processing +1 leave cluster; */
5658 +static int do_timer_wakeup()
5660 + P_MEMB("Timer wakeup - checking for dead master node %ld\n", jiffies);
5662 + /* Resend JOINCONF if it got lost on the wire */
5663 + if (node_state == MASTER && master_state == MASTER_CONFIRM) {
5664 + mod_timer(&transition_timer,
5665 + jiffies + cman_config.joinconf_timeout * HZ);
5666 + if (++joinconf_count < MAX_RETRIES) {
5667 + P_MEMB("Resending JOINCONF\n");
5671 + P_MEMB("JOINCONF not acked, cancelling transition\n");
5677 + /* A joining node probably died */
5678 + if (cluster_members == 1) {
5683 + /* See if the master is still there */
5684 + if (node_state == TRANSITION || node_state == TRANSITION_COMPLETE) {
5686 + /* If we are in transition and master_node is NULL then we are
5687 + * waiting for ENDTRANS after JOIN-CONF */
5688 + if (!master_node) {
5689 + /* Hmmm. master died after sending JOINCONF, we'll have
5690 + * to die as we are in mid-transition */
5691 + printk(KERN_INFO CMAN_NAME
5692 + ": Master died after JOINCONF, we must leave the cluster\n");
5697 + /* No messages from the master - see if it's stil there */
5698 + if (master_node->state == NODESTATE_MEMBER) {
5699 + send_master_hello();
5700 + mod_timer(&transition_timer,
5702 + cman_config.transition_timeout * HZ);
5705 + /* If the master is dead then elect a new one */
5706 + if (master_node->state == NODESTATE_DEAD) {
5708 + struct cluster_node *node;
5710 + P_MEMB("Master node is dead...Election!\n");
5711 + if (elect_master(&node)) {
5713 + /* We are master now, all kneel */
5714 + start_transition(TRANS_DEADMASTER, master_node);
5717 + /* Leave the job to someone on more pay */
5718 + master_node = node;
5719 + mod_timer(&transition_timer,
5721 + cman_config.transition_timeout * HZ);
5726 + /* If we are the master node then restart the transition */
5727 + if (node_state == MASTER) {
5728 + start_transition(TRANS_RESTART, us);
5734 +static void form_cluster(void)
5736 + printk(KERN_INFO CMAN_NAME ": forming a new cluster\n");
5737 + node_state = MEMBER;
5738 + we_are_a_cluster_member = TRUE;
5739 + us->state = NODESTATE_MEMBER;
5740 + if (wanted_nodeid)
5741 + set_nodeid(us, wanted_nodeid);
5743 + set_nodeid(us, 1);
5744 + recalculate_quorum(0);
5745 + sm_member_update(cluster_is_quorate);
5747 + kernel_thread(hello_kthread, NULL, 0);
5748 + mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
5751 +/* This does the initial JOIN part of the membership process. Actually most of
5752 + * is done in the message processing routines but this is the main loop that
5753 + * controls it. The side-effect of this routine is "node_state" which tells the
5754 + * real main loop (in the kernel thread routine) what to do next */
5755 +static void join_or_form_cluster()
5757 + start_time = jiffies;
5759 + printk(KERN_INFO CMAN_NAME
5760 + ": Waiting to join or form a Linux-cluster\n");
5764 + start_time = jiffies;
5765 + joinwait_time = jiffies;
5768 + /* Listen for HELLO or NEWCLUSTER messages */
5770 + DECLARE_WAITQUEUE(wait, current);
5771 + set_task_state(current, TASK_INTERRUPTIBLE);
5772 + add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5774 + if (!skb_peek(&mem_socket->sk->sk_receive_queue))
5775 + schedule_timeout((cman_config.joinwait_timeout * HZ) /
5778 + set_task_state(current, TASK_RUNNING);
5779 + remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5781 + while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5782 + dispatch_messages(mem_socket);
5785 + node_state = LEFT_CLUSTER;
5788 + while (time_before(jiffies, start_time + cman_config.joinwait_timeout * HZ) &&
5789 + node_state == STARTING);
5791 + if (node_state == STARTING) {
5792 + start_time = jiffies;
5793 + joinwait_time = jiffies;
5794 + node_state = NEWCLUSTER;
5797 + /* If we didn't hear any HELLO messages then start sending NEWCLUSTER messages */
5798 + while (time_before(jiffies, start_time + cman_config.newcluster_timeout * HZ) &&
5799 + node_state == NEWCLUSTER) {
5801 + DECLARE_WAITQUEUE(wait, current);
5803 + send_newcluster();
5805 + set_task_state(current, TASK_INTERRUPTIBLE);
5806 + add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5808 + if (!skb_peek(&mem_socket->sk->sk_receive_queue))
5809 + schedule_timeout((cman_config.joinwait_timeout * HZ) /
5812 + set_task_state(current, TASK_RUNNING);
5813 + remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5815 + while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5816 + dispatch_messages(mem_socket);
5818 + /* Did we get a lower "NEWCLUSTER" message ? */
5819 + if (node_state == STARTING) {
5820 + P_MEMB("NEWCLUSTER: restarting joinwait\n");
5821 + goto restart_joinwait;
5825 + node_state = LEFT_CLUSTER;
5830 + /* If we didn't hear any HELLO messages then form a new cluster */
5831 + if (node_state == NEWCLUSTER) {
5835 + last_hello = jiffies;
5839 +int start_membership_services(pid_t cluster_pid)
5841 + kcluster_pid = cluster_pid;
5843 + init_timer(&transition_timer);
5844 + transition_timer.function = trans_timer_expired;
5845 + transition_timer.data = 0L;
5847 + /* Start the thread */
5848 + return kernel_thread(membership_kthread, NULL, 0);
5851 +static int init_membership_services()
5854 + struct sockaddr_cl saddr;
5855 + struct socket *sock;
5857 + init_MUTEX(&hello_task_lock);
5858 + /* Create a socket to communicate with */
5859 + result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
5861 + printk(KERN_ERR CMAN_NAME
5862 + ": Can't create cluster socket for membership services\n");
5865 + mem_socket = sock;
5867 + /* Bind to our port */
5868 + saddr.scl_family = AF_CLUSTER;
5869 + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5871 + sock->ops->bind(sock, (struct sockaddr *) &saddr, sizeof (saddr));
5873 + printk(KERN_ERR CMAN_NAME
5874 + ": Can't bind to cluster membership services port\n");
5875 + sock_release(sock);
5879 + node_state = STARTING;
5883 +static int send_joinconf()
5885 + struct sockaddr_cl saddr;
5888 + if (joining_temp_nodeid == 0) {
5892 + master_state = MASTER_CONFIRM;
5893 + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5894 + saddr.scl_family = AF_CLUSTER;
5895 + saddr.scl_nodeid = joining_temp_nodeid;
5896 + status = send_cluster_view(CLUSTER_MEM_JOINCONF, &saddr,
5900 + printk("Error %d sending JOINCONF, aborting transition\n", status);
5906 +static int send_joinreq(struct sockaddr_cl *addr, int addr_len)
5908 + char *msgbuf = scratchbuf;
5909 + struct list_head *addrlist;
5910 + int ptr = sizeof (struct cl_mem_join_msg);
5911 + unsigned short num_addr = 0;
5912 + struct cluster_node_addr *nodeaddr;
5913 + struct cl_mem_join_msg *msg = (struct cl_mem_join_msg *) msgbuf;
5915 + msg->cmd = CLUSTER_MEM_JOINREQ;
5916 + msg->votes = votes;
5917 + msg->expected_votes = cpu_to_le32(expected_votes);
5918 + msg->nodeid = cpu_to_le32(wanted_nodeid);
5919 + msg->major_version = cpu_to_le32(CNXMAN_MAJOR_VERSION);
5920 + msg->minor_version = cpu_to_le32(CNXMAN_MINOR_VERSION);
5921 + msg->patch_version = cpu_to_le32(CNXMAN_PATCH_VERSION);
5922 + msg->config_version = cpu_to_le32(config_version);
5923 + msg->addr_len = cpu_to_le32(address_length);
5924 + strcpy(msg->clustername, cluster_name);
5926 + /* Add our addresses */
5927 + list_for_each(addrlist, &us->addr_list) {
5928 + nodeaddr = list_entry(addrlist, struct cluster_node_addr, list);
5930 + memcpy(msgbuf + ptr, nodeaddr->addr, address_length);
5931 + ptr += address_length;
5934 + msg->num_addr = cpu_to_le16(num_addr);
5936 + /* And our name */
5937 + strcpy(msgbuf + ptr, nodename);
5938 + ptr += strlen(nodename) + 1;
5940 + return kcl_sendmsg(mem_socket, msgbuf, ptr,
5941 + addr, addr_len, MSG_NOACK);
5944 +static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id)
5946 + struct cl_mem_startack_msg msg;
5948 + msg.cmd = CLUSTER_MEM_STARTACK;
5949 + msg.generation = cpu_to_le32(cluster_generation);
5950 + msg.node_id = cpu_to_le32(node_id);
5951 + msg.highest_node_id = cpu_to_le32(get_highest_nodeid());
5953 + return kcl_sendmsg(mem_socket, &msg, sizeof (msg), addr, addr_len, MSG_REPLYEXP);
5956 +static int send_newcluster()
5961 + buf[0] = CLUSTER_MEM_NEWCLUSTER;
5962 + lowip = cpu_to_le32(low32_of_ip());
5963 + memcpy(&buf[1], &lowip, sizeof(lowip));
5965 + return kcl_sendmsg(mem_socket, buf, sizeof(uint32_t)+1,
5970 +static int send_hello()
5972 + struct cl_mem_hello_msg hello_msg;
5975 + hello_msg.cmd = CLUSTER_MEM_HELLO;
5976 + hello_msg.members = cpu_to_le16(cluster_members);
5977 + hello_msg.flags = cluster_is_quorate ? HELLO_FLAG_QUORATE : 0;
5978 + hello_msg.generation = cpu_to_le32(cluster_generation);
5980 + status = kcl_sendmsg(mem_socket, &hello_msg,
5981 + sizeof(struct cl_mem_hello_msg),
5982 + NULL, 0, MSG_NOACK | MSG_ALLINT);
5984 + last_hello = jiffies;
5989 +/* This is a special HELLO message that requires an ACK. clients in transition
5990 + * send these to the master to check it is still alive. If it does not ACK then
5991 + * cnxman will signal it dead and we can restart the transition */
5992 +static int send_master_hello()
5994 + struct cl_mem_hello_msg hello_msg;
5996 + struct sockaddr_cl saddr;
5998 + hello_msg.cmd = CLUSTER_MEM_HELLO;
5999 + hello_msg.members = cpu_to_le16(cluster_members);
6000 + hello_msg.flags = HELLO_FLAG_MASTER |
6001 + (cluster_is_quorate ? HELLO_FLAG_QUORATE : 0);
6002 + hello_msg.generation = cpu_to_le32(cluster_generation);
6004 + saddr.scl_family = AF_CLUSTER;
6005 + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
6006 + saddr.scl_nodeid = master_node->node_id;
6008 + status = kcl_sendmsg(mem_socket, &hello_msg,
6009 + sizeof(struct cl_mem_hello_msg),
6010 + &saddr, sizeof (saddr), 0);
6012 + last_hello = jiffies;
6017 +/* Called when the transition timer has expired, meaning we sent a transition
6018 + * message that was not ACKed */
6019 +static void trans_timer_expired(unsigned long arg)
6021 + P_MEMB("Transition timer fired %ld\n", jiffies);
6023 + set_bit(WAKE_FLAG_TRANSTIMER, &wake_flags);
6024 + wake_up_process(membership_task);
6027 +static void hello_timer_expired(unsigned long arg)
6029 + P_MEMB("Hello timer fired %ld\n", jiffies);
6031 + mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
6033 + if (node_state >= TRANSITION) {
6034 + wake_up_process(hello_task);
6038 +static int wait_for_completion_barrier(void)
6041 + char barriername[MAX_BARRIER_NAME_LEN];
6043 + sprintf(barriername, MEMBERSHIP_BARRIER_NAME, cluster_generation);
6045 + /* Make sure we all complete together */
6046 + P_MEMB("Waiting for completion barrier: %d members\n", cluster_members);
6048 + kcl_barrier_register(barriername, 0, cluster_members)) < 0) {
6049 + printk(CMAN_NAME ": Error registering barrier: %d\n", status);
6052 + kcl_barrier_setattr(barriername, BARRIER_SETATTR_TIMEOUT,
6053 + cman_config.transition_timeout);
6054 + status = kcl_barrier_wait(barriername);
6055 + kcl_barrier_delete(barriername);
6057 + P_MEMB("Completion barrier reached : status = %d\n", status);
6061 +/* Called at the end of a state transition when we are the master */
6062 +static int end_transition()
6064 + struct cl_mem_endtrans_msg msg;
6068 + /* Cancel the timer */
6069 + del_timer(&transition_timer);
6073 + quorum = calculate_quorum(leavereason, 0, &total_votes);
6075 + msg.cmd = CLUSTER_MEM_ENDTRANS;
6076 + msg.quorum = cpu_to_le32(quorum);
6077 + msg.generation = cpu_to_le32(++cluster_generation);
6078 + msg.total_votes = cpu_to_le32(total_votes);
6079 + if (joining_node && transitionreason == TRANS_NEWNODE) {
6080 + msg.new_node_id = cpu_to_le32(joining_node->node_id);
6083 + msg.new_node_id = 0;
6085 + status = kcl_sendmsg(mem_socket, &msg, sizeof (msg), NULL, 0, 0);
6087 + /* When that's all settled down, do the transition completion barrier */
6088 + kcl_wait_for_all_acks();
6090 + if (wait_for_completion_barrier() != 0) {
6091 + P_MEMB("Barrier timed out - restart\n");
6092 + start_transition(TRANS_RESTART, us);
6096 + joining_temp_nodeid = 0;
6097 + purge_temp_nodeids();
6099 + set_quorate(total_votes);
6101 + notify_listeners();
6102 + reset_hello_time();
6104 + /* Tell any waiting barriers that we had a transition */
6105 + check_barrier_returns();
6108 + node_state = MEMBER;
6109 + transition_end_time = jiffies;
6111 + sm_member_update(cluster_is_quorate);
6116 +int send_reconfigure(int param, unsigned int value)
6119 + struct cl_mem_reconfig_msg *msg =
6120 + (struct cl_mem_reconfig_msg *) &msgbuf;
6122 + if (param == RECONFIG_PARAM_EXPECTED_VOTES && expected_votes > value)
6123 + expected_votes = value;
6125 + msg->cmd = CLUSTER_MEM_RECONFIG;
6126 + msg->param = param;
6127 + msg->value = cpu_to_le32(value);
6129 + return kcl_sendmsg(mem_socket, &msgbuf, sizeof (*msg), NULL, 0, 0);
6132 +static int send_joinack(char *addr, int addr_len, unsigned char acktype)
6134 + struct cl_mem_joinack_msg msg;
6136 + msg.cmd = CLUSTER_MEM_JOINACK;
6137 + msg.acktype = acktype;
6139 + return kcl_sendmsg(mem_socket, &msg, sizeof (msg),
6140 + (struct sockaddr_cl *)addr, addr_len, MSG_NOACK);
6143 +/* Only send a leave message to one node in the cluster so that it can master
6144 + * the state transition, otherwise we get a "thundering herd" of potential
6145 + * masters fighting it out */
6146 +int send_leave(unsigned char flags)
6148 + unsigned char msg[2];
6149 + struct sockaddr_cl saddr;
6150 + struct cluster_node *node = NULL;
6156 + saddr.scl_family = AF_CLUSTER;
6157 + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
6159 + /* If we are in transition then use the current master */
6160 + if (node_state == TRANSITION) {
6161 + node = master_node;
6164 + /* If we are the master or not in transition then pick a node
6165 + * almost at random */
6166 + struct list_head *nodelist;
6168 + down(&cluster_members_lock);
6169 + list_for_each(nodelist, &cluster_members_list) {
6170 + node = list_entry(nodelist, struct cluster_node, list);
6172 + if (node->state == NODESTATE_MEMBER && !node->us)
6175 + up(&cluster_members_lock);
6178 + /* we are the only member of the cluster - there is no-one to tell */
6179 + if (node && !node->us) {
6180 + saddr.scl_nodeid = node->node_id;
6182 + P_MEMB("Sending LEAVE to %s\n", node->name);
6183 + msg[0] = CLUSTER_MEM_LEAVE;
6185 + status = kcl_sendmsg(mem_socket, msg, 2,
6186 + &saddr, sizeof (saddr),
6193 + node_state = LEFT_CLUSTER;
6194 + wake_up_process(membership_task);
6198 +int send_kill(int nodeid)
6201 + struct sockaddr_cl saddr;
6203 + killmsg = CLUSTER_MEM_KILL;
6205 + saddr.scl_family = AF_CLUSTER;
6206 + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
6207 + saddr.scl_nodeid = nodeid;
6208 + return kcl_sendmsg(mem_socket, &killmsg, 1, &saddr,
6209 + sizeof (struct sockaddr_cl), MSG_NOACK);
6212 +/* Process a message */
6213 +static int do_membership_packet(struct msghdr *msg, char *buf, int len)
6216 + struct sockaddr_cl *saddr = msg->msg_name;
6217 + struct cluster_node *node;
6219 + node = find_node_by_nodeid(saddr->scl_nodeid);
6221 + P_MEMB("got membership message : %s, from (%d) %s, len = %d\n",
6222 + msgname(*buf), saddr->scl_nodeid, node ? node->name : "unknown", len);
6225 + case CLUSTER_MEM_JOINREQ:
6226 + result = do_process_joinreq(msg, buf, len);
6229 + case CLUSTER_MEM_LEAVE:
6230 + if (we_are_a_cluster_member)
6231 + result = do_process_leave(msg, buf, len);
6234 + case CLUSTER_MEM_HELLO:
6235 + result = do_process_hello(msg, buf, len);
6238 + case CLUSTER_MEM_KILL:
6239 + if (we_are_a_cluster_member)
6240 + result = do_process_kill(msg, buf, len);
6243 + case CLUSTER_MEM_JOINCONF:
6244 + if (node_state == JOINACK) {
6245 + do_process_joinconf(msg, buf, len);
6249 + case CLUSTER_MEM_CONFACK:
6250 + if (node_state == MASTER && master_state == MASTER_CONFIRM) {
6255 + case CLUSTER_MEM_MASTERVIEW:
6256 + if (node_state == TRANSITION)
6257 + do_process_masterview(msg, buf, len);
6260 + case CLUSTER_MEM_JOINACK:
6261 + if (node_state == JOINING || node_state == JOINWAIT ||
6262 + node_state == JOINACK) {
6263 + do_process_joinack(msg, buf, len);
6266 + case CLUSTER_MEM_RECONFIG:
6267 + if (we_are_a_cluster_member) {
6268 + do_process_reconfig(msg, buf, len);
6272 + case CLUSTER_MEM_STARTTRANS:
6273 + result = do_process_starttrans(msg, buf, len);
6276 + case CLUSTER_MEM_ENDTRANS:
6277 + result = do_process_endtrans(msg, buf, len);
6280 + case CLUSTER_MEM_VIEWACK:
6281 + if (node_state == MASTER && master_state == MASTER_COLLECT)
6282 + result = do_process_viewack(msg, buf, len);
6285 + case CLUSTER_MEM_STARTACK:
6286 + if (node_state == MASTER)
6287 + result = do_process_startack(msg, buf, len);
6290 + case CLUSTER_MEM_NEWCLUSTER:
6291 + result = do_process_newcluster(msg, buf, len);
6294 + case CLUSTER_MEM_NOMINATE:
6295 + if (node_state != MASTER)
6296 + result = do_process_nominate(msg, buf, len);
6300 + printk(KERN_ERR CMAN_NAME
6301 + ": Unknown membership services message %d received from node %d port %d\n",
6302 + *buf, saddr->scl_nodeid, saddr->scl_port);
6309 +/* Returns -ve to reject membership of the cluster, 0 to accept membership, +ve
6310 + * to ignore request (node already joining) */
6311 +static int check_duplicate_node(char *name, struct msghdr *msg, int len)
6313 + struct cluster_node *node;
6314 + struct sockaddr_cl *saddr = (struct sockaddr_cl *)msg->msg_name;
6315 + char addr[address_length];
6318 + if (strlen(name) >= MAX_CLUSTER_MEMBER_NAME_LEN)
6321 + /* See if we already have a cluster member with that name... */
6322 + node = find_node_by_name(name);
6323 + if (node && node->state != NODESTATE_DEAD) {
6325 + if (node->state == NODESTATE_JOINING)
6328 + printk(KERN_WARNING CMAN_NAME
6329 + ": Rejecting cluster membership application from %s - already have a node with that name\n",
6335 + /* Need to check the node's address too */
6336 + if (get_addr_from_temp_nodeid(saddr->scl_nodeid, addr, &addrlen) &&
6337 + (node = find_node_by_addr(addr, addrlen)) &&
6338 + node->state != NODESTATE_DEAD) {
6340 + if (node->state == NODESTATE_JOINING)
6343 + printk(KERN_WARNING CMAN_NAME
6344 + ": Rejecting cluster membership application from %s - already have a node with that address\n",
6351 +/* Start the state transition */
6352 +static int start_transition(unsigned char reason, struct cluster_node *node)
6354 + char *startbuf = scratchbuf;
6355 + struct cl_mem_starttrans_msg *msg =
6356 + (struct cl_mem_starttrans_msg *) startbuf;
6358 + P_MEMB("Start transition - reason = %d\n", reason);
6360 + /* If this is a restart then zero the counters */
6361 + if (reason == TRANS_RESTART) {
6362 + agreeing_nodes = 0;
6363 + dissenting_nodes = 0;
6364 + if (node_opinion) {
6365 + kfree(node_opinion);
6366 + node_opinion = NULL;
6368 + responses_collected = 0;
6371 + /* If we have timed out too many times then just die */
6372 + if (reason == TRANS_RESTART
6373 + && ++transition_restarts > cman_config.transition_restarts) {
6374 + printk(KERN_WARNING CMAN_NAME
6375 + ": too many transition restarts - will die\n");
6376 + us->leave_reason = CLUSTER_LEAVEFLAG_INCONSISTENT;
6377 + node_state = LEFT_CLUSTER;
6379 + wake_up_process(membership_task);
6380 + wake_up_interruptible(&cnxman_waitq);
6383 + if (reason != TRANS_RESTART)
6384 + transition_restarts = 0;
6386 + /* Only keep the original state transition reason in the global
6388 + if (reason != TRANS_ANOTHERREMNODE && reason != TRANS_NEWMASTER &&
6389 + reason != TRANS_RESTART && reason != TRANS_DEADMASTER)
6390 + transitionreason = reason;
6392 + /* Save the info of the requesting node */
6393 + if (reason == TRANS_NEWNODE)
6394 + joining_node = node;
6396 + node_state = MASTER;
6397 + master_state = MASTER_START;
6398 + responses_collected = 0;
6399 + responses_expected = cluster_members - 1;
6401 + /* If we are on our own then just do it */
6402 + if (responses_expected == 0) {
6403 + P_MEMB("We are on our own...lonely here\n");
6404 + responses_collected--;
6405 + do_process_startack(NULL, NULL, 0);
6408 + int ptr = sizeof (struct cl_mem_starttrans_msg);
6409 + struct list_head *addrlist;
6410 + unsigned short num_addrs = 0;
6411 + int flags = MSG_REPLYEXP;
6413 + /* Send the STARTTRANS message */
6414 + msg->cmd = CLUSTER_MEM_STARTTRANS;
6415 + msg->reason = reason;
6416 + msg->votes = node->votes;
6417 + msg->expected_votes = cpu_to_le32(node->expected_votes);
6418 + msg->generation = cpu_to_le32(++cluster_generation);
6419 + msg->nodeid = cpu_to_le32(node->node_id);
6421 + if (reason == TRANS_NEWNODE) {
6422 + /* Add the addresses */
6423 + list_for_each(addrlist, &node->addr_list) {
6424 + struct cluster_node_addr *nodeaddr =
6425 + list_entry(addrlist,
6426 + struct cluster_node_addr, list);
6428 + memcpy(startbuf + ptr, nodeaddr->addr,
6430 + ptr += address_length;
6434 + /* And the name */
6435 + strcpy(startbuf + ptr, node->name);
6436 + ptr += strlen(node->name) + 1;
6439 + /* If another node died then we must queue the STARTTRANS
6440 + * messages so that membershipd can carry on processing the
6441 + * other replies */
6442 + if (reason == TRANS_ANOTHERREMNODE)
6443 + flags |= MSG_QUEUE;
6445 + msg->num_addrs = cpu_to_le16(num_addrs);
6446 + kcl_sendmsg(mem_socket, msg, ptr, NULL, 0, flags);
6448 + /* Set a timer in case we don't get 'em all back */
6449 + mod_timer(&transition_timer,
6450 + jiffies + cman_config.transition_timeout * HZ);
6454 +/* A node has died - decide what to do */
6455 +void a_node_just_died(struct cluster_node *node)
6457 + /* If we are not in the context of kmembershipd then stick it on the
6458 + * list and wake it */
6459 + if (current != membership_task) {
6460 + struct cl_new_dead_node *newnode =
6461 + kmalloc(sizeof (struct cl_new_dead_node), GFP_KERNEL);
6464 + newnode->node = node;
6465 + down(&new_dead_node_lock);
6466 + list_add_tail(&newnode->list, &new_dead_node_list);
6467 + set_bit(WAKE_FLAG_DEADNODE, &wake_flags);
6468 + up(&new_dead_node_lock);
6469 + wake_up_process(membership_task);
6470 + P_MEMB("Passing dead node %s onto kmembershipd\n", node->name);
6475 + down(&cluster_members_lock);
6476 + if (node->state == NODESTATE_MEMBER)
6477 + cluster_members--;
6478 + node->state = NODESTATE_DEAD;
6479 + up(&cluster_members_lock);
6481 + /* Notify listeners */
6482 + notify_kernel_listeners(DIED, (long) node->node_id);
6484 + /* If we are in normal operation then become master and initiate a
6485 + * state-transition */
6486 + if (node_state == MEMBER) {
6487 + start_transition(TRANS_REMNODE, node);
6491 + /* If we are a slave in transition then see if it's the master that has
6492 + * failed. If not then ignore it. If it /is/ the master then elect a
6494 + if (node_state == TRANSITION) {
6495 + if (master_node == node) {
6496 + if (elect_master(&node)) {
6497 + del_timer(&transition_timer);
6498 + node_state = MASTER;
6500 + start_transition(TRANS_DEADMASTER, master_node);
6503 + /* Someone else can be in charge - phew! */
6509 + /* If we are the master then we need to start the transition all over
6511 + if (node_state == MASTER) {
6512 + /* Cancel timer */
6513 + del_timer(&transition_timer);
6515 + /* Restart the transition */
6516 + start_transition(TRANS_ANOTHERREMNODE, node);
6517 + transition_restarts = 0;
6523 + * Build up and send a set of messages consisting of the whole cluster view.
6524 + * The first byte is the command (cmd as passed in), the second is a flag byte:
6525 + * bit 0 is set in the first message, bit 1 in the last (NOTE both may be set if
6526 + * this is the only message sent). The rest is a set of packed node entries, which
6527 + * are NOT split over packets. */
6528 +static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
6529 + unsigned int flags, unsigned int flags2)
6534 + int last_node_start = 2;
6535 + unsigned char first_packet_flag = 1;
6536 + struct list_head *nodelist;
6537 + struct list_head *temp;
6538 + struct cluster_node *node;
6539 + char *message = scratchbuf;
6543 + down(&cluster_members_lock);
6544 + list_for_each_safe(nodelist, temp, &cluster_members_list) {
6545 + node = list_entry(nodelist, struct cluster_node, list);
6547 + if (node->state == NODESTATE_MEMBER || node->state == NODESTATE_DEAD) {
6548 + unsigned int evotes;
6549 + unsigned int node_id;
6550 + unsigned short num_addrs = 0;
6551 + unsigned short num_addrs_le;
6552 + struct list_head *addrlist;
6554 + last_node_start = ptr;
6556 + message[ptr++] = len = strlen(node->name);
6557 + strcpy(&message[ptr], node->name);
6560 + message[ptr++] = node->state;
6562 + /* Count the number of addresses this node has */
6563 + list_for_each(addrlist, &node->addr_list) {
6567 + num_addrs_le = cpu_to_le16(num_addrs);
6568 + memcpy(&message[ptr], &num_addrs_le, sizeof (short));
6569 + ptr += sizeof (short);
6572 + list_for_each(addrlist, &node->addr_list) {
6574 + struct cluster_node_addr *nodeaddr =
6575 + list_entry(addrlist,
6576 + struct cluster_node_addr, list);
6578 + memcpy(&message[ptr], nodeaddr->addr,
6580 + ptr += address_length;
6583 + message[ptr++] = node->votes;
6585 + evotes = cpu_to_le32(node->expected_votes);
6586 + memcpy(&message[ptr], &evotes, sizeof (int));
6587 + ptr += sizeof (int);
6589 + node_id = cpu_to_le32(node->node_id);
6590 + memcpy(&message[ptr], &node_id, sizeof (int));
6591 + ptr += sizeof (int);
6593 + /* If the block is full then send it */
6594 + if (ptr > MAX_CLUSTER_MESSAGE) {
6595 + message[1] = first_packet_flag;
6597 + up(&cluster_members_lock);
6598 + status = kcl_sendmsg(mem_socket, message,
6599 + last_node_start, saddr,
6600 + saddr ? sizeof (struct sockaddr_cl) : 0,
6606 + down(&cluster_members_lock);
6608 + first_packet_flag = 0;
6609 + /* Copy the overflow back to the start of the
6610 + * buffer for the next send */
6611 + memcpy(&message[2], &message[last_node_start],
6612 + ptr - last_node_start);
6613 + ptr = ptr - last_node_start + 2;
6618 + up(&cluster_members_lock);
6620 + message[1] = first_packet_flag | 2; /* The last may also be first */
6621 + status = kcl_sendmsg(mem_socket, message, ptr,
6622 + saddr, saddr ? sizeof (struct sockaddr_cl) : 0,
6629 +/* Make the JOINING node into a MEMBER */
6630 +static void confirm_joiner()
6632 + if (joining_node && joining_node->state == NODESTATE_JOINING) {
6633 + down(&cluster_members_lock);
6634 + joining_node->state = NODESTATE_MEMBER;
6635 + cluster_members++;
6636 + up(&cluster_members_lock);
6640 +/* Reset HELLO timers for all nodes. We do this after a state-transition as we
6641 + * have had HELLOS disabled during the transition and if we don't do this the
6642 + * nodes will go on an uncontrolled culling-spree afterwards */
6643 +static void reset_hello_time()
6645 + struct list_head *nodelist;
6646 + struct cluster_node *node;
6648 + down(&cluster_members_lock);
6649 + list_for_each(nodelist, &cluster_members_list) {
6650 + node = list_entry(nodelist, struct cluster_node, list);
6652 + if (node->state == NODESTATE_MEMBER) {
6653 + node->last_hello = jiffies;
6657 + up(&cluster_members_lock);
6660 +/* Calculate the new quorum and return the value. do *not* set it in here as
6661 + * cnxman calls this to check if a new expected_votes value is valid. It
6662 + * (optionally) returns the total number of votes in the cluster */
6663 +int calculate_quorum(int allow_decrease, int max_expected, int *ret_total_votes)
6665 + struct list_head *nodelist;
6666 + struct cluster_node *node;
6667 + unsigned int total_votes = 0;
6668 + unsigned int highest_expected = 0;
6669 + unsigned int newquorum, q1, q2;
6671 + down(&cluster_members_lock);
6672 + list_for_each(nodelist, &cluster_members_list) {
6673 + node = list_entry(nodelist, struct cluster_node, list);
6675 + if (node->state == NODESTATE_MEMBER) {
6676 + highest_expected =
6677 + max(highest_expected, node->expected_votes);
6678 + total_votes += node->votes;
6681 + up(&cluster_members_lock);
6682 + if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
6683 + total_votes += quorum_device->votes;
6685 + if (max_expected > 0)
6686 + highest_expected = max_expected;
6688 + /* This quorum calculation is taken from the OpenVMS Cluster Systems
6689 + * manual, but, then, you guessed that didn't you */
6690 + q1 = (highest_expected + 2) / 2;
6691 + q2 = (total_votes + 2) / 2;
6692 + newquorum = max(q1, q2);
6694 + /* Normally quorum never decreases but the system administrator can
6695 + * force it down by setting expected votes to a maximum value */
6696 + if (!allow_decrease)
6697 + newquorum = max(quorum, newquorum);
6699 + /* The special two_node mode allows each of the two nodes to retain
6700 + * quorum if the other fails. Only one of the two should live past
6701 + * fencing (as both nodes try to fence each other in split-brain.) */
6705 + if (ret_total_votes)
6706 + *ret_total_votes = total_votes;
6710 +/* Recalculate cluster quorum, set quorate and notify changes */
6711 +void recalculate_quorum(int allow_decrease)
6715 + quorum = calculate_quorum(allow_decrease, 0, &total_votes);
6716 + set_quorate(total_votes);
6717 + notify_listeners();
6720 +/* Add new node address to an existing node */
6721 +int add_node_address(struct cluster_node *node, unsigned char *addr, int len)
6723 + struct cluster_node_addr *newaddr;
6725 + newaddr = kmalloc(sizeof (struct cluster_node_addr), GFP_KERNEL);
6729 + memcpy(newaddr->addr, addr, len);
6730 + newaddr->addr_len = len;
6731 + list_add_tail(&newaddr->list, &node->addr_list);
6736 +static struct cluster_node *add_new_node(char *name, unsigned char votes,
6737 + unsigned int expected_votes,
6738 + int node_id, int state)
6740 + struct cluster_node *newnode;
6742 + /* Look for a dead node with this name */
6743 + newnode = find_node_by_name(name);
6745 + /* Is it already joining */
6746 + if (newnode && newnode->state == NODESTATE_JOINING)
6749 + /* Update existing information */
6750 + if (newnode && newnode->state == NODESTATE_DEAD) {
6751 + newnode->last_hello = jiffies;
6752 + newnode->votes = votes;
6753 + newnode->expected_votes = expected_votes;
6754 + newnode->state = state;
6756 + newnode->leave_reason = 0;
6757 + newnode->last_seq_recv = 0;
6758 + newnode->last_seq_acked = 0;
6759 + newnode->last_seq_sent = 0;
6760 + newnode->incarnation++;
6761 + do_gettimeofday(&newnode->join_time);
6762 + /* Don't overwrite the node ID */
6764 + if (state == NODESTATE_MEMBER) {
6765 + down(&cluster_members_lock);
6766 + cluster_members++;
6767 + up(&cluster_members_lock);
6770 + printk(KERN_INFO CMAN_NAME ": node %s rejoining\n", name);
6774 + newnode = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
6778 + memset(newnode, 0, sizeof (struct cluster_node));
6779 + newnode->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
6780 + if (!newnode->name)
6783 + strcpy(newnode->name, name);
6784 + newnode->last_hello = jiffies;
6785 + newnode->votes = votes;
6786 + newnode->expected_votes = expected_votes;
6787 + newnode->state = state;
6788 + newnode->node_id = node_id;
6790 + newnode->leave_reason = 0;
6791 + newnode->last_seq_recv = 0;
6792 + newnode->last_seq_acked = 0;
6793 + newnode->last_seq_sent = 0;
6794 + newnode->incarnation = 0;
6795 + do_gettimeofday(&newnode->join_time);
6796 + INIT_LIST_HEAD(&newnode->addr_list);
6797 + set_nodeid(newnode, node_id);
6799 + /* Add the new node to the list */
6800 + down(&cluster_members_lock);
6801 + list_add(&newnode->list, &cluster_members_list);
6802 + if (state == NODESTATE_MEMBER)
6803 + cluster_members++;
6804 + up(&cluster_members_lock);
6806 + printk(KERN_INFO CMAN_NAME ": got node %s\n", name);
6812 + send_leave(CLUSTER_LEAVEFLAG_PANIC);
6814 + printk(KERN_CRIT CMAN_NAME
6815 + ": Cannot allocate memory for new cluster node %s\n", name);
6817 + panic("cluster memory allocation failed");
6822 +/* Remove node from a STARTTRANS message */
6823 +static struct cluster_node *remove_node(int nodeid)
6825 + struct cluster_node *node = find_node_by_nodeid(nodeid);
6827 + if (node && node->state == NODESTATE_MEMBER) {
6828 + P_MEMB("starttrans removes node %s\n", node->name);
6829 + down(&cluster_members_lock);
6830 + node->state = NODESTATE_DEAD;
6831 + cluster_members--;
6832 + up(&cluster_members_lock);
6834 + notify_kernel_listeners(DIED, (long) nodeid);
6836 + /* If this node is us then go quietly */
6838 + printk(KERN_INFO CMAN_NAME
6839 + ": killed by STARTTRANS or NOMINATE\n");
6840 + node_state = LEFT_CLUSTER;
6842 + wake_up_process(membership_task);
6843 + wake_up_interruptible(&cnxman_waitq);
6849 +/* Add a node from a STARTTRANS or NOMINATE message */
6850 +static void add_node_from_starttrans(struct msghdr *msg, char *buf, int len)
6852 + /* Add the new node but don't fill in the ID until the master has
6854 + struct cl_mem_starttrans_msg *startmsg =
6855 + (struct cl_mem_starttrans_msg *)buf;
6856 + int ptr = sizeof (struct cl_mem_starttrans_msg);
6858 + char *name = buf + ptr + le16_to_cpu(startmsg->num_addrs) * address_length;
6859 + char *nodeaddr = buf + sizeof(struct cl_mem_starttrans_msg);
6861 + joining_node = add_new_node(name, startmsg->votes,
6862 + le32_to_cpu(startmsg->expected_votes),
6863 + 0, NODESTATE_JOINING);
6865 + /* add_new_node returns NULL if the node already exists */
6866 + if (!joining_node)
6867 + joining_node = find_node_by_name(name);
6869 + /* Add the node's addresses */
6870 + if (list_empty(&joining_node->addr_list)) {
6871 + for (i = 0; i < le16_to_cpu(startmsg->num_addrs); i++) {
6872 + add_node_address(joining_node, buf + ptr, address_length);
6873 + ptr += address_length;
6877 + /* Make sure we have a temp nodeid for the new node in case we
6879 + joining_temp_nodeid = new_temp_nodeid(nodeaddr,
6883 +/* We have been nominated as master for a transition */
6884 +static int do_process_nominate(struct msghdr *msg, char *buf, int len)
6886 + struct cl_mem_starttrans_msg *startmsg =
6887 + (struct cl_mem_starttrans_msg *)buf;
6888 + struct cluster_node *node = NULL;
6890 + P_MEMB("nominate reason is %d\n", startmsg->reason);
6892 + if (startmsg->reason == TRANS_REMNODE) {
6893 + node = remove_node(le32_to_cpu(startmsg->nodeid));
6896 + if (startmsg->reason == TRANS_NEWNODE) {
6897 + add_node_from_starttrans(msg, buf, len);
6898 + node = joining_node;
6901 + /* This should be a TRANS_CHECK but start_transition needs some node
6905 + start_transition(startmsg->reason, node);
6909 +/* Got a STARTACK response from a node */
6910 +static int do_process_startack(struct msghdr *msg, char *buf, int len)
6912 + if (node_state != MASTER && master_state != MASTER_START) {
6913 + P_MEMB("Got StartACK when not in MASTER_STARTING substate\n");
6917 + /* buf is NULL if we are called directly from start_transition */
6919 + struct cl_mem_startack_msg *ackmsg =
6920 + (struct cl_mem_startack_msg *)buf;
6922 + /* Ignore any messages with old generation numbers in them */
6923 + if (le32_to_cpu(ackmsg->generation) != cluster_generation) {
6924 + P_MEMB("Got old generation START-ACK msg - ignoring\n");
6929 + /* If the node_id is non-zero then use it. */
6930 + if (transitionreason == TRANS_NEWNODE && joining_node && msg) {
6931 + struct cl_mem_startack_msg *ackmsg =
6932 + (struct cl_mem_startack_msg *)buf;
6934 + if (ackmsg->node_id) {
6935 + set_nodeid(joining_node, le32_to_cpu(ackmsg->node_id));
6938 + max(highest_nodeid, le32_to_cpu(ackmsg->highest_node_id));
6939 + P_MEMB("Node id = %d, highest node id = %d\n",
6940 + le32_to_cpu(ackmsg->node_id),
6941 + le32_to_cpu(ackmsg->highest_node_id));
6944 + /* If we have all the responses in then move to the next stage */
6945 + if (++responses_collected == responses_expected) {
6947 + /* If the new node has no node_id (ie nobody in the cluster has
6948 + * heard of it before) then assign it a new one */
6949 + if (transitionreason == TRANS_NEWNODE && joining_node) {
6951 + max(highest_nodeid, get_highest_nodeid());
6952 + if (joining_node->node_id == 0) {
6953 + set_nodeid(joining_node, ++highest_nodeid);
6955 + P_MEMB("nodeIDs: new node: %d, highest: %d\n",
6956 + joining_node->node_id, highest_nodeid);
6959 + /* Behave a little differently if we are on our own */
6960 + if (cluster_members == 1) {
6961 + if (transitionreason == TRANS_NEWNODE) {
6962 + /* If the cluster is just us then confirm at
6964 + joinconf_count = 0;
6965 + mod_timer(&transition_timer,
6967 + cman_config.joinconf_timeout * HZ);
6971 + else { /* Node leaving the cluster */
6972 + recalculate_quorum(leavereason);
6974 + node_state = MEMBER;
6978 + master_state = MASTER_COLLECT;
6979 + responses_collected = 0;
6980 + responses_expected = cluster_members - 1;
6981 + P_MEMB("Sending MASTERVIEW: expecting %d responses\n",
6982 + responses_expected);
6984 + send_cluster_view(CLUSTER_MEM_MASTERVIEW, NULL, 0, MSG_REPLYEXP);
6986 + /* Set a timer in case we don't get 'em all back */
6987 + mod_timer(&transition_timer,
6989 + cman_config.transition_timeout * HZ);
6995 +/* Got a VIEWACK response from a node */
6996 +static int do_process_viewack(struct msghdr *msg, char *reply, int len)
6998 + struct sockaddr_cl *saddr = msg->msg_name;
7000 + if (node_opinion == NULL) {
7002 + kmalloc((1 + highest_nodeid) * sizeof (uint8_t), GFP_KERNEL);
7003 + if (!node_opinion) {
7004 + panic(": malloc agree/dissent failed\n");
7006 + memset(node_opinion, 0, (1 + highest_nodeid) * sizeof (uint8_t));
7009 + /* Keep a list of agreeing and dissenting nodes */
7010 + if (reply[1] == 1) {
7011 + /* ACK - remote node agrees with me */
7012 + P_MEMB("Node agrees\n");
7013 + node_opinion[saddr->scl_nodeid] = OPINION_AGREE;
7017 + /* Remote node disagrees */
7018 + P_MEMB("Node disagrees\n");
7019 + node_opinion[saddr->scl_nodeid] = OPINION_DISAGREE;
7020 + dissenting_nodes++;
7023 + P_MEMB("got %d responses, expected %d\n", responses_collected + 1,
7024 + responses_expected);
7026 + /* Are all the results in yet ? */
7027 + if (++responses_collected == responses_expected) {
7028 + del_timer(&transition_timer);
7030 + P_MEMB("The results are in: %d agree, %d dissent\n",
7031 + agreeing_nodes, dissenting_nodes);
7033 + if (agreeing_nodes > dissenting_nodes) {
7034 + /* Kill dissenting nodes */
7037 + for (i = 1; i <= responses_collected; i++) {
7038 + if (node_opinion[i] == OPINION_DISAGREE)
7043 + /* We must leave the cluster as we are in a minority,
7044 + * the rest of them can fight it out amongst
7046 + us->leave_reason = CLUSTER_LEAVEFLAG_INCONSISTENT;
7047 + agreeing_nodes = 0;
7048 + dissenting_nodes = 0;
7049 + kfree(node_opinion);
7050 + node_opinion = NULL;
7051 + node_state = LEFT_CLUSTER;
7053 + wake_up_process(membership_task);
7054 + wake_up_interruptible(&cnxman_waitq);
7058 + /* Reset counters */
7059 + agreeing_nodes = 0;
7060 + dissenting_nodes = 0;
7061 + kfree(node_opinion);
7062 + node_opinion = NULL;
7064 + /* Confirm new node */
7065 + if (transitionreason == TRANS_NEWNODE) {
7066 + mod_timer(&transition_timer,
7067 + jiffies + cman_config.joinconf_timeout * HZ);
7068 + joinconf_count = 0;
7073 + master_state = MASTER_COMPLETE;
7081 +/* Got an ENDTRANS message */
7082 +static int do_process_endtrans(struct msghdr *msg, char *buf, int len)
7084 + struct cl_mem_endtrans_msg *endmsg =
7085 + (struct cl_mem_endtrans_msg *)buf;
7086 + struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
7088 + /* Someone else's state transition */
7089 + if (node_state != TRANSITION && node_state != JOINACK)
7092 + /* Check we got it from the MASTER node */
7093 + if (master_node && master_node->node_id != saddr->scl_nodeid) {
7095 + "Got ENDTRANS from a node not the master: master: %d, sender: %d\n",
7096 + master_node->node_id, saddr->scl_nodeid);
7100 + del_timer(&transition_timer);
7102 + /* Set node ID on new node */
7103 + if (endmsg->new_node_id) {
7104 + set_nodeid(joining_node, le32_to_cpu(endmsg->new_node_id));
7105 + P_MEMB("new node %s has ID %d\n", joining_node->name,
7106 + joining_node->node_id);
7109 + node_state = TRANSITION_COMPLETE;
7111 + /* Need to set this here or the barrier code will reject us if we've
7113 + we_are_a_cluster_member = TRUE;
7116 + cluster_generation = le32_to_cpu(endmsg->generation);
7118 + if (wait_for_completion_barrier() != 0) {
7119 + P_MEMB("Barrier timed out - restart\n");
7120 + node_state = TRANSITION;
7121 + mod_timer(&transition_timer,
7122 + jiffies + cman_config.transition_timeout * HZ);
7126 + quorum = le32_to_cpu(endmsg->quorum);
7127 + set_quorate(le32_to_cpu(endmsg->total_votes));
7128 + highest_nodeid = get_highest_nodeid();
7130 + /* Tell any waiting barriers that we had a transition */
7131 + check_barrier_returns();
7133 + purge_temp_nodeids();
7135 + /* Clear the master node */
7136 + master_node = NULL;
7138 + node_state = MEMBER;
7140 + /* Notify other listeners that transition has completed */
7141 + notify_listeners();
7142 + reset_hello_time();
7143 + transition_end_time = jiffies;
7145 + sm_member_update(cluster_is_quorate);
7149 +/* Turn a STARTTRANS message into NOMINATE and send it to the new master */
7150 +static int send_nominate(struct cl_mem_starttrans_msg *startmsg, int msglen,
7153 + struct sockaddr_cl maddr;
7155 + maddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
7156 + maddr.scl_family = AF_CLUSTER;
7157 + maddr.scl_nodeid = nodeid;
7159 + startmsg->cmd = CLUSTER_MEM_NOMINATE;
7160 + return kcl_sendmsg(mem_socket, startmsg, msglen,
7161 + &maddr, sizeof (maddr), 0);
7164 +/* Got a STARTTRANS message */
7165 +static int do_process_starttrans(struct msghdr *msg, char *buf, int len)
7167 + struct cl_mem_starttrans_msg *startmsg =
7168 + (struct cl_mem_starttrans_msg *)buf;
7169 + struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
7170 + struct cluster_node *node;
7171 + unsigned int newgen = le32_to_cpu(startmsg->generation);
7173 + /* Got a WHAT from WHOM? */
7174 + node = find_node_by_nodeid(saddr->scl_nodeid);
7175 + if (!node || node->state != NODESTATE_MEMBER)
7178 + /* Someone else's state transition */
7179 + if (node_state != MEMBER &&
7180 + node_state != TRANSITION && node_state != MASTER)
7183 + /* Ignore old generation STARTTRANS messages */
7184 + if ((newgen < cluster_generation) ||
7185 + (newgen == 0xFFFFFFFF && cluster_generation == 0)) {
7186 + P_MEMB("Ignoring STARTTRANS with old generation number\n");
7190 + P_MEMB("Got starttrans: newgen = %d, oldgen = %d, reason = %d\n",
7191 + newgen, cluster_generation, startmsg->reason);
7193 + /* Up the generation number */
7194 + cluster_generation = newgen;
7196 + /* If we are also a master then decide between us */
7197 + if (node_state == MASTER) {
7199 + /* See if we really want the responsibility of being master */
7200 + if (elect_master(&node)) {
7202 + /* I reluctantly accept this position of responsibility
7204 + P_MEMB("I elected myself master\n");
7206 + /* start_transition will re-establish this */
7207 + del_timer(&transition_timer);
7209 + start_transition(TRANS_NEWMASTER, node);
7214 + P_MEMB("Backing down from MASTER status\n");
7215 + master_node = node;
7216 + node_state = MEMBER;
7218 + /* If we were bringing a new node into the cluster then
7219 + * we will have to abandon that now and tell the new
7220 + * node to try again later */
7221 + if (transitionreason == TRANS_NEWNODE && joining_node) {
7222 + struct cluster_node_addr *first_addr =
7223 + (struct cluster_node_addr *) joining_node->
7226 + P_MEMB("Postponing membership of node %s\n",
7227 + joining_node->name);
7228 + send_joinack(first_addr->addr, address_length,
7229 + JOINACK_TYPE_WAIT);
7231 + /* Not dead, just sleeping */
7232 + joining_node->state = NODESTATE_DEAD;
7233 + joining_node = NULL;
7236 + /* If the new master is not us OR the node we just got
7237 + * the STARTTRANS from then make sure it knows it has
7239 + if (saddr->scl_nodeid != node->node_id) {
7240 + send_nominate(startmsg, len, node->node_id);
7244 + /* Fall through into MEMBER code below if we are
7245 + * obeying the STARTTRANS we just received */
7249 + /* Do non-MASTER STARTTRANS bits */
7250 + if (node_state == MEMBER) {
7251 + int ptr = sizeof (struct cl_mem_starttrans_msg);
7254 + P_MEMB("Normal transition start\n");
7256 + /* If the master is adding a new node and we know it's node ID
7257 + * then ACK with it. */
7258 + if (startmsg->reason == TRANS_NEWNODE) {
7259 + struct cluster_node *node =
7260 + find_node_by_addr((char *) startmsg + ptr,
7263 + node_id = node->node_id;
7266 + /* Save the master info */
7267 + master_node = find_node_by_nodeid(saddr->scl_nodeid);
7268 + node_state = TRANSITION;
7270 + if (startmsg->reason == TRANS_NEWNODE) {
7271 + add_node_from_starttrans(msg, buf, len);
7274 + if (startmsg->reason == TRANS_REMNODE ||
7275 + startmsg->reason == TRANS_ANOTHERREMNODE) {
7276 + remove_node(le32_to_cpu(startmsg->nodeid));
7279 + send_startack(saddr, msg->msg_namelen,
7282 + /* Establish timer in case the master dies */
7283 + mod_timer(&transition_timer,
7284 + jiffies + cman_config.transition_timeout * HZ);
7289 + /* We are in transition but this may be a restart */
7290 + if (node_state == TRANSITION) {
7292 + master_node = find_node_by_nodeid(saddr->scl_nodeid);
7293 + send_startack(saddr, msg->msg_namelen, 0);
7295 + /* Is it a new joining node ? This happens if a master is
7297 + if (startmsg->reason == TRANS_NEWNODE) {
7298 + struct cluster_node *oldjoin = joining_node;
7300 + add_node_from_starttrans(msg, buf, len);
7302 + /* If this is a different node joining than the one we
7303 + * were previously joining (probably cos the master is
7304 + * a nominated one) then mark our "old" joiner as DEAD.
7305 + * The original master will already have told the node
7306 + * to go back into JOINWAIT state */
7307 + if (oldjoin && oldjoin != joining_node
7308 + && oldjoin->state == NODESTATE_JOINING)
7309 + oldjoin->state = NODESTATE_DEAD;
7312 + /* Is it a new master node? */
7313 + if (startmsg->reason == TRANS_NEWMASTER ||
7314 + startmsg->reason == TRANS_DEADMASTER) {
7315 + P_MEMB("starttrans %s, node=%d\n",
7316 + startmsg->reason ==
7317 + TRANS_NEWMASTER ? "NEWMASTER" : "DEADMASTER",
7318 + le32_to_cpu(startmsg->nodeid));
7320 + /* If the old master has died then remove it */
7321 + if (startmsg->reason == TRANS_DEADMASTER) {
7322 + remove_node(le32_to_cpu(startmsg->nodeid));
7325 + /* Store new master */
7326 + master_node = find_node_by_nodeid(saddr->scl_nodeid);
7329 + /* Another node has died (or been killed) */
7330 + if (startmsg->reason == TRANS_ANOTHERREMNODE) {
7331 + /* Remove new dead node */
7332 + remove_node(le32_to_cpu(startmsg->nodeid));
7334 + /* Restart the timer */
7335 + del_timer(&transition_timer);
7336 + mod_timer(&transition_timer,
7337 + jiffies + cman_config.transition_timeout * HZ);
7343 +/* Change a cluster parameter */
7344 +static int do_process_reconfig(struct msghdr *msg, char *buf, int len)
7346 + struct cl_mem_reconfig_msg *confmsg;
7347 + struct sockaddr_cl *saddr = msg->msg_name;
7348 + struct cluster_node *node;
7351 + if (len < sizeof(struct cl_mem_reconfig_msg))
7354 + confmsg = (struct cl_mem_reconfig_msg *)buf;
7355 + val = le32_to_cpu(confmsg->value);
7357 + switch (confmsg->param) {
7359 + case RECONFIG_PARAM_EXPECTED_VOTES:
7360 + /* Set any nodes with expected_votes higher than the new value
7363 + struct cluster_node *node;
7365 + down(&cluster_members_lock);
7366 + list_for_each_entry(node, &cluster_members_list, list) {
7367 + if (node->state == NODESTATE_MEMBER &&
7368 + node->expected_votes > val) {
7369 + node->expected_votes = val;
7372 + up(&cluster_members_lock);
7373 + if (expected_votes > val)
7374 + expected_votes = val;
7376 + recalculate_quorum(1); /* Allow decrease */
7377 + sm_member_update(cluster_is_quorate);
7380 + case RECONFIG_PARAM_NODE_VOTES:
7381 + node = find_node_by_nodeid(saddr->scl_nodeid);
7382 + node->votes = val;
7383 + recalculate_quorum(1); /* Allow decrease */
7384 + sm_member_update(cluster_is_quorate);
7387 + case RECONFIG_PARAM_CONFIG_VERSION:
7388 + config_version = val;
7392 + printk(KERN_INFO CMAN_NAME
7393 + ": got unknown parameter in reconfigure message. %d\n",
7400 +/* Response from master node */
7401 +static int do_process_joinack(struct msghdr *msg, char *buf, int len)
7403 + struct cl_mem_joinack_msg *ackmsg =
7404 + (struct cl_mem_joinack_msg *)buf;
7406 + join_time = jiffies;
7407 + if (ackmsg->acktype == JOINACK_TYPE_OK) {
7408 + node_state = JOINACK;
7411 + if (ackmsg->acktype == JOINACK_TYPE_NAK) {
7412 + printk(KERN_WARNING CMAN_NAME
7413 + ": Cluster membership rejected\n");
7414 + P_MEMB("Got JOINACK NACK\n");
7415 + node_state = REJECTED;
7418 + if (ackmsg->acktype == JOINACK_TYPE_WAIT) {
7419 + P_MEMB("Got JOINACK WAIT\n");
7420 + node_state = JOINWAIT;
7421 + joinwait_time = jiffies;
7427 +/* Check a JOINREQ message for validity,
7428 + return -1 if we can't let the node join our cluster */
7429 +static int validate_joinmsg(struct cl_mem_join_msg *joinmsg, int len)
7431 + struct cluster_node *node;
7433 + /* Check version number */
7434 + if (le32_to_cpu(joinmsg->major_version) == CNXMAN_MAJOR_VERSION) {
7435 + char *ptr = (char *) joinmsg;
7438 + ptr += sizeof (*joinmsg);
7439 + name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
7441 + /* Sanity-check the num_addrs field otherwise we could oops */
7442 + if (le16_to_cpu(joinmsg->num_addr) * address_length > len) {
7443 + printk(KERN_WARNING CMAN_NAME
7444 + ": num_addr in JOIN-REQ message is rubbish: %d\n",
7445 + le16_to_cpu(joinmsg->num_addr));
7449 + /* Check the cluster name matches */
7450 + if (strcmp(cluster_name, joinmsg->clustername)) {
7451 + printk(KERN_WARNING CMAN_NAME
7452 + ": attempt to join with cluster name '%s' refused\n",
7453 + joinmsg->clustername);
7457 + /* Check we are not exceeding the maximum number of nodes */
7458 + if (cluster_members >= cman_config.max_nodes) {
7459 + printk(KERN_WARNING CMAN_NAME
7460 + ": Join request from %s rejected, exceeds maximum number of nodes\n",
7465 + /* Check that we don't exceed the two_node limit, if applicable */
7466 + if (two_node && cluster_members == 2) {
7467 + printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7468 + "rejected, exceeds two node limit\n", name);
7472 + if (le32_to_cpu(joinmsg->config_version) != config_version) {
7473 + printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7474 + "rejected, config version local %u remote %u\n",
7475 + name, config_version,
7476 + le32_to_cpu(joinmsg->config_version));
7480 + /* Validate requested static node ID */
7481 + if (joinmsg->nodeid &&
7482 + (node = find_node_by_nodeid(le32_to_cpu(joinmsg->nodeid))) &&
7483 + (node->state != NODESTATE_DEAD ||
7484 + (strcmp(node->name, name)))) {
7485 + printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7486 + "rejected, node ID %d already in use by %s\n",
7487 + name, node->node_id, node->name);
7490 + if (joinmsg->nodeid &&
7491 + (node = find_node_by_name(name)) &&
7492 + (node->state != NODESTATE_DEAD ||
7493 + node->node_id != le32_to_cpu(joinmsg->nodeid))) {
7494 + printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7495 + "rejected, wanted node %d but previously had %d\n",
7496 + name, le32_to_cpu(joinmsg->nodeid), node->node_id);
7500 + /* If these don't match then I don't know how the message
7501 + arrived! However, I can't take the chance */
7502 + if (le32_to_cpu(joinmsg->addr_len) != address_length) {
7503 + printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7504 + "rejected, address length local: %u remote %u\n",
7505 + name, address_length,
7506 + le32_to_cpu(joinmsg->addr_len));
7511 + /* Version number mismatch, don't use any part of the message
7512 + * other than the version numbers as things may have moved */
7513 + printk(KERN_INFO CMAN_NAME
7514 + ": Got join message from node running incompatible software. (us: %d.%d.%d, them: %d.%d.%d)\n",
7515 + CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
7516 + CNXMAN_PATCH_VERSION,
7517 + le32_to_cpu(joinmsg->major_version),
7518 + le32_to_cpu(joinmsg->minor_version),
7519 + le32_to_cpu(joinmsg->patch_version));
7526 +/* Request to join the cluster. This makes us the master for this state
7528 +static int do_process_joinreq(struct msghdr *msg, char *buf, int len)
7530 + static unsigned long last_joinreq = 0;
7531 + static char last_name[MAX_CLUSTER_MEMBER_NAME_LEN];
7532 + struct cl_mem_join_msg *joinmsg = (struct cl_mem_join_msg *)buf;
7533 + struct cluster_node *node;
7534 + char *ptr = (char *) joinmsg;
7537 + struct sockaddr_cl *addr = msg->msg_name;
7539 + ptr += sizeof (*joinmsg);
7540 + name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
7542 + /* If we are in a state transition then tell the new node to wait a bit
7544 + if (node_state != MEMBER) {
7545 + if (node_state == MASTER || node_state == TRANSITION) {
7546 + send_joinack(msg->msg_name, msg->msg_namelen,
7547 + JOINACK_TYPE_WAIT);
7552 + /* Reject application if message is invalid for any reason */
7553 + if (validate_joinmsg(joinmsg, len)) {
7554 + send_joinack(msg->msg_name, msg->msg_namelen,
7555 + JOINACK_TYPE_NAK);
7559 + /* Do we already know about this node? */
7560 + if (check_duplicate_node(name, msg, len) < 0) {
7561 + send_joinack(msg->msg_name, msg->msg_namelen,
7562 + JOINACK_TYPE_NAK);
7566 + /* Duplicate checking: Because joining messages do not have
7567 + * sequence numbers we may get as many JOINREQ messages as we
7568 + * have interfaces. This bit of code here just checks for
7569 + * JOINREQ messages that come in from the same node in a small
7570 + * period of time and removes the duplicates */
7571 + if (time_before(jiffies, last_joinreq + 10 * HZ)
7572 + && strcmp(name, last_name) == 0) {
7576 + /* OK, you can be in my gang */
7577 + last_joinreq = jiffies;
7578 + strcpy(last_name, name);
7580 + node = add_new_node(name, joinmsg->votes,
7581 + le32_to_cpu(joinmsg->expected_votes),
7582 + le32_to_cpu(joinmsg->nodeid),
7583 + NODESTATE_JOINING);
7585 + /* Add the node's addresses */
7586 + if (list_empty(&node->addr_list)) {
7587 + for (i = 0; i < le16_to_cpu(joinmsg->num_addr);
7589 + add_node_address(node, ptr, address_length);
7590 + ptr += address_length;
7593 + send_joinack(msg->msg_name, msg->msg_namelen,
7595 + joining_node = node;
7596 + joining_temp_nodeid = addr->scl_nodeid;
7598 + /* Start the state transition */
7599 + start_transition(TRANS_NEWNODE, node);
7604 +/* A simple function to invent a small number based
7605 + on the node name */
7606 +static int node_hash(void)
7611 + for (i=0; i<strlen(nodename); i++) {
7612 + value += nodename[i];
7614 + return (value & 0xF) + 1;
7618 +/* Return the low 32 bits of our IP address */
7619 +static uint32_t low32_of_ip()
7621 + struct cluster_node_addr *addr;
7624 + addr = list_entry(us->addr_list.next, struct cluster_node_addr, list);
7625 + memcpy(&lowip, addr->addr+address_length-sizeof(uint32_t), sizeof(uint32_t));
7627 + memcpy(&lowip, addr->addr - sizeof(uint32_t)*2, sizeof(uint32_t));
7632 +/* A new node has stated its intent to form a new cluster. we may have
7633 + * something to say about that... */
7634 +static int do_process_newcluster(struct msghdr *msg, char *buf, int len)
7636 + /* If we are also in STARTING state then back down for a random period
7638 + if (node_state == STARTING) {
7639 + P_MEMB("got NEWCLUSTER, backing down for %d seconds\n", node_hash());
7640 + start_time = jiffies + node_hash() * HZ;
7643 + if (node_state == NEWCLUSTER) {
7646 + memcpy(&otherip, buf+1, sizeof(otherip));
7647 + otherip = le32_to_cpu(otherip);
7648 + P_MEMB("got NEWCLUSTER, remote ip = %x, us = %x\n", otherip, low32_of_ip());
7649 + if (otherip < low32_of_ip())
7650 + node_state = STARTING;
7653 + if (node_state == MEMBER)
7659 +/* Called for each node by the node-message unpacker. Returns -1 if there is a
7660 + * mismatch and the caller will stop processing */
7661 +static int check_node(struct cluster_node *newnode, char *addrs,
7662 + unsigned short num_addr)
7664 + struct cluster_node *node = find_node_by_name(newnode->name);
7666 + P_MEMB("check_node: %s", newnode->name);
7669 + C_MEMB(" - not found\n");
7673 + if (node->votes != newnode->votes ||
7674 + node->node_id != newnode->node_id ||
7675 + node->state != newnode->state) {
7676 + C_MEMB(" - wrong info: votes=%d(exp: %d) id=%d(exp: %d) state = %d\n",
7677 + node->votes, newnode->votes, node->node_id,
7678 + newnode->node_id, node->state);
7681 + C_MEMB(" - OK\n");
7685 +/* Called for each new node found in a JOINCONF message. Create a new node
7687 +static int add_node(struct cluster_node *node, char *addrs,
7688 + unsigned short num_addr)
7690 + P_MEMB("add_node: %s, v:%d, e:%d, i:%d\n", node->name, node->votes,
7691 + node->expected_votes, node->node_id);
7693 + if (!find_node_by_name(node->name)) {
7694 + struct cluster_node *newnode;
7698 + add_new_node(node->name, node->votes, node->expected_votes,
7699 + node->node_id, node->state)) == NULL) {
7700 + P_MEMB("Error adding node\n");
7703 + if (list_empty(&newnode->addr_list)) {
7704 + for (i = 0; i < num_addr; i++) {
7705 + add_node_address(newnode,
7706 + addrs + i * address_length, address_length);
7712 + P_MEMB("Already got node with name %s\n", node->name);
7717 +/* Call a specified routine for each node unpacked from the message. Return
7718 + * either the number of nodes found or -1 for an error */
7719 +static int unpack_nodes(unsigned char *buf, int len,
7720 + int (*routine) (struct cluster_node *, char *,
7724 + int num_nodes = 0;
7725 + char nodename[MAX_CLUSTER_MEMBER_NAME_LEN];
7726 + struct cluster_node node;
7728 + node.name = nodename;
7730 + while (ptr < len) {
7731 + int namelen = buf[ptr++];
7732 + unsigned int evotes;
7733 + unsigned int node_id;
7734 + unsigned short num_addr;
7735 + unsigned char *addrs;
7737 + memcpy(nodename, &buf[ptr], namelen);
7738 + nodename[namelen] = '\0';
7741 + node.state = buf[ptr++];
7743 + memcpy(&num_addr, &buf[ptr], sizeof (short));
7744 + num_addr = le16_to_cpu(num_addr);
7745 + ptr += sizeof (short);
7747 + /* Just make a note of the addrs "array" */
7748 + addrs = &buf[ptr];
7749 + ptr += num_addr * address_length;
7751 + node.votes = buf[ptr++];
7753 + memcpy(&evotes, &buf[ptr], sizeof (int));
7754 + node.expected_votes = le32_to_cpu(evotes);
7755 + ptr += sizeof (int);
7757 + memcpy(&node_id, &buf[ptr], sizeof (int));
7758 + node.node_id = le32_to_cpu(node_id);
7759 + ptr += sizeof (int);
7761 + /* Call the callback routine */
7762 + if (routine(&node, addrs, num_addr) < 0)
7765 + /* Return the number of MEMBER nodes */
7766 + if (node.state == NODESTATE_MEMBER)
7772 +/* Got join confirmation from a master node. This message contains a list of
7773 + * cluster nodes which we unpack and build into our cluster nodes list. When we
7774 + * have the last message we can go into TRANSITION state */
7775 +static int do_process_joinconf(struct msghdr *msg, char *buf, int len)
7777 + if (unpack_nodes(buf + 2, len - 2, add_node) < 0) {
7779 + ": Error procssing joinconf message - giving up on cluster join\n");
7780 + us->leave_reason = CLUSTER_LEAVEFLAG_PANIC;
7781 + node_state = LEFT_CLUSTER;
7785 + /* Last message in the list? */
7788 + struct sockaddr_cl *addr = msg->msg_name;
7790 + us->state = NODESTATE_MEMBER;
7791 + node_state = TRANSITION;
7792 + we_are_a_cluster_member = TRUE;
7794 + ackmsg = CLUSTER_MEM_CONFACK;
7795 + kcl_sendmsg(mem_socket, &ackmsg, 1, addr,
7796 + sizeof (struct sockaddr_cl),
7798 + kernel_thread(hello_kthread, NULL, 0);
7799 + mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
7804 +/* Got the master's view of the cluster - compare it with ours and tell it the
7806 +static int do_process_masterview(struct msghdr *msg, char *buf, int len)
7808 + char reply[2] = { CLUSTER_MEM_VIEWACK, 0 };
7809 + static int num_nodes;
7811 + /* Someone else's state transition */
7812 + if (node_state != MEMBER &&
7813 + node_state != TRANSITION && node_state != MASTER)
7816 + /* First message, zero the counter */
7820 + num_nodes += unpack_nodes(buf + 2, len - 2, check_node);
7822 + /* Last message, check the count and reply */
7824 + if (num_nodes == cluster_members) {
7830 + ("Got %d nodes in MASTERVIEW message, we think there s/b %d\n",
7831 + num_nodes, cluster_members);
7835 + kcl_sendmsg(mem_socket, reply, 2, msg->msg_name,
7836 + msg->msg_namelen, 0);
7841 +static int do_process_leave(struct msghdr *msg, char *buf, int len)
7843 + struct cluster_node *node;
7844 + struct sockaddr_cl *saddr = msg->msg_name;
7845 + unsigned char *leavemsg = (unsigned char *)buf;
7847 + if ((node = find_node_by_nodeid(saddr->scl_nodeid))) {
7848 + unsigned char reason = leavemsg[1];
7850 + if (node->state != NODESTATE_DEAD) {
7851 + printk(KERN_INFO CMAN_NAME
7852 + ": Node %s is leaving the cluster, %s\n",
7853 + node->name, leave_string(reason));
7855 + node->leave_reason = reason;
7857 + leavereason = (reason == CLUSTER_LEAVEFLAG_REMOVED ? 1 : 0);
7859 + a_node_just_died(node);
7864 +static int do_process_hello(struct msghdr *msg, char *buf, int len)
7866 + struct cluster_node *node;
7867 + struct cl_mem_hello_msg *hellomsg =
7868 + (struct cl_mem_hello_msg *)buf;
7869 + struct sockaddr_cl *saddr = msg->msg_name;
7871 + /* We are starting up. Send a join message to the node whose HELLO we
7872 + * just received */
7873 + if (node_state == STARTING || node_state == JOINWAIT ||
7874 + node_state == JOINING || node_state == NEWCLUSTER) {
7875 + struct sockaddr_cl *addr = msg->msg_name;
7877 + printk(KERN_INFO CMAN_NAME ": sending membership request\n");
7879 + send_joinreq(addr, msg->msg_namelen);
7880 + join_time = jiffies;
7881 + node_state = JOINING;
7885 + /* Only process HELLOs if we are not in transition */
7886 + if (node_state == MEMBER) {
7888 + node = find_node_by_nodeid(saddr->scl_nodeid);
7889 + if (node && node->state != NODESTATE_DEAD) {
7891 + /* Check the cluster generation in the HELLO message.
7892 + * NOTE: this may be different if the message crossed
7893 + * on the wire with an END-TRANS so we allow a period
7894 + * of grace in which this is allowable */
7895 + if (cluster_generation !=
7896 + le32_to_cpu(hellomsg->generation)
7897 + && node_state == MEMBER
7898 + && time_after(jiffies,
7899 + cman_config.hello_timer * HZ +
7900 + transition_end_time)) {
7902 + printk(KERN_INFO CMAN_NAME
7903 + ": bad generation number %d in HELLO message, expected %d\n",
7904 + le32_to_cpu(hellomsg->generation),
7905 + cluster_generation);
7907 + notify_kernel_listeners(DIED,
7908 + (long) node->node_id);
7910 + send_kill(node->node_id);
7914 + if (cluster_members != le16_to_cpu(hellomsg->members)
7915 + && node_state == MEMBER) {
7916 + printk(KERN_INFO CMAN_NAME
7917 + ": nmembers in HELLO message does not match our view (got %d, exp %d)\n",
7918 + le16_to_cpu(hellomsg->members), cluster_members);
7919 + start_transition(TRANS_CHECK, node);
7922 + /* The message is OK - save the time */
7923 + node->last_hello = jiffies;
7926 + /* This node is a danger to our valid cluster */
7927 + if (cluster_is_quorate) {
7928 + send_kill(saddr->scl_nodeid);
7937 +static int do_process_kill(struct msghdr *msg, char *buf, int len)
7939 + struct sockaddr_cl *saddr = msg->msg_name;
7940 + struct cluster_node *node;
7942 + node = find_node_by_nodeid(saddr->scl_nodeid);
7943 + if (node && node->state == NODESTATE_MEMBER) {
7945 + printk(KERN_INFO CMAN_NAME
7946 + ": Being told to leave the cluster by node %d\n",
7947 + saddr->scl_nodeid);
7949 + node_state = LEFT_CLUSTER;
7951 + wake_up_process(membership_task);
7952 + wake_up_interruptible(&cnxman_waitq);
7955 + P_MEMB("Asked to leave the cluster by a non-member. What a nerve!\n");
7960 +/* Some cluster membership utility functions */
7961 +struct cluster_node *find_node_by_name(char *name)
7963 + struct list_head *nodelist;
7964 + struct cluster_node *node;
7966 + down(&cluster_members_lock);
7967 + list_for_each(nodelist, &cluster_members_list) {
7968 + node = list_entry(nodelist, struct cluster_node, list);
7970 + if (strcmp(node->name, name) == 0) {
7971 + up(&cluster_members_lock);
7975 + up(&cluster_members_lock);
7979 +/* Try to avoid using this as it's slow and holds the members lock */
7980 +struct cluster_node *find_node_by_addr(unsigned char *addr, int addr_len)
7982 + struct list_head *nodelist;
7983 + struct list_head *addrlist;
7984 + struct cluster_node *node;
7985 + struct cluster_node_addr *nodeaddr;
7987 + down(&cluster_members_lock);
7989 + list_for_each(nodelist, &cluster_members_list) {
7990 + node = list_entry(nodelist, struct cluster_node, list);
7992 + list_for_each(addrlist, &node->addr_list) {
7994 + list_entry(addrlist, struct cluster_node_addr,
7997 + if (memcmp(nodeaddr->addr+2, addr+2, address_length-2) == 0) {
7998 + up(&cluster_members_lock);
8004 + up(&cluster_members_lock);
8008 +/* This is the quick way to find a node */
8009 +struct cluster_node *find_node_by_nodeid(unsigned int id)
8011 + struct cluster_node *node;
8013 + if (id > sizeof_members_array)
8016 + spin_lock(&members_by_nodeid_lock);
8017 + node = members_by_nodeid[id];
8018 + spin_unlock(&members_by_nodeid_lock);
8022 +static int dispatch_messages(struct socket *mem_socket)
8026 + while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
8027 + struct msghdr msg;
8029 + struct sockaddr_cl sin;
8032 + memset(&sin, 0, sizeof (sin));
8034 + msg.msg_control = NULL;
8035 + msg.msg_controllen = 0;
8036 + msg.msg_name = &sin;
8037 + msg.msg_namelen = sizeof (sin);
8038 + msg.msg_flags = 0;
8040 + vec.iov_len = MAX_CLUSTER_MESSAGE;
8041 + vec.iov_base = iobuf;
8043 + len = kernel_recvmsg(mem_socket, &msg, &vec, 1,
8044 + MAX_CLUSTER_MESSAGE,
8047 + msg.msg_name = &sin;
8048 + do_membership_packet(&msg, iobuf, len);
8051 + if (len == -EAGAIN)
8061 +/* Scan the nodes list for dead nodes */
8062 +static void check_for_dead_nodes()
8064 + struct list_head *nodelist;
8065 + struct cluster_node *node;
8067 + down(&cluster_members_lock);
8068 + list_for_each(nodelist, &cluster_members_list) {
8069 + node = list_entry(nodelist, struct cluster_node, list);
8071 + if (node->state != NODESTATE_DEAD &&
8072 + time_after(jiffies,
8073 + node->last_hello +
8074 + cman_config.deadnode_timeout * HZ) && !node->us) {
8076 + up(&cluster_members_lock);
8078 + printk(KERN_WARNING CMAN_NAME
8079 + ": no HELLO from %s, removing from the cluster\n",
8082 + P_MEMB("last hello was %ld, current time is %ld\n",
8083 + node->last_hello, jiffies);
8085 + node->leave_reason = CLUSTER_LEAVEFLAG_DEAD;
8088 + /* This is unlikely to work but it's worth a try! */
8089 + send_kill(node->node_id);
8091 + /* Start state transition */
8092 + a_node_just_died(node);
8096 + up(&cluster_members_lock);
8098 + /* Also check for a dead quorum device */
8099 + if (quorum_device) {
8100 + if (quorum_device->state == NODESTATE_MEMBER &&
8101 + time_after(jiffies,
8102 + quorum_device->last_hello +
8103 + cman_config.deadnode_timeout * HZ)) {
8104 + quorum_device->state = NODESTATE_DEAD;
8105 + printk(KERN_WARNING CMAN_NAME
8106 + ": Quorum device %s timed out\n",
8107 + quorum_device->name);
8108 + recalculate_quorum(0);
8115 +/* add "us" as a node in the cluster */
8116 +static int add_us()
8118 + struct cluster_node *newnode =
8119 + kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
8122 + /* Oh shit, we have to commit hara kiri here for the greater
8123 + * good of the cluster */
8124 + send_leave(CLUSTER_LEAVEFLAG_PANIC);
8126 + printk(KERN_CRIT CMAN_NAME
8127 + ": Cannot allocate memory for our node structure\n");
8128 + panic("Must die");
8133 + memset(newnode, 0, sizeof (struct cluster_node));
8134 + newnode->name = kmalloc(strlen(nodename) + 1, GFP_KERNEL);
8135 + if (!newnode->name) {
8136 + send_leave(CLUSTER_LEAVEFLAG_PANIC);
8138 + printk(KERN_CRIT CMAN_NAME
8139 + ": Cannot allocate memory for node name\n");
8142 + panic("Must die");
8147 + strcpy(newnode->name, nodename);
8148 + newnode->last_hello = jiffies;
8149 + newnode->votes = votes;
8150 + newnode->expected_votes = expected_votes;
8151 + newnode->state = NODESTATE_JOINING;
8152 + newnode->node_id = 0; /* Will get filled in by ENDTRANS message */
8154 + newnode->leave_reason = 0;
8155 + INIT_LIST_HEAD(&newnode->addr_list);
8156 + get_local_addresses(newnode); /* Get from cnxman socket info */
8158 + /* Add the new node to the list */
8159 + down(&cluster_members_lock);
8160 + list_add(&newnode->list, &cluster_members_list);
8161 + cluster_members++;
8162 + up(&cluster_members_lock);
8168 +/* Return the highest known node_id */
8169 +unsigned int get_highest_nodeid()
8171 + struct list_head *nodelist;
8172 + struct cluster_node *node = NULL;
8173 + unsigned int highest = 0;
8175 + down(&cluster_members_lock);
8176 + list_for_each(nodelist, &cluster_members_list) {
8177 + node = list_entry(nodelist, struct cluster_node, list);
8179 + if (node->node_id > highest)
8180 + highest = node->node_id;
8182 + up(&cluster_members_lock);
8187 +/* Elect a new master if there is a clash. Returns 1 if we are the new master,
8188 + * the master's struct will also be returned. This, rather primitively, uses
8189 + * the lowest node ID */
8190 +static int elect_master(struct cluster_node **master_node)
8194 + for (i = 1; i < sizeof_members_array; i++) {
8195 + if (members_by_nodeid[i]
8196 + && members_by_nodeid[i]->state == NODESTATE_MEMBER) {
8197 + *master_node = members_by_nodeid[i];
8198 + P_MEMB("Elected master is %s\n", (*master_node)->name);
8199 + return (*master_node)->us;
8206 +/* Called by node_cleanup in cnxman when we have left the cluster */
8207 +void free_nodeid_array()
8209 + vfree(members_by_nodeid);
8210 + members_by_nodeid = NULL;
8211 + sizeof_members_array = 0;
8214 +int allocate_nodeid_array()
8216 + /* Allocate space for the nodeid lookup array */
8217 + if (!members_by_nodeid) {
8218 + spin_lock_init(&members_by_nodeid_lock);
8219 + members_by_nodeid =
8220 + vmalloc(cman_config.max_nodes *
8221 + sizeof (struct cluster_member *));
8224 + if (!members_by_nodeid) {
8225 + printk(KERN_WARNING
8226 + "Unable to allocate members array for %d members\n",
8227 + cman_config.max_nodes);
8230 + memset(members_by_nodeid, 0,
8231 + cman_config.max_nodes * sizeof (struct cluster_member *));
8232 + sizeof_members_array = cman_config.max_nodes;
8237 +/* Set the votes & expected_votes variables */
8238 +void set_votes(int v, int e)
8241 + expected_votes = e;
8249 +/* Called by cnxman to see if activity should be blocked because we are in a
8250 + * state transition */
8251 +int in_transition()
8253 + return node_state == TRANSITION ||
8254 + node_state == TRANSITION_COMPLETE || node_state == MASTER;
8257 +/* Return the current membership state as a string for the main line to put
8258 + * into /proc . I really should be using snprintf rather than sprintf but it's
8259 + * not exported... */
8260 +char *membership_state(char *buf, int buflen)
8262 + switch (node_state) {
8264 + strncpy(buf, "Starting", buflen);
8267 + strncpy(buf, "New-Cluster?", buflen);
8270 + strncpy(buf, "Joining", buflen);
8273 + strncpy(buf, "Join-Wait", buflen);
8276 + strncpy(buf, "Join-Ack", buflen);
8279 + sprintf(buf, "State-Transition: Master is %s",
8280 + master_node ? master_node->name : "Unknown");
8283 + strncpy(buf, "Cluster-Member", buflen);
8286 + strncpy(buf, "Rejected", buflen);
8288 + case LEFT_CLUSTER:
8289 + strncpy(buf, "Not-in-Cluster", buflen);
8291 + case TRANSITION_COMPLETE:
8292 + strncpy(buf, "Transition-Complete", buflen);
8295 + strncpy(buf, "Transition-Master", buflen);
8298 + sprintf(buf, "Unknown: code=%d", node_state);
8305 +char *leave_string(int reason)
8307 + static char msg[32];
8310 + case CLUSTER_LEAVEFLAG_DOWN:
8311 + return "Shutdown";
8312 + case CLUSTER_LEAVEFLAG_KILLED:
8313 + return "Killed by another node";
8314 + case CLUSTER_LEAVEFLAG_PANIC:
8316 + case CLUSTER_LEAVEFLAG_REMOVED:
8318 + case CLUSTER_LEAVEFLAG_REJECTED:
8319 + return "Membership rejected";
8321 + sprintf(msg, "Reason is %d\n", reason);
8327 +static char *msgname(int msg)
8330 + case CLUSTER_MEM_JOINCONF:
8331 + return "JOINCONF";
8332 + case CLUSTER_MEM_JOINREQ:
8334 + case CLUSTER_MEM_LEAVE:
8336 + case CLUSTER_MEM_HELLO:
8338 + case CLUSTER_MEM_KILL:
8340 + case CLUSTER_MEM_JOINACK:
8342 + case CLUSTER_MEM_ENDTRANS:
8343 + return "ENDTRANS";
8344 + case CLUSTER_MEM_RECONFIG:
8345 + return "RECONFIG";
8346 + case CLUSTER_MEM_MASTERVIEW:
8347 + return "MASTERVIEW";
8348 + case CLUSTER_MEM_STARTTRANS:
8349 + return "STARTTRANS";
8350 + case CLUSTER_MEM_JOINREJ:
8352 + case CLUSTER_MEM_VIEWACK:
8354 + case CLUSTER_MEM_STARTACK:
8355 + return "STARTACK";
8356 + case CLUSTER_MEM_NEWCLUSTER:
8357 + return "NEWCLUSTER";
8358 + case CLUSTER_MEM_CONFACK:
8360 + case CLUSTER_MEM_NOMINATE:
8361 + return "NOMINATE";
8364 + return "??UNKNOWN??";
8371 + * Overrides for Emacs so that we follow Linus's tabbing style.
8372 + * Emacs will notice this stuff at the end of the file and automatically
8373 + * adjust the settings for this buffer only. This must remain at the end
8375 + * ---------------------------------------------------------------------------
8376 + * Local variables:
8377 + * c-file-style: "linux"
8380 diff -urN linux-orig/cluster/cman/proc.c linux-patched/cluster/cman/proc.c
8381 --- linux-orig/cluster/cman/proc.c 1970-01-01 07:30:00.000000000 +0730
8382 +++ linux-patched/cluster/cman/proc.c 2004-11-03 11:37:37.000000000 +0800
8384 +/******************************************************************************
8385 +*******************************************************************************
8387 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8388 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8390 +** This copyrighted material is made available to anyone wishing to use,
8391 +** modify, copy, or redistribute it subject to the terms and conditions
8392 +** of the GNU General Public License v.2.
8394 +*******************************************************************************
8395 +******************************************************************************/
8397 +#include <linux/init.h>
8398 +#include <linux/socket.h>
8399 +#include <linux/kernel.h>
8400 +#include <linux/sched.h>
8401 +#include <linux/file.h>
8402 +#include <linux/proc_fs.h>
8403 +#include <linux/seq_file.h>
8404 +#include <linux/list.h>
8405 +#include <linux/in.h>
8406 +#include <net/sock.h>
8407 +#include <cluster/cnxman.h>
8408 +#include <cluster/service.h>
8410 +#include "cnxman-private.h"
8411 +#include "config.h"
8413 +extern int cluster_members;
8414 +extern struct list_head cluster_members_list;
8415 +extern struct semaphore cluster_members_lock;
8416 +extern struct cluster_node *quorum_device;
8417 +extern int we_are_a_cluster_member;
8418 +extern int cluster_is_quorate;
8419 +extern uint16_t cluster_id;
8420 +extern atomic_t use_count;
8421 +extern unsigned int address_length;
8422 +extern unsigned int config_version;
8423 +extern char cluster_name[];
8424 +extern struct cluster_node *us;
8425 +static struct seq_operations cluster_info_op;
8427 +int sm_proc_open(struct inode *inode, struct file *file);
8428 +int sm_debug_info(char *b, char **start, off_t offset, int length);
8430 +/* /proc interface to the configuration struct */
8431 +static struct config_proc_info {
8434 +} config_proc[] = {
8436 + .name = "joinwait_timeout",
8437 + .value = &cman_config.joinwait_timeout,
8440 + .name = "joinconf_timeout",
8441 + .value = &cman_config.joinconf_timeout,
8444 + .name = "join_timeout",
8445 + .value = &cman_config.join_timeout,
8448 + .name = "hello_timer",
8449 + .value = &cman_config.hello_timer,
8452 + .name = "deadnode_timeout",
8453 + .value = &cman_config.deadnode_timeout,
8456 + .name = "transition_timeout",
8457 + .value = &cman_config.transition_timeout,
8460 + .name = "transition_restarts",
8461 + .value = &cman_config.transition_restarts,
8464 + .name = "max_nodes",
8465 + .value = &cman_config.max_nodes,
8468 + .name = "sm_debug_size",
8469 + .value = &cman_config.sm_debug_size,
8472 + .name = "newcluster_timeout",
8473 + .value = &cman_config.newcluster_timeout,
8478 +static int proc_cluster_status(char *b, char **start, off_t offset, int length)
8480 + struct list_head *nodelist;
8481 + struct cluster_node *node;
8482 + struct cluster_node_addr *node_addr;
8483 + unsigned int total_votes = 0;
8484 + unsigned int max_expected = 0;
8486 + char node_buf[MAX_CLUSTER_MEMBER_NAME_LEN];
8489 + "Version: %d.%d.%d\n",
8490 + CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
8491 + CNXMAN_PATCH_VERSION);
8494 + "Config version: %d\nCluster name: %s\nCluster ID: %d\nMembership state: %s\n",
8496 + cluster_name, cluster_id,
8497 + membership_state(node_buf, sizeof (node_buf)));
8499 + if (!we_are_a_cluster_member)
8502 + /* Total the votes */
8503 + down(&cluster_members_lock);
8504 + list_for_each(nodelist, &cluster_members_list) {
8505 + node = list_entry(nodelist, struct cluster_node, list);
8506 + if (node->state == NODESTATE_MEMBER) {
8507 + total_votes += node->votes;
8509 + max(max_expected, node->expected_votes);
8512 + up(&cluster_members_lock);
8514 + if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
8515 + total_votes += quorum_device->votes;
8518 + "Nodes: %d\nExpected_votes: %d\nTotal_votes: %d\nQuorum: %d %s\n",
8519 + cluster_members, max_expected, total_votes,
8521 + cluster_is_quorate ? " " : "Activity blocked");
8522 + c += sprintf(b+c, "Active subsystems: %d\n",
8523 + atomic_read(&use_count));
8526 + c += sprintf(b+c, "Node addresses: ");
8527 + list_for_each_entry(node_addr, &us->addr_list, list) {
8528 + struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)node_addr->addr;
8529 + if (saddr->sin6_family == AF_INET6) {
8530 + c += sprintf(b+c, "%x:%x:%x:%x:%x:%x:%x:%x ",
8531 + be16_to_cpu(saddr->sin6_addr.s6_addr16[0]),
8532 + be16_to_cpu(saddr->sin6_addr.s6_addr16[1]),
8533 + be16_to_cpu(saddr->sin6_addr.s6_addr16[2]),
8534 + be16_to_cpu(saddr->sin6_addr.s6_addr16[3]),
8535 + be16_to_cpu(saddr->sin6_addr.s6_addr16[4]),
8536 + be16_to_cpu(saddr->sin6_addr.s6_addr16[5]),
8537 + be16_to_cpu(saddr->sin6_addr.s6_addr16[6]),
8538 + be16_to_cpu(saddr->sin6_addr.s6_addr16[7]));
8541 + struct sockaddr_in *saddr4 = (struct sockaddr_in *)saddr;
8542 + uint8_t *addr = (uint8_t *)&saddr4->sin_addr;
8543 + c+= sprintf(b+c, "%u.%u.%u.%u ",
8544 + addr[0], addr[1], addr[2], addr[3]);
8547 + c += sprintf(b+c, "\n\n");
8552 +/* Allocate one of these for /proc/cluster/nodes so we can keep a track of where
8554 +struct cluster_seq_info {
8556 + int highest_nodeid;
8559 +static int cluster_open(struct inode *inode, struct file *file)
8561 + return seq_open(file, &cluster_info_op);
8564 +static void *cluster_seq_start(struct seq_file *m, loff_t * pos)
8566 + struct cluster_seq_info *csi =
8567 + kmalloc(sizeof (struct cluster_seq_info), GFP_KERNEL);
8572 + /* Keep highest_nodeid here so we don't need to keep traversing the
8573 + * list to find it */
8574 + csi->nodeid = *pos;
8575 + csi->highest_nodeid = get_highest_nodeid();
8577 + /* Print the header */
8579 + seq_printf(m, "Node Votes Exp Sts Name\n");
8584 +static void *cluster_seq_next(struct seq_file *m, void *p, loff_t * pos)
8586 + struct cluster_seq_info *csi = p;
8588 + *pos = ++csi->nodeid;
8589 + if (csi->nodeid > csi->highest_nodeid)
8595 +static int cluster_seq_show(struct seq_file *m, void *p)
8598 + struct cluster_node *node;
8599 + struct cluster_seq_info *csi = p;
8602 + * If we have "0" here then display the quorum device if
8605 + if (csi->nodeid == 0)
8606 + node = quorum_device;
8608 + node = find_node_by_nodeid(csi->nodeid);
8613 + /* Make state printable */
8614 + switch (node->state) {
8615 + case NODESTATE_MEMBER:
8618 + case NODESTATE_JOINING:
8621 + case NODESTATE_DEAD:
8625 + seq_printf(m, "%4d %3d %3d %c %s\n",
8628 + node->expected_votes,
8635 +static void cluster_seq_stop(struct seq_file *m, void *p)
8640 +static struct seq_operations cluster_info_op = {
8641 + .start = cluster_seq_start,
8642 + .next = cluster_seq_next,
8643 + .stop = cluster_seq_stop,
8644 + .show = cluster_seq_show
8647 +static struct file_operations cluster_fops = {
8648 + .open = cluster_open,
8650 + .llseek = seq_lseek,
8651 + .release = seq_release,
8652 + .owner = THIS_MODULE,
8655 +static struct file_operations service_fops = {
8656 + .open = sm_proc_open,
8658 + .llseek = seq_lseek,
8659 + .release = seq_release,
8660 + .owner = THIS_MODULE,
8663 +static int cman_config_read_proc(char *page, char **start, off_t off, int count,
8664 + int *eof, void *data)
8666 + struct config_proc_info *cinfo = data;
8668 + return snprintf(page, count, "%d\n", *cinfo->value);
8671 +static int cman_config_write_proc(struct file *file, const char *buffer,
8672 + unsigned long count, void *data)
8674 + struct config_proc_info *cinfo = data;
8678 + value = simple_strtoul(buffer, &end, 10);
8680 + *cinfo->value = value;
8685 +/* Base of the config directory for cman */
8686 +static struct proc_dir_entry *proc_cman_config;
8687 +void create_proc_entries(void)
8689 + struct proc_dir_entry *procentry;
8690 + struct proc_dir_entry *proc_cluster;
8693 + proc_cluster = proc_mkdir("cluster", 0);
8694 + if (!proc_cluster)
8696 + proc_cluster->owner = THIS_MODULE;
8698 + /* Config dir filled in by us and others */
8699 + if (!proc_mkdir("cluster/config", 0))
8702 + /* Don't much care if this fails, it's hardly vital */
8703 + procentry = create_proc_entry("cluster/nodes", S_IRUGO, NULL);
8705 + procentry->proc_fops = &cluster_fops;
8707 + procentry = create_proc_entry("cluster/status", S_IRUGO, NULL);
8709 + procentry->get_info = proc_cluster_status;
8711 + procentry = create_proc_entry("cluster/services", S_IRUGO, NULL);
8713 + procentry->proc_fops = &service_fops;
8715 + /* Config entries */
8716 + proc_cman_config = proc_mkdir("cluster/config/cman", 0);
8717 + if (!proc_cman_config)
8720 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
8721 + procentry = create_proc_entry(config_proc[i].name, 0660,
8722 + proc_cman_config);
8724 + procentry->data = &config_proc[i];
8725 + procentry->write_proc = cman_config_write_proc;
8726 + procentry->read_proc = cman_config_read_proc;
8730 + procentry = create_proc_entry("cluster/sm_debug", S_IRUGO, NULL);
8732 + procentry->get_info = sm_debug_info;
8735 +void cleanup_proc_entries(void)
8737 + int i, config_count;
8739 + remove_proc_entry("cluster/sm_debug", NULL);
8741 + config_count = sizeof(config_proc) / sizeof(struct config_proc_info);
8743 + if (proc_cman_config) {
8744 + for (i=0; i<config_count; i++)
8745 + remove_proc_entry(config_proc[i].name, proc_cman_config);
8747 + remove_proc_entry("cluster/config/cman", NULL);
8748 + remove_proc_entry("cluster/config", NULL);
8750 + remove_proc_entry("cluster/nodes", NULL);
8751 + remove_proc_entry("cluster/status", NULL);
8752 + remove_proc_entry("cluster/services", NULL);
8753 + remove_proc_entry("cluster/config", NULL);
8754 + remove_proc_entry("cluster", NULL);
8756 diff -urN linux-orig/cluster/cman/sm.h linux-patched/cluster/cman/sm.h
8757 --- linux-orig/cluster/cman/sm.h 1970-01-01 07:30:00.000000000 +0730
8758 +++ linux-patched/cluster/cman/sm.h 2004-11-03 11:37:37.000000000 +0800
8760 +/******************************************************************************
8761 +*******************************************************************************
8763 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8764 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8766 +** This copyrighted material is made available to anyone wishing to use,
8767 +** modify, copy, or redistribute it subject to the terms and conditions
8768 +** of the GNU General Public License v.2.
8770 +*******************************************************************************
8771 +******************************************************************************/
8773 +#ifndef __SM_DOT_H__
8774 +#define __SM_DOT_H__
8777 + * This is the main header file to be included in each Service Manager source
8781 +#include <linux/list.h>
8782 +#include <linux/socket.h>
8783 +#include <linux/kernel.h>
8784 +#include <linux/sched.h>
8785 +#include <linux/file.h>
8786 +#include <linux/kthread.h>
8787 +#include <net/sock.h>
8789 +#include <cluster/cnxman.h>
8790 +#include <cluster/service.h>
8792 +#define SG_LEVELS (4)
8794 +#include "sm_internal.h"
8795 +#include "sm_barrier.h"
8796 +#include "sm_control.h"
8797 +#include "sm_daemon.h"
8798 +#include "sm_joinleave.h"
8799 +#include "sm_membership.h"
8800 +#include "sm_message.h"
8801 +#include "sm_misc.h"
8802 +#include "sm_recover.h"
8803 +#include "sm_services.h"
8805 +extern struct list_head sm_sg[SG_LEVELS];
8806 +extern struct semaphore sm_sglock;
8816 +#define SM_ASSERT(x, do) \
8820 + printk("\nSM: Assertion failed on line %d of file %s\n" \
8821 + "SM: assertion: \"%s\"\n" \
8822 + "SM: time = %lu\n", \
8823 + __LINE__, __FILE__, #x, jiffies); \
8826 + panic("SM: Record message above and reboot.\n"); \
8830 +#define SM_RETRY(do_this, until_this) \
8833 + do { do_this; } while (0); \
8836 + printk("SM: out of memory: %s, %u\n", __FILE__, __LINE__); \
8841 +#define log_print(fmt, args...) printk("SM: "fmt"\n", ##args)
8843 +#define log_error(sg, fmt, args...) \
8844 + printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
8847 +#define SM_DEBUG_LOG
8849 +#ifdef SM_DEBUG_CONSOLE
8850 +#define log_debug(sg, fmt, args...) \
8851 + printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
8854 +#ifdef SM_DEBUG_LOG
8855 +#define log_debug(sg, fmt, args...) sm_debug_log(sg, fmt, ##args);
8858 +#ifdef SM_DEBUG_ALL
8859 +#define log_debug(sg, fmt, args...) \
8862 + printk("SM: %08x "fmt"\n", (sg)->global_id, ##args); \
8863 + sm_debug_log(sg, fmt, ##args); \
8868 +#endif /* __SM_DOT_H__ */
8869 diff -urN linux-orig/cluster/cman/sm_barrier.c linux-patched/cluster/cman/sm_barrier.c
8870 --- linux-orig/cluster/cman/sm_barrier.c 1970-01-01 07:30:00.000000000 +0730
8871 +++ linux-patched/cluster/cman/sm_barrier.c 2004-11-03 11:37:37.000000000 +0800
8873 +/******************************************************************************
8874 +*******************************************************************************
8876 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8877 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8879 +** This copyrighted material is made available to anyone wishing to use,
8880 +** modify, copy, or redistribute it subject to the terms and conditions
8881 +** of the GNU General Public License v.2.
8883 +*******************************************************************************
8884 +******************************************************************************/
8888 +static struct list_head barriers;
8889 +static spinlock_t barriers_lock;
8892 + struct list_head list;
8897 +typedef struct bc_entry bc_entry_t;
8899 +void init_barriers(void)
8901 + INIT_LIST_HEAD(&barriers);
8902 + spin_lock_init(&barriers_lock);
8905 +static int atoi(char *c)
8909 + while ('0' <= *c && *c <= '9') {
8910 + x = x * 10 + (*c - '0');
8916 +static void add_barrier_callback(char *name, int status, int type)
8922 + /* an ESRCH callback just means there was a cnxman transition */
8923 + if (status == -ESRCH)
8926 + /* extract global id of SG from barrier name */
8927 + p = strstr(name, "sm.");
8929 + SM_ASSERT(p, printk("name=\"%s\" status=%d\n", name, status););
8931 + p += strlen("sm.");
8934 + SM_RETRY(be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC), be);
8937 + be->status = status;
8940 + spin_lock(&barriers_lock);
8941 + list_add_tail(&be->list, &barriers);
8942 + spin_unlock(&barriers_lock);
8944 + wake_serviced(DO_BARRIERS);
8947 +static void callback_recovery_barrier(char *name, int status)
8949 + add_barrier_callback(name, status, SM_BARRIER_RECOVERY);
8952 +static void callback_startdone_barrier_new(char *name, int status)
8954 + add_barrier_callback(name, status, SM_BARRIER_STARTDONE_NEW);
8957 +static void callback_startdone_barrier(char *name, int status)
8959 + add_barrier_callback(name, status, SM_BARRIER_STARTDONE);
8962 +int sm_barrier(char *name, int count, int type)
8965 + unsigned long fn = 0;
8968 + case SM_BARRIER_STARTDONE:
8969 + fn = (unsigned long) callback_startdone_barrier;
8971 + case SM_BARRIER_STARTDONE_NEW:
8972 + fn = (unsigned long) callback_startdone_barrier_new;
8974 + case SM_BARRIER_RECOVERY:
8975 + fn = (unsigned long) callback_recovery_barrier;
8979 + error = kcl_barrier_register(name, 0, count);
8981 + log_print("barrier register error %d", error);
8985 + error = kcl_barrier_setattr(name, BARRIER_SETATTR_AUTODELETE, TRUE);
8987 + log_print("barrier setattr autodel error %d", error);
8991 + error = kcl_barrier_setattr(name, BARRIER_SETATTR_CALLBACK, fn);
8993 + log_print("barrier setattr cb error %d", error);
8997 + error = kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, TRUE);
8999 + log_print("barrier setattr enabled error %d", error);
9006 + kcl_barrier_delete(name);
9011 +void process_startdone_barrier_new(sm_group_t *sg, int status)
9013 + sm_sevent_t *sev = sg->sevent;
9015 + if (!test_and_clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags)) {
9016 + log_debug(sev->se_sg, "ignore barrier cb status %d", status);
9020 + sev->se_barrier_status = status;
9021 + sev->se_state = SEST_BARRIER_DONE;
9022 + set_bit(SEFL_CHECK, &sev->se_flags);
9023 + wake_serviced(DO_JOINLEAVE);
9026 +void process_startdone_barrier(sm_group_t *sg, int status)
9028 + sm_uevent_t *uev = &sg->uevent;
9030 + if (!test_and_clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags)) {
9031 + log_debug(sg, "ignore barrier cb status %d", status);
9035 + uev->ue_barrier_status = status;
9036 + uev->ue_state = UEST_BARRIER_DONE;
9037 + set_bit(UEFL_CHECK, &uev->ue_flags);
9038 + wake_serviced(DO_MEMBERSHIP);
9041 +void process_recovery_barrier(sm_group_t *sg, int status)
9044 + log_error(sg, "process_recovery_barrier status=%d", status);
9048 + if (sg->state != SGST_RECOVER ||
9049 + sg->recover_state != RECOVER_BARRIERWAIT) {
9050 + log_error(sg, "process_recovery_barrier state %d recover %d",
9051 + sg->state, sg->recover_state);
9055 + if (!sg->recover_stop)
9056 + sg->recover_state = RECOVER_STOP;
9058 + sg->recover_state = RECOVER_BARRIERDONE;
9060 + wake_serviced(DO_RECOVERIES);
9063 +void process_barriers(void)
9071 + spin_lock(&barriers_lock);
9072 + if (!list_empty(&barriers)) {
9073 + be = list_entry(barriers.next, bc_entry_t, list);
9074 + list_del(&be->list);
9076 + spin_unlock(&barriers_lock);
9081 + sg = sm_global_id_to_sg(be->gid);
9083 + log_print("process_barriers: no sg %08x", be->gid);
9087 + switch (be->type) {
9088 + case SM_BARRIER_STARTDONE_NEW:
9089 + process_startdone_barrier_new(sg, be->status);
9092 + case SM_BARRIER_STARTDONE:
9093 + process_startdone_barrier(sg, be->status);
9096 + case SM_BARRIER_RECOVERY:
9097 + process_recovery_barrier(sg, be->status);
9105 diff -urN linux-orig/cluster/cman/sm_barrier.h linux-patched/cluster/cman/sm_barrier.h
9106 --- linux-orig/cluster/cman/sm_barrier.h 1970-01-01 07:30:00.000000000 +0730
9107 +++ linux-patched/cluster/cman/sm_barrier.h 2004-11-03 11:37:37.000000000 +0800
9109 +/******************************************************************************
9110 +*******************************************************************************
9112 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9113 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9115 +** This copyrighted material is made available to anyone wishing to use,
9116 +** modify, copy, or redistribute it subject to the terms and conditions
9117 +** of the GNU General Public License v.2.
9119 +*******************************************************************************
9120 +******************************************************************************/
9122 +#ifndef __SM_BARRIER_DOT_H__
9123 +#define __SM_BARRIER_DOT_H__
9125 +#define SM_BARRIER_STARTDONE (0)
9126 +#define SM_BARRIER_STARTDONE_NEW (1)
9127 +#define SM_BARRIER_RECOVERY (2)
9128 +#define SM_BARRIER_RESET (3)
9130 +void init_barriers(void);
9131 +void process_barriers(void);
9132 +int sm_barrier(char *name, int count, int type);
9133 +void process_startdone_barrier(sm_group_t *sg, int status);
9134 +void process_startdone_barrier_new(sm_group_t *sg, int status);
9135 +void process_recovery_barrier(sm_group_t *sg, int status);
9138 diff -urN linux-orig/cluster/cman/sm_control.c linux-patched/cluster/cman/sm_control.c
9139 --- linux-orig/cluster/cman/sm_control.c 1970-01-01 07:30:00.000000000 +0730
9140 +++ linux-patched/cluster/cman/sm_control.c 2004-11-03 11:37:37.000000000 +0800
9142 +/******************************************************************************
9143 +*******************************************************************************
9145 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9146 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9148 +** This copyrighted material is made available to anyone wishing to use,
9149 +** modify, copy, or redistribute it subject to the terms and conditions
9150 +** of the GNU General Public License v.2.
9152 +*******************************************************************************
9153 +******************************************************************************/
9156 +#include "config.h"
9158 +struct socket * sm_socket;
9159 +uint32_t * sm_new_nodeids;
9160 +uint32_t sm_our_nodeid;
9161 +int sm_quorum, sm_quorum_next;
9162 +struct list_head sm_members;
9163 +int sm_member_count;
9168 + * Called by cnxman when it has a new member list.
9171 +void sm_member_update(int quorate)
9173 + sm_quorum_next = quorate;
9174 + wake_serviced(DO_START_RECOVERY);
9179 + * Called when module is loaded.
9185 + sm_new_nodeids = NULL;
9187 + sm_quorum_next = 0;
9188 + sm_our_nodeid = 0;
9189 + INIT_LIST_HEAD(&sm_members);
9190 + sm_member_count = 0;
9203 + * Called at beginning of cluster join procedure.
9206 +void sm_start(void)
9208 + struct sockaddr_cl saddr;
9209 + struct socket *sock;
9212 + /* Create a communication channel among service managers */
9214 + result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
9216 + log_print("can't create socket %d", result);
9222 + saddr.scl_family = AF_CLUSTER;
9223 + saddr.scl_port = CLUSTER_PORT_SERVICES;
9225 + result = sock->ops->bind(sock, (struct sockaddr *) &saddr,
9228 + log_print("can't bind socket %d", result);
9229 + goto fail_release;
9232 + result = kcl_register_read_callback(sm_socket, sm_cluster_message);
9234 + log_print("can't register read callback %d", result);
9235 + goto fail_release;
9238 + sm_new_nodeids = (uint32_t *) kmalloc(cman_config.max_nodes *
9243 + /* cnxman should call sm_member_update() once we've joined - then we
9244 + * can get our first list of members and our own nodeid */
9249 + sock_release(sm_socket);
9258 + * Called before cnxman leaves the cluster. If this returns an error to cman,
9259 + * cman should not leave the cluster but return EBUSY.
9260 + * If force is set we go away anyway. cman knows best in this case
9263 +int sm_stop(int force)
9265 + struct list_head *head;
9268 + int i, busy = FALSE, error = -EBUSY;
9270 + for (i = 0; i < SG_LEVELS; i++) {
9271 + if (!list_empty(&sm_sg[i])) {
9272 + sg = list_entry(sm_sg[i].next, sm_group_t, list);
9273 + log_error(sg, "sm_stop: SG still joined");
9278 + if (!busy || force) {
9282 + sock_release(sm_socket);
9284 + head = &sm_members;
9285 + while (!list_empty(head)) {
9286 + node = list_entry(head->next, sm_node_t, list);
9287 + list_del(&node->list);
9288 + sm_member_count--;
9292 + kfree(sm_new_nodeids);
9298 diff -urN linux-orig/cluster/cman/sm_control.h linux-patched/cluster/cman/sm_control.h
9299 --- linux-orig/cluster/cman/sm_control.h 1970-01-01 07:30:00.000000000 +0730
9300 +++ linux-patched/cluster/cman/sm_control.h 2004-11-03 11:37:37.000000000 +0800
9302 +/******************************************************************************
9303 +*******************************************************************************
9305 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9306 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9308 +** This copyrighted material is made available to anyone wishing to use,
9309 +** modify, copy, or redistribute it subject to the terms and conditions
9310 +** of the GNU General Public License v.2.
9312 +*******************************************************************************
9313 +******************************************************************************/
9315 +#ifndef __SM_CONTROL_DOT_H__
9316 +#define __SM_CONTROL_DOT_H__
9318 +void sm_init(void);
9319 +void sm_start(void);
9320 +int sm_stop(int force);
9321 +void sm_member_update(int quorate);
9324 diff -urN linux-orig/cluster/cman/sm_daemon.c linux-patched/cluster/cman/sm_daemon.c
9325 --- linux-orig/cluster/cman/sm_daemon.c 1970-01-01 07:30:00.000000000 +0730
9326 +++ linux-patched/cluster/cman/sm_daemon.c 2004-11-03 11:37:37.000000000 +0800
9328 +/******************************************************************************
9329 +*******************************************************************************
9331 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9332 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9334 +** This copyrighted material is made available to anyone wishing to use,
9335 +** modify, copy, or redistribute it subject to the terms and conditions
9336 +** of the GNU General Public License v.2.
9338 +*******************************************************************************
9339 +******************************************************************************/
9343 +static unsigned long daemon_flags;
9344 +static struct task_struct * daemon_task;
9345 +extern int sm_quorum;
9347 +void init_serviced(void)
9350 + daemon_task = NULL;
9353 +void wake_serviced(int do_flag)
9355 + set_bit(do_flag, &daemon_flags);
9356 + wake_up_process(daemon_task);
9359 +static inline int got_work(void)
9363 + rv = (test_bit(DO_START_RECOVERY, &daemon_flags) ||
9364 + test_bit(DO_MESSAGES, &daemon_flags) ||
9365 + test_bit(DO_BARRIERS, &daemon_flags) ||
9366 + test_bit(DO_CALLBACKS, &daemon_flags));
9368 + if (sm_quorum && !rv)
9369 + rv = (test_bit(DO_JOINLEAVE, &daemon_flags) ||
9370 + test_bit(DO_RECOVERIES, &daemon_flags) ||
9371 + test_bit(DO_MEMBERSHIP, &daemon_flags));
9375 +static int serviced(void *arg)
9377 + while (!kthread_should_stop()) {
9378 + if (test_and_clear_bit(DO_START_RECOVERY, &daemon_flags))
9379 + process_nodechange();
9381 + if (test_and_clear_bit(DO_MESSAGES, &daemon_flags))
9382 + process_messages();
9384 + if (test_and_clear_bit(DO_BARRIERS, &daemon_flags))
9385 + process_barriers();
9387 + if (test_and_clear_bit(DO_CALLBACKS, &daemon_flags))
9388 + process_callbacks();
9391 + if (test_and_clear_bit(DO_RECOVERIES, &daemon_flags))
9392 + process_recoveries();
9394 + if (test_and_clear_bit(DO_JOINLEAVE, &daemon_flags))
9395 + process_joinleave();
9397 + if (test_and_clear_bit(DO_MEMBERSHIP, &daemon_flags))
9398 + process_membership();
9401 + set_current_state(TASK_INTERRUPTIBLE);
9404 + set_current_state(TASK_RUNNING);
9410 +int start_serviced(void)
9412 + struct task_struct *p;
9414 + p = kthread_run(serviced, NULL, 0, "cman_serviced");
9416 + printk("can't start cman_serviced daemon");
9417 + return (IS_ERR(p));
9424 +void stop_serviced(void)
9426 + kthread_stop(daemon_task);
9428 diff -urN linux-orig/cluster/cman/sm_daemon.h linux-patched/cluster/cman/sm_daemon.h
9429 --- linux-orig/cluster/cman/sm_daemon.h 1970-01-01 07:30:00.000000000 +0730
9430 +++ linux-patched/cluster/cman/sm_daemon.h 2004-11-03 11:37:37.000000000 +0800
9432 +/******************************************************************************
9433 +*******************************************************************************
9435 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9436 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9438 +** This copyrighted material is made available to anyone wishing to use,
9439 +** modify, copy, or redistribute it subject to the terms and conditions
9440 +** of the GNU General Public License v.2.
9442 +*******************************************************************************
9443 +******************************************************************************/
9445 +#ifndef __SM_DAEMON_DOT_H__
9446 +#define __SM_DAEMON_DOT_H__
9449 +#define DO_START_RECOVERY (1)
9450 +#define DO_MESSAGES (2)
9451 +#define DO_BARRIERS (3)
9452 +#define DO_CALLBACKS (4)
9453 +#define DO_JOINLEAVE (5)
9454 +#define DO_RECOVERIES (6)
9455 +#define DO_MEMBERSHIP (7)
9456 +#define DO_RESET (8)
9458 +void init_serviced(void);
9459 +void wake_serviced(int do_flag);
9460 +void stop_serviced(void);
9461 +int start_serviced(void);
9464 diff -urN linux-orig/cluster/cman/sm_internal.h linux-patched/cluster/cman/sm_internal.h
9465 --- linux-orig/cluster/cman/sm_internal.h 1970-01-01 07:30:00.000000000 +0730
9466 +++ linux-patched/cluster/cman/sm_internal.h 2004-11-03 11:37:37.000000000 +0800
9468 +/******************************************************************************
9469 +*******************************************************************************
9471 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9472 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9474 +** This copyrighted material is made available to anyone wishing to use,
9475 +** modify, copy, or redistribute it subject to the terms and conditions
9476 +** of the GNU General Public License v.2.
9478 +*******************************************************************************
9479 +******************************************************************************/
9481 +#ifndef __SM_INTERNAL_DOT_H__
9482 +#define __SM_INTERNAL_DOT_H__
9485 + * Any header files needed by this file should be included before it in sm.h.
9486 + * This file should only be included by sm.h.
9495 +typedef struct sm_group sm_group_t;
9496 +typedef struct sm_sevent sm_sevent_t;
9497 +typedef struct sm_uevent sm_uevent_t;
9498 +typedef struct sm_node sm_node_t;
9499 +typedef struct sm_msg sm_msg_t;
9503 + * Number of seconds to wait before trying again to join or leave an SG
9505 +#define RETRY_DELAY (2)
9509 + * Service Event - what a node uses to join or leave an sg
9513 +#define SEFL_CHECK (0)
9514 +#define SEFL_ALLOW_JOIN (1)
9515 +#define SEFL_ALLOW_JSTOP (2)
9516 +#define SEFL_ALLOW_LEAVE (3)
9517 +#define SEFL_ALLOW_LSTOP (4)
9518 +#define SEFL_ALLOW_STARTDONE (5)
9519 +#define SEFL_ALLOW_BARRIER (6)
9520 +#define SEFL_DELAY (7)
9521 +#define SEFL_DELAY_RECOVERY (8)
9522 +#define SEFL_LEAVE (9)
9523 +#define SEFL_CANCEL (10)
9526 +#define SEST_JOIN_BEGIN (1)
9527 +#define SEST_JOIN_ACKWAIT (2)
9528 +#define SEST_JOIN_ACKED (3)
9529 +#define SEST_JSTOP_ACKWAIT (4)
9530 +#define SEST_JSTOP_ACKED (5)
9531 +#define SEST_JSTART_SERVICEWAIT (6)
9532 +#define SEST_JSTART_SERVICEDONE (7)
9533 +#define SEST_BARRIER_WAIT (8)
9534 +#define SEST_BARRIER_DONE (9)
9535 +#define SEST_LEAVE_BEGIN (10)
9536 +#define SEST_LEAVE_ACKWAIT (11)
9537 +#define SEST_LEAVE_ACKED (12)
9538 +#define SEST_LSTOP_ACKWAIT (13)
9539 +#define SEST_LSTOP_ACKED (14)
9540 +#define SEST_LSTART_WAITREMOTE (15)
9541 +#define SEST_LSTART_REMOTEDONE (16)
9544 + struct list_head se_list;
9545 + unsigned int se_id;
9546 + sm_group_t * se_sg;
9547 + unsigned long se_flags;
9548 + unsigned int se_state;
9550 + int se_node_count;
9551 + int se_memb_count;
9552 + int se_reply_count;
9554 + uint32_t * se_node_ids;
9555 + char * se_node_status;
9556 + int se_len_ids; /* length of node_ids */
9557 + int se_len_status; /* length of node_status */
9559 + int se_barrier_status;
9560 + struct timer_list se_restart_timer;
9564 + * Update Event - what an sg member uses to respond to an sevent
9568 +#define UEFL_ALLOW_STARTDONE (0)
9569 +#define UEFL_ALLOW_BARRIER (1)
9570 +#define UEFL_CANCEL (2)
9571 +#define UEFL_LEAVE (3)
9572 +#define UEFL_CHECK (4)
9575 +#define UEST_JSTOP (1)
9576 +#define UEST_JSTART_WAITCMD (2)
9577 +#define UEST_JSTART (3)
9578 +#define UEST_JSTART_SERVICEWAIT (4)
9579 +#define UEST_JSTART_SERVICEDONE (5)
9580 +#define UEST_BARRIER_WAIT (6)
9581 +#define UEST_BARRIER_DONE (7)
9582 +#define UEST_LSTOP (8)
9583 +#define UEST_LSTART_WAITCMD (9)
9584 +#define UEST_LSTART (10)
9585 +#define UEST_LSTART_SERVICEWAIT (11)
9586 +#define UEST_LSTART_SERVICEDONE (12)
9589 + unsigned int ue_state;
9590 + unsigned long ue_flags;
9592 + uint32_t ue_nodeid;
9594 + int ue_barrier_status;
9595 + uint16_t ue_remote_seid;
9602 +#define RECOVER_NONE (0)
9603 +#define RECOVER_STOP (1)
9604 +#define RECOVER_START (2)
9605 +#define RECOVER_STARTDONE (3)
9606 +#define RECOVER_BARRIERWAIT (4)
9607 +#define RECOVER_BARRIERDONE (5)
9610 +#define SGFL_SEVENT (1)
9611 +#define SGFL_UEVENT (2)
9612 +#define SGFL_NEED_RECOVERY (3)
9615 +#define SGST_NONE (0)
9616 +#define SGST_JOIN (1)
9617 +#define SGST_RUN (2)
9618 +#define SGST_RECOVER (3)
9619 +#define SGST_UEVENT (4)
9622 + struct list_head list; /* list of sg's */
9624 + uint32_t local_id;
9625 + uint32_t global_id;
9626 + unsigned long flags;
9628 + int refcount; /* references from reg/unreg */
9629 + void * service_data; /* data from the service */
9630 + struct kcl_service_ops *ops; /* ops from the service */
9631 + struct completion event_comp;
9633 + struct list_head memb; /* Membership List for RC */
9634 + int memb_count; /* number of nodes in memb */
9635 + struct list_head joining; /* nodes joining the sg */
9636 + sm_sevent_t * sevent;
9637 + sm_uevent_t uevent;
9639 + int recover_state;
9641 + struct list_head recover_list; /* recovery event list */
9642 + void * recover_data;
9643 + char recover_barrier[MAX_BARRIER_NAME_LEN];
9646 + char name[1]; /* must be last field */
9654 +#define SMSG_JOIN_REQ (1)
9655 +#define SMSG_JOIN_REP (2)
9656 +#define SMSG_JSTOP_REQ (3)
9657 +#define SMSG_JSTOP_REP (4)
9658 +#define SMSG_JSTART_CMD (5)
9659 +#define SMSG_LEAVE_REQ (6)
9660 +#define SMSG_LEAVE_REP (7)
9661 +#define SMSG_LSTOP_REQ (8)
9662 +#define SMSG_LSTOP_REP (9)
9663 +#define SMSG_LSTART_CMD (10)
9664 +#define SMSG_LSTART_DONE (11)
9665 +#define SMSG_RECOVER (12)
9668 +#define STATUS_POS (1)
9669 +#define STATUS_NEG (2)
9670 +#define STATUS_WAIT (3)
9674 + uint8_t ms_status;
9675 + uint16_t ms_sevent_id;
9676 + uint32_t ms_global_sgid;
9677 + uint32_t ms_global_lastid;
9678 + uint16_t ms_sglevel;
9679 + uint16_t ms_length;
9680 + /* buf of ms_length bytes follows */
9687 +#define SNFL_NEED_RECOVERY (0)
9688 +#define SNFL_CLUSTER_MEMBER (1)
9689 +#define SNFL_LEAVING (2)
9692 + struct list_head list;
9693 + uint32_t id; /* node id from cnxman */
9694 + unsigned long flags;
9695 + int incarnation; /* node incarnation number */
9698 +#endif /* __SM_INTERNAL_DOT_H__ */
9699 diff -urN linux-orig/cluster/cman/sm_joinleave.c linux-patched/cluster/cman/sm_joinleave.c
9700 --- linux-orig/cluster/cman/sm_joinleave.c 1970-01-01 07:30:00.000000000 +0730
9701 +++ linux-patched/cluster/cman/sm_joinleave.c 2004-11-03 11:37:37.000000000 +0800
9703 +/******************************************************************************
9704 +*******************************************************************************
9706 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9707 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9709 +** This copyrighted material is made available to anyone wishing to use,
9710 +** modify, copy, or redistribute it subject to the terms and conditions
9711 +** of the GNU General Public License v.2.
9713 +*******************************************************************************
9714 +******************************************************************************/
9719 + * Routines used by nodes that are joining or leaving a SG. These "sevent"
9720 + * routines initiate membership changes to a SG. Existing SG members respond
9721 + * using the "uevent" membership update routines.
9724 +extern uint32_t sm_our_nodeid;
9725 +extern struct list_head sm_members;
9726 +static struct list_head new_event;
9727 +static spinlock_t new_event_lock;
9728 +static struct list_head joinleave_events;
9730 +void init_joinleave(void)
9732 + INIT_LIST_HEAD(&new_event);
9733 + spin_lock_init(&new_event_lock);
9734 + INIT_LIST_HEAD(&joinleave_events);
9737 +void new_joinleave(sm_sevent_t *sev)
9739 + spin_lock(&new_event_lock);
9740 + list_add_tail(&sev->se_list, &new_event);
9741 + spin_unlock(&new_event_lock);
9742 + wake_serviced(DO_JOINLEAVE);
9745 +sm_sevent_t *find_sevent(unsigned int id)
9749 + list_for_each_entry(sev, &joinleave_events, se_list) {
9750 + if (sev->se_id == id)
9756 +static void release_sevent(sm_sevent_t *sev)
9758 + if (sev->se_len_ids) {
9759 + kfree(sev->se_node_ids);
9760 + sev->se_node_ids = NULL;
9763 + if (sev->se_len_status) {
9764 + kfree(sev->se_node_status);
9765 + sev->se_node_status = NULL;
9768 + sev->se_node_count = 0;
9769 + sev->se_memb_count = 0;
9770 + sev->se_reply_count = 0;
9773 +static int init_sevent(sm_sevent_t *sev)
9776 + int len1, len2, count, cluster_members = 0;
9778 + /* clear state from any previous attempt */
9779 + release_sevent(sev);
9781 + list_for_each_entry(node, &sm_members, list) {
9782 + if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
9783 + cluster_members++;
9786 + sev->se_node_count = cluster_members;
9787 + sev->se_memb_count = sev->se_sg->memb_count;
9790 + * When joining, we need a node array the size of the entire cluster
9791 + * member list because we get responses from all nodes. When leaving,
9792 + * we only get responses from SG members, so the node array need only
9796 + if (sev->se_state < SEST_LEAVE_BEGIN)
9797 + count = sev->se_node_count;
9799 + count = sev->se_memb_count;
9801 + len1 = count * sizeof(uint32_t);
9802 + sev->se_len_ids = len1;
9804 + sev->se_node_ids = (uint32_t *) kmalloc(len1, GFP_KERNEL);
9805 + if (!sev->se_node_ids)
9808 + len2 = count * sizeof (char);
9809 + sev->se_len_status = len2;
9811 + sev->se_node_status = (char *) kmalloc(len2, GFP_KERNEL);
9812 + if (!sev->se_node_status)
9815 + memset(sev->se_node_status, 0, len2);
9816 + memset(sev->se_node_ids, 0, len1);
9821 + kfree(sev->se_node_ids);
9822 + sev->se_node_ids = NULL;
9823 + sev->se_len_ids = 0;
9829 +/* Context: timer */
9831 +static void sev_restart(unsigned long data)
9833 + sm_sevent_t *sev = (sm_sevent_t *) data;
9835 + clear_bit(SEFL_DELAY, &sev->se_flags);
9836 + set_bit(SEFL_CHECK, &sev->se_flags);
9837 + wake_serviced(DO_JOINLEAVE);
9840 +static void schedule_sev_restart(sm_sevent_t *sev)
9842 + init_timer(&sev->se_restart_timer);
9843 + sev->se_restart_timer.function = sev_restart;
9844 + sev->se_restart_timer.data = (long) sev;
9845 + mod_timer(&sev->se_restart_timer, jiffies + (RETRY_DELAY * HZ));
9848 +void free_sg_memb(sm_group_t *sg)
9852 + while (!list_empty(&sg->memb)) {
9853 + node = list_entry(sg->memb.next, sm_node_t, list);
9854 + list_del(&node->list);
9857 + sg->memb_count = 0;
9861 + * 1. First step in joining a SG - send a message to all nodes in the cluster
9862 + * asking to join the named SG. If any nodes are members they will reply with
9863 + * a POS, or a WAIT (wait means try again, only one node can join at a time).
9864 + * If no one knows about this SG, they all send NEG replies which means we form
9865 + * the SG with just ourself as a member.
9868 +static int send_join_notice(sm_sevent_t *sev)
9870 + sm_group_t *sg = sev->se_sg;
9873 + int i = 0, error, namelen, len = 0;
9876 + * Create node array from member list in which to collect responses.
9879 + error = init_sevent(sev);
9883 + list_for_each_entry(node, &sm_members, list) {
9884 + if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
9885 + sev->se_node_ids[i++] = node->id;
9889 + * Create and send a join request message.
9891 + * Other nodes then run process_join_request and reply to us; we
9892 + * collect the responses in process_reply and check them in
9893 + * check_join_notice.
9896 + namelen = sg->namelen;
9897 + msg = create_smsg(sg, SMSG_JOIN_REQ, namelen, &len, sev);
9898 + memcpy(msg + sizeof(sm_msg_t), sg->name, namelen);
9900 + error = send_broadcast_message_sev(msg, len, sev);
9907 + * 2. Second step in joining a SG - after we collect all replies to our join
9908 + * request, we look at them. If anyone told us to wait, we'll wait a while, go
9909 + * back and start at step 1 again.
9912 +static int check_join_notice(sm_sevent_t *sev)
9914 + int pos = 0, wait = 0, neg = 0, restart = 0, i, error = 0;
9916 + for (i = 0; i < sev->se_node_count; i++) {
9917 + switch (sev->se_node_status[i]) {
9919 + /* this node is in the SG and will be in new proposed
9925 + /* this node is in the SG but something else is
9926 + * happening with it at the moment. */
9931 + /* this node has no record of the SG we're interested
9935 + if (sev->se_node_ids[i] == sm_our_nodeid)
9936 + sev->se_node_status[i] = STATUS_POS;
9940 + /* we didn't get a valid response from this node,
9941 + * restart the entire sev. */
9947 + if (pos && !wait && !restart) {
9948 + /* all current members of this sg pos'ed our entry */
9949 + } else if (!pos && !wait && !restart && neg) {
9950 + /* we're the first in the cluster to join this sg */
9951 + sev->se_sg->global_id = sm_new_global_id(sev->se_sg->level);
9959 + * 3. Third step in joining the SG - tell the nodes that are already members
9960 + * to "stop" the service. We stop them so that everyone can restart with the
9961 + * new member (us!) added.
9964 +static int send_join_stop(sm_sevent_t *sev)
9966 + sm_group_t *sg = sev->se_sg;
9969 + uint32_t be_count;
9970 + int i, len = 0, error = 0;
9973 + * Form the SG memb list with us in it.
9976 + for (i = 0; i < sev->se_node_count; i++) {
9977 + if (sev->se_node_status[i] != STATUS_POS)
9980 + node = sm_new_node(sev->se_node_ids[i]);
9984 + list_add_tail(&node->list, &sg->memb);
9989 + * Re-init the node vector in which to collect responses again.
9992 + sev->se_memb_count = sg->memb_count;
9994 + memset(sev->se_node_status, 0, sev->se_len_status);
9995 + memset(sev->se_node_ids, 0, sev->se_len_ids);
9998 + list_for_each_entry(node, &sg->memb, list)
9999 + sev->se_node_ids[i++] = node->id;
10002 + * Create and send a stop message.
10004 + * Other nodes then run process_stop_request and process_join_stop and
10005 + * reply to us. They stop the sg we're trying to join if they agree.
10006 + * We collect responses in process_reply and check them in
10007 + * check_join_stop.
10010 + msg = create_smsg(sg, SMSG_JSTOP_REQ, sizeof(uint32_t), &len, sev);
10011 + be_count = cpu_to_be32(sg->memb_count);
10012 + memcpy(msg + sizeof(sm_msg_t), &be_count, sizeof(uint32_t));
10014 + error = send_members_message_sev(sg, msg, len, sev);
10021 + free_sg_memb(sg);
10026 + * 4. Fourth step in joining the SG - after we collect replies to our stop
10027 + * request, we look at them. Everyone sending POS agrees with us joining and
10028 + * has stopped their SG. If some nodes sent NEG, something is wrong and we
10029 + * don't have a good way to address that yet since some nodes may have sent
10032 + * FIXME: even nodes replying with NEG should stop their SG so we can send an
10033 + * abort and have everyone at the same place to start from again.
10036 +static int check_join_stop(sm_sevent_t *sev)
10038 + sm_group_t *sg = sev->se_sg;
10039 + int i, pos = 0, neg = 0;
10041 + for (i = 0; i < sev->se_memb_count; i++) {
10042 + switch (sev->se_node_status[i]) {
10048 + log_error(sg, "check_join_stop: neg from nodeid %u "
10049 + "(%d, %d, %u)", sev->se_node_ids[i],
10050 + pos, neg, sev->se_memb_count);
10055 + log_error(sg, "check_join_stop: unknown status=%u "
10056 + "nodeid=%u", sev->se_node_status[i],
10057 + sev->se_node_ids[i]);
10063 + if (pos == sg->memb_count)
10066 + free_sg_memb(sg);
10071 + * 5. Fifth step in joining the SG - everyone has stopped their service and we
10072 + * all now start the service with us, the new member, added to the SG member
10073 + * list. We send start to our own service here and send a message to the other
10074 + * members that they should also start their service.
10077 +static int send_join_start(sm_sevent_t *sev)
10079 + sm_group_t *sg = sev->se_sg;
10083 + int error, count = 0, len = 0;
10086 + * Create a start message and send it.
10089 + msg = create_smsg(sg, SMSG_JSTART_CMD, 0, &len, sev);
10091 + error = send_members_message(sg, msg, len);
10096 + * Start the service ourself. The chunk of memory with the member ids
10097 + * must be freed by the service when it is done with it.
10100 + SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
10103 + list_for_each_entry(node, &sg->memb, list)
10104 + memb[count++] = node->id;
10106 + set_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
10108 + sg->ops->start(sg->service_data, memb, count, sev->se_id,
10109 + SERVICE_NODE_JOIN);
10113 + free_sg_memb(sg);
10118 + * 6. Sixth step in joining the SG - once the service has completed its start,
10119 + * it does a kcl_start_done() to signal us that it's done. That gets us here
10120 + * and we do a barrier with all other members which join the barrier when their
10121 + * service is done starting.
10124 +static int startdone_barrier_new(sm_sevent_t *sev)
10126 + sm_group_t *sg = sev->se_sg;
10127 + char bname[MAX_BARRIER_NAME_LEN];
10130 + memset(bname, 0, MAX_BARRIER_NAME_LEN);
10131 + sev->se_barrier_status = -1;
10133 + set_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
10135 + /* If we're the only member, skip the barrier */
10136 + if (sg->memb_count == 1) {
10137 + process_startdone_barrier_new(sg, 0);
10141 + snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
10142 + sg->global_id, sm_our_nodeid, sev->se_id, sg->memb_count);
10144 + error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE_NEW);
10151 + clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
10152 + sg->ops->stop(sg->service_data);
10153 + free_sg_memb(sg);
10158 + * 7. Seventh step in joining the SG - check that the barrier we joined with
10159 + * all other members returned with a successful status.
10162 +static int check_startdone_barrier_new(sm_sevent_t *sev)
10164 + sm_group_t *sg = sev->se_sg;
10165 + int error = sev->se_barrier_status;
10168 + sg->ops->stop(sg->service_data);
10169 + free_sg_memb(sg);
10175 + * 8. Eigth step in joining the SG - send the service a "finish" indicating
10176 + * that all members have successfully started the service.
10179 +static void do_finish_new(sm_sevent_t *sev)
10181 + sm_group_t *sg = sev->se_sg;
10183 + sg->state = SGST_RUN;
10184 + sg->sevent = NULL;
10185 + clear_bit(SGFL_SEVENT, &sg->flags);
10187 + sg->ops->finish(sg->service_data, sev->se_id);
10191 + * 9. Ninth step in joining the SG - it's done so get rid of the sevent stuff
10192 + * and tell the process which initiated the join that it's done.
10195 +static void sevent_done(sm_sevent_t *sev)
10197 + sm_group_t *sg = sev->se_sg;
10199 + list_del(&sev->se_list);
10200 + release_sevent(sev);
10202 + complete(&sg->event_comp);
10206 + * Move through the steps of a join. Summary:
10208 + * 1. Send a join notice to all cluster members.
10209 + * 2. Collect and check replies to the join notice.
10210 + * 3. Send a stop message to all SG members.
10211 + * 4. Collect and check replies to the stop message.
10212 + * 5. Send a start message to all SG members and start service ourself.
10213 + * 6. Use barrier to wait for all nodes to complete the start.
10214 + * 7. Check that all SG members joined the barrier.
10215 + * 8. Send finish to the service indicating that all nodes started it.
10216 + * 9. Clean up sevent and signal completion to the process that started the join
10219 +static void process_join_sevent(sm_sevent_t *sev)
10224 + * We may cancel the current join attempt if another node is also
10225 + * attempting to join or leave. (Only a single node can join or leave
10226 + * at once.) If cancelled, 0ur join attempt will be restarted later.
10229 + if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
10234 + log_debug(sev->se_sg, "sevent state %u", sev->se_state);
10236 + switch (sev->se_state) {
10239 + * An sevent is created in kcl_join_service with a state of
10243 + case SEST_JOIN_BEGIN:
10244 + sev->se_state = SEST_JOIN_ACKWAIT;
10245 + error = send_join_notice(sev);
10249 + * se_state is changed from JOIN_ACKWAIT to JOIN_ACKED in
10250 + * process_reply (when all the replies have been received)
10253 + case SEST_JOIN_ACKED:
10254 + error = check_join_notice(sev);
10258 + sev->se_state = SEST_JSTOP_ACKWAIT;
10259 + error = send_join_stop(sev);
10263 + * se_state is changed from JSTOP_ACKWAIT to JSTOP_ACKED in
10264 + * proces_reply (when all the replies have been received)
10267 + case SEST_JSTOP_ACKED:
10268 + error = check_join_stop(sev);
10272 + sev->se_state = SEST_JSTART_SERVICEWAIT;
10273 + error = send_join_start(sev);
10277 + * se_state is changed from JSTART_SERVICEWAIT to
10278 + * JSTART_SERVICEDONE in kcl_start_done
10281 + case SEST_JSTART_SERVICEDONE:
10282 + sev->se_state = SEST_BARRIER_WAIT;
10283 + error = startdone_barrier_new(sev);
10287 + * se_state is changed from BARRIER_WAIT to BARRIER_DONE in
10288 + * process_startdone_barrier_new
10291 + case SEST_BARRIER_DONE:
10292 + error = check_startdone_barrier_new(sev);
10296 + do_finish_new(sev);
10297 + sevent_done(sev);
10301 + log_error(sev->se_sg, "no join processing for state %u",
10307 + /* restart the sevent from the beginning */
10308 + log_debug(sev->se_sg, "process_join error %d %lx", error,
10310 + sev->se_state = SEST_JOIN_BEGIN;
10311 + sev->se_sg->global_id = 0;
10312 + set_bit(SEFL_DELAY, &sev->se_flags);
10313 + schedule_sev_restart(sev);
10318 + * 1. First step in leaving an SG - send a message to other SG members asking
10319 + * to leave the SG. Nodes that don't have another active sevent or uevent for
10320 + * this SG will return POS.
10323 +static int send_leave_notice(sm_sevent_t *sev)
10325 + sm_group_t *sg = sev->se_sg;
10328 + int i = 0, error = -1, len = 0;
10331 + * Create a node array from member list in which to collect responses.
10334 + error = init_sevent(sev);
10338 + list_for_each_entry(node, &sg->memb, list)
10339 + sev->se_node_ids[i++] = node->id;
10342 + * Create and send a leave request message.
10345 + msg = create_smsg(sg, SMSG_LEAVE_REQ, 0, &len, sev);
10347 + error = send_members_message_sev(sg, msg, len, sev);
10354 + * 2. Second step in leaving an SG - after we collect all replies to our leave
10355 + * request, we look at them. If anyone replied with WAIT, we abort our attempt
10356 + * at leaving and try again in a bit.
10359 +static int check_leave_notice(sm_sevent_t *sev)
10361 + int pos = 0, wait = 0, neg = 0, restart = 0, i;
10363 + for (i = 0; i < sev->se_memb_count; i++) {
10364 + switch (sev->se_node_status[i]) {
10369 + case STATUS_WAIT:
10378 + /* we didn't get a valid response from this node,
10379 + * restart the entire sev. */
10385 + /* all members approve */
10386 + if (pos && !wait && !restart)
10393 + * 3. Third step in leaving the SG - tell the member nodes to "stop" the SG.
10394 + * They must be stopped in order to restart without us as a member.
10397 +static int send_leave_stop(sm_sevent_t *sev)
10399 + sm_group_t *sg = sev->se_sg;
10401 + int error, len = 0;
10404 + * Re-init the status vector in which to collect responses.
10407 + memset(sev->se_node_status, 0, sev->se_len_status);
10410 + * Create and send a stop message.
10413 + msg = create_smsg(sg, SMSG_LSTOP_REQ, 0, &len, sev);
10415 + error = send_members_message_sev(sg, msg, len, sev);
10420 + * we and all others stop the SG now
10423 + sg->ops->stop(sg->service_data);
10430 + * 4. Fourth step in leaving the SG - check the replies to our stop request.
10431 + * Same problem with getting different replies as check_join_stop.
10434 +static int check_leave_stop(sm_sevent_t *sev)
10436 + sm_group_t *sg = sev->se_sg;
10437 + int i, pos = 0, neg = 0;
10439 + for (i = 0; i < sev->se_memb_count; i++) {
10440 + switch (sev->se_node_status[i]) {
10446 + log_error(sg, "check_leave_stop: fail from nodeid %u "
10447 + "(%d, %d, %u)", sev->se_node_ids[i],
10448 + pos, neg, sev->se_memb_count);
10453 + log_error(sg, "check_leave_stop: status %u nodeid %u",
10454 + sev->se_node_status[i], sev->se_node_ids[i]);
10460 + if (pos == sg->memb_count)
10467 + * 5. Fifth step in leaving the SG - tell the other SG members to restart the
10468 + * service without us. We, of course, don't start our own stopped service. If
10469 + * we're the last SG member and leaving, we jump right to the next step.
10472 +static int send_leave_start(sm_sevent_t *sev)
10474 + sm_group_t *sg = sev->se_sg;
10476 + int error = 0, len = 0;
10478 + if (sg->memb_count == 1) {
10479 + sev->se_state = SEST_LSTART_REMOTEDONE;
10480 + set_bit(SEFL_CHECK, &sev->se_flags);
10481 + wake_serviced(DO_JOINLEAVE);
10483 + msg = create_smsg(sg, SMSG_LSTART_CMD, 0, &len, sev);
10484 + error = send_members_message(sg, msg, len);
10490 + * Move through the steps of a leave. Summary:
10492 + * 1. Send a leave notice to all SG members.
10493 + * 2. Collect and check replies to the leave notice.
10494 + * 3. Send a stop message to all SG members and stop our own SG.
10495 + * 4. Collect and check replies to the stop message.
10496 + * 5. Send a start message to SG members.
10497 + * 6. Clean up sevent and signal completion to the process that
10498 + * started the leave.
10501 +static void process_leave_sevent(sm_sevent_t *sev)
10506 + * We may cancel the current leave attempt if another node is also
10507 + * attempting to join or leave. (Only a single node can join or leave
10508 + * at once.) Our leave attempt will be restarted after being
10512 + if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
10517 + if (test_bit(SGFL_UEVENT, &sev->se_sg->flags)) {
10522 + if (!list_empty(&sev->se_sg->joining)) {
10527 + log_debug(sev->se_sg, "sevent state %u", sev->se_state);
10529 + switch (sev->se_state) {
10532 + * An sevent is created in kcl_leave_service with a state of
10536 + case SEST_LEAVE_BEGIN:
10537 + sev->se_state = SEST_LEAVE_ACKWAIT;
10538 + error = send_leave_notice(sev);
10542 + * se_state is changed from LEAVE_ACKWAIT to LEAVE_ACKED in
10543 + * process_reply (when all the replies have been received)
10546 + case SEST_LEAVE_ACKED:
10547 + error = check_leave_notice(sev);
10551 + sev->se_state = SEST_LSTOP_ACKWAIT;
10552 + error = send_leave_stop(sev);
10556 + * se_state is changed from LSTOP_ACKWAIT to LSTOP_ACKED in
10560 + case SEST_LSTOP_ACKED:
10561 + error = check_leave_stop(sev);
10565 + sev->se_state = SEST_LSTART_WAITREMOTE;
10566 + error = send_leave_start(sev);
10570 + * se_state is changed from LSTART_WAITREMOTE to
10571 + * LSTART_REMOTEDONE in process_leave_done
10574 + case SEST_LSTART_REMOTEDONE:
10575 + sevent_done(sev);
10579 + log_error(sev->se_sg, "process_leave_sevent state=%u",
10585 + log_debug(sev->se_sg, "process_leave error %d %lx", error,
10587 + /* restart the sevent from the beginning */
10588 + sev->se_state = SEST_LEAVE_BEGIN;
10589 + set_bit(SEFL_DELAY, &sev->se_flags);
10590 + schedule_sev_restart(sev);
10595 + * Sevent backout code. Take appropriate steps when a recovery occurs while
10596 + * we're in the midst of an sevent. The recovery may or may not affect the
10597 + * sevent. If it does, it usually means cancelling the sevent and restarting
10598 + * it from the beginning once the recovery processing is done.
10602 + * If any of the nodes that replied with OK is dead, we give up on the current
10603 + * join attempt and restart. Otherwise, this sevent can continue.
10606 +static int backout_join_acked(sm_sevent_t *sev)
10611 + for (i = 0; i < sev->se_node_count; i++) {
10612 + if (sev->se_node_status[i] != STATUS_POS)
10615 + list_for_each_entry(node, &sm_members, list) {
10616 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags) &&
10617 + (node->id == sev->se_node_ids[i]))
10625 + * In this state our sg member list exists and mark_affected_sgs() will have
10626 + * set NEED_RECOVERY if any of the nodes in the sg we're joining is dead. We
10627 + * restart the join process if this is the case, otherwise this sevent can
10631 +static int backout_jstop_ackwait(sm_sevent_t *sev)
10633 + sm_group_t *sg = sev->se_sg;
10635 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10638 + clear_bit(SEFL_ALLOW_JSTOP, &sev->se_flags);
10639 + free_sg_memb(sg);
10644 + * Same as previous.
10647 +static int backout_jstop_acked(sm_sevent_t *sev)
10649 + return backout_jstop_ackwait(sev);
10653 + * If NEED_RECOVERY is set a member of the sg we're joining died while we were
10654 + * starting our service. The recovery process will restart the service on all
10655 + * the prior sg members (not including those that died or us). We will
10656 + * reattempt our join which should be accepted once the nodes are done with
10660 +static int backout_jstart_servicewait(sm_sevent_t *sev)
10662 + sm_group_t *sg = sev->se_sg;
10664 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10667 + clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
10668 + sg->ops->stop(sg->service_data);
10669 + free_sg_memb(sg);
10674 + * Same as previous.
10677 +static int backout_jstart_servicedone(sm_sevent_t *sev)
10679 + return backout_jstart_servicewait(sev);
10683 + * If NEED_RECOVERY is set a member of the sg we're joining died while we were
10684 + * waiting on the "all done" barrier. Stop our service that we just started
10685 + * and cancel the barrier. The recovery process will restart the service on
10686 + * all the prior sg members (not including those that died or us). We will
10687 + * reattempt our join which should be accepted once the nodes are done with
10691 +static int backout_barrier_wait(sm_sevent_t *sev)
10693 + sm_group_t *sg = sev->se_sg;
10694 + char bname[MAX_BARRIER_NAME_LEN];
10696 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10699 + clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
10701 + sg->ops->stop(sg->service_data);
10703 + memset(bname, 0, MAX_BARRIER_NAME_LEN);
10704 + snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
10705 + sg->global_id, sm_our_nodeid, sev->se_id,
10707 + kcl_barrier_cancel(bname);
10709 + free_sg_memb(sg);
10714 + * If NEED_RECOVERY is set, a member of the sg we just joined has failed. The
10715 + * recovery began after the barrier callback. If the result in the callback is
10716 + * "success" then we are joined, this sevent is finished and we'll process the
10717 + * sg within the forthcoming recovery with the other members.
10719 + * We rely upon cnxman to guarantee that once all nodes have joined a barrier,
10720 + * all nodes will receive the corresponding barrier callback *before any*
10721 + * receive an sm_member_update() due to one of those nodes failing just after
10722 + * joining the barrier. If some nodes receive the sm_member_update() before
10723 + * the barrier callback and others receive the barrier callback before the
10724 + * sm_member_update() then they will disagree as to whether the node joining/
10725 + * leaving is in/out of the sg.
10728 +static int backout_barrier_done(sm_sevent_t *sev)
10730 + sm_group_t *sg = sev->se_sg;
10732 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10735 + if (!sev->se_barrier_status) {
10736 + do_finish_new(sev);
10737 + sevent_done(sev);
10740 + sg->ops->stop(sg->service_data);
10741 + free_sg_memb(sg);
10747 + * We've done nothing yet, just restart when recovery is done (if sg is flagged
10748 + * with recovery.)
10751 +static int backout_leave_begin(sm_sevent_t *sev)
10753 + sm_group_t *sg = sev->se_sg;
10755 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10762 + * Ignore any replies to our leave notice and restart when recovery is done (if
10763 + * sg is flagged with recovery.)
10766 +static int backout_leave_ackwait(sm_sevent_t *sev)
10768 + sm_group_t *sg = sev->se_sg;
10770 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10773 + clear_bit(SEFL_ALLOW_LEAVE, &sev->se_flags);
10779 + * Same as previous.
10782 +static int backout_leave_acked(sm_sevent_t *sev)
10784 + return backout_leave_ackwait(sev);
10788 + * Ignore any stop replies. All the members will be stopped anyway to do the
10789 + * recovery. Let that happen and restart our leave when done.
10792 +static int backout_lstop_ackwait(sm_sevent_t *sev)
10794 + sm_group_t *sg = sev->se_sg;
10796 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10799 + clear_bit(SEFL_ALLOW_LSTOP, &sev->se_flags);
10805 + * Same as previous.
10808 +static int backout_lstop_acked(sm_sevent_t *sev)
10810 + return backout_lstop_ackwait(sev);
10814 + * All members will be stopped due to recovery and restarted by recovery
10815 + * processing. That includes us, we have to retry the leave once the recovery
10819 +static int backout_lstart_waitremote(sm_sevent_t *sev)
10821 + sm_group_t *sg = sev->se_sg;
10823 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10830 + * Reset an sevent to its beginning so it can be restarted. This is necessary
10831 + * when recovery affects an SG while we're trying to join or leave (ie. a node
10832 + * in the SG fails).
10835 +void backout_sevents(void)
10837 + sm_sevent_t *sev, *safe;
10840 + list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
10844 + log_debug(sev->se_sg, "backout sevent state %u", sev->se_state);
10846 + switch (sev->se_state) {
10848 + /* backout after kcl_join_service and before
10849 + * send_join_notice */
10850 + case SEST_JOIN_BEGIN:
10853 + /* backout after send_join_notice and before final
10854 + * process_reply */
10855 + case SEST_JOIN_ACKWAIT:
10856 + clear_bit(SEFL_ALLOW_JOIN, &sev->se_flags);
10857 + sev->se_state = SEST_JOIN_BEGIN;
10858 + set_bit(SEFL_CHECK, &sev->se_flags);
10859 + wake_serviced(DO_JOINLEAVE);
10862 + /* backout after final process_reply and before
10863 + * check_join_notice */
10864 + case SEST_JOIN_ACKED:
10865 + delay = backout_join_acked(sev);
10868 + /* backout after send_join_stop and before final
10869 + * process_reply */
10870 + case SEST_JSTOP_ACKWAIT:
10871 + delay = backout_jstop_ackwait(sev);
10874 + /* backout after final process_reply and before
10875 + * check_join_stop */
10876 + case SEST_JSTOP_ACKED:
10877 + delay = backout_jstop_acked(sev);
10880 + /* backout after send_join_start and before
10881 + * kcl_start_done */
10882 + case SEST_JSTART_SERVICEWAIT:
10883 + delay = backout_jstart_servicewait(sev);
10886 + /* backout after kcl_start_done and before
10887 + * startdone_barrier_new */
10888 + case SEST_JSTART_SERVICEDONE:
10889 + delay = backout_jstart_servicedone(sev);
10892 + /* backout after startdone_barrier_new and before
10893 + * callback_startdone_barrier_new */
10894 + case SEST_BARRIER_WAIT:
10895 + delay = backout_barrier_wait(sev);
10898 + /* backout after callback_startdone_barrier_new and
10899 + * before check_startdone_barrier_new */
10900 + case SEST_BARRIER_DONE:
10901 + delay = backout_barrier_done(sev);
10904 + /* backout after kcl_leave_service and before
10905 + * send_leave_notice */
10906 + case SEST_LEAVE_BEGIN:
10907 + delay = backout_leave_begin(sev);
10910 + /* backout after send_leave_notice and before final
10911 + * process_reply */
10912 + case SEST_LEAVE_ACKWAIT:
10913 + delay = backout_leave_ackwait(sev);
10916 + /* backout after final process_reply and before
10917 + * check_leave_notice */
10918 + case SEST_LEAVE_ACKED:
10919 + delay = backout_leave_acked(sev);
10922 + /* backout after send_leave_stop and before final
10923 + * process_reply */
10924 + case SEST_LSTOP_ACKWAIT:
10925 + delay = backout_lstop_ackwait(sev);
10928 + /* backout after final process_reply and before
10929 + * check_leave_stop */
10930 + case SEST_LSTOP_ACKED:
10931 + delay = backout_lstop_acked(sev);
10934 + /* backout after send_leave_start and before
10935 + * process_lstart_done */
10936 + case SEST_LSTART_WAITREMOTE:
10937 + delay = backout_lstart_waitremote(sev);
10940 + /* backout after process_lstart_done and before
10941 + * process_leave_sevent */
10942 + case SEST_LSTART_REMOTEDONE:
10943 + sevent_done(sev);
10948 + log_error(sev->se_sg, "backout_sevents: bad state %d",
10953 + if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
10954 + sev->se_state = SEST_LEAVE_BEGIN;
10955 + set_bit(SEFL_DELAY_RECOVERY, &sev->se_flags);
10956 + set_bit(SEFL_CHECK, &sev->se_flags);
10957 + wake_serviced(DO_JOINLEAVE);
10959 + sev->se_state = SEST_JOIN_BEGIN;
10960 + set_bit(SEFL_CHECK, &sev->se_flags);
10961 + wake_serviced(DO_JOINLEAVE);
10967 +void process_joinleave(void)
10969 + sm_sevent_t *sev = NULL, *safe;
10971 + spin_lock(&new_event_lock);
10972 + if (!list_empty(&new_event)) {
10973 + sev = list_entry(new_event.next, sm_sevent_t, se_list);
10974 + list_del(&sev->se_list);
10975 + list_add_tail(&sev->se_list, &joinleave_events);
10976 + set_bit(SEFL_CHECK, &sev->se_flags);
10978 + spin_unlock(&new_event_lock);
10980 + list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
10981 + if (!test_and_clear_bit(SEFL_CHECK, &sev->se_flags))
10984 + if (test_bit(SEFL_DELAY, &sev->se_flags) ||
10985 + test_bit(SEFL_DELAY_RECOVERY, &sev->se_flags))
10988 + if (sev->se_state < SEST_LEAVE_BEGIN)
10989 + process_join_sevent(sev);
10991 + process_leave_sevent(sev);
10994 diff -urN linux-orig/cluster/cman/sm_joinleave.h linux-patched/cluster/cman/sm_joinleave.h
10995 --- linux-orig/cluster/cman/sm_joinleave.h 1970-01-01 07:30:00.000000000 +0730
10996 +++ linux-patched/cluster/cman/sm_joinleave.h 2004-11-03 11:37:37.000000000 +0800
10998 +/******************************************************************************
10999 +*******************************************************************************
11001 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11002 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11004 +** This copyrighted material is made available to anyone wishing to use,
11005 +** modify, copy, or redistribute it subject to the terms and conditions
11006 +** of the GNU General Public License v.2.
11008 +*******************************************************************************
11009 +******************************************************************************/
11011 +#ifndef __SM_JOINLEAVE_DOT_H__
11012 +#define __SM_JOINLEAVE_DOT_H__
11014 +void init_joinleave(void);
11015 +void new_joinleave(sm_sevent_t *sev);
11016 +void process_joinleave(void);
11017 +void backout_sevents(void);
11018 +sm_sevent_t *find_sevent(unsigned int id);
11021 diff -urN linux-orig/cluster/cman/sm_membership.c linux-patched/cluster/cman/sm_membership.c
11022 --- linux-orig/cluster/cman/sm_membership.c 1970-01-01 07:30:00.000000000 +0730
11023 +++ linux-patched/cluster/cman/sm_membership.c 2004-11-03 11:37:37.000000000 +0800
11025 +/******************************************************************************
11026 +*******************************************************************************
11028 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11029 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11031 +** This copyrighted material is made available to anyone wishing to use,
11032 +** modify, copy, or redistribute it subject to the terms and conditions
11033 +** of the GNU General Public License v.2.
11035 +*******************************************************************************
11036 +******************************************************************************/
11040 +extern struct list_head sm_members;
11043 + * Routines for SG members to handle other nodes joining or leaving the SG.
11044 + * These "uevent" membership update routines are the response to an "sevent" on
11045 + * a joining/leaving node.
11048 +static void del_memb_node(sm_group_t *sg, uint32_t nodeid)
11052 + list_for_each_entry(node, &sg->memb, list) {
11053 + if (node->id != nodeid)
11055 + list_del(&node->list);
11057 + sg->memb_count--;
11058 + log_debug(sg, "del node %u count %d", nodeid, sg->memb_count);
11063 +static void add_memb_node(sm_group_t *sg, sm_node_t *node)
11065 + list_add_tail(&node->list, &sg->memb);
11066 + sg->memb_count++;
11067 + log_debug(sg, "add node %u count %d", node->id, sg->memb_count);
11071 + * Join 1. The receive end of send_join_stop() from a node requesting to join
11072 + * the SG. We stop the service so it can be restarted with the new node.
11075 +static int process_join_stop(sm_group_t *sg)
11077 + sm_uevent_t *uev = &sg->uevent;
11082 + if (uev->ue_num_nodes != sg->memb_count + 1) {
11083 + log_error(sg, "process_join_stop: bad num nodes %u %u",
11084 + uev->ue_num_nodes, sg->memb_count);
11088 + sm_set_event_id(&uev->ue_id);
11090 + node = sm_find_joiner(sg, uev->ue_nodeid);
11091 + SM_ASSERT(node,);
11093 + sg->state = SGST_UEVENT;
11094 + sg->ops->stop(sg->service_data);
11096 + reply.ms_type = SMSG_JSTOP_REP;
11097 + reply.ms_status = STATUS_POS;
11098 + reply.ms_sevent_id = uev->ue_remote_seid;
11099 + smsg_bswap_out(&reply);
11101 + error = send_nodeid_message((char *) &reply, sizeof(reply),
11109 + * Join 2. The receive end of send_join_start() from a node joining the SG.
11110 + * We are re-starting the service with the new member added.
11113 +static int process_join_start(sm_group_t *sg)
11115 + sm_uevent_t *uev = &sg->uevent;
11120 + /* this memory is passed to the service which must free it */
11122 + kmalloc((sg->memb_count + 1) * sizeof(uint32_t), GFP_KERNEL),
11125 + /* transfer joining node from joining list to member list */
11126 + node = sm_find_joiner(sg, uev->ue_nodeid);
11127 + SM_ASSERT(node, printk("nodeid=%u\n", uev->ue_nodeid););
11128 + list_del(&node->list);
11129 + add_memb_node(sg, node);
11131 + /* the new member list for the service */
11132 + list_for_each_entry(node, &sg->memb, list)
11133 + memb[count++] = node->id;
11135 + set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11137 + sg->ops->start(sg->service_data, memb, count, uev->ue_id,
11138 + SERVICE_NODE_JOIN);
11143 + * Join 3. When done starting their local service, every previous SG member
11144 + * calls startdone_barrier() and the new/joining member calls
11145 + * startdone_barrier_new(). The barrier returns when everyone has started
11146 + * their service and joined the barrier.
11149 +static int startdone_barrier(sm_group_t *sg)
11151 + sm_uevent_t *uev = &sg->uevent;
11152 + char bname[MAX_BARRIER_NAME_LEN];
11155 + memset(bname, 0, MAX_BARRIER_NAME_LEN);
11156 + uev->ue_barrier_status = -1;
11158 + set_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
11160 + /* If we're the only member, skip the barrier */
11161 + if (sg->memb_count == 1) {
11162 + process_startdone_barrier(sg, 0);
11166 + snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
11167 + sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
11170 + error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE);
11176 + * Join 4. Check that the "all started" barrier returned a successful status.
11177 + * The newly joined member calls check_startdone_barrier_new().
11180 +static int check_startdone_barrier(sm_group_t *sg)
11182 + int error = sg->uevent.ue_barrier_status;
11187 + * Join 5. Send the service a "finish" indicating that all members have
11188 + * successfully started. The newly joined member calls do_finish_new().
11191 +static void do_finish(sm_group_t *sg)
11193 + sg->state = SGST_RUN;
11194 + clear_bit(SGFL_UEVENT, &sg->flags);
11195 + sg->ops->finish(sg->service_data, sg->uevent.ue_id);
11199 + * Join 6. The uevent is done. If this was a uevent for a node leaving the
11200 + * SG, then send a final message to the departed node signalling that the
11201 + * remaining nodes have restarted since it left.
11204 +static void uevent_done(sm_group_t *sg)
11206 + sm_uevent_t *uev = &sg->uevent;
11209 + if (test_bit(UEFL_LEAVE, &uev->ue_flags)) {
11210 + reply.ms_type = SMSG_LSTART_DONE;
11211 + reply.ms_status = STATUS_POS;
11212 + reply.ms_sevent_id = uev->ue_remote_seid;
11213 + smsg_bswap_out(&reply);
11214 + send_nodeid_message((char *) &reply, sizeof(reply),
11217 + memset(&sg->uevent, 0, sizeof(sm_uevent_t));
11221 + * Leave 1. The receive end of send_leave_stop() from a node leaving the SG.
11224 +static int process_leave_stop(sm_group_t *sg)
11226 + sm_uevent_t *uev = &sg->uevent;
11230 + sm_set_event_id(&uev->ue_id);
11232 + sg->state = SGST_UEVENT;
11233 + sg->ops->stop(sg->service_data);
11235 + reply.ms_type = SMSG_LSTOP_REP;
11236 + reply.ms_status = STATUS_POS;
11237 + reply.ms_sevent_id = uev->ue_remote_seid;
11238 + smsg_bswap_out(&reply);
11240 + error = send_nodeid_message((char *) &reply, sizeof(reply),
11248 + * Leave 2. The receive end of send_leave_start() from a node leaving the SG.
11249 + * We are re-starting the service (without the node that's left naturally.)
11252 +static int process_leave_start(sm_group_t *sg)
11254 + sm_uevent_t *uev = &sg->uevent;
11259 + SM_ASSERT(sg->memb_count > 1,
11260 + printk("memb_count=%u\n", sg->memb_count););
11262 + /* this memory is passed to the service which must free it */
11264 + kmalloc((sg->memb_count - 1) * sizeof(uint32_t), GFP_KERNEL),
11267 + /* remove departed member from sg member list */
11268 + del_memb_node(sg, uev->ue_nodeid);
11270 + /* build member list to pass to service */
11271 + list_for_each_entry(node, &sg->memb, list)
11272 + memb[count++] = node->id;
11274 + /* allow us to accept the start_done callback for this start */
11275 + set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11277 + sg->ops->start(sg->service_data, memb, count, uev->ue_id,
11278 + SERVICE_NODE_LEAVE);
11283 + * Move through the steps of another node joining or leaving the SG.
11286 +static void process_one_uevent(sm_group_t *sg)
11288 + sm_uevent_t *uev = &sg->uevent;
11291 + log_debug(sg, "uevent state %u node %u", uev->ue_state, uev->ue_nodeid);
11293 + switch (uev->ue_state) {
11296 + * a uevent is initialized with state JSTOP in
11297 + * process_stop_request
11301 + uev->ue_state = UEST_JSTART_WAITCMD;
11302 + error = process_join_stop(sg);
11306 + * ue_state is changed from JSTART_WAITCMD to JSTART in
11307 + * process_start_request
11310 + case UEST_JSTART:
11311 + uev->ue_state = UEST_JSTART_SERVICEWAIT;
11312 + error = process_join_start(sg);
11316 + * ue_state is changed from JSTART_SERVICEWAIT to
11317 + * JSTART_SERVICEDONE in kcl_start_done
11320 + case UEST_JSTART_SERVICEDONE:
11321 + uev->ue_state = UEST_BARRIER_WAIT;
11322 + error = startdone_barrier(sg);
11326 + * ue_state is changed from BARRIER_WAIT to BARRIER_DONE in
11327 + * process_startdone_barrier
11330 + case UEST_BARRIER_DONE:
11331 + error = check_startdone_barrier(sg);
11340 + * a uevent is initialized with state LSTOP in
11341 + * process_stop_request
11345 + uev->ue_state = UEST_LSTART_WAITCMD;
11346 + error = process_leave_stop(sg);
11350 + * a uevent is changed from LSTART_WAITCMD to LSTART in
11351 + * process_start_request
11354 + case UEST_LSTART:
11355 + uev->ue_state = UEST_LSTART_SERVICEWAIT;
11356 + error = process_leave_start(sg);
11360 + * a uevent is changed from LSTART_SERVICEWAIT to
11361 + * LSTART_SERVICEDONE in kcl_start_done
11364 + case UEST_LSTART_SERVICEDONE:
11365 + uev->ue_state = UEST_BARRIER_WAIT;
11366 + error = startdone_barrier(sg);
11373 + /* If we encounter an error during these routines, we do nothing,
11374 + expecting that a node failure related to this sg will cause a
11375 + recovery event to arrive and call cancel_one_uevent(). */
11378 + log_error(sg, "process_one_uevent error %d state %u",
11379 + error, uev->ue_state);
11382 +static sm_node_t *failed_memb(sm_group_t *sg, int *count)
11384 + sm_node_t *node, *sm_node, *failed_uev_node = NULL;
11386 + list_for_each_entry(node, &sg->memb, list) {
11388 + sm_node = sm_find_member(node->id);
11389 + SM_ASSERT(sm_node, );
11391 + if (test_bit(SNFL_NEED_RECOVERY, &sm_node->flags)) {
11393 + if (node->id == sg->uevent.ue_nodeid)
11394 + failed_uev_node = sm_node;
11397 + return failed_uev_node;
11400 +static void send_recover_msg(sm_group_t *sg)
11404 + msg = create_smsg(sg, SMSG_RECOVER, 0, &len, NULL);
11405 + send_members_message(sg, msg, len);
11408 +static void cancel_barrier(sm_group_t *sg)
11410 + sm_uevent_t *uev = &sg->uevent;
11411 + char bname[MAX_BARRIER_NAME_LEN];
11413 + clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
11415 + memset(bname, 0, MAX_BARRIER_NAME_LEN);
11416 + snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
11417 + sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
11419 + kcl_barrier_cancel(bname);
11422 +static void cancel_one_uevent(sm_group_t *sg, int *effected)
11424 + sm_uevent_t *uev = &sg->uevent;
11425 + int failed_count;
11426 + sm_node_t *node, *failed_joiner, *failed_leaver;
11428 + log_debug(sg, "cancel uevent state %u node %u", uev->ue_state,
11431 + switch (uev->ue_state) {
11434 + case UEST_JSTART_WAITCMD:
11435 + case UEST_JSTART:
11437 + sg->ops->stop(sg->service_data);
11439 + failed_count = 0;
11440 + failed_joiner = failed_memb(sg, &failed_count);
11441 + SM_ASSERT(!failed_joiner, );
11443 + node = sm_find_member(uev->ue_nodeid);
11444 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11445 + failed_joiner = node;
11447 + if (!failed_count) {
11448 + /* only joining node failed */
11449 + SM_ASSERT(failed_joiner, );
11450 + SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11451 + set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11453 + /* some nodes may not have gotten a JSTOP message
11454 + in which case this will tell them to begin
11455 + recovery for this sg. */
11456 + send_recover_msg(sg);
11459 + /* a member node failed (and possibly joining node, it
11460 + doesn't matter) */
11461 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11464 + clear_bit(SGFL_UEVENT, &sg->flags);
11465 + memset(uev, 0, sizeof(sm_uevent_t));
11469 + case UEST_JSTART_SERVICEWAIT:
11470 + case UEST_JSTART_SERVICEDONE:
11472 + clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11473 + sg->ops->stop(sg->service_data);
11475 + failed_count = 0;
11476 + failed_joiner = failed_memb(sg, &failed_count);
11477 + SM_ASSERT(failed_count, );
11478 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11480 + if (failed_count == 1 && failed_joiner) {
11481 + /* only joining node failed */
11483 + } else if (failed_count && failed_joiner) {
11484 + /* joining node and another member failed */
11487 + /* other member failed, joining node still alive */
11488 + SM_ASSERT(!failed_joiner, );
11489 + del_memb_node(sg, uev->ue_nodeid);
11492 + clear_bit(SGFL_UEVENT, &sg->flags);
11493 + memset(uev, 0, sizeof(sm_uevent_t));
11498 + case UEST_LSTART_WAITCMD:
11499 + case UEST_LSTART:
11501 + sg->ops->stop(sg->service_data);
11503 + failed_count = 0;
11504 + failed_leaver = failed_memb(sg, &failed_count);
11505 + SM_ASSERT(failed_count, );
11506 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11508 + if (failed_count == 1 && failed_leaver) {
11509 + /* only leaving node failed */
11511 + } else if (failed_count && failed_leaver) {
11512 + /* leaving node and another member failed */
11515 + /* other member failed, leaving node still alive */
11516 + SM_ASSERT(!failed_leaver, );
11519 + clear_bit(SGFL_UEVENT, &sg->flags);
11520 + memset(uev, 0, sizeof(sm_uevent_t));
11524 + case UEST_LSTART_SERVICEWAIT:
11525 + case UEST_LSTART_SERVICEDONE:
11527 + clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11528 + sg->ops->stop(sg->service_data);
11530 + failed_count = 0;
11531 + failed_leaver = failed_memb(sg, &failed_count);
11532 + SM_ASSERT(!failed_leaver, );
11534 + node = sm_find_member(uev->ue_nodeid);
11535 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11536 + failed_leaver = node;
11538 + if (!failed_count) {
11539 + /* only leaving node failed */
11540 + SM_ASSERT(failed_leaver, );
11541 + SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11542 + set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11545 + } else if (failed_count && failed_leaver) {
11546 + /* leaving node and another member failed */
11547 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11550 + /* other member failed, leaving node still alive */
11551 + SM_ASSERT(failed_count, );
11552 + SM_ASSERT(!failed_leaver, );
11553 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11554 + node = sm_new_node(sg->uevent.ue_nodeid);
11555 + add_memb_node(sg, node);
11558 + clear_bit(SGFL_UEVENT, &sg->flags);
11559 + memset(uev, 0, sizeof(sm_uevent_t));
11563 + case UEST_BARRIER_WAIT:
11565 + if (test_bit(UEFL_LEAVE, &uev->ue_flags))
11566 + goto barrier_wait_leave;
11568 + sg->ops->stop(sg->service_data);
11569 + cancel_barrier(sg);
11571 + barrier_wait_join:
11573 + failed_count = 0;
11574 + failed_joiner = failed_memb(sg, &failed_count);
11575 + SM_ASSERT(failed_count, );
11576 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11578 + if (failed_count == 1 && failed_joiner) {
11579 + /* only joining node failed */
11581 + } else if (failed_count && failed_joiner) {
11582 + /* joining node and another member failed */
11585 + /* other member failed, joining node still alive */
11586 + SM_ASSERT(!failed_joiner, );
11587 + del_memb_node(sg, uev->ue_nodeid);
11590 + clear_bit(SGFL_UEVENT, &sg->flags);
11591 + memset(uev, 0, sizeof(sm_uevent_t));
11594 + barrier_wait_leave:
11596 + failed_count = 0;
11597 + failed_leaver = failed_memb(sg, &failed_count);
11598 + SM_ASSERT(!failed_leaver, );
11600 + node = sm_find_member(uev->ue_nodeid);
11601 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11602 + failed_leaver = node;
11604 + if (!failed_count) {
11605 + /* only leaving node failed */
11606 + SM_ASSERT(failed_leaver, );
11607 + SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11608 + set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11611 + } else if (failed_count && failed_leaver) {
11612 + /* leaving node and another member failed */
11613 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11616 + /* other member failed, leaving node still alive */
11617 + SM_ASSERT(failed_count, );
11618 + SM_ASSERT(!failed_leaver, );
11619 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11620 + node = sm_new_node(sg->uevent.ue_nodeid);
11621 + add_memb_node(sg, node);
11624 + clear_bit(SGFL_UEVENT, &sg->flags);
11625 + memset(uev, 0, sizeof(sm_uevent_t));
11629 + case UEST_BARRIER_DONE:
11631 + if (!uev->ue_barrier_status) {
11637 + if (test_bit(UEFL_LEAVE, &uev->ue_flags))
11638 + goto barrier_wait_leave;
11640 + goto barrier_wait_join;
11644 + log_error(sg, "cancel_one_uevent: state %d", uev->ue_state);
11648 +void cancel_uevents(int *effected)
11651 + sm_node_t *node, *sgnode;
11654 + list_for_each_entry(node, &sm_members, list) {
11655 + if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
11659 + * Clear this dead node from the "interested in joining" list
11660 + * of any SG. The node is added to this list before the uevent
11664 + for (i = 0; i < SG_LEVELS; i++) {
11665 + list_for_each_entry(sg, &sm_sg[i], list) {
11666 + sgnode = sm_find_joiner(sg, node->id);
11668 + log_debug(sg, "clear joining node %u",
11670 + list_del(&sgnode->list);
11677 + /* Adjust any uevents in sg's effected by the failed node(s) */
11679 + for (i = 0; i < SG_LEVELS; i++) {
11680 + list_for_each_entry(sg, &sm_sg[i], list) {
11681 + if (!test_bit(SGFL_UEVENT, &sg->flags))
11684 + /* We may have some cancelling to do if this sg is
11685 + flagged as having a failed member, or if a joining
11686 + or leaving node has died. */
11688 + if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
11689 + cancel_one_uevent(sg, effected);
11690 + else if (sg->uevent.ue_nodeid) {
11691 + node = sm_find_member(sg->uevent.ue_nodeid);
11692 + SM_ASSERT(node, );
11693 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11694 + cancel_one_uevent(sg, effected);
11700 +void process_membership(void)
11705 + down(&sm_sglock);
11707 + for (i = 0; i < SG_LEVELS; i++) {
11708 + list_for_each_entry(sg, &sm_sg[i], list) {
11709 + if (!test_bit(SGFL_UEVENT, &sg->flags))
11712 + if (!test_and_clear_bit(UEFL_CHECK,
11713 + &sg->uevent.ue_flags))
11716 + process_one_uevent(sg);
11721 diff -urN linux-orig/cluster/cman/sm_membership.h linux-patched/cluster/cman/sm_membership.h
11722 --- linux-orig/cluster/cman/sm_membership.h 1970-01-01 07:30:00.000000000 +0730
11723 +++ linux-patched/cluster/cman/sm_membership.h 2004-11-03 11:37:37.000000000 +0800
11725 +/******************************************************************************
11726 +*******************************************************************************
11728 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11729 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11731 +** This copyrighted material is made available to anyone wishing to use,
11732 +** modify, copy, or redistribute it subject to the terms and conditions
11733 +** of the GNU General Public License v.2.
11735 +*******************************************************************************
11736 +******************************************************************************/
11738 +#ifndef __SM_MEMBERSHIP_DOT_H__
11739 +#define __SM_MEMBERSHIP_DOT_H__
11741 +void process_membership(void);
11742 +void cancel_uevents(int *effected);
11745 diff -urN linux-orig/cluster/cman/sm_message.c linux-patched/cluster/cman/sm_message.c
11746 --- linux-orig/cluster/cman/sm_message.c 1970-01-01 07:30:00.000000000 +0730
11747 +++ linux-patched/cluster/cman/sm_message.c 2004-11-03 11:37:37.000000000 +0800
11749 +/******************************************************************************
11750 +*******************************************************************************
11752 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11753 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11755 +** This copyrighted material is made available to anyone wishing to use,
11756 +** modify, copy, or redistribute it subject to the terms and conditions
11757 +** of the GNU General Public License v.2.
11759 +*******************************************************************************
11760 +******************************************************************************/
11764 +#define SMSG_BUF_SIZE (sizeof(sm_msg_t) + MAX_SERVICE_NAME_LEN + 1)
11766 +extern struct socket * sm_socket;
11767 +extern uint32_t sm_our_nodeid;
11768 +static uint32_t global_last_id;
11769 +static struct list_head messages;
11770 +static spinlock_t message_lock;
11771 +static char smsg_buf[SMSG_BUF_SIZE];
11773 +int send_nodeid_message(char *msg, int len, uint32_t nodeid);
11776 + struct list_head list;
11781 +typedef struct rq_entry rq_entry_t;
11783 +void init_messages(void)
11785 + global_last_id = 1;
11786 + INIT_LIST_HEAD(&messages);
11787 + spin_lock_init(&message_lock);
11790 +uint32_t sm_new_global_id(int level)
11792 + uint32_t id = global_last_id++;
11793 + uint8_t l = (uint8_t) level;
11798 + if (id > 0x00FFFFFF)
11805 +static void smsg_copy_in(char *msg, sm_msg_t *smsg)
11807 + sm_msg_t *in = (sm_msg_t *) msg;
11809 + smsg->ms_type = in->ms_type;
11810 + smsg->ms_status = in->ms_status;
11811 + smsg->ms_sevent_id = le16_to_cpu(in->ms_sevent_id);
11812 + smsg->ms_global_sgid = le32_to_cpu(in->ms_global_sgid);
11813 + smsg->ms_global_lastid = le32_to_cpu(in->ms_global_lastid);
11814 + smsg->ms_sglevel = le16_to_cpu(in->ms_sglevel);
11815 + smsg->ms_length = le16_to_cpu(in->ms_length);
11818 +/* swapping bytes in place is an easy source of errors - be careful not to
11819 + * access the fields after calling this */
11821 +void smsg_bswap_out(sm_msg_t *smsg)
11823 + smsg->ms_sevent_id = cpu_to_le16(smsg->ms_sevent_id);
11824 + smsg->ms_global_sgid = cpu_to_le32(smsg->ms_global_sgid);
11825 + smsg->ms_global_lastid = cpu_to_le32(smsg->ms_global_lastid);
11826 + smsg->ms_sglevel = cpu_to_le16(smsg->ms_sglevel);
11827 + smsg->ms_length = cpu_to_le16(smsg->ms_length);
11830 +char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
11831 + sm_sevent_t *sev)
11835 + int fulllen = sizeof(sm_msg_t) + datalen;
11838 + memset(smsg_buf, 0, SMSG_BUF_SIZE);
11839 + SM_ASSERT(fulllen <= SMSG_BUF_SIZE,);
11841 + smsg = (sm_msg_t *) msg;
11842 + smsg->ms_type = type;
11843 + smsg->ms_global_sgid = sg->global_id;
11844 + smsg->ms_sglevel = sg->level;
11845 + smsg->ms_length = datalen;
11846 + smsg->ms_sevent_id = sev ? sev->se_id : 0;
11848 + smsg_bswap_out(smsg);
11849 + *msglen = fulllen;
11853 +static unsigned int msgtype_to_flag(int type)
11855 + unsigned int flag;
11858 + case SMSG_JOIN_REP:
11859 + case SMSG_JOIN_REQ:
11860 + flag = SEFL_ALLOW_JOIN;
11863 + case SMSG_JSTOP_REP:
11864 + case SMSG_JSTOP_REQ:
11865 + flag = SEFL_ALLOW_JSTOP;
11868 + case SMSG_LEAVE_REP:
11869 + case SMSG_LEAVE_REQ:
11870 + flag = SEFL_ALLOW_LEAVE;
11873 + case SMSG_LSTOP_REP:
11874 + case SMSG_LSTOP_REQ:
11875 + flag = SEFL_ALLOW_LSTOP;
11879 + SM_ASSERT(0, printk("msgtype_to_flag bad type %d\n", type););
11884 +static int test_allowed_msgtype(sm_sevent_t *sev, int type)
11886 + unsigned int flag = msgtype_to_flag(type);
11888 + return test_bit(flag, &sev->se_flags);
11891 +static void clear_allowed_msgtype(sm_sevent_t *sev, int type)
11893 + unsigned int flag = msgtype_to_flag(type);
11895 + clear_bit(flag, &sev->se_flags);
11898 +static void set_allowed_msgtype(sm_sevent_t *sev, int type)
11900 + unsigned int flag = msgtype_to_flag(type);
11902 + set_bit(flag, &sev->se_flags);
11905 +static int save_global_id(sm_sevent_t *sev, sm_msg_t *smsg)
11907 + sm_group_t *sg = sev->se_sg;
11909 + if (!smsg->ms_global_sgid) {
11910 + log_error(sg, "save_global_id: zero sg id");
11914 + if (!sg->global_id)
11915 + sg->global_id = smsg->ms_global_sgid;
11917 + if (sg->global_id != smsg->ms_global_sgid) {
11918 + log_error(sg, "save_global_id: id %x", smsg->ms_global_sgid);
11924 +static void save_lastid(sm_msg_t *smsg)
11926 + uint32_t gid = smsg->ms_global_lastid & 0x00FFFFFF;
11929 + * Keep track of the highest SG id which has been used
11930 + * in the cluster in case we need to choose a new SG id.
11933 + if (gid > global_last_id)
11934 + global_last_id = gid;
11937 +static int next_sev_state(int msg_type, int cur_state)
11941 + switch (msg_type) {
11942 + case SMSG_JOIN_REP:
11943 + SM_ASSERT(cur_state == SEST_JOIN_ACKWAIT,);
11944 + next = SEST_JOIN_ACKED;
11947 + case SMSG_JSTOP_REP:
11948 + SM_ASSERT(cur_state == SEST_JSTOP_ACKWAIT,);
11949 + next = SEST_JSTOP_ACKED;
11952 + case SMSG_LEAVE_REP:
11953 + SM_ASSERT(cur_state == SEST_LEAVE_ACKWAIT,);
11954 + next = SEST_LEAVE_ACKED;
11957 + case SMSG_LSTOP_REP:
11958 + SM_ASSERT(cur_state == SEST_LSTOP_ACKWAIT,);
11959 + next = SEST_LSTOP_ACKED;
11966 + * Functions in sevent.c send messages to other nodes and then expect replies.
11967 + * This function collects the replies for the sevent messages and moves the
11968 + * sevent to the next stage when all the expected replies have been received.
11971 +static void process_reply(sm_msg_t *smsg, uint32_t nodeid)
11973 + sm_sevent_t *sev;
11974 + int i, expected, type = smsg->ms_type;
11977 + * Find the relevant sevent.
11980 + sev = find_sevent(smsg->ms_sevent_id);
11982 + log_print("process_reply invalid id=%u nodeid=%u",
11983 + smsg->ms_sevent_id, nodeid);
11988 + * Check if this message type is what this sevent is waiting for.
11991 + if (!test_allowed_msgtype(sev, type)) {
11992 + log_debug(sev->se_sg, "process_reply ignored type=%u nodeid=%u " "id=%u", type, nodeid, sev->se_id);
11997 + (type == SMSG_JOIN_REP) ? sev->se_node_count : sev->se_memb_count;
11999 + SM_ASSERT(expected * sizeof(uint32_t) <= sev->se_len_ids,
12000 + printk("type=%d expected=%d len_ids=%d node_count=%d "
12001 + "memb_count=%d\n", type, expected, sev->se_len_ids,
12002 + sev->se_node_count, sev->se_memb_count););
12004 + SM_ASSERT(expected * sizeof(char) <= sev->se_len_status,
12005 + printk("type=%d expected=%d len_status=%d node_count=%d "
12006 + "memb_count=%d\n", type, expected, sev->se_len_status,
12007 + sev->se_node_count, sev->se_memb_count););
12009 + for (i = 0; i < expected; i++) {
12010 + if (sev->se_node_ids[i] == nodeid) {
12012 + * Save the status from the replying node
12015 + if (!sev->se_node_status[i])
12016 + sev->se_node_status[i] = smsg->ms_status;
12018 + log_error(sev->se_sg, "process_reply duplicate"
12019 + "id=%u nodeid=%u %u/%u",
12020 + sev->se_id, nodeid,
12021 + sev->se_node_status[i],
12022 + smsg->ms_status);
12026 + if (type == SMSG_JOIN_REP) {
12027 + save_lastid(smsg);
12029 + if (smsg->ms_status == STATUS_POS)
12030 + save_global_id(sev, smsg);
12034 + * Signal sm if we have all replies
12037 + if (++sev->se_reply_count == expected) {
12038 + clear_allowed_msgtype(sev, type);
12039 + sev->se_state = next_sev_state(type,
12041 + set_bit(SEFL_CHECK, &sev->se_flags);
12042 + wake_serviced(DO_JOINLEAVE);
12054 + * A node wants to join an SG and has run send_join_notice. If we know nothing
12055 + * about the SG , then we have no objection - send back STATUS_POS. If we're a
12056 + * member of the SG, then send back STATUS_POS (go ahead and join) if there's
12057 + * no sevent or uevent of higher priority in progress (only a single join or
12058 + * leave is permitted for the SG at once). If there happens to be a higher
12059 + * priority sevent/uevent in progress, send back STATUS_WAIT to defer the
12060 + * requested join for a bit.
12063 +static void process_join_request(sm_msg_t *smsg, uint32_t nodeid, char *name)
12065 + sm_group_t *sg = NULL;
12066 + sm_sevent_t *sev = NULL;
12068 + int found = FALSE;
12069 + int level = smsg->ms_sglevel;
12072 + memset(&reply, 0, sizeof(reply));
12074 + down(&sm_sglock);
12076 + if (nodeid == sm_our_nodeid)
12080 + * search SG list for an SG with given name/len
12083 + list_for_each_entry(sg, &sm_sg[level], list) {
12084 + if ((sg->namelen != smsg->ms_length) ||
12085 + memcmp(sg->name, name, sg->namelen))
12092 + * build reply message
12098 + reply.ms_type = SMSG_JOIN_REP;
12099 + reply.ms_status = STATUS_NEG;
12100 + reply.ms_global_lastid = global_last_id;
12101 + reply.ms_sevent_id = smsg->ms_sevent_id;
12103 + reply.ms_type = SMSG_JOIN_REP;
12104 + reply.ms_status = STATUS_POS;
12105 + reply.ms_sevent_id = smsg->ms_sevent_id;
12106 + reply.ms_global_sgid = sg->global_id;
12107 + reply.ms_global_lastid = global_last_id;
12110 + * The node trying to join should wait and try again until
12111 + * we're done with recovery.
12114 + if (sg->state == SGST_RECOVER) {
12115 + reply.ms_status = STATUS_WAIT;
12120 + * An sevent node trying to join may have gotten as far as
12121 + * creating a uevent with us and then backed out. That node
12122 + * will retry joining from the beginning so we should not turn
12123 + * them away. If we're handling a uevent for another node,
12124 + * tell the joining node to wait.
12127 + if (test_bit(SGFL_UEVENT, &sg->flags)) {
12128 + if (sg->uevent.ue_nodeid != nodeid)
12129 + reply.ms_status = STATUS_WAIT;
12134 + * We're trying to join or leave the SG at the moment.
12137 + if (test_bit(SGFL_SEVENT, &sg->flags)) {
12138 + sev = sg->sevent;
12141 + * We're trying to leave. Make the join wait until
12142 + * we've left if we're beyond LEAVE_ACKWAIT.
12145 + if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
12146 + if (sev->se_state > SEST_LEAVE_ACKED)
12147 + reply.ms_status = STATUS_WAIT;
12149 + reply.ms_status = STATUS_POS;
12150 + clear_bit(SEFL_ALLOW_LEAVE,
12152 + set_bit(SEFL_CANCEL, &sev->se_flags);
12157 + * We're trying to join. Make the other join wait
12158 + * until we're joined if we're beyond JOIN_ACKWAIT or
12159 + * if we have a lower id. (Send NEG to allow the other
12160 + * node to go ahead because we're not in the SG.)
12164 + if (sev->se_state > SEST_JOIN_ACKED)
12165 + reply.ms_status = STATUS_WAIT;
12166 + else if (sm_our_nodeid < nodeid)
12167 + reply.ms_status = STATUS_WAIT;
12169 + reply.ms_status = STATUS_NEG;
12170 + clear_bit(SEFL_ALLOW_JOIN,
12172 + set_bit(SEFL_CANCEL, &sev->se_flags);
12176 + if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
12177 + set_bit(SEFL_CHECK, &sev->se_flags);
12178 + wake_serviced(DO_JOINLEAVE);
12183 + /* no r,u,s event, stick with STATUS_POS */
12188 + if (reply.ms_status == STATUS_POS) {
12189 + node = sm_find_joiner(sg, nodeid);
12191 + node = sm_new_node(nodeid);
12192 + list_add_tail(&node->list, &sg->joining);
12197 + smsg_bswap_out(&reply);
12198 + send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12202 + * Another node wants us to stop a service so it can join or leave the SG. We
12203 + * do this by saving the request info in a uevent and having the sm thread do
12204 + * the processing and then replying.
12207 +static void process_stop_request(sm_msg_t *smsg, uint32_t nodeid,
12208 + uint32_t *msgbuf)
12211 + sm_uevent_t *uev;
12213 + int type = smsg->ms_type;
12215 + if (nodeid == sm_our_nodeid)
12218 + sg = sm_global_id_to_sg(smsg->ms_global_sgid);
12220 + log_print("process_stop_request: unknown sg id %x",
12221 + smsg->ms_global_sgid);
12226 + * We shouldn't get here with uevent already set.
12229 + if (test_and_set_bit(SGFL_UEVENT, &sg->flags)) {
12230 + log_error(sg, "process_stop_request: uevent already set");
12234 + uev = &sg->uevent;
12235 + uev->ue_nodeid = nodeid;
12236 + uev->ue_remote_seid = smsg->ms_sevent_id;
12237 + uev->ue_state = (type == SMSG_JSTOP_REQ) ? UEST_JSTOP : UEST_LSTOP;
12239 + if (type == SMSG_JSTOP_REQ)
12240 + uev->ue_num_nodes = be32_to_cpu(*msgbuf);
12242 + set_bit(UEFL_LEAVE, &uev->ue_flags);
12245 + * Do process_join_stop() or process_leave_stop().
12248 + set_bit(UEFL_CHECK, &uev->ue_flags);
12249 + wake_serviced(DO_MEMBERSHIP);
12253 + reply.ms_status = STATUS_POS;
12255 + (type == SMSG_JSTOP_REQ) ? SMSG_JSTOP_REP : SMSG_LSTOP_REP;
12256 + reply.ms_sevent_id = smsg->ms_sevent_id;
12257 + smsg_bswap_out(&reply);
12258 + send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12261 +static void process_start_request(sm_msg_t *smsg, uint32_t nodeid)
12264 + sm_uevent_t *uev;
12265 + int type = smsg->ms_type;
12267 + if (nodeid == sm_our_nodeid)
12270 + sg = sm_global_id_to_sg(smsg->ms_global_sgid);
12272 + log_print("process_start_request: unknown sg id %x",
12273 + smsg->ms_global_sgid);
12277 + if (!test_bit(SGFL_UEVENT, &sg->flags)) {
12278 + log_error(sg, "process_start_request: no uevent");
12282 + uev = &sg->uevent;
12284 + if (type == SMSG_JSTART_CMD)
12285 + uev->ue_state = UEST_JSTART;
12287 + uev->ue_state = UEST_LSTART;
12289 + set_bit(UEFL_CHECK, &uev->ue_flags);
12290 + wake_serviced(DO_MEMBERSHIP);
12293 +static void process_leave_request(sm_msg_t *smsg, uint32_t nodeid)
12298 + sm_sevent_t *sev;
12299 + int found = FALSE;
12301 + sg = sm_global_id_to_sg(smsg->ms_global_sgid);
12303 + if (nodeid == sm_our_nodeid)
12306 + list_for_each_entry(node, &sg->memb, list) {
12307 + if (node->id != nodeid)
12309 + set_bit(SNFL_LEAVING, &node->flags);
12317 + reply.ms_type = SMSG_LEAVE_REP;
12318 + reply.ms_status = STATUS_NEG;
12319 + reply.ms_sevent_id = smsg->ms_sevent_id;
12321 + reply.ms_type = SMSG_LEAVE_REP;
12322 + reply.ms_status = STATUS_POS;
12323 + reply.ms_sevent_id = smsg->ms_sevent_id;
12325 + if (sg->state == SGST_RECOVER)
12326 + reply.ms_status = STATUS_WAIT;
12328 + else if (test_bit(SGFL_SEVENT, &sg->flags) &&
12329 + nodeid != sm_our_nodeid) {
12330 + sev = sg->sevent;
12333 + * We're trying to join or leave at the moment. If
12334 + * we're past JOIN/LEAVE_ACKWAIT, we make the requestor
12335 + * wait. Otherwise, if joining we'll cancel to let the
12336 + * leave happen first, or if we're leaving allow the
12337 + * lower nodeid to leave first.
12340 + if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
12341 + if (sev->se_state > SEST_LEAVE_ACKWAIT)
12342 + reply.ms_status = STATUS_WAIT;
12343 + else if (sm_our_nodeid < nodeid)
12344 + reply.ms_status = STATUS_WAIT;
12346 + reply.ms_status = STATUS_POS;
12347 + clear_bit(SEFL_ALLOW_LEAVE,
12349 + set_bit(SEFL_CANCEL, &sev->se_flags);
12352 + if (sev->se_state > SEST_JOIN_ACKWAIT)
12353 + reply.ms_status = STATUS_WAIT;
12355 + reply.ms_status = STATUS_NEG;
12356 + clear_bit(SEFL_ALLOW_JOIN,
12358 + set_bit(SEFL_CANCEL, &sev->se_flags);
12362 + if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
12363 + set_bit(SEFL_CHECK, &sev->se_flags);
12364 + wake_serviced(DO_JOINLEAVE);
12368 + else if (test_bit(SGFL_UEVENT, &sg->flags)) {
12369 + if (sg->uevent.ue_nodeid != nodeid)
12370 + reply.ms_status = STATUS_WAIT;
12375 + smsg_bswap_out(&reply);
12376 + send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12380 + * Each remaining node will send us a done message. We quit when we get the
12381 + * first. The subsequent done messages for the finished sevent get here and
12385 +static void process_lstart_done(sm_msg_t *smsg, uint32_t nodeid)
12387 + sm_sevent_t *sev;
12389 + sev = find_sevent(smsg->ms_sevent_id);
12393 + if (sev->se_state != SEST_LSTART_WAITREMOTE)
12396 + sev->se_state = SEST_LSTART_REMOTEDONE;
12397 + set_bit(SEFL_CHECK, &sev->se_flags);
12398 + wake_serviced(DO_JOINLEAVE);
12402 + * This function and everything it calls always runs in sm context.
12405 +static void process_message(char *msg, uint32_t nodeid)
12409 + smsg_copy_in(msg, &smsg);
12411 + switch (smsg.ms_type) {
12412 + case SMSG_JOIN_REQ:
12413 + process_join_request(&smsg, nodeid, msg + sizeof(sm_msg_t));
12416 + case SMSG_JSTOP_REQ:
12417 + process_stop_request(&smsg, nodeid,
12418 + (uint32_t *) (msg + sizeof(sm_msg_t)));
12421 + case SMSG_LEAVE_REQ:
12422 + process_leave_request(&smsg, nodeid);
12425 + case SMSG_LSTOP_REQ:
12426 + process_stop_request(&smsg, nodeid, NULL);
12429 + case SMSG_JSTART_CMD:
12430 + case SMSG_LSTART_CMD:
12431 + process_start_request(&smsg, nodeid);
12434 + case SMSG_LSTART_DONE:
12435 + process_lstart_done(&smsg, nodeid);
12438 + case SMSG_JOIN_REP:
12439 + case SMSG_JSTOP_REP:
12440 + case SMSG_LEAVE_REP:
12441 + case SMSG_LSTOP_REP:
12442 + process_reply(&smsg, nodeid);
12445 + case SMSG_RECOVER:
12446 + process_recover_msg(&smsg, nodeid);
12450 + log_print("process_message: unknown type %u nodeid %u",
12451 + smsg.ms_type, nodeid);
12456 + * Always called from sm context.
12459 +void process_messages(void)
12466 + spin_lock(&message_lock);
12467 + if (!list_empty(&messages)) {
12468 + re = list_entry(messages.next, rq_entry_t, list);
12469 + list_del(&re->list);
12471 + spin_unlock(&message_lock);
12475 + process_message(re->msg, re->nodeid);
12483 + * Context: cnxman and sm
12486 +static int add_to_recvqueue(char *msg, int len, uint32_t nodeid)
12490 + SM_RETRY(re = (rq_entry_t *) kmalloc(sizeof(rq_entry_t), GFP_KERNEL),
12492 + SM_RETRY(re->msg = (char *) kmalloc(len, GFP_KERNEL), re->msg);
12494 + memcpy(re->msg, msg, len);
12496 + re->nodeid = nodeid;
12498 + spin_lock(&message_lock);
12499 + list_add_tail(&re->list, &messages);
12500 + spin_unlock(&message_lock);
12502 + wake_serviced(DO_MESSAGES);
12507 + * Context: cnxman
12508 + * Called by cnxman when a service manager message arrives.
12511 +int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12512 + unsigned int node_id)
12516 + return add_to_recvqueue(msg, len, node_id);
12520 + * These send routines are used by sm and are always called from sm context.
12523 +int send_nodeid_message(char *msg, int len, uint32_t nodeid)
12526 + struct sockaddr_cl saddr;
12528 + if (nodeid == sm_our_nodeid) {
12529 + add_to_recvqueue(msg, len, nodeid);
12533 + saddr.scl_family = AF_CLUSTER;
12534 + saddr.scl_port = CLUSTER_PORT_SERVICES;
12535 + saddr.scl_nodeid = nodeid;
12536 + error = kcl_sendmsg(sm_socket, msg, len, &saddr, sizeof(saddr), 0);
12541 + log_print("send_nodeid_message error %d to %u", error, nodeid);
12546 +int send_broadcast_message(char *msg, int len)
12550 + error = kcl_sendmsg(sm_socket, msg, len, NULL, 0, 0);
12554 + add_to_recvqueue(msg, len, sm_our_nodeid);
12557 + log_print("send_broadcast_message error %d", error);
12562 +int send_members_message(sm_group_t *sg, char *msg, int len)
12567 + list_for_each_entry(node, &sg->memb, list) {
12568 + error = send_nodeid_message(msg, len, node->id);
12575 +int send_members_message_sev(sm_group_t *sg, char *msg, int len,
12576 + sm_sevent_t * sev)
12579 + sm_msg_t *smsg = (sm_msg_t *) msg;
12581 + set_allowed_msgtype(sev, smsg->ms_type);
12582 + sev->se_reply_count = 0;
12584 + error = send_members_message(sg, msg, len);
12586 + clear_allowed_msgtype(sev, smsg->ms_type);
12591 +int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev)
12594 + sm_msg_t *smsg = (sm_msg_t *) msg;
12596 + set_allowed_msgtype(sev, smsg->ms_type);
12597 + sev->se_reply_count = 0;
12599 + error = send_broadcast_message(msg, len);
12601 + clear_allowed_msgtype(sev, smsg->ms_type);
12605 diff -urN linux-orig/cluster/cman/sm_message.h linux-patched/cluster/cman/sm_message.h
12606 --- linux-orig/cluster/cman/sm_message.h 1970-01-01 07:30:00.000000000 +0730
12607 +++ linux-patched/cluster/cman/sm_message.h 2004-11-03 11:37:37.000000000 +0800
12609 +/******************************************************************************
12610 +*******************************************************************************
12612 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12613 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12615 +** This copyrighted material is made available to anyone wishing to use,
12616 +** modify, copy, or redistribute it subject to the terms and conditions
12617 +** of the GNU General Public License v.2.
12619 +*******************************************************************************
12620 +******************************************************************************/
12622 +#ifndef __SM_MESSAGE_DOT_H__
12623 +#define __SM_MESSAGE_DOT_H__
12625 +void init_messages(void);
12626 +uint32_t sm_new_global_id(int level);
12627 +void smsg_bswap_out(sm_msg_t * smsg);
12628 +char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
12629 + sm_sevent_t *sev);
12630 +void process_messages(void);
12631 +int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12632 + unsigned int node_id);
12633 +int send_nodeid_message(char *msg, int len, uint32_t nodeid);
12634 +int send_broadcast_message(char *msg, int len);
12635 +int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev);
12636 +int send_members_message(sm_group_t *sg, char *msg, int len);
12637 +int send_members_message_sev(sm_group_t *sg, char *msg, int len,
12638 + sm_sevent_t * sev);
12639 +int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12640 + unsigned int node_id);
12643 diff -urN linux-orig/cluster/cman/sm_misc.c linux-patched/cluster/cman/sm_misc.c
12644 --- linux-orig/cluster/cman/sm_misc.c 1970-01-01 07:30:00.000000000 +0730
12645 +++ linux-patched/cluster/cman/sm_misc.c 2004-11-03 11:37:37.000000000 +0800
12647 +/******************************************************************************
12648 +*******************************************************************************
12650 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12651 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12653 +** This copyrighted material is made available to anyone wishing to use,
12654 +** modify, copy, or redistribute it subject to the terms and conditions
12655 +** of the GNU General Public License v.2.
12657 +*******************************************************************************
12658 +******************************************************************************/
12661 +#include "config.h"
12662 +#include <linux/seq_file.h>
12664 +#define MAX_DEBUG_MSG_LEN (40)
12666 +extern struct list_head sm_members;
12667 +static uint32_t local_ids;
12668 +static uint32_t event_id;
12669 +static spinlock_t event_id_lock;
12670 +static char * debug_buf;
12671 +static unsigned int debug_size;
12672 +static unsigned int debug_point;
12673 +static int debug_wrap;
12674 +static spinlock_t debug_lock;
12677 +void init_sm_misc(void)
12681 + spin_lock_init(&event_id_lock);
12682 + debug_buf = NULL;
12686 + spin_lock_init(&debug_lock);
12688 + sm_debug_setup(cman_config.sm_debug_size);
12691 +sm_node_t *sm_new_node(uint32_t nodeid)
12693 + struct kcl_cluster_node kclnode;
12697 + error = kcl_get_node_by_nodeid(nodeid, &kclnode);
12698 + SM_ASSERT(!error,);
12700 + SM_RETRY(node = (sm_node_t *) kmalloc(sizeof(sm_node_t), GFP_KERNEL),
12703 + memset(node, 0, sizeof(sm_node_t));
12704 + node->id = nodeid;
12705 + node->incarnation = kclnode.incarnation;
12709 +sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid)
12713 + list_for_each_entry(node, &sg->joining, list) {
12714 + if (node->id == nodeid)
12720 +sm_node_t *sm_find_member(uint32_t nodeid)
12724 + list_for_each_entry(node, &sm_members, list) {
12725 + if (node->id == nodeid)
12731 +uint32_t sm_new_local_id(int level)
12733 + uint32_t id = local_ids++;
12734 + uint8_t l = (uint8_t) level;
12736 + if (level > 0xFF)
12739 + if (id > 0x00FFFFFF)
12746 +int sm_id_to_level(uint32_t id)
12748 + uint8_t l = (id & 0xFF000000) >> 24;
12753 +void sm_set_event_id(int *id)
12755 + spin_lock(&event_id_lock);
12756 + *id = event_id++;
12757 + spin_unlock(&event_id_lock);
12760 +sm_group_t *sm_local_id_to_sg(int id)
12763 + int level = sm_id_to_level(id);
12764 + int found = FALSE;
12766 + down(&sm_sglock);
12768 + list_for_each_entry(sg, &sm_sg[level], list) {
12769 + if (sg->local_id == id) {
12780 +sm_group_t *sm_global_id_to_sg(int id)
12783 + int level = sm_id_to_level(id);
12784 + int found = FALSE;
12786 + down(&sm_sglock);
12788 + list_for_each_entry(sg, &sm_sg[level], list) {
12789 + if (sg->global_id == id) {
12800 +void sm_debug_log(sm_group_t *sg, const char *fmt, ...)
12803 + int i, n, size, len;
12804 + char buf[MAX_DEBUG_MSG_LEN+1];
12806 + spin_lock(&debug_lock);
12811 + size = MAX_DEBUG_MSG_LEN;
12812 + memset(buf, 0, size+1);
12814 + n = snprintf(buf, size, "%08x ", sg->global_id);
12817 + va_start(va, fmt);
12818 + vsnprintf(buf+n, size, fmt, va);
12821 + len = strlen(buf);
12822 + if (len > MAX_DEBUG_MSG_LEN-1)
12823 + len = MAX_DEBUG_MSG_LEN-1;
12825 + buf[len+1] = '\0';
12827 + for (i = 0; i < strlen(buf); i++) {
12828 + debug_buf[debug_point++] = buf[i];
12830 + if (debug_point == debug_size) {
12836 + spin_unlock(&debug_lock);
12839 +void sm_debug_setup(int size)
12841 + char *b = kmalloc(size, GFP_KERNEL);
12843 + spin_lock(&debug_lock);
12845 + kfree(debug_buf);
12847 + if (size > PAGE_SIZE)
12848 + size = PAGE_SIZE;
12849 + debug_size = size;
12853 + memset(debug_buf, 0, debug_size);
12854 + spin_unlock(&debug_lock);
12857 +#ifdef CONFIG_PROC_FS
12858 +static struct seq_operations sm_info_op;
12860 +struct sm_seq_info
12867 +int sm_debug_info(char *b, char **start, off_t offset, int length)
12871 + spin_lock(&debug_lock);
12873 + if (debug_wrap) {
12874 + for (i = debug_point; i < debug_size; i++)
12875 + n += sprintf(b + n, "%c", debug_buf[i]);
12877 + for (i = 0; i < debug_point; i++)
12878 + n += sprintf(b + n, "%c", debug_buf[i]);
12880 + spin_unlock(&debug_lock);
12887 +static sm_group_t *sm_walk(loff_t offset, int *rlevel)
12893 + down(&sm_sglock);
12895 + for (level = 0; level < SG_LEVELS; level++) {
12896 + list_for_each_entry(sg, &sm_sg[level], list) {
12897 + if (++n == offset)
12898 + goto walk_finish;
12911 +static void *sm_seq_start(struct seq_file *m, loff_t * pos)
12913 + struct sm_seq_info *ssi =
12914 + kmalloc(sizeof (struct sm_seq_info), GFP_KERNEL);
12923 + /* Print the header */
12926 + "Service Name GID LID State Code\n");
12931 +static void *sm_seq_next(struct seq_file *m, void *p, loff_t * pos)
12933 + struct sm_seq_info *ssi = p;
12935 + *pos = ++ssi->pos;
12937 + if ( !(ssi->sg = sm_walk(ssi->pos, &ssi->level)) )
12943 +/* Called from /proc when /proc/cluster/services is opened */
12944 +int sm_proc_open(struct inode *inode, struct file *file)
12946 + return seq_open(file, &sm_info_op);
12949 +static int sm_seq_show(struct seq_file *s, void *p)
12951 + struct sm_seq_info *ssi = p;
12955 + if (!ssi || !ssi->sg)
12959 + * Cluster Service
12962 + switch (ssi->level) {
12963 + case SERVICE_LEVEL_FENCE:
12964 + seq_printf(s, "Fence Domain: ");
12966 + case SERVICE_LEVEL_GDLM:
12967 + seq_printf(s, "DLM Lock Space: ");
12969 + case SERVICE_LEVEL_GFS:
12970 + seq_printf(s, "GFS Mount Group: ");
12972 + case SERVICE_LEVEL_USER:
12973 + seq_printf(s, "User: ");
12981 + seq_printf(s, "\"");
12982 + for (i = 0; i < ssi->sg->namelen; i++)
12983 + seq_printf(s, "%c", ssi->sg->name[i]);
12984 + seq_printf(s, "\"");
12986 + for (; i < MAX_SERVICE_NAME_LEN-1; i++)
12987 + seq_printf(s, " ");
12990 + * GID LID (sans level from top byte)
12993 + seq_printf(s, "%3u %3u ",
12994 + (ssi->sg->global_id & 0x00FFFFFF),
12995 + (ssi->sg->local_id & 0x00FFFFFF));
13001 + switch (ssi->sg->state) {
13003 + seq_printf(s, "none ");
13006 + seq_printf(s, "join ");
13009 + seq_printf(s, "run ");
13011 + case SGST_RECOVER:
13012 + seq_printf(s, "recover %u ",
13013 + ssi->sg->recover_state);
13015 + case SGST_UEVENT:
13016 + seq_printf(s, "update ");
13024 + if (test_bit(SGFL_SEVENT, &ssi->sg->flags))
13025 + seq_printf(s, "S");
13026 + if (test_bit(SGFL_UEVENT, &ssi->sg->flags))
13027 + seq_printf(s, "U");
13028 + if (test_bit(SGFL_NEED_RECOVERY, &ssi->sg->flags))
13029 + seq_printf(s, "N");
13031 + seq_printf(s, "-");
13033 + if (test_bit(SGFL_SEVENT, &ssi->sg->flags)
13034 + && ssi->sg->sevent) {
13035 + seq_printf(s, "%u,%lx,%u",
13036 + ssi->sg->sevent->se_state,
13037 + ssi->sg->sevent->se_flags,
13038 + ssi->sg->sevent->se_reply_count);
13041 + if (test_bit(SGFL_UEVENT, &ssi->sg->flags)) {
13042 + seq_printf(s, "%u,%lx,%u",
13043 + ssi->sg->uevent.ue_state,
13044 + ssi->sg->uevent.ue_flags,
13045 + ssi->sg->uevent.ue_nodeid);
13048 + seq_printf(s, "\n");
13056 + seq_printf(s, "[");
13058 + list_for_each_entry(node, &ssi->sg->memb, list) {
13059 + if (i && !(i % 24))
13060 + seq_printf(s, "\n");
13063 + seq_printf(s, " ");
13065 + seq_printf(s, "%u", node->id);
13069 + seq_printf(s, "]\n\n");
13074 +static void sm_seq_stop(struct seq_file *m, void *p)
13080 +static struct seq_operations sm_info_op = {
13081 + .start = sm_seq_start,
13082 + .next = sm_seq_next,
13083 + .stop = sm_seq_stop,
13084 + .show = sm_seq_show
13089 diff -urN linux-orig/cluster/cman/sm_misc.h linux-patched/cluster/cman/sm_misc.h
13090 --- linux-orig/cluster/cman/sm_misc.h 1970-01-01 07:30:00.000000000 +0730
13091 +++ linux-patched/cluster/cman/sm_misc.h 2004-11-03 11:37:37.000000000 +0800
13093 +/******************************************************************************
13094 +*******************************************************************************
13096 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13097 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13099 +** This copyrighted material is made available to anyone wishing to use,
13100 +** modify, copy, or redistribute it subject to the terms and conditions
13101 +** of the GNU General Public License v.2.
13103 +*******************************************************************************
13104 +******************************************************************************/
13106 +#ifndef __SM_MISC_DOT_H__
13107 +#define __SM_MISC_DOT_H__
13109 +void init_sm_misc(void);
13110 +sm_node_t *sm_new_node(uint32_t nodeid);
13111 +sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid);
13112 +sm_node_t *sm_find_member(uint32_t nodeid);
13113 +uint32_t sm_new_local_id(int level);
13114 +int sm_id_to_level(uint32_t id);
13115 +void sm_set_event_id(int *id);
13116 +sm_group_t *sm_local_id_to_sg(int id);
13117 +sm_group_t *sm_global_id_to_sg(int id);
13118 +void sm_debug_log(sm_group_t *sg, const char *fmt, ...);
13119 +void sm_debug_setup(int size);
13122 diff -urN linux-orig/cluster/cman/sm_recover.c linux-patched/cluster/cman/sm_recover.c
13123 --- linux-orig/cluster/cman/sm_recover.c 1970-01-01 07:30:00.000000000 +0730
13124 +++ linux-patched/cluster/cman/sm_recover.c 2004-11-03 11:37:37.000000000 +0800
13126 +/******************************************************************************
13127 +*******************************************************************************
13129 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13130 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13132 +** This copyrighted material is made available to anyone wishing to use,
13133 +** modify, copy, or redistribute it subject to the terms and conditions
13134 +** of the GNU General Public License v.2.
13136 +*******************************************************************************
13137 +******************************************************************************/
13140 +#include "config.h"
13143 + * A collection of sg's which need to be recovered due to a failed member.
13144 + * These sg's are recovered in order of level. An sg subject to cascading
13145 + * failures is moved from one of these structs to a newer one.
13149 + struct list_head list; /* list of current re's */
13150 + struct list_head sgs[SG_LEVELS]; /* lists of sg's by level */
13151 + int event_id; /* event id */
13154 +typedef struct recover recover_t;
13157 +extern uint32_t * sm_new_nodeids;
13158 +extern int sm_quorum, sm_quorum_next;
13159 +extern uint32_t sm_our_nodeid;
13160 +extern struct list_head sm_members;
13161 +extern int sm_member_count;
13162 +static struct list_head recoveries;
13165 +void init_recovery(void)
13167 + INIT_LIST_HEAD(&recoveries);
13171 + * This is the first thing called when a change is announced in cluster
13172 + * membership. Nodes are marked as being a CLUSTER_MEMBER or not. SM adds new
13173 + * nodes to its sm_members list which it's not seen before. Nodes which were
13174 + * alive but are now gone are marked as "need recovery".
13176 + * The "need recovery" status of nodes is propagated to the node's SG's in
13177 + * mark_effected_sgs. The effected SG's are themselves marked as needing
13178 + * recovery and in new_recovery the dead nodes are removed from the SG's
13179 + * individual member lists. The "need recovery" status of nodes is cleared in
13180 + * adjust_members_done().
13183 +static int adjust_members(void)
13186 + struct kcl_cluster_node knode;
13187 + int i, error, num_nodes, sub = 0, add = 0, found;
13190 + * Get list of current members from cnxman
13193 + memset(sm_new_nodeids, 0, cman_config.max_nodes * sizeof(uint32_t));
13194 + num_nodes = kcl_get_member_ids(sm_new_nodeids, cman_config.max_nodes);
13197 + * Determine who's gone
13200 + list_for_each_entry(node, &sm_members, list) {
13202 + for (i = 0; i < num_nodes; i++) {
13203 + if (node->id == sm_new_nodeids[i]) {
13205 + sm_new_nodeids[i] = 0;
13211 + error = kcl_get_node_by_nodeid(node->id, &knode);
13212 + SM_ASSERT(!error, printk("error=%d\n", error););
13214 + if (!test_bit(SNFL_CLUSTER_MEMBER, &node->flags)) {
13215 + /* former member is back */
13216 + set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
13217 + node->incarnation = knode.incarnation;
13220 + /* current member is still alive - if the
13221 + * incarnation number is different it died and
13222 + * returned between checks */
13223 + if (node->incarnation != knode.incarnation) {
13224 + set_bit(SNFL_NEED_RECOVERY,
13226 + node->incarnation = knode.incarnation;
13231 + /* current member has died */
13232 + if (test_and_clear_bit(SNFL_CLUSTER_MEMBER,
13234 + set_bit(SNFL_NEED_RECOVERY, &node->flags);
13241 + * Look for new nodes
13244 + for (i = 0; i < num_nodes; i++) {
13245 + if (sm_new_nodeids[i]) {
13246 + node = sm_new_node(sm_new_nodeids[i]);
13247 + set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
13249 + list_add_tail(&node->list, &sm_members);
13250 + sm_member_count++;
13255 + * Get our own nodeid
13258 + if (!sm_our_nodeid) {
13259 + list_for_each_entry(node, &sm_members, list) {
13260 + error = kcl_get_node_by_nodeid(node->id, &knode);
13261 + SM_ASSERT(!error, printk("error=%d\n", error););
13264 + sm_our_nodeid = knode.node_id;
13274 + * Given some number of dead nodes, flag SG's the dead nodes were part of.
13275 + * This requires a number of loops because each node structure does not keep a
13276 + * list of SG's it's in.
13279 +static int mark_effected_sgs(void)
13282 + sm_node_t *node, *sgnode;
13283 + uint32_t dead_id;
13284 + int i, effected = 0;
13286 + down(&sm_sglock);
13288 + list_for_each_entry(node, &sm_members, list) {
13289 + if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
13292 + dead_id = node->id;
13294 + for (i = 0; i < SG_LEVELS; i++) {
13295 + list_for_each_entry(sg, &sm_sg[i], list) {
13296 + /* check if dead node is among sg's members */
13297 + list_for_each_entry(sgnode, &sg->memb, list) {
13298 + if (sgnode->id == dead_id) {
13299 + set_bit(SGFL_NEED_RECOVERY,
13313 +static recover_t *alloc_recover(void)
13318 + SM_RETRY(rev = kmalloc(sizeof(recover_t), GFP_KERNEL), rev);
13320 + memset(rev, 0, sizeof(recover_t));
13322 + sm_set_event_id(&rev->event_id);
13324 + for (i = 0; i < SG_LEVELS; i++) {
13325 + INIT_LIST_HEAD(&rev->sgs[i]);
13332 + * An in-progress revent re-start for an SG is interrupted by another node
13333 + * failure in the SG. Cancel an outstanding barrier if there is one. The SG
13334 + * will be moved to the new revent and re-started as part of that.
13337 +static void cancel_prev_recovery(sm_group_t *sg)
13341 + if (sg->recover_state == RECOVER_BARRIERWAIT) {
13342 + error = kcl_barrier_cancel(sg->recover_barrier);
13344 + log_error(sg, "cancel_prev_recovery: error %d", error);
13348 +static void pre_recover_sg(sm_group_t *sg, recover_t *rev)
13350 + if (sg->state == SGST_RECOVER) {
13351 + cancel_prev_recovery(sg);
13352 + list_del(&sg->recover_list);
13355 + sg->ops->stop(sg->service_data);
13356 + sg->state = SGST_RECOVER;
13357 + sg->recover_state = RECOVER_NONE;
13358 + sg->recover_data = rev;
13359 + list_add(&sg->recover_list, &rev->sgs[sg->level]);
13363 + * When adjust_members finds that some nodes are dead and mark_effected_sgs
13364 + * finds that some SG's are effected by departed nodes, this is called to
13365 + * collect together the SG's which need to be recovered. An revent (recovery
13366 + * event) is the group of effected SG's.
13369 +static int new_recovery(void)
13373 + sm_node_t *node, *sgnode, *safe;
13376 + rev = alloc_recover();
13377 + list_add_tail(&rev->list, &recoveries);
13379 + down(&sm_sglock);
13382 + * Stop effected SG's and add them to the rev
13385 + for (i = 0; i < SG_LEVELS; i++) {
13386 + list_for_each_entry(sg, &sm_sg[i], list) {
13387 + if (test_and_clear_bit(SGFL_NEED_RECOVERY, &sg->flags)){
13388 + if (sg->state == SGST_JOIN)
13390 + pre_recover_sg(sg, rev);
13396 + * For an SG needing recovery, remove dead nodes from sg->memb list
13399 + for (i = 0; i < SG_LEVELS; i++) {
13400 + list_for_each_entry(sg, &rev->sgs[i], recover_list) {
13402 + /* Remove dead members from SG's member list */
13403 + list_for_each_entry_safe(sgnode, safe, &sg->memb, list){
13405 + node = sm_find_member(sgnode->id);
13406 + SM_ASSERT(node, printk("id %u\n", sgnode->id););
13408 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags)){
13409 + list_del(&sgnode->list);
13411 + sg->memb_count--;
13412 + log_debug(sg, "remove node %u count %d",
13413 + sgnode->id, sg->memb_count);
13420 + rev->cur_level = 0;
13425 + * The NEED_RECOVERY bit on MML nodes is set in adjust_members() and is used in
13426 + * mark_effected_sgs() and add_revent(). After that, we're done using the bit
13427 + * and we clear it here.
13430 +static void adjust_members_done(void)
13434 + list_for_each_entry(node, &sm_members, list)
13435 + clear_bit(SNFL_NEED_RECOVERY, &node->flags);
13439 + * Start the service of the given SG. The service must be given an array of
13440 + * nodeids specifying the new sg membership. The service is responsible to
13441 + * free this chunk of memory when done with it.
13444 +static void start_sg(sm_group_t *sg, uint32_t event_id)
13450 + SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
13453 + list_for_each_entry(node, &sg->memb, list)
13454 + memb[count++] = node->id;
13456 + sg->ops->start(sg->service_data, memb, count, event_id,
13457 + SERVICE_NODE_FAILED);
13460 +static void recovery_barrier(sm_group_t *sg)
13462 + char bname[MAX_BARRIER_NAME_LEN];
13465 + memset(bname, 0, MAX_BARRIER_NAME_LEN);
13467 + /* bypass the barrier if we're the only member */
13468 + if (sg->memb_count == 1) {
13469 + process_recovery_barrier(sg, 0);
13473 + len = snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.RECOV.%u",
13474 + sg->global_id, sg->recover_stop, sg->memb_count);
13476 + /* We save this barrier name so we can cancel it if needed. */
13477 + memset(sg->recover_barrier, 0, MAX_BARRIER_NAME_LEN);
13478 + memcpy(sg->recover_barrier, bname, len);
13480 + error = sm_barrier(bname, sg->memb_count, SM_BARRIER_RECOVERY);
13482 + log_error(sg, "recovery_barrier error %d: %s", error, bname);
13485 +static void recover_sg(sm_group_t *sg, int event_id)
13487 + log_debug(sg, "recover state %d", sg->recover_state);
13489 + switch (sg->recover_state) {
13491 + case RECOVER_NONE:
13492 + /* must wait for recovery to stop sg on all nodes */
13493 + sg->recover_state = RECOVER_BARRIERWAIT;
13494 + sg->recover_stop = 0;
13495 + recovery_barrier(sg);
13498 + case RECOVER_BARRIERWAIT:
13501 + case RECOVER_STOP:
13502 + /* barrier callback sets state STOP */
13503 + sg->recover_stop = 1;
13504 + sg->recover_state = RECOVER_START;
13505 + start_sg(sg, event_id);
13508 + case RECOVER_START:
13511 + case RECOVER_STARTDONE:
13512 + /* service callback sets state STARTDONE */
13513 + sg->recover_state = RECOVER_BARRIERWAIT;
13514 + recovery_barrier(sg);
13517 + case RECOVER_BARRIERDONE:
13518 + /* barrier callback sets state BARRIERDONE */
13519 + sg->ops->finish(sg->service_data, event_id);
13520 + list_del(&sg->recover_list);
13521 + sg->recover_state = RECOVER_NONE;
13522 + sg->state = SGST_RUN;
13524 + /* Continue a previous, interrupted attempt to leave the sg */
13525 + if (sg->sevent) {
13526 + sm_sevent_t *sev = sg->sevent;
13527 + log_debug(sg, "restart leave %lx", sev->se_flags);
13528 + clear_bit(SEFL_DELAY_RECOVERY, &sev->se_flags);
13529 + set_bit(SEFL_CHECK, &sev->se_flags);
13530 + wake_serviced(DO_JOINLEAVE);
13535 + log_error(sg, "invalid recover_state %u", sg->recover_state);
13539 +static void recover_level(recover_t *rev, int level)
13541 + sm_group_t *sg, *safe;
13543 + list_for_each_entry_safe(sg, safe, &rev->sgs[level], recover_list)
13544 + recover_sg(sg, rev->event_id);
13547 +static void recover_levels(recover_t *rev)
13550 + recover_level(rev, rev->cur_level);
13552 + if (list_empty(&rev->sgs[rev->cur_level])) {
13553 + if (rev->cur_level == SG_LEVELS - 1) {
13554 + list_del(&rev->list);
13558 + rev->cur_level++;
13566 + * Called by SM thread when the cluster is quorate. It restarts
13567 + * SG's that were stopped in new_recovery() due to a member death.
13568 + * It waits for all SG's at level N to complete restart before
13569 + * restarting SG's at level N+1.
13572 +void process_recoveries(void)
13574 + recover_t *rev, *safe;
13576 + down(&sm_sglock);
13577 + list_for_each_entry_safe(rev, safe, &recoveries, list)
13578 + recover_levels(rev);
13583 + * The cnxman membership has changed. Check if there's still quorum and
13584 + * whether any nodes have died. If nodes have died, initiate recovery on any
13585 + * SG's they were in. This begins immediately if the cluster remains quorate;
13586 + * if not this waits until the cluster regains quorum.
13589 +void process_nodechange(void)
13591 + int gone, effected;
13593 + if ((sm_quorum = sm_quorum_next))
13594 + wake_serviced(DO_RUN);
13596 + gone = adjust_members();
13598 + effected = mark_effected_sgs();
13600 + backout_sevents();
13601 + cancel_uevents(&effected);
13603 + if (effected > 0) {
13605 + wake_serviced(DO_RECOVERIES);
13608 + adjust_members_done();
13611 +int check_recovery(sm_group_t *sg, int event_id)
13613 + if (sg->state == SGST_RECOVER) {
13614 + recover_t *rev = (recover_t *) sg->recover_data;
13615 + if (rev && rev->event_id == event_id)
13621 +void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid)
13626 + sg = sm_global_id_to_sg(smsg->ms_global_sgid);
13628 + log_print("process_recover_msg: unknown sg id %x",
13629 + smsg->ms_global_sgid);
13633 + /* we already know about the recovery and can ignore the msg */
13634 + if (sg->state == SGST_RECOVER)
13637 + if (test_bit(SGFL_UEVENT, &sg->flags)) {
13638 + /* we will initiate recovery on our own if we know about the
13639 + uevent so we can ignore this */
13640 + log_debug(sg, "process_recover_msg: ignore from %u", nodeid);
13644 + log_debug(sg, "recovery initiated by msg from %u", nodeid);
13645 + rev = alloc_recover();
13646 + list_add_tail(&rev->list, &recoveries);
13647 + pre_recover_sg(sg, rev);
13648 + wake_serviced(DO_RECOVERIES);
13650 diff -urN linux-orig/cluster/cman/sm_recover.h linux-patched/cluster/cman/sm_recover.h
13651 --- linux-orig/cluster/cman/sm_recover.h 1970-01-01 07:30:00.000000000 +0730
13652 +++ linux-patched/cluster/cman/sm_recover.h 2004-11-03 11:37:37.000000000 +0800
13654 +/******************************************************************************
13655 +*******************************************************************************
13657 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13658 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13660 +** This copyrighted material is made available to anyone wishing to use,
13661 +** modify, copy, or redistribute it subject to the terms and conditions
13662 +** of the GNU General Public License v.2.
13664 +*******************************************************************************
13665 +******************************************************************************/
13667 +#ifndef __SM_RECOVER_DOT_H__
13668 +#define __SM_RECOVER_DOT_H__
13670 +void init_recovery(void);
13671 +void process_recoveries(void);
13672 +void process_nodechange(void);
13673 +int check_recovery(sm_group_t *sg, int event_id);
13674 +void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid);
13677 diff -urN linux-orig/cluster/cman/sm_services.c linux-patched/cluster/cman/sm_services.c
13678 --- linux-orig/cluster/cman/sm_services.c 1970-01-01 07:30:00.000000000 +0730
13679 +++ linux-patched/cluster/cman/sm_services.c 2004-11-03 11:37:37.000000000 +0800
13681 +/******************************************************************************
13682 +*******************************************************************************
13684 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13685 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13687 +** This copyrighted material is made available to anyone wishing to use,
13688 +** modify, copy, or redistribute it subject to the terms and conditions
13689 +** of the GNU General Public License v.2.
13691 +*******************************************************************************
13692 +******************************************************************************/
13696 +static struct list_head callbacks;
13697 +static spinlock_t callback_lock;
13698 +static struct list_head sg_registered[SG_LEVELS];
13701 + * These are the functions to register, join, leave, unregister, callback
13702 + * with/to the sm.
13706 + struct list_head list;
13707 + uint32_t local_id;
13710 +typedef struct sc_entry sc_entry_t;
13712 +void init_services(void)
13716 + INIT_LIST_HEAD(&callbacks);
13717 + spin_lock_init(&callback_lock);
13719 + for (i = 0; i < SG_LEVELS; i++) {
13720 + INIT_LIST_HEAD(&sm_sg[i]);
13721 + INIT_LIST_HEAD(&sg_registered[i]);
13723 + init_MUTEX(&sm_sglock);
13726 +/* Context: service */
13728 +int kcl_register_service(char *name, int namelen, int level,
13729 + struct kcl_service_ops *ops, int unique,
13730 + void *servicedata, uint32_t *service_id)
13733 + int found = FALSE;
13734 + int error = -EINVAL;
13736 + if (level > SG_LEVELS - 1)
13739 + if (namelen > MAX_SERVICE_NAME_LEN)
13742 + error = kcl_addref_cluster();
13746 + down(&sm_sglock);
13748 + list_for_each_entry(sg, &sm_sg[level], list) {
13749 + if ((sg->namelen == namelen) &&
13750 + (!strncmp(sg->name, name, namelen))) {
13756 + list_for_each_entry(sg, &sg_registered[level], list) {
13757 + if ((sg->namelen == namelen) &&
13758 + (!strncmp(sg->name, name, namelen))) {
13766 + if (found && unique) {
13768 + goto fail_unlock;
13776 + sg = (sm_group_t *) kmalloc(sizeof(sm_group_t) + namelen, GFP_KERNEL);
13779 + goto fail_unlock;
13781 + memset(sg, 0, sizeof(sm_group_t) + namelen);
13783 + sg->refcount = 1;
13784 + sg->service_data = servicedata;
13786 + sg->level = level;
13787 + sg->namelen = namelen;
13788 + memcpy(sg->name, name, namelen);
13789 + sg->local_id = sm_new_local_id(level);
13790 + sg->state = SGST_NONE;
13791 + INIT_LIST_HEAD(&sg->memb);
13792 + INIT_LIST_HEAD(&sg->joining);
13793 + init_completion(&sg->event_comp);
13795 + list_add_tail(&sg->list, &sg_registered[level]);
13798 + *service_id = sg->local_id;
13804 + kcl_releaseref_cluster();
13809 +/* Context: service */
13811 +void kcl_unregister_service(uint32_t local_id)
13814 + int level = sm_id_to_level(local_id);
13816 + down(&sm_sglock);
13818 + list_for_each_entry(sg, &sg_registered[level], list) {
13819 + if (sg->local_id == local_id) {
13820 + SM_ASSERT(sg->refcount,);
13823 + if (!sg->refcount) {
13824 + list_del(&sg->list);
13827 + kcl_releaseref_cluster();
13834 +/* Context: service */
13836 +int kcl_join_service(uint32_t local_id)
13839 + sm_sevent_t *sev;
13840 + int level = sm_id_to_level(local_id);
13841 + int error, found = FALSE;
13843 + down(&sm_sglock);
13845 + list_for_each_entry(sg, &sg_registered[level], list) {
13846 + if (sg->local_id == local_id) {
13858 + if (sg->state != SGST_NONE) {
13864 + sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
13871 + memset(sev, 0, sizeof (sm_sevent_t));
13872 + sev->se_state = SEST_JOIN_BEGIN;
13873 + sm_set_event_id(&sev->se_id);
13875 + sg->sevent = sev;
13876 + sg->state = SGST_JOIN;
13877 + set_bit(SGFL_SEVENT, &sg->flags);
13878 + list_del(&sg->list);
13879 + list_add_tail(&sg->list, &sm_sg[sg->level]);
13884 + * The join is a service event which will be processed asynchronously.
13887 + new_joinleave(sev);
13888 + wait_for_completion(&sg->event_comp);
13895 +/* Context: service */
13897 +int kcl_leave_service(uint32_t local_id)
13899 + sm_group_t *sg = NULL;
13900 + sm_sevent_t *sev;
13904 + sg = sm_local_id_to_sg(local_id);
13908 + /* sg was never joined */
13910 + if (sg->state == SGST_NONE)
13913 + down(&sm_sglock);
13915 + /* may still be joining */
13916 + if (test_and_set_bit(SGFL_SEVENT, &sg->flags)) {
13922 + sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
13929 + memset(sev, 0, sizeof (sm_sevent_t));
13930 + sev->se_state = SEST_LEAVE_BEGIN;
13931 + sm_set_event_id(&sev->se_id);
13932 + set_bit(SEFL_LEAVE, &sev->se_flags);
13934 + sg->sevent = sev;
13938 + new_joinleave(sev);
13939 + wait_for_completion(&sg->event_comp);
13942 + down(&sm_sglock);
13943 + list_del(&sg->list);
13944 + list_add_tail(&sg->list, &sg_registered[sg->level]);
13951 +static void process_callback(uint32_t local_id, int event_id)
13954 + sm_sevent_t *sev;
13955 + sm_uevent_t *uev;
13957 + sg = sm_local_id_to_sg(local_id);
13961 + if (sg->state == SGST_RECOVER) {
13962 + if (!check_recovery(sg, event_id)) {
13963 + log_error(sg, "process_callback invalid recover "
13964 + "event id %d", event_id);
13968 + if (sg->recover_state == RECOVER_START)
13969 + sg->recover_state = RECOVER_STARTDONE;
13971 + log_error(sg, "process_callback recover state %u",
13972 + sg->recover_state);
13973 + wake_serviced(DO_RECOVERIES);
13976 + else if (test_bit(SGFL_SEVENT, &sg->flags) && sg->sevent &&
13977 + (sg->sevent->se_id == event_id)) {
13978 + sev = sg->sevent;
13980 + if (test_and_clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags) &&
13981 + (sev->se_state == SEST_JSTART_SERVICEWAIT))
13982 + sev->se_state = SEST_JSTART_SERVICEDONE;
13984 + set_bit(SEFL_CHECK, &sev->se_flags);
13985 + wake_serviced(DO_JOINLEAVE);
13988 + else if (test_bit(SGFL_UEVENT, &sg->flags) &&
13989 + (sg->uevent.ue_id == event_id)) {
13990 + uev = &sg->uevent;
13992 + if (test_and_clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags)) {
13993 + if (uev->ue_state == UEST_JSTART_SERVICEWAIT)
13994 + uev->ue_state = UEST_JSTART_SERVICEDONE;
13995 + else if (uev->ue_state == UEST_LSTART_SERVICEWAIT)
13996 + uev->ue_state = UEST_LSTART_SERVICEDONE;
13998 + set_bit(UEFL_CHECK, &uev->ue_flags);
13999 + wake_serviced(DO_MEMBERSHIP);
14003 + log_error(sg, "ignoring service callback id=%x event=%u",
14004 + local_id, event_id);
14007 +void process_callbacks(void)
14014 + spin_lock(&callback_lock);
14015 + if (!list_empty(&callbacks)) {
14016 + se = list_entry(callbacks.next, sc_entry_t, list);
14017 + list_del(&se->list);
14019 + spin_unlock(&callback_lock);
14023 + process_callback(se->local_id, se->event_id);
14029 +/* Context: service */
14031 +void kcl_start_done(uint32_t local_id, int event_id)
14035 + SM_RETRY(se = kmalloc(sizeof(sc_entry_t), GFP_KERNEL), se);
14037 + se->local_id = local_id;
14038 + se->event_id = event_id;
14040 + spin_lock(&callback_lock);
14041 + list_add_tail(&se->list, &callbacks);
14042 + spin_unlock(&callback_lock);
14044 + wake_serviced(DO_CALLBACKS);
14047 +/* Context: service */
14049 +void kcl_global_service_id(uint32_t local_id, uint32_t *global_id)
14051 + sm_group_t *sg = sm_local_id_to_sg(local_id);
14054 + log_print("kcl_global_service_id: can't find %x", local_id);
14056 + *global_id = sg->global_id;
14059 +static void copy_to_service(sm_group_t *sg, struct kcl_service *s)
14061 + s->level = sg->level;
14062 + s->local_id = sg->local_id;
14063 + s->global_id = sg->global_id;
14064 + s->node_count = sg->memb_count;
14065 + strcpy(s->name, sg->name);
14068 +int kcl_get_services(struct list_head *head, int level)
14071 + struct kcl_service *s;
14072 + int error = -ENOMEM, count = 0;
14074 + down(&sm_sglock);
14076 + list_for_each_entry(sg, &sg_registered[level], list) {
14078 + s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
14081 + copy_to_service(sg, s);
14082 + list_add(&s->list, head);
14087 + list_for_each_entry(sg, &sm_sg[level], list) {
14089 + s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
14092 + copy_to_service(sg, s);
14093 + list_add(&s->list, head);
14104 +/* These three global variables are listed in extern form in sm.h. */
14105 +struct list_head sm_sg[SG_LEVELS];
14106 +struct semaphore sm_sglock;
14107 diff -urN linux-orig/cluster/cman/sm_services.h linux-patched/cluster/cman/sm_services.h
14108 --- linux-orig/cluster/cman/sm_services.h 1970-01-01 07:30:00.000000000 +0730
14109 +++ linux-patched/cluster/cman/sm_services.h 2004-11-03 11:37:37.000000000 +0800
14111 +/******************************************************************************
14112 +*******************************************************************************
14114 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14115 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14117 +** This copyrighted material is made available to anyone wishing to use,
14118 +** modify, copy, or redistribute it subject to the terms and conditions
14119 +** of the GNU General Public License v.2.
14121 +*******************************************************************************
14122 +******************************************************************************/
14124 +#ifndef __SM_SERVICES_DOT_H__
14125 +#define __SM_SERVICES_DOT_H__
14127 +void init_services(void);
14128 +void process_callbacks(void);
14131 diff -urN linux-orig/cluster/cman/sm_user.c linux-patched/cluster/cman/sm_user.c
14132 --- linux-orig/cluster/cman/sm_user.c 1970-01-01 07:30:00.000000000 +0730
14133 +++ linux-patched/cluster/cman/sm_user.c 2004-11-03 11:37:37.000000000 +0800
14135 +/******************************************************************************
14136 +*******************************************************************************
14138 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14139 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14141 +** This copyrighted material is made available to anyone wishing to use,
14142 +** modify, copy, or redistribute it subject to the terms and conditions
14143 +** of the GNU General Public License v.2.
14145 +*******************************************************************************
14146 +******************************************************************************/
14149 +#include "cnxman-private.h"
14151 +void copy_to_usernode(struct cluster_node *node, struct cl_cluster_node *unode);
14153 +#define UST_REGISTER 1
14154 +#define UST_UNREGISTER 2
14155 +#define UST_JOIN 3
14156 +#define UST_LEAVE 4
14157 +#define UST_JOINED 5
14160 + struct list_head list;
14161 + service_event_t type;
14162 + service_start_t start_type;
14163 + unsigned int event_id;
14164 + unsigned int last_stop;
14165 + unsigned int last_start;
14166 + unsigned int last_finish;
14167 + unsigned int node_count;
14168 + uint32_t * nodeids;
14170 +typedef struct event event_t;
14172 +struct user_service {
14173 + uint32_t local_id;
14176 + struct socket * sock;
14179 + struct semaphore lock;
14180 + struct list_head events;
14181 + spinlock_t event_lock;
14182 + unsigned int last_stop;
14183 + unsigned int last_start;
14184 + unsigned int last_finish;
14185 + unsigned int need_startdone;
14186 + unsigned int node_count;
14187 + uint32_t * nodeids;
14189 + char name[MAX_SERVICE_NAME_LEN];
14191 +typedef struct user_service user_service_t;
14194 +static void add_event(user_service_t *us, event_t *ev)
14196 + spin_lock(&us->event_lock);
14197 + list_add_tail(&ev->list, &us->events);
14199 + switch(ev->type) {
14200 + case SERVICE_EVENT_STOP:
14201 + us->last_stop = us->last_start;
14203 + case SERVICE_EVENT_START:
14204 + us->last_start = ev->event_id;
14206 + case SERVICE_EVENT_FINISH:
14207 + us->last_finish = ev->event_id;
14209 + case SERVICE_EVENT_LEAVEDONE:
14212 + spin_unlock(&us->event_lock);
14215 +static event_t *get_event(user_service_t *us)
14217 + event_t *ev = NULL;
14219 + spin_lock(&us->event_lock);
14220 + if (!list_empty(&us->events)) {
14221 + ev = list_entry(us->events.next, event_t, list);
14222 + ev->last_stop = us->last_stop;
14223 + ev->last_start = us->last_start;
14224 + ev->last_finish = us->last_finish;
14226 + spin_unlock(&us->event_lock);
14230 +static void del_event(user_service_t *us, event_t *ev)
14232 + spin_lock(&us->event_lock);
14233 + list_del(&ev->list);
14234 + spin_unlock(&us->event_lock);
14237 +static event_t *alloc_event(void)
14240 + SM_RETRY(ev = (event_t *) kmalloc(sizeof(event_t), GFP_KERNEL), ev);
14241 + memset(ev, 0, sizeof(event_t));
14245 +/* us->lock must be held before calling */
14246 +static void user_notify(user_service_t *us)
14249 + queue_oob_skb(us->sock, CLUSTER_OOB_MSG_SERVICEEVENT);
14250 + if (us->pid && us->signal)
14251 + kill_proc(us->pid, us->signal, 0);
14254 +static service_start_t start_type(int type)
14257 + case SERVICE_NODE_FAILED:
14258 + return SERVICE_START_FAILED;
14259 + case SERVICE_NODE_JOIN:
14260 + return SERVICE_START_JOIN;
14261 + case SERVICE_NODE_LEAVE:
14262 + return SERVICE_START_LEAVE;
14267 +static int user_stop(void *servicedata)
14269 + user_service_t *us = (user_service_t *) servicedata;
14276 + ev = alloc_event();
14277 + ev->type = SERVICE_EVENT_STOP;
14279 + add_event(us, ev);
14286 +static int user_start(void *servicedata, uint32_t *nodeids, int count,
14287 + int event_id, int type)
14289 + user_service_t *us = (user_service_t *) servicedata;
14294 + kcl_start_done(us->local_id, event_id);
14298 + us->need_startdone = event_id;
14300 + ev = alloc_event();
14301 + ev->type = SERVICE_EVENT_START;
14302 + ev->node_count = count;
14303 + ev->start_type = start_type(type);
14304 + ev->event_id = event_id;
14305 + ev->nodeids = nodeids;
14307 + add_event(us, ev);
14314 +static void user_finish(void *servicedata, int event_id)
14316 + user_service_t *us = (user_service_t *) servicedata;
14323 + ev = alloc_event();
14324 + ev->type = SERVICE_EVENT_FINISH;
14325 + ev->event_id = event_id;
14327 + add_event(us, ev);
14333 +struct kcl_service_ops user_service_ops = {
14334 + .stop = user_stop,
14335 + .start = user_start,
14336 + .finish = user_finish
14339 +static int user_register(char *u_name, user_service_t **us_data)
14341 + user_service_t *us;
14342 + char name[MAX_SERVICE_NAME_LEN+1];
14345 + memset(name, 0, MAX_SERVICE_NAME_LEN+1);
14347 + if (copy_from_user(&name, u_name, MAX_SERVICE_NAME_LEN))
14350 + len = strlen(name);
14351 + if (len > MAX_SERVICE_NAME_LEN)
14352 + return -ENAMETOOLONG;
14356 + us = kmalloc(sizeof(user_service_t), GFP_KERNEL);
14359 + memset(us, 0, sizeof(user_service_t));
14360 + us->nodeids = NULL;
14361 + INIT_LIST_HEAD(&us->events);
14362 + spin_lock_init(&us->event_lock);
14363 + init_MUTEX(&us->lock);
14364 + us->name_len = len;
14365 + memcpy(us->name, name, len);
14367 + error = kcl_register_service(name, len, SERVICE_LEVEL_USER,
14368 + &user_service_ops, TRUE, (void *) us,
14378 +static void user_unregister(user_service_t *us)
14382 + kcl_unregister_service(us->local_id);
14385 + kfree(us->nodeids);
14387 + while ((ev = get_event(us))) {
14388 + del_event(us, ev);
14390 + kfree(ev->nodeids);
14395 +static int user_join_async(void *arg)
14397 + user_service_t *us = arg;
14398 + int user_gone = 0;
14400 + daemonize("cman_userjoin");
14402 + kcl_join_service(us->local_id);
14405 + us->state = UST_JOINED;
14408 + if (us->need_startdone)
14409 + kcl_start_done(us->local_id, us->need_startdone);
14415 + kcl_leave_service(us->local_id);
14416 + user_unregister(us);
14422 +static int user_leave_async(void *arg)
14424 + user_service_t *us = arg;
14426 + daemonize("cman_userleave");
14428 + kcl_leave_service(us->local_id);
14433 + user_unregister(us);
14436 + event_t *ev = alloc_event();
14437 + ev->type = SERVICE_EVENT_LEAVEDONE;
14438 + add_event(us, ev);
14446 +static int user_join(user_service_t *us, int wait)
14451 + error = kcl_join_service(us->local_id);
14452 + us->state = UST_JOINED;
14456 + kernel_thread(user_join_async, us, 0);
14462 +static void user_leave(user_service_t *us, int wait)
14465 + kcl_leave_service(us->local_id);
14468 + kernel_thread(user_leave_async, us, 0);
14472 +static int user_start_done(user_service_t *us, unsigned int event_id)
14474 + if (!us->need_startdone)
14476 + if (us->need_startdone == event_id)
14477 + us->need_startdone = 0;
14478 + kcl_start_done(us->local_id, event_id);
14482 +static void user_set_signal(user_service_t *us, int signal)
14484 + us->pid = current->pid;
14485 + us->signal = signal;
14488 +static int user_get_event(user_service_t *us,
14489 + struct cl_service_event *user_event)
14492 + struct cl_service_event event;
14494 + ev = get_event(us);
14498 + event.type = ev->type;
14499 + event.start_type = ev->start_type;
14500 + event.event_id = ev->event_id;
14501 + event.last_stop = ev->last_stop;
14502 + event.last_start = ev->last_start;
14503 + event.last_finish = ev->last_finish;
14504 + event.node_count = ev->node_count;
14506 + if (copy_to_user(user_event, &event, sizeof(struct cl_service_event)))
14509 + del_event(us, ev);
14511 + if (ev->type == SERVICE_EVENT_START) {
14513 + kfree(us->nodeids);
14514 + us->nodeids = ev->nodeids;
14515 + us->node_count = ev->node_count;
14522 +static int user_get_members(user_service_t *us,
14523 + struct cl_cluster_nodelist *u_nodelist)
14525 + struct cl_cluster_nodelist user_nodelist;
14526 + struct cl_cluster_node user_node, *u_node;
14527 + struct cluster_node *node;
14529 + int num_nodes = 0;
14532 + return us->node_count;
14534 + if (copy_from_user(&user_nodelist, (void __user *) u_nodelist,
14535 + sizeof(struct cl_cluster_nodelist)))
14538 + if (user_nodelist.max_members < us->node_count)
14541 + u_node = user_nodelist.nodes;
14543 + for (i = 0; i < us->node_count; i++) {
14544 + node = find_node_by_nodeid(us->nodeids[i]);
14548 + copy_to_usernode(node, &user_node);
14549 + if (copy_to_user(u_node, &user_node,
14550 + sizeof(struct cl_cluster_node)))
14556 + return num_nodes;
14559 +static int user_global_id(user_service_t *us, uint32_t *id)
14561 + uint32_t gid = 0;
14563 + if (us->state != UST_JOINED)
14566 + kcl_global_service_id(us->local_id, &gid);
14568 + if (copy_to_user(id, &gid, sizeof(uint32_t)))
14573 +static int user_set_level(user_service_t *us, int level)
14575 + int prev_id = us->local_id;
14578 + if (us->state != UST_REGISTER)
14581 + error = kcl_register_service(us->name, us->name_len, level,
14582 + &user_service_ops, TRUE, (void *) us,
14587 + kcl_unregister_service(prev_id);
14591 +int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
14593 + struct cluster_sock *c = cluster_sk(sock->sk);
14594 + user_service_t *us = c->service_data;
14597 + if (!us && cmd != SIOCCLUSTER_SERVICE_REGISTER)
14601 + case SIOCCLUSTER_SERVICE_REGISTER:
14602 + error = user_register((char *) arg, &us);
14604 + us->state = UST_REGISTER;
14606 + c->service_data = us;
14610 + case SIOCCLUSTER_SERVICE_UNREGISTER:
14612 + us->state = UST_UNREGISTER;
14613 + user_unregister(us);
14617 + case SIOCCLUSTER_SERVICE_JOIN:
14618 + us->state = UST_JOIN;
14619 + user_join(us, 0);
14622 + case SIOCCLUSTER_SERVICE_LEAVE:
14624 + if (us->state != UST_JOINED) {
14628 + us->state = UST_LEAVE;
14630 + user_leave(us, 0);
14634 + case SIOCCLUSTER_SERVICE_SETSIGNAL:
14635 + user_set_signal(us, (int) arg);
14638 + case SIOCCLUSTER_SERVICE_STARTDONE:
14639 + error = user_start_done(us, (unsigned int) arg);
14642 + case SIOCCLUSTER_SERVICE_GETEVENT:
14643 + error = user_get_event(us, (struct cl_service_event *) arg);
14646 + case SIOCCLUSTER_SERVICE_GETMEMBERS:
14647 + error = user_get_members(us, (struct cl_cluster_nodelist *)arg);
14650 + case SIOCCLUSTER_SERVICE_GLOBALID:
14651 + error = user_global_id(us, (uint32_t *) arg);
14654 + case SIOCCLUSTER_SERVICE_SETLEVEL:
14655 + error = user_set_level(us, (int) arg);
14665 +void sm_sock_release(struct socket *sock)
14667 + struct cluster_sock *c = cluster_sk(sock->sk);
14668 + user_service_t *us = c->service_data;
14676 + c->service_data = NULL;
14678 + if (us->need_startdone)
14679 + kcl_start_done(us->local_id, us->need_startdone);
14682 + /* async thread will clean up before exiting */
14686 + state = us->state;
14693 + user_leave(us, 1);
14694 + /* fall through */
14696 + case UST_REGISTER:
14697 + user_unregister(us);
14698 + /* fall through */
14699 + case UST_UNREGISTER:
14704 diff -urN linux-orig/cluster/cman/sm_user.h linux-patched/cluster/cman/sm_user.h
14705 --- linux-orig/cluster/cman/sm_user.h 1970-01-01 07:30:00.000000000 +0730
14706 +++ linux-patched/cluster/cman/sm_user.h 2004-11-03 11:37:37.000000000 +0800
14708 +/******************************************************************************
14709 +*******************************************************************************
14711 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14712 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14714 +** This copyrighted material is made available to anyone wishing to use,
14715 +** modify, copy, or redistribute it subject to the terms and conditions
14716 +** of the GNU General Public License v.2.
14718 +*******************************************************************************
14719 +******************************************************************************/
14721 +#ifndef __SM_USER_DOT_H__
14722 +#define __SM_USER_DOT_H__
14724 +int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
14725 +void sm_sock_release(struct socket *sock);
14726 +void sm_sock_bind(struct socket *sock);
14729 diff -urN linux-orig/include/cluster/cnxman-socket.h linux-patched/include/cluster/cnxman-socket.h
14730 --- linux-orig/include/cluster/cnxman-socket.h 1970-01-01 07:30:00.000000000 +0730
14731 +++ linux-patched/include/cluster/cnxman-socket.h 2004-11-03 11:37:37.000000000 +0800
14733 +/******************************************************************************
14734 +*******************************************************************************
14736 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14737 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14739 +** This copyrighted material is made available to anyone wishing to use,
14740 +** modify, copy, or redistribute it subject to the terms and conditions
14741 +** of the GNU General Public License v.2.
14743 +*******************************************************************************
14744 +******************************************************************************/
14746 +/* CMAN socket interface header,
14747 + may be included by user or kernel code */
14749 +#ifndef __CNXMAN_SOCKET_H
14750 +#define __CNXMAN_SOCKET_H
14752 +/* A currently unused number. TIPC also uses this number and you're unlikely
14753 + to be using both.
14755 +#define AF_CLUSTER 30
14756 +#define PF_CLUSTER AF_CLUSTER
14758 +/* Protocol(socket) types */
14759 +#define CLPROTO_MASTER 2
14760 +#define CLPROTO_CLIENT 3
14762 +/* ioctls -- should register these properly */
14763 +#define SIOCCLUSTER_NOTIFY _IOW('x', 0x01, int)
14764 +#define SIOCCLUSTER_REMOVENOTIFY _IO( 'x', 0x02)
14765 +#define SIOCCLUSTER_GETMEMBERS _IOR('x', 0x03, struct cl_cluster_nodelist)
14766 +#define SIOCCLUSTER_SETEXPECTED_VOTES _IOW('x', 0x04, int)
14767 +#define SIOCCLUSTER_ISQUORATE _IO( 'x', 0x05)
14768 +#define SIOCCLUSTER_ISLISTENING _IOW('x', 0x06, struct cl_listen_request)
14769 +#define SIOCCLUSTER_GETALLMEMBERS _IOR('x', 0x07, struct cl_cluster_nodelist)
14770 +#define SIOCCLUSTER_SET_VOTES _IOW('x', 0x08, int)
14771 +#define SIOCCLUSTER_GET_VERSION _IOR('x', 0x09, struct cl_version)
14772 +#define SIOCCLUSTER_SET_VERSION _IOW('x', 0x0a, struct cl_version)
14773 +#define SIOCCLUSTER_ISACTIVE _IO( 'x', 0x0b)
14774 +#define SIOCCLUSTER_KILLNODE _IOW('x', 0x0c, int)
14775 +#define SIOCCLUSTER_GET_JOINCOUNT _IO( 'x', 0x0d)
14776 +#define SIOCCLUSTER_SERVICE_REGISTER _IOW('x', 0x0e, char)
14777 +#define SIOCCLUSTER_SERVICE_UNREGISTER _IO('x', 0x0f)
14778 +#define SIOCCLUSTER_SERVICE_JOIN _IO( 'x', 0x10)
14779 +#define SIOCCLUSTER_SERVICE_LEAVE _IO( 'x', 0x20)
14780 +#define SIOCCLUSTER_SERVICE_SETSIGNAL _IOW('x', 0x30, int)
14781 +#define SIOCCLUSTER_SERVICE_STARTDONE _IOW('x', 0x40, unsigned int)
14782 +#define SIOCCLUSTER_SERVICE_GETEVENT _IOR('x', 0x50, struct cl_service_event)
14783 +#define SIOCCLUSTER_SERVICE_GETMEMBERS _IOR('x', 0x60, struct cl_cluster_nodelist)
14784 +#define SIOCCLUSTER_SERVICE_GLOBALID _IOR('x', 0x70, uint32_t)
14785 +#define SIOCCLUSTER_SERVICE_SETLEVEL _IOR('x', 0x80, int)
14786 +#define SIOCCLUSTER_GETNODE _IOWR('x', 0x90, struct cl_cluster_node)
14787 +#define SIOCCLUSTER_GETCLUSTER _IOWR('x', 0x91, struct cl_cluster_info)
14788 +#define SIOCCLUSTER_BARRIER _IOW('x', 0x0a0, struct cl_barrier_info)
14790 +/* These were setsockopts */
14791 +#define SIOCCLUSTER_PASS_SOCKET _IOW('x', 0x0b0, struct cl_passed_sock)
14792 +#define SIOCCLUSTER_SET_NODENAME _IOW('x', 0x0b1, char *)
14793 +#define SIOCCLUSTER_SET_NODEID _IOW('x', 0x0b2, int)
14794 +#define SIOCCLUSTER_JOIN_CLUSTER _IOW('x', 0x0b3, struct cl_join_cluster_info)
14795 +#define SIOCCLUSTER_LEAVE_CLUSTER _IOW('x', 0x0b4, int)
14798 +/* Maximum size of a cluster message */
14799 +#define MAX_CLUSTER_MESSAGE 1500
14800 +#define MAX_CLUSTER_MEMBER_NAME_LEN 255
14801 +#define MAX_BARRIER_NAME_LEN 33
14802 +#define MAX_SA_ADDR_LEN 12
14803 +#define MAX_CLUSTER_NAME_LEN 16
14805 +/* Well-known cluster port numbers */
14806 +#define CLUSTER_PORT_MEMBERSHIP 1 /* Mustn't block during cluster
14807 + * transitions! */
14808 +#define CLUSTER_PORT_SERVICES 2
14809 +#define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */
14810 +#define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */
14811 +#define CLUSTER_PORT_SLM 12 /* LVM SLM (simple lock manager) */
14813 +/* Port numbers above this will be blocked when the cluster is inquorate or in
14815 +#define HIGH_PROTECTED_PORT 9
14817 +/* Reasons for leaving the cluster */
14818 +#define CLUSTER_LEAVEFLAG_DOWN 0 /* Normal shutdown */
14819 +#define CLUSTER_LEAVEFLAG_KILLED 1
14820 +#define CLUSTER_LEAVEFLAG_PANIC 2
14821 +#define CLUSTER_LEAVEFLAG_REMOVED 3 /* This one can reduce quorum */
14822 +#define CLUSTER_LEAVEFLAG_REJECTED 4 /* Not allowed into the cluster in the
14824 +#define CLUSTER_LEAVEFLAG_INCONSISTENT 5 /* Our view of the cluster is
14825 + * in a minority */
14826 +#define CLUSTER_LEAVEFLAG_DEAD 6 /* Discovered to be dead */
14827 +#define CLUSTER_LEAVEFLAG_FORCE 0x10 /* Forced by command-line */
14829 +/* OOB messages sent to a local socket */
14830 +#define CLUSTER_OOB_MSG_PORTCLOSED 1
14831 +#define CLUSTER_OOB_MSG_STATECHANGE 2
14832 +#define CLUSTER_OOB_MSG_SERVICEEVENT 3
14834 +/* Sendmsg flags, these are above the normal sendmsg flags so they don't
14836 +#define MSG_NOACK 0x010000 /* Don't need an ACK for this message */
14837 +#define MSG_QUEUE 0x020000 /* Queue the message for sending later */
14838 +#define MSG_MULTICAST 0x080000 /* Message was sent to all nodes in the cluster
14840 +#define MSG_ALLINT 0x100000 /* Send out of all interfaces */
14841 +#define MSG_REPLYEXP 0x200000 /* Reply is expected */
14842 +#define MSG_BCASTSELF 0x400000 /* Broadcast message also gets sent to us */
14844 +typedef enum { NODESTATE_JOINING=1, NODESTATE_MEMBER,
14845 + NODESTATE_DEAD } nodestate_t;
14848 +struct sockaddr_cl {
14849 + unsigned short scl_family;
14850 + unsigned char scl_flags;
14851 + unsigned char scl_port;
14856 + * This is how we pass the multicast & receive sockets into kernel space.
14858 +struct cl_passed_sock {
14859 + int fd; /* FD of master socket to do multicast on */
14860 + int number; /* Socket number, to match up recvonly & bcast
14862 + int multicast; /* Is it multicast or receive ? */
14865 +/* Cluster configuration info passed when we join the cluster */
14866 +struct cl_join_cluster_info {
14867 + unsigned char votes;
14868 + unsigned int expected_votes;
14869 + unsigned int two_node;
14870 + unsigned int config_version;
14872 + char cluster_name[17];
14876 +/* This is the structure, per node, returned from the membership ioctl */
14877 +struct cl_cluster_node {
14878 + unsigned int size;
14879 + unsigned int node_id;
14881 + unsigned int leave_reason;
14882 + unsigned int incarnation;
14883 + nodestate_t state;
14884 + char name[MAX_CLUSTER_MEMBER_NAME_LEN];
14885 + unsigned char votes;
14888 +/* The struct passed to the membership ioctls */
14889 +struct cl_cluster_nodelist {
14890 + uint32_t max_members;
14891 + struct cl_cluster_node *nodes;
14894 +/* Structure passed to SIOCCLUSTER_ISLISTENING */
14895 +struct cl_listen_request {
14896 + unsigned char port;
14900 +/* A Cluster PORTCLOSED message - received by a local user as an OOB message */
14901 +struct cl_portclosed_oob {
14902 + unsigned char cmd; /* CLUSTER_OOB_MSG_PORTCLOSED */
14903 + unsigned char port;
14906 +/* Get all version numbers or set the config version */
14907 +struct cl_version {
14908 + unsigned int major;
14909 + unsigned int minor;
14910 + unsigned int patch;
14911 + unsigned int config;
14914 +/* structure passed to barrier ioctls */
14915 +struct cl_barrier_info {
14917 + char name[MAX_BARRIER_NAME_LEN];
14918 + unsigned int flags;
14919 + unsigned long arg;
14922 +struct cl_cluster_info {
14923 + char name[MAX_CLUSTER_NAME_LEN+1];
14927 +typedef enum { SERVICE_EVENT_STOP, SERVICE_EVENT_START, SERVICE_EVENT_FINISH,
14928 + SERVICE_EVENT_LEAVEDONE } service_event_t;
14930 +typedef enum { SERVICE_START_FAILED, SERVICE_START_JOIN, SERVICE_START_LEAVE }
14933 +struct cl_service_event {
14934 + service_event_t type;
14935 + service_start_t start_type;
14936 + unsigned int event_id;
14937 + unsigned int last_stop;
14938 + unsigned int last_start;
14939 + unsigned int last_finish;
14940 + unsigned int node_count;
14944 +/* Commands to the barrier ioctl */
14945 +#define BARRIER_IOCTL_REGISTER 1
14946 +#define BARRIER_IOCTL_CHANGE 2
14947 +#define BARRIER_IOCTL_DELETE 3
14948 +#define BARRIER_IOCTL_WAIT 4
14950 +/* Attributes of a barrier - bitmask */
14951 +#define BARRIER_ATTR_AUTODELETE 1
14952 +#define BARRIER_ATTR_MULTISTEP 2
14953 +#define BARRIER_ATTR_MANUAL 4
14954 +#define BARRIER_ATTR_ENABLED 8
14955 +#define BARRIER_ATTR_CALLBACK 16
14957 +/* Attribute setting commands */
14958 +#define BARRIER_SETATTR_AUTODELETE 1
14959 +#define BARRIER_SETATTR_MULTISTEP 2
14960 +#define BARRIER_SETATTR_ENABLED 3
14961 +#define BARRIER_SETATTR_NODES 4
14962 +#define BARRIER_SETATTR_CALLBACK 5
14963 +#define BARRIER_SETATTR_TIMEOUT 6
14966 diff -urN linux-orig/include/cluster/cnxman.h linux-patched/include/cluster/cnxman.h
14967 --- linux-orig/include/cluster/cnxman.h 1970-01-01 07:30:00.000000000 +0730
14968 +++ linux-patched/include/cluster/cnxman.h 2004-11-03 11:37:37.000000000 +0800
14970 +/******************************************************************************
14971 +*******************************************************************************
14973 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14974 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14976 +** This copyrighted material is made available to anyone wishing to use,
14977 +** modify, copy, or redistribute it subject to the terms and conditions
14978 +** of the GNU General Public License v.2.
14980 +*******************************************************************************
14981 +******************************************************************************/
14983 +#ifndef __CNXMAN_H
14984 +#define __CNXMAN_H
14986 +#include "linux/in6.h"
14987 +#include "cluster/cnxman-socket.h"
14989 +/* In-kernel API */
14991 +/* This is the structure, per node, returned from the membership request */
14992 +struct kcl_cluster_node {
14993 + unsigned int size;
14994 + unsigned int node_id;
14996 + unsigned int leave_reason;
14997 + unsigned int incarnation;
14998 + nodestate_t state;
14999 + struct list_head list;
15000 + char name[MAX_CLUSTER_MEMBER_NAME_LEN];
15001 + unsigned char votes;
15004 +struct cluster_node_addr {
15005 + struct list_head list;
15006 + unsigned char addr[sizeof(struct sockaddr_in6)];/* A large sockaddr */
15011 +/* Reasons for a kernel membership callback */
15012 +typedef enum { CLUSTER_RECONFIG, DIED, LEAVING, NEWNODE } kcl_callback_reason;
15014 +/* Kernel version of above, the void *sock is a struct socket */
15015 +struct kcl_multicast_sock {
15017 + int number; /* Socket number, to match up recvonly & bcast
15021 +extern int kcl_sendmsg(struct socket *sock, void *buf, int size,
15022 + struct sockaddr_cl *caddr, int addr_len,
15023 + unsigned int flags);
15024 +extern int kcl_register_read_callback(struct socket *sock,
15025 + int (*routine) (char *, int, char *, int,
15027 +extern int kcl_add_callback(void (*callback) (kcl_callback_reason, long));
15028 +extern int kcl_remove_callback(void (*callback) (kcl_callback_reason, long));
15029 +extern int kcl_get_members(struct list_head *list);
15030 +extern int kcl_get_member_ids(uint32_t * idbuf, int size);
15031 +extern int kcl_get_all_members(struct list_head *list);
15032 +extern int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
15033 + struct kcl_cluster_node *n);
15034 +extern int kcl_get_node_by_name(unsigned char *name,
15035 + struct kcl_cluster_node *n);
15036 +extern int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n);
15037 +extern int kcl_is_quorate(void);
15038 +extern int kcl_addref_cluster(void);
15039 +extern int kcl_releaseref_cluster(void);
15040 +extern int kcl_cluster_name(char **cname);
15041 +extern int kcl_get_current_interface(void);
15042 +extern struct list_head *kcl_get_node_addresses(int nodeid);
15044 +extern int kcl_barrier_register(char *name, unsigned int flags,
15045 + unsigned int nodes);
15046 +extern int kcl_barrier_setattr(char *name, unsigned int attr,
15047 + unsigned long arg);
15048 +extern int kcl_barrier_delete(char *name);
15049 +extern int kcl_barrier_wait(char *name);
15050 +extern int kcl_barrier_cancel(char *name);
15052 +extern int kcl_register_quorum_device(char *name, int votes);
15053 +extern int kcl_unregister_quorum_device(void);
15054 +extern int kcl_quorum_device_available(int yesno);
15057 diff -urN linux-orig/include/cluster/service.h linux-patched/include/cluster/service.h
15058 --- linux-orig/include/cluster/service.h 1970-01-01 07:30:00.000000000 +0730
15059 +++ linux-patched/include/cluster/service.h 2004-11-03 11:37:37.000000000 +0800
15061 +/******************************************************************************
15062 +*******************************************************************************
15064 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
15065 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
15067 +** This copyrighted material is made available to anyone wishing to use,
15068 +** modify, copy, or redistribute it subject to the terms and conditions
15069 +** of the GNU General Public License v.2.
15071 +*******************************************************************************
15072 +******************************************************************************/
15074 +#ifndef __SERVICE_DOT_H__
15075 +#define __SERVICE_DOT_H__
15078 + * Interface between service manager and services
15082 + * Service levels are started in order from lowest, so level 0 is started on
15083 + * all nodes before level 1 is started.
15086 +#define SERVICE_LEVEL_FENCE (0)
15087 +#define SERVICE_LEVEL_GDLM (1)
15088 +#define SERVICE_LEVEL_GFS (2)
15089 +#define SERVICE_LEVEL_USER (3)
15091 +#define MAX_SERVICE_NAME_LEN (33)
15094 + * The type of start a service receives. The start (and preceding stop) may be
15095 + * due to a node joining or leaving the SG or due to a node having failed.
15098 +#define SERVICE_NODE_FAILED (1)
15099 +#define SERVICE_NODE_JOIN (2)
15100 +#define SERVICE_NODE_LEAVE (3)
15103 +struct kcl_service {
15104 + struct list_head list;
15106 + uint32_t local_id;
15107 + uint32_t global_id;
15109 + char name[MAX_SERVICE_NAME_LEN];
15110 +};
15112 +int kcl_get_services(struct list_head *list, int level);
15116 + * These routines which run in CMAN context must return quickly and cannot
15120 +struct kcl_service_ops {
15121 + int (*stop) (void *servicedata);
15122 + int (*start) (void *servicedata, uint32_t *nodeids, int count,
15123 + int event_id, int type);
15124 + void (*finish) (void *servicedata, int event_id);
15125 +};
15128 + * Register will cause CMAN to create a Service Group (SG) for the named
15129 + * instance of the service. A local ID is returned which is used to join,
15130 + * leave and unregister the service.
15133 +int kcl_register_service(char *name, int namelen, int level,
15134 + struct kcl_service_ops *ops, int unique,
15135 + void *servicedata, uint32_t *local_id);
15137 +void kcl_unregister_service(uint32_t local_id);
15140 + * Once a service is joined it will be managed by CMAN and receive start, stop,
15141 + * and finish calls. After leave is called the service is no longer managed by
15142 + * CMAN. The first start for a service may arrive before kcl_join_service()
15146 +int kcl_join_service(uint32_t local_id);
15147 +int kcl_leave_service(uint32_t local_id);
15150 + * After a service is started, it can ask for its cluster-wide unique ID.
15153 +void kcl_global_service_id(uint32_t local_id, uint32_t * global_id);
15156 + * Called by a service when it's done with a start(). Cannot be called from
15157 + * the start function.
15160 +void kcl_start_done(uint32_t local_id, int event_id);