# Add CMAN to build system diff -urN -p linux-2.6.7/Makefile linux/Makefile --- linux-2.6.7/Makefile 2004-06-16 13:19:37.000000000 +0800 +++ linux/Makefile 2004-06-17 14:55:06.000000000 +0800 @@ -418,7 +418,7 @@ all: vmlinux # Objects we will link into vmlinux / subdirs we need to visit init-y := init/ -drivers-y := drivers/ sound/ +drivers-y := drivers/ sound/ cluster/ net-y := net/ libs-y := lib/ core-y := usr/ diff -urN -p linux-2.6.7/arch/alpha/Kconfig linux/arch/alpha/Kconfig --- linux-2.6.7/arch/alpha/Kconfig 2004-06-16 13:19:44.000000000 +0800 +++ linux/arch/alpha/Kconfig 2004-06-17 14:55:06.000000000 +0800 @@ -698,3 +698,4 @@ source "crypto/Kconfig" source "lib/Kconfig" +source "cluster/Kconfig" diff -urN -p linux-2.6.7/arch/i386/Kconfig linux/arch/i386/Kconfig --- linux-2.6.7/arch/i386/Kconfig 2004-06-16 13:18:59.000000000 +0800 +++ linux/arch/i386/Kconfig 2004-06-17 14:55:06.000000000 +0800 @@ -1315,6 +1315,8 @@ source "crypto/Kconfig" source "lib/Kconfig" +source "cluster/Kconfig" + config X86_SMP bool depends on SMP && !X86_VOYAGER diff -urN -p linux-2.6.7/arch/parisc/Kconfig linux/arch/parisc/Kconfig --- linux-2.6.7/arch/parisc/Kconfig 2004-06-16 13:19:36.000000000 +0800 +++ linux/arch/parisc/Kconfig 2004-06-17 14:55:06.000000000 +0800 @@ -229,3 +229,4 @@ source "crypto/Kconfig" source "lib/Kconfig" +source "cluster/Kconfig" diff -urN -p linux-2.6.7/arch/sparc64/Kconfig linux/arch/sparc64/Kconfig --- linux-2.6.7/arch/sparc64/Kconfig 2004-06-16 13:19:52.000000000 +0800 +++ linux/arch/sparc64/Kconfig 2004-06-17 14:55:06.000000000 +0800 @@ -713,3 +713,4 @@ source "crypto/Kconfig" source "lib/Kconfig" +source "cluster/Kconfig" diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig --- linux-2.6.7/cluster/Kconfig 1970-01-01 07:30:00.000000000 +0730 +++ linux/cluster/Kconfig 2004-06-17 14:55:06.000000000 +0800 @@ -0,0 +1,13 @@ +menu "Cluster Support" + +config CLUSTER + tristate "Cluster support" + ---help--- + Enable clustering support. This is not the high-performance clustering + made famous by beowulf. It is a high-availability cluster often using + shared storage. + The cluster manager is the heart(beat) of the cluster system. It is + needed by all the other components. It provides membership services + for those other subsystems. + +endmenu diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile --- linux-2.6.7/cluster/Makefile 1970-01-01 07:30:00.000000000 +0730 +++ linux/cluster/Makefile 2004-06-17 14:55:06.000000000 +0800 @@ -0,0 +1,3 @@ +obj-y := nocluster.o + +obj-$(CONFIG_CLUSTER) += cman/ diff -urN -p linux-2.6.7/cluster/cman/Makefile linux/cluster/cman/Makefile --- linux-2.6.7/cluster/cman/Makefile 1970-01-01 07:30:00.000000000 +0730 +++ linux/cluster/cman/Makefile 2004-06-17 14:55:06.000000000 +0800 @@ -0,0 +1,6 @@ +cman-objs := cnxman.o config.o membership.o proc.o\ + sm_barrier.o sm_control.o sm_daemon.o sm_joinleave.o\ + sm_membership.o sm_message.o sm_misc.o sm_recover.o sm_services.o \ + sm_user.o + +obj-$(CONFIG_CLUSTER) := cman.o diff -urN -p linux-2.6.7/cluster/nocluster.c linux/cluster/nocluster.c --- linux-2.6.7/cluster/nocluster.c 1970-01-01 07:30:00.000000000 +0730 +++ linux/cluster/nocluster.c 2004-06-17 14:55:06.000000000 +0800 @@ -0,0 +1,20 @@ +/* + * cluster/nocluster.c + * + * Copy from net/nonet.c + * Dummy functions to allow us to configure cluster support entirely + * out of the kernel. + * + * Distributed under the terms of the GNU GPL version 2. 
+ * Copyright (c) Matthew Wilcox 2003
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+void __init nocluster_init(void)
+{
+}
diff -urN linux-orig/cluster/cman/cnxman-private.h linux-patched/cluster/cman/cnxman-private.h
--- linux-orig/cluster/cman/cnxman-private.h 1970-01-01 07:30:00.000000000 +0730
+++ linux-patched/cluster/cman/cnxman-private.h 2004-06-29 20:07:50.000000000 +0800
@@ -0,0 +1,427 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CNXMAN_PRIVATE_H
+#define __CNXMAN_PRIVATE_H
+
+/* Version triplet */
+#define CNXMAN_MAJOR_VERSION 2
+#define CNXMAN_MINOR_VERSION 0
+#define CNXMAN_PATCH_VERSION 1
+
+#define MAX_RETRIES 3 /* Maximum number of send retries */
+#define CAP_CLUSTER CAP_SYS_ADMIN /* Capability needed to manage the
+ * cluster */
+#ifdef __KERNEL__
+
+/* How we announce ourselves in console events */
+#define CMAN_NAME "CMAN"
+
+/* One of these per AF_CLUSTER socket */
+struct cluster_sock {
+ /* WARNING: sk has to be the first member */
+ struct sock sk;
+
+ unsigned char port; /* Bound port or zero */
+ int (*kernel_callback) (char *, int, char *, int, unsigned int);
+ void *service_data;
+};
+
+#define cluster_sk(__sk) ((struct cluster_sock *)__sk)
+
+/* We have one of these for each socket we use for communications */
+struct cl_comms_socket {
+ struct socket *sock;
+ int broadcast; /* This is a broadcast socket */
+ int recv_only; /* This is the unicast receive end of a
+ * multicast socket */
+ struct sockaddr_in6 saddr; /* Socket address, contains the sockaddr for
+ * the remote end(s) */
+ int addr_len; /* Length of above */
+ int number; /* Internal socket number, used to cycle around
+ * sockets in case of network errors */
+ struct file *file; /* file pointer for user-passed-in sockets */
+
+ wait_queue_t wait;
+
+ /* The socket list */
+ struct list_head list;
+
+ /* On here when it has something to say */
+ struct list_head active_list;
+ unsigned long active;
+};
+
+/* A client socket. We keep a list of these so we can notify clients of cluster
+ * events */
+struct cl_client_socket {
+ struct socket *sock;
+ struct list_head list;
+};
+
+/* This structure is tacked onto the start of a cluster message packet for our
+ * own nefarious purposes.
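+ * All multi-byte fields are little-endian on the wire; the receive side
+ * converts them with le16_to_cpu()/le32_to_cpu() before use.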
+ */
+struct cl_protheader {
+ unsigned char port;
+ unsigned char flags;
+ unsigned short cluster; /* Our cluster number, little-endian */
+ unsigned short seq; /* Packet sequence number, little-endian */
+ int srcid; /* Node ID of the sender */
+ int tgtid; /* Node ID of the target or 0 for multicast
+ * messages */
+};
+
+/* A cluster internal protocol message - port number 0 */
+struct cl_protmsg {
+ struct cl_protheader header;
+ unsigned char cmd;
+};
+
+/* A Cluster ACK message */
+struct cl_ackmsg {
+ struct cl_protheader header;
+ unsigned char cmd; /* Always CLUSTER_CMD_ACK */
+ unsigned char remport; /* Remote port number the original message was
+ * for */
+ unsigned char aflags; /* ACK flags 0=OK, 1=No listener */
+ unsigned char pad;
+ unsigned short seq; /* Sequence number we are acking */
+};
+
+/* A Cluster LISTENREQ/LISTENRESP message */
+struct cl_listenmsg {
+ unsigned char cmd; /* CLUSTER_CMD_LISTENRESP/REQ */
+ unsigned char target_port; /* Port to probe */
+ unsigned char listening; /* Always 0 for LISTENREQ */
+ unsigned char pad;
+ unsigned short tag; /* PID of remote waiting process */
+};
+
+/* A Cluster PORTCLOSED message */
+struct cl_closemsg {
+ unsigned char cmd; /* CLUSTER_CMD_PORTCLOSED */
+ unsigned char port;
+};
+
+/* Structure of a newly dead node, passed from cnxman to kmembershipd */
+struct cl_new_dead_node {
+ struct list_head list;
+ struct cluster_node *node;
+};
+
+/* Subcommands for BARRIER message */
+#define BARRIER_REGISTER 1
+#define BARRIER_CHANGE 2
+#define BARRIER_WAIT 4
+#define BARRIER_COMPLETE 5
+
+/* A Cluster BARRIER message */
+struct cl_barriermsg {
+ unsigned char cmd; /* CLUSTER_CMD_BARRIER */
+ unsigned char subcmd; /* BARRIER sub command */
+ unsigned short pad;
+ unsigned int flags;
+ unsigned int nodes;
+ char name[MAX_BARRIER_NAME_LEN];
+};
+
+/* Membership services messages; the cl_protheader is added transparently */
+struct cl_mem_hello_msg {
+ unsigned char cmd;
+ unsigned char flags;
+ unsigned short members; /* Number of nodes in the cluster,
+ * little-endian */
+ unsigned int generation; /* Current cluster generation number */
+};
+
+struct cl_mem_endtrans_msg {
+ unsigned char cmd;
+ unsigned char pad1;
+ unsigned short pad2;
+ unsigned int quorum;
+ unsigned int total_votes;
+ unsigned int generation; /* Current cluster generation number */
+ unsigned int new_node_id; /* If reason is a new node joining */
+};
+
+/* ACK types for JOINACK message */
+#define JOINACK_TYPE_OK 1 /* You can join */
+#define JOINACK_TYPE_NAK 2 /* You can NOT join */
+#define JOINACK_TYPE_WAIT 3 /* Wait a bit longer - cluster is in transition
+ * already */
+
+struct cl_mem_joinack_msg {
+ unsigned char cmd;
+ unsigned char acktype;
+};
+
+/* This is used by the JOINREQ message */
+struct cl_mem_join_msg {
+ unsigned char cmd;
+ unsigned char votes;
+ unsigned short num_addr; /* Number of addresses for this node */
+ unsigned int expected_votes;
+ unsigned int members; /* Number of nodes in the cluster,
+ * little-endian */
+ unsigned int major_version; /* Not backwards compatible */
+ unsigned int minor_version; /* Backwards compatible */
+ unsigned int patch_version; /* Backwards/forwards compatible */
+ unsigned int config_version;
+ unsigned int addr_len; /* length of node addresses */
+ char clustername[16];
+ /* Followed by addresses of `address_length` bytes and a
+ * NUL-terminated node name */
+};
+
+/* State transition start reasons: */
+#define TRANS_NEWNODE 1 /* A new node is joining the cluster */
+#define TRANS_REMNODE 2 /* A node
has left the cluster */ +#define TRANS_ANOTHERREMNODE 3 /* A node left the cluster while we were in + * transition */ +#define TRANS_NEWMASTER 4 /* We have had an election and I am the new + * master */ +#define TRANS_CHECK 5 /* A consistency check was called for */ +#define TRANS_RESTART 6 /* Transition restarted because of a previous + * timeout */ +#define TRANS_DEADMASTER 7 /* The master died during transition and I have + * taken over */ + +/* This is used to start a state transition */ +struct cl_mem_starttrans_msg { + unsigned char cmd; + unsigned char reason; /* Why a start transition was started - see + * above */ + unsigned char flags; + unsigned char votes; + unsigned int expected_votes; + unsigned int generation; /* Incremented for each STARTTRANS sent + */ + int nodeid; /* Node to be removed */ + unsigned short num_addrs; + /* If reason == TRANS_NEWNODE: Followed by addresses of + * `address_length` bytes and a NUL-terminated node name */ +}; + +struct cl_mem_startack_msg { + unsigned char cmd; + unsigned char reason; + unsigned short pad; + unsigned int generation; + unsigned int node_id; /* node_id we think new node should have */ + unsigned int highest_node_id; /* highest node_id on this system */ +}; + +/* Reconfigure a cluster parameter */ +struct cl_mem_reconfig_msg { + unsigned char cmd; + unsigned char param; + unsigned short pad; + unsigned int value; +}; + +/* Structure containing information about an outstanding listen request */ +struct cl_waiting_listen_request { + wait_queue_head_t waitq; + int result; + int waiting; + unsigned short tag; + int nodeid; + struct list_head list; +}; + +/* Messages from membership services */ +#define CLUSTER_MEM_JOINCONF 1 +#define CLUSTER_MEM_JOINREQ 2 +#define CLUSTER_MEM_LEAVE 3 +#define CLUSTER_MEM_HELLO 4 +#define CLUSTER_MEM_KILL 5 +#define CLUSTER_MEM_JOINACK 6 +#define CLUSTER_MEM_ENDTRANS 7 +#define CLUSTER_MEM_RECONFIG 8 +#define CLUSTER_MEM_MASTERVIEW 9 +#define CLUSTER_MEM_STARTTRANS 10 +#define CLUSTER_MEM_JOINREJ 11 +#define CLUSTER_MEM_VIEWACK 12 +#define CLUSTER_MEM_STARTACK 13 +#define CLUSTER_MEM_TRANSITION 14 +#define CLUSTER_MEM_NEWCLUSTER 15 +#define CLUSTER_MEM_CONFACK 16 +#define CLUSTER_MEM_NOMINATE 17 + +/* Parameters for RECONFIG command */ +#define RECONFIG_PARAM_EXPECTED_VOTES 1 +#define RECONFIG_PARAM_NODE_VOTES 2 +#define RECONFIG_PARAM_CONFIG_VERSION 3 + +/* Data associated with an outgoing socket */ +struct cl_socket { + struct file *file; /* The real file */ + struct socket *socket; /* The real sock */ + struct cl_multicast_sock multicast_info; + int num_nodes; /* On this link */ + int retransmit_count; +}; + +/* There's one of these for each node in the cluster */ +struct cluster_node { + struct list_head list; + char *name; /* Node/host name of node */ + struct list_head addr_list; + int us; /* This node is us */ + unsigned int node_id; /* Unique node ID */ + nodestate_t state; + unsigned short last_seq_recv; + unsigned short last_seq_acked; + unsigned short last_seq_sent; + unsigned int votes; + unsigned int expected_votes; + unsigned int leave_reason; + unsigned int incarnation; /* Incremented each time a node joins + * the cluster */ + unsigned long last_hello; /* Jiffies */ +}; + +/* This is how we keep a list of user processes that are listening for cluster + * membership events */ +struct notify_struct { + struct list_head list; + pid_t pid; + int signal; +}; + +/* This is how we keep a list of kernel callbacks that are registered for + * cluster membership events */ +struct 
kernel_notify_struct { + struct list_head list; + void (*callback) (kcl_callback_reason, long arg); +}; + +/* A message waiting to be sent */ +struct queued_message { + struct list_head list; + + struct socket *socket; + struct sockaddr_cl addr; + int addr_len; + int msg_len; + unsigned char port; + unsigned int flags; + char msg_buffer[MAX_CLUSTER_MESSAGE]; +}; + +/* A barrier */ +struct cl_barrier { + struct list_head list; + + char name[MAX_BARRIER_NAME_LEN]; + unsigned int flags; + enum { BARRIER_STATE_WAITING, BARRIER_STATE_INACTIVE, + BARRIER_STATE_COMPLETE } state; + unsigned int expected_nodes; + unsigned int registered_nodes; + atomic_t got_nodes; + atomic_t completed_nodes; + unsigned int inuse; + unsigned int waitsent; + unsigned int phase; /* Completion phase */ + unsigned int endreason; /* Reason we were woken, usually 0 */ + unsigned long timeout; /* In seconds */ + + void (*callback) (char *name, int status); + wait_queue_head_t waitq; + struct semaphore lock; /* To synch with cnxman messages */ + spinlock_t phase2_spinlock; /* Need to synchronise with timer + * interrupts */ + struct timer_list timer; +}; + +/* Cluster protocol commands sent to port 0 */ +#define CLUSTER_CMD_ACK 1 +#define CLUSTER_CMD_LISTENREQ 2 +#define CLUSTER_CMD_LISTENRESP 3 +#define CLUSTER_CMD_PORTCLOSED 4 +#define CLUSTER_CMD_BARRIER 5 + +extern struct cluster_node *find_node_by_addr(unsigned char *addr, + int addr_len); +extern struct cluster_node *find_node_by_nodeid(unsigned int id); +extern struct cluster_node *find_node_by_name(char *name); +extern void set_quorate(int); +extern void notify_kernel_listeners(kcl_callback_reason reason, long arg); +extern void notify_listeners(void); +extern void free_nodeid_array(void); +extern int send_reconfigure(int param, unsigned int value); +extern int calculate_quorum(int, int, int *); +extern void recalculate_quorum(int); +extern int send_leave(unsigned char); +extern int get_quorum(void); +extern void set_votes(int, int); +extern void kcl_wait_for_all_acks(void); +extern char *membership_state(char *, int); +extern void a_node_just_died(struct cluster_node *node); +extern void check_barrier_returns(void); +extern int in_transition(void); +extern void get_local_addresses(struct cluster_node *node); +extern int add_node_address(struct cluster_node *node, unsigned char *addr, int len); +extern void create_proc_entries(void); +extern void cleanup_proc_entries(void); +extern unsigned int get_highest_nodeid(void); +extern int allocate_nodeid_array(void); +extern void queue_oob_skb(struct socket *sock, int cmd); +extern int new_temp_nodeid(char *addr, int addrlen); +extern int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen); +extern void remove_temp_nodeid(int nodeid); +extern inline char *print_addr(unsigned char *addr, int len, char *buf) +{ + int i; + int ptr = 0; + + for (i = 0; i < len; i++) + ptr += sprintf(buf + ptr, "%02x ", addr[i]); + + return buf; +} + +#define MAX_ADDR_PRINTED_LEN (address_length*3 + 1) + +/* Debug enabling macros. Sorry about the C++ comments but they're easier to + * get rid of than C ones... */ + +// #define DEBUG_MEMB +// #define DEBUG_COMMS +// #define DEBUG_BARRIER + +/* Debug macros */ +#ifdef DEBUG_COMMS +#define P_COMMS(fmt, args...) printk(KERN_DEBUG "cman comms: " fmt, ## args) +#else +#define P_COMMS(fmt, args...) +#endif + +#ifdef DEBUG_BARRIER +#define P_BARRIER(fmt, args...) printk(KERN_DEBUG "cman barrier: " fmt, ## args) +#else +#define P_BARRIER(fmt, args...) 
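+/* With DEBUG_BARRIER (and friends) left undefined these macros expand to
+ * nothing, so the debug call sites disappear entirely from normal builds. */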
+#endif
+
+#ifdef DEBUG_MEMB
+#define P_MEMB(fmt, args...) printk(KERN_DEBUG "cman memb: " fmt, ## args)
+#define C_MEMB(fmt, args...) printk(fmt, ## args)
+#else
+#define P_MEMB(fmt, args...)
+#define C_MEMB(fmt, args...)
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif
diff -urN linux-orig/cluster/cman/cnxman.c linux-patched/cluster/cman/cnxman.c
--- linux-orig/cluster/cman/cnxman.c 1970-01-01 07:30:00.000000000 +0730
+++ linux-patched/cluster/cman/cnxman.c 2004-06-29 20:07:50.000000000 +0800
@@ -0,0 +1,4080 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#define EXPORT_SYMTAB
+#include <linux/init.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/poll.h>
+#include <linux/signal.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <net/sock.h>
+#include <cluster/cnxman.h>
+#include <cluster/service.h>
+
+#include "cnxman-private.h"
+#include "sm_control.h"
+#include "sm_user.h"
+#include "config.h"
+
+#define CMAN_RELEASE_NAME "<CVS>"
+
+static int __cl_setsockopt(struct socket *sock, int level, int optname,
+ char *optval, int optlen, int flags);
+static int __cl_getsockopt(struct socket *sock, int level, int optname,
+ char *optval, int *optlen, int flags);
+static void send_to_userport(struct cl_comms_socket *csock, char *data, int len,
+ char *addr, int addrlen);
+static int cl_sendack(struct cl_comms_socket *sock, unsigned short seq,
+ int addr_len, char *addr, unsigned char remport,
+ unsigned char flag);
+static void send_listen_request(int nodeid, unsigned char port);
+static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
+ unsigned char port, unsigned short tag);
+static void resend_last_message(void);
+static void start_ack_timer(void);
+static int send_queued_message(struct queued_message *qmsg);
+static void send_port_close_oob(unsigned char port);
+static void post_close_oob(unsigned char port, int nodeid);
+static void process_barrier_msg(struct cl_barriermsg *msg,
+ struct cluster_node *node);
+static struct cl_barrier *find_barrier(char *name);
+static void node_shutdown(void);
+static void node_cleanup(void);
+static int send_or_queue_message(void *buf, int len, struct sockaddr_cl *caddr,
+ unsigned char port);
+static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur);
+static void check_for_unacked_nodes(void);
+static void free_cluster_sockets(void);
+static uint16_t generate_cluster_id(char *name);
+
+static int is_valid_temp_nodeid(int nodeid);
+
+extern int start_membership_services(pid_t);
+extern int kcl_leave_cluster(int remove);
+extern int send_kill(int nodeid);
+
+static struct proto_ops cl_proto_ops;
+static struct sock *master_sock;
+static kmem_cache_t *cluster_sk_cachep;
+
+/* Pointer to the pseudo node that maintains quorum in a two-node system */
+struct cluster_node *quorum_device = NULL;
+
+/* Array of "ports" allocated. This is just a list of pointers to the sock that
+ * has this port bound. Speed is a major issue here so 1-2K of allocated
+ * storage is worth sacrificing.
 * Port 0 is reserved for protocol messages */
+static struct sock *port_array[256];
+static struct semaphore port_array_lock;
+
+/* Our cluster name & number */
+unsigned short cluster_id;
+char cluster_name[MAX_CLUSTER_NAME_LEN+1];
+
+/* Two-node mode: causes cluster to remain quorate if one of two nodes fails.
+ * No more than two nodes are permitted to join the cluster. */
+unsigned short two_node;
+
+/* Cluster configuration version that must be the same among members. */
+unsigned int config_version;
+
+/* Reference counting for cluster applications */
+atomic_t use_count;
+
+/* Length of sockaddr address for our comms protocol */
+unsigned int address_length;
+
+/* Message sending */
+static unsigned short cur_seq; /* Last message sent */
+static unsigned int ack_count; /* Number of acks received for message
+ * 'cur_seq' */
+static unsigned int acks_expected; /* Number of acks we expect to receive */
+static struct semaphore send_lock;
+static struct timer_list ack_timer;
+
+/* Saved packet information in case we need to resend it */
+static char saved_msg_buffer[MAX_CLUSTER_MESSAGE];
+static int saved_msg_len;
+static int retry_count;
+
+/* Task variables */
+static pid_t kcluster_pid;
+static pid_t membership_pid;
+extern int quit_threads;
+
+wait_queue_head_t cnxman_waitq;
+
+/* Variables owned by membership services */
+extern int cluster_members;
+extern struct list_head cluster_members_list;
+extern struct semaphore cluster_members_lock;
+extern int we_are_a_cluster_member;
+extern int cluster_is_quorate;
+extern struct cluster_node *us;
+extern struct list_head new_dead_node_list;
+extern struct semaphore new_dead_node_lock;
+extern char nodename[];
+
+/* A list of processes listening for membership events */
+static struct list_head event_listener_list;
+static struct semaphore event_listener_lock;
+
+/* A list of kernel callbacks listening for membership events */
+static struct list_head kernel_listener_list;
+static struct semaphore kernel_listener_lock;
+
+/* A list of sockets we are listening on (and can transmit on...later) */
+static struct list_head socket_list;
+
+/* A list of all open cluster client sockets */
+static struct list_head client_socket_list;
+static struct semaphore client_socket_lock;
+
+/* A list of all current barriers */
+static struct list_head barrier_list;
+static struct semaphore barrier_list_lock;
+
+/* When a socket is ready for reading it goes on this queue */
+static spinlock_t active_socket_lock;
+static struct list_head active_socket_list;
+
+/* If the cnxman process is running and available for work */
+atomic_t cnxman_running;
+
+/* Flags set by timers etc. for the main loop to detect and act upon */
+static unsigned long mainloop_flags;
+
+#define ACK_TIMEOUT 1
+#define RESEND_NEEDED 2
+
+/* A queue of messages waiting to be sent. If kcl_sendmsg is called outside of
+ * process context then the messages get put in here */
+static struct list_head messages_list;
+static struct semaphore messages_list_lock;
+
+static struct semaphore start_thread_sem;
+
+/* List of outstanding ISLISTENING requests */
+static struct list_head listenreq_list;
+static struct semaphore listenreq_lock;
+
+/* Any sending requests wait on this queue if necessary (e.g. inquorate, waiting
+ * ACK) */
+static DECLARE_WAIT_QUEUE_HEAD(socket_waitq);
+
+/* Wait for thread to exit properly */
+struct completion cluster_thread_comp;
+struct completion member_thread_comp;
+
+/* The resend delay to use; we increase it geometrically each time a send is
+ * delayed. The delay is
in deci-seconds */ +static int resend_delay = 1; + +/* Highest numbered interface and the current default */ +static int num_interfaces = 0; +static struct cl_comms_socket *current_interface = NULL; + +struct temp_node +{ + int nodeid; + char addr[sizeof(struct sockaddr_in6)]; + int addrlen; + struct list_head list; +}; +static struct list_head tempnode_list; +static struct semaphore tempnode_lock; + +/* Wake up any processes that are waiting to send. This is usually called when + * all the ACKs have been gathered up or when a node has left the cluster + * unexpectedly and we reckon there are no more acks to collect */ +static void unjam(void) +{ + wake_up_interruptible(&socket_waitq); + wake_up_interruptible(&cnxman_waitq); +} + +/* Used by the data_ready routine to locate a connection given the socket */ +static inline struct cl_comms_socket *find_comms_by_sock(struct sock *sk) +{ + struct list_head *conlist; + + list_for_each(conlist, &socket_list) { + struct cl_comms_socket *clsock = + list_entry(conlist, struct cl_comms_socket, list); + if (clsock->sock->sk == sk) { + return clsock; + } + } + return NULL; +} + +/* Data available on socket */ +static void cnxman_data_ready(struct sock *sk, int count_unused) +{ + struct cl_comms_socket *clsock = find_comms_by_sock(sk); + + if (clsock == NULL) /* ASSERT ?? */ + return; + + /* If we're already on the list then don't do it again */ + if (test_and_set_bit(1, &clsock->active)) + return; + + spin_lock_irq(&active_socket_lock); + list_add(&clsock->active_list, &active_socket_list); + spin_unlock_irq(&active_socket_lock); + + wake_up_interruptible(&cnxman_waitq); +} + +static int receive_message(struct cl_comms_socket *csock, char *iobuf) +{ + struct msghdr msg; + struct iovec iov; + struct sockaddr_in6 sin; + int len; + mm_segment_t fs; + + memset(&sin, 0, sizeof (sin)); + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = &iov; + msg.msg_name = &sin; + msg.msg_namelen = sizeof (sin); + msg.msg_flags = 0; + + iov.iov_len = MAX_CLUSTER_MESSAGE; + iov.iov_base = iobuf; + + fs = get_fs(); + set_fs(get_ds()); + + len = sock_recvmsg(csock->sock, &msg, MAX_CLUSTER_MESSAGE, MSG_DONTWAIT); + set_fs(fs); + + if (len > 0) { + if (len > MAX_CLUSTER_MESSAGE) { + printk(KERN_CRIT CMAN_NAME + ": %d byte message far too big\n", len); + return 0; + } + send_to_userport(csock, iobuf, len, msg.msg_name, msg.msg_namelen); + } + else { + if (len != -EAGAIN) + printk(KERN_CRIT CMAN_NAME ": recvmsg failed: %d\n", + len); + } + return len; +} + +static int cluster_kthread(void *unused) +{ + int len; + char *iobuf; + struct list_head *socklist; + struct cl_comms_socket *csock; + wait_queue_t cnxman_waitq_head; + sigset_t tmpsig; + + daemonize("cman_comms"); + + /* Block everything but SIGKILL/SIGSTOP/SIGTERM */ + siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM); + sigprocmask(SIG_BLOCK, &tmpsig, NULL); + + /* This is the waitq we can wake the process up with */ + init_waitqueue_head(&cnxman_waitq); + init_waitqueue_entry(&cnxman_waitq_head, current); + add_wait_queue(&cnxman_waitq, &cnxman_waitq_head); + + set_user_nice(current, -6); + + /* Allow the sockets to start receiving */ + list_for_each(socklist, &socket_list) { + csock = list_entry(socklist, struct cl_comms_socket, list); + + clear_bit(1, &csock->active); + } + + iobuf = kmalloc(MAX_CLUSTER_MESSAGE, GFP_KERNEL); + if (!iobuf) { + printk(KERN_CRIT CMAN_NAME + ": Cannot allocate receive buffer for cluster comms\n"); + return -1; + } + + complete(&cluster_thread_comp); + 
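+ /* Main loop: sleep until cnxman_data_ready() marks a socket active,
+ * then handle any ACK timeout, drain every active socket, resend the
+ * last message if a resend was flagged and, once all expected ACKs
+ * are in, push out queued messages one at a time. */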
+ for (;;) {
+ struct list_head *temp;
+
+ /* Wait for activity on any of the sockets */
+ set_task_state(current, TASK_INTERRUPTIBLE);
+
+ if (list_empty(&active_socket_list))
+ schedule();
+ set_task_state(current, TASK_RUNNING);
+
+ if (quit_threads)
+ break;
+
+ if (test_and_clear_bit(ACK_TIMEOUT, &mainloop_flags)) {
+ check_for_unacked_nodes();
+ }
+
+ /* Now receive any messages waiting for us */
+ spin_lock_irq(&active_socket_lock);
+ list_for_each_safe(socklist, temp, &active_socket_list) {
+ csock =
+ list_entry(socklist, struct cl_comms_socket,
+ active_list);
+
+ list_del(&csock->active_list);
+ clear_bit(1, &csock->active);
+
+ spin_unlock_irq(&active_socket_lock);
+
+ do {
+ len = receive_message(csock, iobuf);
+ }
+ while (len > 0);
+
+ spin_lock_irq(&active_socket_lock);
+
+ if (len == 0)
+ break; /* EOF on socket */
+ }
+ spin_unlock_irq(&active_socket_lock);
+
+ /* Resend any unacked messages */
+ if (test_and_clear_bit(RESEND_NEEDED, &mainloop_flags)
+ && acks_expected) {
+ resend_last_message();
+ }
+
+ /* Send any queued messages */
+ if (acks_expected == 0) {
+ struct list_head *temp;
+ struct list_head *msglist;
+
+ down(&messages_list_lock);
+ list_for_each_safe(msglist, temp, &messages_list) {
+ struct queued_message *qmsg =
+ list_entry(msglist, struct queued_message,
+ list);
+ int status = send_queued_message(qmsg);
+
+ if (status >= 0) {
+ /* Succeeded, remove it from the queue */
+ list_del(&qmsg->list);
+ kfree(qmsg);
+ }
+ /* Did it fail horribly ?? */
+ if (status < 0 && status != -EAGAIN) {
+ printk(KERN_INFO CMAN_NAME
+ ": send_queued_message failed, error %d\n",
+ status);
+ list_del(&qmsg->list);
+ kfree(qmsg);
+ }
+ break; /* Only send one message at a time */
+ }
+ up(&messages_list_lock);
+ }
+
+ if (signal_pending(current))
+ break;
+ }
+ P_COMMS("closing down\n");
+
+ if (we_are_a_cluster_member)
+ send_leave(us->leave_reason);
+
+ kfree(iobuf);
+ quit_threads = 1; /* force other thread to die too */
+ node_shutdown();
+
+ if (timer_pending(&ack_timer))
+ del_timer(&ack_timer);
+
+ /* Wait for membership thread to die */
+ wait_for_completion(&member_thread_comp);
+
+ node_cleanup();
+
+ complete(&cluster_thread_comp);
+ return 0;
+}
+
+void notify_kernel_listeners(kcl_callback_reason reason, long arg)
+{
+ struct kernel_notify_struct *knotify;
+ struct list_head *proclist;
+
+ down(&kernel_listener_lock);
+ list_for_each(proclist, &kernel_listener_list) {
+ knotify =
+ list_entry(proclist, struct kernel_notify_struct, list);
+ knotify->callback(reason, arg);
+ }
+ up(&kernel_listener_lock);
+}
+
+static void check_for_unacked_nodes()
+{
+ struct list_head *nodelist;
+ struct cluster_node *node;
+
+ clear_bit(RESEND_NEEDED, &mainloop_flags);
+ retry_count = 0;
+
+ P_COMMS("Retry count exceeded -- looking for dead node\n");
+
+ /* Node did not ACK a message after MAX_RETRIES tries, remove it from the
+ * cluster */
+ down(&cluster_members_lock);
+ list_for_each(nodelist, &cluster_members_list) {
+ node = list_entry(nodelist, struct cluster_node, list);
+
+ P_COMMS
+ ("checking node %s: last_acked = %d, last_seq_sent = %d\n",
+ node->name, node->last_seq_acked, node->last_seq_sent);
+ if (node->state != NODESTATE_DEAD
+ && node->last_seq_acked != node->last_seq_sent && !node->us) {
+ printk(KERN_WARNING CMAN_NAME
+ ": node %s is not responding - removing from the cluster\n",
+ node->name);
+
+ /* Start a state transition */
+ a_node_just_died(node);
+ }
+ }
+ up(&cluster_members_lock);
+ acks_expected = ack_count = 0;
+ unjam();
+ return;
+}
+
+static void
ack_timer_fn(unsigned long arg) +{ + P_COMMS("%ld: ack_timer fired, retries=%d\n", jiffies, retry_count); + + /* Too many retries ? */ + if (++retry_count > MAX_RETRIES) { + set_bit(ACK_TIMEOUT, &mainloop_flags); + wake_up_interruptible(&cnxman_waitq); + } + else { + /* Resend last message */ + set_bit(RESEND_NEEDED, &mainloop_flags); + wake_up_interruptible(&cnxman_waitq); + } +} + +/* Called to resend a packet if sock_sendmsg was busy */ +static void short_timer_fn(unsigned long arg) +{ + P_COMMS("short_timer fired\n"); + + /* Resend last message */ + resend_delay <<= 1; + set_bit(RESEND_NEEDED, &mainloop_flags); + wake_up_interruptible(&cnxman_waitq); +} + +static void start_ack_timer() +{ + ack_timer.function = ack_timer_fn; + ack_timer.data = 0L; + mod_timer(&ack_timer, jiffies + HZ); +} + +static void start_short_timer(void) +{ + ack_timer.function = short_timer_fn; + ack_timer.data = 0L; + mod_timer(&ack_timer, jiffies + (resend_delay * HZ)); +} + + +static struct cl_waiting_listen_request *find_listen_request(unsigned short tag) +{ + struct list_head *llist; + struct cl_waiting_listen_request *listener; + + down(&listenreq_lock); + list_for_each(llist, &listenreq_list) { + listener = + list_entry(llist, struct cl_waiting_listen_request, list); + if (listener->tag == tag) { + up(&listenreq_lock); + return listener; + } + } + up(&listenreq_lock); + return NULL; +} + +static void process_cnxman_message(struct cl_comms_socket *csock, char *data, + int len, char *addr, int addrlen, + struct cluster_node *rem_node) +{ + struct cl_protmsg *msg = (struct cl_protmsg *) data; + struct cl_protheader *header = (struct cl_protheader *) data; + struct cl_ackmsg *ackmsg; + struct cl_listenmsg *listenmsg; + struct cl_closemsg *closemsg; + struct cl_barriermsg *barriermsg; + struct cl_waiting_listen_request *listen_request; + + P_COMMS("Message on port 0 is %d\n", msg->cmd); + switch (msg->cmd) { + case CLUSTER_CMD_ACK: + ackmsg = (struct cl_ackmsg *) data; + + if (ackmsg->aflags & 1) { + if (net_ratelimit()) + printk(KERN_INFO CMAN_NAME + ": WARNING no listener for port %d on node %s\n", + ackmsg->remport, rem_node->name); + } + P_COMMS("Got ACK from %s. seq=%d (cur=%d)\n", + rem_node ? 
rem_node->name : "Unknown", + le16_to_cpu(ackmsg->seq), cur_seq); + + if (rem_node && rem_node->state != NODESTATE_DEAD) { + /* This copes with duplicate acks from a multipathed + * host */ + if (rem_node->last_seq_acked != + le16_to_cpu(ackmsg->seq)) { + rem_node->last_seq_acked = + le16_to_cpu(ackmsg->seq); + + /* Got em all */ + if (++ack_count >= acks_expected) { + + /* Cancel the timer */ + del_timer(&ack_timer); + acks_expected = 0; + unjam(); + } + } + } + else { + if (cluster_members) { +#ifdef DEBUG_COMMS + char buf[MAX_ADDR_PRINTED_LEN]; + + printk(KERN_INFO CMAN_NAME + ": got ack from unknown or dead node: %s\n", + print_addr(addr, addrlen, buf)); +#endif + } + } + break; + + /* Return 1 if we have a listener on this port, 0 if not */ + case CLUSTER_CMD_LISTENREQ: + listenmsg = + (struct cl_listenmsg *) (data + + sizeof (struct cl_protheader)); + cl_sendack(csock, header->seq, addrlen, addr, header->port, 0); + send_listen_response(csock, le32_to_cpu(header->srcid), + listenmsg->target_port, listenmsg->tag); + break; + + case CLUSTER_CMD_LISTENRESP: + /* Wake up process waiting for listen response */ + listenmsg = + (struct cl_listenmsg *) (data + + sizeof (struct cl_protheader)); + cl_sendack(csock, header->seq, addrlen, addr, header->port, 0); + listen_request = find_listen_request(listenmsg->tag); + if (listen_request) { + listen_request->result = listenmsg->listening; + listen_request->waiting = 0; + wake_up_interruptible(&listen_request->waitq); + } + break; + + case CLUSTER_CMD_PORTCLOSED: + closemsg = + (struct cl_closemsg *) (data + + sizeof (struct cl_protheader)); + cl_sendack(csock, header->seq, addrlen, addr, header->port, 0); + post_close_oob(closemsg->port, le32_to_cpu(header->srcid)); + break; + + case CLUSTER_CMD_BARRIER: + barriermsg = + (struct cl_barriermsg *) (data + + sizeof (struct cl_protheader)); + cl_sendack(csock, header->seq, addrlen, addr, header->port, 0); + process_barrier_msg(barriermsg, rem_node); + break; + + default: + printk(KERN_ERR CMAN_NAME + ": Unknown protocol message %d received\n", msg->cmd); + break; + + } + return; +} + +static void send_to_userport(struct cl_comms_socket *csock, char *data, int len, + char *addr, int addrlen) +{ + int err; + struct cl_protheader *header = (struct cl_protheader *) data; + struct cluster_node *rem_node = + find_node_by_nodeid(le32_to_cpu(header->srcid)); + struct sk_buff *skb = NULL; + + P_COMMS + ("seen message, from %d for %d, sequence num = %d, rem_node=%p, state=%d\n", + le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid), + le16_to_cpu(header->seq), rem_node, + rem_node ? 
rem_node->state : -1); + + /* If the remote end is being coy about its node ID then look it up by + * address */ + if (!rem_node && header->srcid == 0) { + rem_node = find_node_by_addr(addr, addrlen); + } + + /* If this node is an ex-member then treat it as unknown */ + if (rem_node && rem_node->state != NODESTATE_MEMBER + && rem_node->state != NODESTATE_JOINING) + rem_node = NULL; + + /* Ignore messages not for our cluster */ + if (le16_to_cpu(header->cluster) != cluster_id) { + P_COMMS("Dumping message - wrong cluster ID (us=%d, msg=%d)\n", + cluster_id, header->cluster); + goto userport_finish; + } + + /* If the message is from us then just dump it */ + if (rem_node && rem_node->us) + goto userport_finish; + + /* If we can't find the nodeid then check for our own messages the hard + * way - this only happens during joining */ + if (!rem_node) { + struct list_head *socklist; + struct cl_comms_socket *clsock; + + list_for_each(socklist, &socket_list) { + clsock = + list_entry(socklist, struct cl_comms_socket, list); + + if (clsock->recv_only) { + + if (memcmp(addr, &clsock->saddr, address_length) == 0) { + goto userport_finish; + } + } + } + + } + + /* Ignore messages not for us */ + if (le32_to_cpu(header->tgtid) > 0 && us + && le32_to_cpu(header->tgtid) != us->node_id) { + goto userport_finish; + } + + P_COMMS("got message, from %d for %d, sequence num = %d\n", + le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid), + le16_to_cpu(header->seq)); + + /* Have we received this message before ? If so just ignore it, it's a + * resend for someone else's benefit */ + if (!(header->flags & (MSG_NOACK >> 16)) && + rem_node && le16_to_cpu(header->seq) == rem_node->last_seq_recv) { + P_COMMS + ("Discarding message - Already seen this sequence number %d\n", + rem_node->last_seq_recv); + /* Still need to ACK it though, in case it was the ACK that got + * lost */ + cl_sendack(csock, header->seq, addrlen, addr, header->port, 0); + goto userport_finish; + } + + /* If it's a new node then assign it a temporary node ID */ + if (!rem_node) + header->srcid = cpu_to_le32(new_temp_nodeid(addr, addrlen)); + + P_COMMS("Got message: flags = %x, port = %d, we_are_a_member = %d\n", + header->flags, header->port, we_are_a_cluster_member); + + + /* If we are not part of the cluster then ignore multicast messages + * that need an ACK as we will confuse the sender who is only expecting + * ACKS from bona fide members */ + if (header->flags & (MSG_MULTICAST >> 16) && + !(header->flags & (MSG_NOACK >> 16)) && !we_are_a_cluster_member) { + P_COMMS + ("Discarding message - multicast and we are not a cluster member. port=%d flags=%x\n", + header->port, header->flags); + goto userport_finish; + } + + /* Save the sequence number of this message so we can ignore duplicates + * (above) */ + if (!(header->flags & (MSG_NOACK >> 16)) && rem_node) { + P_COMMS("Saving seq %d for node %s\n", le16_to_cpu(header->seq), + rem_node->name); + rem_node->last_seq_recv = le16_to_cpu(header->seq); + } + + /* Is it a protocol message? 
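+ * Port 0 carries the internal protocol: ACKs, LISTENREQ/LISTENRESP
+ * probes, PORTCLOSED notifications and barrier traffic; anything else
+ * is dispatched to whatever socket is bound to the port.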
*/ + if (header->port == 0) { + process_cnxman_message(csock, data, len, addr, addrlen, + rem_node); + goto userport_finish; + } + + /* Skip past the header to the data */ + data += sizeof (struct cl_protheader); + len -= sizeof (struct cl_protheader); + + /* Get the port number and look for a listener */ + down(&port_array_lock); + if (port_array[header->port]) { + int native_srcid; + struct cluster_sock *c = cluster_sk(port_array[header->port]); + + /* ACK it */ + if (!(header->flags & (MSG_NOACK >> 16))) + cl_sendack(csock, header->seq, addrlen, addr, + header->port, 0); + + /* Call a callback if there is one */ + if (c->kernel_callback) { + up(&port_array_lock); + c->kernel_callback(data, len, addr, addrlen, + le32_to_cpu(header->srcid)); + goto userport_finish; + } + + /* Otherwise put it into an SKB and pass it onto the recvmsg + * mechanism */ + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) { + up(&port_array_lock); + printk(KERN_INFO CMAN_NAME + ": Failed to allocate skb\n"); + return; + } + + skb_put(skb, len); + memcpy(skb->data, data, len); + + /* Put the nodeid into cb so we can pass it to the clients */ + skb->cb[0] = 0; /* Clear flags */ + native_srcid = le32_to_cpu(header->srcid); + memcpy(skb->cb + 1, &native_srcid, sizeof(int)); + + if ((err = + sock_queue_rcv_skb(port_array[header->port], skb)) < 0) { + + printk(KERN_INFO CMAN_NAME + ": Error queueing request to port %d: %d\n", + header->port, err); + kfree_skb(skb); + + /* If the port was MEMBERSHIP then we have to die */ + if (header->port == CLUSTER_PORT_MEMBERSHIP) { + up(&port_array_lock); + send_leave(CLUSTER_LEAVEFLAG_PANIC); + panic("membership stopped responding"); + } + } + up(&port_array_lock); + + } + else { + /* ACK it, but set the flag bit so remote end knows no-one + * caught it */ + if (!(header->flags & (MSG_NOACK >> 16))) + cl_sendack(csock, header->seq, addrlen, addr, + header->port, 1); + + /* Nobody listening, drop it */ + up(&port_array_lock); + } + + userport_finish: + return; +} + +static struct sock *cl_alloc_sock(struct socket *sock, int gfp) +{ + struct sock *sk; + struct cluster_sock *c; + + if ((sk = + sk_alloc(AF_CLUSTER, gfp, sizeof (struct cluster_sock), + cluster_sk_cachep)) == NULL) + goto no_sock; + + if (sock) { + sock->ops = &cl_proto_ops; + } + sock_init_data(sock, sk); + + sk->sk_destruct = NULL; + sk->sk_no_check = 1; + sk->sk_family = PF_CLUSTER; + sk->sk_allocation = gfp; + + c = cluster_sk(sk); + c->port = 0; + c->service_data = NULL; + + return sk; + no_sock: + return NULL; +} + +static int cl_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct cl_client_socket *csock; + struct list_head *socklist; + struct list_head *tmp; + + down(&client_socket_lock); + if (sk) { + /* Remove port allocations if it's a bound socket */ + struct cluster_sock *c = cluster_sk(sk); + + down(&port_array_lock); + if (c->port) { + port_array[c->port] = NULL; + } + up(&port_array_lock); + + /* Tell other nodes in the cluster that this listener is going + * away */ + if (atomic_read(&cnxman_running) && c->port) + send_port_close_oob(c->port); + + if (c->service_data) + sm_sock_release(sock); + + /* Master socket released ? */ + if (sk->sk_protocol == CLPROTO_MASTER) { + master_sock = NULL; + + /* If this socket is being freed and cnxman is not + * started then free all the comms sockets as either + * the userland "join" process has crashed or the + * join failed. 
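+ * The comms sockets were passed in by that process via
+ * CLU_SET_MULTICAST/CLU_SET_RCVONLY, so nothing else can
+ * ever activate them.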
+ */ + if (!atomic_read(&cnxman_running)) { + quit_threads = 1; + free_cluster_sockets(); + } + } + + sock_orphan(sk); + sock_hold(sk); + lock_sock(sk); + release_sock(sk); + sock_put(sk); + sock_put(sk); + sock->sk = NULL; + } + + /* Remove it from the list of clients */ + list_for_each_safe(socklist, tmp, &client_socket_list) { + csock = list_entry(socklist, struct cl_client_socket, list); + + if (csock->sock == sock) { + list_del(&csock->list); + kfree(csock); + break; + } + } + up(&client_socket_lock); + + return 0; +} + +static int cl_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + /* All are datagrams */ + if (sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + if (protocol == CLPROTO_MASTER && !capable(CAP_CLUSTER)) + return -EPERM; + + /* Can only have one master socket */ + if (master_sock && protocol == CLPROTO_MASTER) + return -EBUSY; + + /* cnxman not running and a client was requested */ + if (!atomic_read(&cnxman_running) && protocol != CLPROTO_MASTER) + return -ENETDOWN; + + if ((sk = cl_alloc_sock(sock, GFP_KERNEL)) == NULL) + return -ENOBUFS; + + sk->sk_protocol = protocol; + + if (protocol == CLPROTO_MASTER) + master_sock = sk; + + /* Add client sockets to the list */ + if (protocol == CLPROTO_CLIENT) { + struct cl_client_socket *clsock = + kmalloc(sizeof (struct cl_client_socket), GFP_KERNEL); + if (!clsock) { + cl_release(sock); + return -ENOMEM; + } + clsock->sock = sock; + down(&client_socket_lock); + list_add(&clsock->list, &client_socket_list); + up(&client_socket_lock); + } + + return 0; +} + +static int cl_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_cl *saddr = (struct sockaddr_cl *) uaddr; + struct cluster_sock *c = cluster_sk(sk); + + if (!capable(CAP_NET_BIND_SERVICE)) + return -EPERM; + + if (sk->sk_zapped == 0) + return -EINVAL; + + if (addr_len != sizeof (struct sockaddr_cl)) + return -EINVAL; + + if (saddr->scl_family != AF_CLUSTER) + return -EINVAL; + + if (saddr->scl_port == 0) + return -EINVAL; /* Port 0 is reserved for protocol messages */ + + down(&port_array_lock); + + if (port_array[saddr->scl_port]) { + up(&port_array_lock); + return -EADDRINUSE; + } + + port_array[saddr->scl_port] = sk; + + up(&port_array_lock); + + c->port = saddr->scl_port; + sk->sk_zapped = 0; + + /* If we are not a cluster member yet then make the client wait until + * we are, this allows nodes to start cluster clients at the same time + * as cluster services but they will wait until membership is achieved. + * This looks odd in bind() (open would seem more obvious) but we need + * to know which port number is being used so that things like + * membership services don't get blocked + */ + + if (saddr->scl_port > HIGH_PROTECTED_PORT) + while (!we_are_a_cluster_member || !cluster_is_quorate + || in_transition()) { + DECLARE_WAITQUEUE(wq, current); + struct task_struct *tsk = current; + + set_task_state(tsk, TASK_INTERRUPTIBLE); + add_wait_queue(&socket_waitq, &wq); + + if (!we_are_a_cluster_member || !cluster_is_quorate + || in_transition()) + schedule(); + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&socket_waitq, &wq); + + /* We were woken up because the cluster is going down, + * ...and we never got a chance to do any work! 
(sob) */ + if (atomic_read(&cnxman_running) == 0 || quit_threads) { + return -ENOTCONN; + } + } + + return 0; +} + +static int cl_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_cl *sa = (struct sockaddr_cl *) uaddr; + struct sock *sk = sock->sk; + struct cluster_sock *c = cluster_sk(sk); + + *uaddr_len = sizeof (struct sockaddr_cl); + + lock_sock(sk); + + sa->scl_port = c->port; + sa->scl_flags = 0; + sa->scl_family = AF_CLUSTER; + + release_sock(sk); + + return 0; +} + +static unsigned int cl_poll(struct file *file, struct socket *sock, + poll_table * wait) +{ + return datagram_poll(file, sock, wait); +} + +/* Copy internal node format to userland format */ +void copy_to_usernode(struct cluster_node *node, + struct cl_cluster_node *unode) +{ + strcpy(unode->name, node->name); + unode->size = sizeof (struct cl_cluster_node); + unode->votes = node->votes; + unode->state = node->state; + unode->us = node->us; + unode->node_id = node->node_id; + unode->leave_reason = node->leave_reason; + unode->incarnation = node->incarnation; +} + +/* ioctl processing functions */ + +static int do_ioctl_set_version(unsigned long arg) +{ + struct cl_version version, *u_version; + + if (!capable(CAP_CLUSTER)) + return -EPERM; + if (arg == 0) + return -EINVAL; + + u_version = (struct cl_version *) arg; + + if (copy_from_user(&version, u_version, sizeof(struct cl_version))) + return -EFAULT; + + if (version.major != CNXMAN_MAJOR_VERSION || + version.minor != CNXMAN_MINOR_VERSION || + version.patch != CNXMAN_PATCH_VERSION) + return -EINVAL; + + if (config_version == version.config) + return 0; + + config_version = version.config; + send_reconfigure(RECONFIG_PARAM_CONFIG_VERSION, config_version); + return 0; +} + +static int do_ioctl_get_members(unsigned long arg) +{ + struct cluster_node *node; + /* Kernel copies */ + struct cl_cluster_node user_format_node; + struct cl_cluster_nodelist user_format_nodelist; + /* User space array ptr */ + struct cl_cluster_node *user_node; + struct list_head *nodelist; + int num_nodes = 0; + + if (arg == 0) + return cluster_members; + + if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist))) + return -EFAULT; + + down(&cluster_members_lock); + + if (user_format_nodelist.max_members < cluster_members) { + up(&cluster_members_lock); + return -E2BIG; + } + + user_node = user_format_nodelist.nodes; + + list_for_each(nodelist, &cluster_members_list) { + node = list_entry(nodelist, struct cluster_node, list); + if (node->state == NODESTATE_MEMBER) { + copy_to_usernode(node, &user_format_node); + if (copy_to_user(user_node, &user_format_node, + sizeof (struct cl_cluster_node))) { + up(&cluster_members_lock); + return -EFAULT; + } + user_node++; + num_nodes++; + } + } + up(&cluster_members_lock); + + return num_nodes; +} + +static int do_ioctl_get_all_members(unsigned long arg) +{ + struct cluster_node *node; + /* Kernel copies */ + struct cl_cluster_node user_format_node; + struct cl_cluster_nodelist user_format_nodelist; + /* User space array ptr*/ + struct cl_cluster_node *user_node; + struct list_head *nodelist; + int num_nodes = 0; + + if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist))) + return -EFAULT; + + down(&cluster_members_lock); + + user_node = user_format_nodelist.nodes; + + list_for_each(nodelist, &cluster_members_list) { + node = list_entry(nodelist, struct cluster_node, list); + if (arg) { + copy_to_usernode(node, + 
&user_format_node); + + if (copy_to_user(user_node, &user_format_node, + sizeof (struct cl_cluster_node))) { + up(&cluster_members_lock); + return -EFAULT; + } + user_node++; + if (--user_format_nodelist.max_members < 0) { + num_nodes = -EFAULT; + goto err_exit; + } + + } + num_nodes++; + } + err_exit: + up(&cluster_members_lock); + + return num_nodes; +} + +static int do_ioctl_get_node(unsigned long arg) +{ + struct cluster_node *node; + struct cl_cluster_node k_node, *u_node; + + u_node = (struct cl_cluster_node *) arg; + + if (copy_from_user(&k_node, u_node, sizeof(struct cl_cluster_node))) + return -EFAULT; + + if (k_node.node_id) + node = find_node_by_nodeid(k_node.node_id); + else + node = find_node_by_name(k_node.name); + + if (!node) + return -ENOENT; + + copy_to_usernode(node, &k_node); + + if (copy_to_user(u_node, &k_node, sizeof(struct cl_cluster_node))) + return -EFAULT; + + return 0; +} + +static int do_ioctl_set_expected(unsigned long arg) +{ + struct list_head *nodelist; + struct cluster_node *node; + unsigned int total_votes; + unsigned int newquorum; + + if (!capable(CAP_CLUSTER)) + return -EPERM; + if (arg == 0) + return -EINVAL; + + newquorum = calculate_quorum(1, arg, &total_votes); + + if (newquorum < total_votes / 2 + || newquorum > total_votes) { + return -EINVAL; + } + + /* Now do it */ + down(&cluster_members_lock); + list_for_each(nodelist, &cluster_members_list) { + node = list_entry(nodelist, struct cluster_node, list); + if (node->state == NODESTATE_MEMBER + && node->expected_votes > arg) { + node->expected_votes = arg; + } + } + up(&cluster_members_lock); + + recalculate_quorum(1); + + send_reconfigure(RECONFIG_PARAM_EXPECTED_VOTES, arg); + sm_member_update(cluster_is_quorate); + + return 0; +} + +static int do_ioctl_kill_node(unsigned long arg) +{ + struct cluster_node *node; + + if (!capable(CAP_CLUSTER)) + return -EPERM; + + + if ((node = find_node_by_nodeid(arg)) == NULL) + return -EINVAL; + + /* Can't kill us */ + if (node->us) + return -EINVAL; + + if (node->state != NODESTATE_MEMBER) + return -EINVAL; + + /* Just in case it is alive, send a KILL message */ + send_kill(arg); + + node->leave_reason = CLUSTER_LEAVEFLAG_KILLED; + a_node_just_died(node); + + return 0; +} + +static int do_ioctl_barrier(unsigned long arg) +{ + struct cl_barrier_info info; + + if (!capable(CAP_CLUSTER)) + return -EPERM; + + if (copy_from_user(&info, (void *)arg, sizeof(info)) != 0) + return -EFAULT; + + switch (info.cmd) { + case BARRIER_IOCTL_REGISTER: + return kcl_barrier_register(info.name, + info.flags, + info.arg); + case BARRIER_IOCTL_CHANGE: + return kcl_barrier_setattr(info.name, + info.flags, + info.arg); + case BARRIER_IOCTL_WAIT: + return kcl_barrier_wait(info.name); + case BARRIER_IOCTL_DELETE: + return kcl_barrier_delete(info.name); + default: + return -EINVAL; + } +} + +static int do_ioctl_islistening(unsigned long arg) +{ + DECLARE_WAITQUEUE(wq, current); + struct cl_listen_request rq; + struct cluster_node *rem_node; + int nodeid; + int result; + struct cl_waiting_listen_request *listen_request; + + if (!arg) + return -EINVAL; + + if (copy_from_user(&rq, (void *) arg, sizeof (rq)) != 0) + return -EFAULT; + + nodeid = rq.nodeid; + + rem_node = find_node_by_nodeid(nodeid); + + /* Node not in the cluster */ + if (!rem_node) + return -ENOENT; + + if (rem_node->state != NODESTATE_MEMBER) + return -ENOTCONN; + + /* If the request is for us then just look in the ports + * array */ + if (nodeid == us->node_id) + return (port_array[rq.port] != 0) ? 
1 : 0;
+
+ /* For a remote node we need to send a request out */
+
+ /* If we are in transition then wait until we are not */
+ while (in_transition()) {
+ set_task_state(current, TASK_INTERRUPTIBLE);
+ add_wait_queue(&socket_waitq, &wq);
+
+ if (in_transition())
+ schedule();
+
+ set_task_state(current, TASK_RUNNING);
+ remove_wait_queue(&socket_waitq, &wq);
+
+ if (signal_pending(current))
+ return -EINTR;
+ }
+
+ /* Were we shut down before it completed ? */
+ if (!atomic_read(&cnxman_running))
+ return -ENOTCONN;
+
+ listen_request =
+ kmalloc(sizeof (struct cl_waiting_listen_request),
+ GFP_KERNEL);
+ if (!listen_request)
+ return -ENOMEM;
+
+ /* Build the request */
+ listen_request->waiting = 1;
+ listen_request->result = 0;
+ listen_request->tag = current->pid;
+ listen_request->nodeid = nodeid;
+ init_waitqueue_head(&listen_request->waitq);
+
+ down(&listenreq_lock);
+ list_add(&listen_request->list, &listenreq_list);
+ up(&listenreq_lock);
+
+ /* Now wait for the response to come back */
+ send_listen_request(rq.nodeid, rq.port);
+
+ while (listen_request->waiting) {
+ set_task_state(current, TASK_INTERRUPTIBLE);
+ add_wait_queue(&listen_request->waitq, &wq);
+
+ if (listen_request->waiting)
+ schedule();
+
+ set_task_state(current, TASK_RUNNING);
+ remove_wait_queue(&listen_request->waitq, &wq);
+
+ if (signal_pending(current)) {
+ list_del(&listen_request->list);
+ kfree(listen_request);
+ return -ERESTARTSYS;
+ }
+ }
+ result = listen_request->result;
+ list_del(&listen_request->list);
+ kfree(listen_request);
+ return result;
+}
+
+static int do_ioctl_set_votes(unsigned long arg)
+{
+ unsigned int total_votes;
+ unsigned int newquorum;
+ int saved_votes;
+
+ if (!capable(CAP_CLUSTER))
+ return -EPERM;
+
+ /* Check votes is valid */
+ saved_votes = us->votes;
+ us->votes = arg;
+
+ newquorum = calculate_quorum(1, 0, &total_votes);
+
+ if (newquorum < total_votes / 2 || newquorum > total_votes) {
+ us->votes = saved_votes;
+ return -EINVAL;
+ }
+
+ recalculate_quorum(1);
+
+ send_reconfigure(RECONFIG_PARAM_NODE_VOTES, arg);
+
+ return 0;
+}
+
+static int cl_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ int err = -EOPNOTSUPP;
+ struct list_head *proclist;
+ struct list_head *tmp;
+ struct notify_struct *notify;
+ struct cl_version cnxman_version;
+
+ switch (cmd) {
+ /* Process requests notification of cluster events */
+ case SIOCCLUSTER_NOTIFY:
+ notify = kmalloc(sizeof (struct notify_struct), GFP_KERNEL);
+ if (!notify)
+ return -ENOMEM;
+ notify->pid = current->pid;
+ notify->signal = arg;
+ down(&event_listener_lock);
+ list_add(&notify->list, &event_listener_list);
+ up(&event_listener_lock);
+ err = 0;
+ break;
+
+ /* Process is no longer interested in cluster events */
+ case SIOCCLUSTER_REMOVENOTIFY:
+ err = -EINVAL;
+
+ down(&event_listener_lock);
+ list_for_each_safe(proclist, tmp, &event_listener_list) {
+ notify =
+ list_entry(proclist, struct notify_struct, list);
+ if (notify->pid == current->pid) {
+ list_del(&notify->list);
+ kfree(notify);
+ err = 0;
+ }
+ }
+ up(&event_listener_lock);
+ break;
+
+ /* Return the cnxman version number */
+ case SIOCCLUSTER_GET_VERSION:
+ if (!arg)
+ return -EINVAL;
+ err = 0;
+ cnxman_version.major = CNXMAN_MAJOR_VERSION;
+ cnxman_version.minor = CNXMAN_MINOR_VERSION;
+ cnxman_version.patch = CNXMAN_PATCH_VERSION;
+ if (copy_to_user((void *) arg, &cnxman_version,
+ sizeof (struct cl_version))) {
+ return -EFAULT;
+ }
+ break;
+
+ /* Set the cnxman config version number */
+ case SIOCCLUSTER_SET_VERSION:
+ err =
do_ioctl_set_version(arg); + break; + + /* Return the active membership list */ + case SIOCCLUSTER_GETMEMBERS: + err = do_ioctl_get_members(arg); + break; + + /* Return the full membership list include dead nodes */ + case SIOCCLUSTER_GETALLMEMBERS: + err = do_ioctl_get_all_members(arg); + break; + + case SIOCCLUSTER_GETNODE: + err = do_ioctl_get_node(arg); + break; + + case SIOCCLUSTER_ISQUORATE: + return cluster_is_quorate; + + case SIOCCLUSTER_ISACTIVE: + return atomic_read(&cnxman_running); + + case SIOCCLUSTER_SETEXPECTED_VOTES: + err = do_ioctl_set_expected(arg); + break; + + /* Change the number of votes for this node */ + case SIOCCLUSTER_SET_VOTES: + err = do_ioctl_set_votes(arg); + break; + + /* Return 1 if the specified node is listening on a given port */ + case SIOCCLUSTER_ISLISTENING: + err = do_ioctl_islistening(arg); + break; + + /* Forcibly kill a node */ + case SIOCCLUSTER_KILLNODE: + err = do_ioctl_kill_node(arg); + break; + + case SIOCCLUSTER_GET_JOINCOUNT: + if (!capable(CAP_CLUSTER)) + return -EPERM; + else + return atomic_read(&use_count); + + /* ioctl interface to the barrier system */ + case SIOCCLUSTER_BARRIER: + err = do_ioctl_barrier(arg); + break; + + default: + err = sm_ioctl(sock, cmd, arg); + } + return err; +} + +static int cl_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + int err = -ENOTCONN; + + lock_sock(sk); + + if (sock->state == SS_UNCONNECTED) + goto out; + + err = 0; + if (sock->state == SS_DISCONNECTING) + goto out; + + err = -EINVAL; + + if (how != SHUTDOWN_MASK) + goto out; + + sk->sk_shutdown = how; + err = 0; + + out: + release_sock(sk); + + return err; +} + +static int cl_setsockopt(struct socket *sock, int level, int optname, + char *optval, int optlen) +{ + struct sock *sk = sock->sk; + int err; + + if (sk != master_sock) + return -EPERM; + + lock_sock(sk); + err = __cl_setsockopt(sock, level, optname, optval, optlen, 0); + release_sock(sk); + + return err; +} + +static int add_clsock(int broadcast, int number, struct socket *sock, + struct file *file) +{ + struct cl_comms_socket *newsock = + kmalloc(sizeof (struct cl_comms_socket), GFP_KERNEL); + if (!newsock) + return -ENOMEM; + + memset(newsock, 0, sizeof (*newsock)); + newsock->number = number; + newsock->sock = sock; + if (broadcast) { + newsock->broadcast = 1; + newsock->recv_only = 0; + } + else { + newsock->broadcast = 0; + newsock->recv_only = 1; + } + + newsock->file = file; + newsock->addr_len = sizeof(struct sockaddr_in6); + + /* Mark it active until cnxman thread is running and ready to process + * messages */ + set_bit(1, &newsock->active); + + /* Find out what it's bound to */ + newsock->sock->ops->getname(newsock->sock, + (struct sockaddr *)&newsock->saddr, + &newsock->addr_len, 0); + + num_interfaces = max(num_interfaces, newsock->number); + if (!current_interface && newsock->broadcast) + current_interface = newsock; + + /* Hook data_ready */ + newsock->sock->sk->sk_data_ready = cnxman_data_ready; + + /* Make an attempt to keep them in order */ + list_add_tail(&newsock->list, &socket_list); + + address_length = newsock->addr_len; + return 0; +} + +static int __cl_setsockopt(struct socket *sock, int level, int optname, + char *optval, int optlen, int flags) +{ + struct file *file; + struct cl_join_cluster_info join_info; + int error; + int leave_flags; + struct cl_multicast_sock multicast_info; + + if (optlen && !optval) + return -EINVAL; + + switch (optname) { + case CLU_SET_MULTICAST: + case CLU_SET_RCVONLY: + if (!capable(CAP_CLUSTER)) + return 
-EPERM; + + if (optlen != sizeof (struct cl_multicast_sock)) + return -EINVAL; + + if (atomic_read(&cnxman_running)) + return -EINVAL; + + error = -EBADF; + + if (copy_from_user(&multicast_info, optval, optlen)) + return -EFAULT; + + file = fget(multicast_info.fd); + if (file) { + struct inode *inode = file->f_dentry->d_inode; + + error = + add_clsock(optname == CLU_SET_MULTICAST, + multicast_info.number, SOCKET_I(inode), + file); + if (error) + fput(file); + } + return error; + + case CLU_SET_NODENAME: + if (!capable(CAP_CLUSTER)) + return -EPERM; + + if (atomic_read(&cnxman_running)) + return -EINVAL; + + if (optlen > MAX_CLUSTER_MEMBER_NAME_LEN) + return -EINVAL; + + if (copy_from_user(nodename, optval, optlen)) + return -EFAULT; + break; + + case CLU_JOIN_CLUSTER: + if (!capable(CAP_CLUSTER)) + return -EPERM; + + if (atomic_read(&cnxman_running)) + return -EALREADY; + + if (optlen != sizeof (struct cl_join_cluster_info)) + return -EINVAL; + + if (copy_from_user(&join_info, optval, optlen)) + return -EFAULT; + + if (strlen(join_info.cluster_name) > MAX_CLUSTER_NAME_LEN) + return -EINVAL; + + if (list_empty(&socket_list)) + return -ENOTCONN; + + set_votes(join_info.votes, join_info.expected_votes); + cluster_id = generate_cluster_id(join_info.cluster_name); + strncpy(cluster_name, join_info.cluster_name, MAX_CLUSTER_NAME_LEN); + two_node = join_info.two_node; + config_version = join_info.config_version; + + quit_threads = 0; + acks_expected = 0; + init_completion(&cluster_thread_comp); + init_completion(&member_thread_comp); + if (allocate_nodeid_array()) + return -ENOMEM; + + kcluster_pid = kernel_thread(cluster_kthread, NULL, 0); + if (kcluster_pid < 0) + return kcluster_pid; + + wait_for_completion(&cluster_thread_comp); + init_completion(&cluster_thread_comp); + + atomic_set(&cnxman_running, 1); + + /* Make sure we have a node name */ + if (nodename[0] == '\0') + strcpy(nodename, system_utsname.nodename); + + membership_pid = start_membership_services(kcluster_pid); + if (membership_pid < 0) { + quit_threads = 1; + wait_for_completion(&cluster_thread_comp); + init_completion(&member_thread_comp); + return membership_pid; + } + + sm_start(); + break; + + case CLU_LEAVE_CLUSTER: + if (!capable(CAP_CLUSTER)) + return -EPERM; + + if (optlen != sizeof (int)) + return -EINVAL; + + if (copy_from_user(&leave_flags, optval, optlen)) + return -EFAULT; + + if (!atomic_read(&cnxman_running)) + return -ENOTCONN; + + if (in_transition()) + return -EBUSY; + + /* Ignore the use count if FORCE is set */ + if (!(leave_flags & CLUSTER_LEAVEFLAG_FORCE)) { + if (atomic_read(&use_count)) + return -ENOTCONN; + } + + us->leave_reason = leave_flags; + quit_threads = 1; + wake_up_interruptible(&cnxman_waitq); + + wait_for_completion(&cluster_thread_comp); + break; + + default: + return -ENOPROTOOPT; + } + + return 0; +} + +static int cl_getsockopt(struct socket *sock, int level, int optname, + char *optval, int *optlen) +{ + struct sock *sk = sock->sk; + int err; + + lock_sock(sk); + err = __cl_getsockopt(sock, level, optname, optval, optlen, 0); + release_sock(sk); + + return err; +} + +static int __cl_getsockopt(struct socket *sock, int level, int optname, + char *optval, int *optlen, int flags) +{ + + switch (optname) { + default: + return -ENOPROTOOPT; + } + + return 0; +} + +/* We'll be giving out reward points next... */ +/* Send the packet and save a copy in case someone loses theirs. 
Should be + * protected by the send mutexphore */ +static int __send_and_save(struct cl_comms_socket *csock, struct msghdr *msg, + int size, int needack) +{ + mm_segment_t fs; + int result; + struct iovec save_vectors[msg->msg_iovlen]; + + /* Save a copy of the IO vectors as send_msg mucks around with them and + * we may want to send the same stuff out more than once (for different + * interfaces) + */ + memcpy(save_vectors, msg->msg_iov, + sizeof (struct iovec) * msg->msg_iovlen); + + fs = get_fs(); + set_fs(get_ds()); + + result = sock_sendmsg(csock->sock, msg, size); + + set_fs(fs); + + if (result >= 0 && acks_expected && needack) { + + /* Start retransmit timer if it didn't go */ + if (result == 0) { + start_short_timer(); + } + else { + resend_delay = 1; + } + } + + /* Restore IOVs */ + memcpy(msg->msg_iov, save_vectors, + sizeof (struct iovec) * msg->msg_iovlen); + + return result; +} + +static void resend_last_message() +{ + struct msghdr msg; + struct iovec vec[1]; + mm_segment_t fs; + int result; + + P_COMMS("%ld resending last message: %d bytes: port=%d, cmd=%d\n", + jiffies, saved_msg_len, saved_msg_buffer[0], + saved_msg_buffer[6]); + + /* Assume there is something wrong with the last interface */ + current_interface = get_next_interface(current_interface); + if (num_interfaces > 1) + printk(KERN_WARNING CMAN_NAME ": Now using interface %d\n", + current_interface->number); + + vec[0].iov_base = saved_msg_buffer; + vec[0].iov_len = saved_msg_len; + + memset(&msg, 0, sizeof (msg)); + msg.msg_name = &current_interface->saddr; + msg.msg_namelen = current_interface->addr_len; + msg.msg_iovlen = 1; + msg.msg_iov = vec; + + fs = get_fs(); + set_fs(get_ds()); + + result = sock_sendmsg(current_interface->sock, &msg, saved_msg_len); + + set_fs(fs); + + if (result < 0) + printk(KERN_ERR CMAN_NAME ": resend failed: %d\n", result); + + /* Try indefinitely to send this, the backlog must die down eventually + * !? */ + if (result == 0) + start_short_timer(); + + /* Send succeeded, continue waiting for ACKS */ + if (result > 0) + start_ack_timer(); + +} + +static int cl_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_cl *sin = (struct sockaddr_cl *) msg->msg_name; + struct cluster_sock *c = cluster_sk(sk); + struct sk_buff *skb; + int copied, err = 0; + int isoob = 0; + + /* Socket was notified of shutdown, remove any pending skbs and return + * EOF */ + if (!atomic_read(&cnxman_running)) { + while ((skb = skb_recv_datagram(sk, flags, MSG_DONTWAIT, &err))) + skb_free_datagram(sk, skb); + return 0; /* cnxman has left the building */ + } + + /* Generic datagram code does most of the work. If the user is not + * interested in OOB messages then ignore them */ + do { + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); + if (!skb) + goto out; + + /* Is it OOB */ + if (skb->cb[0] & 0x80) + isoob = 1; + else + isoob = 0; + + /* If it is and the user doesn't want it, then throw it away. */ + if (isoob && !(flags & MSG_OOB)) { + skb_free_datagram(sk, skb); + + /* If we peeked (?) 
an OOB but the user doesn't want it + then we need to discard it or we'll loop forever */ + if (flags & MSG_PEEK) { + skb = skb_recv_datagram(sk, flags & ~MSG_PEEK, + MSG_DONTWAIT, &err); + if (skb) + skb_free_datagram(sk, skb); + } + } + } + while (isoob && !(flags & MSG_OOB)); + + copied = skb->len; + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + err = memcpy_toiovec(msg->msg_iov, skb->data, copied); + + if (err) + goto out_free; + + if (msg->msg_name && msg->msg_namelen) { + memset(msg->msg_name, 0, msg->msg_namelen); + + if (msg->msg_namelen >= sizeof (struct sockaddr_cl)) { + + /* Nodeid is in native byte order - anything else is just + * perverse */ + memcpy(&sin->scl_nodeid, skb->cb + 1, sizeof(int)); + } + msg->msg_namelen = sizeof (struct sockaddr_cl); + sin->scl_port = c->port; + } + + /* Top bit set in cb[0] means this is an OOB message */ + if (skb->cb[0] & 0x80) { + msg->msg_flags |= MSG_OOB; + } + + sock_recv_timestamp(msg, sk, skb); + + err = copied; + + out_free: + skb_free_datagram(sk, skb); + + out: + return err; +} + +/* Send a message out on all interfaces */ +static int send_to_all_ints(int nodeid, struct msghdr *our_msg, int size, int flags) +{ + struct sockaddr_in6 daddr; + struct cl_comms_socket *clsock; + int result = 0; + + our_msg->msg_name = &daddr; + + list_for_each_entry(clsock, &socket_list, list) { + + /* Don't send out a recv-only socket */ + if (!clsock->recv_only) { + + /* For temporary node IDs send to the node's real IP address */ + if (nodeid < 0) { + get_addr_from_temp_nodeid(nodeid, (char *)&daddr, &our_msg->msg_namelen); + } + else { + memcpy(&daddr, &clsock->saddr, clsock->addr_len); + our_msg->msg_namelen = clsock->addr_len; + } + + result = __send_and_save(clsock, our_msg, + size + sizeof (struct cl_protheader), + !(flags & MSG_NOACK)); + } + } + return result; +} + + +/* Internal common send message routine */ +static int __sendmsg(struct socket *sock, struct msghdr *msg, int size, + unsigned char port) +{ + int result = 0, i; + int flags = msg->msg_flags; + struct msghdr our_msg; + struct sockaddr_cl *caddr = msg->msg_name; + struct cl_protheader header; + struct iovec vectors[msg->msg_iovlen + 1]; + int nodeid = 0; + + if (size > MAX_CLUSTER_MESSAGE) + return -EINVAL; + if (!atomic_read(&cnxman_running)) + return -ENOTCONN; + + if (caddr) + nodeid = caddr->scl_nodeid; + + /* Check that the node id (if present) is valid */ + if (msg->msg_namelen && (!find_node_by_nodeid(nodeid) && + !is_valid_temp_nodeid(nodeid))) { + return -ENOTCONN; + } + + /* We can only have one send outstanding at a time so we might as well + * lock the whole send mechanism */ + down(&send_lock); + + while ((port > HIGH_PROTECTED_PORT + && (!cluster_is_quorate || in_transition())) + || (acks_expected > 0 && !(msg->msg_flags & MSG_NOACK))) { + + DECLARE_WAITQUEUE(wq, current); + struct task_struct *tsk = current; + + if (flags & MSG_DONTWAIT) { + up(&send_lock); + return -EAGAIN; + } + + if (current->pid == kcluster_pid) { + P_COMMS + ("Tried to make kclusterd wait, port=%d, acks_count=%d, expected=%d\n", + port, ack_count, acks_expected); + up(&send_lock); + return -EAGAIN; + } + + P_COMMS("%s process waiting. 
acks=%d, expected=%d\n", tsk->comm, + ack_count, acks_expected); + + set_task_state(tsk, TASK_INTERRUPTIBLE); + add_wait_queue(&socket_waitq, &wq); + + if ((port > HIGH_PROTECTED_PORT + && (!cluster_is_quorate || in_transition())) + || (acks_expected > 0)) { + + up(&send_lock); + schedule(); + down(&send_lock); + } + + /* Going down */ + if (quit_threads) { + up(&send_lock); + return -ENOTCONN; + } + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&socket_waitq, &wq); + + if (signal_pending(current)) { + up(&send_lock); + return -ERESTARTSYS; + } + + /* Were we shut down in the meantime ? */ + if (!atomic_read(&cnxman_running)) { + up(&send_lock); + return -ENOTCONN; + } + + } + + memset(&our_msg, 0, sizeof (our_msg)); + + /* Build the header */ + header.port = port; + header.flags = msg->msg_flags >> 16; + header.cluster = cpu_to_le16(cluster_id); + header.srcid = us ? cpu_to_le32(us->node_id) : 0; + header.tgtid = caddr ? cpu_to_le32(nodeid) : 0; + + ++cur_seq; + header.seq = cpu_to_le16(cur_seq); + + /* Set the MULTICAST flag on messages with no particular destination */ + if (!msg->msg_namelen) { + header.flags |= MSG_MULTICAST >> 16; + header.tgtid = 0; + } + + /* Copy the existing iovecs into our array and add the header on at the + * beginning */ + vectors[0].iov_base = &header; + vectors[0].iov_len = sizeof (header); + for (i = 0; i < msg->msg_iovlen; i++) { + vectors[i + 1] = msg->msg_iov[i]; + } + + our_msg.msg_iovlen = msg->msg_iovlen + 1; + our_msg.msg_iov = vectors; + + /* Work out how many ACKS are wanted - *don't* reset acks_expected to + * zero if no acks are required as an ACK-needed message may still be + * outstanding */ + if (!(msg->msg_flags & MSG_NOACK)) { + if (msg->msg_namelen) + acks_expected = 1; /* Unicast */ + else + acks_expected = max(cluster_members - 1, 0); + + } + + P_COMMS + ("Sending message - tgt=%d port %d required %d acks, seq=%d, flags=%x\n", + nodeid, header.port, + (msg->msg_flags & MSG_NOACK) ? 
0 : acks_expected, + le16_to_cpu(header.seq), header.flags); + + /* Don't include temp nodeids in the message itself */ + if (header.tgtid < 0) + header.tgtid = 0; + + /* For non-member sends we use all the interfaces */ + if ((nodeid < 0) || (flags & MSG_ALLINT)) { + + result = send_to_all_ints(nodeid, &our_msg, size, msg->msg_flags); + } + else { + /* Send to only the current socket - resends will use the + * others if necessary */ + our_msg.msg_name = &current_interface->saddr; + our_msg.msg_namelen = current_interface->addr_len; + + result = + __send_and_save(current_interface, &our_msg, + size + sizeof (header), + !(msg->msg_flags & MSG_NOACK)); + } + + /* Make a note in each node's structure that it has been sent a message + * so we can see which ones went astray */ + if (!(flags & MSG_NOACK) && nodeid >= 0) { + if (msg->msg_namelen) { + struct cluster_node *node; + + node = find_node_by_nodeid(le32_to_cpu(header.tgtid)); + if (node) + node->last_seq_sent = cur_seq; + } + else { + struct cluster_node *node; + struct list_head *nodelist; + + list_for_each(nodelist, &cluster_members_list) { + node = + list_entry(nodelist, struct cluster_node, + list); + if (node->state == NODESTATE_MEMBER) { + node->last_seq_sent = cur_seq; + } + } + } + } + + /* Save a copy of the message if we're expecting an ACK */ + if (!(flags & MSG_NOACK) && acks_expected) { + mm_segment_t fs; + + fs = get_fs(); + set_fs(get_ds()); + + memcpy_fromiovec(saved_msg_buffer, our_msg.msg_iov, + size + sizeof (header)); + set_fs(fs); + + saved_msg_len = size + sizeof (header); + retry_count = ack_count = 0; + clear_bit(RESEND_NEEDED, &mainloop_flags); + + start_ack_timer(); + } + + up(&send_lock); + return result; +} + +static int queue_message(void *buf, int len, struct sockaddr_cl *caddr, + unsigned char port, int flags) +{ + struct queued_message *qmsg; + + qmsg = kmalloc(sizeof (struct queued_message), + (in_atomic() + || irqs_disabled())? GFP_ATOMIC : GFP_KERNEL); + if (qmsg == NULL) + return -1; + + memcpy(qmsg->msg_buffer, buf, len); + qmsg->msg_len = len; + if (caddr) { + memcpy(&qmsg->addr, caddr, sizeof (struct sockaddr_cl)); + qmsg->addr_len = sizeof (struct sockaddr_cl); + } + else { + qmsg->addr_len = 0; + } + qmsg->flags = flags; + qmsg->port = port; + qmsg->socket = NULL; + + down(&messages_list_lock); + list_add_tail(&qmsg->list, &messages_list); + up(&messages_list_lock); + + wake_up_interruptible(&cnxman_waitq); + + return 0; +} + +static int cl_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + struct cluster_sock *c = cluster_sk(sock->sk); + char *buffer; + int status; + int saved_iovlen; + uint8_t port; + struct iovec iov; + struct iovec *saved_iov; + struct sockaddr_cl *caddr = msg->msg_name; + + if (sock->sk->sk_protocol == CLPROTO_MASTER) + return -EOPNOTSUPP; + + port = c->port; + + /* Only capable users can override the port number */ + if (caddr && capable(CAP_CLUSTER) && caddr->scl_port) + port = caddr->scl_port; + + if (port == 0) + return -EDESTADDRREQ; + + /* Hmmm. On machines with segmented user/kernel space (sparc64, hppa & + * m68k AFAICT) we can't mix user and kernel space addresses in the + * IOV. This stymies __sendmsg a little as it tries to add a header to + * what could possibly be a userspace iov. So, here (where all the + * userspace sends come) we copy it to a kernel space buffer first. 
If + * performance is a big problem here then I might #ifdef it for the + * affected architectures but for now I think it will probably be OK */ + buffer = kmalloc(size, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + memcpy_fromiovec(buffer, msg->msg_iov, size); + iov.iov_len = size; + iov.iov_base = buffer; + + saved_iov = msg->msg_iov; + saved_iovlen = msg->msg_iovlen; + msg->msg_iov = &iov; + msg->msg_iovlen = 1; + + status = __sendmsg(sock, msg, size, port); + msg->msg_iov = saved_iov; + msg->msg_iovlen = saved_iovlen; + + kfree(buffer); + + return status; +} + +/* Kernel call to sendmsg */ +int kcl_sendmsg(struct socket *sock, void *buf, int size, + struct sockaddr_cl *caddr, int addr_len, unsigned int flags) +{ + struct iovec iovecs[1]; + struct msghdr msg; + struct cluster_sock *c = cluster_sk(sock->sk); + unsigned char port; + + if (size > MAX_CLUSTER_MESSAGE) + return -EINVAL; + if (!atomic_read(&cnxman_running)) + return -ENOTCONN; + + port = c->port; + if (caddr && caddr->scl_port) + port = caddr->scl_port; + + if (port == 0) + return -EDESTADDRREQ; + + /* If we have no process context then queue it up for kclusterd to + * send. */ + if (in_interrupt() || flags & MSG_QUEUE) { + return queue_message(buf, size, caddr, port, + flags & ~MSG_QUEUE); + } + + iovecs[0].iov_base = buf; + iovecs[0].iov_len = size; + + memset(&msg, 0, sizeof (msg)); + msg.msg_name = caddr; + msg.msg_namelen = addr_len; + msg.msg_iovlen = 1; + msg.msg_iov = iovecs; + msg.msg_flags = flags; + + return __sendmsg(sock, &msg, size, port); +} + +static int send_queued_message(struct queued_message *qmsg) +{ + struct iovec iovecs[1]; + struct msghdr msg; + + /* Don't send blocked messages */ + if (qmsg->port > HIGH_PROTECTED_PORT + && (!cluster_is_quorate || in_transition())) + return -EAGAIN; + + iovecs[0].iov_base = qmsg->msg_buffer; + iovecs[0].iov_len = qmsg->msg_len; + + memset(&msg, 0, sizeof (msg)); + msg.msg_name = qmsg->addr_len ? &qmsg->addr : NULL; + msg.msg_namelen = qmsg->addr_len; + msg.msg_iovlen = 1; + msg.msg_iov = iovecs; + msg.msg_flags = qmsg->flags; + + return __sendmsg(qmsg->socket, &msg, qmsg->msg_len, qmsg->port); +} + +int kcl_register_read_callback(struct socket *sock, + int (*routine) (char *, int, char *, int, + unsigned int)) +{ + struct cluster_sock *c = cluster_sk(sock->sk); + + c->kernel_callback = routine; + + return 0; +} + +/* Used where we are in kclusterd context and we can't allow the task to wait + * as we are also responsible for processing the ACKs that do the wake up. Try + * to send the message immediately and queue it if that's not possible */ +static int send_or_queue_message(void *buf, int len, struct sockaddr_cl *caddr, + unsigned char port) +{ + struct iovec iovecs[1]; + struct msghdr msg; + + int status; + + /* Don't send blocked messages */ + if (port > HIGH_PROTECTED_PORT + && (!cluster_is_quorate || in_transition())) { + return queue_message(buf, len, caddr, port, 0); + } + + iovecs[0].iov_base = buf; + iovecs[0].iov_len = len; + + memset(&msg, 0, sizeof (msg)); + msg.msg_name = caddr; + msg.msg_namelen = caddr ? sizeof (struct sockaddr_cl) : 0; + msg.msg_iovlen = 1; + msg.msg_iov = iovecs; + msg.msg_flags = MSG_DONTWAIT; + + status = __sendmsg(NULL, &msg, len, port); + + /* Did it work ? 
*/ + if (status > 0) { + return 0; + } + + /* Failure other than EAGAIN is fatal */ + if (status != -EAGAIN) { + return status; + } + + return queue_message(buf, len, caddr, port, 0); +} + +/* Send a listen request to a node */ +static void send_listen_request(int nodeid, unsigned char port) +{ + struct cl_listenmsg listenmsg; + struct sockaddr_cl caddr; + + memset(&caddr, 0, sizeof (caddr)); + + /* Build the header */ + listenmsg.cmd = CLUSTER_CMD_LISTENREQ; + listenmsg.target_port = port; + listenmsg.listening = 0; + listenmsg.tag = current->pid; + + caddr.scl_family = AF_CLUSTER; + caddr.scl_port = 0; + caddr.scl_nodeid = nodeid; + + send_or_queue_message(&listenmsg, sizeof(listenmsg), &caddr, 0); + return; +} + +/* Return 1 or 0 to indicate if we have a listener on the requested port */ +static void send_listen_response(struct cl_comms_socket *csock, int nodeid, + unsigned char port, unsigned short tag) +{ + struct cl_listenmsg listenmsg; + struct sockaddr_cl caddr; + int status; + + memset(&caddr, 0, sizeof (caddr)); + + /* Build the message */ + listenmsg.cmd = CLUSTER_CMD_LISTENRESP; + listenmsg.target_port = port; + listenmsg.tag = tag; + listenmsg.listening = (port_array[port] != 0) ? 1 : 0; + + caddr.scl_family = AF_CLUSTER; + caddr.scl_port = 0; + caddr.scl_nodeid = nodeid; + + status = send_or_queue_message(&listenmsg, + sizeof (listenmsg), + &caddr, 0); + + return; +} + +/* Send an ACK */ +static int cl_sendack(struct cl_comms_socket *csock, unsigned short seq, + int addr_len, char *addr, unsigned char remport, + unsigned char flag) +{ + mm_segment_t fs; + struct iovec vec; + struct cl_ackmsg ackmsg; + struct msghdr msg; + struct sockaddr_in6 daddr; + int result; + +#ifdef DEBUG_COMMS + char buf[MAX_ADDR_PRINTED_LEN]; + + P_COMMS("Sending ACK to %s, seq=%d\n", + print_addr(addr, address_length, buf), le16_to_cpu(seq)); +#endif + + if (addr) { + memcpy(&daddr, addr, addr_len); + } + else { + memcpy(&daddr, &csock->saddr, csock->addr_len); + addr_len = csock->addr_len; + } + + /* Build the header */ + ackmsg.header.port = 0; /* Protocol port */ + ackmsg.header.seq = 0; + ackmsg.header.flags = MSG_NOACK >> 16; + ackmsg.header.cluster = cpu_to_le16(cluster_id); + ackmsg.header.srcid = us ? 
cpu_to_le32(us->node_id) : 0; + ackmsg.header.tgtid = 0; /* ACKS are unicast so we don't bother + * to look this up */ + ackmsg.cmd = CLUSTER_CMD_ACK; + ackmsg.remport = remport; + ackmsg.aflags = flag; + ackmsg.seq = seq; /* Already in LE order */ + vec.iov_base = &ackmsg; + vec.iov_len = sizeof (ackmsg); + + memset(&msg, 0, sizeof (msg)); + msg.msg_name = &daddr; + msg.msg_namelen = addr_len; + msg.msg_iovlen = 1; + msg.msg_iov = &vec; + + fs = get_fs(); + set_fs(get_ds()); + + result = sock_sendmsg(csock->sock, &msg, sizeof (ackmsg)); + + set_fs(fs); + + if (result < 0) + printk(KERN_CRIT CMAN_NAME ": error sending ACK: %d\n", result); + + return result; + +} + +/* Wait for all ACKS to be gathered */ +void kcl_wait_for_all_acks() +{ + while (ack_count < acks_expected) { + + DECLARE_WAITQUEUE(wq, current); + struct task_struct *tsk = current; + + set_task_state(tsk, TASK_INTERRUPTIBLE); + add_wait_queue(&socket_waitq, &wq); + + if (ack_count < acks_expected) { + schedule(); + } + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&socket_waitq, &wq); + } +} + +/* Send a closedown OOB message to all cluster nodes - this tells them that a + * port listener has gone away */ +static void send_port_close_oob(unsigned char port) +{ + struct cl_closemsg closemsg; + + /* Build the header */ + closemsg.cmd = CLUSTER_CMD_PORTCLOSED; + closemsg.port = port; + + send_or_queue_message(&closemsg, sizeof (closemsg), NULL, 0); + return; +} + +/* A remote port has been closed - post an OOB message to the local listen on + * that port (if there is one) */ +static void post_close_oob(unsigned char port, int nodeid) +{ + struct cl_portclosed_oob *oobmsg; + struct sk_buff *skb; + struct sock *sock = port_array[port]; + + if (!sock) { + return; /* No-one listening */ + } + + skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL); + if (!skb) + return; + + skb_put(skb, sizeof (*oobmsg)); + oobmsg = (struct cl_portclosed_oob *) skb->data; + oobmsg->port = port; + oobmsg->cmd = CLUSTER_OOB_MSG_PORTCLOSED; + skb->cb[0] = 0x80; + memcpy(skb->cb + 1, &nodeid, sizeof(int)); + + sock_queue_rcv_skb(sock, skb); + +} + +/* Leave the cluster */ +static void node_shutdown() +{ + struct cl_barrier *barrier; + struct list_head *blist; + struct list_head *temp; + struct list_head *socklist; + struct cl_client_socket *csock; + struct sk_buff *null_skb; + + printk(KERN_INFO CMAN_NAME ": we are leaving the cluster\n"); + + atomic_set(&cnxman_running, 0); + unjam(); + + /* Notify kernel listeners first */ + notify_kernel_listeners(LEAVING, 0); + + /* Notify client sockets */ + down(&client_socket_lock); + list_for_each_safe(socklist, temp, &client_socket_list) { + csock = list_entry(socklist, struct cl_client_socket, list); + + null_skb = alloc_skb(0, GFP_KERNEL); + if (null_skb) + sock_queue_rcv_skb(csock->sock->sk, null_skb); + list_del(&csock->list); + kfree(csock); + } + up(&client_socket_lock); + we_are_a_cluster_member = 0; + + sm_stop(1); + + /* Wake up any processes waiting for barriers */ + down(&barrier_list_lock); + list_for_each(blist, &barrier_list) { + barrier = list_entry(blist, struct cl_barrier, list); + + /* Cancel any timers */ + if (timer_pending(&barrier->timer)) + del_timer(&barrier->timer); + + /* Force it to be auto-delete so it discards itself */ + if (barrier->state == BARRIER_STATE_WAITING) { + barrier->flags |= BARRIER_ATTR_AUTODELETE; + wake_up_interruptible(&barrier->waitq); + } + else { + if (barrier->callback) { + barrier->callback(barrier->name, -ENOTCONN); + barrier->callback = NULL; + } + } + } + 
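/* All barriers have now been woken or handed their callbacks; drop the + * list lock, then wake any processes still waiting on ISLISTENING + * requests below so they notice that cnxman is shutting down. */ + 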
up(&barrier_list_lock); + + /* Wake up any processes waiting for ISLISTENING requests */ + down(&listenreq_lock); + list_for_each(blist, &listenreq_list) { + struct cl_waiting_listen_request *lrequest = + list_entry(blist, struct cl_waiting_listen_request, list); + + if (lrequest->waiting) + wake_up_interruptible(&lrequest->waitq); + } + up(&listenreq_lock); +} + +static void free_cluster_sockets() +{ + struct list_head *socklist; + struct cl_comms_socket *sock; + struct list_head *temp; + + list_for_each_safe(socklist, temp, &socket_list) { + sock = list_entry(socklist, struct cl_comms_socket, list); + + list_del(&sock->list); + fput(sock->file); + kfree(sock); + } + num_interfaces = 0; + current_interface = NULL; +} + +/* Tidy up after all the rest of the cluster bits have shut down */ +static void node_cleanup() +{ + struct list_head *nodelist; + struct list_head *proclist; + struct list_head *temp; + struct list_head *socklist; + struct list_head *blist; + struct cl_comms_socket *sock; + struct kernel_notify_struct *knotify; + + /* Free list of kernel listeners */ + list_for_each_safe(proclist, temp, &kernel_listener_list) { + knotify = + list_entry(proclist, struct kernel_notify_struct, list); + list_del(&knotify->list); + kfree(knotify); + } + + /* Mark the sockets as busy so they don't get added to the active + * sockets list in the next few lines of code before we free them */ + list_for_each_safe(socklist, temp, &socket_list) { + sock = list_entry(socklist, struct cl_comms_socket, list); + + set_bit(1, &sock->active); + } + + /* Tidy the active sockets list */ + list_for_each_safe(socklist, temp, &active_socket_list) { + sock = + list_entry(socklist, struct cl_comms_socket, active_list); + list_del(&sock->active_list); + } + + /* Free the memory allocated to cluster nodes */ + free_nodeid_array(); + down(&cluster_members_lock); + us = NULL; + list_for_each_safe(nodelist, temp, &cluster_members_list) { + + struct list_head *addrlist; + struct list_head *addrtemp; + struct cluster_node *node; + struct cluster_node_addr *nodeaddr; + + node = list_entry(nodelist, struct cluster_node, list); + + list_for_each_safe(addrlist, addrtemp, &node->addr_list) { + nodeaddr = + list_entry(addrlist, struct cluster_node_addr, + list); + + list_del(&nodeaddr->list); + kfree(nodeaddr); + } + list_del(&node->list); + kfree(node->name); + kfree(node); + } + cluster_members = 0; + up(&cluster_members_lock); + + /* Free the memory allocated to the outgoing sockets */ + free_cluster_sockets(); + + /* Make sure that all the barriers are deleted */ + down(&barrier_list_lock); + list_for_each_safe(blist, temp, &barrier_list) { + struct cl_barrier *barrier = + list_entry(blist, struct cl_barrier, list); + + list_del(&barrier->list); + kfree(barrier); + } + up(&barrier_list_lock); + + kcluster_pid = 0; + clear_bit(RESEND_NEEDED, &mainloop_flags); + acks_expected = 0; +} + +/* If "cluster_is_quorate" is 0 then all activity apart from protected ports is + * blocked. 
*/ +void set_quorate(int total_votes) +{ + int quorate; + + if (get_quorum() > total_votes) { + quorate = 0; + } + else { + quorate = 1; + } + + /* Hide messages during startup state transition */ + if (we_are_a_cluster_member) { + if (cluster_is_quorate && !quorate) + printk(KERN_CRIT CMAN_NAME + ": quorum lost, blocking activity\n"); + if (!cluster_is_quorate && quorate) + printk(KERN_CRIT CMAN_NAME + ": quorum regained, resuming activity\n"); + } + cluster_is_quorate = quorate; + + /* Wake up any sleeping processes */ + if (cluster_is_quorate) { + unjam(); + } + +} + +void queue_oob_skb(struct socket *sock, int cmd) +{ + struct sk_buff *skb; + struct cl_portclosed_oob *oobmsg; + + skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL); + if (!skb) + return; + + skb_put(skb, sizeof (*oobmsg)); + oobmsg = (struct cl_portclosed_oob *) skb->data; + oobmsg->port = 0; + oobmsg->cmd = cmd; + + /* There is no remote node associated with this so + clear out the field to avoid any accidents */ + memset(skb->cb, 0, sizeof(int)); + skb->cb[0] = 0x80; + + sock_queue_rcv_skb(sock->sk, skb); +} + +/* Notify interested parties that the cluster configuration has changed */ +void notify_listeners() +{ + struct notify_struct *notify; + struct list_head *proclist; + struct list_head *socklist; + struct list_head *temp; + + /* Do kernel listeners first */ + notify_kernel_listeners(CLUSTER_RECONFIG, 0); + + /* Now we deign to tell userspace */ + down(&event_listener_lock); + list_for_each_safe(proclist, temp, &event_listener_list) { + notify = list_entry(proclist, struct notify_struct, list); + + /* If the kill fails then remove the process from the list */ + if (kill_proc(notify->pid, notify->signal, 0) == -ESRCH) { + list_del(&notify->list); + kfree(notify); + } + } + up(&event_listener_lock); + + /* Tell userspace processes which want OOB messages */ + down(&client_socket_lock); + list_for_each(socklist, &client_socket_list) { + struct cl_client_socket *csock; + csock = list_entry(socklist, struct cl_client_socket, list); + queue_oob_skb(csock->sock, CLUSTER_OOB_MSG_STATECHANGE); + } + up(&client_socket_lock); +} + +/* This fills in the list of all addresses for the local node */ +void get_local_addresses(struct cluster_node *node) +{ + struct list_head *socklist; + struct cl_comms_socket *sock; + + list_for_each(socklist, &socket_list) { + sock = list_entry(socklist, struct cl_comms_socket, list); + + if (sock->recv_only) { + add_node_address(node, (char *) &sock->saddr, address_length); + } + } +} + + +static uint16_t generate_cluster_id(char *name) +{ + int i; + int value = 0; + + for (i = 0; i < strlen(name); i++) + value += name[i]; + + return value & 0xFFFF; +} + +static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur) +{ + int next; + struct list_head *socklist; + + next = cur->number + 1; + if (next > num_interfaces) + next = 1; + + /* Find the socket with this number, I could optimise this by starting + * at the current i/f but most systems are going to have a small number + * of them anyway */ + list_for_each(socklist, &socket_list) { + struct cl_comms_socket *sock; + sock = list_entry(socklist, struct cl_comms_socket, list); + + if (!sock->recv_only && sock->number == next) + return sock; + } + + BUG(); + return NULL; +} + +/* MUST be called with the barrier list lock held */ +static struct cl_barrier *find_barrier(char *name) +{ + struct list_head *blist; + struct cl_barrier *bar; + + list_for_each(blist, &barrier_list) { + bar = list_entry(blist, struct cl_barrier, list); + + if (strcmp(name, bar->name) == 0) + return bar; + } + return NULL; +} + +/* Do the stuff we need to do when the barrier has completed phase 1 */ +static void check_barrier_complete_phase1(struct cl_barrier *barrier) +{ + 
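/* Phase 1 completes once got_nodes reaches the expected count (or the + * current cluster size when expected_nodes is 0); we then broadcast a + * BARRIER_COMPLETE message and move to phase 2. */ + 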
if (atomic_read(&barrier->got_nodes) == ((barrier->expected_nodes != 0) + ? barrier->expected_nodes : + cluster_members)) { + + struct cl_barriermsg bmsg; + + atomic_inc(&barrier->completed_nodes); /* We have completed */ + barrier->phase = 2; /* Wait for complete phase II */ + + /* Send completion message, remember: we are in cnxman context + * and must not block */ + bmsg.cmd = CLUSTER_CMD_BARRIER; + bmsg.subcmd = BARRIER_COMPLETE; + bmsg.flags = 0; + strcpy(bmsg.name, barrier->name); + + P_BARRIER("Sending COMPLETE for %s\n", barrier->name); + queue_message((char *) &bmsg, sizeof (bmsg), NULL, 0, 0); + } +} + +/* Do the stuff we need to do when the barrier has been reached */ +/* Return 1 if we deleted the barrier */ +static int check_barrier_complete_phase2(struct cl_barrier *barrier, int status) +{ + spin_lock_irq(&barrier->phase2_spinlock); + + if (barrier->state != BARRIER_STATE_COMPLETE && + (status == -ETIMEDOUT || + atomic_read(&barrier->completed_nodes) == + ((barrier->expected_nodes != 0) + ? barrier->expected_nodes : cluster_members))) { + + if (status == 0 && barrier->timeout) + del_timer(&barrier->timer); + barrier->endreason = status; + + /* Wake up listener */ + if (barrier->state == BARRIER_STATE_WAITING) { + wake_up_interruptible(&barrier->waitq); + } + else { + /* Additional tasks we have to do if the user was not + * waiting... */ + /* Call the callback */ + if (barrier->callback) { + barrier->callback(barrier->name, 0); + barrier->callback = NULL; + } + /* Remove it if it's AUTO-DELETE */ + if (barrier->flags & BARRIER_ATTR_AUTODELETE) { + list_del(&barrier->list); + spin_unlock_irq(&barrier->phase2_spinlock); + kfree(barrier); + return 1; + } + } + barrier->state = BARRIER_STATE_COMPLETE; + } + spin_unlock_irq(&barrier->phase2_spinlock); + return 0; +} + +/* Called if a barrier timeout happens */ +static void barrier_timer_fn(unsigned long arg) +{ + struct cl_barrier *barrier = (struct cl_barrier *) arg; + + /* Ignore any further messages, they are too late. */ + barrier->phase = 0; + + /* and cause it to timeout */ + check_barrier_complete_phase2(barrier, -ETIMEDOUT); +} + +/* Process BARRIER messages from other nodes */ +static void process_barrier_msg(struct cl_barriermsg *msg, + struct cluster_node *node) +{ + struct cl_barrier *barrier; + + down(&barrier_list_lock); + barrier = find_barrier(msg->name); + up(&barrier_list_lock); + + /* Ignore other people's messages, in_transition() is needed here so + * that joining nodes will see their barrier messages before + * we_are_a_cluster_member is set */ + if (!we_are_a_cluster_member && !in_transition()) + return; + if (!barrier) + return; + + P_BARRIER("Got %d for %s, from node %s\n", msg->subcmd, msg->name, + node ? 
node->name : "unknown"); + + switch (msg->subcmd) { + case BARRIER_WAIT: + down(&barrier->lock); + if (barrier->phase == 0) + barrier->phase = 1; + + if (barrier->phase == 1) { + atomic_inc(&barrier->got_nodes); + check_barrier_complete_phase1(barrier); + } + else { + printk(KERN_WARNING CMAN_NAME + ": got WAIT barrier not in phase 1 %s (%d)\n", + msg->name, barrier->phase); + + } + up(&barrier->lock); + break; + + case BARRIER_COMPLETE: + down(&barrier->lock); + atomic_inc(&barrier->completed_nodes); + + /* First node to get all the WAIT messages sends COMPLETE, so + * we all complete */ + if (barrier->phase == 1) { + atomic_set(&barrier->got_nodes, + barrier->expected_nodes); + check_barrier_complete_phase1(barrier); + } + + if (barrier->phase == 2) { + /* If it was deleted (ret==1) then no need to unlock + * the mutex */ + if (check_barrier_complete_phase2(barrier, 0) == 1) + return; + } + up(&barrier->lock); + break; + } +} + +/* In-kernel membership API */ +int kcl_add_callback(void (*callback) (kcl_callback_reason, long arg)) +{ + struct kernel_notify_struct *notify; + + notify = kmalloc(sizeof (struct kernel_notify_struct), GFP_KERNEL); + if (!notify) + return -ENOMEM; + notify->callback = callback; + + down(&kernel_listener_lock); + list_add(¬ify->list, &kernel_listener_list); + up(&kernel_listener_lock); + + return 0; +} + +int kcl_remove_callback(void (*callback) (kcl_callback_reason, long arg)) +{ + struct list_head *calllist; + struct list_head *temp; + struct kernel_notify_struct *notify; + + down(&kernel_listener_lock); + list_for_each_safe(calllist, temp, &kernel_listener_list) { + notify = list_entry(calllist, struct kernel_notify_struct, list); + if (notify->callback == callback){ + list_del(¬ify->list); + kfree(notify); + up(&kernel_listener_lock); + return 0; + } + } + up(&kernel_listener_lock); + return -EINVAL; +} + +/* Return quorate status */ +int kcl_is_quorate() +{ + return cluster_is_quorate; +} + +/* Return the address list for a node */ +struct list_head *kcl_get_node_addresses(int nodeid) +{ + struct cluster_node *node = find_node_by_nodeid(nodeid); + + if (node) + return &node->addr_list; + else + return NULL; +} + +static void copy_to_kclnode(struct cluster_node *node, + struct kcl_cluster_node *knode) +{ + strcpy(knode->name, node->name); + knode->size = sizeof (struct kcl_cluster_node); + knode->votes = node->votes; + knode->state = node->state; + knode->node_id = node->node_id; + knode->us = node->us; + knode->leave_reason = node->leave_reason; + knode->incarnation = node->incarnation; +} + +/* Return the info for a node given it's address. if addr is NULL then return + * OUR info */ +int kcl_get_node_by_addr(unsigned char *addr, int addr_len, + struct kcl_cluster_node *n) +{ + struct cluster_node *node; + + /* They want us */ + if (addr == NULL) { + node = us; + } + else { + node = find_node_by_addr(addr, addr_len); + if (!node) + return -1; + } + + /* Copy to user's buffer */ + copy_to_kclnode(node, n); + return 0; +} + +int kcl_get_node_by_name(unsigned char *name, struct kcl_cluster_node *n) +{ + struct cluster_node *node; + + /* They want us */ + if (name == NULL) { + node = us; + if (node == NULL) + return -1; + } + else { + node = find_node_by_name(name); + if (!node) + return -1; + } + + /* Copy to user's buffer */ + copy_to_kclnode(node, n); + return 0; +} + +/* As above but by node id. 
MUCH faster */ +int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n) +{ + struct cluster_node *node; + + /* They want us */ + if (nodeid == 0) { + node = us; + if (node == NULL) + return -1; + } + else { + node = find_node_by_nodeid(nodeid); + if (!node) + return -1; + } + + /* Copy to user's buffer */ + copy_to_kclnode(node, n); + return 0; +} + +/* Return a list of all cluster members ever */ +int kcl_get_all_members(struct list_head *list) +{ + struct list_head *nodelist; + struct cluster_node *node; + struct kcl_cluster_node *newnode; + int num_nodes = 0; + + down(&cluster_members_lock); + list_for_each(nodelist, &cluster_members_list) { + if (list) { + node = list_entry(nodelist, struct cluster_node, list); + newnode = + kmalloc(sizeof (struct kcl_cluster_node), + GFP_KERNEL); + if (newnode) { + copy_to_kclnode(node, newnode); + list_add(&newnode->list, list); + num_nodes++; + } + } + else { + num_nodes++; + } + } + up(&cluster_members_lock); + + return num_nodes; +} + +/* Return a list of cluster members */ +int kcl_get_members(struct list_head *list) +{ + struct list_head *nodelist; + struct cluster_node *node; + struct kcl_cluster_node *newnode; + int num_nodes = 0; + + down(&cluster_members_lock); + list_for_each(nodelist, &cluster_members_list) { + node = list_entry(nodelist, struct cluster_node, list); + + if (node->state == NODESTATE_MEMBER) { + if (list) { + newnode = + kmalloc(sizeof (struct kcl_cluster_node), + GFP_KERNEL); + if (newnode) { + copy_to_kclnode(node, newnode); + list_add(&newnode->list, list); + num_nodes++; + } + } + else { + num_nodes++; + } + } + } + up(&cluster_members_lock); + + return num_nodes; +} + +/* Copy current members' nodeids into buffer */ +int kcl_get_member_ids(uint32_t *idbuf, int size) +{ + struct list_head *nodelist; + struct cluster_node *node; + int num_nodes = 0; + + down(&cluster_members_lock); + list_for_each(nodelist, &cluster_members_list) { + node = list_entry(nodelist, struct cluster_node, list); + + if (node->state == NODESTATE_MEMBER) { + if (idbuf && size) { + idbuf[num_nodes] = node->node_id; + num_nodes++; + size--; + } + else { + num_nodes++; + } + } + } + up(&cluster_members_lock); + + return num_nodes; +} + +/* Barrier API */ +int kcl_barrier_register(char *name, unsigned int flags, unsigned int nodes) +{ + struct cl_barrier *barrier; + + /* We are not joined to a cluster */ + if (!we_are_a_cluster_member) + return -ENOTCONN; + + /* Must have a valid name */ + if (name == NULL || strlen(name) > MAX_BARRIER_NAME_LEN - 1) + return -EINVAL; + + /* We don't do this yet */ + if (flags & BARRIER_ATTR_MULTISTEP) + return -ENOTSUPP; + + down(&barrier_list_lock); + + /* See if it already exists */ + if ((barrier = find_barrier(name))) { + up(&barrier_list_lock); + if (nodes != barrier->expected_nodes) { + printk(KERN_WARNING CMAN_NAME + ": Barrier registration failed for '%s', expected nodes=%d, requested=%d\n", + name, barrier->expected_nodes, nodes); + return -EINVAL; + } + else + return 0; + } + + /* Build a new struct and add it to the list */ + barrier = kmalloc(sizeof (struct cl_barrier), GFP_KERNEL); + if (barrier == NULL) { + up(&barrier_list_lock); + return -ENOMEM; + } + memset(barrier, 0, sizeof (*barrier)); + + strcpy(barrier->name, name); + barrier->flags = flags; + barrier->expected_nodes = nodes; + atomic_set(&barrier->got_nodes, 0); + atomic_set(&barrier->completed_nodes, 0); + barrier->endreason = 0; + barrier->registered_nodes = 1; + spin_lock_init(&barrier->phase2_spinlock); 
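+ /* A new barrier starts out INACTIVE; enabling it via + * kcl_barrier_setattr() sends the WAIT message, and + * kcl_barrier_wait() moves it to the WAITING state. */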
+ barrier->state = BARRIER_STATE_INACTIVE; + init_MUTEX(&barrier->lock); + + list_add(&barrier->list, &barrier_list); + up(&barrier_list_lock); + + return 0; +} + +static int barrier_setattr_enabled(struct cl_barrier *barrier, + unsigned int attr, unsigned long arg) +{ + int status; + + /* Can't disable a barrier */ + if (!arg) { + up(&barrier->lock); + return -EINVAL; + } + + /* We need to send WAIT now because the user may not + * actually call kcl_barrier_wait() */ + if (!barrier->waitsent) { + struct cl_barriermsg bmsg; + + /* Send it to the rest of the cluster */ + bmsg.cmd = CLUSTER_CMD_BARRIER; + bmsg.subcmd = BARRIER_WAIT; + strcpy(bmsg.name, barrier->name); + + barrier->waitsent = 1; + barrier->phase = 1; + + atomic_inc(&barrier->got_nodes); + + /* Start the timer if one was wanted */ + if (barrier->timeout) { + init_timer(&barrier->timer); + barrier->timer.function = barrier_timer_fn; + barrier->timer.data = (long) barrier; + mod_timer(&barrier->timer, jiffies + (barrier->timeout * HZ)); + } + + /* Barrier WAIT and COMPLETE messages are + * always queued - that way they always get + * sent out in the right order. If we don't do + * this then one can get sent out in the + * context of the user process and the other in + * cnxman and COMPLETE may /just/ slide in + * before WAIT if it's in the queue + */ + P_BARRIER("Sending WAIT for %s\n", barrier->name); + status = queue_message(&bmsg, sizeof (bmsg), NULL, 0, 0); + if (status < 0) { + up(&barrier->lock); + return status; + } + + /* It might have been reached now */ + if (barrier + && barrier->state != BARRIER_STATE_COMPLETE + && barrier->phase == 1) + check_barrier_complete_phase1(barrier); + } + if (barrier && barrier->state == BARRIER_STATE_COMPLETE) { + up(&barrier->lock); + return barrier->endreason; + } + up(&barrier->lock); + return 0; /* Nothing to propagate */ +} + +int kcl_barrier_setattr(char *name, unsigned int attr, unsigned long arg) +{ + struct cl_barrier *barrier; + + /* See if it already exists */ + down(&barrier_list_lock); + if (!(barrier = find_barrier(name))) { + up(&barrier_list_lock); + return -ENOENT; + } + up(&barrier_list_lock); + + down(&barrier->lock); + if (barrier->state == BARRIER_STATE_COMPLETE) { + up(&barrier->lock); + return 0; + } + + switch (attr) { + case BARRIER_SETATTR_AUTODELETE: + if (arg) + barrier->flags |= BARRIER_ATTR_AUTODELETE; + else + barrier->flags &= ~BARRIER_ATTR_AUTODELETE; + up(&barrier->lock); + return 0; + + case BARRIER_SETATTR_TIMEOUT: + /* Can only change the timeout of an inactive barrier */ + if (barrier->state == BARRIER_STATE_WAITING + || barrier->waitsent) { + up(&barrier->lock); + return -EINVAL; + } + barrier->timeout = arg; + up(&barrier->lock); + return 0; + + case BARRIER_SETATTR_MULTISTEP: + up(&barrier->lock); + return -ENOTSUPP; + + case BARRIER_SETATTR_ENABLED: + return barrier_setattr_enabled(barrier, attr, arg); + + case BARRIER_SETATTR_NODES: + /* Can only change the expected node count of an inactive + * barrier */ + if (barrier->state == BARRIER_STATE_WAITING + || barrier->waitsent) { + up(&barrier->lock); + return -EINVAL; + } + barrier->expected_nodes = arg; + break; + + case BARRIER_SETATTR_CALLBACK: + if (barrier->state == BARRIER_STATE_WAITING + || barrier->waitsent) { + up(&barrier->lock); + return -EINVAL; + } + barrier->callback = (void (*)(char *, int)) arg; + up(&barrier->lock); + return 0; /* Don't propagate this to other nodes */ + } + + up(&barrier->lock); + return 0; +} + +int kcl_barrier_delete(char *name) +{ + struct cl_barrier *barrier; + + down(&barrier_list_lock); + /* See if it exists */ + if 
(!(barrier = find_barrier(name))) { + up(&barrier_list_lock); + return -ENOENT; + } + + /* Delete it */ + list_del(&barrier->list); + kfree(barrier); + + up(&barrier_list_lock); + + return 0; +} + +int kcl_barrier_cancel(char *name) +{ + struct cl_barrier *barrier; + + /* See if it exists */ + down(&barrier_list_lock); + if (!(barrier = find_barrier(name))) { + up(&barrier_list_lock); + return -ENOENT; + } + down(&barrier->lock); + + barrier->endreason = -ENOTCONN; + + if (barrier->callback) { + barrier->callback(barrier->name, -ECONNRESET); + barrier->callback = NULL; + } + + if (barrier->timeout) + del_timer(&barrier->timer); + + /* Remove it if it's AUTO-DELETE */ + if (barrier->flags & BARRIER_ATTR_AUTODELETE) { + list_del(&barrier->list); + up(&barrier->lock); + kfree(barrier); + up(&barrier_list_lock); + return 0; + } + + if (barrier->state == BARRIER_STATE_WAITING) + wake_up_interruptible(&barrier->waitq); + + up(&barrier->lock); + up(&barrier_list_lock); + return 0; +} + +int kcl_barrier_wait(char *name) +{ + struct cl_barrier *barrier; + int ret; + + if (!atomic_read(&cnxman_running)) + return -ENOTCONN; + + /* Enable it */ + kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, 1L); + + down(&barrier_list_lock); + + /* See if it still exists - enable may have deleted it! */ + if (!(barrier = find_barrier(name))) { + up(&barrier_list_lock); + return -ENOENT; + } + + down(&barrier->lock); + + up(&barrier_list_lock); + + /* If it has already completed then return the status */ + if (barrier->state == BARRIER_STATE_COMPLETE) { + up(&barrier->lock); + return barrier->endreason; + } + + barrier->state = BARRIER_STATE_WAITING; + + /* Have we all reached the barrier? */ + while (atomic_read(&barrier->completed_nodes) != + ((barrier->expected_nodes == 0) + ? cluster_members : barrier->expected_nodes) + && barrier->endreason == 0) { + + wait_queue_t wq; + + init_waitqueue_entry(&wq, current); + init_waitqueue_head(&barrier->waitq); + + /* Wait for em all */ + set_task_state(current, TASK_INTERRUPTIBLE); + add_wait_queue(&barrier->waitq, &wq); + + if (atomic_read(&barrier->completed_nodes) != + ((barrier->expected_nodes == + 0) ? cluster_members : barrier->expected_nodes) + && barrier->endreason == 0) { + up(&barrier->lock); + schedule(); + down(&barrier->lock); + } + + remove_wait_queue(&barrier->waitq, &wq); + set_task_state(current, TASK_RUNNING); + + if (signal_pending(current)) { + barrier->endreason = -EINTR; + break; + } + } + barrier->state = BARRIER_STATE_INACTIVE; + + if (barrier->timeout) + del_timer(&barrier->timer); + + /* Barrier has been reached on all nodes, call the callback */ + if (barrier->callback) { + barrier->callback(barrier->name, barrier->endreason); + barrier->callback = NULL; + } + + atomic_set(&barrier->got_nodes, 0); + + /* Return the reason we were woken */ + ret = barrier->endreason; + + /* Remove it if it's AUTO-DELETE */ + if (barrier->flags & BARRIER_ATTR_AUTODELETE) { + down(&barrier_list_lock); + list_del(&barrier->list); + up(&barrier_list_lock); + up(&barrier->lock); + kfree(barrier); + } + else { + up(&barrier->lock); + } + + /* We were woken up because the node left the cluster ? 
*/ + if (!atomic_read(&cnxman_running)) + ret = -ENOTCONN; + + return ret; +} + +/* This is called from membership services when a node has left the cluster - + * we signal all waiting barriers with -ESRCH so they know to do something + * else. If the expected number of nodes was left at 0 then we compare the new + * number of nodes in the cluster with the number registered at the barrier + * and return 0 (success) if they match */ +void check_barrier_returns() +{ + struct list_head *blist; + struct list_head *llist; + struct cl_barrier *barrier; + int status = 0; + + down(&barrier_list_lock); + list_for_each(blist, &barrier_list) { + barrier = list_entry(blist, struct cl_barrier, list); + + if (barrier->waitsent) { + int wakeit = 0; + + /* Check for a dynamic member barrier */ + if (barrier->expected_nodes == 0) { + if (barrier->registered_nodes == + cluster_members) { + status = 0; + wakeit = 1; + } + } + else { + status = -ESRCH; + wakeit = 1; + } + + /* Do we need to tell the barrier? */ + if (wakeit) { + if (barrier->state == BARRIER_STATE_WAITING) { + barrier->endreason = status; + wake_up_interruptible(&barrier->waitq); + } + else { + if (barrier->callback) { + barrier->callback(barrier->name, + status); + } + } + } + } + } + up(&barrier_list_lock); + + /* Part 2 check for outstanding listen requests for dead nodes and + * cancel them */ + down(&listenreq_lock); + list_for_each(llist, &listenreq_list) { + struct cl_waiting_listen_request *lrequest = + list_entry(llist, struct cl_waiting_listen_request, list); + struct cluster_node *node = + find_node_by_nodeid(lrequest->nodeid); + + if (node && node->state != NODESTATE_MEMBER) { + lrequest->result = -ENOTCONN; + lrequest->waiting = 0; + wake_up_interruptible(&lrequest->waitq); + } + } + up(&listenreq_lock); +} + +int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen) +{ + struct temp_node *tn; + int err = 1; /* true */ +#ifdef DEBUG_COMMS + char buf[MAX_ADDR_PRINTED_LEN]; +#endif + + down(&tempnode_lock); + + list_for_each_entry(tn, &tempnode_list, list) { + if (tn->nodeid == nodeid) { + memcpy(addr, tn->addr, tn->addrlen); + *addrlen = tn->addrlen; + P_COMMS("get_temp_nodeid. id %d:\n: %s\n", + tn->nodeid, print_addr(tn->addr, tn->addrlen, buf)); + + goto out; + } + } + err = 0; + + out: + up(&tempnode_lock); + return err; +} + +/* Create a new temporary node ID. This list will only ever be very small + (usually only 1 item) but I can't take the risk that someone won't try to + boot 128 nodes all at exactly the same time. */ +int new_temp_nodeid(char *addr, int addrlen) +{ + struct temp_node *tn; + int err = -1; + int try_nodeid = 0; +#ifdef DEBUG_COMMS + char buf[MAX_ADDR_PRINTED_LEN]; +#endif + + P_COMMS("new_temp_nodeid needed for\n: %s\n", + print_addr(addr, addrlen, buf)); + + down(&tempnode_lock); + + /* First see if we already know about this node */ + list_for_each_entry(tn, &tempnode_list, list) { + + P_COMMS("new_temp_nodeid list. id %d:\n: %s\n", + tn->nodeid, print_addr(tn->addr, tn->addrlen, buf)); + + /* We're already in here... 
*/ + if (tn->addrlen == addrlen && + memcmp(tn->addr, addr, addrlen) == 0) { + P_COMMS("reused temp node ID %d\n", tn->nodeid); + err = tn->nodeid; + goto out; + } + } + + /* Nope, OK, invent a suitable number */ + retry: + try_nodeid -= 1; + list_for_each_entry(tn, &tempnode_list, list) { + + if (tn->nodeid == try_nodeid) + goto retry; + } + + tn = kmalloc(sizeof(struct temp_node), GFP_KERNEL); + if (!tn) + goto out; + + memcpy(tn->addr, addr, addrlen); + tn->addrlen = addrlen; + tn->nodeid = try_nodeid; + list_add_tail(&tn->list, &tempnode_list); + err = try_nodeid; + P_COMMS("new temp nodeid = %d\n", try_nodeid); + out: + up(&tempnode_lock); + return err; +} + +static int is_valid_temp_nodeid(int nodeid) +{ + struct temp_node *tn; + int err = 1; /* true */ + + down(&tempnode_lock); + + list_for_each_entry(tn, &tempnode_list, list) { + if (tn->nodeid == nodeid) + goto out; + } + err = 0; + + out: + P_COMMS("is_valid_temp_nodeid. %d = %d\n", nodeid, err); + up(&tempnode_lock); + return err; +} + +/* TODO: This needs to clean the list more fully of + nodes that are now full members but we did not master the transition */ +void remove_temp_nodeid(int nodeid) +{ + struct temp_node *tn; + struct temp_node *tmp; + + down(&tempnode_lock); + + list_for_each_entry_safe(tn, tmp, &tempnode_list, list) { + if (nodeid == tn->nodeid) { + list_del(&tn->list); + kfree(tn); + up(&tempnode_lock); + return; + } + } + + up(&tempnode_lock); +} + +/* Quorum device functions */ +int kcl_register_quorum_device(char *name, int votes) +{ + if (quorum_device) + return -EBUSY; + + if (find_node_by_name(name)) + return -EINVAL; + + quorum_device = kmalloc(sizeof (struct cluster_node), GFP_KERNEL); + if (!quorum_device) + return -ENOMEM; + memset(quorum_device, 0, sizeof (struct cluster_node)); + + quorum_device->name = kmalloc(strlen(name) + 1, GFP_KERNEL); + if (!quorum_device->name) { + kfree(quorum_device); + quorum_device = NULL; + return -ENOMEM; + } + + strcpy(quorum_device->name, name); + quorum_device->votes = votes; + quorum_device->state = NODESTATE_DEAD; + + /* Keep this list valid so it doesn't confuse other code */ + INIT_LIST_HEAD(&quorum_device->addr_list); + + return 0; +} + +int kcl_unregister_quorum_device(void) +{ + if (!quorum_device) + return -EINVAL; + if (quorum_device->state == NODESTATE_MEMBER) + return -EINVAL; + + quorum_device = NULL; + + return 0; +} + +int kcl_quorum_device_available(int yesno) +{ + if (!quorum_device) + return -EINVAL; + + if (yesno) { + quorum_device->last_hello = jiffies; + if (quorum_device->state == NODESTATE_DEAD) { + quorum_device->state = NODESTATE_MEMBER; + recalculate_quorum(0); + } + } + else { + if (quorum_device->state == NODESTATE_MEMBER) { + quorum_device->state = NODESTATE_DEAD; + recalculate_quorum(0); + } + } + + return 0; +} + +/* APIs for cluster ref counting. 
*/ +int kcl_addref_cluster() +{ + int ret = -ENOTCONN; + + if (!atomic_read(&cnxman_running)) + goto addref_ret; + + if (try_module_get(THIS_MODULE)) { + atomic_inc(&use_count); + ret = 0; + } + + addref_ret: + return ret; +} + +int kcl_releaseref_cluster() +{ + if (!atomic_read(&cnxman_running)) + return -ENOTCONN; + atomic_dec(&use_count); + module_put(THIS_MODULE); + return 0; +} + +int kcl_cluster_name(char **cname) +{ + char *name; + + name = kmalloc(strlen(cluster_name) + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + strncpy(name, cluster_name, strlen(cluster_name)+1); + *cname = name; + return 0; +} + +int kcl_get_current_interface(void) +{ + return current_interface->number; +} + +/* Socket registration stuff */ +static struct net_proto_family cl_family_ops = { + .family = AF_CLUSTER, + .create = cl_create +}; + +static struct proto_ops cl_proto_ops = { + .family = AF_CLUSTER, + + .release = cl_release, + .bind = cl_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = cl_getname, + .poll = cl_poll, + .ioctl = cl_ioctl, + .listen = sock_no_listen, + .shutdown = cl_shutdown, + .setsockopt = cl_setsockopt, + .getsockopt = cl_getsockopt, + .sendmsg = cl_sendmsg, + .recvmsg = cl_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +#ifdef MODULE +MODULE_DESCRIPTION("Cluster Connection and Service Manager"); +MODULE_AUTHOR("Red Hat, Inc"); +MODULE_LICENSE("GPL"); +#endif + +static int __init cluster_init(void) +{ + printk("CMAN %s (built %s %s) installed\n", + CMAN_RELEASE_NAME, __DATE__, __TIME__); + + /* allocate our sock slab cache */ + cluster_sk_cachep = kmem_cache_create("cluster_sock", + sizeof (struct cluster_sock), 0, + SLAB_HWCACHE_ALIGN, 0, 0); + if (!cluster_sk_cachep) { + printk(KERN_CRIT + "cluster_init: Cannot create cluster_sock SLAB cache\n"); + return -1; + + } + + if (sock_register(&cl_family_ops)) { + printk(KERN_INFO "Unable to register cluster socket type\n"); + kmem_cache_destroy(cluster_sk_cachep); + return -1; + } + + +#ifdef CONFIG_PROC_FS + create_proc_entries(); +#endif + + init_MUTEX(&start_thread_sem); + init_MUTEX(&send_lock); + init_MUTEX(&barrier_list_lock); + init_MUTEX(&cluster_members_lock); + init_MUTEX(&port_array_lock); + init_MUTEX(&messages_list_lock); + init_MUTEX(&listenreq_lock); + init_MUTEX(&client_socket_lock); + init_MUTEX(&new_dead_node_lock); + init_MUTEX(&event_listener_lock); + init_MUTEX(&kernel_listener_lock); + init_MUTEX(&tempnode_lock); + spin_lock_init(&active_socket_lock); + init_timer(&ack_timer); + + INIT_LIST_HEAD(&event_listener_list); + INIT_LIST_HEAD(&kernel_listener_list); + INIT_LIST_HEAD(&socket_list); + INIT_LIST_HEAD(&client_socket_list); + INIT_LIST_HEAD(&active_socket_list); + INIT_LIST_HEAD(&barrier_list); + INIT_LIST_HEAD(&messages_list); + INIT_LIST_HEAD(&listenreq_list); + INIT_LIST_HEAD(&cluster_members_list); + INIT_LIST_HEAD(&new_dead_node_list); + INIT_LIST_HEAD(&tempnode_list); + + atomic_set(&cnxman_running, 0); + + sm_init(); + + return 0; +} + +static void __exit cluster_exit(void) +{ +#ifdef CONFIG_PROC_FS + cleanup_proc_entries(); +#endif + + sock_unregister(AF_CLUSTER); + kmem_cache_destroy(cluster_sk_cachep); +} + +module_init(cluster_init); +module_exit(cluster_exit); + +EXPORT_SYMBOL(kcl_sendmsg); +EXPORT_SYMBOL(kcl_register_read_callback); +EXPORT_SYMBOL(kcl_add_callback); +EXPORT_SYMBOL(kcl_remove_callback); +EXPORT_SYMBOL(kcl_get_members); +EXPORT_SYMBOL(kcl_get_member_ids); +EXPORT_SYMBOL(kcl_get_all_members); 
+EXPORT_SYMBOL(kcl_is_quorate); +EXPORT_SYMBOL(kcl_get_node_by_addr); +EXPORT_SYMBOL(kcl_get_node_by_name); +EXPORT_SYMBOL(kcl_get_node_by_nodeid); +EXPORT_SYMBOL(kcl_get_node_addresses); +EXPORT_SYMBOL(kcl_addref_cluster); +EXPORT_SYMBOL(kcl_releaseref_cluster); +EXPORT_SYMBOL(kcl_cluster_name); + +EXPORT_SYMBOL(kcl_barrier_register); +EXPORT_SYMBOL(kcl_barrier_setattr); +EXPORT_SYMBOL(kcl_barrier_delete); +EXPORT_SYMBOL(kcl_barrier_wait); +EXPORT_SYMBOL(kcl_barrier_cancel); + +EXPORT_SYMBOL(kcl_register_quorum_device); +EXPORT_SYMBOL(kcl_unregister_quorum_device); +EXPORT_SYMBOL(kcl_quorum_device_available); + +EXPORT_SYMBOL(kcl_register_service); +EXPORT_SYMBOL(kcl_unregister_service); +EXPORT_SYMBOL(kcl_join_service); +EXPORT_SYMBOL(kcl_leave_service); +EXPORT_SYMBOL(kcl_global_service_id); +EXPORT_SYMBOL(kcl_start_done); +EXPORT_SYMBOL(kcl_get_services); +EXPORT_SYMBOL(kcl_get_current_interface); + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -urN linux-orig/cluster/cman/config.c linux-patched/cluster/cman/config.c --- linux-orig/cluster/cman/config.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/config.c 2004-06-29 20:07:50.000000000 +0800 @@ -0,0 +1,46 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "config.h" + +/* Config file defaults */ + +#define DEFAULT_JOIN_WAIT_TIME 11 /* Time to wait while sending JOINREQ + * messages. 
Should be at least twice + * the HELLO timer */ +#define DEFAULT_JOIN_TIMEOUT 30 /* How long we wait after getting a + * JOINACK before regarding that node + * as dead */ +#define DEFAULT_HELLO_TIMER 5 /* Period between HELLO messages */ +#define DEFAULT_DEADNODE_TIMER 21 /* If we don't get a message from a + * node in this period, kill it */ +#define DEFAULT_TRANSITION_TIMER 15 /* Maximum time a state transition + * should take */ +#define DEFAULT_JOINCONF_TIMER 5 /* Time allowed for a node to respond + * to a JOINCONF message */ +#define DEFAULT_MAX_NODES 128 /* Max allowed nodes */ +#define DEFAULT_TRANSITION_RESTARTS 10 /* Maximum number of transition + * restarts before we die */ +#define DEFAULT_SM_DEBUG_SIZE 256 /* Size in bytes of SM debug buffer */ + +struct config_info cman_config = { + .joinwait_timeout = DEFAULT_JOIN_WAIT_TIME, + .joinconf_timeout = DEFAULT_JOINCONF_TIMER, + .join_timeout = DEFAULT_JOIN_TIMEOUT, + .hello_timer = DEFAULT_HELLO_TIMER, + .deadnode_timeout = DEFAULT_DEADNODE_TIMER, + .transition_timeout = DEFAULT_TRANSITION_TIMER, + .transition_restarts = DEFAULT_TRANSITION_RESTARTS, + .max_nodes = DEFAULT_MAX_NODES, + .sm_debug_size = DEFAULT_SM_DEBUG_SIZE, +}; diff -urN linux-orig/cluster/cman/config.h linux-patched/cluster/cman/config.h --- linux-orig/cluster/cman/config.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/config.h 2004-06-29 20:07:50.000000000 +0800 @@ -0,0 +1,31 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __CONFIG_DOT_H__ +#define __CONFIG_DOT_H__ + +struct config_info { + int joinwait_timeout; + int joinconf_timeout; + int join_timeout; + int hello_timer; + int deadnode_timeout; + int transition_timeout; + int transition_restarts; + int max_nodes; + int sm_debug_size; +}; + +extern struct config_info cman_config; + +#endif /* __CONFIG_DOT_H__ */ diff -urN linux-orig/cluster/cman/kjoin.c linux-patched/cluster/cman/kjoin.c --- linux-orig/cluster/cman/kjoin.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/kjoin.c 2004-06-29 20:07:50.000000000 +0800 @@ -0,0 +1,238 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2.
+** +******************************************************************************* +******************************************************************************/ + +/* Editor's note: the angle-bracket header names were lost when this patch + * was flattened to text; the list below is a plausible reconstruction. */ +#include <linux/socket.h> +#include <linux/in.h> +#include <net/sock.h> +#include <asm/uaccess.h> +#include <cluster/cnxman.h> + +#include "cnxman-private.h" + +static struct socket *mcast_sock; +static struct socket *recv_sock; +static struct socket *cluster_sock; + +extern short cluster_id; +extern int join_count; +extern struct semaphore join_count_lock; +extern atomic_t cnxman_running; + +int kcl_join_cluster(struct cl_join_cluster_info *join_info) +{ + int result; + int one = 1, error; + unsigned int ipaddr = join_info->ipaddr, brdaddr = join_info->brdaddr; + unsigned short port = join_info->port; + mm_segment_t fs; + struct sockaddr_in saddr; + struct kcl_multicast_sock mcast_info; + + down(&join_count_lock); + if (atomic_read(&cnxman_running)) + { + error = 0; + if (join_info->cluster_id == cluster_id) + join_count++; + else + error = -EINVAL; + up(&join_count_lock); + return error; + } + up(&join_count_lock); + + result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &mcast_sock); + if (result < 0) + { + printk(KERN_ERR CMAN_NAME ": Can't create Multicast socket\n"); + return result; + } + + result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &recv_sock); + if (result < 0) + { + printk(KERN_ERR CMAN_NAME ": Can't create Receive socket\n"); + sock_release(mcast_sock); + return result; + } + + fs = get_fs(); + set_fs(get_ds()); + + if ((error = sock_setsockopt(mcast_sock, SOL_SOCKET, SO_BROADCAST, + (void *) &one, sizeof (int)))) + { + set_fs(fs); + printk(KERN_ERR CMAN_NAME + ": Error %d setting master socket to SO_BROADCAST\n", + error); + sock_release(mcast_sock); + sock_release(recv_sock); + return -1; + } + set_fs(fs); + + /* Bind the multicast socket */ + saddr.sin_family = AF_INET; + saddr.sin_port = htons(port); + saddr.sin_addr.s_addr = cpu_to_be32(brdaddr); + result = + mcast_sock->ops->bind(mcast_sock, (struct sockaddr *) &saddr, + sizeof (saddr)); + if (result < 0) + { + printk(KERN_ERR CMAN_NAME ": Can't bind multicast socket\n"); + sock_release(mcast_sock); + sock_release(recv_sock); + return result; + } + + /* Bind the receive socket to our IP address */ + saddr.sin_family = AF_INET; + saddr.sin_port = htons(port); + saddr.sin_addr.s_addr = cpu_to_be32(ipaddr); + result = + recv_sock->ops->bind(recv_sock, (struct sockaddr *) &saddr, + sizeof (saddr)); + if (result < 0) + { + printk(KERN_ERR CMAN_NAME ": Can't bind receive socket\n"); + sock_release(mcast_sock); + sock_release(recv_sock); + return result; + } + + /* Create the cluster master socket */ + result = + sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER, &cluster_sock); + if (result < 0) + { + printk(KERN_ERR CMAN_NAME + ": Can't create cluster master socket\n"); + sock_release(mcast_sock); + sock_release(recv_sock); + return result; + } + + /* This is the broadcast transmit address */ + saddr.sin_addr.s_addr = cpu_to_be32(brdaddr); + + /* Pass the multicast socket to kernel space */ + mcast_info.sock = mcast_sock; + mcast_info.number = 1; + + fs = get_fs(); + set_fs(get_ds()); + + if ((error = cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER, + KCL_SET_MULTICAST, + (void *) &mcast_info, + sizeof (mcast_info)))) + { + set_fs(fs); + printk(CMAN_NAME + ": Unable to pass multicast socket to cnxman, %d\n", + error); + sock_release(mcast_sock); + sock_release(recv_sock); + sock_release(cluster_sock); + return -1; + } + + mcast_info.sock = recv_sock; + if ((error = + cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER, + KCL_SET_RCVONLY, + (void *) &mcast_info, + sizeof
(mcast_info)))) + { + set_fs(fs); + printk(CMAN_NAME + ": Unable to pass receive socket to cnxman, %d\n", + error); + sock_release(mcast_sock); + sock_release(recv_sock); + sock_release(cluster_sock); + return -1; + } + + /* This setsockopt expects usermode variables */ + + if (cluster_sock->ops-> + setsockopt(cluster_sock, CLPROTO_MASTER, CLU_JOIN_CLUSTER, + (void *) join_info, + sizeof (struct cl_join_cluster_info))) + + { + set_fs(fs); + printk(CMAN_NAME ": Unable to join cluster\n"); + sock_release(mcast_sock); + sock_release(recv_sock); + sock_release(cluster_sock); + return -1; + } + set_fs(fs); + + return 0; +} + +int kcl_leave_cluster(int remove) +{ + mm_segment_t fs; + int rem = remove; + int ret = 0; + struct socket *shutdown_sock = cluster_sock; + + cluster_sock = NULL; + + if (!shutdown_sock) + { + /* Create the cluster master socket */ + int result = + sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER, + &shutdown_sock); + if (result < 0) + { + printk(KERN_ERR CMAN_NAME + ": Can't create cluster master socket\n"); + sock_release(mcast_sock); + sock_release(recv_sock); + return result; + } + } + + fs = get_fs(); + set_fs(get_ds()); + + if ((ret = + shutdown_sock->ops->setsockopt(shutdown_sock, CLPROTO_MASTER, + CLU_LEAVE_CLUSTER, (void *) &rem, + sizeof (int)))) + { + printk(KERN_ERR CMAN_NAME ": Unable to leave cluster, %d\n", + ret); + } + set_fs(fs); + + sock_release(shutdown_sock); + + return ret; +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -urN linux-orig/cluster/cman/membership.c linux-patched/cluster/cman/membership.c --- linux-orig/cluster/cman/membership.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/membership.c 2004-06-29 20:07:50.000000000 +0800 @@ -0,0 +1,3069 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* Editor's note: the angle-bracket header names were lost when this patch + * was flattened to text; the list below is a plausible reconstruction. */ +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/timer.h> +#include <linux/completion.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <cluster/cnxman.h> + +#include "cnxman-private.h" +#include "config.h" +#include "sm_control.h" + +#ifndef TRUE +#define TRUE 1 +#endif + +/* Barrier name for membership transitions.
%d is the cluster generation number + */ +#define MEMBERSHIP_BARRIER_NAME "TRANSITION.%d" + +/* Variables also used by connection manager */ +struct list_head cluster_members_list; +struct semaphore cluster_members_lock; +int cluster_members; /* Number of ACTIVE members, not a count of + * nodes in the list */ +int we_are_a_cluster_member = 0; +int cluster_is_quorate; +int quit_threads = 0; +struct task_struct *membership_task; +struct cluster_node *us; + +static struct task_struct *hello_task; +static struct semaphore hello_task_lock; + +/* Variables that belong to the connection manager */ +extern wait_queue_head_t cnxman_waitq; +extern struct completion member_thread_comp; +extern struct cluster_node *quorum_device; +extern unsigned short two_node; +extern char cluster_name[]; +extern unsigned int config_version; +extern unsigned int address_length; + +static struct socket *mem_socket; +static pid_t kcluster_pid; + +static char iobuf[MAX_CLUSTER_MESSAGE]; +static char scratchbuf[MAX_CLUSTER_MESSAGE + 100]; + +/* Our node name, usually system_utsname.nodename, but can be overridden */ +char nodename[MAX_CLUSTER_MEMBER_NAME_LEN + 1]; + +static spinlock_t members_by_nodeid_lock; +static int sizeof_members_array = 0; /* Can dynamically increase (vmalloc + * permitting) */ +static struct cluster_node **members_by_nodeid; + +#define MEMBER_INCREMENT_SIZE 10 + +static int votes = 1; /* Votes this node has */ +static int expected_votes = 1; /* Total expected votes in the cluster */ +static unsigned int quorum; /* Quorum, fewer votes than this and we stop + * work */ +static int leavereason; /* Saved for the duration of a state transition */ +static int transitionreason; /* Reason this transition was initiated */ +static unsigned int highest_nodeid; /* Highest node ID known to the cluster */ +static struct timer_list transition_timer; /* Kicks in if the transition + * doesn't complete in a + * reasonable time */ +static struct timer_list hello_timer; /* Timer to send HELLOs on */ +static unsigned long join_time; /* The time that we got our JOIN-ACK */ +static unsigned long start_time; /* The time that we were started */ +static int joinconf_count; /* Number of JOINCONF messages we have sent to + * a new node */ +static unsigned long wake_flags;/* Reason we were woken */ + +/* Flags in above */ +#define WAKE_FLAG_DEADNODE 1 +#define WAKE_FLAG_TRANSTIMER 2 + +/* The time the transition finished */ +static unsigned long transition_end_time; + +/* A list of nodes that cnxman tells us are dead. I hope this never has more + * than one element in it but I can't take that chance. only non-static so it + * can be initialised in module_load. 
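The membership thread drains it + * under new_dead_node_lock in its main loop below.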
*/ +struct list_head new_dead_node_list; +struct semaphore new_dead_node_lock; + +static int do_membership_packet(struct msghdr *msg, int len); +static int do_process_joinreq(struct msghdr *msg, int len); +static int do_process_joinack(struct msghdr *msg, int len); +static int do_process_joinconf(struct msghdr *msg, int len); +static int do_process_leave(struct msghdr *msg, int len); +static int do_process_hello(struct msghdr *msg, int len); +static int do_process_kill(struct msghdr *msg, int len); +static int do_process_reconfig(struct msghdr *msg, int len); +static int do_process_starttrans(struct msghdr *msg, int len); +static int do_process_masterview(struct msghdr *msg, int len); +static int do_process_endtrans(struct msghdr *msg, int len); +static int do_process_viewack(struct msghdr *msg, int len); +static int do_process_startack(struct msghdr *msg, int len); +static int do_process_newcluster(struct msghdr *msg, int len); +static int do_process_nominate(struct msghdr *msg, int len); +static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr, + unsigned int flags); +static int send_joinreq(struct sockaddr_cl *addr, int addr_len); +static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id); +static int send_hello(void); +static int send_master_hello(void); +static int send_newcluster(void); +static int end_transition(void); +static int dispatch_messages(struct socket *mem_socket); +static void check_for_dead_nodes(void); +static void confirm_joiner(void); +static void reset_hello_time(void); +static int add_us(void); +static int send_joinconf(void); +static int init_membership_services(void); +static int elect_master(struct cluster_node **); +static void trans_timer_expired(unsigned long arg); +static void hello_timer_expired(unsigned long arg); +static void join_or_form_cluster(void); +static int do_timer_wakeup(void); +static int start_transition(unsigned char reason, struct cluster_node *node); +int send_leave(unsigned char); +int send_reconfigure(int, unsigned int); + +#ifdef DEBUG_MEMB +static char *msgname(int msg); +static int debug_sendmsg(struct socket *sock, void *buf, int size, + struct sockaddr_cl *caddr, int addr_len, + unsigned int flags) +{ + P_MEMB("%ld: sending %s, len=%d\n", jiffies, msgname(((char *) buf)[0]), + size); + return kcl_sendmsg(sock, buf, size, caddr, addr_len, flags); +} + +#define kcl_sendmsg debug_sendmsg +#endif + +/* State of the node */ +static enum { STARTING, JOINING, JOINWAIT, JOINACK, TRANSITION, + TRANSITION_COMPLETE, MEMBER, REJECTED, LEFT_CLUSTER, MASTER +} node_state = STARTING; + +/* Sub-state when we are MASTER */ +static enum { MASTER_START, MASTER_COLLECT, MASTER_CONFIRM, + MASTER_COMPLETE } master_state; + +/* Number of responses collected while a master controlling a state transition */ +static int responses_collected; +static int responses_expected; + +/* Current cluster generation number */ +static int cluster_generation = 1; + +/* When another node initiates a transition, store its pointer here so + * we can check for other nodes trying to spoof us */ +static struct cluster_node *master_node = NULL; + +/* The node that is currently joining us */ +static struct cluster_node *joining_node = NULL; +static int joining_temp_nodeid = 0; + +/* Last time a HELLO message was sent */ +unsigned long last_hello = 0; + +/* When we got our JOINWAIT or NEWCLUSTER */ +unsigned long joinwait_time = 0; + +/* Number of times a transition has restarted when we were master */ +int transition_restarts = 0;
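 + +/* Editorial note, not part of the original patch: a sketch of how the + * node_state values above are walked by the code below. A starting node + * broadcasts NEWCLUSTER and listens; if no HELLO arrives within the + * joinwait timeout it forms a new cluster and becomes MEMBER. Otherwise + * it moves through JOINWAIT -> JOINING -> JOINACK, is taken through + * TRANSITION/TRANSITION_COMPLETE by the master via STARTTRANS, JOINCONF + * and ENDTRANS, and ends up MEMBER. A MEMBER that starts or wins a + * transition election acts as MASTER until the transition ends. */ + +/* 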
Variables used by the master to collect cluster status during a transition */ +static int agreeing_nodes = 0; +static int dissenting_nodes = 0; +static uint8_t *node_opinion = NULL; +#define OPINION_AGREE 1 +#define OPINION_DISAGREE 2 + +/* Set node id of a node, also add it to the members array and expand the array + * if necessary */ +static inline void set_nodeid(struct cluster_node *node, int nodeid) +{ + if (!nodeid) + return; + + node->node_id = nodeid; + if (nodeid >= sizeof_members_array) { + /* Grow far enough that members_by_nodeid[nodeid] is valid */ + int new_size = nodeid + MEMBER_INCREMENT_SIZE; + struct cluster_node **new_array = + vmalloc((new_size) * sizeof (struct cluster_node *)); + if (new_array) { + spin_lock(&members_by_nodeid_lock); + memcpy(new_array, members_by_nodeid, + sizeof_members_array * + sizeof (struct cluster_node *)); + memset(&new_array[sizeof_members_array], 0, + (new_size - sizeof_members_array) * + sizeof (struct cluster_node *)); + vfree(members_by_nodeid); + members_by_nodeid = new_array; + sizeof_members_array = new_size; + spin_unlock(&members_by_nodeid_lock); + } + else { + panic("No memory for more nodes"); + } + } + notify_kernel_listeners(NEWNODE, (long) nodeid); + + spin_lock(&members_by_nodeid_lock); + members_by_nodeid[nodeid] = node; + spin_unlock(&members_by_nodeid_lock); +} + +static int hello_kthread(void *unused) +{ + struct task_struct *tsk = current; + sigset_t tmpsig; + + daemonize("cman_hbeat"); + + /* Block everything but SIGKILL/SIGSTOP/SIGTERM */ + siginitsetinv(&tmpsig, sigmask(SIGKILL) | sigmask(SIGSTOP) | + sigmask(SIGTERM)); + sigprocmask(SIG_BLOCK, &tmpsig, NULL); + + down(&hello_task_lock); + hello_task = tsk; + up(&hello_task_lock); + + set_user_nice(current, -6); + + while (node_state != REJECTED && node_state != LEFT_CLUSTER) { + send_hello(); + + /* Scan the nodes list for dead nodes */ + if (node_state == MEMBER) + check_for_dead_nodes(); + + set_task_state(current, TASK_INTERRUPTIBLE); + schedule(); + set_task_state(current, TASK_RUNNING); + } + down(&hello_task_lock); + hello_task = NULL; + up(&hello_task_lock); + P_MEMB("heartbeat closing down\n"); + return 0; +} + +/* This is the membership "daemon". A client of cnxman (but symbiotic with it) + * that keeps track of and controls cluster membership. */ +static int membership_kthread(void *unused) +{ + struct task_struct *tsk = current; + struct socket *tmp_socket; + sigset_t tmpsig; + + daemonize("cman_memb"); + + /* Block everything but SIGKILL/SIGSTOP/SIGTERM */ + siginitsetinv(&tmpsig, sigmask(SIGKILL) | sigmask(SIGSTOP) | + sigmask(SIGTERM)); + sigprocmask(SIG_BLOCK, &tmpsig, NULL); + + membership_task = tsk; + set_user_nice(current, -5); + + /* Open the socket */ + if (init_membership_services()) + return -1; + + add_us(); + joining_node = us; + + init_timer(&hello_timer); + hello_timer.function = hello_timer_expired; + hello_timer.data = 0L; + + /* Do joining stuff */ + join_or_form_cluster(); + + transition_end_time = jiffies; + + /* Main loop */ + while (node_state != REJECTED && node_state != LEFT_CLUSTER) { + + struct task_struct *tsk = current; + + DECLARE_WAITQUEUE(wait, tsk); + + tsk->state = TASK_INTERRUPTIBLE; + add_wait_queue(mem_socket->sk->sk_sleep, &wait); + + if (!skb_peek(&mem_socket->sk->sk_receive_queue) && + wake_flags == 0) { + if (node_state == JOINACK || + node_state == JOINWAIT) + schedule_timeout(HZ); + else + schedule(); + } + + tsk->state = TASK_RUNNING; + remove_wait_queue(mem_socket->sk->sk_sleep, &wait); + + /* Are we being shut down? 
*/ + if (node_state == LEFT_CLUSTER || quit_threads || + signal_pending(current)) + break; + + /* Were we woken by a dead node passed down from cnxman ? */ + if (test_and_clear_bit(WAKE_FLAG_DEADNODE, &wake_flags)) { + struct list_head *nodelist, *tmp; + struct cl_new_dead_node *deadnode; + + down(&new_dead_node_lock); + list_for_each_safe(nodelist, tmp, &new_dead_node_list) { + deadnode = + list_entry(nodelist, + struct cl_new_dead_node, list); + + if (deadnode->node->state == NODESTATE_MEMBER) + a_node_just_died(deadnode->node); + list_del(&deadnode->list); + kfree(deadnode); + } + up(&new_dead_node_lock); + } + + /* Process received messages. If dispatch_message() returns an + * error then we shut down */ + if (skb_peek(&mem_socket->sk->sk_receive_queue)) { + if (dispatch_messages(mem_socket) < 0) + goto leave_cluster; + + } + + /* Were we woken by the transition timer firing ? */ + if (test_and_clear_bit(WAKE_FLAG_TRANSTIMER, &wake_flags)) { + switch (do_timer_wakeup()) { + case -1: + continue; + case 0: + break; + case +1: + goto leave_cluster; + } + } + + /* Got a JOINACK but no JOIN-CONF, start waiting for HELLO + * messages again */ + if (node_state == JOINACK + && time_after(jiffies, + join_time + cman_config.join_timeout * HZ)) { + P_MEMB + ("Waited a long time for a join-conf, going back to JOINWAIT state\n"); + node_state = JOINWAIT; + joinwait_time = jiffies; + } + + /* Have we been in joinwait for too long... */ + if (node_state == JOINWAIT + && time_after(jiffies, joinwait_time + + cman_config.join_timeout * HZ)) { + printk(CMAN_NAME + ": Been in JOINWAIT for too long - giving up\n"); + goto leave_cluster; + } + } + + leave_cluster: + + /* Wake up the heartbeat thread so it can exit */ + down(&hello_task_lock); + if (hello_task) + wake_up_process(hello_task); + up(&hello_task_lock); + + if (timer_pending(&hello_timer)) + del_timer(&hello_timer); + + if (timer_pending(&transition_timer)) + del_timer(&transition_timer); + + node_state = LEFT_CLUSTER; + P_MEMB("closing down\n"); + quit_threads = 1; /* force other thread to exit too */ + + /* Close the socket, NULL the pointer first so it doesn't get used + * by send_leave() + */ + tmp_socket = mem_socket; + mem_socket = NULL; + sock_release(tmp_socket); + highest_nodeid = 0; + complete(&member_thread_comp); + return 0; +} + +/* Things to do in the main thread when the transition timer has woken us. + * Usually this happens when a transition is taking too long and we need to + * take remedial action. + * + * returns: -1 continue; 0 carry on processing +1 leave cluster; */ +static int do_timer_wakeup() +{ + P_MEMB("Timer wakeup - checking for dead master node %ld\n", jiffies); + + /* Resend JOINCONF if it got lost on the wire */ + if (node_state == MASTER && master_state == MASTER_CONFIRM) { + mod_timer(&transition_timer, + jiffies + cman_config.joinconf_timeout * HZ); + if (++joinconf_count < MAX_RETRIES) { + P_MEMB("Resending JOINCONF\n"); + send_joinconf(); + } + else { + P_MEMB("JOINCONF not acked, cancelling transition\n"); + end_transition(); + } + return -1; + } + + /* A joining node probably died */ + if (cluster_members == 1) { + end_transition(); + return -1; + } + + /* See if the master is still there */ + if (node_state == TRANSITION || node_state == TRANSITION_COMPLETE) { + + /* If we are in transition and master_node is NULL then we are + * waiting for ENDTRANS after JOIN-CONF */ + if (!master_node) { + /* Hmmm. 
master died after sending JOINCONF, we'll have + * to die as we are in mid-transition */ + printk(KERN_INFO CMAN_NAME + ": Master died after JOINCONF, we must leave the cluster\n"); + quit_threads = 1; + return +1; + } + + /* No messages from the master - see if it's still there */ + if (master_node->state == NODESTATE_MEMBER) { + send_master_hello(); + mod_timer(&transition_timer, + jiffies + + cman_config.transition_timeout * HZ); + } + + /* If the master is dead then elect a new one */ + if (master_node->state == NODESTATE_DEAD) { + + struct cluster_node *node; + + P_MEMB("Master node is dead...Election!\n"); + if (elect_master(&node)) { + + /* We are master now, all kneel */ + start_transition(TRANS_DEADMASTER, master_node); + } + else { + /* Leave the job to someone on more pay */ + master_node = node; + mod_timer(&transition_timer, + jiffies + + cman_config.transition_timeout * HZ); + } + } + } + + /* If we are the master node then restart the transition */ + if (node_state == MASTER) { + start_transition(TRANS_RESTART, us); + } + + return 0; +} + +static void form_cluster(void) +{ + printk(KERN_INFO CMAN_NAME ": forming a new cluster\n"); + node_state = MEMBER; + we_are_a_cluster_member = TRUE; + us->node_id = 1; + us->state = NODESTATE_MEMBER; + set_nodeid(us, 1); + recalculate_quorum(0); + sm_member_update(cluster_is_quorate); + send_hello(); + kernel_thread(hello_kthread, NULL, 0); + mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ); +} + +/* This does the initial JOIN part of the membership process. Actually most of + * it is done in the message processing routines but this is the main loop that + * controls it. The side-effect of this routine is "node_state" which tells the + * real main loop (in the kernel thread routine) what to do next */ +static void join_or_form_cluster() +{ + printk(KERN_INFO CMAN_NAME + ": Waiting to join or form a Linux-cluster\n"); + join_time = 0; + start_time = jiffies; + joinwait_time = jiffies; + last_hello = 0; + send_newcluster(); + + /* Listen for a reply */ + do { + DECLARE_WAITQUEUE(wait, current); + set_task_state(current, TASK_INTERRUPTIBLE); + add_wait_queue(mem_socket->sk->sk_sleep, &wait); + + if (!skb_peek(&mem_socket->sk->sk_receive_queue)) + schedule_timeout((cman_config.joinwait_timeout * HZ) / + 5); + + set_task_state(current, TASK_RUNNING); + remove_wait_queue(mem_socket->sk->sk_sleep, &wait); + + while (skb_peek(&mem_socket->sk->sk_receive_queue)) { + dispatch_messages(mem_socket); + } + if (quit_threads) + node_state = LEFT_CLUSTER; + + } + while (time_before(jiffies, start_time + cman_config.joinwait_timeout * HZ) && + node_state == STARTING); + + /* If we didn't hear any HELLO messages then form a new cluster */ + if (node_state == STARTING) { + form_cluster(); + } + else + last_hello = jiffies; + +} + +int start_membership_services(pid_t cluster_pid) +{ + kcluster_pid = cluster_pid; + + init_timer(&transition_timer); + transition_timer.function = trans_timer_expired; + transition_timer.data = 0L; + + /* Start the thread */ + return kernel_thread(membership_kthread, NULL, 0); +} + +static int init_membership_services() +{ + int result; + struct sockaddr_cl saddr; + struct socket *sock; + + init_MUTEX(&hello_task_lock); + /* Create a socket to communicate with */ + result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock); + if (result < 0) { + printk(KERN_ERR CMAN_NAME + ": Can't create cluster socket for membership services\n"); + return result; + } + mem_socket = sock; + + /* 
Bind to our port */ + saddr.scl_family = AF_CLUSTER; + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP; + result = + sock->ops->bind(sock, (struct sockaddr *) &saddr, sizeof (saddr)); + if (result < 0) { + printk(KERN_ERR CMAN_NAME + ": Can't bind to cluster membership services port\n"); + sock_release(sock); + return result; + } + + node_state = STARTING; + return 0; +} + +static int send_joinconf() +{ + struct sockaddr_cl saddr; + int status; + + if (joining_temp_nodeid == 0) { + BUG(); + } + + master_state = MASTER_CONFIRM; + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP; + saddr.scl_family = AF_CLUSTER; + saddr.scl_nodeid = joining_temp_nodeid; + status = send_cluster_view(CLUSTER_MEM_JOINCONF, &saddr, + MSG_NOACK); + + if (status < 0) { + printk("Error %d sending JOINCONF, aborting transition\n", status); + end_transition(); + } + return status; +} + +static int send_joinreq(struct sockaddr_cl *addr, int addr_len) +{ + char *msgbuf = scratchbuf; + struct list_head *addrlist; + int ptr = sizeof (struct cl_mem_join_msg); + unsigned short num_addr = 0; + struct cluster_node_addr *nodeaddr; + struct cl_mem_join_msg *msg = (struct cl_mem_join_msg *) msgbuf; + + msg->cmd = CLUSTER_MEM_JOINREQ; + msg->votes = votes; + msg->expected_votes = cpu_to_le32(expected_votes); + msg->major_version = cpu_to_le32(CNXMAN_MAJOR_VERSION); + msg->minor_version = cpu_to_le32(CNXMAN_MINOR_VERSION); + msg->patch_version = cpu_to_le32(CNXMAN_PATCH_VERSION); + msg->config_version = cpu_to_le32(config_version); + msg->addr_len = cpu_to_le32(address_length); + strcpy(msg->clustername, cluster_name); + + /* Add our addresses */ + list_for_each(addrlist, &us->addr_list) { + nodeaddr = list_entry(addrlist, struct cluster_node_addr, list); + + memcpy(msgbuf + ptr, nodeaddr->addr, address_length); + ptr += address_length; + num_addr++; + } + msg->num_addr = cpu_to_le16(num_addr); + + /* And our name */ + strcpy(msgbuf + ptr, nodename); + ptr += strlen(nodename) + 1; + + return kcl_sendmsg(mem_socket, msgbuf, ptr, + addr, addr_len, MSG_NOACK); +} + +static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id) +{ + struct cl_mem_startack_msg msg; + + msg.cmd = CLUSTER_MEM_STARTACK; + msg.generation = cpu_to_le32(cluster_generation); + msg.node_id = cpu_to_le32(node_id); + msg.highest_node_id = cpu_to_le32(get_highest_nodeid()); + + return kcl_sendmsg(mem_socket, &msg, sizeof (msg), addr, addr_len, 0); +} + +static int send_newcluster() +{ + char buf[1]; + + buf[0] = CLUSTER_MEM_NEWCLUSTER; + + return kcl_sendmsg(mem_socket, buf, 1, NULL, 0, + MSG_NOACK); +} + +static int send_hello() +{ + struct cl_mem_hello_msg hello_msg; + int status; + + hello_msg.cmd = CLUSTER_MEM_HELLO; + hello_msg.members = cpu_to_le16(cluster_members); + hello_msg.flags = 0; + hello_msg.generation = cpu_to_le32(cluster_generation); + + status = + kcl_sendmsg(mem_socket, &hello_msg, sizeof (hello_msg), NULL, 0, + MSG_NOACK | MSG_ALLINT); + + last_hello = jiffies; + + return status; +} + +/* This is a special HELLO message that requires an ACK. Clients in transition + * send these to the master to check it is still alive; 
if it does not ACK then + * cnxman will signal it dead and we can restart the transition */ +static int send_master_hello() +{ + struct cl_mem_hello_msg hello_msg; + int status; + struct sockaddr_cl saddr; + + hello_msg.cmd = CLUSTER_MEM_HELLO; + hello_msg.members = cpu_to_le16(cluster_members); + hello_msg.flags = 1; + hello_msg.generation = cpu_to_le32(cluster_generation); + + saddr.scl_family = AF_CLUSTER; + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP; + saddr.scl_nodeid = master_node->node_id; + status = + kcl_sendmsg(mem_socket, &hello_msg, sizeof (hello_msg), + &saddr, sizeof (saddr), 0); + + last_hello = jiffies; + + return status; +} + +/* Called when the transition timer has expired, meaning we sent a transition + * message that was not ACKed */ +static void trans_timer_expired(unsigned long arg) +{ + P_MEMB("Transition timer fired %ld\n", jiffies); + + set_bit(WAKE_FLAG_TRANSTIMER, &wake_flags); + wake_up_process(membership_task); +} + +static void hello_timer_expired(unsigned long arg) +{ + P_MEMB("Hello timer fired %ld\n", jiffies); + + mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ); + + if (node_state >= TRANSITION) { + wake_up_process(hello_task); + } +} + +static int wait_for_completion_barrier(void) +{ + int status; + char barriername[MAX_BARRIER_NAME_LEN]; + + sprintf(barriername, MEMBERSHIP_BARRIER_NAME, cluster_generation); + + /* Make sure we all complete together */ + P_MEMB("Waiting for completion barrier: %d members\n", cluster_members); + if ((status = + kcl_barrier_register(barriername, 0, cluster_members)) < 0) { + printk(CMAN_NAME ": Error registering barrier: %d\n", status); + return -1; + } + kcl_barrier_setattr(barriername, BARRIER_SETATTR_TIMEOUT, + cman_config.transition_timeout); + status = kcl_barrier_wait(barriername); + kcl_barrier_delete(barriername); + + P_MEMB("Completion barrier reached : status = %d\n", status); + return status; +} + +/* Called at the end of a state transition when we are the master */ +static int end_transition() +{ + struct cl_mem_endtrans_msg msg; + int total_votes; + int status; + + /* Cancel the timer */ + del_timer(&transition_timer); + + confirm_joiner(); + + quorum = calculate_quorum(leavereason, 0, &total_votes); + + msg.cmd = CLUSTER_MEM_ENDTRANS; + msg.quorum = cpu_to_le32(quorum); + msg.generation = cpu_to_le32(++cluster_generation); + msg.total_votes = cpu_to_le32(total_votes); + if (joining_node && transitionreason == TRANS_NEWNODE) { + msg.new_node_id = cpu_to_le32(joining_node->node_id); + } + else { + msg.new_node_id = 0; + } + status = kcl_sendmsg(mem_socket, &msg, sizeof (msg), NULL, 0, 0); + + /* When that's all settled down, do the transition completion barrier */ + kcl_wait_for_all_acks(); + + if (wait_for_completion_barrier() != 0) { + P_MEMB("Barrier timed out - restart\n"); + start_transition(TRANS_RESTART, us); + return 0; + } + + set_quorate(total_votes); + + notify_listeners(); + reset_hello_time(); + + /* Tell any waiting barriers that we had a transition */ + check_barrier_returns(); + + leavereason = 0; + node_state = MEMBER; + transition_end_time = jiffies; + + sm_member_update(cluster_is_quorate); + + return 0; +} + +int send_reconfigure(int param, unsigned int value) +{ + char msgbuf[66]; + struct cl_mem_reconfig_msg *msg = + (struct cl_mem_reconfig_msg *) &msgbuf; + + if (param == RECONFIG_PARAM_EXPECTED_VOTES && expected_votes > value) + expected_votes = value; + + msg->cmd = CLUSTER_MEM_RECONFIG; + msg->param = param; + msg->value = cpu_to_le32(value); + + return 
kcl_sendmsg(mem_socket, &msgbuf, sizeof (*msg), NULL, 0, 0); +} + +static int send_joinack(char *addr, int addr_len, unsigned char acktype) +{ + struct cl_mem_joinack_msg msg; + + msg.cmd = CLUSTER_MEM_JOINACK; + msg.acktype = acktype; + + return kcl_sendmsg(mem_socket, &msg, sizeof (msg), + (struct sockaddr_cl *)addr, addr_len, MSG_NOACK); +} + +/* Only send a leave message to one node in the cluster so that it can master + * the state transition, otherwise we get a "thundering herd" of potential + * masters fighting it out */ +int send_leave(unsigned char flags) +{ + unsigned char msg[2]; + struct sockaddr_cl saddr; + struct cluster_node *node = NULL; + int status; + + if (!mem_socket) + return 0; + + saddr.scl_family = AF_CLUSTER; + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP; + + /* If we are in transition then use the current master */ + if (node_state == TRANSITION) { + node = master_node; + } + if (!node) { + /* If we are the master or not in transition then pick a node + * almost at random */ + struct list_head *nodelist; + + down(&cluster_members_lock); + list_for_each(nodelist, &cluster_members_list) { + node = list_entry(nodelist, struct cluster_node, list); + + if (node->state == NODESTATE_MEMBER && !node->us) + break; + } + up(&cluster_members_lock); + } + + /* we are the only member of the cluster - there is no-one to tell */ + if (node && !node->us) { + saddr.scl_nodeid = node->node_id; + + P_MEMB("Sending LEAVE to %s\n", node->name); + msg[0] = CLUSTER_MEM_LEAVE; + msg[1] = flags; + status = + kcl_sendmsg(mem_socket, msg, 2, + &saddr, sizeof (saddr), + MSG_NOACK); + + if (status < 0) + return status; + } + + /* And exit */ + node_state = LEFT_CLUSTER; + wake_up_process(membership_task); + return 0; +} + +int send_kill(int nodeid) +{ + char killmsg; + struct sockaddr_cl saddr; + + killmsg = CLUSTER_MEM_KILL; + + saddr.scl_family = AF_CLUSTER; + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP; + saddr.scl_nodeid = nodeid; + return kcl_sendmsg(mem_socket, &killmsg, 1, &saddr, + sizeof (struct sockaddr_cl), MSG_NOACK); +} + +/* Process a message */ +static int do_membership_packet(struct msghdr *msg, int len) +{ + int result = -1; + unsigned char *buf = msg->msg_iov->iov_base; + struct sockaddr_cl *saddr = msg->msg_name; + struct cluster_node *node; + + node = find_node_by_nodeid(saddr->scl_nodeid); + + P_MEMB("got membership message : %s, from (%d) %s, len = %d\n", + msgname(*buf), saddr->scl_nodeid, node ? 
node->name : "unknown", len); + + switch (*buf) { + case CLUSTER_MEM_JOINREQ: + result = do_process_joinreq(msg, len); + break; + + case CLUSTER_MEM_LEAVE: + if (we_are_a_cluster_member) + result = do_process_leave(msg, len); + break; + + case CLUSTER_MEM_HELLO: + result = do_process_hello(msg, len); + break; + + case CLUSTER_MEM_KILL: + if (we_are_a_cluster_member) + result = do_process_kill(msg, len); + break; + + case CLUSTER_MEM_JOINCONF: + if (node_state == JOINACK) { + do_process_joinconf(msg, len); + } + break; + + case CLUSTER_MEM_CONFACK: + if (node_state == MASTER && master_state == MASTER_CONFIRM) { + end_transition(); + } + break; + + case CLUSTER_MEM_MASTERVIEW: + if (node_state == TRANSITION) + do_process_masterview(msg, len); + break; + + case CLUSTER_MEM_JOINACK: + if (node_state == JOINING || node_state == JOINWAIT) { + do_process_joinack(msg, len); + } + break; + case CLUSTER_MEM_RECONFIG: + if (we_are_a_cluster_member) { + do_process_reconfig(msg, len); + } + break; + + case CLUSTER_MEM_STARTTRANS: + result = do_process_starttrans(msg, len); + break; + + case CLUSTER_MEM_ENDTRANS: + result = do_process_endtrans(msg, len); + break; + + case CLUSTER_MEM_VIEWACK: + result = do_process_viewack(msg, len); + break; + + case CLUSTER_MEM_STARTACK: + if (node_state == MASTER) + result = do_process_startack(msg, len); + break; + + case CLUSTER_MEM_NEWCLUSTER: + result = do_process_newcluster(msg, len); + break; + + case CLUSTER_MEM_NOMINATE: + if (node_state != MASTER) + result = do_process_nominate(msg, len); + break; + + default: + printk(KERN_ERR CMAN_NAME + ": Unknown membership services message %d received\n", + *buf); + break; + + } + return result; +} + +/* Returns -ve to reject membership of the cluster, 0 to accept, and +ve + * to ignore the request (node already joining) */ +static int check_duplicate_node(char *name, struct msghdr *msg, int len) +{ + struct cluster_node *node; + struct sockaddr_cl *saddr = (struct sockaddr_cl *)msg->msg_name; + char addr[address_length]; + int addrlen; + + if (strlen(name) >= MAX_CLUSTER_MEMBER_NAME_LEN) + return -3; + + /* See if we already have a cluster member with that name... 
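(A node already in the JOINING or + * REMOTEMEMBER state means the join is in progress, so we return +1 to + * ignore the request rather than reject it.)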
*/ + node = find_node_by_name(name); + if (node && node->state != NODESTATE_DEAD) { + + if ((node->state == NODESTATE_JOINING || + node->state == NODESTATE_REMOTEMEMBER)) + return +1; + + printk(KERN_WARNING CMAN_NAME + ": Rejecting cluster membership application from %s - already have a node with that name\n", + name); + return -1; + + } + + /* Need to check the node's address too */ + if (get_addr_from_temp_nodeid(saddr->scl_nodeid, addr, &addrlen) && + (node = find_node_by_addr(addr, addrlen)) && + node->state != NODESTATE_DEAD) { + + if ((node->state == NODESTATE_JOINING || + node->state == NODESTATE_REMOTEMEMBER)) + return +1; + + printk(KERN_WARNING CMAN_NAME + ": Rejecting cluster membership application from %s - already have a node with that address\n", + name); + return -1; + } + return 0; +} + +/* Start the state transition */ +static int start_transition(unsigned char reason, struct cluster_node *node) +{ + char *startbuf = scratchbuf; + struct cl_mem_starttrans_msg *msg = + (struct cl_mem_starttrans_msg *) startbuf; + + P_MEMB("Start transition - reason = %d\n", reason); + + /* If this is a restart then zero the counters */ + if (reason == TRANS_RESTART) { + agreeing_nodes = 0; + dissenting_nodes = 0; + if (node_opinion) { + kfree(node_opinion); + node_opinion = NULL; + } + responses_collected = 0; + } + + /* If we have timed out too many times then just die */ + if (reason == TRANS_RESTART + && ++transition_restarts > cman_config.transition_restarts) { + printk(KERN_WARNING CMAN_NAME + ": too many transition restarts - will die\n"); + send_leave(CLUSTER_LEAVEFLAG_INCONSISTENT); + node_state = LEFT_CLUSTER; + quit_threads = 1; + wake_up_process(membership_task); + wake_up_interruptible(&cnxman_waitq); + return 0; + } + if (reason != TRANS_RESTART) + transition_restarts = 0; + + /* Only keep the original state transition reason in the global + * variable. 
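Restarts and master hand-overs must not + * overwrite it, or end_transition() would forget that this transition + * was adding a new node.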
*/ + if (reason != TRANS_ANOTHERREMNODE && reason != TRANS_NEWMASTER && + reason != TRANS_RESTART && reason != TRANS_DEADMASTER) + transitionreason = reason; + + /* Save the info of the requesting node */ + if (reason == TRANS_NEWNODE) + joining_node = node; + + node_state = MASTER; + master_state = MASTER_START; + responses_collected = 0; + responses_expected = cluster_members - 1; + + /* If we are on our own then just do it */ + if (responses_expected == 0) { + P_MEMB("We are on our own...lonely here\n"); + responses_collected--; + do_process_startack(NULL, 0); + } + else { + int ptr = sizeof (struct cl_mem_starttrans_msg); + struct list_head *addrlist; + unsigned short num_addrs = 0; + int flags = 0; + + /* Send the STARTTRANS message */ + msg->cmd = CLUSTER_MEM_STARTTRANS; + msg->reason = reason; + msg->votes = node->votes; + msg->expected_votes = cpu_to_le32(node->expected_votes); + msg->generation = cpu_to_le32(++cluster_generation); + msg->nodeid = cpu_to_le32(node->node_id); + + if (reason == TRANS_NEWNODE) { + /* Add the addresses */ + list_for_each(addrlist, &node->addr_list) { + struct cluster_node_addr *nodeaddr = + list_entry(addrlist, + struct cluster_node_addr, list); + + memcpy(startbuf + ptr, nodeaddr->addr, + address_length); + ptr += address_length; + num_addrs++; + } + + /* And the name */ + strcpy(startbuf + ptr, node->name); + ptr += strlen(node->name) + 1; + } + + /* If another node died then we must queue the STARTTRANS + * messages so that membershipd can carry on processing the + * other replies */ + if (reason == TRANS_ANOTHERREMNODE) + flags |= MSG_QUEUE; + + msg->num_addrs = cpu_to_le16(num_addrs); + kcl_sendmsg(mem_socket, msg, ptr, NULL, 0, flags); + } + /* Set a timer in case we don't get 'em all back */ + mod_timer(&transition_timer, + jiffies + cman_config.transition_timeout * HZ); + return 0; +} + +/* A node has died - decide what to do */ +void a_node_just_died(struct cluster_node *node) +{ + /* If we are not in the context of kmembershipd then stick it on the + * list and wake it */ + if (current != membership_task) { + struct cl_new_dead_node *newnode = + kmalloc(sizeof (struct cl_new_dead_node), GFP_KERNEL); + if (!newnode) + return; + newnode->node = node; + down(&new_dead_node_lock); + list_add_tail(&newnode->list, &new_dead_node_list); + set_bit(WAKE_FLAG_DEADNODE, &wake_flags); + up(&new_dead_node_lock); + wake_up_process(membership_task); + P_MEMB("Passing dead node %s onto kmembershipd\n", node->name); + return; + } + + /* Remove it */ + down(&cluster_members_lock); + if (node->state == NODESTATE_MEMBER) + cluster_members--; + node->state = NODESTATE_DEAD; + up(&cluster_members_lock); + + /* Notify listeners */ + notify_kernel_listeners(DIED, (long) node->node_id); + + /* If we are in normal operation then become master and initiate a + * state-transition */ + if (node_state == MEMBER) { + start_transition(TRANS_REMNODE, node); + return; + } + + /* If we are a slave in transition then see if it's the master that has + * failed. If not then ignore it. If it /is/ the master then elect a + * new one */ + if (node_state == TRANSITION) { + if (master_node == node) { + if (elect_master(&node)) { + del_timer(&transition_timer); + node_state = MASTER; + + start_transition(TRANS_DEADMASTER, master_node); + } + else { + /* Someone else can be in charge - phew! 
*/ + } + return; + } + + /* If we are the master then we need to start the transition all over + * again */ + if (node_state == MASTER) { + /* Cancel timer */ + del_timer(&transition_timer); + + /* Restart the transition */ + start_transition(TRANS_ANOTHERREMNODE, node); + transition_restarts = 0; + return; + } +} + +/* + * Build up and send a set of messages consisting of the whole cluster view. + * The first byte is the command (cmd as passed in), the second is a flag byte: + * bit 0 is set in the first message, bit 1 in the last (NOTE: both may be set + * if this is the only message sent). The rest is a set of packed node entries, + * which are NOT split over packets. */ +static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr, + unsigned int flags) +{ + int ptr = 2; + int len; + int status = 0; + int last_node_start = 2; + unsigned char first_packet_flag = 1; + struct list_head *nodelist; + struct list_head *temp; + struct cluster_node *node; + char *message = scratchbuf; + + message[0] = cmd; + + down(&cluster_members_lock); + list_for_each_safe(nodelist, temp, &cluster_members_list) { + node = list_entry(nodelist, struct cluster_node, list); + + if (node->state == NODESTATE_MEMBER) { + unsigned int evotes; + unsigned int node_id; + unsigned short num_addrs = 0; + unsigned short num_addrs_le; + struct list_head *addrlist; + + last_node_start = ptr; + + message[ptr++] = len = strlen(node->name); + strcpy(&message[ptr], node->name); + ptr += len; + + /* Count the number of addresses this node has */ + list_for_each(addrlist, &node->addr_list) { + num_addrs++; + } + + num_addrs_le = cpu_to_le16(num_addrs); + memcpy(&message[ptr], &num_addrs_le, sizeof (short)); + ptr += sizeof (short); + + /* Pack em in */ + list_for_each(addrlist, &node->addr_list) { + + struct cluster_node_addr *nodeaddr = + list_entry(addrlist, + struct cluster_node_addr, list); + + memcpy(&message[ptr], nodeaddr->addr, + address_length); + ptr += address_length; + } + + message[ptr++] = node->votes; + + evotes = cpu_to_le32(node->expected_votes); + memcpy(&message[ptr], &evotes, sizeof (int)); + ptr += sizeof (int); + + node_id = cpu_to_le32(node->node_id); + memcpy(&message[ptr], &node_id, sizeof (int)); + ptr += sizeof (int); + + /* If the block is full then send it */ + if (ptr > MAX_CLUSTER_MESSAGE) { + message[1] = first_packet_flag; + + up(&cluster_members_lock); + status = + kcl_sendmsg(mem_socket, message, + last_node_start, saddr, + saddr ? sizeof (struct sockaddr_cl) : 0, + flags); + + if (status < 0) + goto send_fail; + + down(&cluster_members_lock); + + first_packet_flag = 0; + /* Copy the overflow back to the start of the + * buffer for the next send */ + memcpy(&message[2], &message[last_node_start], + ptr - last_node_start); + ptr = ptr - last_node_start + 2; + } + } + } + + up(&cluster_members_lock); + + message[1] = first_packet_flag | 2; /* The last may also be first */ + status = kcl_sendmsg(mem_socket, message, ptr, + saddr, saddr ? 
sizeof (struct sockaddr_cl) : 0, + flags); + send_fail: + + return status; +} + +/* Make the JOINING node into a MEMBER */ +static void confirm_joiner() +{ + if (joining_node && joining_node->state == NODESTATE_JOINING) { + down(&cluster_members_lock); + joining_node->state = NODESTATE_MEMBER; + cluster_members++; + up(&cluster_members_lock); + } + remove_temp_nodeid(joining_temp_nodeid); + joining_temp_nodeid = 0; +} + +/* Reset HELLO timers for all nodes. We do this after a state-transition as we + * have had HELLOs disabled during the transition and if we don't do this the + * nodes will go on an uncontrolled culling-spree afterwards */ +static void reset_hello_time() +{ + struct list_head *nodelist; + struct cluster_node *node; + + down(&cluster_members_lock); + list_for_each(nodelist, &cluster_members_list) { + node = list_entry(nodelist, struct cluster_node, list); + + if (node->state == NODESTATE_MEMBER) { + node->last_hello = jiffies; + } + + } + up(&cluster_members_lock); +} + +/* Calculate the new quorum and return the value. Do *not* set it in here as + * cnxman calls this to check if a new expected_votes value is valid. It + * (optionally) returns the total number of votes in the cluster */ +int calculate_quorum(int allow_decrease, int max_expected, int *ret_total_votes) +{ + struct list_head *nodelist; + struct cluster_node *node; + unsigned int total_votes = 0; + unsigned int highest_expected = 0; + unsigned int newquorum, q1, q2; + + down(&cluster_members_lock); + list_for_each(nodelist, &cluster_members_list) { + node = list_entry(nodelist, struct cluster_node, list); + + if (node->state == NODESTATE_MEMBER) { + highest_expected = + max(highest_expected, node->expected_votes); + total_votes += node->votes; + } + } + up(&cluster_members_lock); + if (quorum_device && quorum_device->state == NODESTATE_MEMBER) + total_votes += quorum_device->votes; + + if (max_expected > 0) + highest_expected = max_expected; + + /* This quorum calculation is taken from the OpenVMS Cluster Systems + * manual, but, then, you guessed that didn't you */ + q1 = (highest_expected + 2) / 2; + q2 = (total_votes + 2) / 2; + newquorum = max(q1, q2); + + /* Normally quorum never decreases but the system administrator can + * force it down by setting expected votes to a maximum value */ + if (!allow_decrease) + newquorum = max(quorum, newquorum); + + /* The special two_node mode allows each of the two nodes to retain + * quorum if the other fails. Only one of the two should live past + * fencing (as both nodes try to fence each other in split-brain.) 
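With two_node set a lone survivor + * keeps quorum at 1. A worked example of the normal path above: five + * members with one vote each give q1 = q2 = (5 + 2) / 2 = 3 by integer + * division, so quorum is 3 and the cluster stays quorate with two + * members down.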
*/ + if (two_node) + newquorum = 1; + + if (ret_total_votes) + *ret_total_votes = total_votes; + return newquorum; +} + +/* Recalculate cluster quorum, set quorate and notify changes */ +void recalculate_quorum(int allow_decrease) +{ + int total_votes; + + quorum = calculate_quorum(allow_decrease, 0, &total_votes); + set_quorate(total_votes); + notify_listeners(); +} + +/* Add new node address to an existing node */ +int add_node_address(struct cluster_node *node, unsigned char *addr, int len) +{ + struct cluster_node_addr *newaddr; + + newaddr = kmalloc(sizeof (struct cluster_node_addr), GFP_KERNEL); + if (!newaddr) + return -1; + + memcpy(newaddr->addr, addr, len); + newaddr->addr_len = len; + list_add_tail(&newaddr->list, &node->addr_list); + + return 0; +} + +static struct cluster_node *add_new_node(char *name, unsigned char votes, + unsigned int expected_votes, + int node_id, int state) +{ + struct cluster_node *newnode; + + /* Look for a dead node with this name */ + newnode = find_node_by_name(name); + + /* Is it already joining */ + if (newnode && newnode->state == NODESTATE_JOINING) + return NULL; + + /* Update existing information */ + if (newnode && newnode->state == NODESTATE_DEAD) { + newnode->last_hello = jiffies; + newnode->votes = votes; + newnode->expected_votes = expected_votes; + newnode->state = state; + newnode->us = 0; + newnode->leave_reason = 0; + newnode->last_seq_recv = 0; + newnode->last_seq_acked = 0; + newnode->last_seq_sent = 0; + newnode->incarnation++; + /* Don't overwrite the node ID */ + + if (state == NODESTATE_MEMBER) { + down(&cluster_members_lock); + cluster_members++; + up(&cluster_members_lock); + } + + printk(KERN_INFO CMAN_NAME ": node %s rejoining\n", name); + return newnode; + } + + newnode = kmalloc(sizeof (struct cluster_node), GFP_KERNEL); + if (!newnode) + goto alloc_err; + + memset(newnode, 0, sizeof (struct cluster_node)); + newnode->name = kmalloc(strlen(name) + 1, GFP_KERNEL); + if (!newnode->name) + goto alloc_err1; + + strcpy(newnode->name, name); + newnode->last_hello = jiffies; + newnode->votes = votes; + newnode->expected_votes = expected_votes; + newnode->state = state; + newnode->node_id = node_id; + newnode->us = 0; + newnode->leave_reason = 0; + newnode->last_seq_recv = 0; + newnode->last_seq_acked = 0; + newnode->last_seq_sent = 0; + newnode->incarnation = 0; + INIT_LIST_HEAD(&newnode->addr_list); + set_nodeid(newnode, node_id); + + /* Add the new node to the list */ + down(&cluster_members_lock); + list_add(&newnode->list, &cluster_members_list); + if (state == NODESTATE_MEMBER) + cluster_members++; + up(&cluster_members_lock); + + printk(KERN_INFO CMAN_NAME ": got node %s\n", name); + return newnode; + + alloc_err1: + kfree(newnode); + alloc_err: + send_leave(CLUSTER_LEAVEFLAG_PANIC); + + printk(KERN_CRIT CMAN_NAME + ": Cannot allocate memory for new cluster node %s\n", name); + + panic("cluster memory allocation failed"); + + return NULL; +} + +/* Remove node from a STARTTRANS message */ +static struct cluster_node *remove_node(int nodeid) +{ + struct cluster_node *node = find_node_by_nodeid(nodeid); + + if (node && node->state == NODESTATE_MEMBER) { + P_MEMB("starttrans removes node %s\n", node->name); + down(&cluster_members_lock); + node->state = NODESTATE_DEAD; + cluster_members--; + up(&cluster_members_lock); + + notify_kernel_listeners(DIED, (long) nodeid); + + /* If this node is us then go quietly */ + if (node->us) { + printk(KERN_INFO CMAN_NAME + ": killed by STARTTRANS or NOMINATE\n"); + quit_threads = 1; + 
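/* wake the membership thread so it notices quit_threads */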
wake_up_process(membership_task); + wake_up_interruptible(&cnxman_waitq); + } + } + return node; +} + +/* Add a node from a STARTTRANS or NOMINATE message */ +static void add_node_from_starttrans(struct msghdr *msg, int len) +{ + /* Add the new node but don't fill in the ID until the master has + * confirmed it */ + struct cl_mem_starttrans_msg *startmsg = + (struct cl_mem_starttrans_msg *) msg->msg_iov->iov_base; + char *msgbuf = (char *) msg->msg_iov->iov_base; + int ptr = sizeof (struct cl_mem_starttrans_msg); + char *name = + msgbuf + ptr + le16_to_cpu(startmsg->num_addrs) * address_length; + int i; + + joining_node = add_new_node(name, startmsg->votes, + le32_to_cpu(startmsg->expected_votes), + 0, NODESTATE_JOINING); + + /* add_new_node returns NULL if the node already exists */ + if (!joining_node) + joining_node = find_node_by_name(name); + + /* Add the node's addresses */ + if (list_empty(&joining_node->addr_list)) { + for (i = 0; i < le16_to_cpu(startmsg->num_addrs); i++) { + add_node_address(joining_node, msgbuf + ptr, address_length); + ptr += address_length; + } + } +} + +/* We have been nominated as master for a transition */ +static int do_process_nominate(struct msghdr *msg, int len) +{ + struct cl_mem_starttrans_msg *startmsg = + (struct cl_mem_starttrans_msg *)msg->msg_iov->iov_base; + struct cluster_node *node = NULL; + char *nodeaddr = msg->msg_iov->iov_base + sizeof(struct cl_mem_starttrans_msg); + + P_MEMB("nominate reason is %d\n", startmsg->reason); + + if (startmsg->reason == TRANS_REMNODE) { + node = remove_node(le32_to_cpu(startmsg->nodeid)); + } + + if (startmsg->reason == TRANS_NEWNODE) { + add_node_from_starttrans(msg, len); + node = joining_node; + /* Make sure we have a temp nodeid for the new node */ + joining_temp_nodeid = new_temp_nodeid(nodeaddr, + address_length); + } + + /* This should be a TRANS_CHECK but start_transition needs some node + * info */ + if (node == NULL) + node = us; + start_transition(startmsg->reason, node); + return 0; +} + +/* Got a STARTACK response from a node */ +static int do_process_startack(struct msghdr *msg, int len) +{ + if (node_state != MASTER && master_state != MASTER_START) { + P_MEMB("Got StartACK when not in MASTER_STARTING substate\n"); + return 0; + } + + /* msg is NULL if we are called directly from start_transition */ + if (msg) { + struct cl_mem_startack_msg *ackmsg = msg->msg_iov->iov_base; + + /* Ignore any messages with old generation numbers in them */ + if (le32_to_cpu(ackmsg->generation) != cluster_generation) { + P_MEMB("Got old generation START-ACK msg - ignoring\n"); + return 0; + } + } + + /* If the node_id is non-zero then use it. 
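A non-zero id in a STARTACK means an + * existing member already knew the joining node (e.g. from an earlier + * incarnation), so we adopt that id rather than allocate a new one.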
*/ + if (transitionreason == TRANS_NEWNODE && joining_node && msg) { + struct cl_mem_startack_msg *ackmsg = msg->msg_iov->iov_base; + + if (ackmsg->node_id) { + set_nodeid(joining_node, le32_to_cpu(ackmsg->node_id)); + } + highest_nodeid = + max(highest_nodeid, le32_to_cpu(ackmsg->highest_node_id)); + P_MEMB("Node id = %d, highest node id = %d\n", + le32_to_cpu(ackmsg->node_id), + le32_to_cpu(ackmsg->highest_node_id)); + } + + /* If we have all the responses in then move to the next stage */ + if (++responses_collected == responses_expected) { + + /* If the new node has no node_id (ie nobody in the cluster has + * heard of it before) then assign it a new one */ + if (transitionreason == TRANS_NEWNODE && joining_node) { + highest_nodeid = + max(highest_nodeid, get_highest_nodeid()); + if (joining_node->node_id == 0) { + set_nodeid(joining_node, ++highest_nodeid); + } + P_MEMB("nodeIDs: new node: %d, highest: %d\n", + joining_node->node_id, highest_nodeid); + } + + /* Behave a little differently if we are on our own */ + if (cluster_members == 1) { + if (transitionreason == TRANS_NEWNODE) { + /* If the cluster is just us then confirm at + * once */ + joinconf_count = 0; + mod_timer(&transition_timer, + jiffies + + cman_config.joinconf_timeout * HZ); + send_joinconf(); + return 0; + } + else { /* Node leaving the cluster */ + recalculate_quorum(leavereason); + leavereason = 0; + node_state = MEMBER; + } + } + else { + master_state = MASTER_COLLECT; + responses_collected = 0; + responses_expected = cluster_members - 1; + P_MEMB("Sending MASTERVIEW: expecting %d responses\n", + responses_expected); + + send_cluster_view(CLUSTER_MEM_MASTERVIEW, NULL, 0); + + /* Set a timer in case we don't get 'em all back */ + mod_timer(&transition_timer, + jiffies + + cman_config.transition_timeout * HZ); + } + } + return 0; +} + +/* Got a VIEWACK response from a node */ +static int do_process_viewack(struct msghdr *msg, int len) +{ + char *reply = msg->msg_iov->iov_base; + struct sockaddr_cl *saddr = msg->msg_name; + + if (master_state != MASTER_COLLECT) { + printk(KERN_INFO CMAN_NAME + ": got VIEWACK while not in state transition\n"); + return 0; + } + + if (node_opinion == NULL) { + node_opinion = + kmalloc((1 + highest_nodeid) * sizeof (uint8_t), GFP_KERNEL); + if (!node_opinion) { + panic(": malloc agree/dissent failed\n"); + } + memset(node_opinion, 0, (1 + highest_nodeid) * sizeof (uint8_t)); + } + + /* Keep a list of agreeing and dissenting nodes */ + if (reply[1] == 1) { + /* ACK - remote node agrees with me */ + P_MEMB("Node agrees\n"); + node_opinion[saddr->scl_nodeid] = OPINION_AGREE; + agreeing_nodes++; + } + else { + /* Remote node disagrees */ + P_MEMB("Node disagrees\n"); + node_opinion[saddr->scl_nodeid] = OPINION_DISAGREE; + dissenting_nodes++; + } + + P_MEMB("got %d responses, expected %d\n", responses_collected + 1, + responses_expected); + + /* Are all the results in yet ? */ + if (++responses_collected == responses_expected) { + del_timer(&transition_timer); + + P_MEMB("The results are in: %d agree, %d dissent\n", + agreeing_nodes, dissenting_nodes); + + if (agreeing_nodes > dissenting_nodes) { + /* Kill dissenting nodes */ + int i; + + for (i = 1; i <= responses_collected; i++) { + if (node_opinion[i] == OPINION_DISAGREE) + send_kill(i); + } + } + else { + /* We must leave the cluster as we are in a minority, + * the rest of them can fight it out amongst + * themselves. 
*/ + send_leave(CLUSTER_LEAVEFLAG_INCONSISTENT); + + agreeing_nodes = 0; + dissenting_nodes = 0; + kfree(node_opinion); + node_opinion = NULL; + node_state = LEFT_CLUSTER; + quit_threads = 1; + wake_up_process(membership_task); + wake_up_interruptible(&cnxman_waitq); + return -1; + } + + /* Reset counters */ + agreeing_nodes = 0; + dissenting_nodes = 0; + kfree(node_opinion); + node_opinion = NULL; + + /* Confirm new node */ + if (transitionreason == TRANS_NEWNODE) { + mod_timer(&transition_timer, + jiffies + cman_config.joinconf_timeout * HZ); + joinconf_count = 0; + send_joinconf(); + return 0; + } + + master_state = MASTER_COMPLETE; + + end_transition(); + } + + return 0; +} + +/* Got an ENDTRANS message */ +static int do_process_endtrans(struct msghdr *msg, int len) +{ + struct cl_mem_endtrans_msg *endmsg = + (struct cl_mem_endtrans_msg *) msg->msg_iov->iov_base; + struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name; + + /* Someone else's state transition */ + if (node_state != TRANSITION && node_state != JOINACK) + return 0; + + /* Check we got it from the MASTER node */ + if (master_node && master_node->node_id != saddr->scl_nodeid) { + printk(KERN_INFO + "Got ENDTRANS from a node not the master: master: %d, sender: %d\n", + master_node->node_id, saddr->scl_nodeid); + return 0; + } + + del_timer(&transition_timer); + + /* Set node ID on new node */ + if (endmsg->new_node_id) { + set_nodeid(joining_node, le32_to_cpu(endmsg->new_node_id)); + P_MEMB("new node %s has ID %d\n", joining_node->name, + joining_node->node_id); + } + + node_state = TRANSITION_COMPLETE; + + /* Need to set this here or the barrier code will reject us if we've + * just joined */ + we_are_a_cluster_member = TRUE; + + confirm_joiner(); + cluster_generation = le32_to_cpu(endmsg->generation); + + if (wait_for_completion_barrier() != 0) { + P_MEMB("Barrier timed out - restart\n"); + node_state = TRANSITION; + mod_timer(&transition_timer, + jiffies + cman_config.transition_timeout * HZ); + return 0; + } + + quorum = le32_to_cpu(endmsg->quorum); + set_quorate(le32_to_cpu(endmsg->total_votes)); + + /* Tell any waiting barriers that we had a transition */ + check_barrier_returns(); + + /* Clear the master node */ + master_node = NULL; + + node_state = MEMBER; + + /* Notify other listeners that transition has completed */ + notify_listeners(); + reset_hello_time(); + transition_end_time = jiffies; + + sm_member_update(cluster_is_quorate); + return 0; +} + +/* Turn a STARTTRANS message into NOMINATE and send it to the new master */ +static int send_nominate(struct cl_mem_starttrans_msg *startmsg, int msglen, + int nodeid) +{ + struct sockaddr_cl maddr; + + maddr.scl_port = CLUSTER_PORT_MEMBERSHIP; + maddr.scl_family = AF_CLUSTER; + maddr.scl_nodeid = nodeid; + + startmsg->cmd = CLUSTER_MEM_NOMINATE; + return kcl_sendmsg(mem_socket, startmsg, msglen, + &maddr, sizeof (maddr), 0); +} + +/* Got a STARTTRANS message */ +static int do_process_starttrans(struct msghdr *msg, int len) +{ + struct cl_mem_starttrans_msg *startmsg = + (struct cl_mem_starttrans_msg *) msg->msg_iov->iov_base; + struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name; + struct cluster_node *node; + unsigned int newgen = le32_to_cpu(startmsg->generation); + + /* Got a WHAT from WHOM? 
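do_process_viewack() above resolves a disputed cluster view by simple majority: agreeing nodes outvote dissenters, dissenters are sent KILL, and a node that finds itself in the minority leaves. A self-contained sketch of that tally (the array size and the opinions are made up for illustration):

```c
#include <stdio.h>

#define OPINION_AGREE    1
#define OPINION_DISAGREE 2

int main(void)
{
        /* Indexed by node id, like node_opinion[] above */
        unsigned char opinion[6] = { 0, OPINION_AGREE, OPINION_AGREE,
                                     OPINION_DISAGREE, 0, 0 };
        int agreeing = 0, dissenting = 0, i;

        for (i = 1; i < 6; i++) {
                if (opinion[i] == OPINION_AGREE)
                        agreeing++;
                else if (opinion[i] == OPINION_DISAGREE)
                        dissenting++;
        }

        if (agreeing > dissenting) {
                for (i = 1; i < 6; i++)
                        if (opinion[i] == OPINION_DISAGREE)
                                printf("kill dissenting node %d\n", i);
        } else {
                printf("we are in the minority: leave the cluster\n");
        }
        return 0;
}
```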
*/ + node = find_node_by_nodeid(saddr->scl_nodeid); + if (!node || node->state != NODESTATE_MEMBER) + return 0; + + /* Someone else's state transition */ + if (node_state != MEMBER && + node_state != TRANSITION && node_state != MASTER) + return 0; + + /* Ignore old generation STARTTRANS messages */ + if ((newgen < cluster_generation) || + (newgen == 0xFFFFFFFF && cluster_generation == 0)) { + P_MEMB("Ignoring STARTTRANS with old generation number\n"); + return 0; + } + + P_MEMB("Got starttrans: newgen = %d, oldgen = %d, reason = %d\n", + newgen, cluster_generation, startmsg->reason); + + /* Up the generation number */ + cluster_generation = newgen; + + /* If we are also a master then decide between us */ + if (node_state == MASTER) { + + /* See if we really want the responsibility of being master */ + if (elect_master(&node)) { + + /* I reluctantly accept this position of responsibility + */ + P_MEMB("I elected myself master\n"); + + /* start_transition will re-establish this */ + del_timer(&transition_timer); + + start_transition(TRANS_NEWMASTER, node); + return 0; + } + else { + /* Back down */ + P_MEMB("Backing down from MASTER status\n"); + master_node = node; + node_state = MEMBER; + + /* If we were bringing a new node into the cluster then + * we will have to abandon that now and tell the new + * node to try again later */ + if (transitionreason == TRANS_NEWNODE && joining_node) { + struct cluster_node_addr *first_addr = + (struct cluster_node_addr *) joining_node-> + addr_list.next; + + P_MEMB("Postponing membership of node %s\n", + joining_node->name); + send_joinack(first_addr->addr, address_length, + JOINACK_TYPE_WAIT); + + /* Not dead, just sleeping */ + joining_node->state = NODESTATE_DEAD; + joining_node = NULL; + } + + /* If the new master is not us OR the node we just got + * the STARTTRANS from then make sure it knows it has + * to be master */ + if (saddr->scl_nodeid != node->node_id) { + send_nominate(startmsg, len, node->node_id); + return 0; + } + + /* Fall through into MEMBER code below if we are + * obeying the STARTTRANS we just received */ + } + } + + /* Do non-MASTER STARTTRANS bits */ + if (node_state == MEMBER) { + int ptr = sizeof (struct cl_mem_starttrans_msg); + int node_id = 0; + + P_MEMB("Normal transition start\n"); + + /* If the master is adding a new node and we know it's node ID + * then ACK with it. */ + if (startmsg->reason == TRANS_NEWNODE) { + struct cluster_node *node = + find_node_by_addr((char *) startmsg + ptr, + address_length); + if (node) + node_id = node->node_id; + } + + /* Save the master info */ + master_node = find_node_by_nodeid(saddr->scl_nodeid); + node_state = TRANSITION; + + if (startmsg->reason == TRANS_NEWNODE) { + add_node_from_starttrans(msg, len); + } + + if (startmsg->reason == TRANS_REMNODE || + startmsg->reason == TRANS_ANOTHERREMNODE) { + remove_node(le32_to_cpu(startmsg->nodeid)); + } + + send_startack(saddr, msg->msg_namelen, + node_id); + + /* Establish timer in case the master dies */ + mod_timer(&transition_timer, + jiffies + cman_config.transition_timeout * HZ); + + return 0; + } + + /* We are in transition but this may be a restart */ + if (node_state == TRANSITION) { + + master_node = find_node_by_nodeid(saddr->scl_nodeid); + send_startack(saddr, msg->msg_namelen, 0); + + /* Is it a new joining node ? 
This happens if a master is + * usurped */ + if (startmsg->reason == TRANS_NEWNODE) { + struct cluster_node *oldjoin = joining_node; + + add_node_from_starttrans(msg, len); + + /* If this is a different node joining than the one we + * were previously joining (probably cos the master is + * a nominated one) then mark our "old" joiner as DEAD. + * The original master will already have told the node + * to go back into JOINWAIT state */ + if (oldjoin && oldjoin != joining_node + && oldjoin->state == NODESTATE_JOINING) + oldjoin->state = NODESTATE_DEAD; + } + + /* Is it a new master node? */ + if (startmsg->reason == TRANS_NEWMASTER || + startmsg->reason == TRANS_DEADMASTER) { + P_MEMB("starttrans %s, node=%d\n", + startmsg->reason == + TRANS_NEWMASTER ? "NEWMASTER" : "DEADMASTER", + le32_to_cpu(startmsg->nodeid)); + + /* If the old master has died then remove it */ + node = + find_node_by_nodeid(le32_to_cpu(startmsg->nodeid)); + + if (startmsg->reason == TRANS_DEADMASTER && + node && node->state == NODESTATE_MEMBER) { + down(&cluster_members_lock); + node->state = NODESTATE_DEAD; + cluster_members--; + up(&cluster_members_lock); + } + + /* Store new master */ + master_node = find_node_by_nodeid(saddr->scl_nodeid); + } + + /* Another node has died (or been killed) */ + if (startmsg->reason == TRANS_ANOTHERREMNODE) { + /* Remove new dead node */ + node = + find_node_by_nodeid(le32_to_cpu(startmsg->nodeid)); + if (node && node->state == NODESTATE_MEMBER) { + down(&cluster_members_lock); + node->state = NODESTATE_DEAD; + cluster_members--; + up(&cluster_members_lock); + } + } + /* Restart the timer */ + del_timer(&transition_timer); + mod_timer(&transition_timer, + jiffies + cman_config.transition_timeout * HZ); + } + + return 0; +} + +/* Change a cluster parameter */ +static int do_process_reconfig(struct msghdr *msg, int len) +{ + struct cl_mem_reconfig_msg *confmsg; + struct sockaddr_cl *saddr = msg->msg_name; + struct cluster_node *node; + unsigned int val; + + if (len < sizeof(struct cl_mem_reconfig_msg)) + return -1; + + confmsg = (struct cl_mem_reconfig_msg *) msg->msg_iov->iov_base; + val = le32_to_cpu(confmsg->value); + + switch (confmsg->param) { + + case RECONFIG_PARAM_EXPECTED_VOTES: + /* Set any nodes with expected_votes higher than the new value + * down */ + if (val > 0) { + struct cluster_node *node; + + down(&cluster_members_lock); + list_for_each_entry(node, &cluster_members_list, list) { + if (node->state == NODESTATE_MEMBER && + node->expected_votes > val) { + node->expected_votes = val; + } + } + up(&cluster_members_lock); + if (expected_votes > val) + expected_votes = val; + } + recalculate_quorum(1); /* Allow decrease */ + sm_member_update(cluster_is_quorate); + break; + + case RECONFIG_PARAM_NODE_VOTES: + node = find_node_by_nodeid(saddr->scl_nodeid); + node->votes = val; + recalculate_quorum(1); /* Allow decrease */ + sm_member_update(cluster_is_quorate); + break; + + case RECONFIG_PARAM_CONFIG_VERSION: + config_version = val; + break; + + default: + printk(KERN_INFO CMAN_NAME + ": got unknown parameter in reconfigure message. 
%d\n", + confmsg->param); + break; + } + return 0; +} + +/* Response from master node */ +static int do_process_joinack(struct msghdr *msg, int len) +{ + struct cl_mem_joinack_msg *ackmsg = msg->msg_iov->iov_base; + + join_time = jiffies; + if (ackmsg->acktype == JOINACK_TYPE_OK) { + node_state = JOINACK; + } + + if (ackmsg->acktype == JOINACK_TYPE_NAK) { + printk(KERN_WARNING CMAN_NAME + ": Cluster membership rejected\n"); + P_MEMB("Got JOINACK NACK\n"); + node_state = REJECTED; + } + + if (ackmsg->acktype == JOINACK_TYPE_WAIT) { + P_MEMB("Got JOINACK WAIT\n"); + node_state = JOINWAIT; + joinwait_time = jiffies; + } + + return 0; +} + +/* Request to join the cluster. This makes us the master for this state + * transition */ +static int do_process_joinreq(struct msghdr *msg, int len) +{ + int status; + static unsigned long last_joinreq = 0; + static char last_name[MAX_CLUSTER_MEMBER_NAME_LEN]; + struct cl_mem_join_msg *joinmsg = msg->msg_iov->iov_base; + struct cluster_node *node; + + /* If we are in a state transition then tell the new node to wait a bit + * longer */ + if (node_state != MEMBER) { + if (node_state == MASTER || node_state == TRANSITION) { + send_joinack(msg->msg_name, msg->msg_namelen, + JOINACK_TYPE_WAIT); + } + return 0; + } + + /* Check version number */ + if (le32_to_cpu(joinmsg->major_version) == CNXMAN_MAJOR_VERSION) { + char *ptr = (char *) joinmsg; + char *name; + + /* Sanity-check the num_addrs field otherwise we could oops */ + if (le16_to_cpu(joinmsg->num_addr) * address_length > len) { + printk(KERN_WARNING CMAN_NAME + ": num_addr in JOIN-REQ message is rubbish: %d\n", + le16_to_cpu(joinmsg->num_addr)); + return 0; + } + + /* Check the cluster name matches */ + if (strcmp(cluster_name, joinmsg->clustername)) { + printk(KERN_WARNING CMAN_NAME + ": attempt to join with cluster name '%s' refused\n", + joinmsg->clustername); + send_joinack(msg->msg_name, msg->msg_namelen, + JOINACK_TYPE_NAK); + return 0; + } + + ptr += sizeof (*joinmsg); + name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length; + + /* Check we are not exceeding the maximum number of nodes */ + if (cluster_members > cman_config.max_nodes) { + printk(KERN_WARNING CMAN_NAME + ": Join request from %s rejected, exceeds maximum number of nodes\n", + name); + send_joinack(msg->msg_name, msg->msg_namelen, + JOINACK_TYPE_NAK); + return 0; + } + + /* Check that we don't exceed the two_node limit */ + if (two_node && cluster_members == 2) { + printk(KERN_WARNING CMAN_NAME ": Join request from %s " + "rejected, exceeds two node limit\n", name); + send_joinack(msg->msg_name, msg->msg_namelen, + JOINACK_TYPE_NAK); + return 0; + } + + if (le16_to_cpu(joinmsg->config_version) != config_version) { + printk(KERN_WARNING CMAN_NAME ": Join request from %s " + "rejected, config version local %u remote %u\n", + name, config_version, + le16_to_cpu(joinmsg->config_version)); + send_joinack(msg->msg_name, msg->msg_namelen, + JOINACK_TYPE_NAK); + return 0; + } + + /* If these don't match then I don't know how the message + arrived! However, I can't take the chance */ + if (le32_to_cpu(joinmsg->addr_len) != address_length) { + printk(KERN_WARNING CMAN_NAME ": Join request from %s " + "rejected, address length local: %u remote %u\n", + name, address_length, + le32_to_cpu(joinmsg->addr_len)); + send_joinack(msg->msg_name, msg->msg_namelen, + JOINACK_TYPE_NAK); + return 0; + } + + /* Duplicate checking: Because joining messages do not have + * sequence numbers we may get as many JOINREQ messages as we + * have interfaces. 
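(A plain-C sketch of that duplicate-suppression window follows, using time() where the kernel code uses jiffies; the 10-second constant mirrors the 10 * HZ below. Unlike the real code, which refreshes the window only when a join is accepted, the sketch refreshes it on every call.)

```c
#include <stdio.h>
#include <string.h>
#include <time.h>

static time_t last_joinreq;
static char last_name[64];

/* Returns 1 if this request repeats a recent one from the same name */
static int duplicate_joinreq(const char *name)
{
        time_t now = time(NULL);

        if (now < last_joinreq + 10 && strcmp(name, last_name) == 0)
                return 1;

        last_joinreq = now;
        snprintf(last_name, sizeof(last_name), "%s", name);
        return 0;
}

int main(void)
{
        printf("%d %d\n", duplicate_joinreq("node-a"),
               duplicate_joinreq("node-a"));    /* prints "0 1" */
        return 0;
}
```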
This bit of code here just checks for
+	 * JOINREQ messages that come in from the same node in a small
+	 * period of time and removes the duplicates */
+	if (time_before(jiffies, last_joinreq + 10 * HZ)
+	    && strcmp(name, last_name) == 0) {
+		return 0;
+	}
+
+	/* Do we already know about this node? */
+	status = check_duplicate_node(name, msg, len);
+
+	if (status < 0) {
+		send_joinack(msg->msg_name, msg->msg_namelen,
+			     JOINACK_TYPE_NAK);
+		return 0;
+	}
+
+	/* OK, you can be in my gang */
+	if (status == 0) {
+		int i;
+		struct sockaddr_cl *addr = msg->msg_name;
+
+		last_joinreq = jiffies;
+		strcpy(last_name, name);
+
+		node =
+		    add_new_node(name, joinmsg->votes,
+				 le32_to_cpu(joinmsg->expected_votes),
+				 0, NODESTATE_JOINING);
+
+		/* Add the node's addresses */
+		if (list_empty(&node->addr_list)) {
+			for (i = 0; i < le16_to_cpu(joinmsg->num_addr);
+			     i++) {
+				add_node_address(node, ptr, address_length);
+				ptr += address_length;
+			}
+		}
+
+		send_joinack(msg->msg_name, msg->msg_namelen,
+			     JOINACK_TYPE_OK);
+		joining_node = node;
+		joining_temp_nodeid = addr->scl_nodeid;
+
+		/* Start the state transition */
+		start_transition(TRANS_NEWNODE, node);
+	}
+	}
+	else {
+		/* Version number mismatch, don't use any part of the message
+		 * other than the version numbers as things may have moved */
+		char buf[MAX_ADDR_PRINTED_LEN];
+
+		printk(KERN_INFO CMAN_NAME
+		       ": Got join message from node running incompatible software. (us: %d.%d.%d, them: %d.%d.%d) addr: %s\n",
+		       CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
+		       CNXMAN_PATCH_VERSION,
+		       le32_to_cpu(joinmsg->major_version),
+		       le32_to_cpu(joinmsg->minor_version),
+		       le32_to_cpu(joinmsg->patch_version),
+		       print_addr(msg->msg_name, msg->msg_namelen, buf));
+
+		send_joinack(msg->msg_name, msg->msg_namelen,
+			     JOINACK_TYPE_NAK);
+		return 0;
+	}
+
+	return 0;
+}
+
+/* A simple function to invent a small number based
+   on the node name */
+static int node_hash(void)
+{
+	int i;
+	int value = 0;
+
+	for (i=0; i<strlen(nodename); i++)
+		value += nodename[i];
+
+	return value & 0xF;
+}
+
+/* Check a node entry from the master against what we know of the node */
+static int check_node(struct cluster_node *newnode, char *addrs,
+		      unsigned short num_addr)
+{
+	struct cluster_node *node = find_node_by_name(newnode->name);
+
+	P_MEMB("check_node: %s", newnode->name);
+
+	if (!node) {
+		C_MEMB(" - not found\n");
+		return -1;
+	}
+
+	if (node->votes != newnode->votes ||
+	    node->node_id != newnode->node_id ||
+	    node->state != NODESTATE_MEMBER) {
+		C_MEMB
+		    (" - wrong info: votes=%d(exp: %d) id=%d(exp: %d) state = %d\n",
+		     node->votes, newnode->votes, node->node_id,
+		     newnode->node_id, node->state);
+		return -1;
+	}
+	C_MEMB(" - OK\n");
+	return 0;
+}
+
+/* Called for each new node found in a JOINCONF message. Create a new node
+ * entry */
+static int add_node(struct cluster_node *node, char *addrs,
+		    unsigned short num_addr)
+{
+	P_MEMB("add_node: %s, v:%d, e:%d, i:%d\n", node->name, node->votes,
+	       node->expected_votes, node->node_id);
+
+	if (!find_node_by_name(node->name)) {
+		struct cluster_node *newnode;
+		int i;
+
+		if ((newnode =
+		     add_new_node(node->name, node->votes, node->expected_votes,
+				  node->node_id, NODESTATE_MEMBER)) == NULL) {
+			P_MEMB("Error adding node\n");
+			return -1;
+		}
+		if (list_empty(&newnode->addr_list)) {
+			for (i = 0; i < num_addr; i++) {
+				add_node_address(newnode,
+						 addrs + i * address_length, address_length);
+			}
+		}
+		return 0;
+	}
+	else {
+		P_MEMB("Already got node with name %s\n", node->name);
+		return -1;
+	}
+}
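unpack_nodes() below walks a packed list of node entries: a name-length byte, the name itself, a little-endian u16 address count, the addresses, a votes byte, then le32 expected_votes and le32 node id. A sketch of the matching packing side (userspace illustration; ADDR_LEN stands in for the cluster's negotiated address_length):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ADDR_LEN 16     /* stands in for address_length */

/* Append one node entry in the layout unpack_nodes() expects:
 * [namelen][name][le16 num_addr][addresses][votes][le32 evotes][le32 id] */
static int pack_node(unsigned char *buf, const char *name,
                     const unsigned char *addr, uint16_t num_addr,
                     uint8_t votes, uint32_t evotes, uint32_t node_id)
{
        int ptr = 0, i;
        int namelen = (int) strlen(name);

        buf[ptr++] = (unsigned char) namelen;
        memcpy(&buf[ptr], name, namelen);
        ptr += namelen;

        buf[ptr++] = num_addr & 0xff;           /* little-endian u16 */
        buf[ptr++] = (num_addr >> 8) & 0xff;

        for (i = 0; i < num_addr; i++) {
                memcpy(&buf[ptr], addr, ADDR_LEN);
                ptr += ADDR_LEN;
        }

        buf[ptr++] = votes;

        for (i = 0; i < 4; i++)                 /* little-endian u32s */
                buf[ptr++] = (evotes >> (8 * i)) & 0xff;
        for (i = 0; i < 4; i++)
                buf[ptr++] = (node_id >> (8 * i)) & 0xff;

        return ptr;                             /* bytes used */
}

int main(void)
{
        unsigned char buf[64], addr[ADDR_LEN] = { 0 };

        printf("entry is %d bytes\n",
               pack_node(buf, "node-a", addr, 1, 1, 2, 5));     /* 34 */
        return 0;
}
```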
/* Call a specified routine for each node unpacked from the message. Return
+ * either the number of nodes found or -1 for an error */
+static int unpack_nodes(unsigned char *buf, int len,
+			int (*routine) (struct cluster_node *, char *,
+					unsigned short))
+{
+	int ptr = 0;
+	int num_nodes = 0;
+	char nodename[MAX_CLUSTER_MEMBER_NAME_LEN];
+	struct cluster_node node;
+
+	node.name = nodename;
+
+	while (ptr < len) {
+		int namelen = buf[ptr++];
+		unsigned int evotes;
+		unsigned int node_id;
+		unsigned short num_addr;
+		unsigned char *addrs;
+
+		memcpy(nodename, &buf[ptr], namelen);
+		nodename[namelen] = '\0';
+		ptr += namelen;
+
+		memcpy(&num_addr, &buf[ptr], sizeof (short));
+		num_addr = le16_to_cpu(num_addr);
+		ptr += sizeof (short);
+
+		/* Just make a note of the addrs "array" */
+		addrs = &buf[ptr];
+		ptr += num_addr * address_length;
+
+		node.votes = buf[ptr++];
+
+		memcpy(&evotes, &buf[ptr], sizeof (int));
+		node.expected_votes = le32_to_cpu(evotes);
+		ptr += sizeof (int);
+
+		memcpy(&node_id, &buf[ptr], sizeof (int));
+		node.node_id = le32_to_cpu(node_id);
+		ptr += sizeof (int);
+
+		/* Call the callback routine */
+		if (routine(&node, addrs, num_addr) < 0)
+			return -1;
+		num_nodes++;
+	}
+	return num_nodes;
+}
+
+/* Got join confirmation from a master node. This message contains a list of
+ * cluster nodes which we unpack and build into our cluster nodes list. When we
+ * have the last message we can go into TRANSITION state */
+static int do_process_joinconf(struct msghdr *msg, int len)
+{
+	char *message = msg->msg_iov->iov_base;
+
+	if (unpack_nodes(message + 2, len - 2, add_node) < 0) {
+		printk(CMAN_NAME
+		       ": Error processing joinconf message - giving up on cluster join\n");
+		send_leave(CLUSTER_LEAVEFLAG_PANIC);
+		return -1;
+	}
+
+	/* Last message in the list? */
+	if (message[1] & 2) {
+		char ackmsg;
+		struct sockaddr_cl *addr = msg->msg_name;
+
+		us->state = NODESTATE_MEMBER;
+		node_state = TRANSITION;
+		we_are_a_cluster_member = TRUE;
+
+		ackmsg = CLUSTER_MEM_CONFACK;
+		kcl_sendmsg(mem_socket, &ackmsg, 1, addr,
+			    sizeof (struct sockaddr_cl),
+			    MSG_NOACK);
+		kernel_thread(hello_kthread, NULL, 0);
+		mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
+	}
+	return 0;
+}
+
+/* Got the master's view of the cluster - compare it with ours and tell it the
+ * result */
+static int do_process_masterview(struct msghdr *msg, int len)
+{
+	char reply[2] = { CLUSTER_MEM_VIEWACK, 0 };
+	char *message = msg->msg_iov->iov_base;
+	static int num_nodes;
+
+	/* Someone else's state transition */
+	if (node_state != MEMBER &&
+	    node_state != TRANSITION && node_state != MASTER)
+		return 0;
+
+	/* First message, zero the counter */
+	if (message[1] & 1)
+		num_nodes = 0;
+
+	num_nodes +=
+	    unpack_nodes(msg->msg_iov->iov_base + 2, len - 2, check_node);
+
+	/* Last message, check the count and reply */
+	if (message[1] & 2) {
+		if (num_nodes == cluster_members) {
+			/* Send ACK */
+			reply[1] = 1;
+		}
+		else {
+			P_MEMB
+			    ("Got %d nodes in MASTERVIEW message, we think there s/b %d\n",
+			     num_nodes, cluster_members);
+			/* Send NAK */
+			reply[1] = 0;
+		}
+		kcl_sendmsg(mem_socket, reply, 2, msg->msg_name,
+			    msg->msg_namelen, 0);
+	}
+	return 0;
+}
+
+static int do_process_leave(struct msghdr *msg, int len)
+{
+	struct cluster_node *node;
+	struct sockaddr_cl *saddr = msg->msg_name;
+	unsigned char *leavemsg = (unsigned char *) msg->msg_iov->iov_base;
+
+	if ((node = find_node_by_nodeid(saddr->scl_nodeid))) {
+		unsigned char reason = leavemsg[1];
+
+		if (node->state != NODESTATE_DEAD) {
+			printk(KERN_INFO CMAN_NAME
+			       ": Node %s is leaving the cluster, reason %d\n",
+			       node->name, reason);
+
+			node->leave_reason = reason;
+		}
+		leavereason = (reason == CLUSTER_LEAVEFLAG_REMOVED ? 1 : 0);
+
+		a_node_just_died(node);
+
+		/* If it was the master node, then we have been nominated as
+		 * the successor */
+		if (node == master_node) {
+			start_transition(TRANS_DEADMASTER, master_node);
+		}
+
+	}
+	return 0;
+}
+
+static int do_process_hello(struct msghdr *msg, int len)
+{
+	struct cluster_node *node;
+	struct cl_mem_hello_msg *hellomsg =
+	    (struct cl_mem_hello_msg *) msg->msg_iov->iov_base;
+	struct sockaddr_cl *saddr = msg->msg_name;
+
+	/* We are starting up. Send a join message to the node whose HELLO we
+	 * just received */
+	if (node_state == STARTING || node_state == JOINWAIT) {
+		struct sockaddr_cl *addr = msg->msg_name;
+
+		printk(KERN_INFO CMAN_NAME ": sending membership request\n");
+
+		send_joinreq(addr, msg->msg_namelen);
+		join_time = jiffies;
+		node_state = JOINING;
+		return 0;
+	}
+
+	/* Only process HELLOs if we are not in transition */
+	if (node_state == MEMBER) {
+		if (len < sizeof (struct cl_mem_hello_msg)) {
+			printk(KERN_ERR CMAN_NAME
+			       ": short hello message from node %d\n",
+			       saddr->scl_nodeid);
+			return -1;
+		}
+
+		node = find_node_by_nodeid(saddr->scl_nodeid);
+		if (node && node->state != NODESTATE_DEAD) {
+
+			/* Check the cluster generation in the HELLO message.
+			 * NOTE: this may be different if the message crossed
+			 * on the wire with an END-TRANS so we allow a period
+			 * of grace in which this is allowable */
+			if (cluster_generation !=
+			    le32_to_cpu(hellomsg->generation)
+			    && node_state == MEMBER
+			    && time_after(jiffies,
+					  cman_config.hello_timer * HZ +
+					  transition_end_time)) {
+				char killmsg;
+
+				printk(KERN_INFO CMAN_NAME
+				       ": bad generation number %d in HELLO message, expected %d\n",
+				       le32_to_cpu(hellomsg->generation),
+				       cluster_generation);
+
+				notify_kernel_listeners(DIED,
+							(long) node->node_id);
+
+				killmsg = CLUSTER_MEM_KILL;
+				kcl_sendmsg(mem_socket, &killmsg, 1,
+					    saddr, sizeof (struct sockaddr_cl),
+					    MSG_NOACK);
+				return 0;
+			}
+
+			if (cluster_members != le16_to_cpu(hellomsg->members)
+			    && node_state == MEMBER) {
+				printk(KERN_INFO CMAN_NAME
+				       ": nmembers in HELLO message does not match our view\n");
+				start_transition(TRANS_CHECK, node);
+				return 0;
+			}
+			/* The message is OK - save the time */
+			node->last_hello = jiffies;
+
+		}
+		else {
+			struct sockaddr_cl *addr = msg->msg_name;
+
+			/* This node is a danger to our valid cluster */
+			if (cluster_is_quorate) {
+				char killmsg;
+
+				killmsg = CLUSTER_MEM_KILL;
+				kcl_sendmsg(mem_socket, &killmsg, 1, addr,
+					    sizeof (struct sockaddr_cl),
+					    MSG_NOACK);
+			}
+
+		}
+	}
+
+	return 0;
+
+}
+
+static int do_process_kill(struct msghdr *msg, int len)
+{
+	struct sockaddr_cl *saddr = msg->msg_name;
+	struct cluster_node *node;
+
+	node = find_node_by_nodeid(saddr->scl_nodeid);
+	if (node && node->state == NODESTATE_MEMBER) {
+
+		printk(KERN_INFO CMAN_NAME
+		       ": Being told to leave the cluster by node %d\n",
+		       saddr->scl_nodeid);
+
+		node_state = LEFT_CLUSTER;
+		quit_threads = 1;
+		wake_up_process(membership_task);
+		wake_up_interruptible(&cnxman_waitq);
+	}
+	else {
+		P_MEMB("Asked to leave the cluster by a non-member. What a nerve!\n");
+	}
+	return 0;
+}
+
+/* Some cluster membership utility functions */
+struct cluster_node *find_node_by_name(char *name)
+{
+	struct list_head *nodelist;
+	struct cluster_node *node;
+
+	down(&cluster_members_lock);
+	list_for_each(nodelist, &cluster_members_list) {
+		node = list_entry(nodelist, struct cluster_node, list);
+
+		if (strcmp(node->name, name) == 0) {
+			up(&cluster_members_lock);
+			return node;
+		}
+	}
+	up(&cluster_members_lock);
+	return NULL;
+}
+
+/* Try to avoid using this as it's slow and holds the members lock */
+struct cluster_node *find_node_by_addr(unsigned char *addr, int addr_len)
+{
+	struct list_head *nodelist;
+	struct list_head *addrlist;
+	struct cluster_node *node;
+	struct cluster_node_addr *nodeaddr;
+
+	down(&cluster_members_lock);
+
+	list_for_each(nodelist, &cluster_members_list) {
+		node = list_entry(nodelist, struct cluster_node, list);
+
+		list_for_each(addrlist, &node->addr_list) {
+			nodeaddr =
+			    list_entry(addrlist, struct cluster_node_addr,
+				       list);
+
+			if (memcmp(nodeaddr->addr, addr, address_length) == 0) {
+				up(&cluster_members_lock);
+				return node;
+			}
+		}
+	}
+
+	up(&cluster_members_lock);
+	return NULL;
+}
+
+/* This is the quick way to find a node */
+struct cluster_node *find_node_by_nodeid(unsigned int id)
+{
+	struct cluster_node *node;
+
+	if (id >= sizeof_members_array)
+		return NULL;
+
+	spin_lock(&members_by_nodeid_lock);
+	node = members_by_nodeid[id];
+	spin_unlock(&members_by_nodeid_lock);
+	return node;
+}
+
+static int dispatch_messages(struct socket *mem_socket)
+{
+	int err = 0;
+
+	while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
+		struct msghdr msg;
+		struct iovec iov;
+		struct sockaddr_cl sin;
+		int len;
+		mm_segment_t fs;
+
+		memset(&sin, 0, sizeof (sin));
+
+		msg.msg_control = NULL;
+		msg.msg_controllen = 0;
+		msg.msg_iovlen = 1;
+		msg.msg_iov = &iov;
+		msg.msg_name = &sin;
+		msg.msg_namelen = sizeof (sin);
+		msg.msg_flags = 0;
+
+		iov.iov_len = MAX_CLUSTER_MESSAGE;
+		iov.iov_base = iobuf;
+
+		fs = get_fs();
+		set_fs(get_ds());
+
+		len =
+		    sock_recvmsg(mem_socket, &msg, MAX_CLUSTER_MESSAGE,
+				 MSG_DONTWAIT);
+		set_fs(fs);
+		if (len > 0) {
+			iov.iov_base = iobuf;	/* Reinstate pointer */
+			msg.msg_name = &sin;
+			do_membership_packet(&msg, len);
+		}
+		else {
+			if (len == -EAGAIN)
+				err = 0;
+			else
+				err = -1;
+			break;
+		}
+	}
+	return err;
+}
+
+/* Scan the nodes list for dead nodes */
+static void check_for_dead_nodes()
+{
+	struct list_head *nodelist;
+	struct cluster_node *node;
+
+	down(&cluster_members_lock);
+	list_for_each(nodelist, &cluster_members_list) {
+		node = list_entry(nodelist, struct cluster_node, list);
+
+		if (node->state != NODESTATE_DEAD &&
+		    time_after(jiffies,
+			       node->last_hello +
+			       cman_config.deadnode_timeout * HZ) && !node->us) {
+
+			up(&cluster_members_lock);
+
+			printk(KERN_WARNING CMAN_NAME
+			       ": no HELLO from %s, removing from the cluster\n",
+			       node->name);
+
+			P_MEMB("last hello was %ld, current time is %ld\n",
+			       node->last_hello, jiffies);
+
+			node->leave_reason = CLUSTER_LEAVEFLAG_DEAD;
+			leavereason = 0;
+
+			/* This is unlikely to work but it's worth a try!
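The dead-node scan above declares a node dead once time_after(jiffies, last_hello + deadnode_timeout * HZ) holds. time_after() stays correct across a jiffies wrap because it compares via a signed difference; a plain-C rendering of the same trick (illustration only):

```c
#include <stdint.h>
#include <stdio.h>

/* True if a is later than b, even across a counter wrap */
static int after(uint32_t a, uint32_t b)
{
        return (int32_t) (b - a) < 0;
}

int main(void)
{
        uint32_t last_hello = 0xfffffff0u;      /* just before the wrap */
        uint32_t now = 0x00000010u;             /* just after it */

        printf("timed out: %d\n", after(now, last_hello + 8));  /* 1 */
        return 0;
}
```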
*/ + send_kill(node->node_id); + + /* Start state transition */ + a_node_just_died(node); + return; + } + } + up(&cluster_members_lock); + + /* Also check for a dead quorum device */ + if (quorum_device) { + if (quorum_device->state == NODESTATE_MEMBER && + time_after(jiffies, + quorum_device->last_hello + + cman_config.deadnode_timeout * HZ)) { + quorum_device->state = NODESTATE_DEAD; + printk(KERN_WARNING CMAN_NAME + ": Quorum device %s timed out\n", + quorum_device->name); + recalculate_quorum(0); + } + } + + return; +} + +/* add "us" as a node in the cluster */ +static int add_us() +{ + struct cluster_node *newnode = + kmalloc(sizeof (struct cluster_node), GFP_KERNEL); + + if (!newnode) { + /* Oh shit, we have to commit hara kiri here for the greater + * good of the cluster */ + send_leave(CLUSTER_LEAVEFLAG_PANIC); + + printk(KERN_CRIT CMAN_NAME + ": Cannot allocate memory for our node structure\n"); + panic("Must die"); + + return -1; + } + + memset(newnode, 0, sizeof (struct cluster_node)); + newnode->name = kmalloc(strlen(nodename) + 1, GFP_KERNEL); + if (!newnode->name) { + send_leave(CLUSTER_LEAVEFLAG_PANIC); + + printk(KERN_CRIT CMAN_NAME + ": Cannot allocate memory for node name\n"); + kfree(newnode); + + panic("Must die"); + + return -1; + } + + strcpy(newnode->name, nodename); + newnode->last_hello = jiffies; + newnode->votes = votes; + newnode->expected_votes = expected_votes; + newnode->state = NODESTATE_JOINING; + newnode->node_id = 0; /* Will get filled in by ENDTRANS message */ + newnode->us = 1; + newnode->leave_reason = 0; + INIT_LIST_HEAD(&newnode->addr_list); + get_local_addresses(newnode); /* Get from cnxman socket info */ + + /* Add the new node to the list */ + down(&cluster_members_lock); + list_add(&newnode->list, &cluster_members_list); + cluster_members++; + up(&cluster_members_lock); + us = newnode; + + return 0; +} + +/* Return the highest known node_id */ +unsigned int get_highest_nodeid() +{ + struct list_head *nodelist; + struct cluster_node *node = NULL; + unsigned int highest = 0; + + down(&cluster_members_lock); + list_for_each(nodelist, &cluster_members_list) { + node = list_entry(nodelist, struct cluster_node, list); + + if (node->node_id > highest) + highest = node->node_id; + } + up(&cluster_members_lock); + + return highest; +} + +/* Elect a new master if there is a clash. Returns 1 if we are the new master, + * the master's struct will also be returned. 
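(The election below scans node ids upwards and takes the first live member, so every node deterministically picks the same, lowest-id master. A toy rendering with a made-up membership array:)

```c
#include <stdio.h>

int main(void)
{
        /* Indexed by node id; 1 = current cluster member */
        int member[8] = { 0, 0, 1, 0, 1, 1, 0, 0 };
        int i;

        for (i = 1; i < 8; i++) {
                if (member[i]) {
                        printf("elected master: node %d\n", i); /* node 2 */
                        break;
                }
        }
        return 0;
}
```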
This, rather primitively, uses + * the lowest node ID */ +static int elect_master(struct cluster_node **master_node) +{ + int i; + + for (i = 1; i < sizeof_members_array; i++) { + if (members_by_nodeid[i] + && members_by_nodeid[i]->state == NODESTATE_MEMBER) { + *master_node = members_by_nodeid[i]; + P_MEMB("Elected master is %s\n", (*master_node)->name); + return (*master_node)->us; + } + } + BUG(); + return 0; +} + +/* Called by node_cleanup in cnxman when we have left the cluster */ +void free_nodeid_array() +{ + vfree(members_by_nodeid); + members_by_nodeid = NULL; + sizeof_members_array = 0; +} + +int allocate_nodeid_array() +{ + /* Allocate space for the nodeid lookup array */ + if (!members_by_nodeid) { + spin_lock_init(&members_by_nodeid_lock); + members_by_nodeid = + vmalloc(cman_config.max_nodes * + sizeof (struct cluster_member *)); + } + + if (!members_by_nodeid) { + printk(KERN_WARNING + "Unable to allocate members array for %d members\n", + cman_config.max_nodes); + return -ENOMEM; + } + memset(members_by_nodeid, 0, + cman_config.max_nodes * sizeof (struct cluster_member *)); + sizeof_members_array = cman_config.max_nodes; + + return 0; +} + +/* Set the votes & expected_votes variables */ +void set_votes(int v, int e) +{ + votes = v; + expected_votes = e; +} + +int get_quorum() +{ + return quorum; +} + +/* Called by cnxman to see if activity should be blocked because we are in a + * state transition */ +int in_transition() +{ + return node_state == TRANSITION || + node_state == TRANSITION_COMPLETE || node_state == MASTER; +} + +/* Return the current membership state as a string for the main line to put + * into /proc . I really should be using snprintf rather than sprintf but it's + * not exported... */ +char *membership_state(char *buf, int buflen) +{ + switch (node_state) { + case STARTING: + strncpy(buf, "Starting", buflen); + break; + case JOINING: + strncpy(buf, "Joining", buflen); + break; + case JOINWAIT: + strncpy(buf, "Join-Wait", buflen); + break; + case JOINACK: + strncpy(buf, "Join-Ack", buflen); + break; + case TRANSITION: + sprintf(buf, "State-Transition: Master is %s", + master_node ? 
master_node->name : "Unknown"); + break; + case MEMBER: + strncpy(buf, "Cluster-Member", buflen); + break; + case REJECTED: + strncpy(buf, "Rejected", buflen); + break; + case LEFT_CLUSTER: + strncpy(buf, "Left-Cluster", buflen); + break; + case TRANSITION_COMPLETE: + strncpy(buf, "Transition-Complete", buflen); + break; + case MASTER: + strncpy(buf, "Transition-Master", buflen); + break; + default: + sprintf(buf, "Unknown: code=%d", node_state); + break; + } + + return buf; +} + +#ifdef DEBUG_MEMB +static char *msgname(int msg) +{ + switch (msg) { + case CLUSTER_MEM_JOINCONF: + return "JOINCONF"; + case CLUSTER_MEM_JOINREQ: + return "JOINREQ"; + case CLUSTER_MEM_LEAVE: + return "LEAVE"; + case CLUSTER_MEM_HELLO: + return "HELLO"; + case CLUSTER_MEM_KILL: + return "KILL"; + case CLUSTER_MEM_JOINACK: + return "JOINACK"; + case CLUSTER_MEM_ENDTRANS: + return "ENDTRANS"; + case CLUSTER_MEM_RECONFIG: + return "RECONFIG"; + case CLUSTER_MEM_MASTERVIEW: + return "MASTERVIEW"; + case CLUSTER_MEM_STARTTRANS: + return "STARTTRANS"; + case CLUSTER_MEM_JOINREJ: + return "JOINREJ"; + case CLUSTER_MEM_VIEWACK: + return "VIEWACK"; + case CLUSTER_MEM_STARTACK: + return "STARTACK"; + case CLUSTER_MEM_NEWCLUSTER: + return "NEWCLUSTER"; + case CLUSTER_MEM_CONFACK: + return "CONFACK"; + case CLUSTER_MEM_NOMINATE: + return "NOMINATE"; + + default: + return "??UNKNOWN??"; + } +} + +#endif + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -urN linux-orig/cluster/cman/proc.c linux-patched/cluster/cman/proc.c --- linux-orig/cluster/cman/proc.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/proc.c 2004-06-29 20:07:50.000000000 +0800 @@ -0,0 +1,364 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cnxman-private.h" +#include "config.h" + +extern int cluster_members; +extern struct list_head cluster_members_list; +extern struct semaphore cluster_members_lock; +extern struct cluster_node *quorum_device; +extern int we_are_a_cluster_member; +extern int cluster_is_quorate; +extern unsigned short cluster_id; +extern atomic_t use_count; +extern unsigned int address_length; +extern unsigned int config_version; +extern char cluster_name[]; +extern struct cluster_node *us; +static struct seq_operations cluster_info_op; + +int sm_procdata(char *b, char **start, off_t offset, int length); +int sm_debug_info(char *b, char **start, off_t offset, int length); + +/* /proc interface to the configuration struct */ +static struct config_proc_info { + char *name; + int *value; +} config_proc[] = { + { + .name = "joinwait_timeout", + .value = &cman_config.joinwait_timeout, + }, + { + .name = "joinconf_timeout", + .value = &cman_config.joinconf_timeout, + }, + { + .name = "join_timeout", + .value = &cman_config.join_timeout, + }, + { + .name = "hello_timer", + .value = &cman_config.hello_timer, + }, + { + .name = "deadnode_timeout", + .value = &cman_config.deadnode_timeout, + }, + { + .name = "transition_timeout", + .value = &cman_config.transition_timeout, + }, + { + .name = "transition_restarts", + .value = &cman_config.transition_restarts, + }, + { + .name = "max_nodes", + .value = &cman_config.max_nodes, + }, + { + .name = "sm_debug_size", + .value = &cman_config.sm_debug_size, + }, +}; + + +static int proc_cluster_status(char *b, char **start, off_t offset, int length) +{ + struct list_head *nodelist; + struct cluster_node *node; + struct cluster_node_addr *node_addr; + unsigned int total_votes = 0; + unsigned int max_expected = 0; + int c = 0; + char node_buf[MAX_CLUSTER_MEMBER_NAME_LEN]; + + if (!we_are_a_cluster_member) { + c += sprintf(b+c, "Not a cluster member. State: %s\n", + membership_state(node_buf, + sizeof (node_buf))); + return c; + } + + /* Total the votes */ + down(&cluster_members_lock); + list_for_each(nodelist, &cluster_members_list) { + node = list_entry(nodelist, struct cluster_node, list); + if (node->state == NODESTATE_MEMBER) { + total_votes += node->votes; + max_expected = + max(max_expected, node->expected_votes); + } + } + up(&cluster_members_lock); + + if (quorum_device && quorum_device->state == NODESTATE_MEMBER) + total_votes += quorum_device->votes; + + c += sprintf(b+c, + "Version: %d.%d.%d\nConfig version: %d\nCluster name: %s\nCluster ID: %d\nMembership state: %s\n", + CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION, + CNXMAN_PATCH_VERSION, + config_version, + cluster_name, cluster_id, + membership_state(node_buf, sizeof (node_buf))); + c += sprintf(b+c, + "Nodes: %d\nExpected_votes: %d\nTotal_votes: %d\nQuorum: %d %s\n", + cluster_members, max_expected, total_votes, + get_quorum(), + cluster_is_quorate ? 
" " : "Activity blocked"); + c += sprintf(b+c, "Active subsystems: %d\n", + atomic_read(&use_count)); + + + c += sprintf(b+c, "Node addresses: "); + list_for_each_entry(node_addr, &us->addr_list, list) { + struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)node_addr->addr; + if (saddr->sin6_family == AF_INET6) { + c += sprintf(b+c, "%x:%x:%x:%x:%x:%x:%x:%x ", + be16_to_cpu(saddr->sin6_addr.s6_addr16[0]), + be16_to_cpu(saddr->sin6_addr.s6_addr16[1]), + be16_to_cpu(saddr->sin6_addr.s6_addr16[2]), + be16_to_cpu(saddr->sin6_addr.s6_addr16[3]), + be16_to_cpu(saddr->sin6_addr.s6_addr16[4]), + be16_to_cpu(saddr->sin6_addr.s6_addr16[5]), + be16_to_cpu(saddr->sin6_addr.s6_addr16[6]), + be16_to_cpu(saddr->sin6_addr.s6_addr16[7])); + } + else { + struct sockaddr_in *saddr4 = (struct sockaddr_in *)saddr; + uint8_t *addr = (uint8_t *)&saddr4->sin_addr; + c+= sprintf(b+c, "%u.%u.%u.%u ", + addr[0], addr[1], addr[2], addr[3]); + } + } + c += sprintf(b+c, "\n\n"); + return c; +} + + +/* Allocate one of these for /proc/cluster/nodes so we can keep a track of where + * we are */ +struct cluster_seq_info { + int nodeid; + int highest_nodeid; +}; + +static int cluster_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cluster_info_op); +} + +static void *cluster_seq_start(struct seq_file *m, loff_t * pos) +{ + struct cluster_seq_info *csi = + kmalloc(sizeof (struct cluster_seq_info), GFP_KERNEL); + + if (!csi) + return NULL; + + /* Keep highest_nodeid here so we don't need to keep traversing the + * list to find it */ + csi->nodeid = *pos; + csi->highest_nodeid = get_highest_nodeid(); + + /* Print the header */ + if (*pos == 0) { + seq_printf(m, + "Node Votes Exp Sts Name\n"); + return csi; + } + return csi; +} + +static void *cluster_seq_next(struct seq_file *m, void *p, loff_t * pos) +{ + struct cluster_seq_info *csi = p; + + *pos = ++csi->nodeid; + if (csi->nodeid > csi->highest_nodeid) + return NULL; + + return csi; +} + +static int cluster_seq_show(struct seq_file *m, void *p) +{ + char state = '?'; + struct cluster_node *node; + struct cluster_seq_info *csi = p; + + /* + * If we have "0" here then display the quorum device if + * there is one. 
+ */
+	if (csi->nodeid == 0)
+		node = quorum_device;
+	else
+		node = find_node_by_nodeid(csi->nodeid);
+
+	if (!node)
+		return 0;
+
+	/* Make state printable */
+	switch (node->state) {
+	case NODESTATE_MEMBER:
+		state = 'M';
+		break;
+	case NODESTATE_JOINING:
+		state = 'J';
+		break;
+	case NODESTATE_REMOTEMEMBER:
+		state = 'R';
+		break;
+	case NODESTATE_DEAD:
+		state = 'X';
+		break;
+	}
+	seq_printf(m, " %3d %3d %3d %c %s\n",
+		   node->node_id,
+		   node->votes,
+		   node->expected_votes,
+		   state,
+		   node->name);
+
+	return 0;
+}
+
+static void cluster_seq_stop(struct seq_file *m, void *p)
+{
+	kfree(p);
+}
+
+static struct seq_operations cluster_info_op = {
+	.start = cluster_seq_start,
+	.next = cluster_seq_next,
+	.stop = cluster_seq_stop,
+	.show = cluster_seq_show
+};
+
+static struct file_operations cluster_fops = {
+	.open = cluster_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static int cman_config_read_proc(char *page, char **start, off_t off, int count,
+				 int *eof, void *data)
+{
+	struct config_proc_info *cinfo = data;
+
+	return snprintf(page, count, "%d\n", *cinfo->value);
+}
+
+static int cman_config_write_proc(struct file *file, const char *buffer,
+				  unsigned long count, void *data)
+{
+	struct config_proc_info *cinfo = data;
+	int value;
+	char *end;
+
+	value = simple_strtoul(buffer, &end, 10);
+	if (*end) {
+		*cinfo->value = value;
+	}
+	return count;
+}
+
+/* Base of the config directory for cman */
+static struct proc_dir_entry *proc_cman_config;
+void create_proc_entries(void)
+{
+	struct proc_dir_entry *procentry;
+	struct proc_dir_entry *proc_cluster;
+	int i;
+
+	proc_cluster = proc_mkdir("cluster", 0);
+	if (!proc_cluster)
+		return;
+	proc_cluster->owner = THIS_MODULE;
+
+	/* Config dir filled in by us and others */
+	if (!proc_mkdir("cluster/config", 0))
+		return;
+
+	/* Don't much care if this fails, it's hardly vital */
+	procentry = create_proc_entry("cluster/nodes", S_IRUGO, NULL);
+	if (procentry)
+		procentry->proc_fops = &cluster_fops;
+
+	procentry = create_proc_entry("cluster/status", S_IRUGO, NULL);
+	if (procentry)
+		procentry->get_info = proc_cluster_status;
+
+	procentry = create_proc_entry("cluster/services", S_IRUGO, NULL);
+	if (procentry)
+		procentry->get_info = sm_procdata;
+
+	/* Config entries */
+	proc_cman_config = proc_mkdir("cluster/config/cman", 0);
+	if (!proc_cman_config)
+		return;
+
+	for (i=0; i<sizeof(config_proc) / sizeof(struct config_proc_info); i++) {
+		procentry = create_proc_entry(config_proc[i].name, 0660,
+					      proc_cman_config);
+		if (procentry) {
+			procentry->data = &config_proc[i];
+			procentry->write_proc = cman_config_write_proc;
+			procentry->read_proc = cman_config_read_proc;
+		}
+	}
+
+	procentry = create_proc_entry("cluster/sm_debug", S_IRUGO, NULL);
+	if (procentry)
+		procentry->get_info = sm_debug_info;
+}
+
+void cleanup_proc_entries(void)
+{
+	int i, config_count;
+
+	remove_proc_entry("cluster/sm_debug", NULL);
+
+	config_count = sizeof(config_proc) / sizeof(struct config_proc_info);
+
+	if (proc_cman_config) {
+		for (i=0; i<config_count; i++)
+			remove_proc_entry(config_proc[i].name,
+					  proc_cman_config);
+	}
+	remove_proc_entry("cluster/config/cman", NULL);
+	remove_proc_entry("cluster/services", NULL);
+	remove_proc_entry("cluster/status", NULL);
+	remove_proc_entry("cluster/nodes", NULL);
+	remove_proc_entry("cluster/config", NULL);
+	remove_proc_entry("cluster", NULL);
+}
diff -urN linux-orig/cluster/cman/sm.h linux-patched/cluster/cman/sm.h
--- linux-orig/cluster/cman/sm.h	1970-01-01 07:30:00.000000000 +0730
+++ linux-patched/cluster/cman/sm.h	2004-06-29 20:07:51.000000000 +0800
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_DOT_H__
+#define __SM_DOT_H__
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#define SG_LEVELS (4)
+
+#include "sm_internal.h"
+#include "sm_barrier.h"
+#include "sm_control.h"
+#include "sm_daemon.h"
+#include "sm_joinleave.h"
+#include "sm_membership.h"
+#include "sm_message.h"
+#include "sm_misc.h"
+#include "sm_recover.h"
+#include "sm_services.h"
+
+extern struct list_head sm_sg[SG_LEVELS];
+extern struct semaphore sm_sglock;
+
+#ifndef TRUE
+#define TRUE (1)
+#endif
+
+#ifndef FALSE
+#define FALSE (0)
+#endif
+
+#define SM_ASSERT(x, do) \
+{ \
+	if (!(x)) \
+	{ \
+		printk("\nSM: Assertion failed on line %d of file %s\n" \
+		       "SM:
assertion: \"%s\"\n" \ + "SM: time = %lu\n", \ + __LINE__, __FILE__, #x, jiffies); \ + {do} \ + printk("\n"); \ + panic("SM: Record message above and reboot.\n"); \ + } \ +} + +#define SM_RETRY(do_this, until_this) \ +for (;;) \ +{ \ + do { do_this; } while (0); \ + if (until_this) \ + break; \ + printk("SM: out of memory: %s, %u\n", __FILE__, __LINE__); \ + schedule();\ +} + + +#define log_print(fmt, args...) printk("SM: "fmt"\n", ##args) + +#define log_error(sg, fmt, args...) \ + printk("SM: %08x " fmt "\n", (sg)->global_id , ##args) + + +#define SM_DEBUG_LOG + +#ifdef SM_DEBUG_CONSOLE +#define log_debug(sg, fmt, args...) \ + printk("SM: %08x " fmt "\n", (sg)->global_id , ##args) +#endif + +#ifdef SM_DEBUG_LOG +#define log_debug(sg, fmt, args...) sm_debug_log(sg, fmt, ##args); +#endif + +#ifdef SM_DEBUG_ALL +#define log_debug(sg, fmt, args...) \ +do \ +{ \ + printk("SM: %08x "fmt"\n", (sg)->global_id, ##args); \ + sm_debug_log(sg, fmt, ##args); \ +} \ +while (0) +#endif + +#endif /* __SM_DOT_H__ */ diff -urN linux-orig/cluster/cman/sm_barrier.c linux-patched/cluster/cman/sm_barrier.c --- linux-orig/cluster/cman/sm_barrier.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_barrier.c 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,232 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "sm.h" + +static struct list_head barriers; +static spinlock_t barriers_lock; + +struct bc_entry { + struct list_head list; + uint32_t gid; + int status; + char type; +}; +typedef struct bc_entry bc_entry_t; + +void init_barriers(void) +{ + INIT_LIST_HEAD(&barriers); + spin_lock_init(&barriers_lock); +} + +static int atoi(char *c) +{ + int x = 0; + + while ('0' <= *c && *c <= '9') { + x = x * 10 + (*c - '0'); + c++; + } + return x; +} + +static void add_barrier_callback(char *name, int status, int type) +{ + char *p; + uint32_t gid; + bc_entry_t *be; + + /* an ESRCH callback just means there was a cnxman transition */ + if (status == -ESRCH) + return; + + /* extract global id of SG from barrier name */ + p = strstr(name, "sm."); + + SM_ASSERT(p, printk("name=\"%s\" status=%d\n", name, status);); + + p += strlen("sm."); + gid = atoi(p); + + SM_RETRY(be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC), be); + + be->gid = gid; + be->status = status; + be->type = type; + + spin_lock(&barriers_lock); + list_add_tail(&be->list, &barriers); + spin_unlock(&barriers_lock); + + wake_serviced(DO_BARRIERS); +} + +static void callback_recovery_barrier(char *name, int status) +{ + add_barrier_callback(name, status, SM_BARRIER_RECOVERY); +} + +static void callback_startdone_barrier_new(char *name, int status) +{ + add_barrier_callback(name, status, SM_BARRIER_STARTDONE_NEW); +} + +static void callback_startdone_barrier(char *name, int status) +{ + add_barrier_callback(name, status, SM_BARRIER_STARTDONE); +} + +int sm_barrier(char *name, int count, int type) +{ + int error; + unsigned long fn = 0; + + 
switch (type) { + case SM_BARRIER_STARTDONE: + fn = (unsigned long) callback_startdone_barrier; + break; + case SM_BARRIER_STARTDONE_NEW: + fn = (unsigned long) callback_startdone_barrier_new; + break; + case SM_BARRIER_RECOVERY: + fn = (unsigned long) callback_recovery_barrier; + break; + } + + error = kcl_barrier_register(name, 0, count); + if (error) { + log_print("barrier register error %d", error); + goto fail; + } + + error = kcl_barrier_setattr(name, BARRIER_SETATTR_AUTODELETE, TRUE); + if (error) { + log_print("barrier setattr autodel error %d", error); + goto fail_bar; + } + + error = kcl_barrier_setattr(name, BARRIER_SETATTR_CALLBACK, fn); + if (error) { + log_print("barrier setattr cb error %d", error); + goto fail_bar; + } + + error = kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, TRUE); + if (error) { + log_print("barrier setattr enabled error %d", error); + goto fail_bar; + } + + return 0; + + fail_bar: + kcl_barrier_delete(name); + fail: + return error; +} + +void process_startdone_barrier_new(sm_group_t *sg, int status) +{ + sm_sevent_t *sev = sg->sevent; + + if (!test_and_clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags)) { + log_debug(sev->se_sg, "ignore barrier cb status %d", status); + return; + } + + sev->se_barrier_status = status; + sev->se_state = SEST_BARRIER_DONE; + set_bit(SEFL_CHECK, &sev->se_flags); + wake_serviced(DO_JOINLEAVE); +} + +void process_startdone_barrier(sm_group_t *sg, int status) +{ + sm_uevent_t *uev = &sg->uevent; + + if (!test_and_clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags)) { + log_debug(sg, "ignore barrier cb status %d", status); + return; + } + + uev->ue_barrier_status = status; + uev->ue_state = UEST_BARRIER_DONE; + set_bit(UEFL_CHECK, &uev->ue_flags); + wake_serviced(DO_MEMBERSHIP); +} + +void process_recovery_barrier(sm_group_t *sg, int status) +{ + if (status) { + log_error(sg, "process_recovery_barrier status=%d", status); + return; + } + + if (sg->state != SGST_RECOVER || + sg->recover_state != RECOVER_BARRIERWAIT) { + log_error(sg, "process_recovery_barrier state %d recover %d", + sg->state, sg->recover_state); + return; + } + + if (!sg->recover_stop) + sg->recover_state = RECOVER_STOP; + else + sg->recover_state = RECOVER_BARRIERDONE; + + wake_serviced(DO_RECOVERIES); +} + +void process_barriers(void) +{ + sm_group_t *sg; + bc_entry_t *be; + + while (1) { + be = NULL; + + spin_lock(&barriers_lock); + if (!list_empty(&barriers)) { + be = list_entry(barriers.next, bc_entry_t, list); + list_del(&be->list); + } + spin_unlock(&barriers_lock); + + if (!be) + break; + + sg = sm_global_id_to_sg(be->gid); + if (!sg) { + log_print("process_barriers: no sg %08x", be->gid); + break; + } + + switch (be->type) { + case SM_BARRIER_STARTDONE_NEW: + process_startdone_barrier_new(sg, be->status); + break; + + case SM_BARRIER_STARTDONE: + process_startdone_barrier(sg, be->status); + break; + + case SM_BARRIER_RECOVERY: + process_recovery_barrier(sg, be->status); + break; + } + + kfree(be); + schedule(); + } +} diff -urN linux-orig/cluster/cman/sm_barrier.h linux-patched/cluster/cman/sm_barrier.h --- linux-orig/cluster/cman/sm_barrier.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_barrier.h 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,29 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. 
All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __SM_BARRIER_DOT_H__ +#define __SM_BARRIER_DOT_H__ + +#define SM_BARRIER_STARTDONE (0) +#define SM_BARRIER_STARTDONE_NEW (1) +#define SM_BARRIER_RECOVERY (2) +#define SM_BARRIER_RESET (3) + +void init_barriers(void); +void process_barriers(void); +int sm_barrier(char *name, int count, int type); +void process_startdone_barrier(sm_group_t *sg, int status); +void process_startdone_barrier_new(sm_group_t *sg, int status); +void process_recovery_barrier(sm_group_t *sg, int status); + +#endif diff -urN linux-orig/cluster/cman/sm_control.c linux-patched/cluster/cman/sm_control.c --- linux-orig/cluster/cman/sm_control.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_control.c 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,156 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "sm.h" +#include "config.h" + +struct socket * sm_socket; +uint32_t * sm_new_nodeids; +uint32_t sm_our_nodeid; +int sm_quorum, sm_quorum_next; +struct list_head sm_members; +int sm_member_count; + + +/* + * Context: cnxman + * Called by cnxman when it has a new member list. + */ + +void sm_member_update(int quorate) +{ + sm_quorum_next = quorate; + wake_serviced(DO_START_RECOVERY); +} + +/* + * Context: cnxman + * Called when module is loaded. + */ + +void sm_init(void) +{ + sm_socket = NULL; + sm_new_nodeids = NULL; + sm_quorum = 0; + sm_quorum_next = 0; + sm_our_nodeid = 0; + INIT_LIST_HEAD(&sm_members); + sm_member_count = 0; + + init_services(); + init_messages(); + init_barriers(); + init_serviced(); + init_recovery(); + init_joinleave(); + init_sm_misc(); +} + +/* + * Context: cnxman + * Called at beginning of cluster join procedure. 
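sm_start() below follows the usual datagram-socket bring-up: create the socket, bind it to the service's well-known port, then hand incoming traffic to a callback. The same shape in userspace terms, with AF_INET and an arbitrary port standing in for AF_CLUSTER and CLUSTER_PORT_SERVICES (illustration only, not part of the patch):

```c
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in saddr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0) {
                perror("can't create socket");
                return 1;
        }

        memset(&saddr, 0, sizeof(saddr));
        saddr.sin_family = AF_INET;
        saddr.sin_port = htons(21064);          /* made-up port number */

        if (bind(fd, (struct sockaddr *) &saddr, sizeof(saddr)) < 0) {
                perror("can't bind socket");
                close(fd);
                return 1;
        }

        /* The kernel code registers sm_cluster_message() as a read
         * callback at this point instead of recvfrom()ing in a loop. */
        close(fd);
        return 0;
}
```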
+ */ + +void sm_start(void) +{ + struct sockaddr_cl saddr; + struct socket *sock; + int result; + + /* Create a communication channel among service managers */ + + result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock); + if (result < 0) { + log_print("can't create socket %d", result); + goto fail; + } + + sm_socket = sock; + + saddr.scl_family = AF_CLUSTER; + saddr.scl_port = CLUSTER_PORT_SERVICES; + + result = sock->ops->bind(sock, (struct sockaddr *) &saddr, + sizeof(saddr)); + if (result < 0) { + log_print("can't bind socket %d", result); + goto fail_release; + } + + result = kcl_register_read_callback(sm_socket, sm_cluster_message); + if (result < 0) { + log_print("can't register read callback %d", result); + goto fail_release; + } + + sm_new_nodeids = (uint32_t *) kmalloc(cman_config.max_nodes * + sizeof(uint32_t), + GFP_KERNEL); + start_serviced(); + + /* cnxman should call sm_member_update() once we've joined - then we + * can get our first list of members and our own nodeid */ + + return; + + fail_release: + sock_release(sm_socket); + sm_socket = NULL; + + fail: + return; +} + +/* + * Context: cnxman + * Called before cnxman leaves the cluster. If this returns an error to cman, + * cman should not leave the cluster but return EBUSY. + * If force is set we go away anyway. cman knows best in this case + */ + +int sm_stop(int force) +{ + struct list_head *head; + sm_group_t *sg; + sm_node_t *node; + int i, busy = FALSE, error = -EBUSY; + + for (i = 0; i < SG_LEVELS; i++) { + if (!list_empty(&sm_sg[i])) { + sg = list_entry(sm_sg[i].next, sm_group_t, list); + log_error(sg, "sm_stop: SG still joined"); + busy = TRUE; + } + } + + if (!busy || force) { + stop_serviced(); + + if (sm_socket) + sock_release(sm_socket); + + head = &sm_members; + while (!list_empty(head)) { + node = list_entry(head->next, sm_node_t, list); + list_del(&node->list); + sm_member_count--; + kfree(node); + } + + kfree(sm_new_nodeids); + sm_init(); + error = 0; + } + return error; +} diff -urN linux-orig/cluster/cman/sm_control.h linux-patched/cluster/cman/sm_control.h --- linux-orig/cluster/cman/sm_control.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_control.h 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,22 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __SM_CONTROL_DOT_H__ +#define __SM_CONTROL_DOT_H__ + +void sm_init(void); +void sm_start(void); +int sm_stop(int force); +void sm_member_update(int quorate); + +#endif diff -urN linux-orig/cluster/cman/sm_daemon.c linux-patched/cluster/cman/sm_daemon.c --- linux-orig/cluster/cman/sm_daemon.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_daemon.c 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,120 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "sm.h" + +static unsigned long daemon_flags; +static struct task_struct * daemon_task; +static struct completion daemon_done; +static wait_queue_head_t daemon_wait; +extern int sm_quorum; + +void init_serviced(void) +{ + daemon_flags = 0; + daemon_task = NULL; + init_completion(&daemon_done); + init_waitqueue_head(&daemon_wait); +} + +void wake_serviced(int do_flag) +{ + set_bit(do_flag, &daemon_flags); + wake_up(&daemon_wait); +} + +static inline int got_work(void) +{ + int rv = 0; + + rv = (test_bit(DO_START_RECOVERY, &daemon_flags) || + test_bit(DO_MESSAGES, &daemon_flags) || + test_bit(DO_BARRIERS, &daemon_flags) || + test_bit(DO_CALLBACKS, &daemon_flags)); + + if (sm_quorum && !rv) + rv = (test_bit(DO_JOINLEAVE, &daemon_flags) || + test_bit(DO_RECOVERIES, &daemon_flags) || + test_bit(DO_MEMBERSHIP, &daemon_flags)); + return rv; +} + +static int serviced(void *arg) +{ + DECLARE_WAITQUEUE(wait, current); + + daemonize("cman_serviced"); + daemon_task = current; + set_bit(DO_RUN, &daemon_flags); + complete(&daemon_done); + + for (;;) { + if (test_and_clear_bit(DO_START_RECOVERY, &daemon_flags)) + process_nodechange(); + + if (test_and_clear_bit(DO_MESSAGES, &daemon_flags)) + process_messages(); + + if (test_and_clear_bit(DO_BARRIERS, &daemon_flags)) + process_barriers(); + + if (test_and_clear_bit(DO_CALLBACKS, &daemon_flags)) + process_callbacks(); + + if (sm_quorum) { + if (test_and_clear_bit(DO_RECOVERIES, &daemon_flags)) + process_recoveries(); + + if (test_and_clear_bit(DO_JOINLEAVE, &daemon_flags)) + process_joinleave(); + + if (test_and_clear_bit(DO_MEMBERSHIP, &daemon_flags)) + process_membership(); + } + + if (!test_bit(DO_RUN, &daemon_flags)) + break; + + current->state = TASK_INTERRUPTIBLE; + add_wait_queue(&daemon_wait, &wait); + if (!got_work() && test_bit(DO_RUN, &daemon_flags)) + schedule(); + remove_wait_queue(&daemon_wait, &wait); + current->state = TASK_RUNNING; + } + + complete(&daemon_done); + return 0; +} + +int start_serviced(void) +{ + int error; + + error = kernel_thread(serviced, NULL, 0); + if (error < 0) + goto out; + + error = 0; + wait_for_completion(&daemon_done); + + out: + return error; +} + +void stop_serviced(void) +{ + clear_bit(DO_RUN, &daemon_flags); + wake_up(&daemon_wait); + 
wait_for_completion(&daemon_done); +} diff -urN linux-orig/cluster/cman/sm_daemon.h linux-patched/cluster/cman/sm_daemon.h --- linux-orig/cluster/cman/sm_daemon.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_daemon.h 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,32 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __SM_DAEMON_DOT_H__ +#define __SM_DAEMON_DOT_H__ + +#define DO_RUN (0) +#define DO_START_RECOVERY (1) +#define DO_MESSAGES (2) +#define DO_BARRIERS (3) +#define DO_CALLBACKS (4) +#define DO_JOINLEAVE (5) +#define DO_RECOVERIES (6) +#define DO_MEMBERSHIP (7) +#define DO_RESET (8) + +void init_serviced(void); +void wake_serviced(int do_flag); +void stop_serviced(void); +int start_serviced(void); + +#endif diff -urN linux-orig/cluster/cman/sm_internal.h linux-patched/cluster/cman/sm_internal.h --- linux-orig/cluster/cman/sm_internal.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_internal.h 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,230 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __SM_INTERNAL_DOT_H__ +#define __SM_INTERNAL_DOT_H__ + +/* + * Any header files needed by this file should be included before it in sm.h. + * This file should only be included by sm.h. 
+ */ + +struct sm_group; +struct sm_sevent; +struct sm_uevent; +struct sm_node; +struct sm_msg; + +typedef struct sm_group sm_group_t; +typedef struct sm_sevent sm_sevent_t; +typedef struct sm_uevent sm_uevent_t; +typedef struct sm_node sm_node_t; +typedef struct sm_msg sm_msg_t; + + +/* + * Number of seconds to wait before trying again to join or leave an SG + */ +#define RETRY_DELAY (2) + + +/* + * Service Event - what a node uses to join or leave an sg + */ + +/* SE Flags */ +#define SEFL_CHECK (0) +#define SEFL_ALLOW_JOIN (1) +#define SEFL_ALLOW_JSTOP (2) +#define SEFL_ALLOW_LEAVE (3) +#define SEFL_ALLOW_LSTOP (4) +#define SEFL_ALLOW_STARTDONE (5) +#define SEFL_ALLOW_BARRIER (6) +#define SEFL_DELAY (7) +#define SEFL_LEAVE (8) +#define SEFL_CANCEL (9) + +/* SE States */ +#define SEST_JOIN_BEGIN (1) +#define SEST_JOIN_ACKWAIT (2) +#define SEST_JOIN_ACKED (3) +#define SEST_JSTOP_ACKWAIT (4) +#define SEST_JSTOP_ACKED (5) +#define SEST_JSTART_SERVICEWAIT (6) +#define SEST_JSTART_SERVICEDONE (7) +#define SEST_BARRIER_WAIT (8) +#define SEST_BARRIER_DONE (9) +#define SEST_LEAVE_BEGIN (10) +#define SEST_LEAVE_ACKWAIT (11) +#define SEST_LEAVE_ACKED (12) +#define SEST_LSTOP_ACKWAIT (13) +#define SEST_LSTOP_ACKED (14) +#define SEST_LSTART_WAITREMOTE (15) +#define SEST_LSTART_REMOTEDONE (16) + +struct sm_sevent { + struct list_head se_list; + unsigned int se_id; + sm_group_t * se_sg; + unsigned long se_flags; + unsigned int se_state; + + int se_node_count; + int se_memb_count; + int se_reply_count; + + uint32_t * se_node_ids; + char * se_node_status; + int se_len_ids; /* length of node_ids */ + int se_len_status; /* length of node_status */ + + int se_barrier_status; + struct timer_list se_restart_timer; +}; + +/* + * Update Event - what an sg member uses to respond to an sevent + */ + +/* UE Flags */ +#define UEFL_ALLOW_STARTDONE (0) +#define UEFL_ALLOW_BARRIER (1) +#define UEFL_CANCEL (2) +#define UEFL_LEAVE (3) +#define UEFL_CHECK (4) + +/* UE States */ +#define UEST_JSTOP (1) +#define UEST_JSTART_WAITCMD (2) +#define UEST_JSTART (3) +#define UEST_JSTART_SERVICEWAIT (4) +#define UEST_JSTART_SERVICEDONE (5) +#define UEST_BARRIER_WAIT (6) +#define UEST_BARRIER_DONE (7) +#define UEST_LSTOP (8) +#define UEST_LSTART_WAITCMD (9) +#define UEST_LSTART (10) +#define UEST_LSTART_SERVICEWAIT (11) +#define UEST_LSTART_SERVICEDONE (12) + +struct sm_uevent { + unsigned int ue_state; + unsigned long ue_flags; + uint32_t ue_id; + uint32_t ue_nodeid; + int ue_num_nodes; + int ue_barrier_status; + uint16_t ue_remote_seid; +}; + +/* + * Service Group + */ + +#define RECOVER_NONE (0) +#define RECOVER_STOP (1) +#define RECOVER_START (2) +#define RECOVER_STARTDONE (3) +#define RECOVER_BARRIERWAIT (4) +#define RECOVER_BARRIERDONE (5) + +/* SG Flags */ +#define SGFL_SEVENT (1) +#define SGFL_UEVENT (2) +#define SGFL_NEED_RECOVERY (3) + +/* SG States */ +#define SGST_NONE (0) +#define SGST_JOIN (1) +#define SGST_RUN (2) +#define SGST_RECOVER (3) +#define SGST_UEVENT (4) + +struct sm_group { + struct list_head list; /* list of sg's */ + uint16_t level; + uint32_t local_id; + uint32_t global_id; + unsigned long flags; + int state; + int refcount; /* references from reg/unreg */ + void * service_data; /* data from the service */ + struct kcl_service_ops *ops; /* ops from the service */ + struct completion event_comp; + + struct list_head memb; /* Membership List for RC */ + int memb_count; /* number of nodes in memb */ + struct list_head joining; /* nodes joining the sg */ + sm_sevent_t * sevent; + sm_uevent_t uevent; + + 
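+	/* recovery progress for this sg: one of the RECOVER_* values above */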
int recover_state; + int recover_stop; + struct list_head recover_list; /* recovery event list */ + void * recover_data; + char recover_barrier[MAX_BARRIER_NAME_LEN]; + + int namelen; + char name[1]; /* must be last field */ +}; + +/* + * Service Message + */ + +/* SMSG Type */ +#define SMSG_JOIN_REQ (1) +#define SMSG_JOIN_REP (2) +#define SMSG_JSTOP_REQ (3) +#define SMSG_JSTOP_REP (4) +#define SMSG_JSTART_CMD (5) +#define SMSG_LEAVE_REQ (6) +#define SMSG_LEAVE_REP (7) +#define SMSG_LSTOP_REQ (8) +#define SMSG_LSTOP_REP (9) +#define SMSG_LSTART_CMD (10) +#define SMSG_LSTART_DONE (11) +#define SMSG_RECOVER (12) + +/* SMSG Status */ +#define STATUS_POS (1) +#define STATUS_NEG (2) +#define STATUS_WAIT (3) + +struct sm_msg { + uint8_t ms_type; + uint8_t ms_status; + uint16_t ms_sevent_id; + uint32_t ms_global_sgid; + uint32_t ms_global_lastid; + uint16_t ms_sglevel; + uint16_t ms_length; + /* buf of ms_length bytes follows */ +}; + +/* + * Node structure + */ + +#define SNFL_NEED_RECOVERY (0) +#define SNFL_CLUSTER_MEMBER (1) +#define SNFL_LEAVING (2) + +struct sm_node { + struct list_head list; + uint32_t id; /* node id from cnxman */ + unsigned long flags; + int incarnation; /* node incarnation number */ +}; + +#endif /* __SM_INTERNAL_DOT_H__ */ diff -urN linux-orig/cluster/cman/sm_joinleave.c linux-patched/cluster/cman/sm_joinleave.c --- linux-orig/cluster/cman/sm_joinleave.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_joinleave.c 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,1286 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "sm.h" + +/* + * Routines used by nodes that are joining or leaving a SG. These "sevent" + * routines initiate membership changes to a SG. Existing SG members respond + * using the "uevent" membership update routines. 
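+ *
+ * The matching "uevent" handlers live in sm_membership.c.  As a rough
+ * sketch (using only names that appear in this code; the exact prototypes
+ * are declared elsewhere in this patch), a service sees the machinery as:
+ *
+ *   kcl_join_service()  - creates an sevent in state JOIN_BEGIN; the SG is
+ *                         usable once ops->start(), kcl_start_done() and
+ *                         ops->finish() have run on all members
+ *   kcl_leave_service() - creates an sevent in state LEAVE_BEGIN
+ *
+ * and the initiating process is woken via sg->event_comp when
+ * sevent_done() completes the sevent.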
+ */ + +extern uint32_t sm_our_nodeid; +extern struct list_head sm_members; +static struct list_head new_event; +static spinlock_t new_event_lock; +static struct list_head joinleave_events; + +void init_joinleave(void) +{ + INIT_LIST_HEAD(&new_event); + spin_lock_init(&new_event_lock); + INIT_LIST_HEAD(&joinleave_events); +} + +void new_joinleave(sm_sevent_t *sev) +{ + spin_lock(&new_event_lock); + list_add_tail(&sev->se_list, &new_event); + spin_unlock(&new_event_lock); + wake_serviced(DO_JOINLEAVE); +} + +sm_sevent_t *find_sevent(unsigned int id) +{ + sm_sevent_t *sev; + + list_for_each_entry(sev, &joinleave_events, se_list) { + if (sev->se_id == id) + return sev; + } + return NULL; +} + +static void release_sevent(sm_sevent_t *sev) +{ + if (sev->se_len_ids) { + kfree(sev->se_node_ids); + sev->se_node_ids = NULL; + } + + if (sev->se_len_status) { + kfree(sev->se_node_status); + sev->se_node_status = NULL; + } + + sev->se_node_count = 0; + sev->se_memb_count = 0; + sev->se_reply_count = 0; +} + +static int init_sevent(sm_sevent_t *sev) +{ + sm_node_t *node; + int len1, len2, count, cluster_members = 0; + + /* clear state from any previous attempt */ + release_sevent(sev); + + list_for_each_entry(node, &sm_members, list) { + if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags)) + cluster_members++; + } + + sev->se_node_count = cluster_members; + sev->se_memb_count = sev->se_sg->memb_count; + + /* + * When joining, we need a node array the size of the entire cluster + * member list because we get responses from all nodes. When leaving, + * we only get responses from SG members, so the node array need only + * be that large. + */ + + if (sev->se_state < SEST_LEAVE_BEGIN) + count = sev->se_node_count; + else + count = sev->se_memb_count; + + len1 = count * sizeof(uint32_t); + sev->se_len_ids = len1; + + sev->se_node_ids = (uint32_t *) kmalloc(len1, GFP_KERNEL); + if (!sev->se_node_ids) + goto fail; + + len2 = count * sizeof (char); + sev->se_len_status = len2; + + sev->se_node_status = (char *) kmalloc(len2, GFP_KERNEL); + if (!sev->se_node_status) + goto fail_free; + + memset(sev->se_node_status, 0, len2); + memset(sev->se_node_ids, 0, len1); + + return 0; + + fail_free: + kfree(sev->se_node_ids); + sev->se_node_ids = NULL; + sev->se_len_ids = 0; + + fail: + return -ENOMEM; +} + +/* Context: timer */ + +static void sev_restart(unsigned long data) +{ + sm_sevent_t *sev = (sm_sevent_t *) data; + + clear_bit(SEFL_DELAY, &sev->se_flags); + set_bit(SEFL_CHECK, &sev->se_flags); + wake_serviced(DO_JOINLEAVE); +} + +static void schedule_sev_restart(sm_sevent_t *sev) +{ + init_timer(&sev->se_restart_timer); + sev->se_restart_timer.function = sev_restart; + sev->se_restart_timer.data = (long) sev; + mod_timer(&sev->se_restart_timer, jiffies + (RETRY_DELAY * HZ)); +} + +void free_sg_memb(sm_group_t *sg) +{ + sm_node_t *node; + + while (!list_empty(&sg->memb)) { + node = list_entry(sg->memb.next, sm_node_t, list); + list_del(&node->list); + kfree(node); + } + sg->memb_count = 0; +} + +/* + * 1. First step in joining a SG - send a message to all nodes in the cluster + * asking to join the named SG. If any nodes are members they will reply with + * a POS, or a WAIT (wait means try again, only one node can join at a time). + * If no one knows about this SG, they all send NEG replies which means we form + * the SG with just ourself as a member. 
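+ * A WAIT reply makes us back off and retry the whole join attempt after
+ * RETRY_DELAY seconds.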
+ */ + +static int send_join_notice(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + sm_node_t *node; + char *msg; + int i = 0, error, namelen, len = 0; + + /* + * Create node array from member list in which to collect responses. + */ + + error = init_sevent(sev); + if (error) + goto out; + + list_for_each_entry(node, &sm_members, list) { + if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags)) + sev->se_node_ids[i++] = node->id; + } + + /* + * Create and send a join request message. + * + * Other nodes then run process_join_request and reply to us; we + * collect the responses in process_reply and check them in + * check_join_notice. + */ + + namelen = sg->namelen; + msg = create_smsg(sg, SMSG_JOIN_REQ, namelen, &len, sev); + memcpy(msg + sizeof(sm_msg_t), sg->name, namelen); + + error = send_broadcast_message_sev(msg, len, sev); + + out: + return error; +} + +/* + * 2. Second step in joining a SG - after we collect all replies to our join + * request, we look at them. If anyone told us to wait, we'll wait a while, go + * back and start at step 1 again. + */ + +static int check_join_notice(sm_sevent_t *sev) +{ + int pos = 0, wait = 0, neg = 0, restart = 0, i, error = 0; + + for (i = 0; i < sev->se_node_count; i++) { + switch (sev->se_node_status[i]) { + case STATUS_POS: + /* this node is in the SG and will be in new proposed + * memb list */ + pos++; + break; + + case STATUS_WAIT: + /* this node is in the SG but something else is + * happening with it at the moment. */ + wait++; + break; + + case STATUS_NEG: + /* this node has no record of the SG we're interested + * in */ + neg++; + + if (sev->se_node_ids[i] == sm_our_nodeid) + sev->se_node_status[i] = STATUS_POS; + break; + + default: + /* we didn't get a valid response from this node, + * restart the entire sev. */ + restart++; + break; + } + } + + if (pos && !wait && !restart) { + /* all current members of this sg pos'ed our entry */ + } else if (!pos && !wait && !restart && neg) { + /* we're the first in the cluster to join this sg */ + sev->se_sg->global_id = sm_new_global_id(sev->se_sg->level); + } else + error = -1; + + return error; +} + +/* + * 3. Third step in joining the SG - tell the nodes that are already members + * to "stop" the service. We stop them so that everyone can restart with the + * new member (us!) added. + */ + +static int send_join_stop(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + sm_node_t *node; + char *msg; + uint32_t be_count; + int i, len = 0, error = 0; + + /* + * Form the SG memb list with us in it. + */ + + for (i = 0; i < sev->se_node_count; i++) { + if (sev->se_node_status[i] != STATUS_POS) + continue; + + node = sm_new_node(sev->se_node_ids[i]); + if (!node) + goto fail; + + list_add_tail(&node->list, &sg->memb); + sg->memb_count++; + } + + /* + * Re-init the node vector in which to collect responses again. + */ + + sev->se_memb_count = sg->memb_count; + + memset(sev->se_node_status, 0, sev->se_len_status); + memset(sev->se_node_ids, 0, sev->se_len_ids); + i = 0; + + list_for_each_entry(node, &sg->memb, list) + sev->se_node_ids[i++] = node->id; + + /* + * Create and send a stop message. + * + * Other nodes then run process_stop_request and process_join_stop and + * reply to us. They stop the sg we're trying to join if they agree. + * We collect responses in process_reply and check them in + * check_join_stop. 
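+	 * The proposed member count is carried in the message so each member
+	 * can verify it against its own view (see process_join_stop).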
+ */ + + msg = create_smsg(sg, SMSG_JSTOP_REQ, sizeof(uint32_t), &len, sev); + be_count = cpu_to_be32(sg->memb_count); + memcpy(msg + sizeof(sm_msg_t), &be_count, sizeof(uint32_t)); + + error = send_members_message_sev(sg, msg, len, sev); + if (error < 0) + goto fail; + + return 0; + + fail: + free_sg_memb(sg); + return error; +} + +/* + * 4. Fourth step in joining the SG - after we collect replies to our stop + * request, we look at them. Everyone sending POS agrees with us joining and + * has stopped their SG. If some nodes sent NEG, something is wrong and we + * don't have a good way to address that yet since some nodes may have sent + * POS. + * + * FIXME: even nodes replying with NEG should stop their SG so we can send an + * abort and have everyone at the same place to start from again. + */ + +static int check_join_stop(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + int i, pos = 0, neg = 0; + + for (i = 0; i < sev->se_memb_count; i++) { + switch (sev->se_node_status[i]) { + case STATUS_POS: + pos++; + break; + + case STATUS_NEG: + log_error(sg, "check_join_stop: neg from nodeid %u " + "(%d, %d, %u)", sev->se_node_ids[i], + pos, neg, sev->se_memb_count); + neg++; + break; + + default: + log_error(sg, "check_join_stop: unknown status=%u " + "nodeid=%u", sev->se_node_status[i], + sev->se_node_ids[i]); + neg++; + break; + } + } + + if (pos == sg->memb_count) + return 0; + + free_sg_memb(sg); + return -1; +} + +/* + * 5. Fifth step in joining the SG - everyone has stopped their service and we + * all now start the service with us, the new member, added to the SG member + * list. We send start to our own service here and send a message to the other + * members that they should also start their service. + */ + +static int send_join_start(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + sm_node_t *node; + uint32_t *memb; + char *msg; + int error, count = 0, len = 0; + + /* + * Create a start message and send it. + */ + + msg = create_smsg(sg, SMSG_JSTART_CMD, 0, &len, sev); + + error = send_members_message(sg, msg, len); + if (error < 0) + goto fail; + + /* + * Start the service ourself. The chunk of memory with the member ids + * must be freed by the service when it is done with it. + */ + + SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL), + memb); + + list_for_each_entry(node, &sg->memb, list) + memb[count++] = node->id; + + set_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags); + + sg->ops->start(sg->service_data, memb, count, sev->se_id, + SERVICE_NODE_JOIN); + return 0; + + fail: + free_sg_memb(sg); + return error; +} + +/* + * 6. Sixth step in joining the SG - once the service has completed its start, + * it does a kcl_start_done() to signal us that it's done. That gets us here + * and we do a barrier with all other members which join the barrier when their + * service is done starting. 
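+ * The barrier is named "sm.<global_id>.<nodeid>.<event_id>.<memb_count>"
+ * so every member independently derives the same name for this join.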
+ */
+
+static int startdone_barrier_new(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+	char bname[MAX_BARRIER_NAME_LEN];
+	int error;
+
+	memset(bname, 0, MAX_BARRIER_NAME_LEN);
+	sev->se_barrier_status = -1;
+
+	set_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
+
+	/* If we're the only member, skip the barrier */
+	if (sg->memb_count == 1) {
+		process_startdone_barrier_new(sg, 0);
+		return 0;
+	}
+
+	snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
+		 sg->global_id, sm_our_nodeid, sev->se_id, sg->memb_count);
+
+	error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE_NEW);
+	if (error)
+		goto fail;
+
+	return 0;
+
+ fail:
+	clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
+	sg->ops->stop(sg->service_data);
+	free_sg_memb(sg);
+	return error;
+}
+
+/*
+ * 7. Seventh step in joining the SG - check that the barrier we joined with
+ * all other members returned with a successful status.
+ */
+
+static int check_startdone_barrier_new(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+	int error = sev->se_barrier_status;
+
+	if (error) {
+		sg->ops->stop(sg->service_data);
+		free_sg_memb(sg);
+	}
+	return error;
+}
+
+/*
+ * 8. Eighth step in joining the SG - send the service a "finish" indicating
+ * that all members have successfully started the service.
+ */
+
+static void do_finish_new(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+
+	sg->state = SGST_RUN;
+	sg->sevent = NULL;
+	clear_bit(SGFL_SEVENT, &sg->flags);
+
+	sg->ops->finish(sg->service_data, sev->se_id);
+}
+
+/*
+ * 9. Ninth step in joining the SG - it's done so get rid of the sevent stuff
+ * and tell the process which initiated the join that it's done.
+ */
+
+static void sevent_done(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+
+	list_del(&sev->se_list);
+	release_sevent(sev);
+	kfree(sev);
+	complete(&sg->event_comp);
+}
+
+/*
+ * Move through the steps of a join. Summary:
+ *
+ * 1. Send a join notice to all cluster members.
+ * 2. Collect and check replies to the join notice.
+ * 3. Send a stop message to all SG members.
+ * 4. Collect and check replies to the stop message.
+ * 5. Send a start message to all SG members and start service ourself.
+ * 6. Use barrier to wait for all nodes to complete the start.
+ * 7. Check that all SG members joined the barrier.
+ * 8. Send finish to the service indicating that all nodes started it.
+ * 9. Clean up sevent and signal completion to the process that started the join.
+ */
+
+static void process_join_sevent(sm_sevent_t *sev)
+{
+	int error = 0;
+
+	/*
+	 * We may cancel the current join attempt if another node is also
+	 * attempting to join or leave. (Only a single node can join or leave
+	 * at once.) If cancelled, our join attempt will be restarted later.
+	 */
+
+	if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
+		error = -1;
+		goto cancel;
+	}
+
+	log_debug(sev->se_sg, "sevent state %u", sev->se_state);
+
+	switch (sev->se_state) {
+
+	/*
+	 * An sevent is created in kcl_join_service with a state of
+	 * JOIN_BEGIN.
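+	 * Each case below sends the next message and advances se_state; the
+	 * *_ACKWAIT to *_ACKED transitions happen asynchronously as replies
+	 * arrive in process_reply.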
+ */
+
+	case SEST_JOIN_BEGIN:
+		sev->se_state = SEST_JOIN_ACKWAIT;
+		error = send_join_notice(sev);
+		break;
+
+	/*
+	 * se_state is changed from JOIN_ACKWAIT to JOIN_ACKED in
+	 * process_reply (when all the replies have been received)
+	 */
+
+	case SEST_JOIN_ACKED:
+		error = check_join_notice(sev);
+		if (error)
+			break;
+
+		sev->se_state = SEST_JSTOP_ACKWAIT;
+		error = send_join_stop(sev);
+		break;
+
+	/*
+	 * se_state is changed from JSTOP_ACKWAIT to JSTOP_ACKED in
+	 * process_reply (when all the replies have been received)
+	 */
+
+	case SEST_JSTOP_ACKED:
+		error = check_join_stop(sev);
+		if (error)
+			break;
+
+		sev->se_state = SEST_JSTART_SERVICEWAIT;
+		error = send_join_start(sev);
+		break;
+
+	/*
+	 * se_state is changed from JSTART_SERVICEWAIT to
+	 * JSTART_SERVICEDONE in kcl_start_done
+	 */
+
+	case SEST_JSTART_SERVICEDONE:
+		sev->se_state = SEST_BARRIER_WAIT;
+		error = startdone_barrier_new(sev);
+		break;
+
+	/*
+	 * se_state is changed from BARRIER_WAIT to BARRIER_DONE in
+	 * process_startdone_barrier_new
+	 */
+
+	case SEST_BARRIER_DONE:
+		error = check_startdone_barrier_new(sev);
+		if (error)
+			break;
+
+		do_finish_new(sev);
+		sevent_done(sev);
+		break;
+
+	default:
+		log_error(sev->se_sg, "no join processing for state %u",
+			  sev->se_state);
+	}
+
+ cancel:
+	if (error) {
+		/* restart the sevent from the beginning */
+		sev->se_state = SEST_JOIN_BEGIN;
+		sev->se_sg->global_id = 0;
+		set_bit(SEFL_DELAY, &sev->se_flags);
+		schedule_sev_restart(sev);
+	}
+}
+
+/*
+ * 1. First step in leaving an SG - send a message to other SG members asking
+ * to leave the SG. Nodes that don't have another active sevent or uevent for
+ * this SG will return POS.
+ */
+
+static int send_leave_notice(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+	sm_node_t *node;
+	char *msg;
+	int i = 0, error = -1, len = 0;
+
+	/*
+	 * Create a node array from member list in which to collect responses.
+	 */
+
+	error = init_sevent(sev);
+	if (error)
+		goto out;
+
+	list_for_each_entry(node, &sg->memb, list)
+		sev->se_node_ids[i++] = node->id;
+
+	/*
+	 * Create and send a leave request message.
+	 */
+
+	msg = create_smsg(sg, SMSG_LEAVE_REQ, 0, &len, sev);
+
+	error = send_members_message_sev(sg, msg, len, sev);
+
+ out:
+	return error;
+}
+
+/*
+ * 2. Second step in leaving an SG - after we collect all replies to our leave
+ * request, we look at them. If anyone replied with WAIT, we abort our attempt
+ * at leaving and try again in a bit.
+ */
+
+static int check_leave_notice(sm_sevent_t *sev)
+{
+	int pos = 0, wait = 0, neg = 0, restart = 0, i;
+
+	for (i = 0; i < sev->se_memb_count; i++) {
+		switch (sev->se_node_status[i]) {
+		case STATUS_POS:
+			pos++;
+			break;
+
+		case STATUS_WAIT:
+			wait++;
+			break;
+
+		case STATUS_NEG:
+			neg++;
+			break;
+
+		default:
+			/* we didn't get a valid response from this node,
+			 * restart the entire sev. */
+			restart++;
+			break;
+		}
+	}
+
+	/* all members approve */
+	if (pos && !wait && !restart)
+		return 0;
+
+	return -1;
+}
+
+/*
+ * 3. Third step in leaving the SG - tell the member nodes to "stop" the SG.
+ * They must be stopped in order to restart without us as a member.
+ */
+
+static int send_leave_stop(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+	char *msg;
+	int error, len = 0;
+
+	/*
+	 * Re-init the status vector in which to collect responses.
+	 */
+
+	memset(sev->se_node_status, 0, sev->se_len_status);
+
+	/*
+	 * Create and send a stop message.
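+	 * Members reply with SMSG_LSTOP_REP (see process_leave_stop); the
+	 * replies are collected in process_reply and checked in
+	 * check_leave_stop.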
+ */ + + msg = create_smsg(sg, SMSG_LSTOP_REQ, 0, &len, sev); + + error = send_members_message_sev(sg, msg, len, sev); + if (error < 0) + goto out; + + /* + * we and all others stop the SG now + */ + + sg->ops->stop(sg->service_data); + + out: + return error; +} + +/* + * 4. Fourth step in leaving the SG - check the replies to our stop request. + * Same problem with getting different replies as check_join_stop. + */ + +static int check_leave_stop(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + int i, pos = 0, neg = 0; + + for (i = 0; i < sev->se_memb_count; i++) { + switch (sev->se_node_status[i]) { + case STATUS_POS: + pos++; + break; + + case STATUS_NEG: + log_error(sg, "check_leave_stop: fail from nodeid %u " + "(%d, %d, %u)", sev->se_node_ids[i], + pos, neg, sev->se_memb_count); + neg++; + break; + + default: + log_error(sg, "check_leave_stop: status %u nodeid %u", + sev->se_node_status[i], sev->se_node_ids[i]); + neg++; + break; + } + } + + if (pos == sg->memb_count) + return 0; + + return -1; +} + +/* + * 5. Fifth step in leaving the SG - tell the other SG members to restart the + * service without us. We, of course, don't start our own stopped service. If + * we're the last SG member and leaving, we jump right to the next step. + */ + +static int send_leave_start(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + char *msg; + int error = 0, len = 0; + + if (sg->memb_count == 1) { + sev->se_state = SEST_LSTART_REMOTEDONE; + set_bit(SEFL_CHECK, &sev->se_flags); + wake_serviced(DO_JOINLEAVE); + } else { + msg = create_smsg(sg, SMSG_LSTART_CMD, 0, &len, sev); + error = send_members_message(sg, msg, len); + } + return error; +} + +/* + * Move through the steps of a leave. Summary: + * + * 1. Send a leave notice to all SG members. + * 2. Collect and check replies to the leave notice. + * 3. Send a stop message to all SG members and stop our own SG. + * 4. Collect and check replies to the stop message. + * 5. Send a start message to SG members. + * 6. Clean up sevent and signal completion to the process that + * started the leave. + */ + +static void process_leave_sevent(sm_sevent_t *sev) +{ + int error = 0; + + /* + * We may cancel the current leave attempt if another node is also + * attempting to join or leave. (Only a single node can join or leave + * at once.) Our leave attempt will be restarted after being + * cancelled. + */ + + if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) { + error = 1; + goto cancel; + } + + if (test_bit(SGFL_UEVENT, &sev->se_sg->flags)) { + error = 2; + goto cancel; + } + + if (!list_empty(&sev->se_sg->joining)) { + error = 3; + goto cancel; + } + + log_debug(sev->se_sg, "sevent state %u", sev->se_state); + + switch (sev->se_state) { + + /* + * An sevent is created in kcl_leave_service with a state of + * LEAVE_BEGIN. 
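+	 * The leave is also deferred while this sg has a uevent in progress
+	 * or still has nodes on its joining list (checked above).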
+ */ + + case SEST_LEAVE_BEGIN: + sev->se_state = SEST_LEAVE_ACKWAIT; + error = send_leave_notice(sev); + break; + + /* + * se_state is changed from LEAVE_ACKWAIT to LEAVE_ACKED in + * process_reply (when all the replies have been received) + */ + + case SEST_LEAVE_ACKED: + error = check_leave_notice(sev); + if (error) + break; + + sev->se_state = SEST_LSTOP_ACKWAIT; + error = send_leave_stop(sev); + break; + + /* + * se_state is changed from LSTOP_ACKWAIT to LSTOP_ACKED in + * process_reply + */ + + case SEST_LSTOP_ACKED: + error = check_leave_stop(sev); + if (error) + break; + + sev->se_state = SEST_LSTART_WAITREMOTE; + error = send_leave_start(sev); + break; + + /* + * se_state is changed from LSTART_WAITREMOTE to + * LSTART_REMOTEDONE in process_leave_done + */ + + case SEST_LSTART_REMOTEDONE: + sevent_done(sev); + break; + + default: + log_error(sev->se_sg, "process_leave_sevent state=%u\n", + sev->se_state); + } + + cancel: + if (error) { + /* restart the sevent from the beginning */ + sev->se_state = SEST_LEAVE_BEGIN; + set_bit(SEFL_DELAY, &sev->se_flags); + schedule_sev_restart(sev); + } +} + +/* + * Sevent backout code. Take appropriate steps when a recovery occurs while + * we're in the midst of an sevent. The recovery may or may not affect the + * sevent. If it does, it usually means cancelling the sevent and restarting + * it from the beginning once the recovery processing is done. + */ + +/* + * If any of the nodes that replied with OK is dead, we give up on the current + * join attempt and restart. Otherwise, this sevent can continue. + */ + +static int backout_join_acked(sm_sevent_t *sev) +{ + sm_node_t *node; + int i; + + for (i = 0; i < sev->se_node_count; i++) { + if (sev->se_node_status[i] != STATUS_POS) + continue; + + list_for_each_entry(node, &sm_members, list) { + if (test_bit(SNFL_NEED_RECOVERY, &node->flags) && + (node->id == sev->se_node_ids[i])) + return TRUE; + } + } + return FALSE; +} + +/* + * In this state our sg member list exists and mark_affected_sgs() will have + * set NEED_RECOVERY if any of the nodes in the sg we're joining is dead. We + * restart the join process if this is the case, otherwise this sevent can + * continue. + */ + +static int backout_jstop_ackwait(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags)) + return FALSE; + + clear_bit(SEFL_ALLOW_JSTOP, &sev->se_flags); + free_sg_memb(sg); + return TRUE; +} + +/* + * Same as previous. + */ + +static int backout_jstop_acked(sm_sevent_t *sev) +{ + return backout_jstop_ackwait(sev); +} + +/* + * If NEED_RECOVERY is set a member of the sg we're joining died while we were + * starting our service. The recovery process will restart the service on all + * the prior sg members (not including those that died or us). We will + * reattempt our join which should be accepted once the nodes are done with + * recovery. + */ + +static int backout_jstart_servicewait(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags)) + return FALSE; + + clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags); + sg->ops->stop(sg->service_data); + free_sg_memb(sg); + return TRUE; +} + +/* + * Same as previous. + */ + +static int backout_jstart_servicedone(sm_sevent_t *sev) +{ + return backout_jstart_servicewait(sev); +} + +/* + * If NEED_RECOVERY is set a member of the sg we're joining died while we were + * waiting on the "all done" barrier. Stop our service that we just started + * and cancel the barrier. 
The recovery process will restart the service on + * all the prior sg members (not including those that died or us). We will + * reattempt our join which should be accepted once the nodes are done with + * recovery. + */ + +static int backout_barrier_wait(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + char bname[MAX_BARRIER_NAME_LEN]; + + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags)) + return FALSE; + + clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags); + + sg->ops->stop(sg->service_data); + + memset(bname, 0, MAX_BARRIER_NAME_LEN); + snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u", + sg->global_id, sm_our_nodeid, sev->se_id, + sg->memb_count); + kcl_barrier_cancel(bname); + + free_sg_memb(sg); + return TRUE; +} + +/* + * If NEED_RECOVERY is set, a member of the sg we just joined has failed. The + * recovery began after the barrier callback. If the result in the callback is + * "success" then we are joined, this sevent is finished and we'll process the + * sg within the forthcoming recovery with the other members. + * + * We rely upon cnxman to guarantee that once all nodes have joined a barrier, + * all nodes will receive the corresponding barrier callback *before any* + * receive an sm_member_update() due to one of those nodes failing just after + * joining the barrier. If some nodes receive the sm_member_update() before + * the barrier callback and others receive the barrier callback before the + * sm_member_update() then they will disagree as to whether the node joining/ + * leaving is in/out of the sg. + */ + +static int backout_barrier_done(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags)) + return FALSE; + + if (!sev->se_barrier_status) { + do_finish_new(sev); + sevent_done(sev); + return FALSE; + } else { + sg->ops->stop(sg->service_data); + free_sg_memb(sg); + return TRUE; + } +} + +/* + * We've done nothing yet, just restart when recovery is done (if sg is flagged + * with recovery.) + */ + +static int backout_leave_begin(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags)) + return FALSE; + + return TRUE; +} + +/* + * Ignore any replies to our leave notice and restart when recovery is done (if + * sg is flagged with recovery.) + */ + +static int backout_leave_ackwait(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags)) + return FALSE; + + clear_bit(SEFL_ALLOW_LEAVE, &sev->se_flags); + + return TRUE; +} + +/* + * Same as previous. + */ + +static int backout_leave_acked(sm_sevent_t *sev) +{ + return backout_leave_ackwait(sev); +} + +/* + * Ignore any stop replies. All the members will be stopped anyway to do the + * recovery. Let that happen and restart our leave when done. + */ + +static int backout_lstop_ackwait(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags)) + return FALSE; + + clear_bit(SEFL_ALLOW_LSTOP, &sev->se_flags); + + return TRUE; +} + +/* + * Same as previous. + */ + +static int backout_lstop_acked(sm_sevent_t *sev) +{ + return backout_lstop_ackwait(sev); +} + +/* + * All members will be stopped due to recovery and restarted by recovery + * processing. That includes us, we have to retry the leave once the recovery + * is done. 
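+ * (The retry is arranged in backout_sevents below: the sevent is reset to
+ * LEAVE_BEGIN and SEFL_DELAY is cleared when recovery completes.)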
+ */ + +static int backout_lstart_waitremote(sm_sevent_t *sev) +{ + sm_group_t *sg = sev->se_sg; + + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags)) + return FALSE; + + return TRUE; +} + +/* + * Reset an sevent to its beginning so it can be restarted. This is necessary + * when recovery affects an SG while we're trying to join or leave (ie. a node + * in the SG fails). + */ + +void backout_sevents(void) +{ + sm_sevent_t *sev, *safe; + int delay; + + list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) { + + delay = FALSE; + + log_debug(sev->se_sg, "backout sevent state %u", sev->se_state); + + switch (sev->se_state) { + + /* backout after kcl_join_service and before + * send_join_notice */ + case SEST_JOIN_BEGIN: + break; + + /* backout after send_join_notice and before final + * process_reply */ + case SEST_JOIN_ACKWAIT: + clear_bit(SEFL_ALLOW_JOIN, &sev->se_flags); + sev->se_state = SEST_JOIN_BEGIN; + schedule_sev_restart(sev); + break; + + /* backout after final process_reply and before + * check_join_notice */ + case SEST_JOIN_ACKED: + delay = backout_join_acked(sev); + break; + + /* backout after send_join_stop and before final + * process_reply */ + case SEST_JSTOP_ACKWAIT: + delay = backout_jstop_ackwait(sev); + break; + + /* backout after final process_reply and before + * check_join_stop */ + case SEST_JSTOP_ACKED: + delay = backout_jstop_acked(sev); + break; + + /* backout after send_join_start and before + * kcl_start_done */ + case SEST_JSTART_SERVICEWAIT: + delay = backout_jstart_servicewait(sev); + break; + + /* backout after kcl_start_done and before + * startdone_barrier_new */ + case SEST_JSTART_SERVICEDONE: + delay = backout_jstart_servicedone(sev); + break; + + /* backout after startdone_barrier_new and before + * callback_startdone_barrier_new */ + case SEST_BARRIER_WAIT: + delay = backout_barrier_wait(sev); + break; + + /* backout after callback_startdone_barrier_new and + * before check_startdone_barrier_new */ + case SEST_BARRIER_DONE: + delay = backout_barrier_done(sev); + break; + + /* backout after kcl_leave_service and before + * send_leave_notice */ + case SEST_LEAVE_BEGIN: + delay = backout_leave_begin(sev); + break; + + /* backout after send_leave_notice and before final + * process_reply */ + case SEST_LEAVE_ACKWAIT: + delay = backout_leave_ackwait(sev); + break; + + /* backout after final process_reply and before + * check_leave_notice */ + case SEST_LEAVE_ACKED: + delay = backout_leave_acked(sev); + break; + + /* backout after send_leave_stop and before final + * process_reply */ + case SEST_LSTOP_ACKWAIT: + delay = backout_lstop_ackwait(sev); + break; + + /* backout after final process_reply and before + * check_leave_stop */ + case SEST_LSTOP_ACKED: + delay = backout_lstop_acked(sev); + break; + + /* backout after send_leave_start and before + * process_lstart_done */ + case SEST_LSTART_WAITREMOTE: + delay = backout_lstart_waitremote(sev); + break; + + /* backout after process_lstart_done and before + * process_leave_sevent */ + case SEST_LSTART_REMOTEDONE: + sevent_done(sev); + delay = FALSE; + break; + + default: + log_error(sev->se_sg, "backout_sevents: bad state %d", + sev->se_state); + } + + if (delay) { + set_bit(SEFL_DELAY, &sev->se_flags); + + if (test_bit(SEFL_LEAVE, &sev->se_flags)) { + sev->se_state = SEST_LEAVE_BEGIN; + /* The DELAY flag will be cleared once recovery + * is done allowing the leave to be retried. 
*/ + } else { + sev->se_state = SEST_JOIN_BEGIN; + /* restart timer function will clear DELAY */ + schedule_sev_restart(sev); + } + } + } +} + +void process_joinleave(void) +{ + sm_sevent_t *sev = NULL, *safe; + + spin_lock(&new_event_lock); + if (!list_empty(&new_event)) { + sev = list_entry(new_event.next, sm_sevent_t, se_list); + list_del(&sev->se_list); + list_add_tail(&sev->se_list, &joinleave_events); + set_bit(SEFL_CHECK, &sev->se_flags); + } + spin_unlock(&new_event_lock); + + list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) { + if (!test_and_clear_bit(SEFL_CHECK, &sev->se_flags)) + continue; + + if (test_bit(SEFL_DELAY, &sev->se_flags)) + continue; + + if (sev->se_state < SEST_LEAVE_BEGIN) + process_join_sevent(sev); + else + process_leave_sevent(sev); + } +} diff -urN linux-orig/cluster/cman/sm_joinleave.h linux-patched/cluster/cman/sm_joinleave.h --- linux-orig/cluster/cman/sm_joinleave.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_joinleave.h 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,23 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __SM_JOINLEAVE_DOT_H__ +#define __SM_JOINLEAVE_DOT_H__ + +void init_joinleave(void); +void new_joinleave(sm_sevent_t *sev); +void process_joinleave(void); +void backout_sevents(void); +sm_sevent_t *find_sevent(unsigned int id); + +#endif diff -urN linux-orig/cluster/cman/sm_membership.c linux-patched/cluster/cman/sm_membership.c --- linux-orig/cluster/cman/sm_membership.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_membership.c 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,696 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "sm.h" + +extern struct list_head sm_members; + +/* + * Routines for SG members to handle other nodes joining or leaving the SG. + * These "uevent" membership update routines are the response to an "sevent" on + * a joining/leaving node. 
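+ * Each sg carries at most one uevent at a time (sg->uevent), which mirrors
+ * the rule that only one node may join or leave an SG at once.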
+ */ + +static void del_memb_node(sm_group_t *sg, uint32_t nodeid) +{ + sm_node_t *node; + + list_for_each_entry(node, &sg->memb, list) { + if (node->id != nodeid) + continue; + list_del(&node->list); + kfree(node); + sg->memb_count--; + log_debug(sg, "del node %u count %d", nodeid, sg->memb_count); + break; + } +} + +static void add_memb_node(sm_group_t *sg, sm_node_t *node) +{ + list_add_tail(&node->list, &sg->memb); + sg->memb_count++; + log_debug(sg, "add node %u count %d", node->id, sg->memb_count); +} + +/* + * Join 1. The receive end of send_join_stop() from a node requesting to join + * the SG. We stop the service so it can be restarted with the new node. + */ + +static int process_join_stop(sm_group_t *sg) +{ + sm_uevent_t *uev = &sg->uevent; + sm_node_t *node; + sm_msg_t reply; + int error; + + if (uev->ue_num_nodes != sg->memb_count + 1) { + log_error(sg, "process_join_stop: bad num nodes %u %u", + uev->ue_num_nodes, sg->memb_count); + return -1; + } + + sm_set_event_id(&uev->ue_id); + + node = sm_find_joiner(sg, uev->ue_nodeid); + SM_ASSERT(node,); + + sg->state = SGST_UEVENT; + sg->ops->stop(sg->service_data); + + reply.ms_type = SMSG_JSTOP_REP; + reply.ms_status = STATUS_POS; + reply.ms_sevent_id = uev->ue_remote_seid; + smsg_bswap_out(&reply); + + error = send_nodeid_message((char *) &reply, sizeof(reply), + uev->ue_nodeid); + if (error < 0) + return error; + return 0; +} + +/* + * Join 2. The receive end of send_join_start() from a node joining the SG. + * We are re-starting the service with the new member added. + */ + +static int process_join_start(sm_group_t *sg) +{ + sm_uevent_t *uev = &sg->uevent; + sm_node_t *node; + uint32_t *memb; + int count = 0; + + /* this memory is passed to the service which must free it */ + SM_RETRY(memb = + kmalloc((sg->memb_count + 1) * sizeof(uint32_t), GFP_KERNEL), + memb); + + /* transfer joining node from joining list to member list */ + node = sm_find_joiner(sg, uev->ue_nodeid); + SM_ASSERT(node, printk("nodeid=%u\n", uev->ue_nodeid);); + list_del(&node->list); + add_memb_node(sg, node); + + /* the new member list for the service */ + list_for_each_entry(node, &sg->memb, list) + memb[count++] = node->id; + + set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags); + + sg->ops->start(sg->service_data, memb, count, uev->ue_id, + SERVICE_NODE_JOIN); + return 0; +} + +/* + * Join 3. When done starting their local service, every previous SG member + * calls startdone_barrier() and the new/joining member calls + * startdone_barrier_new(). The barrier returns when everyone has started + * their service and joined the barrier. + */ + +static int startdone_barrier(sm_group_t *sg) +{ + sm_uevent_t *uev = &sg->uevent; + char bname[MAX_BARRIER_NAME_LEN]; + int error; + + memset(bname, 0, MAX_BARRIER_NAME_LEN); + uev->ue_barrier_status = -1; + + set_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags); + + /* If we're the only member, skip the barrier */ + if (sg->memb_count == 1) { + process_startdone_barrier(sg, 0); + return 0; + } + + snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u", + sg->global_id, uev->ue_nodeid, uev->ue_remote_seid, + sg->memb_count); + + error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE); + + return error; +} + +/* + * Join 4. Check that the "all started" barrier returned a successful status. + * The newly joined member calls check_startdone_barrier_new(). + */ + +static int check_startdone_barrier(sm_group_t *sg) +{ + int error = sg->uevent.ue_barrier_status; + return error; +} + +/* + * Join 5. 
Send the service a "finish" indicating that all members have + * successfully started. The newly joined member calls do_finish_new(). + */ + +static void do_finish(sm_group_t *sg) +{ + sg->state = SGST_RUN; + clear_bit(SGFL_UEVENT, &sg->flags); + sg->ops->finish(sg->service_data, sg->uevent.ue_id); +} + +/* + * Join 6. The uevent is done. If this was a uevent for a node leaving the + * SG, then send a final message to the departed node signalling that the + * remaining nodes have restarted since it left. + */ + +static void uevent_done(sm_group_t *sg) +{ + sm_uevent_t *uev = &sg->uevent; + sm_msg_t reply; + + if (test_bit(UEFL_LEAVE, &uev->ue_flags)) { + reply.ms_type = SMSG_LSTART_DONE; + reply.ms_status = STATUS_POS; + reply.ms_sevent_id = uev->ue_remote_seid; + smsg_bswap_out(&reply); + send_nodeid_message((char *) &reply, sizeof(reply), + uev->ue_nodeid); + } + memset(&sg->uevent, 0, sizeof(sm_uevent_t)); +} + +/* + * Leave 1. The receive end of send_leave_stop() from a node leaving the SG. + */ + +static int process_leave_stop(sm_group_t *sg) +{ + sm_uevent_t *uev = &sg->uevent; + sm_msg_t reply; + int error; + + sm_set_event_id(&uev->ue_id); + + sg->state = SGST_UEVENT; + sg->ops->stop(sg->service_data); + + reply.ms_type = SMSG_LSTOP_REP; + reply.ms_status = STATUS_POS; + reply.ms_sevent_id = uev->ue_remote_seid; + smsg_bswap_out(&reply); + + error = send_nodeid_message((char *) &reply, sizeof(reply), + uev->ue_nodeid); + if (error < 0) + return error; + return 0; +} + +/* + * Leave 2. The receive end of send_leave_start() from a node leaving the SG. + * We are re-starting the service (without the node that's left naturally.) + */ + +static int process_leave_start(sm_group_t *sg) +{ + sm_uevent_t *uev = &sg->uevent; + sm_node_t *node; + uint32_t *memb; + int count = 0; + + SM_ASSERT(sg->memb_count > 1, + printk("memb_count=%u\n", sg->memb_count);); + + /* this memory is passed to the service which must free it */ + SM_RETRY(memb = + kmalloc((sg->memb_count - 1) * sizeof(uint32_t), GFP_KERNEL), + memb); + + /* remove departed member from sg member list */ + del_memb_node(sg, uev->ue_nodeid); + + /* build member list to pass to service */ + list_for_each_entry(node, &sg->memb, list) + memb[count++] = node->id; + + /* allow us to accept the start_done callback for this start */ + set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags); + + sg->ops->start(sg->service_data, memb, count, uev->ue_id, + SERVICE_NODE_LEAVE); + return 0; +} + +/* + * Move through the steps of another node joining or leaving the SG. 
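+ * The ue_state transitions below parallel the se_state transitions driven
+ * by the joining or leaving node in process_join_sevent and
+ * process_leave_sevent.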
+ */
+
+static void process_one_uevent(sm_group_t *sg)
+{
+	sm_uevent_t *uev = &sg->uevent;
+	int error = 0;
+
+	log_debug(sg, "uevent state %u node %u", uev->ue_state, uev->ue_nodeid);
+
+	switch (uev->ue_state) {
+
+	/*
+	 * a uevent is initialized with state JSTOP in
+	 * process_stop_request
+	 */
+
+	case UEST_JSTOP:
+		uev->ue_state = UEST_JSTART_WAITCMD;
+		error = process_join_stop(sg);
+		break;
+
+	/*
+	 * ue_state is changed from JSTART_WAITCMD to JSTART in
+	 * process_start_request
+	 */
+
+	case UEST_JSTART:
+		uev->ue_state = UEST_JSTART_SERVICEWAIT;
+		error = process_join_start(sg);
+		break;
+
+	/*
+	 * ue_state is changed from JSTART_SERVICEWAIT to
+	 * JSTART_SERVICEDONE in kcl_start_done
+	 */
+
+	case UEST_JSTART_SERVICEDONE:
+		uev->ue_state = UEST_BARRIER_WAIT;
+		error = startdone_barrier(sg);
+		break;
+
+	/*
+	 * ue_state is changed from BARRIER_WAIT to BARRIER_DONE in
+	 * process_startdone_barrier
+	 */
+
+	case UEST_BARRIER_DONE:
+		error = check_startdone_barrier(sg);
+		if (error)
+			break;
+
+		do_finish(sg);
+		uevent_done(sg);
+		break;
+
+	/*
+	 * a uevent is initialized with state LSTOP in
+	 * process_stop_request
+	 */
+
+	case UEST_LSTOP:
+		uev->ue_state = UEST_LSTART_WAITCMD;
+		error = process_leave_stop(sg);
+		break;
+
+	/*
+	 * a uevent is changed from LSTART_WAITCMD to LSTART in
+	 * process_start_request
+	 */
+
+	case UEST_LSTART:
+		uev->ue_state = UEST_LSTART_SERVICEWAIT;
+		error = process_leave_start(sg);
+		break;
+
+	/*
+	 * a uevent is changed from LSTART_SERVICEWAIT to
+	 * LSTART_SERVICEDONE in kcl_start_done
+	 */
+
+	case UEST_LSTART_SERVICEDONE:
+		uev->ue_state = UEST_BARRIER_WAIT;
+		error = startdone_barrier(sg);
+		break;
+
+	default:
+		error = -1;
+	}
+
+	/* If we encounter an error during these routines, we do nothing,
+	   expecting that a node failure related to this sg will cause a
+	   recovery event to arrive and call cancel_one_uevent().
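+	   Recovery, not this routine, then restarts the service on the
+	   surviving members.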
*/ + + if (error) + log_error(sg, "process_one_uevent error %d state %u", + error, uev->ue_state); +} + +static sm_node_t *failed_memb(sm_group_t *sg, int *count) +{ + sm_node_t *node, *sm_node, *failed_uev_node = NULL; + + list_for_each_entry(node, &sg->memb, list) { + + sm_node = sm_find_member(node->id); + SM_ASSERT(sm_node, ); + + if (test_bit(SNFL_NEED_RECOVERY, &sm_node->flags)) { + (*count)++; + if (node->id == sg->uevent.ue_nodeid) + failed_uev_node = sm_node; + } + } + return failed_uev_node; +} + +static void send_recover_msg(sm_group_t *sg) +{ + char *msg; + int len = 0; + msg = create_smsg(sg, SMSG_RECOVER, 0, &len, NULL); + send_members_message(sg, msg, len); +} + +static void cancel_barrier(sm_group_t *sg) +{ + sm_uevent_t *uev = &sg->uevent; + char bname[MAX_BARRIER_NAME_LEN]; + + clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags); + + memset(bname, 0, MAX_BARRIER_NAME_LEN); + snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u", + sg->global_id, uev->ue_nodeid, uev->ue_remote_seid, + sg->memb_count); + kcl_barrier_cancel(bname); +} + +static void cancel_one_uevent(sm_group_t *sg, int *effected) +{ + sm_uevent_t *uev = &sg->uevent; + int failed_count; + sm_node_t *node, *failed_joiner, *failed_leaver; + + log_debug(sg, "cancel uevent state %u node %u", uev->ue_state, + uev->ue_nodeid); + + switch (uev->ue_state) { + + case UEST_JSTOP: + case UEST_JSTART_WAITCMD: + case UEST_JSTART: + + sg->ops->stop(sg->service_data); + + failed_count = 0; + failed_joiner = failed_memb(sg, &failed_count); + SM_ASSERT(!failed_joiner, ); + + node = sm_find_member(uev->ue_nodeid); + if (test_bit(SNFL_NEED_RECOVERY, &node->flags)) + failed_joiner = node; + + if (!failed_count) { + /* only joining node failed */ + SM_ASSERT(failed_joiner, ); + SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), ); + set_bit(SGFL_NEED_RECOVERY, &sg->flags); + (*effected)++; + /* some nodes may not have gotten a JSTOP message + in which case this will tell them to begin + recovery for this sg. 
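+			   (send_recover_msg broadcasts SMSG_RECOVER to the
+			   remaining members.)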
*/ + send_recover_msg(sg); + + } else { + /* a member node failed (and possibly joining node, it + doesn't matter) */ + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), ); + } + + clear_bit(SGFL_UEVENT, &sg->flags); + memset(uev, 0, sizeof(sm_uevent_t)); + break; + + + case UEST_JSTART_SERVICEWAIT: + case UEST_JSTART_SERVICEDONE: + + clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags); + sg->ops->stop(sg->service_data); + + failed_count = 0; + failed_joiner = failed_memb(sg, &failed_count); + SM_ASSERT(failed_count, ); + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), ); + + if (failed_count == 1 && failed_joiner) { + /* only joining node failed */ + + } else if (failed_count && failed_joiner) { + /* joining node and another member failed */ + + } else { + /* other member failed, joining node still alive */ + SM_ASSERT(!failed_joiner, ); + del_memb_node(sg, uev->ue_nodeid); + } + + clear_bit(SGFL_UEVENT, &sg->flags); + memset(uev, 0, sizeof(sm_uevent_t)); + break; + + + case UEST_LSTOP: + case UEST_LSTART_WAITCMD: + case UEST_LSTART: + + sg->ops->stop(sg->service_data); + + failed_count = 0; + failed_leaver = failed_memb(sg, &failed_count); + SM_ASSERT(failed_count, ); + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), ); + + if (failed_count == 1 && failed_leaver) { + /* only leaving node failed */ + + } else if (failed_count && failed_leaver) { + /* leaving node and another member failed */ + + } else { + /* other member failed, leaving node still alive */ + SM_ASSERT(!failed_leaver, ); + } + + clear_bit(SGFL_UEVENT, &sg->flags); + memset(uev, 0, sizeof(sm_uevent_t)); + break; + + + case UEST_LSTART_SERVICEWAIT: + case UEST_LSTART_SERVICEDONE: + + clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags); + sg->ops->stop(sg->service_data); + + failed_count = 0; + failed_leaver = failed_memb(sg, &failed_count); + SM_ASSERT(!failed_leaver, ); + + node = sm_find_member(uev->ue_nodeid); + if (test_bit(SNFL_NEED_RECOVERY, &node->flags)) + failed_leaver = node; + + if (!failed_count) { + /* only leaving node failed */ + SM_ASSERT(failed_leaver, ); + SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), ); + set_bit(SGFL_NEED_RECOVERY, &sg->flags); + (*effected)++; + + } else if (failed_count && failed_leaver) { + /* leaving node and another member failed */ + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), ); + + } else { + /* other member failed, leaving node still alive */ + SM_ASSERT(failed_count, ); + SM_ASSERT(!failed_leaver, ); + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), ); + node = sm_new_node(sg->uevent.ue_nodeid); + add_memb_node(sg, node); + } + + clear_bit(SGFL_UEVENT, &sg->flags); + memset(uev, 0, sizeof(sm_uevent_t)); + break; + + + case UEST_BARRIER_WAIT: + + if (test_bit(UEFL_LEAVE, &uev->ue_flags)) + goto barrier_wait_leave; + + sg->ops->stop(sg->service_data); + cancel_barrier(sg); + + barrier_wait_join: + + failed_count = 0; + failed_joiner = failed_memb(sg, &failed_count); + SM_ASSERT(failed_count, ); + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), ); + + if (failed_count == 1 && failed_joiner) { + /* only joining node failed */ + + } else if (failed_count && failed_joiner) { + /* joining node and another member failed */ + + } else { + /* other member failed, joining node still alive */ + SM_ASSERT(!failed_joiner, ); + del_memb_node(sg, uev->ue_nodeid); + } + + clear_bit(SGFL_UEVENT, &sg->flags); + memset(uev, 0, sizeof(sm_uevent_t)); + break; + + barrier_wait_leave: + + failed_count = 0; + failed_leaver = failed_memb(sg, &failed_count); + 
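+		/* the leaver was already removed from memb by
+		   process_leave_start, so failed_memb cannot return it */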
SM_ASSERT(!failed_leaver, );
+
+		node = sm_find_member(uev->ue_nodeid);
+		if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
+			failed_leaver = node;
+
+		if (!failed_count) {
+			/* only leaving node failed */
+			SM_ASSERT(failed_leaver, );
+			SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+			set_bit(SGFL_NEED_RECOVERY, &sg->flags);
+			(*effected)++;
+
+		} else if (failed_count && failed_leaver) {
+			/* leaving node and another member failed */
+			SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+
+		} else {
+			/* other member failed, leaving node still alive */
+			SM_ASSERT(failed_count, );
+			SM_ASSERT(!failed_leaver, );
+			SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+			node = sm_new_node(sg->uevent.ue_nodeid);
+			add_memb_node(sg, node);
+		}
+
+		clear_bit(SGFL_UEVENT, &sg->flags);
+		memset(uev, 0, sizeof(sm_uevent_t));
+		break;
+
+
+	case UEST_BARRIER_DONE:
+
+		if (!uev->ue_barrier_status) {
+			do_finish(sg);
+			uevent_done(sg);
+			break;
+		}
+
+		if (test_bit(UEFL_LEAVE, &uev->ue_flags))
+			goto barrier_wait_leave;
+		else
+			goto barrier_wait_join;
+
+
+	default:
+		log_error(sg, "cancel_one_uevent: state %d", uev->ue_state);
+	}
+}
+
+void cancel_uevents(int *effected)
+{
+	sm_group_t *sg;
+	sm_node_t *node, *sgnode;
+	int i;
+
+	list_for_each_entry(node, &sm_members, list) {
+		if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
+			continue;
+
+		/*
+		 * Clear this dead node from the "interested in joining" list
+		 * of any SG. The node is added to this list before the uevent
+		 * begins.
+		 */
+
+		for (i = 0; i < SG_LEVELS; i++) {
+			list_for_each_entry(sg, &sm_sg[i], list) {
+				sgnode = sm_find_joiner(sg, node->id);
+				if (sgnode) {
+					log_debug(sg, "clear joining node %u",
+						  sgnode->id);
+					list_del(&sgnode->list);
+					kfree(sgnode);
+				}
+			}
+		}
+	}
+
+	/* Adjust any uevents in sgs affected by the failed node(s) */
+
+	for (i = 0; i < SG_LEVELS; i++) {
+		list_for_each_entry(sg, &sm_sg[i], list) {
+			if (!test_bit(SGFL_UEVENT, &sg->flags))
+				continue;
+
+			/* We may have some cancelling to do if this sg is
+			   flagged as having a failed member, or if a joining
+			   or leaving node has died. */
+
+			if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
+				cancel_one_uevent(sg, effected);
+			else if (sg->uevent.ue_nodeid) {
+				node = sm_find_member(sg->uevent.ue_nodeid);
+				SM_ASSERT(node, );
+				if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
+					cancel_one_uevent(sg, effected);
+			}
+		}
+	}
+}
+
+void process_membership(void)
+{
+	sm_group_t *sg;
+	int i;
+
+	down(&sm_sglock);
+
+	for (i = 0; i < SG_LEVELS; i++) {
+		list_for_each_entry(sg, &sm_sg[i], list) {
+			if (!test_bit(SGFL_UEVENT, &sg->flags))
+				continue;
+
+			if (!test_and_clear_bit(UEFL_CHECK,
+						&sg->uevent.ue_flags))
+				continue;
+
+			process_one_uevent(sg);
+		}
+	}
+	up(&sm_sglock);
+}
diff -urN linux-orig/cluster/cman/sm_membership.h linux-patched/cluster/cman/sm_membership.h
--- linux-orig/cluster/cman/sm_membership.h	1970-01-01 07:30:00.000000000 +0730
+++ linux-patched/cluster/cman/sm_membership.h	2004-06-29 20:07:51.000000000 +0800
@@ -0,0 +1,20 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __SM_MEMBERSHIP_DOT_H__ +#define __SM_MEMBERSHIP_DOT_H__ + +void process_membership(void); +void cancel_uevents(int *effected); + +#endif diff -urN linux-orig/cluster/cman/sm_message.c linux-patched/cluster/cman/sm_message.c --- linux-orig/cluster/cman/sm_message.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_message.c 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,867 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "sm.h" + +#define SMSG_BUF_SIZE (sizeof(sm_msg_t) + MAX_SERVICE_NAME_LEN + 1) + +extern struct socket * sm_socket; +extern uint32_t sm_our_nodeid; +static uint32_t global_last_id; +static struct list_head messages; +static spinlock_t message_lock; +static char smsg_buf[SMSG_BUF_SIZE]; + +int send_nodeid_message(char *msg, int len, uint32_t nodeid); + +struct rq_entry { + struct list_head list; + char *msg; + int len; + uint32_t nodeid; +}; +typedef struct rq_entry rq_entry_t; + +void init_messages(void) +{ + global_last_id = 1; + INIT_LIST_HEAD(&messages); + spin_lock_init(&message_lock); +} + +uint32_t sm_new_global_id(int level) +{ + uint32_t id = global_last_id++; + uint8_t l = (uint8_t) level; + + if (level > 255) + return 0; + + if (id > 0x00FFFFFF) + return 0; + + id |= (l << 24); + return id; +} + +static void smsg_copy_in(char *msg, sm_msg_t *smsg) +{ + sm_msg_t *in = (sm_msg_t *) msg; + + smsg->ms_type = in->ms_type; + smsg->ms_status = in->ms_status; + smsg->ms_sevent_id = le16_to_cpu(in->ms_sevent_id); + smsg->ms_global_sgid = le32_to_cpu(in->ms_global_sgid); + smsg->ms_global_lastid = le32_to_cpu(in->ms_global_lastid); + smsg->ms_sglevel = le16_to_cpu(in->ms_sglevel); + smsg->ms_length = le16_to_cpu(in->ms_length); +} + +/* swapping bytes in place is an easy source of errors - be careful not to + * access the fields after calling this */ + +void smsg_bswap_out(sm_msg_t *smsg) +{ + smsg->ms_sevent_id = cpu_to_le16(smsg->ms_sevent_id); + smsg->ms_global_sgid = cpu_to_le32(smsg->ms_global_sgid); + smsg->ms_global_lastid = cpu_to_le32(smsg->ms_global_lastid); + smsg->ms_sglevel = cpu_to_le16(smsg->ms_sglevel); + smsg->ms_length = cpu_to_le16(smsg->ms_length); +} + +char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen, + sm_sevent_t *sev) +{ + char *msg; + sm_msg_t *smsg; + int fulllen = sizeof(sm_msg_t) + datalen; + + msg = smsg_buf; + memset(smsg_buf, 0, SMSG_BUF_SIZE); + SM_ASSERT(fulllen <= SMSG_BUF_SIZE,); + + smsg = (sm_msg_t *) msg; + smsg->ms_type = type; + smsg->ms_global_sgid = sg->global_id; + smsg->ms_sglevel = sg->level; + smsg->ms_length = datalen; + smsg->ms_sevent_id = sev ? 
sev->se_id : 0;
+
+	smsg_bswap_out(smsg);
+	*msglen = fulllen;
+	return msg;
+}
+
+static unsigned int msgtype_to_flag(int type)
+{
+	unsigned int flag;
+
+	switch (type) {
+	case SMSG_JOIN_REP:
+	case SMSG_JOIN_REQ:
+		flag = SEFL_ALLOW_JOIN;
+		break;
+
+	case SMSG_JSTOP_REP:
+	case SMSG_JSTOP_REQ:
+		flag = SEFL_ALLOW_JSTOP;
+		break;
+
+	case SMSG_LEAVE_REP:
+	case SMSG_LEAVE_REQ:
+		flag = SEFL_ALLOW_LEAVE;
+		break;
+
+	case SMSG_LSTOP_REP:
+	case SMSG_LSTOP_REQ:
+		flag = SEFL_ALLOW_LSTOP;
+		break;
+
+	default:
+		SM_ASSERT(0, printk("msgtype_to_flag bad type %d\n", type););
+	}
+	return flag;
+}
+
+static int test_allowed_msgtype(sm_sevent_t * sev, int type)
+{
+	unsigned int flag = msgtype_to_flag(type);
+
+	return test_bit(flag, &sev->se_flags);
+}
+
+static void clear_allowed_msgtype(sm_sevent_t * sev, int type)
+{
+	unsigned int flag = msgtype_to_flag(type);
+
+	clear_bit(flag, &sev->se_flags);
+}
+
+static void set_allowed_msgtype(sm_sevent_t * sev, int type)
+{
+	unsigned int flag = msgtype_to_flag(type);
+
+	set_bit(flag, &sev->se_flags);
+}
+
+static int save_global_id(sm_sevent_t * sev, sm_msg_t * smsg)
+{
+	sm_group_t *sg = sev->se_sg;
+
+	if (!smsg->ms_global_sgid) {
+		log_error(sg, "save_global_id: zero sg id");
+		return -1;
+	}
+
+	if (!sg->global_id)
+		sg->global_id = smsg->ms_global_sgid;
+
+	if (sg->global_id != smsg->ms_global_sgid) {
+		log_error(sg, "save_global_id: id %x", smsg->ms_global_sgid);
+		return -1;
+	}
+	return 0;
+}
+
+static void save_lastid(sm_msg_t * smsg)
+{
+	uint32_t gid = smsg->ms_global_lastid & 0x00FFFFFF;
+
+	/*
+	 * Keep track of the highest SG id which has been used
+	 * in the cluster in case we need to choose a new SG id.
+	 */
+
+	if (gid > global_last_id)
+		global_last_id = gid;
+}
+
+static int next_sev_state(int msg_type, int cur_state)
+{
+	int next = 0;
+
+	switch (msg_type) {
+	case SMSG_JOIN_REP:
+		SM_ASSERT(cur_state == SEST_JOIN_ACKWAIT,);
+		next = SEST_JOIN_ACKED;
+		break;
+
+	case SMSG_JSTOP_REP:
+		SM_ASSERT(cur_state == SEST_JSTOP_ACKWAIT,);
+		next = SEST_JSTOP_ACKED;
+		break;
+
+	case SMSG_LEAVE_REP:
+		SM_ASSERT(cur_state == SEST_LEAVE_ACKWAIT,);
+		next = SEST_LEAVE_ACKED;
+		break;
+
+	case SMSG_LSTOP_REP:
+		SM_ASSERT(cur_state == SEST_LSTOP_ACKWAIT,);
+		next = SEST_LSTOP_ACKED;
+		break;
+	}
+	return next;
+}
+
+/*
+ * Functions in sevent.c send messages to other nodes and then expect replies.
+ * This function collects the replies for the sevent messages and moves the
+ * sevent to the next stage when all the expected replies have been received.
+ */
+
+static void process_reply(sm_msg_t * smsg, uint32_t nodeid)
+{
+	sm_sevent_t *sev;
+	int i, expected, type = smsg->ms_type;
+
+	/*
+	 * Find the relevant sevent.
+	 */
+
+	sev = find_sevent(smsg->ms_sevent_id);
+	if (!sev) {
+		log_print("process_reply invalid id=%u nodeid=%u",
+			  smsg->ms_sevent_id, nodeid);
+		goto out;
+	}
+
+	/*
+	 * Check if this message type is what this sevent is waiting for.
+	 */
+
+	if (!test_allowed_msgtype(sev, type)) {
+		log_debug(sev->se_sg, "process_reply ignored type=%u nodeid=%u "
+			  "id=%u", type, nodeid, sev->se_id);
+		goto out;
+	}
+
+	expected =
+	    (type == SMSG_JOIN_REP) ?
sev->se_node_count : sev->se_memb_count;
+
+	SM_ASSERT(expected * sizeof(uint32_t) <= sev->se_len_ids,
+		  printk("type=%d expected=%d len_ids=%d node_count=%d "
+			 "memb_count=%d\n", type, expected, sev->se_len_ids,
+			 sev->se_node_count, sev->se_memb_count););
+
+	SM_ASSERT(expected * sizeof(char) <= sev->se_len_status,
+		  printk("type=%d expected=%d len_status=%d node_count=%d "
+			 "memb_count=%d\n", type, expected, sev->se_len_status,
+			 sev->se_node_count, sev->se_memb_count););
+
+	for (i = 0; i < expected; i++) {
+		if (sev->se_node_ids[i] == nodeid) {
+			/*
+			 * Save the status from the replying node
+			 */
+
+			if (!sev->se_node_status[i])
+				sev->se_node_status[i] = smsg->ms_status;
+			else {
+				log_error(sev->se_sg, "process_reply duplicate "
+					  "id=%u nodeid=%u %u/%u",
+					  sev->se_id, nodeid,
+					  sev->se_node_status[i],
+					  smsg->ms_status);
+				goto out;
+			}
+
+			if (type == SMSG_JOIN_REP) {
+				save_lastid(smsg);
+
+				if (smsg->ms_status == STATUS_POS)
+					save_global_id(sev, smsg);
+			}
+
+			/*
+			 * Signal sm if we have all replies
+			 */
+
+			if (++sev->se_reply_count == expected) {
+				clear_allowed_msgtype(sev, type);
+				sev->se_state = next_sev_state(type,
+							       sev->se_state);
+				set_bit(SEFL_CHECK, &sev->se_flags);
+				wake_serviced(DO_JOINLEAVE);
+			}
+
+			break;
+		}
+	}
+
+ out:
+	return;
+}
+
+/*
+ * A node wants to join an SG and has run send_join_notice. If we know nothing
+ * about the SG, then we have no objection - send back STATUS_POS. If we're a
+ * member of the SG, then send back STATUS_POS (go ahead and join) if there's
+ * no sevent or uevent of higher priority in progress (only a single join or
+ * leave is permitted for the SG at once). If there happens to be a higher
+ * priority sevent/uevent in progress, send back STATUS_WAIT to defer the
+ * requested join for a bit.
+ */
+
+static void process_join_request(sm_msg_t *smsg, uint32_t nodeid, char *name)
+{
+	sm_group_t *sg = NULL;
+	sm_sevent_t *sev = NULL;
+	sm_node_t *node;
+	int found = FALSE;
+	int level = smsg->ms_sglevel;
+	sm_msg_t reply;
+
+	memset(&reply, 0, sizeof(reply));
+
+	down(&sm_sglock);
+
+	if (nodeid == sm_our_nodeid)
+		goto next;
+
+	/*
+	 * search SG list for an SG with given name/len
+	 */
+
+	list_for_each_entry(sg, &sm_sg[level], list) {
+		if ((sg->namelen != smsg->ms_length) ||
+		    memcmp(sg->name, name, sg->namelen))
+			continue;
+		found = TRUE;
+		break;
+	}
+
+	/*
+	 * build reply message
+	 */
+
+ next:
+
+	if (!found) {
+		reply.ms_type = SMSG_JOIN_REP;
+		reply.ms_status = STATUS_NEG;
+		reply.ms_global_lastid = global_last_id;
+		reply.ms_sevent_id = smsg->ms_sevent_id;
+	} else {
+		reply.ms_type = SMSG_JOIN_REP;
+		reply.ms_status = STATUS_POS;
+		reply.ms_sevent_id = smsg->ms_sevent_id;
+		reply.ms_global_sgid = sg->global_id;
+		reply.ms_global_lastid = global_last_id;
+
+		/*
+		 * The node trying to join should wait and try again until
+		 * we're done with recovery.
+		 */
+
+		if (sg->state == SGST_RECOVER) {
+			reply.ms_status = STATUS_WAIT;
+			goto send;
+		}
+
+		/*
+		 * An sevent node trying to join may have gotten as far as
+		 * creating a uevent with us and then backed out. That node
+		 * will retry joining from the beginning so we should not turn
+		 * them away. If we're handling a uevent for another node,
+		 * tell the joining node to wait.
+		 */
+
+		if (test_bit(SGFL_UEVENT, &sg->flags)) {
+			if (sg->uevent.ue_nodeid != nodeid)
+				reply.ms_status = STATUS_WAIT;
+			goto send;
+		}
+
+		/*
+		 * We're trying to join or leave the SG at the moment.
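+		 * If we're leaving and already past LEAVE_ACKED, the
+		 * joiner must wait; otherwise we cancel our own leave
+		 * and answer POS. If we're joining too, the join that
+		 * is further along, or from the lower node id, goes
+		 * first.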
+		 */
+
+		if (test_bit(SGFL_SEVENT, &sg->flags)) {
+			sev = sg->sevent;
+
+			/*
+			 * We're trying to leave. Make the join wait until
+			 * we've left if we're beyond LEAVE_ACKWAIT.
+			 */
+
+			if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
+				if (sev->se_state > SEST_LEAVE_ACKED)
+					reply.ms_status = STATUS_WAIT;
+				else {
+					reply.ms_status = STATUS_POS;
+					clear_bit(SEFL_ALLOW_LEAVE,
+						  &sev->se_flags);
+					set_bit(SEFL_CANCEL, &sev->se_flags);
+				}
+			}
+
+			/*
+			 * We're trying to join. Make the other join wait
+			 * until we've joined if we're beyond JOIN_ACKWAIT or
+			 * if we have a lower id. (Send NEG to allow the other
+			 * node to go ahead because we're not in the SG.)
+			 */
+
+			else {
+				if (sev->se_state > SEST_JOIN_ACKED)
+					reply.ms_status = STATUS_WAIT;
+				else if (sm_our_nodeid < nodeid)
+					reply.ms_status = STATUS_WAIT;
+				else {
+					reply.ms_status = STATUS_NEG;
+					clear_bit(SEFL_ALLOW_JOIN,
+						  &sev->se_flags);
+					set_bit(SEFL_CANCEL, &sev->se_flags);
+				}
+			}
+
+			if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
+				set_bit(SEFL_CHECK, &sev->se_flags);
+				wake_serviced(DO_JOINLEAVE);
+			}
+			goto send;
+		}
+
+		/* no r,u,s event, stick with STATUS_POS */
+	}
+
+ send:
+
+	if (reply.ms_status == STATUS_POS) {
+		node = sm_find_joiner(sg, nodeid);
+		if (!node) {
+			node = sm_new_node(nodeid);
+			list_add_tail(&node->list, &sg->joining);
+		}
+	}
+
+	up(&sm_sglock);
+	smsg_bswap_out(&reply);
+	send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
+}
+
+/*
+ * Another node wants us to stop a service so it can join or leave the SG. We
+ * do this by saving the request info in a uevent and having the sm thread do
+ * the processing and then replying.
+ */
+
+static void process_stop_request(sm_msg_t * smsg, uint32_t nodeid,
+				 uint32_t * msgbuf)
+{
+	sm_group_t *sg;
+	sm_uevent_t *uev;
+	sm_msg_t reply;
+	int type = smsg->ms_type;
+
+	if (nodeid == sm_our_nodeid)
+		goto agree;
+
+	sg = sm_global_id_to_sg(smsg->ms_global_sgid);
+	if (!sg) {
+		log_print("process_stop_request: unknown sg id %x",
+			  smsg->ms_global_sgid);
+		return;
+	}
+
+	/*
+	 * We shouldn't get here with uevent already set.
+	 */
+
+	if (test_and_set_bit(SGFL_UEVENT, &sg->flags)) {
+		log_error(sg, "process_stop_request: uevent already set");
+		return;
+	}
+
+	uev = &sg->uevent;
+	uev->ue_nodeid = nodeid;
+	uev->ue_remote_seid = smsg->ms_sevent_id;
+	uev->ue_state = (type == SMSG_JSTOP_REQ) ? UEST_JSTOP : UEST_LSTOP;
+
+	if (type == SMSG_JSTOP_REQ)
+		uev->ue_num_nodes = be32_to_cpu(*msgbuf);
+	else
+		set_bit(UEFL_LEAVE, &uev->ue_flags);
+
+	/*
+	 * Do process_join_stop() or process_leave_stop().
+	 */
+
+	set_bit(UEFL_CHECK, &uev->ue_flags);
+	wake_serviced(DO_MEMBERSHIP);
+	return;
+
+ agree:
+	reply.ms_status = STATUS_POS;
+	reply.ms_type =
+	    (type == SMSG_JSTOP_REQ) ?
SMSG_JSTOP_REP : SMSG_LSTOP_REP; + reply.ms_sevent_id = smsg->ms_sevent_id; + smsg_bswap_out(&reply); + send_nodeid_message((char *) &reply, sizeof(reply), nodeid); +} + +static void process_start_request(sm_msg_t * smsg, uint32_t nodeid) +{ + sm_group_t *sg; + sm_uevent_t *uev; + int type = smsg->ms_type; + + if (nodeid == sm_our_nodeid) + return; + + sg = sm_global_id_to_sg(smsg->ms_global_sgid); + if (!sg) { + log_print("process_start_request: unknown sg id %x", + smsg->ms_global_sgid); + return; + } + + if (!test_bit(SGFL_UEVENT, &sg->flags)) { + log_error(sg, "process_start_request: no uevent"); + return; + } + + uev = &sg->uevent; + + if (type == SMSG_JSTART_CMD) + uev->ue_state = UEST_JSTART; + else + uev->ue_state = UEST_LSTART; + + set_bit(UEFL_CHECK, &uev->ue_flags); + wake_serviced(DO_MEMBERSHIP); +} + +static void process_leave_request(sm_msg_t * smsg, uint32_t nodeid) +{ + sm_group_t *sg; + sm_node_t *node; + sm_msg_t reply; + sm_sevent_t *sev; + int found = FALSE; + + sg = sm_global_id_to_sg(smsg->ms_global_sgid); + if (sg) { + if (nodeid == sm_our_nodeid) + found = TRUE; + else { + list_for_each_entry(node, &sg->memb, list) { + if (node->id != nodeid) + continue; + set_bit(SNFL_LEAVING, &node->flags); + found = TRUE; + break; + } + } + } + + if (!found) { + reply.ms_type = SMSG_LEAVE_REP; + reply.ms_status = STATUS_NEG; + reply.ms_sevent_id = smsg->ms_sevent_id; + } else { + reply.ms_type = SMSG_LEAVE_REP; + reply.ms_status = STATUS_POS; + reply.ms_sevent_id = smsg->ms_sevent_id; + + if (sg->state == SGST_RECOVER) + reply.ms_status = STATUS_WAIT; + + else if (test_bit(SGFL_SEVENT, &sg->flags) && + nodeid != sm_our_nodeid) { + sev = sg->sevent; + + /* + * We're trying to join or leave at the moment. If + * we're past JOIN/LEAVE_ACKWAIT, we make the requestor + * wait. Otherwise, if joining we'll cancel to let the + * leave happen first, or if we're leaving allow the + * lower nodeid to leave first. + */ + + if (test_bit(SEFL_LEAVE, &sev->se_flags)) { + if (sev->se_state > SEST_LEAVE_ACKWAIT) + reply.ms_status = STATUS_WAIT; + else if (sm_our_nodeid < nodeid) + reply.ms_status = STATUS_WAIT; + else { + reply.ms_status = STATUS_POS; + clear_bit(SEFL_ALLOW_LEAVE, + &sev->se_flags); + set_bit(SEFL_CANCEL, &sev->se_flags); + } + } else { + if (sev->se_state > SEST_JOIN_ACKWAIT) + reply.ms_status = STATUS_WAIT; + else { + reply.ms_status = STATUS_NEG; + clear_bit(SEFL_ALLOW_JOIN, + &sev->se_flags); + set_bit(SEFL_CANCEL, &sev->se_flags); + } + } + + if (test_bit(SEFL_CANCEL, &sev->se_flags)) { + set_bit(SEFL_CHECK, &sev->se_flags); + wake_serviced(DO_JOINLEAVE); + } + } + + else if (test_bit(SGFL_UEVENT, &sg->flags)) { + if (sg->uevent.ue_nodeid != nodeid) + reply.ms_status = STATUS_WAIT; + } + + } + + smsg_bswap_out(&reply); + send_nodeid_message((char *) &reply, sizeof(reply), nodeid); +} + +/* + * Each remaining node will send us a done message. We quit when we get the + * first. The subsequent done messages for the finished sevent get here and + * are ignored. + */ + +static void process_lstart_done(sm_msg_t *smsg, uint32_t nodeid) +{ + sm_sevent_t *sev; + + sev = find_sevent(smsg->ms_sevent_id); + if (!sev) + return; + + if (sev->se_state != SEST_LSTART_WAITREMOTE) + return; + + sev->se_state = SEST_LSTART_REMOTEDONE; + set_bit(SEFL_CHECK, &sev->se_flags); + wake_serviced(DO_JOINLEAVE); +} + +/* + * This function and everything it calls always runs in sm context. 
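+ * Messages arrive via sm_cluster_message() in cnxman context and are
+ * queued by add_to_recvqueue(); process_messages() hands them to this
+ * function one at a time in sm context.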
+ */ + +static void process_message(char *msg, uint32_t nodeid) +{ + sm_msg_t smsg; + + smsg_copy_in(msg, &smsg); + + switch (smsg.ms_type) { + case SMSG_JOIN_REQ: + process_join_request(&smsg, nodeid, msg + sizeof(sm_msg_t)); + break; + + case SMSG_JSTOP_REQ: + process_stop_request(&smsg, nodeid, + (uint32_t *) (msg + sizeof(sm_msg_t))); + break; + + case SMSG_LEAVE_REQ: + process_leave_request(&smsg, nodeid); + break; + + case SMSG_LSTOP_REQ: + process_stop_request(&smsg, nodeid, NULL); + break; + + case SMSG_JSTART_CMD: + case SMSG_LSTART_CMD: + process_start_request(&smsg, nodeid); + break; + + case SMSG_LSTART_DONE: + process_lstart_done(&smsg, nodeid); + break; + + case SMSG_JOIN_REP: + case SMSG_JSTOP_REP: + case SMSG_LEAVE_REP: + case SMSG_LSTOP_REP: + process_reply(&smsg, nodeid); + break; + + case SMSG_RECOVER: + process_recover_msg(&smsg, nodeid); + break; + + default: + log_print("process_message: unknown type %u nodeid %u", + smsg.ms_type, nodeid); + } +} + +/* + * Always called from sm context. + */ + +void process_messages(void) +{ + rq_entry_t *re; + + while (1) { + re = NULL; + + spin_lock(&message_lock); + if (!list_empty(&messages)) { + re = list_entry(messages.next, rq_entry_t, list); + list_del(&re->list); + } + spin_unlock(&message_lock); + + if (!re) + break; + process_message(re->msg, re->nodeid); + kfree(re->msg); + kfree(re); + schedule(); + } +} + +/* + * Context: cnxman and sm + */ + +static int add_to_recvqueue(char *msg, int len, uint32_t nodeid) +{ + rq_entry_t *re; + + SM_RETRY(re = (rq_entry_t *) kmalloc(sizeof(rq_entry_t), GFP_KERNEL), + re); + SM_RETRY(re->msg = (char *) kmalloc(len, GFP_KERNEL), re->msg); + + memcpy(re->msg, msg, len); + re->len = len; + re->nodeid = nodeid; + + spin_lock(&message_lock); + list_add_tail(&re->list, &messages); + spin_unlock(&message_lock); + + wake_serviced(DO_MESSAGES); + return 0; +} + +/* + * Context: cnxman + * Called by cnxman when a service manager message arrives. + */ + +int sm_cluster_message(char *msg, int len, char *addr, int addr_len, + unsigned int node_id) +{ + struct kcl_cluster_node kclnode; + uint32_t nodeid = 0; + int error = 0; + + if (!node_id) { + error = kcl_get_node_by_addr(addr, addr_len, &kclnode); + if (error) + return error; + nodeid = kclnode.node_id; + } else + nodeid = node_id; + + return add_to_recvqueue(msg, len, nodeid); +} + +/* + * These send routines are used by sm and are always called from sm context. 
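+ * Anything addressed to our own nodeid bypasses cnxman and is placed
+ * straight onto our local receive queue by add_to_recvqueue().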
+ */ + +int send_nodeid_message(char *msg, int len, uint32_t nodeid) +{ + int error = 0; + struct sockaddr_cl saddr; + + if (nodeid == sm_our_nodeid) { + add_to_recvqueue(msg, len, nodeid); + goto out; + } + + saddr.scl_family = AF_CLUSTER; + saddr.scl_port = CLUSTER_PORT_SERVICES; + saddr.scl_nodeid = nodeid; + error = kcl_sendmsg(sm_socket, msg, len, &saddr, + sizeof(saddr), 0); + if (error > 0) + error = 0; + + if (error) + log_print("send_nodeid_message error %d to %u", error, nodeid); + out: + return error; +} + +int send_broadcast_message(char *msg, int len) +{ + int error; + + error = kcl_sendmsg(sm_socket, msg, len, NULL, 0, 0); + if (error > 0) + error = 0; + + add_to_recvqueue(msg, len, sm_our_nodeid); + + if (error) + log_print("send_broadcast_message error %d", error); + + return error; +} + +int send_members_message(sm_group_t *sg, char *msg, int len) +{ + sm_node_t *node; + int error = 0; + + list_for_each_entry(node, &sg->memb, list) { + error = send_nodeid_message(msg, len, node->id); + if (error < 0) + break; + } + return error; +} + +int send_members_message_sev(sm_group_t *sg, char *msg, int len, + sm_sevent_t * sev) +{ + int error; + sm_msg_t *smsg = (sm_msg_t *) msg; + + set_allowed_msgtype(sev, smsg->ms_type); + sev->se_reply_count = 0; + + error = send_members_message(sg, msg, len); + if (error < 0) + clear_allowed_msgtype(sev, smsg->ms_type); + + return error; +} + +int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev) +{ + int error; + sm_msg_t *smsg = (sm_msg_t *) msg; + + set_allowed_msgtype(sev, smsg->ms_type); + sev->se_reply_count = 0; + + error = send_broadcast_message(msg, len); + if (error < 0) + clear_allowed_msgtype(sev, smsg->ms_type); + + return error; +} diff -urN linux-orig/cluster/cman/sm_message.h linux-patched/cluster/cman/sm_message.h --- linux-orig/cluster/cman/sm_message.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_message.h 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,34 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __SM_MESSAGE_DOT_H__ +#define __SM_MESSAGE_DOT_H__ + +void init_messages(void); +uint32_t sm_new_global_id(int level); +void smsg_bswap_out(sm_msg_t * smsg); +char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen, + sm_sevent_t *sev); +void process_messages(void); +int sm_cluster_message(char *msg, int len, char *addr, int addr_len, + unsigned int node_id); +int send_nodeid_message(char *msg, int len, uint32_t nodeid); +int send_broadcast_message(char *msg, int len); +int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev); +int send_members_message(sm_group_t *sg, char *msg, int len); +int send_members_message_sev(sm_group_t *sg, char *msg, int len, + sm_sevent_t * sev); +int sm_cluster_message(char *msg, int len, char *addr, int addr_len, + unsigned int node_id); + +#endif diff -urN linux-orig/cluster/cman/sm_misc.c linux-patched/cluster/cman/sm_misc.c --- linux-orig/cluster/cman/sm_misc.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_misc.c 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,369 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "sm.h" +#include "config.h" + +#define MAX_DEBUG_MSG_LEN (40) + +extern struct list_head sm_members; +static uint32_t local_ids; +static uint32_t event_id; +static spinlock_t event_id_lock; +static char * debug_buf; +static unsigned int debug_size; +static unsigned int debug_point; +static int debug_wrap; +static spinlock_t debug_lock; + + +void init_sm_misc(void) +{ + local_ids = 1; + event_id = 1; + spin_lock_init(&event_id_lock); + debug_buf = NULL; + debug_size = 0; + debug_point = 0; + debug_wrap = 0; + spin_lock_init(&debug_lock); + + sm_debug_setup(cman_config.sm_debug_size); +} + +sm_node_t *sm_new_node(uint32_t nodeid) +{ + struct kcl_cluster_node kclnode; + sm_node_t *node; + int error; + + error = kcl_get_node_by_nodeid(nodeid, &kclnode); + SM_ASSERT(!error,); + + SM_RETRY(node = (sm_node_t *) kmalloc(sizeof(sm_node_t), GFP_KERNEL), + node); + + memset(node, 0, sizeof(sm_node_t)); + node->id = nodeid; + node->incarnation = kclnode.incarnation; + return node; +} + +sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid) +{ + sm_node_t *node; + + list_for_each_entry(node, &sg->joining, list) { + if (node->id == nodeid) + return node; + } + return NULL; +} + +sm_node_t *sm_find_member(uint32_t nodeid) +{ + sm_node_t *node; + + list_for_each_entry(node, &sm_members, list) { + if (node->id == nodeid) + return node; + } + return NULL; +} + +uint32_t sm_new_local_id(int level) +{ + uint32_t id = local_ids++; + uint8_t l = (uint8_t) level; + + if (level > 0xFF) + return 0; + + if (id > 0x00FFFFFF) + return 0; + + id |= (l << 24); + return id; +} + +int sm_id_to_level(uint32_t id) +{ + uint8_t l = (id & 
0xFF000000) >> 24; + + return (int) l; +} + +void sm_set_event_id(int *id) +{ + spin_lock(&event_id_lock); + *id = event_id++; + spin_unlock(&event_id_lock); +} + +sm_group_t *sm_local_id_to_sg(int id) +{ + sm_group_t *sg; + int level = sm_id_to_level(id); + int found = FALSE; + + down(&sm_sglock); + + list_for_each_entry(sg, &sm_sg[level], list) { + if (sg->local_id == id) { + found = TRUE; + break; + } + } + up(&sm_sglock); + if (!found) + sg = NULL; + return sg; +} + +sm_group_t *sm_global_id_to_sg(int id) +{ + sm_group_t *sg; + int level = sm_id_to_level(id); + int found = FALSE; + + down(&sm_sglock); + + list_for_each_entry(sg, &sm_sg[level], list) { + if (sg->global_id == id) { + found = TRUE; + break; + } + } + up(&sm_sglock); + if (!found) + sg = NULL; + return sg; +} + +void sm_debug_log(sm_group_t *sg, const char *fmt, ...) +{ + va_list va; + int i, n, size, len; + char buf[MAX_DEBUG_MSG_LEN+1]; + + spin_lock(&debug_lock); + + if (!debug_buf) + goto out; + + size = MAX_DEBUG_MSG_LEN; + memset(buf, 0, size+1); + + n = snprintf(buf, size, "%08x ", sg->global_id); + size -= n; + + va_start(va, fmt); + vsnprintf(buf+n, size, fmt, va); + va_end(va); + + len = strlen(buf); + if (len > MAX_DEBUG_MSG_LEN-1) + len = MAX_DEBUG_MSG_LEN-1; + buf[len] = '\n'; + buf[len+1] = '\0'; + + for (i = 0; i < strlen(buf); i++) { + debug_buf[debug_point++] = buf[i]; + + if (debug_point == debug_size) { + debug_point = 0; + debug_wrap = 1; + } + } + out: + spin_unlock(&debug_lock); +} + +void sm_debug_setup(int size) +{ + char *b = kmalloc(size, GFP_KERNEL); + + spin_lock(&debug_lock); + if (debug_buf) + kfree(debug_buf); + + if (size > PAGE_SIZE) + size = PAGE_SIZE; + debug_size = size; + debug_point = 0; + debug_wrap = 0; + debug_buf = b; + memset(debug_buf, 0, debug_size); + spin_unlock(&debug_lock); +} + +#ifdef CONFIG_PROC_FS + +int sm_debug_info(char *b, char **start, off_t offset, int length) +{ + int i, n = 0; + + spin_lock(&debug_lock); + + if (debug_wrap) { + for (i = debug_point; i < debug_size; i++) + n += sprintf(b + n, "%c", debug_buf[i]); + } + for (i = 0; i < debug_point; i++) + n += sprintf(b + n, "%c", debug_buf[i]); + + spin_unlock(&debug_lock); + + return n; +} + +int sm_procdata(char *b, char **start, off_t offset, int length) +{ + sm_group_t *sg; + sm_node_t *node; + int n = 0, level, i; + + n += sprintf(b + n, "\n"); + + /* + * Header + */ + + n += sprintf(b + n, + "Service Name GID LID State Code\n"); + + down(&sm_sglock); + + for (level = 0; level < SG_LEVELS; level++) { + list_for_each_entry(sg, &sm_sg[level], list) { + + /* + * Cluster Service + */ + + switch (level) { + case SERVICE_LEVEL_FENCE: + n += sprintf(b + n, "Fence Domain: "); + break; + case SERVICE_LEVEL_GDLM: + n += sprintf(b + n, "DLM Lock Space: "); + break; + case SERVICE_LEVEL_GFS: + n += sprintf(b + n, "GFS Mount Group: "); + break; + case SERVICE_LEVEL_USER: + n += sprintf(b + n, "User: "); + break; + } + + /* + * Name + */ + + n += sprintf(b + n, "\""); + for (i = 0; i < sg->namelen; i++) + n += sprintf(b + n, "%c", sg->name[i]); + n += sprintf(b + n, "\""); + + for (; i < MAX_SERVICE_NAME_LEN-1; i++) + n += sprintf(b + n, " "); + + /* + * GID LID (sans level from top byte) + */ + + n += sprintf(b + n, "%3u %3u ", + (sg->global_id & 0x00FFFFFF), + (sg->local_id & 0x00FFFFFF)); + + /* + * State + */ + + switch (sg->state) { + case SGST_NONE: + n += sprintf(b + n, "none "); + break; + case SGST_JOIN: + n += sprintf(b + n, "join "); + break; + case SGST_RUN: + n += sprintf(b + n, "run "); + break; + case 
SGST_RECOVER: + n += sprintf(b + n, "recover %u ", + sg->recover_state); + break; + case SGST_UEVENT: + n += sprintf(b + n, "update "); + break; + } + + /* + * Code + */ + + if (test_bit(SGFL_SEVENT, &sg->flags)) + n += sprintf(b + n, "S"); + if (test_bit(SGFL_UEVENT, &sg->flags)) + n += sprintf(b + n, "U"); + if (test_bit(SGFL_NEED_RECOVERY, &sg->flags)) + n += sprintf(b + n, "N"); + + n += sprintf(b + n, "-"); + + if (test_bit(SGFL_SEVENT, &sg->flags) + && sg->sevent) { + n += sprintf(b + n, "%u,%lx,%u", + sg->sevent->se_state, + sg->sevent->se_flags, + sg->sevent->se_reply_count); + } + + if (test_bit(SGFL_UEVENT, &sg->flags)) { + n += sprintf(b + n, "%u,%lx,%u", + sg->uevent.ue_state, + sg->uevent.ue_flags, + sg->uevent.ue_nodeid); + } + + n += sprintf(b + n, "\n"); + + /* + * node list + */ + + i = 0; + + n += sprintf(b + n, "["); + + list_for_each_entry(node, &sg->memb, list) { + if (i && !(i % 24)) + n += sprintf(b + n, "\n"); + + if (i) + n += sprintf(b + n, " "); + + n += sprintf(b + n, "%u", node->id); + i++; + } + + n += sprintf(b + n, "]\n\n"); + } + } + + up(&sm_sglock); + + return n; +} +#endif diff -urN linux-orig/cluster/cman/sm_misc.h linux-patched/cluster/cman/sm_misc.h --- linux-orig/cluster/cman/sm_misc.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_misc.h 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,29 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __SM_MISC_DOT_H__ +#define __SM_MISC_DOT_H__ + +void init_sm_misc(void); +sm_node_t *sm_new_node(uint32_t nodeid); +sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid); +sm_node_t *sm_find_member(uint32_t nodeid); +uint32_t sm_new_local_id(int level); +int sm_id_to_level(uint32_t id); +void sm_set_event_id(int *id); +sm_group_t *sm_local_id_to_sg(int id); +sm_group_t *sm_global_id_to_sg(int id); +void sm_debug_log(sm_group_t *sg, const char *fmt, ...); +void sm_debug_setup(int size); + +#endif diff -urN linux-orig/cluster/cman/sm_recover.c linux-patched/cluster/cman/sm_recover.c --- linux-orig/cluster/cman/sm_recover.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_recover.c 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,522 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include "sm.h" +#include "config.h" + +/* + * A collection of sg's which need to be recovered due to a failed member. + * These sg's are recovered in order of level. An sg subject to cascading + * failures is moved from one of these structs to a newer one. + */ + +struct recover { + struct list_head list; /* list of current re's */ + struct list_head sgs[SG_LEVELS]; /* lists of sg's by level */ + int event_id; /* event id */ + int cur_level; +}; +typedef struct recover recover_t; + + +extern uint32_t * sm_new_nodeids; +extern int sm_quorum, sm_quorum_next; +extern uint32_t sm_our_nodeid; +extern struct list_head sm_members; +extern int sm_member_count; +static struct list_head recoveries; + + +void init_recovery(void) +{ + INIT_LIST_HEAD(&recoveries); +} + +/* + * This is the first thing called when a change is announced in cluster + * membership. Nodes are marked as being a CLUSTER_MEMBER or not. SM adds new + * nodes to its sm_members list which it's not seen before. Nodes which were + * alive but are now gone are marked as "need recovery". + * + * The "need recovery" status of nodes is propagated to the node's SG's in + * mark_effected_sgs. The effected SG's are themselves marked as needing + * recovery and in new_recovery the dead nodes are removed from the SG's + * individual member lists. The "need recovery" status of nodes is cleared in + * adjust_members_done(). + */ + +static int adjust_members(void) +{ + sm_node_t *node; + struct kcl_cluster_node knode; + int i, error, num_nodes, sub = 0, add = 0, found; + + /* + * Get list of current members from cnxman + */ + + memset(sm_new_nodeids, 0, cman_config.max_nodes * sizeof(uint32_t)); + num_nodes = kcl_get_member_ids(sm_new_nodeids, cman_config.max_nodes); + + /* + * Determine who's gone + */ + + list_for_each_entry(node, &sm_members, list) { + found = FALSE; + for (i = 0; i < num_nodes; i++) { + if (node->id == sm_new_nodeids[i]) { + found = TRUE; + sm_new_nodeids[i] = 0; + break; + } + } + + if (found) { + error = kcl_get_node_by_nodeid(node->id, &knode); + SM_ASSERT(!error, printk("error=%d\n", error);); + + if (!test_bit(SNFL_CLUSTER_MEMBER, &node->flags)) { + /* former member is back */ + set_bit(SNFL_CLUSTER_MEMBER, &node->flags); + node->incarnation = knode.incarnation; + add++; + } else { + /* current member is still alive - if the + * incarnation number is different it died and + * returned between checks */ + if (node->incarnation != knode.incarnation) { + set_bit(SNFL_NEED_RECOVERY, + &node->flags); + node->incarnation = knode.incarnation; + sub++; + } + } + } else { + /* current member has died */ + if (test_and_clear_bit(SNFL_CLUSTER_MEMBER, + &node->flags)) { + set_bit(SNFL_NEED_RECOVERY, &node->flags); + sub++; + } + } + } + + /* + * Look for new nodes + */ + + for (i = 0; i < num_nodes; i++) { + if (sm_new_nodeids[i]) { + node = sm_new_node(sm_new_nodeids[i]); + set_bit(SNFL_CLUSTER_MEMBER, &node->flags); + add++; + list_add_tail(&node->list, &sm_members); + sm_member_count++; + } + } + + /* + * Get our own nodeid + */ + + if (!sm_our_nodeid) { + list_for_each_entry(node, &sm_members, list) { + error = kcl_get_node_by_nodeid(node->id, &knode); + SM_ASSERT(!error, printk("error=%d\n", error);); + + if (knode.us) { + sm_our_nodeid = knode.node_id; + break; + } + } + } + + return sub; +} + +/* + * Given some number of dead nodes, flag SG's the dead nodes 
were part of. + * This requires a number of loops because each node structure does not keep a + * list of SG's it's in. + */ + +static int mark_effected_sgs(void) +{ + sm_group_t *sg; + sm_node_t *node, *sgnode; + uint32_t dead_id; + int i, effected = 0; + + down(&sm_sglock); + + list_for_each_entry(node, &sm_members, list) { + if (!test_bit(SNFL_NEED_RECOVERY, &node->flags)) + continue; + + dead_id = node->id; + + for (i = 0; i < SG_LEVELS; i++) { + list_for_each_entry(sg, &sm_sg[i], list) { + /* check if dead node is among sg's members */ + list_for_each_entry(sgnode, &sg->memb, list) { + if (sgnode->id == dead_id) { + set_bit(SGFL_NEED_RECOVERY, + &sg->flags); + effected++; + break; + } + } + } + } + } + up(&sm_sglock); + + return effected; +} + +static recover_t *alloc_recover(void) +{ + recover_t *rev; + int i; + + SM_RETRY(rev = kmalloc(sizeof(recover_t), GFP_KERNEL), rev); + + memset(rev, 0, sizeof(recover_t)); + + sm_set_event_id(&rev->event_id); + + for (i = 0; i < SG_LEVELS; i++) { + INIT_LIST_HEAD(&rev->sgs[i]); + } + + return rev; +} + +/* + * An in-progress revent re-start for an SG is interrupted by another node + * failure in the SG. Cancel an outstanding barrier if there is one. The SG + * will be moved to the new revent and re-started as part of that. + */ + +static void cancel_prev_recovery(sm_group_t *sg) +{ + int error; + + if (sg->recover_state == RECOVER_BARRIERWAIT) { + error = kcl_barrier_cancel(sg->recover_barrier); + if (error) + log_error(sg, "cancel_prev_recovery: error %d", error); + } +} + +static void pre_recover_sg(sm_group_t *sg, recover_t *rev) +{ + if (sg->state == SGST_RECOVER) { + cancel_prev_recovery(sg); + list_del(&sg->recover_list); + } + + sg->ops->stop(sg->service_data); + sg->state = SGST_RECOVER; + sg->recover_state = RECOVER_NONE; + sg->recover_data = rev; + list_add(&sg->recover_list, &rev->sgs[sg->level]); +} + +/* + * When adjust_members finds that some nodes are dead and mark_effected_sgs + * finds that some SG's are effected by departed nodes, this is called to + * collect together the SG's which need to be recovered. An revent (recovery + * event) is the group of effected SG's. + */ + +static int new_recovery(void) +{ + sm_group_t *sg; + recover_t *rev; + sm_node_t *node, *sgnode, *safe; + int i; + + rev = alloc_recover(); + list_add_tail(&rev->list, &recoveries); + + down(&sm_sglock); + + /* + * Stop effected SG's and add them to the rev + */ + + for (i = 0; i < SG_LEVELS; i++) { + list_for_each_entry(sg, &sm_sg[i], list) { + if (test_and_clear_bit(SGFL_NEED_RECOVERY, &sg->flags)){ + if (sg->state == SGST_JOIN) + continue; + pre_recover_sg(sg, rev); + } + } + } + + /* + * For an SG needing recovery, remove dead nodes from sg->memb list + */ + + for (i = 0; i < SG_LEVELS; i++) { + list_for_each_entry(sg, &rev->sgs[i], recover_list) { + + /* Remove dead members from SG's member list */ + list_for_each_entry_safe(sgnode, safe, &sg->memb, list){ + + node = sm_find_member(sgnode->id); + SM_ASSERT(node, printk("id %u\n", sgnode->id);); + + if (test_bit(SNFL_NEED_RECOVERY, &node->flags)){ + list_del(&sgnode->list); + kfree(sgnode); + sg->memb_count--; + log_debug(sg, "remove node %u count %d", + sgnode->id, sg->memb_count); + } + } + } + } + + up(&sm_sglock); + rev->cur_level = 0; + return 0; +} + +/* + * The NEED_RECOVERY bit on MML nodes is set in adjust_members() and is used in + * mark_effected_sgs() and add_revent(). After that, we're done using the bit + * and we clear it here. 
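+ * (The analogous per-SG flag, SGFL_NEED_RECOVERY, is cleared in
+ * new_recovery() as each SG is moved onto a recovery event.)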
+ */ + +static void adjust_members_done(void) +{ + sm_node_t *node; + + list_for_each_entry(node, &sm_members, list) + clear_bit(SNFL_NEED_RECOVERY, &node->flags); +} + +/* + * Start the service of the given SG. The service must be given an array of + * nodeids specifying the new sg membership. The service is responsible to + * free this chunk of memory when done with it. + */ + +static void start_sg(sm_group_t *sg, uint32_t event_id) +{ + sm_node_t *node; + uint32_t *memb; + int count = 0; + + SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL), + memb); + + list_for_each_entry(node, &sg->memb, list) + memb[count++] = node->id; + + sg->ops->start(sg->service_data, memb, count, event_id, + SERVICE_NODE_FAILED); +} + +static void recovery_barrier(sm_group_t *sg) +{ + char bname[MAX_BARRIER_NAME_LEN]; + int error, len; + + memset(bname, 0, MAX_BARRIER_NAME_LEN); + + /* bypass the barrier if we're the only member */ + if (sg->memb_count == 1) { + process_recovery_barrier(sg, 0); + return; + } + + len = snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.RECOV.%u", + sg->global_id, sg->recover_stop, sg->memb_count); + + /* We save this barrier name so we can cancel it if needed. */ + memset(sg->recover_barrier, 0, MAX_BARRIER_NAME_LEN); + memcpy(sg->recover_barrier, bname, len); + + error = sm_barrier(bname, sg->memb_count, SM_BARRIER_RECOVERY); + if (error) + log_error(sg, "recovery_barrier error %d: %s", error, bname); +} + +static void recover_sg(sm_group_t *sg, int event_id) +{ + log_debug(sg, "recover state %d", sg->recover_state); + + switch (sg->recover_state) { + + case RECOVER_NONE: + /* must wait for recovery to stop sg on all nodes */ + sg->recover_state = RECOVER_BARRIERWAIT; + sg->recover_stop = 0; + recovery_barrier(sg); + break; + + case RECOVER_BARRIERWAIT: + break; + + case RECOVER_STOP: + /* barrier callback sets state STOP */ + sg->recover_stop = 1; + sg->recover_state = RECOVER_START; + start_sg(sg, event_id); + break; + + case RECOVER_START: + break; + + case RECOVER_STARTDONE: + /* service callback sets state STARTDONE */ + sg->recover_state = RECOVER_BARRIERWAIT; + recovery_barrier(sg); + break; + + case RECOVER_BARRIERDONE: + /* barrier callback sets state BARRIERDONE */ + sg->ops->finish(sg->service_data, event_id); + list_del(&sg->recover_list); + sg->recover_state = RECOVER_NONE; + sg->state = SGST_RUN; + + /* Continue a previous, interrupted attempt to leave the sg */ + if (sg->sevent) { + clear_bit(SEFL_DELAY, &sg->sevent->se_flags); + set_bit(SEFL_CHECK, &sg->sevent->se_flags); + wake_serviced(DO_JOINLEAVE); + } + break; + + default: + log_error(sg, "invalid recover_state %u", sg->recover_state); + } +} + +static void recover_level(recover_t *rev, int level) +{ + sm_group_t *sg, *safe; + + list_for_each_entry_safe(sg, safe, &rev->sgs[level], recover_list) + recover_sg(sg, rev->event_id); +} + +static void recover_levels(recover_t *rev) +{ + for (;;) { + recover_level(rev, rev->cur_level); + + if (list_empty(&rev->sgs[rev->cur_level])) { + if (rev->cur_level == SG_LEVELS - 1) { + list_del(&rev->list); + kfree(rev); + return; + } + rev->cur_level++; + continue; + } + break; + } +} + +/* + * Called by SM thread when the cluster is quorate. It restarts + * SG's that were stopped in new_recovery() due to a member death. + * It waits for all SG's at level N to complete restart before + * restarting SG's at level N+1. 
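+ * recover_levels() only advances rev->cur_level once the list of SG's
+ * at the current level is empty; recover_sg() removes an SG from that
+ * list when it reaches RECOVER_BARRIERDONE.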
+ */ + +void process_recoveries(void) +{ + recover_t *rev, *safe; + + down(&sm_sglock); + list_for_each_entry_safe(rev, safe, &recoveries, list) + recover_levels(rev); + up(&sm_sglock); +} + +/* + * The cnxman membership has changed. Check if there's still quorum and + * whether any nodes have died. If nodes have died, initiate recovery on any + * SG's they were in. This begins immediately if the cluster remains quorate; + * if not this waits until the cluster regains quorum. + */ + +void process_nodechange(void) +{ + int gone, effected; + + if ((sm_quorum = sm_quorum_next)) + wake_serviced(DO_RUN); + + gone = adjust_members(); + if (gone > 0) { + effected = mark_effected_sgs(); + + backout_sevents(); + cancel_uevents(&effected); + + if (effected > 0) { + new_recovery(); + wake_serviced(DO_RECOVERIES); + } + } + adjust_members_done(); +} + +int check_recovery(sm_group_t *sg, int event_id) +{ + if (sg->state == SGST_RECOVER) { + recover_t *rev = (recover_t *) sg->recover_data; + if (rev && rev->event_id == event_id) + return 1; + } + return 0; +} + +void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid) +{ + sm_group_t *sg; + recover_t *rev; + + sg = sm_global_id_to_sg(smsg->ms_global_sgid); + if (!sg) { + log_print("process_recover_msg: unknown sg id %x", + smsg->ms_global_sgid); + return; + } + + /* we already know about the recovery and can ignore the msg */ + if (sg->state == SGST_RECOVER) + return; + + if (test_bit(SGFL_UEVENT, &sg->flags)) { + /* we will initiate recovery on our own if we know about the + uevent so we can ignore this */ + log_debug(sg, "process_recover_msg: ignore from %u", nodeid); + return; + } + + log_debug(sg, "recovery initiated by msg from %u", nodeid); + rev = alloc_recover(); + list_add_tail(&rev->list, &recoveries); + pre_recover_sg(sg, rev); + wake_serviced(DO_RECOVERIES); +} diff -urN linux-orig/cluster/cman/sm_recover.h linux-patched/cluster/cman/sm_recover.h --- linux-orig/cluster/cman/sm_recover.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_recover.h 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,23 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __SM_RECOVER_DOT_H__ +#define __SM_RECOVER_DOT_H__ + +void init_recovery(void); +void process_recoveries(void); +void process_nodechange(void); +int check_recovery(sm_group_t *sg, int event_id); +void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid); + +#endif diff -urN linux-orig/cluster/cman/sm_services.c linux-patched/cluster/cman/sm_services.c --- linux-orig/cluster/cman/sm_services.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_services.c 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,418 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 
+** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include "sm.h" + +static struct list_head callbacks; +static spinlock_t callback_lock; +static struct list_head sg_registered[SG_LEVELS]; + +/* + * These are the functions to register, join, leave, unregister, callback + * with/to the sm. + */ + +struct sc_entry { + struct list_head list; + uint32_t local_id; + int event_id; +}; +typedef struct sc_entry sc_entry_t; + +void init_services(void) +{ + int i; + + INIT_LIST_HEAD(&callbacks); + spin_lock_init(&callback_lock); + + for (i = 0; i < SG_LEVELS; i++) { + INIT_LIST_HEAD(&sm_sg[i]); + INIT_LIST_HEAD(&sg_registered[i]); + } + init_MUTEX(&sm_sglock); +} + +/* Context: service */ + +int kcl_register_service(char *name, int namelen, int level, + struct kcl_service_ops *ops, int unique, + void *servicedata, uint32_t *service_id) +{ + sm_group_t *sg; + int found = FALSE; + int error = -EINVAL; + + if (level > SG_LEVELS - 1) + goto fail; + + if (namelen > MAX_SERVICE_NAME_LEN) + goto fail; + + error = kcl_addref_cluster(); + if (error) + goto fail; + + down(&sm_sglock); + + list_for_each_entry(sg, &sm_sg[level], list) { + if ((sg->namelen == namelen) && + (!strncmp(sg->name, name, namelen))) { + found = TRUE; + goto next; + } + } + + list_for_each_entry(sg, &sg_registered[level], list) { + if ((sg->namelen == namelen) && + (!strncmp(sg->name, name, namelen))) { + found = TRUE; + goto next; + } + } + + next: + + if (found && unique) { + error = -EEXIST; + goto fail_unlock; + } + + if (found) { + sg->refcount++; + goto out; + } + + sg = (sm_group_t *) kmalloc(sizeof(sm_group_t) + namelen, GFP_KERNEL); + if (!sg) { + error = -ENOMEM; + goto fail_unlock; + } + memset(sg, 0, sizeof(sm_group_t) + namelen); + + sg->refcount = 1; + sg->service_data = servicedata; + sg->ops = ops; + sg->level = level; + sg->namelen = namelen; + memcpy(sg->name, name, namelen); + sg->local_id = sm_new_local_id(level); + sg->state = SGST_NONE; + INIT_LIST_HEAD(&sg->memb); + INIT_LIST_HEAD(&sg->joining); + init_completion(&sg->event_comp); + + list_add_tail(&sg->list, &sg_registered[level]); + + out: + *service_id = sg->local_id; + up(&sm_sglock); + return 0; + + fail_unlock: + up(&sm_sglock); + kcl_releaseref_cluster(); + fail: + return error; +} + +/* Context: service */ + +void kcl_unregister_service(uint32_t local_id) +{ + sm_group_t *sg; + int level = sm_id_to_level(local_id); + + down(&sm_sglock); + + list_for_each_entry(sg, &sg_registered[level], list) { + if (sg->local_id == local_id) { + SM_ASSERT(sg->refcount,); + sg->refcount--; + + if (!sg->refcount) { + list_del(&sg->list); + kfree(sg); + } + kcl_releaseref_cluster(); + break; + } + } + up(&sm_sglock); +} + +/* Context: service */ + +int kcl_join_service(uint32_t local_id) +{ + sm_group_t *sg; + sm_sevent_t *sev; + int level = sm_id_to_level(local_id); + int error, found = FALSE; + + down(&sm_sglock); + + list_for_each_entry(sg, &sg_registered[level], list) { + if (sg->local_id == local_id) { + found = TRUE; + break; + } + } + + if (!found) { + up(&sm_sglock); + error = -ENOENT; + goto out; + } + + if (sg->state != SGST_NONE) { + up(&sm_sglock); + error = -EINVAL; + goto out; + } + + sg->state = 
SGST_JOIN; + set_bit(SGFL_SEVENT, &sg->flags); + list_del(&sg->list); + list_add_tail(&sg->list, &sm_sg[sg->level]); + + up(&sm_sglock); + + /* + * The join is a service event which will be processed asynchronously. + */ + + sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL); + if (!sev) { + error = -ENOMEM; + goto out; + } + + memset(sev, 0, sizeof (sm_sevent_t)); + sev->se_state = SEST_JOIN_BEGIN; + sev->se_sg = sg; + sg->sevent = sev; + sm_set_event_id(&sev->se_id); + + new_joinleave(sev); + wait_for_completion(&sg->event_comp); + error = 0; + + out: + return error; +} + +/* Context: service */ + +int kcl_leave_service(uint32_t local_id) +{ + sm_group_t *sg = NULL; + sm_sevent_t *sev; + int error; + + error = -ENOENT; + sg = sm_local_id_to_sg(local_id); + if (!sg) + goto out; + + /* sg was never joined */ + error = -EINVAL; + if (sg->state == SGST_NONE) + goto out; + + /* may still be joining */ + error = -EBUSY; + if (test_and_set_bit(SGFL_SEVENT, &sg->flags)) + goto out; + + error = -ENOMEM; + sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL); + if (!sev) + goto out; + + memset(sev, 0, sizeof (sm_sevent_t)); + sev->se_state = SEST_LEAVE_BEGIN; + set_bit(SEFL_LEAVE, &sev->se_flags); + sev->se_sg = sg; + sg->sevent = sev; + sm_set_event_id(&sev->se_id); + + new_joinleave(sev); + wait_for_completion(&sg->event_comp); + error = 0; + + down(&sm_sglock); + list_del(&sg->list); + list_add_tail(&sg->list, &sg_registered[sg->level]); + up(&sm_sglock); + + out: + return error; +} + +static void process_callback(uint32_t local_id, int event_id) +{ + sm_group_t *sg; + sm_sevent_t *sev; + sm_uevent_t *uev; + + sg = sm_local_id_to_sg(local_id); + if (!sg) + return; + + if (sg->state == SGST_RECOVER) { + if (!check_recovery(sg, event_id)) { + log_error(sg, "process_callback invalid recover " + "event id %d", event_id); + return; + } + + if (sg->recover_state == RECOVER_START) + sg->recover_state = RECOVER_STARTDONE; + else + log_error(sg, "process_callback recover state %u", + sg->recover_state); + wake_serviced(DO_RECOVERIES); + } + + else if (test_bit(SGFL_SEVENT, &sg->flags) && sg->sevent && + (sg->sevent->se_id == event_id)) { + sev = sg->sevent; + + if (test_and_clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags) && + (sev->se_state == SEST_JSTART_SERVICEWAIT)) + sev->se_state = SEST_JSTART_SERVICEDONE; + + set_bit(SEFL_CHECK, &sev->se_flags); + wake_serviced(DO_JOINLEAVE); + } + + else if (test_bit(SGFL_UEVENT, &sg->flags) && + (sg->uevent.ue_id == event_id)) { + uev = &sg->uevent; + + if (test_and_clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags)) { + if (uev->ue_state == UEST_JSTART_SERVICEWAIT) + uev->ue_state = UEST_JSTART_SERVICEDONE; + else if (uev->ue_state == UEST_LSTART_SERVICEWAIT) + uev->ue_state = UEST_LSTART_SERVICEDONE; + } + set_bit(UEFL_CHECK, &uev->ue_flags); + wake_serviced(DO_MEMBERSHIP); + } + + else + log_error(sg, "ignoring service callback id=%x event=%u", + local_id, event_id); +} + +void process_callbacks(void) +{ + sc_entry_t *se; + + while (1) { + se = NULL; + + spin_lock(&callback_lock); + if (!list_empty(&callbacks)) { + se = list_entry(callbacks.next, sc_entry_t, list); + list_del(&se->list); + } + spin_unlock(&callback_lock); + + if (!se) + break; + process_callback(se->local_id, se->event_id); + kfree(se); + schedule(); + } +} + +/* Context: service */ + +void kcl_start_done(uint32_t local_id, int event_id) +{ + sc_entry_t *se; + + SM_RETRY(se = kmalloc(sizeof(sc_entry_t), GFP_KERNEL), se); + + se->local_id = local_id; + se->event_id = event_id; + + 
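+	/* Queue the completion for the sm thread, which picks it up
+	 * in process_callbacks(). */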
spin_lock(&callback_lock); + list_add_tail(&se->list, &callbacks); + spin_unlock(&callback_lock); + + wake_serviced(DO_CALLBACKS); +} + +/* Context: service */ + +void kcl_global_service_id(uint32_t local_id, uint32_t *global_id) +{ + sm_group_t *sg = sm_local_id_to_sg(local_id); + + if (!sg) + log_print("kcl_global_service_id: can't find %x", local_id); + else + *global_id = sg->global_id; +} + +static void copy_to_service(sm_group_t *sg, struct kcl_service *s) +{ + s->level = sg->level; + s->local_id = sg->local_id; + s->global_id = sg->global_id; + s->node_count = sg->memb_count; + strcpy(s->name, sg->name); +} + +int kcl_get_services(struct list_head *head, int level) +{ + sm_group_t *sg; + struct kcl_service *s; + int error = -ENOMEM, count = 0; + + down(&sm_sglock); + + list_for_each_entry(sg, &sg_registered[level], list) { + if (head) { + s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL); + if (!s) + goto out; + copy_to_service(sg, s); + list_add(&s->list, head); + } + count++; + } + + list_for_each_entry(sg, &sm_sg[level], list) { + if (head) { + s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL); + if (!s) + goto out; + copy_to_service(sg, s); + list_add(&s->list, head); + } + count++; + } + + error = count; + out: + up(&sm_sglock); + return error; +} + +/* These three global variables listed in extern form in sm.h. */ +struct list_head sm_sg[SG_LEVELS]; +struct semaphore sm_sglock; diff -urN linux-orig/cluster/cman/sm_services.h linux-patched/cluster/cman/sm_services.h --- linux-orig/cluster/cman/sm_services.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_services.h 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,20 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __SM_SERVICES_DOT_H__ +#define __SM_SERVICES_DOT_H__ + +void init_services(void); +void process_callbacks(void); + +#endif diff -urN linux-orig/cluster/cman/sm_user.c linux-patched/cluster/cman/sm_user.c --- linux-orig/cluster/cman/sm_user.c 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_user.c 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,569 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include "sm.h" +#include "cnxman-private.h" + +void copy_to_usernode(struct cluster_node *node, struct cl_cluster_node *unode); + +#define UST_REGISTER 1 +#define UST_UNREGISTER 2 +#define UST_JOIN 3 +#define UST_LEAVE 4 +#define UST_JOINED 5 + +struct event { + struct list_head list; + service_event_t type; + service_start_t start_type; + unsigned int event_id; + unsigned int last_stop; + unsigned int last_start; + unsigned int last_finish; + unsigned int node_count; + uint32_t * nodeids; +}; +typedef struct event event_t; + +struct user_service { + uint32_t local_id; + pid_t pid; + int signal; + struct socket * sock; + uint8_t state; + uint8_t async; + struct semaphore lock; + struct list_head events; + spinlock_t event_lock; + unsigned int last_stop; + unsigned int last_start; + unsigned int last_finish; + unsigned int need_startdone; + unsigned int node_count; + uint32_t * nodeids; + int name_len; + char name[MAX_SERVICE_NAME_LEN]; +}; +typedef struct user_service user_service_t; + + +static void add_event(user_service_t *us, event_t *ev) +{ + spin_lock(&us->event_lock); + list_add_tail(&ev->list, &us->events); + + switch(ev->type) { + case SERVICE_EVENT_STOP: + us->last_stop = us->last_start; + break; + case SERVICE_EVENT_START: + us->last_start = ev->event_id; + break; + case SERVICE_EVENT_FINISH: + us->last_finish = ev->event_id; + break; + case SERVICE_EVENT_LEAVEDONE: + break; + } + spin_unlock(&us->event_lock); +} + +static event_t *get_event(user_service_t *us) +{ + event_t *ev = NULL; + + spin_lock(&us->event_lock); + if (!list_empty(&us->events)) { + ev = list_entry(us->events.next, event_t, list); + ev->last_stop = us->last_stop; + ev->last_start = us->last_start; + ev->last_finish = us->last_finish; + } + spin_unlock(&us->event_lock); + return ev; +} + +static void del_event(user_service_t *us, event_t *ev) +{ + spin_lock(&us->event_lock); + list_del(&ev->list); + spin_unlock(&us->event_lock); +} + +static event_t *alloc_event(void) +{ + event_t *ev; + SM_RETRY(ev = (event_t *) kmalloc(sizeof(event_t), GFP_KERNEL), ev); + memset(ev, 0, sizeof(event_t)); + return ev; +} + +/* us->lock must be held before calling */ +static void user_notify(user_service_t *us) +{ + if (us->sock) + queue_oob_skb(us->sock, CLUSTER_OOB_MSG_SERVICEEVENT); + if (us->pid && us->signal) + kill_proc(us->pid, us->signal, 0); +} + +static service_start_t start_type(int type) +{ + switch (type) { + case SERVICE_NODE_FAILED: + return SERVICE_START_FAILED; + case SERVICE_NODE_JOIN: + return SERVICE_START_JOIN; + case SERVICE_NODE_LEAVE: + return SERVICE_START_LEAVE; + } + return 0; +} + +static int user_stop(void *servicedata) +{ + user_service_t *us = (user_service_t *) servicedata; + event_t *ev; + + down(&us->lock); + if (!us->sock) + goto out; + + ev = alloc_event(); + ev->type = SERVICE_EVENT_STOP; + + add_event(us, ev); + user_notify(us); + out: + up(&us->lock); + return 0; +} + +static int user_start(void *servicedata, uint32_t *nodeids, int count, + int event_id, int type) +{ + user_service_t *us = (user_service_t *) servicedata; + event_t *ev; + + down(&us->lock); + if (!us->sock) { + kcl_start_done(us->local_id, event_id); + goto out; + } + + us->need_startdone = event_id; + + ev = alloc_event(); + ev->type = SERVICE_EVENT_START; + ev->node_count = count; + ev->start_type = start_type(type); + ev->event_id = event_id; + 
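+	/* The event takes ownership of the nodeids array; it is
+	 * kfree()d with the event, e.g. in user_unregister(). */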
ev->nodeids = nodeids; + + add_event(us, ev); + user_notify(us); + out: + up(&us->lock); + return 0; +} + +static void user_finish(void *servicedata, int event_id) +{ + user_service_t *us = (user_service_t *) servicedata; + event_t *ev; + + down(&us->lock); + if (!us->sock) + goto out; + + ev = alloc_event(); + ev->type = SERVICE_EVENT_FINISH; + ev->event_id = event_id; + + add_event(us, ev); + user_notify(us); + out: + up(&us->lock); +} + +struct kcl_service_ops user_service_ops = { + .stop = user_stop, + .start = user_start, + .finish = user_finish +}; + +static int user_register(char *u_name, user_service_t **us_data) +{ + user_service_t *us; + char name[MAX_SERVICE_NAME_LEN+1]; + int len, error; + + memset(name, 0, MAX_SERVICE_NAME_LEN+1); + + if (copy_from_user(&name, u_name, MAX_SERVICE_NAME_LEN)) + return -EFAULT; + + len = strlen(name); + if (len > MAX_SERVICE_NAME_LEN) + return -ENAMETOOLONG; + if (!len) + return -EINVAL; + + us = kmalloc(sizeof(user_service_t), GFP_KERNEL); + if (!us) + return -ENOMEM; + memset(us, 0, sizeof(user_service_t)); + us->nodeids = NULL; + INIT_LIST_HEAD(&us->events); + spin_lock_init(&us->event_lock); + init_MUTEX(&us->lock); + us->name_len = len; + memcpy(us->name, name, len); + + error = kcl_register_service(name, len, SERVICE_LEVEL_USER, + &user_service_ops, TRUE, (void *) us, + &us->local_id); + if (error) { + kfree(us); + us = NULL; + } + *us_data = us; + return error; +} + +static void user_unregister(user_service_t *us) +{ + event_t *ev; + + kcl_unregister_service(us->local_id); + + if (us->nodeids) + kfree(us->nodeids); + + while ((ev = get_event(us))) { + del_event(us, ev); + if (ev->nodeids) + kfree(ev->nodeids); + kfree(ev); + } +} + +static int user_join_async(void *arg) +{ + user_service_t *us = arg; + int user_gone = 0; + + daemonize("cman_userjoin"); + + kcl_join_service(us->local_id); + + down(&us->lock); + us->state = UST_JOINED; + us->async = 0; + if (!us->sock) { + if (us->need_startdone) + kcl_start_done(us->local_id, us->need_startdone); + user_gone = 1; + } + up(&us->lock); + + if (user_gone) { + kcl_leave_service(us->local_id); + user_unregister(us); + kfree(us); + } + return 0; +} + +static int user_leave_async(void *arg) +{ + user_service_t *us = arg; + + daemonize("cman_userleave"); + + kcl_leave_service(us->local_id); + + down(&us->lock); + us->async = 0; + if (!us->sock) { + user_unregister(us); + kfree(us); + } else { + event_t *ev = alloc_event(); + ev->type = SERVICE_EVENT_LEAVEDONE; + add_event(us, ev); + user_notify(us); + up(&us->lock); + } + + return 0; +} + +static int user_join(user_service_t *us, int wait) +{ + int error = 0; + + if (wait) { + error = kcl_join_service(us->local_id); + us->state = UST_JOINED; + } + else { + us->async = 1; + kernel_thread(user_join_async, us, 0); + } + + return error; +} + +static void user_leave(user_service_t *us, int wait) +{ + if (wait) + kcl_leave_service(us->local_id); + else { + us->async = 1; + kernel_thread(user_leave_async, us, 0); + } +} + +static int user_start_done(user_service_t *us, unsigned int event_id) +{ + if (!us->need_startdone) + return -EINVAL; + if (us->need_startdone == event_id) + us->need_startdone = 0; + kcl_start_done(us->local_id, event_id); + return 0; +} + +static void user_set_signal(user_service_t *us, int signal) +{ + us->pid = current->pid; + us->signal = signal; +} + +static int user_get_event(user_service_t *us, + struct cl_service_event *user_event) +{ + event_t *ev; + struct cl_service_event event; + + ev = get_event(us); + if (!ev) + return 0; 
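+ + /* Copy the event out to the user before dequeueing it; if the + * copy fails the event stays queued to be read again. */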
+ + event.type = ev->type; + event.start_type = ev->start_type; + event.event_id = ev->event_id; + event.last_stop = ev->last_stop; + event.last_start = ev->last_start; + event.last_finish = ev->last_finish; + event.node_count = ev->node_count; + + if (copy_to_user(user_event, &event, sizeof(struct cl_service_event))) + return -EFAULT; + + del_event(us, ev); + + if (ev->type == SERVICE_EVENT_START) { + if (us->nodeids) + kfree(us->nodeids); + us->nodeids = ev->nodeids; + us->node_count = ev->node_count; + } + + kfree(ev); + return 1; +} + +static int user_get_members(user_service_t *us, + struct cl_cluster_nodelist *u_nodelist) +{ + struct cl_cluster_nodelist user_nodelist; + struct cl_cluster_node user_node, *u_node; + struct cluster_node *node; + unsigned int i; + int num_nodes = 0; + + if (!u_nodelist) + return us->node_count; + + if (copy_from_user(&user_nodelist, (void __user *) u_nodelist, + sizeof(struct cl_cluster_nodelist))) + return -EFAULT; + + if (user_nodelist.max_members < us->node_count) + return -E2BIG; + + u_node = user_nodelist.nodes; + + for (i = 0; i < us->node_count; i++) { + node = find_node_by_nodeid(us->nodeids[i]); + if (!node) + continue; + + copy_to_usernode(node, &user_node); + if (copy_to_user(u_node, &user_node, + sizeof(struct cl_cluster_node))) + return -EFAULT; + + u_node++; + num_nodes++; + } + return num_nodes; +} + +static int user_global_id(user_service_t *us, uint32_t *id) +{ + uint32_t gid = 0; + + if (us->state != UST_JOINED) + return -EINVAL; + + kcl_global_service_id(us->local_id, &gid); + + if (copy_to_user(id, &gid, sizeof(uint32_t))) + return -EFAULT; + return 0; +} + +static int user_set_level(user_service_t *us, int level) +{ + int prev_id = us->local_id; + int error; + + if (us->state != UST_REGISTER) + return -EINVAL; + + error = kcl_register_service(us->name, us->name_len, level, + &user_service_ops, TRUE, (void *) us, + &us->local_id); + if (error) + return error; + + kcl_unregister_service(prev_id); + return 0; +} + +int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct cluster_sock *c = cluster_sk(sock->sk); + user_service_t *us = c->service_data; + int error = 0; + + if (!us && cmd != SIOCCLUSTER_SERVICE_REGISTER) + return -EINVAL; + + switch (cmd) { + case SIOCCLUSTER_SERVICE_REGISTER: + error = user_register((char *) arg, &us); + if (!error) { + us->state = UST_REGISTER; + us->sock = sock; + c->service_data = us; + } + break; + + case SIOCCLUSTER_SERVICE_UNREGISTER: + down(&us->lock); + us->state = UST_UNREGISTER; + user_unregister(us); + up(&us->lock); + break; + + case SIOCCLUSTER_SERVICE_JOIN: + us->state = UST_JOIN; + user_join(us, 0); + break; + + case SIOCCLUSTER_SERVICE_LEAVE: + down(&us->lock); + if (us->state != UST_JOINED) { + error = -EBUSY; + up(&us->lock); + } else { + us->state = UST_LEAVE; + up(&us->lock); + user_leave(us, 0); + } + break; + + case SIOCCLUSTER_SERVICE_SETSIGNAL: + user_set_signal(us, (int) arg); + break; + + case SIOCCLUSTER_SERVICE_STARTDONE: + error = user_start_done(us, (unsigned int) arg); + break; + + case SIOCCLUSTER_SERVICE_GETEVENT: + error = user_get_event(us, (struct cl_service_event *) arg); + break; + + case SIOCCLUSTER_SERVICE_GETMEMBERS: + error = user_get_members(us, (struct cl_cluster_nodelist *)arg); + break; + + case SIOCCLUSTER_SERVICE_GLOBALID: + error = user_global_id(us, (uint32_t *) arg); + break; + + case SIOCCLUSTER_SERVICE_SETLEVEL: + error = user_set_level(us, (int) arg); + break; + + default: + error = -EINVAL; + } + + return error; +} + +void 
sm_sock_release(struct socket *sock) +{ + struct cluster_sock *c = cluster_sk(sock->sk); + user_service_t *us = c->service_data; + int state; + + if (!us) + return; + + down(&us->lock); + us->sock = NULL; + c->service_data = NULL; + + if (us->need_startdone) + kcl_start_done(us->local_id, us->need_startdone); + + if (us->async) { + /* async thread will clean up before exiting */ + up(&us->lock); + return; + } + state = us->state; + up(&us->lock); + + switch (state) { + case UST_JOIN: + break; + case UST_JOINED: + user_leave(us, 1); + /* fall through */ + case UST_LEAVE: + case UST_REGISTER: + user_unregister(us); + /* fall through */ + case UST_UNREGISTER: + kfree(us); + break; + } +} diff -urN linux-orig/cluster/cman/sm_user.h linux-patched/cluster/cman/sm_user.h --- linux-orig/cluster/cman/sm_user.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/cluster/cman/sm_user.h 2004-06-29 20:07:51.000000000 +0800 @@ -0,0 +1,21 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __SM_USER_DOT_H__ +#define __SM_USER_DOT_H__ + +int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +void sm_sock_release(struct socket *sock); +void sm_sock_bind(struct socket *sock); + +#endif diff -urN linux-orig/include/cluster/cnxman-socket.h linux-patched/include/cluster/cnxman-socket.h --- linux-orig/include/cluster/cnxman-socket.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/include/cluster/cnxman-socket.h 2004-06-29 20:07:50.000000000 +0800 @@ -0,0 +1,226 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* CMAN socket interface header, + may be included by user or kernel code */ + +#ifndef __CNXMAN_SOCKET_H +#define __CNXMAN_SOCKET_H + +/* Just made these up but the address family must be less than 32 (NPROTO) */ +#define AF_CLUSTER 31 +#define PF_CLUSTER AF_CLUSTER + +/* Protocol(socket) types */ +#define CLPROTO_MASTER 2 +#define CLPROTO_CLIENT 3 + +/* Setsockopt -- maybe should be ioctls??
*/ +#define CLU_SET_MULTICAST 100 +#define CLU_JOIN_CLUSTER 101 +#define CLU_LEAVE_CLUSTER 102 +#define CLU_SET_RCVONLY 103 +#define CLU_SET_UNICAST 104 +#define KCL_SET_MULTICAST 105 +#define KCL_SET_RCVONLY 106 +#define KCL_SET_UNICAST 107 +#define KCL_SET_NODENAME 108 +#define CLU_SET_NODENAME 109 + +/* ioctls -- should register these properly */ +#define SIOCCLUSTER_NOTIFY _IOW('x', 0x01, int) +#define SIOCCLUSTER_REMOVENOTIFY _IO( 'x', 0x02) +#define SIOCCLUSTER_GETMEMBERS _IOR('x', 0x03, struct cl_cluster_nodelist) +#define SIOCCLUSTER_SETEXPECTED_VOTES _IOW('x', 0x04, int) +#define SIOCCLUSTER_ISQUORATE _IO( 'x', 0x05) +#define SIOCCLUSTER_ISLISTENING _IOW('x', 0x06, struct cl_listen_request) +#define SIOCCLUSTER_GETALLMEMBERS _IOR('x', 0x07, struct cl_cluster_nodelist) +#define SIOCCLUSTER_SET_VOTES _IOW('x', 0x08, int) +#define SIOCCLUSTER_GET_VERSION _IOR('x', 0x09, struct cl_version) +#define SIOCCLUSTER_SET_VERSION _IOW('x', 0x0a, struct cl_version) +#define SIOCCLUSTER_ISACTIVE _IO( 'x', 0x0b) +#define SIOCCLUSTER_KILLNODE _IOW('x', 0x0c, int) +#define SIOCCLUSTER_GET_JOINCOUNT _IO( 'x', 0x0d) +#define SIOCCLUSTER_SERVICE_REGISTER _IOW('x', 0x0e, char) +#define SIOCCLUSTER_SERVICE_UNREGISTER _IO('x', 0x0f) +#define SIOCCLUSTER_SERVICE_JOIN _IO( 'x', 0x10) +#define SIOCCLUSTER_SERVICE_LEAVE _IO( 'x', 0x20) +#define SIOCCLUSTER_SERVICE_SETSIGNAL _IOW('x', 0x30, int) +#define SIOCCLUSTER_SERVICE_STARTDONE _IOW('x', 0x40, unsigned int) +#define SIOCCLUSTER_SERVICE_GETEVENT _IOR('x', 0x50, struct cl_service_event) +#define SIOCCLUSTER_SERVICE_GETMEMBERS _IOR('x', 0x60, struct cl_cluster_nodelist) +#define SIOCCLUSTER_SERVICE_GLOBALID _IOR('x', 0x70, uint32_t) +#define SIOCCLUSTER_SERVICE_SETLEVEL _IOR('x', 0x80, int) +#define SIOCCLUSTER_GETNODE _IOWR('x', 0x90, struct cl_cluster_node) +#define SIOCCLUSTER_BARRIER _IOW('x', 0x0a0, struct cl_barrier_info) + +/* Maximum size of a cluster message */ +#define MAX_CLUSTER_MESSAGE 1500 +#define MAX_CLUSTER_MEMBER_NAME_LEN 255 +#define MAX_BARRIER_NAME_LEN 33 +#define MAX_SA_ADDR_LEN 12 +#define MAX_CLUSTER_NAME_LEN 16 + +/* Well-known cluster port numbers */ +#define CLUSTER_PORT_MEMBERSHIP 1 /* Mustn't block during cluster + * transitions! 
*/ +#define CLUSTER_PORT_SERVICES 2 +#define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */ +#define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */ +#define CLUSTER_PORT_SLM 12 /* LVM SLM (simple lock manager) */ + +/* Port numbers above this will be blocked when the cluster is inquorate or in + * transition */ +#define HIGH_PROTECTED_PORT 9 + +/* Reasons for leaving the cluster */ +#define CLUSTER_LEAVEFLAG_DOWN 0 /* Normal shutdown */ +#define CLUSTER_LEAVEFLAG_KILLED 1 +#define CLUSTER_LEAVEFLAG_PANIC 2 +#define CLUSTER_LEAVEFLAG_REMOVED 3 /* This one can reduce quorum */ +#define CLUSTER_LEAVEFLAG_REJECTED 4 /* Not allowed into the cluster in the + * first place */ +#define CLUSTER_LEAVEFLAG_INCONSISTENT 5 /* Our view of the cluster is + * in a minority */ +#define CLUSTER_LEAVEFLAG_DEAD 6 /* Discovered to be dead */ +#define CLUSTER_LEAVEFLAG_FORCE 0x10 /* Forced by command-line */ + +/* OOB messages sent to a local socket */ +#define CLUSTER_OOB_MSG_PORTCLOSED 1 +#define CLUSTER_OOB_MSG_STATECHANGE 2 +#define CLUSTER_OOB_MSG_SERVICEEVENT 3 + +/* Sendmsg flags, these are above the normal sendmsg flags so they don't + * interfere */ +#define MSG_NOACK 0x010000 /* Don't need an ACK for this message */ +#define MSG_QUEUE 0x020000 /* Queue the message for sending later */ +#define MSG_MULTICAST 0x080000 /* Message was sent to all nodes in the cluster + */ +#define MSG_ALLINT 0x100000 /* Send out of all interfaces */ + +typedef enum { NODESTATE_REMOTEMEMBER, NODESTATE_JOINING, NODESTATE_MEMBER, + NODESTATE_DEAD } nodestate_t; + + +struct sockaddr_cl { + unsigned short scl_family; + unsigned char scl_flags; + unsigned char scl_port; + int scl_nodeid; +}; + +/* This is how we pass the multicast socket into kernel space. addr is the + * multicast address to use in the address family of the socket (eg for UDP it + * might be 255.255.255.0) */ +struct cl_multicast_sock { + int fd; /* FD of master socket to do multicast on */ + int number; /* Socket number, to match up recvonly & bcast + * sockets */ +}; + +/* Cluster configuration info passed when we join the cluster */ +struct cl_join_cluster_info { + unsigned char votes; + unsigned int expected_votes; + unsigned int two_node; + unsigned int config_version; + + char cluster_name[17]; +}; + + +/* This is the structure, per node, returned from the membership ioctl */ +struct cl_cluster_node { + unsigned int size; + unsigned int node_id; + unsigned int us; + unsigned int leave_reason; + unsigned int incarnation; + nodestate_t state; + char name[MAX_CLUSTER_MEMBER_NAME_LEN]; + unsigned char votes; +}; + +/* The struct passed to the membership ioctls */ +struct cl_cluster_nodelist { + uint32_t max_members; + struct cl_cluster_node *nodes; +}; + +/* Structure passed to SIOCCLUSTER_ISLISTENING */ +struct cl_listen_request { + unsigned char port; + int nodeid; +}; + +/* A Cluster PORTCLOSED message - received by a local user as an OOB message */ +struct cl_portclosed_oob { + unsigned char cmd; /* CLUSTER_OOB_MSG_PORTCLOSED */ + unsigned char port; +}; + +/* Get all version numbers or set the config version */ +struct cl_version { + unsigned int major; + unsigned int minor; + unsigned int patch; + unsigned int config; +}; + +/* structure passed to barrier ioctls */ +struct cl_barrier_info { + char cmd; + char name[MAX_BARRIER_NAME_LEN]; + unsigned int flags; + unsigned long arg; +}; + +typedef enum { SERVICE_EVENT_STOP, SERVICE_EVENT_START, SERVICE_EVENT_FINISH, + SERVICE_EVENT_LEAVEDONE } service_event_t; + +typedef enum { SERVICE_START_FAILED, 
SERVICE_START_JOIN, SERVICE_START_LEAVE } + service_start_t; + +struct cl_service_event { + service_event_t type; + service_start_t start_type; + unsigned int event_id; + unsigned int last_stop; + unsigned int last_start; + unsigned int last_finish; + unsigned int node_count; +}; + + +/* Commands to the barrier ioctl */ +#define BARRIER_IOCTL_REGISTER 1 +#define BARRIER_IOCTL_CHANGE 2 +#define BARRIER_IOCTL_DELETE 3 +#define BARRIER_IOCTL_WAIT 4 + +/* Attributes of a barrier - bitmask */ +#define BARRIER_ATTR_AUTODELETE 1 +#define BARRIER_ATTR_MULTISTEP 2 +#define BARRIER_ATTR_MANUAL 4 +#define BARRIER_ATTR_ENABLED 8 +#define BARRIER_ATTR_CALLBACK 16 + +/* Attribute setting commands */ +#define BARRIER_SETATTR_AUTODELETE 1 +#define BARRIER_SETATTR_MULTISTEP 2 +#define BARRIER_SETATTR_ENABLED 3 +#define BARRIER_SETATTR_NODES 4 +#define BARRIER_SETATTR_CALLBACK 5 +#define BARRIER_SETATTR_TIMEOUT 6 + +#endif diff -urN linux-orig/include/cluster/cnxman.h linux-patched/include/cluster/cnxman.h --- linux-orig/include/cluster/cnxman.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/include/cluster/cnxman.h 2004-06-29 20:07:50.000000000 +0800 @@ -0,0 +1,87 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __CNXMAN_H +#define __CNXMAN_H + +#include "linux/in6.h" +#include "cluster/cnxman-socket.h" + +/* In-kernel API */ + +/* This is the structure, per node, returned from the membership request */ +struct kcl_cluster_node { + unsigned int size; + unsigned int node_id; + unsigned int us; + unsigned int leave_reason; + unsigned int incarnation; + nodestate_t state; + struct list_head list; + char name[MAX_CLUSTER_MEMBER_NAME_LEN]; + unsigned char votes; +}; + +struct cluster_node_addr { + struct list_head list; + unsigned char addr[sizeof(struct sockaddr_in6)];/* A large sockaddr */ + int addr_len; +}; + + +/* Reasons for a kernel membership callback */ +typedef enum { CLUSTER_RECONFIG, DIED, LEAVING, NEWNODE } kcl_callback_reason; + +/* Kernel version of above, the void *sock is a struct socket */ +struct kcl_multicast_sock { + void *sock; + int number; /* Socket number, to match up recvonly & bcast + * sockets */ +}; + +extern int kcl_sendmsg(struct socket *sock, void *buf, int size, + struct sockaddr_cl *caddr, int addr_len, + unsigned int flags); +extern int kcl_register_read_callback(struct socket *sock, + int (*routine) (char *, int, char *, int, + unsigned int)); +extern int kcl_add_callback(void (*callback) (kcl_callback_reason, long)); +extern int kcl_remove_callback(void (*callback) (kcl_callback_reason, long)); +extern int kcl_get_members(struct list_head *list); +extern int kcl_get_member_ids(uint32_t * idbuf, int size); +extern int kcl_get_all_members(struct list_head *list); +extern int kcl_get_node_by_addr(unsigned char *addr, int addr_len, + struct kcl_cluster_node *n); +extern int kcl_get_node_by_name(unsigned char *name, + struct kcl_cluster_node *n); +extern int 
kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n); +extern int kcl_is_quorate(void); +extern int kcl_addref_cluster(void); +extern int kcl_releaseref_cluster(void); +extern int kcl_cluster_name(char **cname); +extern int kcl_get_current_interface(void); +extern struct list_head *kcl_get_node_addresses(int nodeid); + +extern int kcl_barrier_register(char *name, unsigned int flags, + unsigned int nodes); +extern int kcl_barrier_setattr(char *name, unsigned int attr, + unsigned long arg); +extern int kcl_barrier_delete(char *name); +extern int kcl_barrier_wait(char *name); +extern int kcl_barrier_cancel(char *name); + +extern int kcl_register_quorum_device(char *name, int votes); +extern int kcl_unregister_quorum_device(void); +extern int kcl_quorum_device_available(int yesno); + +#endif diff -urN linux-orig/include/cluster/service.h linux-patched/include/cluster/service.h --- linux-orig/include/cluster/service.h 1970-01-01 07:30:00.000000000 +0730 +++ linux-patched/include/cluster/service.h 2004-06-29 20:07:50.000000000 +0800 @@ -0,0 +1,102 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#ifndef __SERVICE_DOT_H__ +#define __SERVICE_DOT_H__ + +/* + * Interface between service manager and services + */ + +/* + * Service levels are started in order from lowest, so level 0 is started on + * all nodes before level 1 is started. + */ + +#define SERVICE_LEVEL_FENCE (0) +#define SERVICE_LEVEL_GDLM (1) +#define SERVICE_LEVEL_GFS (2) +#define SERVICE_LEVEL_USER (3) + +#define MAX_SERVICE_NAME_LEN (33) + +/* + * The type of start a service receives. The start (and preceding stop) may be + * due to a node joining or leaving the SG or due to a node having failed. + */ + +#define SERVICE_NODE_FAILED (1) +#define SERVICE_NODE_JOIN (2) +#define SERVICE_NODE_LEAVE (3) + + +struct kcl_service { + struct list_head list; + uint16_t level; + uint32_t local_id; + uint32_t global_id; + int node_count; + char name[MAX_SERVICE_NAME_LEN]; +}; + +int kcl_get_services(struct list_head *list, int level); + + +/* + * These routines which run in CMAN context must return quickly and cannot + * block. + */ + +struct kcl_service_ops { + int (*stop) (void *servicedata); + int (*start) (void *servicedata, uint32_t *nodeids, int count, + int event_id, int type); + void (*finish) (void *servicedata, int event_id); +}; + +/* + * Register will cause CMAN to create a Service Group (SG) for the named + * instance of the service. A local ID is returned which is used to join, + * leave and unregister the service. + */ + +int kcl_register_service(char *name, int namelen, int level, + struct kcl_service_ops *ops, int unique, + void *servicedata, uint32_t *local_id); + +void kcl_unregister_service(uint32_t local_id); + +/* + * Once a service is joined it will be managed by CMAN and receive start, stop, + * and finish calls. After leave is called the service is no longer managed by + * CMAN. 
The first start for a service may arrive before kcl_join_service() + * returns. + */ + +int kcl_join_service(uint32_t local_id); +int kcl_leave_service(uint32_t local_id); + +/* + * After a service is started, it can ask for its cluster-wide unique ID. + */ + +void kcl_global_service_id(uint32_t local_id, uint32_t * global_id); + +/* + * Called by a service when it's done with a start(). Cannot be called from + * the start function. + */ + +void kcl_start_done(uint32_t local_id, int event_id); + +#endif
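To make the interface in cluster/service.h concrete, here is a minimal sketch, not taken from the patch, of how a kernel-side service might use it. All example_* names are hypothetical; the hard constraints come from the header itself: stop/start/finish run in CMAN context and must return quickly without blocking, and kcl_start_done() cannot be called from within start().

/* Sketch only: a hypothetical kernel service built on the CMAN
 * service manager API. */
#include <cluster/cnxman.h>
#include <cluster/service.h>

static uint32_t example_local_id;
static int example_pending_event;	/* start() awaiting kcl_start_done() */

/* Callbacks run in CMAN context: return quickly, never block. */
static int example_stop(void *servicedata)
{
	/* quiesce activity until the next start() */
	return 0;
}

static int example_start(void *servicedata, uint32_t *nodeids, int count,
			 int event_id, int type)
{
	/* nodeids[0..count-1] is the new membership; type is one of
	 * SERVICE_NODE_FAILED/JOIN/LEAVE.  Record the event and have
	 * another thread call kcl_start_done(example_local_id, event_id)
	 * once reconfiguration is complete -- it cannot be called from
	 * this function. */
	example_pending_event = event_id;
	return 0;
}

static void example_finish(void *servicedata, int event_id)
{
	/* every node has completed the start for event_id */
}

static struct kcl_service_ops example_ops = {
	.stop = example_stop,
	.start = example_start,
	.finish = example_finish
};

int example_setup(void)
{
	int error;

	error = kcl_register_service("example", 7, SERVICE_LEVEL_USER,
				     &example_ops, 1 /* unique */, NULL,
				     &example_local_id);
	if (error)
		return error;

	/* the first start() may arrive before this returns */
	error = kcl_join_service(example_local_id);
	if (error)
		kcl_unregister_service(example_local_id);
	return error;
}

Once started, such a service could obtain its cluster-wide ID with kcl_global_service_id() and, on shutdown, call kcl_leave_service() followed by kcl_unregister_service().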
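The same machinery is reachable from userspace through the SIOCCLUSTER_SERVICE_* ioctls dispatched by sm_ioctl() above. The sketch below is again illustrative, with error handling abbreviated: a real client would wait for the CLUSTER_OOB_MSG_SERVICEEVENT out-of-band message, or install a signal with SIOCCLUSTER_SERVICE_SETSIGNAL, rather than polling GETEVENT, which returns 0 when the event queue is empty and 1 when an event was copied out.

/* Sketch only: a hypothetical user-level client of the service ioctls. */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <cluster/cnxman-socket.h>

int example_client(void)
{
	char name[64];	/* >= MAX_SERVICE_NAME_LEN from cluster/service.h */
	struct cl_service_event ev;
	int fd, got;

	fd = socket(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT);
	if (fd < 0)
		return -1;

	/* register by name; user_register() copies a full name-sized
	 * buffer from userspace, so pass a buffer, not a short literal */
	memset(name, 0, sizeof(name));
	strcpy(name, "example");
	if (ioctl(fd, SIOCCLUSTER_SERVICE_REGISTER, name) < 0)
		return -1;

	/* the join completes asynchronously in a cman_userjoin thread */
	ioctl(fd, SIOCCLUSTER_SERVICE_JOIN, 0);

	/* drain the event queue (real code would block on the OOB
	 * notification instead of spinning) */
	do {
		got = ioctl(fd, SIOCCLUSTER_SERVICE_GETEVENT, &ev);
	} while (got == 0);

	if (got > 0 && ev.type == SERVICE_EVENT_START) {
		/* fetch membership via SIOCCLUSTER_SERVICE_GETMEMBERS,
		 * reconfigure, then acknowledge the start */
		ioctl(fd, SIOCCLUSTER_SERVICE_STARTDONE, ev.event_id);
	}

	ioctl(fd, SIOCCLUSTER_SERVICE_LEAVE, 0);
	return 0;
}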