1 diff -urN linux-orig/arch/alpha/Kconfig linux-orig2/arch/alpha/Kconfig
2 --- linux-orig/arch/alpha/Kconfig 2004-10-18 16:55:37.000000000 -0500
3 +++ linux-orig2/arch/alpha/Kconfig 2004-10-22 11:29:33.507218717 -0500
8 +source "cluster/Kconfig"
9 diff -urN linux-orig/arch/arm/Kconfig linux-orig2/arch/arm/Kconfig
10 --- linux-orig/arch/arm/Kconfig 2004-10-18 16:54:31.000000000 -0500
11 +++ linux-orig2/arch/arm/Kconfig 2004-10-22 11:30:56.358918506 -0500
13 source "crypto/Kconfig"
17 +source "cluster/Kconfig"
18 diff -urN linux-orig/arch/arm26/Kconfig linux-orig2/arch/arm26/Kconfig
19 --- linux-orig/arch/arm26/Kconfig 2004-10-18 16:54:32.000000000 -0500
20 +++ linux-orig2/arch/arm26/Kconfig 2004-10-22 11:29:33.531218341 -0500
25 +source "cluster/Kconfig"
26 diff -urN linux-orig/arch/cris/Kconfig linux-orig2/arch/cris/Kconfig
27 --- linux-orig/arch/cris/Kconfig 2004-10-18 16:55:07.000000000 -0500
28 +++ linux-orig2/arch/cris/Kconfig 2004-10-22 11:31:11.965673644 -0500
30 source "crypto/Kconfig"
34 +source "cluster/Kconfig"
35 diff -urN linux-orig/arch/i386/Kconfig linux-orig2/arch/i386/Kconfig
36 --- linux-orig/arch/i386/Kconfig 2004-10-18 16:53:22.000000000 -0500
37 +++ linux-orig2/arch/i386/Kconfig 2004-10-22 11:29:33.533218309 -0500
42 +source "cluster/Kconfig"
46 depends on SMP && !X86_VOYAGER
47 diff -urN linux-orig/arch/ia64/Kconfig linux-orig2/arch/ia64/Kconfig
48 --- linux-orig/arch/ia64/Kconfig 2004-10-18 16:55:27.000000000 -0500
49 +++ linux-orig2/arch/ia64/Kconfig 2004-10-22 11:29:33.534218294 -0500
51 source "security/Kconfig"
53 source "crypto/Kconfig"
55 +source "cluster/Kconfig"
56 diff -urN linux-orig/arch/m68k/Kconfig linux-orig2/arch/m68k/Kconfig
57 --- linux-orig/arch/m68k/Kconfig 2004-10-18 16:54:32.000000000 -0500
58 +++ linux-orig2/arch/m68k/Kconfig 2004-10-22 11:31:38.187262279 -0500
60 source "crypto/Kconfig"
64 +source "cluster/Kconfig"
65 diff -urN linux-orig/arch/mips/Kconfig linux-orig2/arch/mips/Kconfig
66 --- linux-orig/arch/mips/Kconfig 2004-10-18 16:54:08.000000000 -0500
67 +++ linux-orig2/arch/mips/Kconfig 2004-10-22 11:29:33.541218184 -0500
72 +source "cluster/Kconfig"
75 # Use the generic interrupt handling code in kernel/irq/:
77 diff -urN linux-orig/arch/parisc/Kconfig linux-orig2/arch/parisc/Kconfig
78 --- linux-orig/arch/parisc/Kconfig 2004-10-18 16:54:37.000000000 -0500
79 +++ linux-orig2/arch/parisc/Kconfig 2004-10-22 11:31:57.146964867 -0500
81 source "crypto/Kconfig"
85 +source "cluster/Kconfig"
86 diff -urN linux-orig/arch/ppc/Kconfig linux-orig2/arch/ppc/Kconfig
87 --- linux-orig/arch/ppc/Kconfig 2004-10-18 16:55:29.000000000 -0500
88 +++ linux-orig2/arch/ppc/Kconfig 2004-10-22 11:29:33.550218043 -0500
90 source "security/Kconfig"
92 source "crypto/Kconfig"
94 +source "cluster/Kconfig"
95 diff -urN linux-orig/arch/ppc64/Kconfig linux-orig2/arch/ppc64/Kconfig
96 --- linux-orig/arch/ppc64/Kconfig 2004-10-18 16:54:31.000000000 -0500
97 +++ linux-orig2/arch/ppc64/Kconfig 2004-10-22 11:32:11.150745212 -0500
99 source "crypto/Kconfig"
103 +source "cluster/Kconfig"
104 diff -urN linux-orig/arch/s390/Kconfig linux-orig2/arch/s390/Kconfig
105 --- linux-orig/arch/s390/Kconfig 2004-10-18 16:53:51.000000000 -0500
106 +++ linux-orig2/arch/s390/Kconfig 2004-10-22 11:32:31.175431141 -0500
108 source "crypto/Kconfig"
112 +source "cluster/Kconfig"
113 diff -urN linux-orig/arch/sh/Kconfig linux-orig2/arch/sh/Kconfig
114 --- linux-orig/arch/sh/Kconfig 2004-10-18 16:55:29.000000000 -0500
115 +++ linux-orig2/arch/sh/Kconfig 2004-10-22 11:32:47.169180310 -0500
117 source "crypto/Kconfig"
121 +source "cluster/Kconfig"
122 diff -urN linux-orig/arch/sparc/Kconfig linux-orig2/arch/sparc/Kconfig
123 --- linux-orig/arch/sparc/Kconfig 2004-10-18 16:53:05.000000000 -0500
124 +++ linux-orig2/arch/sparc/Kconfig 2004-10-22 11:33:06.891871022 -0500
126 source "crypto/Kconfig"
130 +source "cluster/Kconfig"
131 diff -urN linux-orig/arch/sparc64/Kconfig linux-orig2/arch/sparc64/Kconfig
132 --- linux-orig/arch/sparc64/Kconfig 2004-10-18 16:55:06.000000000 -0500
133 +++ linux-orig2/arch/sparc64/Kconfig 2004-10-22 11:33:19.290676599 -0500
135 source "crypto/Kconfig"
139 +source "cluster/Kconfig"
140 diff -urN linux-orig/arch/um/Kconfig linux-orig2/arch/um/Kconfig
141 --- linux-orig/arch/um/Kconfig 2004-10-18 16:54:08.000000000 -0500
142 +++ linux-orig2/arch/um/Kconfig 2004-10-22 11:29:33.564217823 -0500
147 +source "cluster/Kconfig"
152 diff -urN linux-orig/arch/x86_64/Kconfig linux-orig2/arch/x86_64/Kconfig
153 --- linux-orig/arch/x86_64/Kconfig 2004-10-18 16:54:55.000000000 -0500
154 +++ linux-orig2/arch/x86_64/Kconfig 2004-10-22 11:33:37.130396876 -0500
156 source "crypto/Kconfig"
160 +source "cluster/Kconfig"
161 diff -urN linux-orig/cluster/cman/Makefile linux-orig2/cluster/cman/Makefile
162 --- linux-orig/cluster/cman/Makefile 1969-12-31 18:00:00.000000000 -0600
163 +++ linux-orig2/cluster/cman/Makefile 2004-10-22 11:29:33.566217791 -0500
165 +cman-objs := cnxman.o config.o membership.o proc.o\
166 + sm_barrier.o sm_control.o sm_daemon.o sm_joinleave.o\
167 + sm_membership.o sm_message.o sm_misc.o sm_recover.o sm_services.o \
170 +obj-$(CONFIG_CLUSTER) := cman.o
171 diff -urN linux-orig/cluster/Kconfig linux-orig2/cluster/Kconfig
172 --- linux-orig/cluster/Kconfig 1969-12-31 18:00:00.000000000 -0600
173 +++ linux-orig2/cluster/Kconfig 2004-10-22 11:29:33.565217807 -0500
175 +menu "Cluster Support"
178 + tristate "Cluster support"
180 + Enable clustering support. This is not the high-performance clustering
181 + made famous by Beowulf. It is a high-availability cluster often using
183 + The cluster manager is the heart(beat) of the cluster system. It is
184 + needed by all the other components. It provides membership services
185 + for those other subsystems.
188 diff -urN linux-orig/cluster/Makefile linux-orig2/cluster/Makefile
189 --- linux-orig/cluster/Makefile 1969-12-31 18:00:00.000000000 -0600
190 +++ linux-orig2/cluster/Makefile 2004-10-22 11:29:33.566217791 -0500
192 +obj-y := nocluster.o
194 +obj-$(CONFIG_CLUSTER) += cman/
195 diff -urN linux-orig/cluster/nocluster.c linux-orig2/cluster/nocluster.c
196 --- linux-orig/cluster/nocluster.c 1969-12-31 18:00:00.000000000 -0600
197 +++ linux-orig2/cluster/nocluster.c 2004-10-22 11:29:33.567217776 -0500
200 + * cluster/nocluster.c
202 + * Copy from net/nonet.c
203 + * Dummy functions to allow us to configure cluster support entirely
204 + * out of the kernel.
206 + * Distributed under the terms of the GNU GPL version 2.
207 + * Copyright (c) Matthew Wilcox 2003
210 +#include <linux/module.h>
211 +#include <linux/errno.h>
212 +#include <linux/fs.h>
213 +#include <linux/init.h>
214 +#include <linux/kernel.h>
216 +void __init nocluster_init(void)
219 diff -urN linux-orig/Makefile linux-orig2/Makefile
220 --- linux-orig/Makefile 2004-10-18 16:54:38.000000000 -0500
221 +++ linux-orig2/Makefile 2004-10-22 11:29:33.507218717 -0500
224 # Objects we will link into vmlinux / subdirs we need to visit
226 -drivers-y := drivers/ sound/
227 +drivers-y := drivers/ sound/ cluster/
231 diff -urN linux-orig/cluster/cman/cnxman-private.h linux-patched/cluster/cman/cnxman-private.h
232 --- linux-orig/cluster/cman/cnxman-private.h 1970-01-01 07:30:00.000000000 +0730
233 +++ linux-patched/cluster/cman/cnxman-private.h 2004-11-03 11:37:37.000000000 +0800
235 +/******************************************************************************
236 +*******************************************************************************
238 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
239 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
241 +** This copyrighted material is made available to anyone wishing to use,
242 +** modify, copy, or redistribute it subject to the terms and conditions
243 +** of the GNU General Public License v.2.
245 +*******************************************************************************
246 +******************************************************************************/
248 +#ifndef __CNXMAN_PRIVATE_H
249 +#define __CNXMAN_PRIVATE_H
251 +/* Version triplet */
252 +#define CNXMAN_MAJOR_VERSION 3
253 +#define CNXMAN_MINOR_VERSION 0
254 +#define CNXMAN_PATCH_VERSION 1
256 +#define MAX_RETRIES 3 /* Maximum number of send retries */
257 +#define CAP_CLUSTER CAP_SYS_ADMIN /* Capability needed to manage the
261 +/* How we announce ourself in console events */
262 +#define CMAN_NAME "CMAN"
264 +/* One of these per AF_CLUSTER socket */
265 +struct cluster_sock {
266 + /* WARNING: sk has to be the first member */
269 + unsigned char port; /* Bound port or zero */
270 + int (*kernel_callback) (char *, int, char *, int, unsigned int);
271 + void *service_data;
274 +#define cluster_sk(__sk) ((struct cluster_sock *)__sk)
276 +/* We have one of these for each socket we use for communications */
277 +struct cl_comms_socket {
278 + struct socket *sock;
279 + int broadcast; /* This is a broadcast socket */
280 + int recv_only; /* This is the unicast receive end of a
281 + * multicast socket */
282 + struct sockaddr_in6 saddr; /* Socket address, contains the sockaddr for
283 + * the remote end(s) */
284 + int addr_len; /* Length of above */
285 + int number; /* Internal socket number, used to cycle around
286 + * sockets in case of network errors */
287 + struct file *file; /* file pointer for user-passed in sockets */
291 + /* The socket list */
292 + struct list_head list;
294 + /* On here when it has something to say */
295 + struct list_head active_list;
296 + unsigned long active;
299 +/* A client socket. We keep a list of these so we can notify clients of cluster
301 +struct cl_client_socket {
302 + struct socket *sock;
303 + struct list_head list;
306 +/* This structure is tacked onto the start of a cluster message packet for our
307 + * own nefarious purposes. */
308 +struct cl_protheader {
309 + unsigned char tgtport; /* Target port number */
310 + unsigned char srcport; /* Source (originating) port number */
311 + unsigned short seq; /* Packet sequence number, little-endian */
312 + unsigned short ack; /* Inline ACK */
313 + unsigned short cluster; /* Our cluster number, little-endian */
314 + unsigned int flags;
315 + int srcid; /* Node ID of the sender */
316 + int tgtid; /* Node ID of the target or 0 for multicast
320 +/* A cluster internal protocol message - port number 0 */
322 + struct cl_protheader header;
326 +/* A Cluster ACK message */
328 + struct cl_protheader header;
329 + unsigned char cmd; /* Always CLUSTER_CMD_ACK */
330 + unsigned char remport; /* Remote port number the original message was
332 + unsigned char aflags; /* ACK flags 0=OK, 1=No listener */
336 +/* A Cluster LISTENREQ/LISTENRESP message */
337 +struct cl_listenmsg {
338 + unsigned char cmd; /* CLUSTER_CMD_LISTENRESP/REQ */
339 + unsigned char target_port; /* Port to probe */
340 + unsigned char listening; /* Always 0 for LISTENREQ */
342 + unsigned short tag; /* PID of remote waiting process */
345 +/* A Cluster PORTCLOSED message */
346 +struct cl_closemsg {
347 + unsigned char cmd; /* CLUSTER_CMD_PORTCLOSED */
348 + unsigned char port;
351 +/* Structure of a newly dead node, passed from cnxman to kmembershipd */
352 +struct cl_new_dead_node {
353 + struct list_head list;
354 + struct cluster_node *node;
357 +/* Subcommands for BARRIER message */
358 +#define BARRIER_REGISTER 1
359 +#define BARRIER_CHANGE 2
360 +#define BARRIER_WAIT 4
361 +#define BARRIER_COMPLETE 5
363 +/* A Cluster BARRIER message */
364 +struct cl_barriermsg {
365 + unsigned char cmd; /* CLUSTER_CMD_BARRIER */
366 + unsigned char subcmd; /* BARRIER sub command */
367 + unsigned short pad;
368 + unsigned int flags;
369 + unsigned int nodes;
370 + char name[MAX_BARRIER_NAME_LEN];
373 +/* Membership services messages, the cl_protheader is added transparently */
374 +struct cl_mem_hello_msg {
376 + unsigned char flags;
377 + unsigned short members; /* Number of nodes in the cluster,
379 + unsigned int generation; /* Current cluster generation number */
382 +struct cl_mem_endtrans_msg {
384 + unsigned char pad1;
385 + unsigned short pad2;
386 + unsigned int quorum;
387 + unsigned int total_votes;
388 + unsigned int generation; /* Current cluster generation number */
389 + unsigned int new_node_id; /* If reason is a new node joining */
392 +/* ACK types for JOINACK message */
393 +#define JOINACK_TYPE_OK 1 /* You can join */
394 +#define JOINACK_TYPE_NAK 2 /* You can NOT join */
395 +#define JOINACK_TYPE_WAIT 3 /* Wait a bit longer - cluster is in transition
398 +struct cl_mem_joinack_msg {
400 + unsigned char acktype;
403 +/* This is used by JOINREQ message */
404 +struct cl_mem_join_msg {
406 + unsigned char votes;
407 + unsigned short num_addr; /* Number of addresses for this node */
408 + unsigned int expected_votes;
409 + unsigned int nodeid; /* node ID we want */
410 + unsigned int major_version; /* Not backwards compatible */
411 + unsigned int minor_version; /* Backwards compatible */
412 + unsigned int patch_version; /* Backwards/forwards compatible */
413 + unsigned int config_version;
414 + unsigned int addr_len; /* length of node addresses */
415 + char clustername[16];
416 + /* Followed by <num_addr> addresses of `address_length` bytes and a
417 + * NUL-terminated node name */
420 +/* State transition start reasons: */
421 +#define TRANS_NEWNODE 1 /* A new node is joining the cluster */
422 +#define TRANS_REMNODE 2 /* a node has left the cluster */
423 +#define TRANS_ANOTHERREMNODE 3 /* A node left the cluster while we were in
425 +#define TRANS_NEWMASTER 4 /* We have had an election and I am the new
427 +#define TRANS_CHECK 5 /* A consistency check was called for */
428 +#define TRANS_RESTART 6 /* Transition restarted because of a previous
430 +#define TRANS_DEADMASTER 7 /* The master died during transition and I have
433 +/* This is used to start a state transition */
434 +struct cl_mem_starttrans_msg {
436 + unsigned char reason; /* Why a start transition was started - see
438 + unsigned char flags;
439 + unsigned char votes;
440 + unsigned int expected_votes;
441 + unsigned int generation; /* Incremented for each STARTTRANS sent
443 + int nodeid; /* Node to be removed */
444 + unsigned short num_addrs;
445 + /* If reason == TRANS_NEWNODE: Followed by <num_addr> addresses of
446 + * `address_length` bytes and a NUL-terminated node name */
449 +struct cl_mem_startack_msg {
451 + unsigned char reason;
452 + unsigned short pad;
453 + unsigned int generation;
454 + unsigned int node_id; /* node_id we think new node should have */
455 + unsigned int highest_node_id; /* highest node_id on this system */
458 +/* Reconfigure a cluster parameter */
459 +struct cl_mem_reconfig_msg {
461 + unsigned char param;
462 + unsigned short pad;
463 + unsigned int value;
466 +/* Structure containing information about an outstanding listen request */
467 +struct cl_waiting_listen_request {
468 + wait_queue_head_t waitq;
471 + unsigned short tag;
473 + struct list_head list;
476 +/* Messages from membership services */
477 +#define CLUSTER_MEM_JOINCONF 1
478 +#define CLUSTER_MEM_JOINREQ 2
479 +#define CLUSTER_MEM_LEAVE 3
480 +#define CLUSTER_MEM_HELLO 4
481 +#define CLUSTER_MEM_KILL 5
482 +#define CLUSTER_MEM_JOINACK 6
483 +#define CLUSTER_MEM_ENDTRANS 7
484 +#define CLUSTER_MEM_RECONFIG 8
485 +#define CLUSTER_MEM_MASTERVIEW 9
486 +#define CLUSTER_MEM_STARTTRANS 10
487 +#define CLUSTER_MEM_JOINREJ 11
488 +#define CLUSTER_MEM_VIEWACK 12
489 +#define CLUSTER_MEM_STARTACK 13
490 +#define CLUSTER_MEM_TRANSITION 14
491 +#define CLUSTER_MEM_NEWCLUSTER 15
492 +#define CLUSTER_MEM_CONFACK 16
493 +#define CLUSTER_MEM_NOMINATE 17
495 +/* Flags in the HELLO message */
496 +#define HELLO_FLAG_MASTER 1
497 +#define HELLO_FLAG_QUORATE 2
499 +/* Parameters for RECONFIG command */
500 +#define RECONFIG_PARAM_EXPECTED_VOTES 1
501 +#define RECONFIG_PARAM_NODE_VOTES 2
502 +#define RECONFIG_PARAM_CONFIG_VERSION 3
504 +/* Data associated with an outgoing socket */
506 + struct file *file; /* The real file */
507 + struct socket *socket; /* The real sock */
508 + int num_nodes; /* On this link */
509 + int retransmit_count;
512 +/* There's one of these for each node in the cluster */
513 +struct cluster_node {
514 + struct list_head list;
515 + char *name; /* Node/host name of node */
516 + struct list_head addr_list;
517 + int us; /* This node is us */
518 + unsigned int node_id; /* Unique node ID */
520 + unsigned short last_seq_recv;
521 + unsigned short last_seq_acked;
522 + unsigned short last_seq_sent;
523 + unsigned int votes;
524 + unsigned int expected_votes;
525 + unsigned int leave_reason;
526 + unsigned int incarnation; /* Incremented each time a node joins
528 + unsigned long last_hello; /* Jiffies */
529 + struct timeval join_time;
532 +/* This is how we keep a list of user processes that are listening for cluster
533 + * membership events */
534 +struct notify_struct {
535 + struct list_head list;
540 +/* This is how we keep a list of kernel callbacks that are registered for
541 + * cluster membership events */
542 +struct kernel_notify_struct {
543 + struct list_head list;
544 + void (*callback) (kcl_callback_reason, long arg);
547 +/* A message waiting to be sent */
548 +struct queued_message {
549 + struct list_head list;
551 + struct socket *socket;
552 + struct sockaddr_cl addr;
555 + unsigned char port;
556 + unsigned int flags;
557 + char msg_buffer[MAX_CLUSTER_MESSAGE];
562 + struct list_head list;
564 + char name[MAX_BARRIER_NAME_LEN];
565 + unsigned int flags;
566 + enum { BARRIER_STATE_WAITING, BARRIER_STATE_INACTIVE,
567 + BARRIER_STATE_COMPLETE } state;
568 + unsigned int expected_nodes;
569 + unsigned int registered_nodes;
570 + atomic_t got_nodes;
571 + atomic_t completed_nodes;
572 + unsigned int inuse;
573 + unsigned int waitsent;
574 + unsigned int phase; /* Completion phase */
575 + unsigned int endreason; /* Reason we were woken, usually 0 */
576 + unsigned long timeout; /* In seconds */
578 + void (*callback) (char *name, int status);
579 + wait_queue_head_t waitq;
580 + struct semaphore lock; /* To synch with cnxman messages */
581 + spinlock_t phase2_spinlock; /* Need to synchronise with timer
583 + struct timer_list timer;
586 +/* Cluster protocol commands sent to port 0 */
587 +#define CLUSTER_CMD_ACK 1
588 +#define CLUSTER_CMD_LISTENREQ 2
589 +#define CLUSTER_CMD_LISTENRESP 3
590 +#define CLUSTER_CMD_PORTCLOSED 4
591 +#define CLUSTER_CMD_BARRIER 5
593 +extern struct cluster_node *find_node_by_addr(unsigned char *addr,
595 +extern struct cluster_node *find_node_by_nodeid(unsigned int id);
596 +extern struct cluster_node *find_node_by_name(char *name);
597 +extern void set_quorate(int);
598 +extern void notify_kernel_listeners(kcl_callback_reason reason, long arg);
599 +extern void notify_listeners(void);
600 +extern void free_nodeid_array(void);
601 +extern int send_reconfigure(int param, unsigned int value);
602 +extern int calculate_quorum(int, int, int *);
603 +extern void recalculate_quorum(int);
604 +extern int send_leave(unsigned char);
605 +extern int get_quorum(void);
606 +extern void set_votes(int, int);
607 +extern void kcl_wait_for_all_acks(void);
608 +extern char *membership_state(char *, int);
609 +extern char *leave_string(int reason);
610 +extern void a_node_just_died(struct cluster_node *node);
611 +extern void check_barrier_returns(void);
612 +extern int in_transition(void);
613 +extern void get_local_addresses(struct cluster_node *node);
614 +extern int add_node_address(struct cluster_node *node, unsigned char *addr, int len);
615 +extern void create_proc_entries(void);
616 +extern void cleanup_proc_entries(void);
617 +extern unsigned int get_highest_nodeid(void);
618 +extern int allocate_nodeid_array(void);
619 +extern void queue_oob_skb(struct socket *sock, int cmd);
620 +extern int new_temp_nodeid(char *addr, int addrlen);
621 +extern int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen);
622 +extern void purge_temp_nodeids(void);
623 +extern inline char *print_addr(unsigned char *addr, int len, char *buf)
628 + for (i = 0; i < len; i++)
629 + ptr += sprintf(buf + ptr, "%02x ", addr[i]);
634 +#define MAX_ADDR_PRINTED_LEN (address_length*3 + 1)
636 +/* Debug enabling macros. Sorry about the C++ comments but they're easier to
637 + * get rid of than C ones... */
639 +// #define DEBUG_MEMB
640 +// #define DEBUG_COMMS
641 +// #define DEBUG_BARRIER
645 +#define P_COMMS(fmt, args...) printk(KERN_DEBUG "cman comms: " fmt, ## args)
647 +#define P_COMMS(fmt, args...)
650 +#ifdef DEBUG_BARRIER
651 +#define P_BARRIER(fmt, args...) printk(KERN_DEBUG "cman barrier: " fmt, ## args)
653 +#define P_BARRIER(fmt, args...)
657 +#define P_MEMB(fmt, args...) printk(KERN_DEBUG "cman memb: " fmt, ## args)
658 +#define C_MEMB(fmt, args...) printk(fmt, ## args)
660 +#define P_MEMB(fmt, args...)
661 +#define C_MEMB(fmt, args...)
664 +#endif /* __KERNEL */
667 diff -urN linux-orig/cluster/cman/cnxman.c linux-patched/cluster/cman/cnxman.c
668 --- linux-orig/cluster/cman/cnxman.c 1970-01-01 07:30:00.000000000 +0730
669 +++ linux-patched/cluster/cman/cnxman.c 2004-11-03 11:37:37.000000000 +0800
671 +/******************************************************************************
672 +*******************************************************************************
674 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
675 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
677 +** This copyrighted material is made available to anyone wishing to use,
678 +** modify, copy, or redistribute it subject to the terms and conditions
679 +** of the GNU General Public License v.2.
681 +*******************************************************************************
682 +******************************************************************************/
684 +#define EXPORT_SYMTAB
685 +#include <linux/init.h>
686 +#include <linux/socket.h>
687 +#include <linux/kernel.h>
688 +#include <linux/sched.h>
689 +#include <linux/file.h>
690 +#include <linux/utsname.h>
691 +#include <net/sock.h>
692 +#include <linux/proc_fs.h>
693 +#include <linux/poll.h>
694 +#include <linux/module.h>
695 +#include <linux/list.h>
696 +#include <linux/uio.h>
697 +#include <cluster/cnxman.h>
698 +#include <cluster/service.h>
700 +#include "cnxman-private.h"
701 +#include "sm_control.h"
702 +#include "sm_user.h"
705 +#define CMAN_RELEASE_NAME "<CVS>"
707 +static void process_incoming_packet(struct cl_comms_socket *csock,
708 + struct msghdr *msg, struct kvec *vec, int veclen, int len);
709 +static int cl_sendack(struct cl_comms_socket *sock, unsigned short seq,
710 + int addr_len, char *addr, unsigned char remport,
711 + unsigned char flag);
712 +static void send_listen_request(int nodeid, unsigned char port);
713 +static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
714 + unsigned char port, unsigned short tag);
715 +static void resend_last_message(void);
716 +static void start_ack_timer(void);
717 +static int send_queued_message(struct queued_message *qmsg);
718 +static void send_port_close_oob(unsigned char port);
719 +static void post_close_oob(unsigned char port, int nodeid);
720 +static void process_barrier_msg(struct cl_barriermsg *msg,
721 + struct cluster_node *node);
722 +static struct cl_barrier *find_barrier(char *name);
723 +static void node_shutdown(void);
724 +static void node_cleanup(void);
725 +static int send_or_queue_message(struct socket *sock, void *buf, int len, struct sockaddr_cl *caddr,
726 + unsigned int flags);
727 +static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur);
728 +static void check_for_unacked_nodes(void);
729 +static void free_cluster_sockets(void);
730 +static uint16_t generate_cluster_id(char *name);
731 +static int is_valid_temp_nodeid(int nodeid);
733 +extern int start_membership_services(pid_t);
734 +extern int kcl_leave_cluster(int remove);
735 +extern int send_kill(int nodeid);
737 +static struct proto_ops cl_proto_ops;
738 +static struct sock *master_sock;
739 +static kmem_cache_t *cluster_sk_cachep;
741 +/* Pointer to the pseudo node that maintains quorum in a 2node system */
742 +struct cluster_node *quorum_device = NULL;
744 +/* Array of "ports" allocated. This is just a list of pointers to the sock that
745 + * has this port bound. Speed is a major issue here so 1-2K of allocated
746 + * storage is worth sacrificing. Port 0 is reserved for protocol messages */
747 +static struct sock *port_array[256];
748 +static struct semaphore port_array_lock;
750 +/* Our cluster name & number */
751 +uint16_t cluster_id;
752 +char cluster_name[MAX_CLUSTER_NAME_LEN+1];
754 +/* Two-node mode: causes cluster to remain quorate if one of two nodes fails.
755 + * No more than two nodes are permitted to join the cluster. */
756 +unsigned short two_node;
758 +/* Cluster configuration version that must be the same among members. */
759 +unsigned int config_version;
761 +/* Reference counting for cluster applications */
764 +/* Length of sockaddr address for our comms protocol */
765 +unsigned int address_length;
767 +/* Message sending */
768 +static unsigned short cur_seq; /* Last message sent */
769 +static unsigned int ack_count; /* Number of acks received for message
771 +static unsigned int acks_expected; /* Number of acks we expect to receive */
772 +static struct semaphore send_lock;
773 +static struct timer_list ack_timer;
775 +/* Saved packet information in case we need to resend it */
776 +static char saved_msg_buffer[MAX_CLUSTER_MESSAGE];
777 +static int saved_msg_len;
778 +static int retry_count;
780 +/* Task variables */
781 +static pid_t kcluster_pid;
782 +static pid_t membership_pid;
783 +extern struct task_struct *membership_task;
784 +extern int quit_threads;
786 +wait_queue_head_t cnxman_waitq;
788 +/* Variables owned by membership services */
789 +extern int cluster_members;
790 +extern struct list_head cluster_members_list;
791 +extern struct semaphore cluster_members_lock;
792 +extern int we_are_a_cluster_member;
793 +extern int cluster_is_quorate;
794 +extern struct cluster_node *us;
795 +extern struct list_head new_dead_node_list;
796 +extern struct semaphore new_dead_node_lock;
797 +extern char nodename[];
798 +extern int wanted_nodeid;
800 +/* A list of processes listening for membership events */
801 +static struct list_head event_listener_list;
802 +static struct semaphore event_listener_lock;
804 +/* A list of kernel callbacks listening for membership events */
805 +static struct list_head kernel_listener_list;
806 +static struct semaphore kernel_listener_lock;
808 +/* A list of sockets we are listening on (and can transmit on...later) */
809 +static struct list_head socket_list;
811 +/* A list of all open cluster client sockets */
812 +static struct list_head client_socket_list;
813 +static struct semaphore client_socket_lock;
815 +/* A list of all current barriers */
816 +static struct list_head barrier_list;
817 +static struct semaphore barrier_list_lock;
819 +/* When a socket is ready for reading it goes on this queue */
820 +static spinlock_t active_socket_lock;
821 +static struct list_head active_socket_list;
823 +/* If the cnxman process is running and available for work */
824 +atomic_t cnxman_running;
826 +/* Flags set by timers etc for the mainloop to detect and act upon */
827 +static unsigned long mainloop_flags;
829 +#define ACK_TIMEOUT 1
830 +#define RESEND_NEEDED 2
832 +/* A queue of messages waiting to be sent. If kcl_sendmsg is called outside of
833 + * process context then the messages get put in here */
834 +static struct list_head messages_list;
835 +static struct semaphore messages_list_lock;
837 +static struct semaphore start_thread_sem;
839 +/* List of outstanding ISLISTENING requests */
840 +static struct list_head listenreq_list;
841 +static struct semaphore listenreq_lock;
843 +/* Any sending requests wait on this queue if necessary (eg inquorate, waiting
845 +static DECLARE_WAIT_QUEUE_HEAD(socket_waitq);
847 +/* Wait for thread to exit properly */
848 +struct completion cluster_thread_comp;
849 +struct completion member_thread_comp;
851 +/* The resend delay to use; we increase this geometrically each time a
852 + * send is delayed. In deci-seconds */
853 +static int resend_delay = 1;
855 +/* Highest numbered interface and the current default */
856 +static int num_interfaces;
857 +static struct cl_comms_socket *current_interface = NULL;
862 + char addr[sizeof(struct sockaddr_in6)];
864 + struct list_head list;
866 +static struct list_head tempnode_list;
867 +static struct semaphore tempnode_lock;
870 +/* This is what's squirrelled away in skb->cb */
879 +/* Wake up any processes that are waiting to send. This is usually called when
880 + * all the ACKs have been gathered up or when a node has left the cluster
881 + * unexpectedly and we reckon there are no more acks to collect */
882 +static void unjam(void)
884 + wake_up_interruptible(&socket_waitq);
885 + wake_up_interruptible(&cnxman_waitq);
888 +/* Used by the data_ready routine to locate a connection given the socket */
889 +static inline struct cl_comms_socket *find_comms_by_sock(struct sock *sk)
891 + struct list_head *conlist;
893 + list_for_each(conlist, &socket_list) {
894 + struct cl_comms_socket *clsock =
895 + list_entry(conlist, struct cl_comms_socket, list);
896 + if (clsock->sock->sk == sk) {
903 +/* Data available on socket */
904 +static void cnxman_data_ready(struct sock *sk, int count_unused)
906 + struct cl_comms_socket *clsock = find_comms_by_sock(sk);
908 + if (clsock == NULL) /* ASSERT ?? */
911 + /* If we're already on the list then don't do it again */
912 + if (test_and_set_bit(1, &clsock->active))
915 + spin_lock_irq(&active_socket_lock);
916 + list_add(&clsock->active_list, &active_socket_list);
917 + spin_unlock_irq(&active_socket_lock);
919 + wake_up_interruptible(&cnxman_waitq);
922 +static int receive_message(struct cl_comms_socket *csock, char *iobuf)
926 + struct sockaddr_in6 sin;
929 + memset(&sin, 0, sizeof (sin));
931 + msg.msg_control = NULL;
932 + msg.msg_controllen = 0;
933 + msg.msg_name = &sin;
934 + msg.msg_namelen = sizeof (sin);
937 + vec.iov_len = MAX_CLUSTER_MESSAGE;
938 + vec.iov_base = iobuf;
940 + len = kernel_recvmsg(csock->sock, &msg,
941 + &vec, 1, MAX_CLUSTER_MESSAGE, MSG_DONTWAIT);
943 + vec.iov_base = iobuf;
946 + if (len > MAX_CLUSTER_MESSAGE) {
947 + printk(KERN_CRIT CMAN_NAME
948 + ": %d byte message far too big\n", len);
951 + process_incoming_packet(csock, &msg, &vec, 1, len);
954 + if (len != -EAGAIN)
955 + printk(KERN_CRIT CMAN_NAME ": recvmsg failed: %d\n",
961 +static int cluster_kthread(void *unused)
965 + struct list_head *socklist;
966 + struct cl_comms_socket *csock;
967 + wait_queue_t cnxman_waitq_head;
970 + daemonize("cman_comms");
972 + /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
973 + siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
974 + sigprocmask(SIG_BLOCK, &tmpsig, NULL);
976 + /* This is the waitq we can wake the process up with */
977 + init_waitqueue_head(&cnxman_waitq);
978 + init_waitqueue_entry(&cnxman_waitq_head, current);
979 + add_wait_queue(&cnxman_waitq, &cnxman_waitq_head);
981 + set_user_nice(current, -6);
983 + /* Allow the sockets to start receiving */
984 + list_for_each(socklist, &socket_list) {
985 + csock = list_entry(socklist, struct cl_comms_socket, list);
987 + clear_bit(1, &csock->active);
990 + iobuf = kmalloc(MAX_CLUSTER_MESSAGE, GFP_KERNEL);
992 + printk(KERN_CRIT CMAN_NAME
993 + ": Cannot allocate receive buffer for cluster comms\n");
997 + complete(&cluster_thread_comp);
1000 + struct list_head *temp;
1002 + /* Wait for activity on any of the sockets */
1003 + set_task_state(current, TASK_INTERRUPTIBLE);
1005 + if (list_empty(&active_socket_list))
1007 + set_task_state(current, TASK_RUNNING);
1012 + if (test_and_clear_bit(ACK_TIMEOUT, &mainloop_flags)) {
1013 + check_for_unacked_nodes();
1016 + /* Now receive any messages waiting for us */
1017 + spin_lock_irq(&active_socket_lock);
1018 + list_for_each_safe(socklist, temp, &active_socket_list) {
1020 + list_entry(socklist, struct cl_comms_socket,
1023 + list_del(&csock->active_list);
1024 + clear_bit(1, &csock->active);
1026 + spin_unlock_irq(&active_socket_lock);
1029 + len = receive_message(csock, iobuf);
1033 + spin_lock_irq(&active_socket_lock);
1036 + break; /* EOF on socket */
1038 + spin_unlock_irq(&active_socket_lock);
1040 + /* Resend any unacked messages */
1041 + if (test_and_clear_bit(RESEND_NEEDED, &mainloop_flags)
1042 + && acks_expected) {
1043 + resend_last_message();
1046 + /* Send any queued messages */
1047 + if (acks_expected == 0) {
1048 + struct list_head *temp;
1049 + struct list_head *msglist;
1051 + down(&messages_list_lock);
1052 + list_for_each_safe(msglist, temp, &messages_list) {
1053 + struct queued_message *qmsg =
1054 + list_entry(msglist, struct queued_message,
1056 + int status = send_queued_message(qmsg);
1058 + if (status >= 0) {
1059 + /* Succeeded, remove it from the queue */
1060 + list_del(&qmsg->list);
1063 + /* Did it fail horribly ?? */
1064 + if (status < 0 && status != -EAGAIN) {
1065 + printk(KERN_INFO CMAN_NAME
1066 + ": send_queued_message failed, error %d\n",
1068 + list_del(&qmsg->list);
1071 + break; /* Only send one message at a time */
1073 + up(&messages_list_lock);
1076 + if (signal_pending(current))
1079 + P_COMMS("closing down\n");
1081 + quit_threads = 1; /* force other thread to die too */
1083 + /* Wait for membership thread to finish, that way any
1084 + LEAVE message will get sent. */
1085 + wake_up_process(membership_task);
1086 + wait_for_completion(&member_thread_comp);
1090 + if (timer_pending(&ack_timer))
1091 + del_timer(&ack_timer);
1096 + complete(&cluster_thread_comp);
1100 +void notify_kernel_listeners(kcl_callback_reason reason, long arg)
1102 + struct kernel_notify_struct *knotify;
1103 + struct list_head *proclist;
1105 + down(&kernel_listener_lock);
1106 + list_for_each(proclist, &kernel_listener_list) {
1108 + list_entry(proclist, struct kernel_notify_struct, list);
1109 + knotify->callback(reason, arg);
1111 + up(&kernel_listener_lock);
1114 +static void check_for_unacked_nodes()
1116 + struct list_head *nodelist;
1117 + struct list_head *temp;
1118 + struct cluster_node *node;
1120 + clear_bit(RESEND_NEEDED, &mainloop_flags);
1123 + P_COMMS("Retry count exceeded -- looking for dead node\n");
1125 + /* Node did not ACK a message after <n> tries, remove it from the
1127 + down(&cluster_members_lock);
1128 + list_for_each_safe(nodelist, temp, &cluster_members_list) {
1129 + node = list_entry(nodelist, struct cluster_node, list);
1131 + P_COMMS("checking node %s: last_acked = %d, last_seq_sent = %d\n",
1132 + node->name, node->last_seq_acked, node->last_seq_sent);
1133 + if (node->state != NODESTATE_DEAD &&
1134 + node->last_seq_acked != node->last_seq_sent && !node->us) {
1135 + printk(KERN_WARNING CMAN_NAME
1136 + ": node %s is not responding - removing from the cluster\n",
1139 + /* Drop this lock or we can deadlock with membership */
1140 + up(&cluster_members_lock);
1142 + /* Start a state transition */
1143 + a_node_just_died(node);
1144 + down(&cluster_members_lock);
1147 + up(&cluster_members_lock);
1148 + acks_expected = ack_count = 0;
1153 +static void ack_timer_fn(unsigned long arg)
1155 + P_COMMS("%ld: ack_timer fired, retries=%d\n", jiffies, retry_count);
1157 + /* Too many retries ? */
1158 + if (++retry_count > MAX_RETRIES) {
1159 + set_bit(ACK_TIMEOUT, &mainloop_flags);
1160 + wake_up_interruptible(&cnxman_waitq);
1163 + /* Resend last message */
1164 + set_bit(RESEND_NEEDED, &mainloop_flags);
1165 + wake_up_interruptible(&cnxman_waitq);
1169 +/* Called to resend a packet if sock_sendmsg was busy */
1170 +static void short_timer_fn(unsigned long arg)
1172 + P_COMMS("short_timer fired\n");
1174 + /* Resend last message */
1175 + resend_delay <<= 1;
1176 + set_bit(RESEND_NEEDED, &mainloop_flags);
1177 + wake_up_interruptible(&cnxman_waitq);
1180 +static void start_ack_timer()
1182 + ack_timer.function = ack_timer_fn;
1183 + ack_timer.data = 0L;
1184 + mod_timer(&ack_timer, jiffies + HZ);
1187 +static void start_short_timer(void)
1189 + ack_timer.function = short_timer_fn;
1190 + ack_timer.data = 0L;
1191 + mod_timer(&ack_timer, jiffies + (resend_delay * HZ));
1195 +static struct cl_waiting_listen_request *find_listen_request(unsigned short tag)
1197 + struct list_head *llist;
1198 + struct cl_waiting_listen_request *listener;
1200 + list_for_each(llist, &listenreq_list) {
1201 + listener = list_entry(llist, struct cl_waiting_listen_request,
1203 + if (listener->tag == tag) {
1210 +static void process_ack(struct cluster_node *rem_node, unsigned short seq)
1212 + if (rem_node && rem_node->state != NODESTATE_DEAD) {
1213 + /* This copes with duplicate acks from a multipathed
1215 + if (rem_node->last_seq_acked !=
1216 + le16_to_cpu(seq)) {
1217 + rem_node->last_seq_acked =
1221 + if (++ack_count >= acks_expected) {
1223 + /* Cancel the timer */
1224 + del_timer(&ack_timer);
1225 + acks_expected = 0;
1232 +static void process_cnxman_message(struct cl_comms_socket *csock, char *data,
1233 + int len, char *addr, int addrlen,
1234 + struct cluster_node *rem_node)
1236 + struct cl_protmsg *msg = (struct cl_protmsg *) data;
1237 + struct cl_protheader *header = (struct cl_protheader *) data;
1238 + struct cl_ackmsg *ackmsg;
1239 + struct cl_listenmsg *listenmsg;
1240 + struct cl_closemsg *closemsg;
1241 + struct cl_barriermsg *barriermsg;
1242 + struct cl_waiting_listen_request *listen_request;
1244 + P_COMMS("Message on port 0 is %d\n", msg->cmd);
1245 + switch (msg->cmd) {
1246 + case CLUSTER_CMD_ACK:
1247 + ackmsg = (struct cl_ackmsg *) data;
1249 + if (rem_node && (ackmsg->aflags & 1)) {
1250 + if (net_ratelimit())
1251 + printk(KERN_INFO CMAN_NAME
1252 + ": WARNING no listener for port %d on node %s\n",
1253 + ackmsg->remport, rem_node->name);
1255 + P_COMMS("Got ACK from %s. seq=%d (cur=%d)\n",
1256 + rem_node ? rem_node->name : "Unknown",
1257 + le16_to_cpu(ackmsg->header.ack), cur_seq);
1259 + /* ACK processing has already happened */
1262 + /* Return 1 if we have a listener on this port, 0 if not */
1263 + case CLUSTER_CMD_LISTENREQ:
1265 + (struct cl_listenmsg *) (data +
1266 + sizeof (struct cl_protheader));
1267 + cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1268 + send_listen_response(csock, le32_to_cpu(header->srcid),
1269 + listenmsg->target_port, listenmsg->tag);
1272 + case CLUSTER_CMD_LISTENRESP:
1273 + /* Wake up process waiting for listen response */
1275 + (struct cl_listenmsg *) (data +
1276 + sizeof (struct cl_protheader));
1277 + cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1278 + down(&listenreq_lock);
1279 + listen_request = find_listen_request(listenmsg->tag);
1280 + if (listen_request) {
1281 + listen_request->result = listenmsg->listening;
1282 + listen_request->waiting = 0;
1283 + wake_up_interruptible(&listen_request->waitq);
1285 + up(&listenreq_lock);
1288 + case CLUSTER_CMD_PORTCLOSED:
1290 + (struct cl_closemsg *) (data +
1291 + sizeof (struct cl_protheader));
1292 + cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1293 + post_close_oob(closemsg->port, le32_to_cpu(header->srcid));
1296 + case CLUSTER_CMD_BARRIER:
1298 + (struct cl_barriermsg *) (data +
1299 + sizeof (struct cl_protheader));
1300 + cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1302 + process_barrier_msg(barriermsg, rem_node);
1306 + printk(KERN_ERR CMAN_NAME
1307 + ": Unknown protocol message %d received\n", msg->cmd);
1314 +static int valid_addr_for_node(struct cluster_node *node, char *addr)
1316 + struct list_head *addrlist;
1317 + struct cluster_node_addr *nodeaddr;
1319 + /* We don't compare the first two bytes of the address because it's
1320 + * the Address Family and always in native byte order...so it will
1321 + * not match if we have mixed big & little-endian machines in the cluster
1324 + list_for_each(addrlist, &node->addr_list) {
1325 + nodeaddr = list_entry(addrlist, struct cluster_node_addr, list);
1327 + if (memcmp(nodeaddr->addr+2, addr+2, address_length-2) == 0)
1328 + return 1; /* TRUE */
1330 + return 0; /* FALSE */
1333 +static void memcpy_fromkvec(void *data, struct kvec *vec, int len)
1336 + if (vec->iov_len) {
1337 + int copy = min_t(unsigned int, len, vec->iov_len);
1338 + memcpy(data, vec->iov_base, copy);
1341 + vec->iov_base += copy;
1342 + vec->iov_len -= copy;
1348 +static int send_to_user_port(struct cl_comms_socket *csock,
1349 + struct cl_protheader *header,
1350 + struct msghdr *msg,
1351 + struct kvec *iov, int veclen,
1354 + struct sk_buff *skb;
1355 + struct cb_info *cbinfo;
1358 + /* Get the port number and look for a listener */
1359 + down(&port_array_lock);
1360 + if (port_array[header->tgtport]) {
1361 + struct cluster_sock *c = cluster_sk(port_array[header->tgtport]);
1364 + if (!(header->flags & MSG_NOACK) &&
1365 + !(header->flags & MSG_REPLYEXP)) {
1367 + cl_sendack(csock, header->seq, msg->msg_namelen,
1368 + msg->msg_name, header->tgtport, 0);
1371 + /* Call a callback if there is one */
1372 + if (c->kernel_callback) {
1373 + up(&port_array_lock);
1374 + if (veclen == 1) {
1375 + c->kernel_callback(iov->iov_base,
1377 + msg->msg_name, msg->msg_namelen,
1378 + le32_to_cpu(header->srcid));
1381 + else { /* Unroll iov, this Hardly ever Happens */
1383 + data = kmalloc(len, GFP_KERNEL);
1387 + memcpy_fromkvec(data, iov, len);
1388 + c->kernel_callback(data, len,
1389 + msg->msg_name, msg->msg_namelen,
1390 + le32_to_cpu(header->srcid));
1396 + /* Otherwise put it into an SKB and pass it onto the recvmsg
1398 + skb = alloc_skb(len, GFP_KERNEL);
1400 + up(&port_array_lock);
1401 + printk(KERN_INFO CMAN_NAME
1402 + ": Failed to allocate skb\n");
1406 + skb_put(skb, len);
1407 + memcpy_fromkvec(skb->data, iov, len);
1409 + /* Put metadata into cb[] */
1410 + cbinfo = (struct cb_info *)skb->cb;
1411 + cbinfo->orig_nodeid = le32_to_cpu(header->srcid);
1412 + cbinfo->orig_port = header->srcport;
1416 + sock_queue_rcv_skb(port_array[header->tgtport], skb)) < 0) {
1418 + printk(KERN_INFO CMAN_NAME
1419 + ": Error queueing request to port %d: %d\n",
1420 + header->tgtport, err);
1423 + /* If the port was MEMBERSHIP then we have to die */
1424 + if (header->tgtport == CLUSTER_PORT_MEMBERSHIP) {
1425 + up(&port_array_lock);
1426 + send_leave(CLUSTER_LEAVEFLAG_PANIC);
1427 + panic("membership stopped responding");
1430 + up(&port_array_lock);
1434 + /* ACK it, but set the flag bit so remote end knows no-one
1436 + if (!(header->flags & MSG_NOACK))
1437 + cl_sendack(csock, header->seq,
1438 + msg->msg_namelen, msg->msg_name,
1439 + header->tgtport, 1);
1441 + /* Nobody listening, drop it */
1442 + up(&port_array_lock);
1447 +/* NOTE: This routine knows (assumes!) that there is only one
1448 + iov element passed into it. */
1449 +static void process_incoming_packet(struct cl_comms_socket *csock,
1450 + struct msghdr *msg,
1451 + struct kvec *vec, int veclen, int len)
1453 + char *data = vec->iov_base;
1454 + char *addr = msg->msg_name;
1455 + int addrlen = msg->msg_namelen;
1456 + struct cl_protheader *header = (struct cl_protheader *) data;
1457 + struct cluster_node *rem_node =
1458 + find_node_by_nodeid(le32_to_cpu(header->srcid));
1460 + P_COMMS("seen message, from %d for %d, sequence num = %d, rem_node=%p, state=%d\n",
1461 + le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
1462 + le16_to_cpu(header->seq), rem_node,
1463 + rem_node ? rem_node->state : -1);
1465 + /* If the remote end is being coy about its node ID then look it up by
1467 + if (!rem_node && header->srcid == 0) {
1468 + rem_node = find_node_by_addr(addr, addrlen);
1471 + /* If this node is an ex-member then treat it as unknown */
1472 + if (rem_node && rem_node->state != NODESTATE_MEMBER
1473 + && rem_node->state != NODESTATE_JOINING)
1476 + /* Ignore messages not for our cluster */
1477 + if (le16_to_cpu(header->cluster) != cluster_id) {
1478 + P_COMMS("Dumping message - wrong cluster ID (us=%d, msg=%d)\n",
1479 + cluster_id, header->cluster);
1480 + goto incoming_finish;
1483 + /* If the message is from us then just dump it */
1484 + if (rem_node && rem_node->us)
1485 + goto incoming_finish;
1487 + /* If we can't find the nodeid then check for our own messages the hard
1488 + * way - this only happens during joining */
1490 + struct list_head *socklist;
1491 + struct cl_comms_socket *clsock;
1493 + list_for_each(socklist, &socket_list) {
1495 + list_entry(socklist, struct cl_comms_socket, list);
1497 + if (clsock->recv_only) {
1499 + if (memcmp(addr, &clsock->saddr, address_length) == 0) {
1500 + goto incoming_finish;
1507 + /* Ignore messages not for us */
1508 + if (le32_to_cpu(header->tgtid) > 0 && us
1509 + && le32_to_cpu(header->tgtid) != us->node_id) {
1510 + goto incoming_finish;
1513 + P_COMMS("got message, from %d for %d, sequence num = %d\n",
1514 + le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
1515 + le16_to_cpu(header->seq));
1517 + if (header->ack && rem_node) {
1518 + process_ack(rem_node, header->ack);
1521 + /* Have we received this message before ? If so just ignore it, it's a
1522 + * resend for someone else's benefit */
1523 + if (!(header->flags & MSG_NOACK) &&
1524 + rem_node && le16_to_cpu(header->seq) == rem_node->last_seq_recv) {
1526 + ("Discarding message - Already seen this sequence number %d\n",
1527 + rem_node->last_seq_recv);
1528 + /* Still need to ACK it though, in case it was the ACK that got
1530 + cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
1531 + goto incoming_finish;
1534 + /* Check that the message is from the node we think it is from */
1535 + if (rem_node && !valid_addr_for_node(rem_node, addr)) {
1539 + /* If it's a new node then assign it a temporary node ID */
1541 + header->srcid = cpu_to_le32(new_temp_nodeid(addr, addrlen));
1543 + P_COMMS("Got message: flags = %x, port = %d, we_are_a_member = %d\n",
1544 + header->flags, header->tgtport, we_are_a_cluster_member);
1547 + /* If we are not part of the cluster then ignore multicast messages
1548 + * that need an ACK as we will confuse the sender who is only expecting
1549 + * ACKS from bona fide members */
1550 + if ((header->flags & MSG_MULTICAST) &&
1551 + !(header->flags & MSG_NOACK) && !we_are_a_cluster_member) {
1553 + ("Discarding message - multicast and we are not a cluster member. port=%d flags=%x\n",
1554 + header->tgtport, header->flags);
1555 + goto incoming_finish;
1558 + /* Save the sequence number of this message so we can ignore duplicates
1560 + if (!(header->flags & MSG_NOACK) && rem_node) {
1561 + P_COMMS("Saving seq %d for node %s\n", le16_to_cpu(header->seq),
1563 + rem_node->last_seq_recv = le16_to_cpu(header->seq);
1566 + /* Is it a protocol message? */
1567 + if (header->tgtport == 0) {
1568 + process_cnxman_message(csock, data, len, addr, addrlen,
1570 + goto incoming_finish;
1573 + /* Skip past the header to the data */
1574 + vec[0].iov_base = data + sizeof (struct cl_protheader);
1575 + vec[0].iov_len -= sizeof (struct cl_protheader);
1576 + len -= sizeof (struct cl_protheader);
1578 + send_to_user_port(csock, header, msg, vec, veclen, len);
1584 +static struct sock *cl_alloc_sock(struct socket *sock, int gfp)
1587 + struct cluster_sock *c;
1590 + sk_alloc(AF_CLUSTER, gfp, sizeof (struct cluster_sock),
1591 + cluster_sk_cachep)) == NULL)
1595 + sock->ops = &cl_proto_ops;
1597 + sock_init_data(sock, sk);
1599 + sk->sk_destruct = NULL;
1600 + sk->sk_no_check = 1;
1601 + sk->sk_family = PF_CLUSTER;
1602 + sk->sk_allocation = gfp;
1604 + c = cluster_sk(sk);
1606 + c->service_data = NULL;
1613 +static int cl_release(struct socket *sock)
1615 + struct sock *sk = sock->sk;
1616 + struct cl_client_socket *csock;
1617 + struct list_head *socklist;
1618 + struct list_head *tmp;
1620 + down(&client_socket_lock);
1622 + /* Remove port allocations if it's a bound socket */
1623 + struct cluster_sock *c = cluster_sk(sk);
1625 + down(&port_array_lock);
1627 + port_array[c->port] = NULL;
1629 + up(&port_array_lock);
1631 + /* Tell other nodes in the cluster that this listener is going
1633 + if (atomic_read(&cnxman_running) && c->port)
1634 + send_port_close_oob(c->port);
1636 + if (c->service_data)
1637 + sm_sock_release(sock);
1639 + /* Master socket released ? */
1640 + if (sk->sk_protocol == CLPROTO_MASTER) {
1641 + master_sock = NULL;
1643 + /* If this socket is being freed and cnxman is not
1644 + * started then free all the comms sockets as either
1645 + * the userland "join" process has crashed or the
1648 + if (!atomic_read(&cnxman_running)) {
1650 + free_cluster_sockets();
1663 + /* Remove it from the list of clients */
1664 + list_for_each_safe(socklist, tmp, &client_socket_list) {
1665 + csock = list_entry(socklist, struct cl_client_socket, list);
1667 + if (csock->sock == sock) {
1668 + list_del(&csock->list);
1673 + up(&client_socket_lock);
1678 +static int cl_create(struct socket *sock, int protocol)
1682 + /* All are datagrams */
1683 + if (sock->type != SOCK_DGRAM)
1684 + return -ESOCKTNOSUPPORT;
1686 + if (protocol == CLPROTO_MASTER && !capable(CAP_CLUSTER))
1689 + /* Can only have one master socket */
1690 + if (master_sock && protocol == CLPROTO_MASTER)
1693 + /* cnxman not running and a client was requested */
1694 + if (!atomic_read(&cnxman_running) && protocol != CLPROTO_MASTER)
1697 + if ((sk = cl_alloc_sock(sock, GFP_KERNEL)) == NULL)
1700 + sk->sk_protocol = protocol;
1702 + if (protocol == CLPROTO_MASTER)
1705 + /* Add client sockets to the list */
1706 + if (protocol == CLPROTO_CLIENT) {
1707 + struct cl_client_socket *clsock =
1708 + kmalloc(sizeof (struct cl_client_socket), GFP_KERNEL);
1713 + clsock->sock = sock;
1714 + down(&client_socket_lock);
1715 + list_add(&clsock->list, &client_socket_list);
1716 + up(&client_socket_lock);
1722 +static int cl_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1724 + struct sock *sk = sock->sk;
1725 + struct sockaddr_cl *saddr = (struct sockaddr_cl *) uaddr;
1726 + struct cluster_sock *c = cluster_sk(sk);
1728 + if (!capable(CAP_NET_BIND_SERVICE))
1731 + if (sk->sk_zapped == 0)
1734 + if (addr_len != sizeof (struct sockaddr_cl))
1737 + if (saddr->scl_family != AF_CLUSTER)
1740 + if (saddr->scl_port == 0)
1741 + return -EINVAL; /* Port 0 is reserved for protocol messages */
1743 + down(&port_array_lock);
1745 + if (port_array[saddr->scl_port]) {
1746 + up(&port_array_lock);
1747 + return -EADDRINUSE;
1750 + port_array[saddr->scl_port] = sk;
1752 + up(&port_array_lock);
1754 + c->port = saddr->scl_port;
1755 + sk->sk_zapped = 0;
1757 + /* If we are not a cluster member yet then make the client wait until
1758 + * we are, this allows nodes to start cluster clients at the same time
1759 + * as cluster services but they will wait until membership is achieved.
1760 + * This looks odd in bind() (open would seem more obvious) but we need
1761 + * to know which port number is being used so that things like
1762 + * membership services don't get blocked
1765 + if (saddr->scl_port > HIGH_PROTECTED_PORT)
1766 + while (!we_are_a_cluster_member || !cluster_is_quorate
1767 + || in_transition()) {
1768 + DECLARE_WAITQUEUE(wq, current);
1769 + struct task_struct *tsk = current;
1771 + set_task_state(tsk, TASK_INTERRUPTIBLE);
1772 + add_wait_queue(&socket_waitq, &wq);
1774 + if (!we_are_a_cluster_member || !cluster_is_quorate
1775 + || in_transition())
1778 + set_task_state(tsk, TASK_RUNNING);
1779 + remove_wait_queue(&socket_waitq, &wq);
1781 + /* We were woken up because the cluster is going down,
1782 + * ...and we never got a chance to do any work! (sob) */
1783 + if (atomic_read(&cnxman_running) == 0 || quit_threads) {
1791 +static int cl_getname(struct socket *sock, struct sockaddr *uaddr,
1792 + int *uaddr_len, int peer)
1794 + struct sockaddr_cl *sa = (struct sockaddr_cl *) uaddr;
1795 + struct sock *sk = sock->sk;
1796 + struct cluster_sock *c = cluster_sk(sk);
1798 + *uaddr_len = sizeof (struct sockaddr_cl);
1802 + sa->scl_port = c->port;
1803 + sa->scl_flags = 0;
1804 + sa->scl_family = AF_CLUSTER;
1811 +static unsigned int cl_poll(struct file *file, struct socket *sock,
1812 + poll_table * wait)
1814 + return datagram_poll(file, sock, wait);
1817 +/* Copy internal node format to userland format */
1818 +void copy_to_usernode(struct cluster_node *node,
1819 + struct cl_cluster_node *unode)
1821 + strcpy(unode->name, node->name);
1822 + unode->size = sizeof (struct cl_cluster_node);
1823 + unode->votes = node->votes;
1824 + unode->state = node->state;
1825 + unode->us = node->us;
1826 + unode->node_id = node->node_id;
1827 + unode->leave_reason = node->leave_reason;
1828 + unode->incarnation = node->incarnation;
1831 +static int add_clsock(int broadcast, int number, struct socket *sock,
1832 + struct file *file)
1834 + struct cl_comms_socket *newsock =
1835 + kmalloc(sizeof (struct cl_comms_socket), GFP_KERNEL);
1839 + memset(newsock, 0, sizeof (*newsock));
1840 + newsock->number = number;
1841 + newsock->sock = sock;
1843 + newsock->broadcast = 1;
1844 + newsock->recv_only = 0;
1847 + newsock->broadcast = 0;
1848 + newsock->recv_only = 1;
1851 + newsock->file = file;
1852 + newsock->addr_len = sizeof(struct sockaddr_in6);
1854 + /* Mark it active until cnxman thread is running and ready to process
1856 + set_bit(1, &newsock->active);
1858 + /* Find out what it's bound to */
1859 + newsock->sock->ops->getname(newsock->sock,
1860 + (struct sockaddr *)&newsock->saddr,
1861 + &newsock->addr_len, 0);
1863 + num_interfaces = max(num_interfaces, newsock->number);
1864 + if (!current_interface && newsock->broadcast)
1865 + current_interface = newsock;
1867 + /* Hook data_ready */
1868 + newsock->sock->sk->sk_data_ready = cnxman_data_ready;
1870 + /* Make an attempt to keep them in order */
1871 + list_add_tail(&newsock->list, &socket_list);
1873 + address_length = newsock->addr_len;
1877 +/* ioctl processing functions */
1879 +static int do_ioctl_set_version(unsigned long arg)
1881 + struct cl_version version, *u_version;
1883 + if (!capable(CAP_CLUSTER))
1888 + u_version = (struct cl_version *) arg;
1890 + if (copy_from_user(&version, u_version, sizeof(struct cl_version)))
1893 + if (version.major != CNXMAN_MAJOR_VERSION ||
1894 + version.minor != CNXMAN_MINOR_VERSION ||
1895 + version.patch != CNXMAN_PATCH_VERSION)
1898 + if (config_version == version.config)
1901 + config_version = version.config;
1902 + send_reconfigure(RECONFIG_PARAM_CONFIG_VERSION, config_version);
1906 +static int do_ioctl_get_members(unsigned long arg)
1908 + struct cluster_node *node;
1909 + /* Kernel copies */
1910 + struct cl_cluster_node user_format_node;
1911 + struct cl_cluster_nodelist user_format_nodelist;
1912 + /* User space array ptr */
1913 + struct cl_cluster_node *user_node;
1914 + struct list_head *nodelist;
1915 + int num_nodes = 0;
1918 + return cluster_members;
1920 + if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
1923 + down(&cluster_members_lock);
1925 + if (user_format_nodelist.max_members < cluster_members) {
1926 + up(&cluster_members_lock);
1930 + user_node = user_format_nodelist.nodes;
1932 + list_for_each(nodelist, &cluster_members_list) {
1933 + node = list_entry(nodelist, struct cluster_node, list);
1934 + if (node->state == NODESTATE_MEMBER) {
1935 + copy_to_usernode(node, &user_format_node);
1936 + if (copy_to_user(user_node, &user_format_node,
1937 + sizeof (struct cl_cluster_node))) {
1938 + up(&cluster_members_lock);
1945 + up(&cluster_members_lock);
1950 +static int do_ioctl_get_all_members(unsigned long arg)
1952 + struct cluster_node *node;
1953 + /* Kernel copies */
1954 + struct cl_cluster_node user_format_node;
1955 + struct cl_cluster_nodelist user_format_nodelist;
1956 + /* User space array ptr*/
1957 + struct cl_cluster_node *user_node;
1958 + struct list_head *nodelist;
1959 + int num_nodes = 0;
1961 + if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
1964 + down(&cluster_members_lock);
1966 + user_node = user_format_nodelist.nodes;
1968 + list_for_each(nodelist, &cluster_members_list) {
1969 + node = list_entry(nodelist, struct cluster_node, list);
1971 + copy_to_usernode(node,
1972 + &user_format_node);
1974 + if (copy_to_user(user_node, &user_format_node,
1975 + sizeof (struct cl_cluster_node))) {
1976 + up(&cluster_members_lock);
1980 + if (--user_format_nodelist.max_members < 0) {
1981 + num_nodes = -EFAULT;
1989 + up(&cluster_members_lock);
1995 +static int do_ioctl_get_cluster(unsigned long arg)
1997 + struct cl_cluster_info __user *info;
1999 + info = (struct cl_cluster_info *)arg;
2001 + if (copy_to_user(&info->number, &cluster_id, sizeof(cluster_id)))
2004 + if (copy_to_user(&info->name, cluster_name, strlen(cluster_name)+1))
2010 +static int do_ioctl_get_node(unsigned long arg)
2012 + struct cluster_node *node;
2013 + struct cl_cluster_node k_node, *u_node;
2015 + u_node = (struct cl_cluster_node *) arg;
2017 + if (copy_from_user(&k_node, u_node, sizeof(struct cl_cluster_node)))
2020 + if (!k_node.name[0]) {
2021 + if (k_node.node_id == 0)
2022 + k_node.node_id = us->node_id;
2023 + node = find_node_by_nodeid(k_node.node_id);
2026 + node = find_node_by_name(k_node.name);
2031 + copy_to_usernode(node, &k_node);
2033 + if (copy_to_user(u_node, &k_node, sizeof(struct cl_cluster_node)))
2039 +static int do_ioctl_set_expected(unsigned long arg)
2041 + struct list_head *nodelist;
2042 + struct cluster_node *node;
2043 + unsigned int total_votes;
2044 + unsigned int newquorum;
2046 + if (!capable(CAP_CLUSTER))
2051 + newquorum = calculate_quorum(1, arg, &total_votes);
2053 + if (newquorum < total_votes / 2
2054 + || newquorum > total_votes) {
2059 + down(&cluster_members_lock);
2060 + list_for_each(nodelist, &cluster_members_list) {
2061 + node = list_entry(nodelist, struct cluster_node, list);
2062 + if (node->state == NODESTATE_MEMBER
2063 + && node->expected_votes > arg) {
2064 + node->expected_votes = arg;
2067 + up(&cluster_members_lock);
2069 + recalculate_quorum(1);
2071 + send_reconfigure(RECONFIG_PARAM_EXPECTED_VOTES, arg);
2072 + sm_member_update(cluster_is_quorate);
2077 +static int do_ioctl_kill_node(unsigned long arg)
2079 + struct cluster_node *node;
2081 + if (!capable(CAP_CLUSTER))
2085 + if ((node = find_node_by_nodeid(arg)) == NULL)
2088 + /* Can't kill us */
2092 + if (node->state != NODESTATE_MEMBER)
2095 + /* Just in case it is alive, send a KILL message */
2098 + node->leave_reason = CLUSTER_LEAVEFLAG_KILLED;
2099 + a_node_just_died(node);
2104 +static int do_ioctl_barrier(unsigned long arg)
2106 + struct cl_barrier_info info;
2108 + if (!capable(CAP_CLUSTER))
2111 + if (copy_from_user(&info, (void *)arg, sizeof(info)) != 0)
2114 + switch (info.cmd) {
2115 + case BARRIER_IOCTL_REGISTER:
2116 + return kcl_barrier_register(info.name,
2119 + case BARRIER_IOCTL_CHANGE:
2120 + return kcl_barrier_setattr(info.name,
2123 + case BARRIER_IOCTL_WAIT:
2124 + return kcl_barrier_wait(info.name);
2125 + case BARRIER_IOCTL_DELETE:
2126 + return kcl_barrier_delete(info.name);
2132 +static int do_ioctl_islistening(unsigned long arg)
2134 + DECLARE_WAITQUEUE(wq, current);
2135 + struct cl_listen_request rq;
2136 + struct cluster_node *rem_node;
2139 + struct cl_waiting_listen_request *listen_request;
2144 + if (copy_from_user(&rq, (void *) arg, sizeof (rq)) != 0)
2147 + nodeid = rq.nodeid;
2149 + nodeid = us->node_id;
2151 + rem_node = find_node_by_nodeid(nodeid);
2153 + /* Node not in the cluster */
2157 + if (rem_node->state != NODESTATE_MEMBER)
2160 + /* If the request is for us then just look in the ports
2163 + return (port_array[rq.port] != 0) ? 1 : 0;
2165 + /* For a remote node we need to send a request out */
2167 + /* If we are in transition then wait until we are not */
2168 + while (in_transition()) {
2169 + set_task_state(current, TASK_INTERRUPTIBLE);
2170 + add_wait_queue(&socket_waitq, &wq);
2172 + if (in_transition())
2175 + set_task_state(current, TASK_RUNNING);
2176 + remove_wait_queue(&socket_waitq, &wq);
2178 + if (signal_pending(current))
2182 + /* Were we shut down before it completed ? */
2183 + if (!atomic_read(&cnxman_running))
2187 + kmalloc(sizeof (struct cl_waiting_listen_request),
2189 + if (!listen_request)
2192 + /* Build the request */
2193 + listen_request->waiting = 1;
2194 + listen_request->result = 0;
2195 + listen_request->tag = current->pid;
2196 + listen_request->nodeid = nodeid;
2197 + init_waitqueue_head(&listen_request->waitq);
2199 + down(&listenreq_lock);
2200 + list_add(&listen_request->list, &listenreq_list);
2201 + up(&listenreq_lock);
2203 + /* Now wait for the response to come back */
2204 + send_listen_request(rq.nodeid, rq.port);
2206 + while (listen_request->waiting) {
2207 + set_task_state(current, TASK_INTERRUPTIBLE);
2208 + add_wait_queue(&listen_request->waitq, &wq);
2210 + if (listen_request->waiting)
2213 + set_task_state(current, TASK_RUNNING);
2214 + remove_wait_queue(&listen_request->waitq, &wq);
2216 + if (signal_pending(current)) {
2217 + result = -ERESTARTSYS;
2221 + result = listen_request->result;
2224 + down(&listenreq_lock);
2225 + list_del(&listen_request->list);
2226 + kfree(listen_request);
2227 + up(&listenreq_lock);
2231 +static int do_ioctl_set_votes(unsigned long arg)
2233 + unsigned int total_votes;
2234 + unsigned int newquorum;
2237 + if (!capable(CAP_CLUSTER))
2240 + /* Check votes is valid */
2241 + saved_votes = us->votes;
2244 + newquorum = calculate_quorum(1, 0, &total_votes);
2246 + if (newquorum < total_votes / 2 || newquorum > total_votes) {
2247 + us->votes = saved_votes;
2251 + recalculate_quorum(1);
2253 + send_reconfigure(RECONFIG_PARAM_NODE_VOTES, arg);
2258 +static int do_ioctl_pass_socket(unsigned long arg)
2260 + struct cl_passed_sock sock_info;
2261 + struct file *file;
2264 + if (!capable(CAP_CLUSTER))
2267 + if (atomic_read(&cnxman_running))
2272 + if (copy_from_user(&sock_info, (void *)arg, sizeof(sock_info)))
2275 + file = fget(sock_info.fd);
2277 + struct inode *inode = file->f_dentry->d_inode;
2279 + error = add_clsock(sock_info.multicast,
2280 + sock_info.number, SOCKET_I(inode),
2289 +static int do_ioctl_set_nodename(unsigned long arg)
2291 + if (!capable(CAP_CLUSTER))
2293 + if (atomic_read(&cnxman_running))
2295 + if (strncpy_from_user(nodename, (void *)arg, MAX_CLUSTER_MEMBER_NAME_LEN) < 0)
2300 +static int do_ioctl_set_nodeid(unsigned long arg)
2302 + int nodeid = (int)arg;
2304 + if (!capable(CAP_CLUSTER))
2306 + if (atomic_read(&cnxman_running))
2308 + if (nodeid < 0 || nodeid > 4096)
2311 + wanted_nodeid = (int)arg;
2315 +static int do_ioctl_join_cluster(unsigned long arg)
2317 + struct cl_join_cluster_info join_info;
2319 + if (!capable(CAP_CLUSTER))
2322 + if (atomic_read(&cnxman_running))
2325 + if (copy_from_user(&join_info, (void *)arg, sizeof (struct cl_join_cluster_info) ))
2328 + if (strlen(join_info.cluster_name) > MAX_CLUSTER_NAME_LEN)
2331 + if (list_empty(&socket_list))
2334 + set_votes(join_info.votes, join_info.expected_votes);
2335 + cluster_id = generate_cluster_id(join_info.cluster_name);
2336 + strncpy(cluster_name, join_info.cluster_name, MAX_CLUSTER_NAME_LEN);
2337 + two_node = join_info.two_node;
2338 + config_version = join_info.config_version;
2341 + acks_expected = 0;
2342 + init_completion(&cluster_thread_comp);
2343 + init_completion(&member_thread_comp);
2344 + if (allocate_nodeid_array())
2347 + kcluster_pid = kernel_thread(cluster_kthread, NULL, 0);
2348 + if (kcluster_pid < 0)
2349 + return kcluster_pid;
2351 + wait_for_completion(&cluster_thread_comp);
2352 + init_completion(&cluster_thread_comp);
2354 + atomic_set(&cnxman_running, 1);
2356 + /* Make sure we have a node name */
2357 + if (nodename[0] == '\0')
2358 + strcpy(nodename, system_utsname.nodename);
2360 + membership_pid = start_membership_services(kcluster_pid);
2361 + if (membership_pid < 0) {
2363 + wait_for_completion(&cluster_thread_comp);
2364 + init_completion(&member_thread_comp);
2365 + return membership_pid;
2372 +static int do_ioctl_leave_cluster(unsigned long leave_flags)
2374 + if (!capable(CAP_CLUSTER))
2377 + if (!atomic_read(&cnxman_running))
2380 + if (in_transition())
2383 + /* Ignore the use count if FORCE is set */
2384 + if (!(leave_flags & CLUSTER_LEAVEFLAG_FORCE)) {
2385 + if (atomic_read(&use_count))
2389 + us->leave_reason = leave_flags;
2391 + wake_up_interruptible(&cnxman_waitq);
2393 + wait_for_completion(&cluster_thread_comp);
2394 + atomic_set(&use_count, 0);
2398 +static int cl_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2400 + int err = -EOPNOTSUPP;
2401 + struct list_head *proclist;
2402 + struct list_head *tmp;
2403 + struct notify_struct *notify;
2404 + struct cl_version cnxman_version;
2407 + /* Process requests notification of cluster events */
2408 + case SIOCCLUSTER_NOTIFY:
2409 + notify = kmalloc(sizeof (struct notify_struct), GFP_KERNEL);
2412 + notify->pid = current->pid;
2413 + notify->signal = arg;
2414 + down(&event_listener_lock);
2415 +	list_add(&notify->list, &event_listener_list);
2416 + up(&event_listener_lock);
2420 + /* Process is no longer interested cluster events */
2421 + case SIOCCLUSTER_REMOVENOTIFY:
2424 + down(&event_listener_lock);
2425 + list_for_each_safe(proclist, tmp, &event_listener_list) {
2427 + list_entry(proclist, struct notify_struct, list);
2428 + if (notify->pid == current->pid) {
2429 +			list_del(&notify->list);
2434 + up(&event_listener_lock);
2437 + /* Return the cnxman version number */
2438 + case SIOCCLUSTER_GET_VERSION:
2442 + cnxman_version.major = CNXMAN_MAJOR_VERSION;
2443 + cnxman_version.minor = CNXMAN_MINOR_VERSION;
2444 + cnxman_version.patch = CNXMAN_PATCH_VERSION;
2445 + cnxman_version.config = config_version;
2446 + if (copy_to_user((void *) arg, &cnxman_version,
2447 + sizeof (struct cl_version))) {
2452 + /* Set the cnxman config version number */
2453 + case SIOCCLUSTER_SET_VERSION:
2454 + err = do_ioctl_set_version(arg);
2457 + /* Return the active membership list */
2458 + case SIOCCLUSTER_GETMEMBERS:
2459 + err = do_ioctl_get_members(arg);
2462 + /* Return the full membership list include dead nodes */
2463 + case SIOCCLUSTER_GETALLMEMBERS:
2464 + err = do_ioctl_get_all_members(arg);
2467 + case SIOCCLUSTER_GETNODE:
2468 + err = do_ioctl_get_node(arg);
2471 + case SIOCCLUSTER_GETCLUSTER:
2472 + err = do_ioctl_get_cluster(arg);
2475 + case SIOCCLUSTER_ISQUORATE:
2476 + return cluster_is_quorate;
2478 + case SIOCCLUSTER_ISACTIVE:
2479 + return atomic_read(&cnxman_running);
2481 + case SIOCCLUSTER_SETEXPECTED_VOTES:
2482 + err = do_ioctl_set_expected(arg);
2485 + /* Change the number of votes for this node */
2486 + case SIOCCLUSTER_SET_VOTES:
2487 + err = do_ioctl_set_votes(arg);
2490 + /* Return 1 if the specified node is listening on a given port */
2491 + case SIOCCLUSTER_ISLISTENING:
2492 + err = do_ioctl_islistening(arg);
2495 + /* Forcibly kill a node */
2496 + case SIOCCLUSTER_KILLNODE:
2497 + err = do_ioctl_kill_node(arg);
2500 + case SIOCCLUSTER_GET_JOINCOUNT:
2501 + if (!capable(CAP_CLUSTER))
2504 + return atomic_read(&use_count);
2506 + /* ioctl interface to the barrier system */
2507 + case SIOCCLUSTER_BARRIER:
2508 + err = do_ioctl_barrier(arg);
2511 + case SIOCCLUSTER_PASS_SOCKET:
2512 + if (sock->sk->sk_protocol != CLPROTO_MASTER)
2513 + err = -EOPNOTSUPP;
2515 + err = do_ioctl_pass_socket(arg);
2518 + case SIOCCLUSTER_SET_NODENAME:
2519 + if (sock->sk->sk_protocol != CLPROTO_MASTER)
2520 + err = -EOPNOTSUPP;
2522 + err = do_ioctl_set_nodename(arg);
2525 + case SIOCCLUSTER_SET_NODEID:
2526 + if (sock->sk->sk_protocol != CLPROTO_MASTER)
2527 + err = -EOPNOTSUPP;
2529 + err = do_ioctl_set_nodeid(arg);
2532 + case SIOCCLUSTER_JOIN_CLUSTER:
2533 + if (sock->sk->sk_protocol != CLPROTO_MASTER)
2534 + err = -EOPNOTSUPP;
2536 + err = do_ioctl_join_cluster(arg);
2539 + case SIOCCLUSTER_LEAVE_CLUSTER:
2540 + err = do_ioctl_leave_cluster(arg);
2544 + err = sm_ioctl(sock, cmd, arg);
2549 +static int cl_shutdown(struct socket *sock, int how)
2551 + struct sock *sk = sock->sk;
2552 + int err = -ENOTCONN;
2556 + if (sock->state == SS_UNCONNECTED)
2560 + if (sock->state == SS_DISCONNECTING)
2565 + if (how != SHUTDOWN_MASK)
2568 + sk->sk_shutdown = how;
2578 +/* We'll be giving out reward points next... */
2579 +/* Send the packet and save a copy in case someone loses theirs. Should be
2580 + * protected by the send mutexphore */
2581 +static int __send_and_save(struct cl_comms_socket *csock, struct msghdr *msg,
2582 + struct kvec *vec, int veclen,
2583 + int size, int needack)
2586 + struct kvec save_vectors[veclen];
2588 + /* Save a copy of the IO vectors as sendmsg mucks around with them and
2589 + * we might want to send the same stuff out more than once (for different
2592 + memcpy(save_vectors, vec,
2593 + sizeof (struct kvec) * veclen);
2595 + result = kernel_sendmsg(csock->sock, msg, vec, veclen, size);
2597 + if (result >= 0 && acks_expected && needack) {
2599 + /* Start retransmit timer if it didn't go */
2600 + if (result == 0) {
2601 + start_short_timer();
2608 + /* Restore IOVs */
2609 + memcpy(vec, save_vectors,
2610 + sizeof (struct kvec) * veclen);
2615 +static void resend_last_message()
2617 + struct msghdr msg;
2618 + struct kvec vec[1];
2621 + P_COMMS("%ld resending last message: %d bytes: port=%d, cmd=%d\n",
2622 + jiffies, saved_msg_len, saved_msg_buffer[0],
2623 + saved_msg_buffer[6]);
2625 + /* Assume there is something wrong with the last interface */
2626 + current_interface = get_next_interface(current_interface);
2627 + if (num_interfaces > 1)
2628 + printk(KERN_WARNING CMAN_NAME ": Now using interface %d\n",
2629 + current_interface->number);
2631 + vec[0].iov_base = saved_msg_buffer;
2632 + vec[0].iov_len = saved_msg_len;
2634 + memset(&msg, 0, sizeof (msg));
2635 +	msg.msg_name = &current_interface->saddr;
2636 + msg.msg_namelen = current_interface->addr_len;
2638 + result = kernel_sendmsg(current_interface->sock, &msg, vec, 1, saved_msg_len);
2641 + printk(KERN_ERR CMAN_NAME ": resend failed: %d\n", result);
2643 + /* Try indefinitely to send this, the backlog must die down eventually
2646 + start_short_timer();
2648 + /* Send succeeded, continue waiting for ACKS */
2650 + start_ack_timer();
2654 +static int cl_recvmsg(struct kiocb *iocb, struct socket *sock,
2655 + struct msghdr *msg, size_t size, int flags)
2657 + struct sock *sk = sock->sk;
2658 + struct sockaddr_cl *sin = (struct sockaddr_cl *) msg->msg_name;
2659 + struct sk_buff *skb;
2660 + struct cb_info *cbinfo;
2661 + int copied, err = 0;
2663 + /* Socket was notified of shutdown, remove any pending skbs and return
2665 + if (!atomic_read(&cnxman_running)) {
2666 + while ((skb = skb_recv_datagram(sk, flags, MSG_DONTWAIT, &err)))
2667 + skb_free_datagram(sk, skb);
2668 + return 0; /* cnxman has left the building */
2671 + /* Generic datagram code does most of the work. If the user is not
2672 + * interested in OOB messages then ignore them */
2674 + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
2678 + cbinfo = (struct cb_info *)skb->cb;
2680 + /* If it is OOB and the user doesn't want it, then throw it away. */
2681 + if (cbinfo->oob && !(flags & MSG_OOB)) {
2682 + skb_free_datagram(sk, skb);
2684 + /* If we peeked (?) an OOB but the user doesn't want it
2685 + then we need to discard it or we'll loop forever */
2686 + if (flags & MSG_PEEK) {
2687 + skb = skb_recv_datagram(sk, flags & ~MSG_PEEK,
2688 + MSG_DONTWAIT, &err);
2690 + skb_free_datagram(sk, skb);
2696 + while (cbinfo->oob && !(flags & MSG_OOB));
2698 + copied = skb->len;
2699 + if (copied > size) {
2701 + msg->msg_flags |= MSG_TRUNC;
2703 + err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
2708 + if (msg->msg_name && msg->msg_namelen) {
2709 + memset(msg->msg_name, 0, msg->msg_namelen);
2711 + if (msg->msg_namelen >= sizeof (struct sockaddr_cl)) {
2713 + /* Nodeid is in native byte order - anything else is just
2715 + sin->scl_nodeid = cbinfo->orig_nodeid;
2717 + msg->msg_namelen = sizeof (struct sockaddr_cl);
2718 + sin->scl_port = cbinfo->orig_port;
2721 + if (cbinfo->oob) {
2722 + msg->msg_flags |= MSG_OOB;
2725 + sock_recv_timestamp(msg, sk, skb);
2730 + skb_free_datagram(sk, skb);
2736 +/* Send a message out on all interfaces */
2737 +static int send_to_all_ints(int nodeid, struct msghdr *our_msg,
2738 + struct kvec *vec, int veclen, int size, int flags)
2740 + struct sockaddr_in6 daddr;
2741 + struct cl_comms_socket *clsock;
2744 + our_msg->msg_name = &daddr;
2746 + list_for_each_entry(clsock, &socket_list, list) {
2748 + /* Don't send out a recv-only socket */
2749 + if (!clsock->recv_only) {
2751 + /* For temporary node IDs send to the node's real IP address */
2753 + get_addr_from_temp_nodeid(nodeid, (char *)&daddr, &our_msg->msg_namelen);
2756 + memcpy(&daddr, &clsock->saddr, clsock->addr_len);
2757 + our_msg->msg_namelen = clsock->addr_len;
2760 + result = __send_and_save(clsock, our_msg, vec, veclen,
2761 + size + sizeof (struct cl_protheader),
2762 + !(flags & MSG_NOACK));
2769 +/* Internal common send message routine */
2770 +static int __sendmsg(struct socket *sock, struct msghdr *msg,
2771 + struct kvec *vec, int veclen, int size,
2772 + unsigned char port)
2774 + int result = 0, i;
2775 + int flags = msg->msg_flags;
2776 + struct msghdr our_msg;
2777 + struct sockaddr_cl *caddr = msg->msg_name;
2778 + struct cl_protheader header;
2779 + struct kvec vectors[veclen + 1];
2780 + unsigned char srcport;
2783 + if (size > MAX_CLUSTER_MESSAGE)
2785 + if (!atomic_read(&cnxman_running))
2789 + nodeid = caddr->scl_nodeid;
2791 + /* Check that the node id (if present) is valid */
2792 + if (msg->msg_namelen && (!find_node_by_nodeid(nodeid) &&
2793 + !is_valid_temp_nodeid(nodeid))) {
2797 + /* If there's no sending client socket then the source
2798 + port is 0: "us" */
2800 + struct cluster_sock *csock = cluster_sk(sock->sk);
2801 + srcport = csock->port;
2807 + /* We can only have one send outstanding at a time so we might as well
2808 + * lock the whole send mechanism */
2811 + while ((port > HIGH_PROTECTED_PORT
2812 + && (!cluster_is_quorate || in_transition()))
2813 + || (acks_expected > 0 && !(msg->msg_flags & MSG_NOACK))) {
2815 + DECLARE_WAITQUEUE(wq, current);
2816 + struct task_struct *tsk = current;
2818 + if (flags & MSG_DONTWAIT) {
2823 + if (current->pid == kcluster_pid) {
2825 + ("Tried to make kclusterd wait, port=%d, acks_count=%d, expected=%d\n",
2826 + port, ack_count, acks_expected);
2831 + P_COMMS("%s process waiting. acks=%d, expected=%d\n", tsk->comm,
2832 + ack_count, acks_expected);
2834 + set_task_state(tsk, TASK_INTERRUPTIBLE);
2835 + add_wait_queue(&socket_waitq, &wq);
2837 + if ((port > HIGH_PROTECTED_PORT
2838 + && (!cluster_is_quorate || in_transition()))
2839 + || (acks_expected > 0)) {
2846 + set_task_state(tsk, TASK_RUNNING);
2847 + remove_wait_queue(&socket_waitq, &wq);
2850 + if (quit_threads) {
2855 + if (signal_pending(current)) {
2857 + return -ERESTARTSYS;
2860 + /* Were we shut down in the meantime ? */
2861 + if (!atomic_read(&cnxman_running)) {
2868 + memset(&our_msg, 0, sizeof (our_msg));
2870 + /* Build the header */
2871 + header.tgtport = port;
2872 + header.srcport = srcport;
2873 + header.flags = msg->msg_flags;
2874 + header.cluster = cpu_to_le16(cluster_id);
2875 + header.srcid = us ? cpu_to_le32(us->node_id) : 0;
2876 + header.tgtid = caddr ? cpu_to_le32(nodeid) : 0;
2879 + header.seq = cpu_to_le16(cur_seq);
2882 + if (header.tgtid) {
2883 + struct cluster_node *remnode;
2885 + remnode = find_node_by_nodeid(nodeid);
2887 + header.ack = cpu_to_le16(remnode->last_seq_recv);
2891 + /* Set the MULTICAST flag on messages with no particular destination */
2892 + if (!msg->msg_namelen) {
2893 + header.flags |= MSG_MULTICAST;
2897 + /* Loopback shortcut */
2898 + if (nodeid == us->node_id && nodeid != 0) {
2901 + header.flags |= MSG_NOACK; /* Don't ack it! */
2903 + return send_to_user_port(NULL, &header, msg, vec, veclen, size);
2906 + /* Copy the existing kvecs into our array and add the header on at the
2908 + vectors[0].iov_base = &header;
2909 + vectors[0].iov_len = sizeof (header);
2910 + for (i = 0; i < veclen; i++) {
2911 + vectors[i + 1] = vec[i];
2915 + /* Work out how many ACKS are wanted - *don't* reset acks_expected to
2916 + * zero if no acks are required as an ACK-needed message may still be
2918 + if (!(msg->msg_flags & MSG_NOACK)) {
2919 + if (msg->msg_namelen)
2920 + acks_expected = 1; /* Unicast */
2922 + acks_expected = max(cluster_members - 1, 0);
2927 + ("Sending message - tgt=%d port %d required %d acks, seq=%d, flags=%x\n",
2928 + nodeid, header.port,
2929 + (msg->msg_flags & MSG_NOACK) ? 0 : acks_expected,
2930 + le16_to_cpu(header.seq), header.flags);
2932 + /* Don't include temp nodeids in the message itself */
2933 + if (header.tgtid < 0)
2936 + /* For non-member sends we use all the interfaces */
2937 + if ((nodeid < 0) || (flags & MSG_ALLINT)) {
2939 + result = send_to_all_ints(nodeid, &our_msg, vectors, veclen+1,
2940 + size, msg->msg_flags);
2943 + /* Send to only the current socket - resends will use the
2944 + * others if necessary */
2945 +		our_msg.msg_name = &current_interface->saddr;
2946 + our_msg.msg_namelen = current_interface->addr_len;
2949 + __send_and_save(current_interface, &our_msg,
2950 + vectors, veclen+1,
2951 + size + sizeof (header),
2952 + !(msg->msg_flags & MSG_NOACK));
2955 + /* Make a note in each nodes' structure that it has been sent a message
2956 + * so we can see which ones went astray */
2957 + if (!(flags & MSG_NOACK) && nodeid >= 0) {
2958 + if (msg->msg_namelen) {
2959 + struct cluster_node *node;
2961 + node = find_node_by_nodeid(le32_to_cpu(header.tgtid));
2963 + node->last_seq_sent = cur_seq;
2966 + struct cluster_node *node;
2967 + struct list_head *nodelist;
2969 + list_for_each(nodelist, &cluster_members_list) {
2971 + list_entry(nodelist, struct cluster_node,
2973 + if (node->state == NODESTATE_MEMBER) {
2974 + node->last_seq_sent = cur_seq;
2980 + /* if the client wants a broadcast message sending back to itself
2981 + then loop it back */
2982 + if (nodeid == 0 && (flags & MSG_BCASTSELF)) {
2983 + header.flags |= MSG_NOACK; /* Don't ack it! */
2985 + result = send_to_user_port(NULL, &header, msg, vec, veclen, size);
2988 + /* Save a copy of the message if we're expecting an ACK */
2989 + if (!(flags & MSG_NOACK) && acks_expected) {
2990 + struct cl_protheader *savhdr = (struct cl_protheader *) saved_msg_buffer;
2992 + memcpy_fromkvec(saved_msg_buffer, vectors,
2993 + size + sizeof (header));
2995 + saved_msg_len = size + sizeof (header);
2996 + retry_count = ack_count = 0;
2997 + clear_bit(RESEND_NEEDED, &mainloop_flags);
2999 + /* Clear the REPLYEXPected flag so we force a real ACK
3000 + if it's necessary to resend this packet */
3001 + savhdr->flags &= ~MSG_REPLYEXP;
3002 + start_ack_timer();
3009 +static int queue_message(struct socket *sock, void *buf, int len,
3010 + struct sockaddr_cl *caddr,
3011 + unsigned char port, int flags)
3013 + struct queued_message *qmsg;
3015 + qmsg = kmalloc(sizeof (struct queued_message),
3017 + || irqs_disabled())? GFP_ATOMIC : GFP_KERNEL);
3021 + memcpy(qmsg->msg_buffer, buf, len);
3022 + qmsg->msg_len = len;
3024 + memcpy(&qmsg->addr, caddr, sizeof (struct sockaddr_cl));
3025 + qmsg->addr_len = sizeof (struct sockaddr_cl);
3028 + qmsg->addr_len = 0;
3030 + qmsg->flags = flags;
3031 + qmsg->port = port;
3032 + qmsg->socket = sock;
3034 + down(&messages_list_lock);
3035 + list_add_tail(&qmsg->list, &messages_list);
3036 + up(&messages_list_lock);
3038 + wake_up_interruptible(&cnxman_waitq);
3043 +static int cl_sendmsg(struct kiocb *iocb, struct socket *sock,
3044 + struct msghdr *msg, size_t size)
3046 + struct cluster_sock *c = cluster_sk(sock->sk);
3051 + struct sockaddr_cl *caddr = msg->msg_name;
3053 + if (sock->sk->sk_protocol == CLPROTO_MASTER)
3054 + return -EOPNOTSUPP;
3058 + /* Only capable users can override the port number */
3059 + if (caddr && capable(CAP_CLUSTER) && caddr->scl_port)
3060 + port = caddr->scl_port;
3063 + return -EDESTADDRREQ;
3065 + /* Allocate a kernel buffer for the data so we can put it into a kvec */
3066 + buffer = kmalloc(size, GFP_KERNEL);
3070 + if (memcpy_fromiovec(buffer, msg->msg_iov, size)) {
3075 + vec.iov_len = size;
3076 + vec.iov_base = buffer;
3078 + status = __sendmsg(sock, msg, &vec, 1, size, port);
3086 +/* Kernel call to sendmsg */
3087 +int kcl_sendmsg(struct socket *sock, void *buf, int size,
3088 + struct sockaddr_cl *caddr, int addr_len, unsigned int flags)
3090 + struct kvec vecs[1];
3091 + struct msghdr msg;
3092 + struct cluster_sock *c = cluster_sk(sock->sk);
3093 + unsigned char port;
3095 + if (size > MAX_CLUSTER_MESSAGE)
3097 + if (!atomic_read(&cnxman_running))
3101 + if (caddr && caddr->scl_port)
3102 + port = caddr->scl_port;
3105 + return -EDESTADDRREQ;
3107 + /* If we have no process context then queue it up for kclusterd to
3109 + if (in_interrupt() || flags & MSG_QUEUE) {
3110 + return queue_message(sock, buf, size, caddr, port,
3111 + flags & ~MSG_QUEUE);
3114 + vecs[0].iov_base = buf;
3115 + vecs[0].iov_len = size;
3117 + memset(&msg, 0, sizeof (msg));
3118 + msg.msg_name = caddr;
3119 + msg.msg_namelen = addr_len;
3120 + msg.msg_flags = flags;
3122 + return __sendmsg(sock, &msg, vecs, 1, size, port);
3125 +static int send_queued_message(struct queued_message *qmsg)
3127 + struct kvec vecs[1];
3128 + struct msghdr msg;
3130 + /* Don't send blocked messages */
3131 + if (qmsg->port > HIGH_PROTECTED_PORT
3132 + && (!cluster_is_quorate || in_transition()))
3135 + vecs[0].iov_base = qmsg->msg_buffer;
3136 + vecs[0].iov_len = qmsg->msg_len;
3138 + memset(&msg, 0, sizeof (msg));
3139 + msg.msg_name = qmsg->addr_len ? &qmsg->addr : NULL;
3140 + msg.msg_namelen = qmsg->addr_len;
3141 + msg.msg_flags = qmsg->flags;
3143 + return __sendmsg(qmsg->socket, &msg, vecs, 1,
3144 + qmsg->msg_len, qmsg->port);
3147 +int kcl_register_read_callback(struct socket *sock,
3148 + int (*routine) (char *, int, char *, int,
3151 + struct cluster_sock *c = cluster_sk(sock->sk);
3153 + c->kernel_callback = routine;
3158 +/* Used where we are in kclusterd context and we can't allow the task to wait
3159 + * as we are also responsible to processing the ACKs that do the wake up. Try
3160 + * to send the message immediately and queue it if that's not possible */
3161 +static int send_or_queue_message(struct socket *sock, void *buf, int len,
3162 + struct sockaddr_cl *caddr,
3163 + unsigned int flags)
3165 + struct kvec vecs[1];
3166 + struct msghdr msg;
3169 + vecs[0].iov_base = buf;
3170 + vecs[0].iov_len = len;
3172 + memset(&msg, 0, sizeof (msg));
3173 + msg.msg_name = caddr;
3174 + msg.msg_namelen = caddr ? sizeof (struct sockaddr_cl) : 0;
3175 + msg.msg_flags = MSG_DONTWAIT | flags;
3177 + status = __sendmsg(NULL, &msg, vecs, 1, len, 0);
3179 + /* Did it work ? */
3184 + /* Failure other than EAGAIN is fatal */
3185 + if (status != -EAGAIN) {
3189 + return queue_message(sock, buf, len, caddr, 0, flags);
3192 +/* Send a listen request to a node */
3193 +static void send_listen_request(int nodeid, unsigned char port)
3195 + struct cl_listenmsg listenmsg;
3196 + struct sockaddr_cl caddr;
3198 + memset(&caddr, 0, sizeof (caddr));
3200 + /* Build the header */
3201 + listenmsg.cmd = CLUSTER_CMD_LISTENREQ;
3202 + listenmsg.target_port = port;
3203 + listenmsg.listening = 0;
3204 + listenmsg.tag = current->pid;
3206 + caddr.scl_family = AF_CLUSTER;
3207 + caddr.scl_port = 0;
3208 + caddr.scl_nodeid = nodeid;
3210 + send_or_queue_message(NULL, &listenmsg, sizeof(listenmsg), &caddr, MSG_REPLYEXP);
3214 +/* Return 1 or 0 to indicate if we have a listener on the requested port */
3215 +static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
3216 + unsigned char port, unsigned short tag)
3218 + struct cl_listenmsg listenmsg;
3219 + struct sockaddr_cl caddr;
3222 + memset(&caddr, 0, sizeof (caddr));
3224 + /* Build the message */
3225 + listenmsg.cmd = CLUSTER_CMD_LISTENRESP;
3226 + listenmsg.target_port = port;
3227 + listenmsg.tag = tag;
3228 + listenmsg.listening = (port_array[port] != 0) ? 1 : 0;
3230 + caddr.scl_family = AF_CLUSTER;
3231 + caddr.scl_port = 0;
3232 + caddr.scl_nodeid = nodeid;
3234 + status = send_or_queue_message(NULL, &listenmsg,
3235 + sizeof (listenmsg),
3242 +static int cl_sendack(struct cl_comms_socket *csock, unsigned short seq,
3243 + int addr_len, char *addr, unsigned char remport,
3244 + unsigned char flag)
3247 + struct cl_ackmsg ackmsg;
3248 + struct msghdr msg;
3249 + struct sockaddr_in6 daddr;
3253 + char buf[MAX_ADDR_PRINTED_LEN];
3255 + P_COMMS("Sending ACK to %s, seq=%d\n",
3256 + print_addr(addr, address_length, buf), le16_to_cpu(seq));
3260 + memcpy(&daddr, addr, addr_len);
3263 + memcpy(&daddr, &csock->saddr, csock->addr_len);
3264 + addr_len = csock->addr_len;
3267 + /* Build the header */
3268 + ackmsg.header.tgtport = 0; /* Protocol port */
3269 + ackmsg.header.srcport = 0;
3270 + ackmsg.header.seq = 0;
3271 + ackmsg.header.flags = MSG_NOACK;
3272 + ackmsg.header.cluster = cpu_to_le16(cluster_id);
3273 + ackmsg.header.srcid = us ? cpu_to_le32(us->node_id) : 0;
3274 + ackmsg.header.ack = seq; /* already in LE order */
3275 + ackmsg.header.tgtid = 0; /* ACKS are unicast so we don't bother
3276 + * to look this up */
3277 + ackmsg.cmd = CLUSTER_CMD_ACK;
3278 + ackmsg.remport = remport;
3279 + ackmsg.aflags = flag;
3280 + vec.iov_base = &ackmsg;
3281 + vec.iov_len = sizeof (ackmsg);
3283 + memset(&msg, 0, sizeof (msg));
3284 + msg.msg_name = &daddr;
3285 + msg.msg_namelen = addr_len;
3287 + result = kernel_sendmsg(csock->sock, &msg, &vec, 1, sizeof (ackmsg));
3290 + printk(KERN_CRIT CMAN_NAME ": error sending ACK: %d\n", result);
3296 +/* Wait for all ACKS to be gathered */
3297 +void kcl_wait_for_all_acks()
3299 + while (ack_count < acks_expected) {
3301 + DECLARE_WAITQUEUE(wq, current);
3302 + struct task_struct *tsk = current;
3304 + set_task_state(tsk, TASK_INTERRUPTIBLE);
3305 + add_wait_queue(&socket_waitq, &wq);
3307 + if (ack_count < acks_expected) {
3311 + set_task_state(tsk, TASK_RUNNING);
3312 + remove_wait_queue(&socket_waitq, &wq);
3316 +/* Send a closedown OOB message to all cluster nodes - this tells them that a
3317 + * port listener has gone away */
3318 +static void send_port_close_oob(unsigned char port)
3320 + struct cl_closemsg closemsg;
3322 + /* Build the header */
3323 + closemsg.cmd = CLUSTER_CMD_PORTCLOSED;
3324 + closemsg.port = port;
3326 + send_or_queue_message(NULL, &closemsg, sizeof (closemsg), NULL, 0);
3330 +/* A remote port has been closed - post an OOB message to the local listen on
3331 + * that port (if there is one) */
3332 +static void post_close_oob(unsigned char port, int nodeid)
3334 + struct cl_portclosed_oob *oobmsg;
3335 + struct sk_buff *skb;
3336 + struct sock *sock = port_array[port];
3337 + struct cb_info *cbinfo;
3340 + return; /* No-one listening */
3343 + skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
3347 + skb_put(skb, sizeof (*oobmsg));
3348 + oobmsg = (struct cl_portclosed_oob *) skb->data;
3349 + oobmsg->port = port;
3350 + oobmsg->cmd = CLUSTER_OOB_MSG_PORTCLOSED;
3352 + cbinfo = (struct cb_info *)skb->cb;
3354 + cbinfo->orig_nodeid = nodeid;
3355 + cbinfo->orig_port = port;
3357 + sock_queue_rcv_skb(sock, skb);
3361 +/* Leave the cluster */
3362 +static void node_shutdown()
3364 + struct cl_barrier *barrier;
3365 + struct list_head *blist;
3366 + struct list_head *temp;
3367 + struct list_head *socklist;
3368 + struct cl_client_socket *csock;
3369 + struct sk_buff *null_skb;
3371 + if (we_are_a_cluster_member)
3372 + printk(KERN_INFO CMAN_NAME ": we are leaving the cluster. %s\n",
3373 + us->leave_reason?leave_string(us->leave_reason):"");
3375 + atomic_set(&cnxman_running, 0);
3378 + /* Notify kernel listeners first */
3379 + notify_kernel_listeners(LEAVING, 0);
3381 + /* Notify client sockets */
3382 + down(&client_socket_lock);
3383 + list_for_each_safe(socklist, temp, &client_socket_list) {
3384 + csock = list_entry(socklist, struct cl_client_socket, list);
3386 + null_skb = alloc_skb(0, GFP_KERNEL);
3388 + sock_queue_rcv_skb(csock->sock->sk, null_skb);
3389 + list_del(&csock->list);
3392 + up(&client_socket_lock);
3393 + we_are_a_cluster_member = 0;
3394 + cluster_is_quorate = 0;
3398 + /* Wake up any processes waiting for barriers */
3399 + down(&barrier_list_lock);
3400 + list_for_each(blist, &barrier_list) {
3401 + barrier = list_entry(blist, struct cl_barrier, list);
3403 + /* Cancel any timers */
3404 + if (timer_pending(&barrier->timer))
3405 + del_timer(&barrier->timer);
3407 + /* Force it to be auto-delete so it discards itself */
3408 + if (barrier->state == BARRIER_STATE_WAITING) {
3409 + barrier->flags |= BARRIER_ATTR_AUTODELETE;
3410 + wake_up_interruptible(&barrier->waitq);
3413 + if (barrier->callback) {
3414 + barrier->callback(barrier->name, -ENOTCONN);
3415 + barrier->callback = NULL;
3419 + up(&barrier_list_lock);
3421 + /* Wake up any processes waiting for ISLISTENING requests */
3422 + down(&listenreq_lock);
3423 + list_for_each(blist, &listenreq_list) {
3424 + struct cl_waiting_listen_request *lrequest =
3425 + list_entry(blist, struct cl_waiting_listen_request, list);
3427 + if (lrequest->waiting)
3428 + wake_up_interruptible(&lrequest->waitq);
3430 + up(&listenreq_lock);
3433 +static void free_cluster_sockets()
3435 + struct list_head *socklist;
3436 + struct cl_comms_socket *sock;
3437 + struct list_head *temp;
3439 + list_for_each_safe(socklist, temp, &socket_list) {
3440 + sock = list_entry(socklist, struct cl_comms_socket, list);
3442 + list_del(&sock->list);
3446 + num_interfaces = 0;
3447 + current_interface = NULL;
3450 +/* Tidy up after all the rest of the cluster bits have shut down */
3451 +static void node_cleanup()
3453 + struct list_head *nodelist;
3454 + struct list_head *proclist;
3455 + struct list_head *temp;
3456 + struct list_head *socklist;
3457 + struct list_head *blist;
3458 + struct temp_node *tn;
3459 + struct temp_node *tmp;
3460 + struct cl_comms_socket *sock;
3461 + struct kernel_notify_struct *knotify;
3463 + /* Free list of kernel listeners */
3464 + list_for_each_safe(proclist, temp, &kernel_listener_list) {
3466 + list_entry(proclist, struct kernel_notify_struct, list);
3467 + list_del(&knotify->list);
3471 + /* Mark the sockets as busy so they don't get added to the active
3472 + * sockets list in the next few lines of code before we free them */
3473 + list_for_each_safe(socklist, temp, &socket_list) {
3474 + sock = list_entry(socklist, struct cl_comms_socket, list);
3476 + set_bit(1, &sock->active);
3479 + /* Tidy the active sockets list */
3480 + list_for_each_safe(socklist, temp, &active_socket_list) {
3482 + list_entry(socklist, struct cl_comms_socket, active_list);
3483 + list_del(&sock->active_list);
3486 + /* Free the memory allocated to cluster nodes */
3487 + free_nodeid_array();
3488 + down(&cluster_members_lock);
3490 + list_for_each_safe(nodelist, temp, &cluster_members_list) {
3492 + struct list_head *addrlist;
3493 + struct list_head *addrtemp;
3494 + struct cluster_node *node;
3495 + struct cluster_node_addr *nodeaddr;
3497 + node = list_entry(nodelist, struct cluster_node, list);
3499 + list_for_each_safe(addrlist, addrtemp, &node->addr_list) {
3501 + list_entry(addrlist, struct cluster_node_addr,
3504 + list_del(&nodeaddr->list);
3507 + list_del(&node->list);
3508 + kfree(node->name);
3511 + cluster_members = 0;
3512 + up(&cluster_members_lock);
3514 + /* Clean the temp node IDs list. */
3515 + down(&tempnode_lock);
3516 + list_for_each_entry_safe(tn, tmp, &tempnode_list, list) {
3517 + list_del(&tn->list);
3520 + up(&tempnode_lock);
3522 + /* Free the memory allocated to the outgoing sockets */
3523 + free_cluster_sockets();
3525 + /* Make sure that all the barriers are deleted */
3526 + down(&barrier_list_lock);
3527 + list_for_each_safe(blist, temp, &barrier_list) {
3528 + struct cl_barrier *barrier =
3529 + list_entry(blist, struct cl_barrier, list);
3531 + list_del(&barrier->list);
3534 + up(&barrier_list_lock);
3537 + clear_bit(RESEND_NEEDED, &mainloop_flags);
3538 + acks_expected = 0;
3539 + wanted_nodeid = 0;
3542 +/* If "cluster_is_quorate" is 0 then all activity apart from protected ports is
3544 +void set_quorate(int total_votes)
3548 + if (get_quorum() > total_votes) {
3555 + /* Hide messages during startup state transition */
3556 + if (we_are_a_cluster_member) {
3557 + if (cluster_is_quorate && !quorate)
3558 + printk(KERN_CRIT CMAN_NAME
3559 + ": quorum lost, blocking activity\n");
3560 + if (!cluster_is_quorate && quorate)
3561 + printk(KERN_CRIT CMAN_NAME
3562 + ": quorum regained, resuming activity\n");
3564 + cluster_is_quorate = quorate;
3566 + /* Wake up any sleeping processes */
3567 + if (cluster_is_quorate) {
3573 +void queue_oob_skb(struct socket *sock, int cmd)
3575 + struct sk_buff *skb;
3576 + struct cb_info *cbinfo;
3577 + struct cl_portclosed_oob *oobmsg;
3579 + skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
3583 + skb_put(skb, sizeof (*oobmsg));
3584 + oobmsg = (struct cl_portclosed_oob *) skb->data;
3586 + oobmsg->cmd = cmd;
3588 + /* There is no remote node associated with this so
3589 + clear out the field to avoid any accidents */
3590 + cbinfo = (struct cb_info *)skb->cb;
3592 + cbinfo->orig_nodeid = 0;
3593 + cbinfo->orig_port = 0;
3595 + sock_queue_rcv_skb(sock->sk, skb);
3598 +/* Notify interested parties that the cluster configuration has changed */
3599 +void notify_listeners()
3601 + struct notify_struct *notify;
3602 + struct list_head *proclist;
3603 + struct list_head *socklist;
3604 + struct list_head *temp;
3606 + /* Do kernel listeners first */
3607 + notify_kernel_listeners(CLUSTER_RECONFIG, 0);
3609 + /* Now we deign to tell userspace */
3610 + down(&event_listener_lock);
3611 + list_for_each_safe(proclist, temp, &event_listener_list) {
3612 + notify = list_entry(proclist, struct notify_struct, list);
3614 + /* If the kill fails then remove the process from the list */
3615 + if (kill_proc(notify->pid, notify->signal, 0) == -ESRCH) {
3616 +			list_del(&notify->list);
3620 + up(&event_listener_lock);
3622 + /* Tell userspace processes which want OOB messages */
3623 + down(&client_socket_lock);
3624 + list_for_each(socklist, &client_socket_list) {
3625 + struct cl_client_socket *csock;
3626 + csock = list_entry(socklist, struct cl_client_socket, list);
3627 + queue_oob_skb(csock->sock, CLUSTER_OOB_MSG_STATECHANGE);
3629 + up(&client_socket_lock);
3632 +/* This fills in the list of all addresses for the local node */
3633 +void get_local_addresses(struct cluster_node *node)
3635 + struct list_head *socklist;
3636 + struct cl_comms_socket *sock;
3638 + list_for_each(socklist, &socket_list) {
3639 + sock = list_entry(socklist, struct cl_comms_socket, list);
3641 + if (sock->recv_only) {
3642 + add_node_address(node, (char *) &sock->saddr, address_length);
3648 +static uint16_t generate_cluster_id(char *name)
3653 + for (i=0; i<strlen(name); i++) {
3657 + return value & 0xFFFF;
3660 +/* Return the next comms socket we can use. */
3661 +static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur)
3664 + struct list_head *socklist;
3666 + /* Fast path for single interface systems */
3667 + if (num_interfaces <= 1)
3671 + next = cur->number + 1;
3672 + if (next > num_interfaces)
3675 + /* Find the socket with this number, I could optimise this by starting
3676 + * at the current i/f but most systems are going to have a small number
3677 + * of them anyway */
3678 + list_for_each(socklist, &socket_list) {
3679 + struct cl_comms_socket *sock;
3680 + sock = list_entry(socklist, struct cl_comms_socket, list);
3682 + if (!sock->recv_only && sock->number == next)
3690 +/* MUST be called with the barrier list lock held */
3691 +static struct cl_barrier *find_barrier(char *name)
3693 + struct list_head *blist;
3694 + struct cl_barrier *bar;
3696 + list_for_each(blist, &barrier_list) {
3697 + bar = list_entry(blist, struct cl_barrier, list);
3699 + if (strcmp(name, bar->name) == 0)
3705 +/* Do the stuff we need to do when the barrier has completed phase 1 */
3706 +static void check_barrier_complete_phase1(struct cl_barrier *barrier)
3708 + if (atomic_read(&barrier->got_nodes) == ((barrier->expected_nodes != 0)
3709 + ? barrier->expected_nodes :
3710 + cluster_members)) {
3712 + struct cl_barriermsg bmsg;
3714 + atomic_inc(&barrier->completed_nodes); /* We have completed */
3715 + barrier->phase = 2; /* Wait for complete phase II */
3717 + /* Send completion message, remember: we are in cnxman context
3718 + * and must not block */
3719 + bmsg.cmd = CLUSTER_CMD_BARRIER;
3720 + bmsg.subcmd = BARRIER_COMPLETE;
3722 + strcpy(bmsg.name, barrier->name);
3724 + P_BARRIER("Sending COMPLETE for %s\n", barrier->name);
3725 + queue_message(NULL, (char *) &bmsg, sizeof (bmsg), NULL, 0, 0);
3729 +/* Do the stuff we need to do when the barrier has been reached */
3730 +/* Return 1 if we deleted the barrier */
3731 +static int check_barrier_complete_phase2(struct cl_barrier *barrier, int status)
3733 + spin_lock_irq(&barrier->phase2_spinlock);
3735 + if (barrier->state != BARRIER_STATE_COMPLETE &&
3736 + (status == -ETIMEDOUT ||
3737 + atomic_read(&barrier->completed_nodes) ==
3738 + ((barrier->expected_nodes != 0)
3739 + ? barrier->expected_nodes : cluster_members))) {
3741 + if (status == 0 && barrier->timeout)
3742 + del_timer(&barrier->timer);
3743 + barrier->endreason = status;
3745 + /* Wake up listener */
3746 + if (barrier->state == BARRIER_STATE_WAITING) {
3747 + wake_up_interruptible(&barrier->waitq);
3750 + /* Additional tasks we have to do if the user was not
3752 + /* Call the callback */
3753 + if (barrier->callback) {
3754 + barrier->callback(barrier->name, 0);
3755 + barrier->callback = NULL;
3757 + /* Remove it if it's AUTO-DELETE */
3758 + if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
3759 + list_del(&barrier->list);
3760 + spin_unlock_irq(&barrier->phase2_spinlock);
3765 + barrier->state = BARRIER_STATE_COMPLETE;
3767 + spin_unlock_irq(&barrier->phase2_spinlock);
3771 +/* Called if a barrier timeout happens */
3772 +static void barrier_timer_fn(unsigned long arg)
3774 + struct cl_barrier *barrier = (struct cl_barrier *) arg;
3776 + /* Ignore any further messages, they are too late. */
3777 + barrier->phase = 0;
3779 + /* and cause it to timeout */
3780 + check_barrier_complete_phase2(barrier, -ETIMEDOUT);
3783 +/* Process BARRIER messages from other nodes */
3784 +static void process_barrier_msg(struct cl_barriermsg *msg,
3785 + struct cluster_node *node)
3787 + struct cl_barrier *barrier;
3789 + down(&barrier_list_lock);
3790 + barrier = find_barrier(msg->name);
3791 + up(&barrier_list_lock);
3793 + /* Ignore other peoples messages, in_transition() is needed here so
3794 + * that joining nodes will see their barrier messages before the
3795 + * we_are_a_cluster_member is set */
3796 + if (!we_are_a_cluster_member && !in_transition())
3801 + P_BARRIER("Got %d for %s, from node %s\n", msg->subcmd, msg->name,
3802 + node ? node->name : "unknown");
3804 + switch (msg->subcmd) {
3805 + case BARRIER_WAIT:
3806 + down(&barrier->lock);
3807 + if (barrier->phase == 0)
3808 + barrier->phase = 1;
3810 + if (barrier->phase == 1) {
3811 + atomic_inc(&barrier->got_nodes);
3812 + check_barrier_complete_phase1(barrier);
3815 + printk(KERN_WARNING CMAN_NAME
3816 + ": got WAIT barrier not in phase 1 %s (%d)\n",
3817 + msg->name, barrier->phase);
3820 + up(&barrier->lock);
3823 + case BARRIER_COMPLETE:
3824 + down(&barrier->lock);
3825 + atomic_inc(&barrier->completed_nodes);
3827 + /* First node to get all the WAIT messages sends COMPLETE, so
3828 + * we all complete */
3829 + if (barrier->phase == 1) {
3830 + atomic_set(&barrier->got_nodes,
3831 + barrier->expected_nodes);
3832 + check_barrier_complete_phase1(barrier);
3835 + if (barrier->phase == 2) {
3836 + /* If it was deleted (ret==1) then no need to unlock
3838 + if (check_barrier_complete_phase2(barrier, 0) == 1)
3841 + up(&barrier->lock);
3846 +/* In-kernel membership API */
3847 +int kcl_add_callback(void (*callback) (kcl_callback_reason, long arg))
3849 + struct kernel_notify_struct *notify;
3851 + notify = kmalloc(sizeof (struct kernel_notify_struct), GFP_KERNEL);
3854 + notify->callback = callback;
3856 + down(&kernel_listener_lock);
3857 + list_add(&notify->list, &kernel_listener_list);
3858 + up(&kernel_listener_lock);
3863 +int kcl_remove_callback(void (*callback) (kcl_callback_reason, long arg))
3865 + struct list_head *calllist;
3866 + struct list_head *temp;
3867 + struct kernel_notify_struct *notify;
3869 + down(&kernel_listener_lock);
3870 + list_for_each_safe(calllist, temp, &kernel_listener_list) {
3871 + notify = list_entry(calllist, struct kernel_notify_struct, list);
3872 + if (notify->callback == callback){
3873 + list_del(&notify->list);
3875 + up(&kernel_listener_lock);
3879 + up(&kernel_listener_lock);
3883 +/* Return quorate status */
3884 +int kcl_is_quorate()
3886 + return cluster_is_quorate;
3889 +/* Return the address list for a node */
3890 +struct list_head *kcl_get_node_addresses(int nodeid)
3892 + struct cluster_node *node = find_node_by_nodeid(nodeid);
3895 + return &node->addr_list;
3900 +static void copy_to_kclnode(struct cluster_node *node,
3901 + struct kcl_cluster_node *knode)
3903 + strcpy(knode->name, node->name);
3904 + knode->size = sizeof (struct kcl_cluster_node);
3905 + knode->votes = node->votes;
3906 + knode->state = node->state;
3907 + knode->node_id = node->node_id;
3908 + knode->us = node->us;
3909 + knode->leave_reason = node->leave_reason;
3910 + knode->incarnation = node->incarnation;
3913 +/* Return the info for a node given its address. If addr is NULL then return
3915 +int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
3916 + struct kcl_cluster_node *n)
3918 + struct cluster_node *node;
3920 + /* They want us */
3921 + if (addr == NULL) {
3925 + node = find_node_by_addr(addr, addr_len);
3930 + /* Copy to user's buffer */
3931 + copy_to_kclnode(node, n);
3935 +int kcl_get_node_by_name(unsigned char *name, struct kcl_cluster_node *n)
3937 + struct cluster_node *node;
3939 + /* They want us */
3940 + if (name == NULL) {
3946 + node = find_node_by_name(name);
3951 + /* Copy to user's buffer */
3952 + copy_to_kclnode(node, n);
3956 +/* As above but by node id. MUCH faster */
3957 +int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n)
3959 + struct cluster_node *node;
3961 + /* They want us */
3962 + if (nodeid == 0) {
3968 + node = find_node_by_nodeid(nodeid);
3973 + /* Copy to user's buffer */
3974 + copy_to_kclnode(node, n);
3978 +/* Return a list of all cluster members ever */
3979 +int kcl_get_all_members(struct list_head *list)
3981 + struct list_head *nodelist;
3982 + struct cluster_node *node;
3983 + struct kcl_cluster_node *newnode;
3984 + int num_nodes = 0;
3986 + down(&cluster_members_lock);
3987 + list_for_each(nodelist, &cluster_members_list) {
3989 + node = list_entry(nodelist, struct cluster_node, list);
3991 + kmalloc(sizeof (struct kcl_cluster_node),
3994 + copy_to_kclnode(node, newnode);
3995 + list_add(&newnode->list, list);
4003 + up(&cluster_members_lock);
4008 +/* Return a list of cluster members */
4009 +int kcl_get_members(struct list_head *list)
4011 + struct list_head *nodelist;
4012 + struct cluster_node *node;
4013 + struct kcl_cluster_node *newnode;
4014 + int num_nodes = 0;
4016 + down(&cluster_members_lock);
4017 + list_for_each(nodelist, &cluster_members_list) {
4018 + node = list_entry(nodelist, struct cluster_node, list);
4020 + if (node->state == NODESTATE_MEMBER) {
4023 + kmalloc(sizeof (struct kcl_cluster_node),
4026 + copy_to_kclnode(node, newnode);
4027 + list_add(&newnode->list, list);
4036 + up(&cluster_members_lock);
4041 +/* Copy current member's nodeids into buffer */
4042 +int kcl_get_member_ids(uint32_t *idbuf, int size)
4044 + struct list_head *nodelist;
4045 + struct cluster_node *node;
4046 + int num_nodes = 0;
4048 + down(&cluster_members_lock);
4049 + list_for_each(nodelist, &cluster_members_list) {
4050 + node = list_entry(nodelist, struct cluster_node, list);
4052 + if (node->state == NODESTATE_MEMBER) {
4053 + if (idbuf && size) {
4054 + idbuf[num_nodes] = node->node_id;
4063 + up(&cluster_members_lock);
4069 +int kcl_barrier_register(char *name, unsigned int flags, unsigned int nodes)
4071 + struct cl_barrier *barrier;
4073 + /* We are not joined to a cluster */
4074 + if (!we_are_a_cluster_member)
4077 + /* Must have a valid name */
4078 + if (name == NULL || strlen(name) > MAX_BARRIER_NAME_LEN - 1)
4081 + /* We don't do this yet */
4082 + if (flags & BARRIER_ATTR_MULTISTEP)
4085 + down(&barrier_list_lock);
4087 + /* See if it already exists */
4088 + if ((barrier = find_barrier(name))) {
4089 + up(&barrier_list_lock);
4090 + if (nodes != barrier->expected_nodes) {
4091 + printk(KERN_WARNING CMAN_NAME
4092 + ": Barrier registration failed for '%s', expected nodes=%d, requested=%d\n",
4093 + name, barrier->expected_nodes, nodes);
4094 + up(&barrier_list_lock);
4101 + /* Build a new struct and add it to the list */
4102 + barrier = kmalloc(sizeof (struct cl_barrier), GFP_KERNEL);
4103 + if (barrier == NULL) {
4104 + up(&barrier_list_lock);
4107 + memset(barrier, 0, sizeof (*barrier));
4109 + strcpy(barrier->name, name);
4110 + barrier->flags = flags;
4111 + barrier->expected_nodes = nodes;
4112 + atomic_set(&barrier->got_nodes, 0);
4113 + atomic_set(&barrier->completed_nodes, 0);
4114 + barrier->endreason = 0;
4115 + barrier->registered_nodes = 1;
4116 + spin_lock_init(&barrier->phase2_spinlock);
4117 + barrier->state = BARRIER_STATE_INACTIVE;
4118 + init_MUTEX(&barrier->lock);
4120 + list_add(&barrier->list, &barrier_list);
4121 + up(&barrier_list_lock);
4126 +static int barrier_setattr_enabled(struct cl_barrier *barrier,
4127 + unsigned int attr, unsigned long arg)
4131 + /* Can't disable a barrier */
4133 + up(&barrier->lock);
4137 + /* We need to send WAIT now because the user may not
4138 + * actually call kcl_barrier_wait() */
4139 + if (!barrier->waitsent) {
4140 + struct cl_barriermsg bmsg;
4142 + /* Send it to the rest of the cluster */
4143 + bmsg.cmd = CLUSTER_CMD_BARRIER;
4144 + bmsg.subcmd = BARRIER_WAIT;
4145 + strcpy(bmsg.name, barrier->name);
4147 + barrier->waitsent = 1;
4148 + barrier->phase = 1;
4150 + atomic_inc(&barrier->got_nodes);
4152 + /* Start the timer if one was wanted */
4153 + if (barrier->timeout) {
4154 + init_timer(&barrier->timer);
4155 + barrier->timer.function = barrier_timer_fn;
4156 + barrier->timer.data = (long) barrier;
4157 + mod_timer(&barrier->timer, jiffies + (barrier->timeout * HZ));
4160 + /* Barrier WAIT and COMPLETE messages are
4161 + * always queued - that way they always get
4162 + * sent out in the right order. If we don't do
4163 + * this then one can get sent out in the
4164 + * context of the user process and the other in
4165 + * cnxman and COMPLETE may /just/ slide in
4166 + * before WAIT if its in the queue
4168 + P_BARRIER("Sending WAIT for %s\n", barrier->name);
4169 + status = queue_message(NULL, &bmsg, sizeof (bmsg), NULL, 0, 0);
4171 + up(&barrier->lock);
4175 + /* It might have been reached now */
4177 + && barrier->state != BARRIER_STATE_COMPLETE
4178 + && barrier->phase == 1)
4179 + check_barrier_complete_phase1(barrier);
4181 + if (barrier && barrier->state == BARRIER_STATE_COMPLETE) {
4182 + up(&barrier->lock);
4183 + return barrier->endreason;
4185 + up(&barrier->lock);
4186 + return 0; /* Nothing to propagate */
4189 +int kcl_barrier_setattr(char *name, unsigned int attr, unsigned long arg)
4191 + struct cl_barrier *barrier;
4193 + /* See if it already exists */
4194 + down(&barrier_list_lock);
4195 + if (!(barrier = find_barrier(name))) {
4196 + up(&barrier_list_lock);
4199 + up(&barrier_list_lock);
4201 + down(&barrier->lock);
4202 + if (barrier->state == BARRIER_STATE_COMPLETE) {
4203 + up(&barrier->lock);
4208 + case BARRIER_SETATTR_AUTODELETE:
4210 + barrier->flags |= BARRIER_ATTR_AUTODELETE;
4212 + barrier->flags &= ~BARRIER_ATTR_AUTODELETE;
4213 + up(&barrier->lock);
4217 + case BARRIER_SETATTR_TIMEOUT:
4218 + /* Can only change the timeout of an inactive barrier */
4219 + if (barrier->state == BARRIER_STATE_WAITING
4220 + || barrier->waitsent) {
4221 + up(&barrier->lock);
4224 + barrier->timeout = arg;
4225 + up(&barrier->lock);
4228 + case BARRIER_SETATTR_MULTISTEP:
4229 + up(&barrier->lock);
4232 + case BARRIER_SETATTR_ENABLED:
4233 + return barrier_setattr_enabled(barrier, attr, arg);
4235 + case BARRIER_SETATTR_NODES:
4236 + /* Can only change the expected node count of an inactive
4238 + if (barrier->state == BARRIER_STATE_WAITING
4239 + || barrier->waitsent)
4241 + barrier->expected_nodes = arg;
4244 + case BARRIER_SETATTR_CALLBACK:
4245 + if (barrier->state == BARRIER_STATE_WAITING
4246 + || barrier->waitsent)
4248 + barrier->callback = (void (*)(char *, int)) arg;
4249 + up(&barrier->lock);
4250 + return 0; /* Don't propagate this to other nodes */
4253 + up(&barrier->lock);
4257 +int kcl_barrier_delete(char *name)
4259 + struct cl_barrier *barrier;
4261 + down(&barrier_list_lock);
4262 + /* See if it exists */
4263 + if (!(barrier = find_barrier(name))) {
4264 + up(&barrier_list_lock);
4269 + list_del(&barrier->list);
4272 + up(&barrier_list_lock);
4277 +int kcl_barrier_cancel(char *name)
4279 + struct cl_barrier *barrier;
4281 + /* See if it exists */
4282 + down(&barrier_list_lock);
4283 + if (!(barrier = find_barrier(name))) {
4284 + up(&barrier_list_lock);
4287 + down(&barrier->lock);
4289 + barrier->endreason = -ENOTCONN;
4291 + if (barrier->callback) {
4292 + barrier->callback(barrier->name, -ECONNRESET);
4293 + barrier->callback = NULL;
4296 + if (barrier->timeout)
4297 + del_timer(&barrier->timer);
4299 + /* Remove it if it's AUTO-DELETE */
4300 + if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
4301 + list_del(&barrier->list);
4302 + up(&barrier->lock);
4304 + up(&barrier_list_lock);
4308 + if (barrier->state == BARRIER_STATE_WAITING)
4309 + wake_up_interruptible(&barrier->waitq);
4311 + up(&barrier->lock);
4312 + up(&barrier_list_lock);
4316 +int kcl_barrier_wait(char *name)
4318 + struct cl_barrier *barrier;
4321 + if (!atomic_read(&cnxman_running))
4325 + kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, 1L);
4327 + down(&barrier_list_lock);
4329 + /* See if it still exists - enable may have deleted it! */
4330 + if (!(barrier = find_barrier(name))) {
4331 + up(&barrier_list_lock);
4335 + down(&barrier->lock);
4337 + up(&barrier_list_lock);
4339 + /* If it has already completed then return the status */
4340 + if (barrier->state == BARRIER_STATE_COMPLETE) {
4341 + up(&barrier->lock);
4342 + return barrier->endreason;
4345 + barrier->state = BARRIER_STATE_WAITING;
4347 + /* Have we all reached the barrier? */
4348 + while (atomic_read(&barrier->completed_nodes) !=
4349 + ((barrier->expected_nodes == 0)
4350 + ? cluster_members : barrier->expected_nodes)
4351 + && barrier->endreason == 0) {
4355 + init_waitqueue_entry(&wq, current);
4356 + init_waitqueue_head(&barrier->waitq);
4358 + /* Wait for em all */
4359 + set_task_state(current, TASK_INTERRUPTIBLE);
4360 + add_wait_queue(&barrier->waitq, &wq);
4362 + if (atomic_read(&barrier->completed_nodes) !=
4363 + ((barrier->expected_nodes ==
4364 + 0) ? cluster_members : barrier->expected_nodes)
4365 + && barrier->endreason == 0) {
4366 + up(&barrier->lock);
4368 + down(&barrier->lock);
4371 + remove_wait_queue(&barrier->waitq, &wq);
4372 + set_task_state(current, TASK_RUNNING);
4374 + if (signal_pending(current)) {
4375 + barrier->endreason = -EINTR;
4379 + barrier->state = BARRIER_STATE_INACTIVE;
4381 + if (barrier->timeout)
4382 + del_timer(&barrier->timer);
4384 + /* Barrier has been reached on all nodes, call the callback */
4385 + if (barrier->callback) {
4386 + barrier->callback(barrier->name, barrier->endreason);
4387 + barrier->callback = NULL;
4390 + atomic_set(&barrier->got_nodes, 0);
4392 + /* Return the reason we were woken */
4393 + ret = barrier->endreason;
4395 + /* Remove it if it's AUTO-DELETE */
4396 + if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
4397 + down(&barrier_list_lock);
4398 + list_del(&barrier->list);
4399 + up(&barrier_list_lock);
4400 + up(&barrier->lock);
4404 + up(&barrier->lock);
4407 + /* We were woken up because the node left the cluster ? */
4408 + if (!atomic_read(&cnxman_running))
4414 +/* This is called from membership services when a node has left the cluster -
4415 + * we signal all waiting barriers with -ESRCH so they know to do something
4416 + * else, if the number of nodes is left at 0 then we compare the new number of
4417 + * nodes in the cluster with that at the barrier and return 0 (success) in that
4419 +void check_barrier_returns()
4421 + struct list_head *blist;
4422 + struct list_head *llist;
4423 + struct cl_barrier *barrier;
4426 + down(&barrier_list_lock);
4427 + list_for_each(blist, &barrier_list) {
4428 + barrier = list_entry(blist, struct cl_barrier, list);
4430 + if (barrier->waitsent) {
4433 + /* Check for a dynamic member barrier */
4434 + if (barrier->expected_nodes == 0) {
4435 + if (barrier->registered_nodes ==
4436 + cluster_members) {
4446 + /* Do we need to tell the barrier? */
4448 + if (barrier->state == BARRIER_STATE_WAITING) {
4449 + barrier->endreason = status;
4450 + wake_up_interruptible(&barrier->waitq);
4453 + if (barrier->callback) {
4454 + barrier->callback(barrier->name,
4461 + up(&barrier_list_lock);
4463 + /* Part 2 check for outstanding listen requests for dead nodes and
4465 + down(&listenreq_lock);
4466 + list_for_each(llist, &listenreq_list) {
4467 + struct cl_waiting_listen_request *lrequest =
4468 + list_entry(llist, struct cl_waiting_listen_request, list);
4469 + struct cluster_node *node =
4470 + find_node_by_nodeid(lrequest->nodeid);
4472 + if (node && node->state != NODESTATE_MEMBER) {
4473 + lrequest->result = -ENOTCONN;
4474 + lrequest->waiting = 0;
4475 + wake_up_interruptible(&lrequest->waitq);
4478 + up(&listenreq_lock);
4481 +int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen)
4483 + struct temp_node *tn;
4484 + int err = 1; /* true */
4486 + char buf[MAX_ADDR_PRINTED_LEN];
4489 + down(&tempnode_lock);
4491 + list_for_each_entry(tn, &tempnode_list, list) {
4492 + if (tn->nodeid == nodeid) {
4493 + memcpy(addr, tn->addr, tn->addrlen);
4494 + *addrlen = tn->addrlen;
4495 + P_COMMS("get_temp_nodeid. id %d:\n: %s\n",
4496 + tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
4504 + up(&tempnode_lock);
4508 +/* Create a new temporary node ID. This list will only ever be very small
4509 + (usually only 1 item) but I can't take the risk that someone won't try to
4510 + boot 128 nodes all at exactly the same time. */
4511 +int new_temp_nodeid(char *addr, int addrlen)
4513 + struct temp_node *tn;
4515 + int try_nodeid = 0;
4517 + char buf[MAX_ADDR_PRINTED_LEN];
4520 + P_COMMS("new_temp_nodeid needed for\n: %s\n",
4521 + print_addr(addr, addrlen, buf));
4523 + down(&tempnode_lock);
4525 + /* First see if we already know about this node */
4526 + list_for_each_entry(tn, &tempnode_list, list) {
4528 + P_COMMS("new_temp_nodeid list. id %d:\n: %s\n",
4529 + tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
4531 + /* We're already in here... */
4532 + if (tn->addrlen == addrlen &&
4533 + memcmp(tn->addr, addr, addrlen) == 0) {
4534 + P_COMMS("reused temp node ID %d\n", tn->nodeid);
4540 + /* Nope, OK, invent a suitable number */
4543 + list_for_each_entry(tn, &tempnode_list, list) {
4545 + if (tn->nodeid == try_nodeid)
4549 + tn = kmalloc(sizeof(struct temp_node), GFP_KERNEL);
4553 + memcpy(tn->addr, addr, addrlen);
4554 + tn->addrlen = addrlen;
4555 + tn->nodeid = try_nodeid;
4556 + list_add_tail(&tn->list, &tempnode_list);
4558 + P_COMMS("new temp nodeid = %d\n", try_nodeid);
4560 + up(&tempnode_lock);
4564 +static int is_valid_temp_nodeid(int nodeid)
4566 + struct temp_node *tn;
4567 + int err = 1; /* true */
4569 + down(&tempnode_lock);
4571 + list_for_each_entry(tn, &tempnode_list, list) {
4572 + if (tn->nodeid == nodeid)
4578 + P_COMMS("is_valid_temp_nodeid. %d = %d\n", nodeid, err);
4579 + up(&tempnode_lock);
4584 + * Remove any temp nodeIDs that refer to now-valid cluster members.
4586 +void purge_temp_nodeids()
4588 + struct temp_node *tn;
4589 + struct temp_node *tmp;
4590 + struct cluster_node *node;
4591 + struct cluster_node_addr *nodeaddr;
4594 + down(&tempnode_lock);
4595 + down(&cluster_members_lock);
4598 + * The ordering of these nested lists is deliberately
4599 + * arranged for the fewest list traversals overall
4602 + /* For each node... */
4603 + list_for_each_entry(node, &cluster_members_list, list) {
4604 + if (node->state == NODESTATE_MEMBER) {
4605 + /* ...We check the temp node ID list... */
4606 + list_for_each_entry_safe(tn, tmp, &tempnode_list, list) {
4608 + /* ...against that node's address */
4609 + list_for_each_entry(nodeaddr, &node->addr_list, list) {
4611 + if (memcmp(nodeaddr->addr, tn->addr, tn->addrlen) == 0) {
4612 + list_del(&tn->list);
4619 + up(&cluster_members_lock);
4620 + up(&tempnode_lock);
4624 +/* Quorum device functions */
4625 +int kcl_register_quorum_device(char *name, int votes)
4627 + if (quorum_device)
4630 + if (find_node_by_name(name))
4633 + quorum_device = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
4634 + if (!quorum_device)
4636 + memset(quorum_device, 0, sizeof (struct cluster_node));
4638 + quorum_device->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
4639 + if (!quorum_device->name) {
4640 + kfree(quorum_device);
4641 + quorum_device = NULL;
4645 + strcpy(quorum_device->name, name);
4646 + quorum_device->votes = votes;
4647 + quorum_device->state = NODESTATE_DEAD;
4649 + /* Keep this list valid so it doesn't confuse other code */
4650 + INIT_LIST_HEAD(&quorum_device->addr_list);
4655 +int kcl_unregister_quorum_device(void)
4657 + if (!quorum_device)
4659 + if (quorum_device->state == NODESTATE_MEMBER)
4662 + quorum_device = NULL;
4667 +int kcl_quorum_device_available(int yesno)
4669 + if (!quorum_device)
4673 + quorum_device->last_hello = jiffies;
4674 + if (quorum_device->state == NODESTATE_DEAD) {
4675 + quorum_device->state = NODESTATE_MEMBER;
4676 + recalculate_quorum(0);
4680 + if (quorum_device->state == NODESTATE_MEMBER) {
4681 + quorum_device->state = NODESTATE_DEAD;
4682 + recalculate_quorum(0);
4689 +/* APIs for cluster ref counting. */
4690 +int kcl_addref_cluster()
4692 + int ret = -ENOTCONN;
4694 + if (!atomic_read(&cnxman_running))
4697 + if (try_module_get(THIS_MODULE)) {
4698 + atomic_inc(&use_count);
4706 +int kcl_releaseref_cluster()
4708 + if (!atomic_read(&cnxman_running))
4710 + atomic_dec(&use_count);
4711 + module_put(THIS_MODULE);
4715 +int kcl_cluster_name(char **cname)
4719 + name = kmalloc(strlen(cluster_name) + 1, GFP_KERNEL);
4723 + strncpy(name, cluster_name, strlen(cluster_name)+1);
4728 +int kcl_get_current_interface(void)
4730 + return current_interface->number;
4733 +/* Socket registration stuff */
4734 +static struct net_proto_family cl_family_ops = {
4735 + .family = AF_CLUSTER,
4736 + .create = cl_create,
4737 + .owner = THIS_MODULE,
4740 +static struct proto_ops cl_proto_ops = {
4741 + .family = AF_CLUSTER,
4743 + .release = cl_release,
4745 + .connect = sock_no_connect,
4746 + .socketpair = sock_no_socketpair,
4747 + .accept = sock_no_accept,
4748 + .getname = cl_getname,
4750 + .ioctl = cl_ioctl,
4751 + .listen = sock_no_listen,
4752 + .shutdown = cl_shutdown,
4753 + .setsockopt = sock_no_setsockopt,
4754 + .getsockopt = sock_no_getsockopt,
4755 + .sendmsg = cl_sendmsg,
4756 + .recvmsg = cl_recvmsg,
4757 + .mmap = sock_no_mmap,
4758 + .sendpage = sock_no_sendpage,
4759 + .owner = THIS_MODULE,
4763 +MODULE_DESCRIPTION("Cluster Connection and Service Manager");
4764 +MODULE_AUTHOR("Red Hat, Inc");
4765 +MODULE_LICENSE("GPL");
4768 +static int __init cluster_init(void)
4770 + printk("CMAN %s (built %s %s) installed\n",
4771 + CMAN_RELEASE_NAME, __DATE__, __TIME__);
4773 + if (sock_register(&cl_family_ops)) {
4774 + printk(KERN_INFO "Unable to register cluster socket type\n");
4778 + /* allocate our sock slab cache */
4779 + cluster_sk_cachep = kmem_cache_create("cluster_sock",
4780 + sizeof (struct cluster_sock), 0,
4781 + SLAB_HWCACHE_ALIGN, 0, 0);
4782 + if (!cluster_sk_cachep) {
4784 + "cluster_init: Cannot create cluster_sock SLAB cache\n");
4785 + sock_unregister(AF_CLUSTER);
4789 +#ifdef CONFIG_PROC_FS
4790 + create_proc_entries();
4793 + init_MUTEX(&start_thread_sem);
4794 + init_MUTEX(&send_lock);
4795 + init_MUTEX(&barrier_list_lock);
4796 + init_MUTEX(&cluster_members_lock);
4797 + init_MUTEX(&port_array_lock);
4798 + init_MUTEX(&messages_list_lock);
4799 + init_MUTEX(&listenreq_lock);
4800 + init_MUTEX(&client_socket_lock);
4801 + init_MUTEX(&new_dead_node_lock);
4802 + init_MUTEX(&event_listener_lock);
4803 + init_MUTEX(&kernel_listener_lock);
4804 + init_MUTEX(&tempnode_lock);
4805 + spin_lock_init(&active_socket_lock);
4806 + init_timer(&ack_timer);
4808 + INIT_LIST_HEAD(&event_listener_list);
4809 + INIT_LIST_HEAD(&kernel_listener_list);
4810 + INIT_LIST_HEAD(&socket_list);
4811 + INIT_LIST_HEAD(&client_socket_list);
4812 + INIT_LIST_HEAD(&active_socket_list);
4813 + INIT_LIST_HEAD(&barrier_list);
4814 + INIT_LIST_HEAD(&messages_list);
4815 + INIT_LIST_HEAD(&listenreq_list);
4816 + INIT_LIST_HEAD(&cluster_members_list);
4817 + INIT_LIST_HEAD(&new_dead_node_list);
4818 + INIT_LIST_HEAD(&tempnode_list);
4820 + atomic_set(&cnxman_running, 0);
4827 +static void __exit cluster_exit(void)
4829 +#ifdef CONFIG_PROC_FS
4830 + cleanup_proc_entries();
4833 + sock_unregister(AF_CLUSTER);
4834 + kmem_cache_destroy(cluster_sk_cachep);
4837 +module_init(cluster_init);
4838 +module_exit(cluster_exit);
4840 +EXPORT_SYMBOL(kcl_sendmsg);
4841 +EXPORT_SYMBOL(kcl_register_read_callback);
4842 +EXPORT_SYMBOL(kcl_add_callback);
4843 +EXPORT_SYMBOL(kcl_remove_callback);
4844 +EXPORT_SYMBOL(kcl_get_members);
4845 +EXPORT_SYMBOL(kcl_get_member_ids);
4846 +EXPORT_SYMBOL(kcl_get_all_members);
4847 +EXPORT_SYMBOL(kcl_is_quorate);
4848 +EXPORT_SYMBOL(kcl_get_node_by_addr);
4849 +EXPORT_SYMBOL(kcl_get_node_by_name);
4850 +EXPORT_SYMBOL(kcl_get_node_by_nodeid);
4851 +EXPORT_SYMBOL(kcl_get_node_addresses);
4852 +EXPORT_SYMBOL(kcl_addref_cluster);
4853 +EXPORT_SYMBOL(kcl_releaseref_cluster);
4854 +EXPORT_SYMBOL(kcl_cluster_name);
4856 +EXPORT_SYMBOL(kcl_barrier_register);
4857 +EXPORT_SYMBOL(kcl_barrier_setattr);
4858 +EXPORT_SYMBOL(kcl_barrier_delete);
4859 +EXPORT_SYMBOL(kcl_barrier_wait);
4860 +EXPORT_SYMBOL(kcl_barrier_cancel);
4862 +EXPORT_SYMBOL(kcl_register_quorum_device);
4863 +EXPORT_SYMBOL(kcl_unregister_quorum_device);
4864 +EXPORT_SYMBOL(kcl_quorum_device_available);
4866 +EXPORT_SYMBOL(kcl_register_service);
4867 +EXPORT_SYMBOL(kcl_unregister_service);
4868 +EXPORT_SYMBOL(kcl_join_service);
4869 +EXPORT_SYMBOL(kcl_leave_service);
4870 +EXPORT_SYMBOL(kcl_global_service_id);
4871 +EXPORT_SYMBOL(kcl_start_done);
4872 +EXPORT_SYMBOL(kcl_get_services);
4873 +EXPORT_SYMBOL(kcl_get_current_interface);
4876 + * Overrides for Emacs so that we follow Linus's tabbing style.
4877 + * Emacs will notice this stuff at the end of the file and automatically
4878 + * adjust the settings for this buffer only. This must remain at the end
4880 + * ---------------------------------------------------------------------------
4881 + * Local variables:
4882 + * c-file-style: "linux"
4885 diff -urN linux-orig/cluster/cman/config.c linux-patched/cluster/cman/config.c
4886 --- linux-orig/cluster/cman/config.c 1970-01-01 07:30:00.000000000 +0730
4887 +++ linux-patched/cluster/cman/config.c 2004-11-03 11:37:37.000000000 +0800
4889 +/******************************************************************************
4890 +*******************************************************************************
4892 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4893 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4895 +** This copyrighted material is made available to anyone wishing to use,
4896 +** modify, copy, or redistribute it subject to the terms and conditions
4897 +** of the GNU General Public License v.2.
4899 +*******************************************************************************
4900 +******************************************************************************/
4902 +#include "config.h"
4904 +/* Config file defaults */
4906 +#define DEFAULT_JOIN_WAIT_TIME 16 /* Time to wait while sending JOINREQ
4907 + * messages. Should be at least twice
4908 + * the HELLO timer, probably 3x */
4909 +#define DEFAULT_JOIN_TIMEOUT 30 /* How long we wait after getting a
4910 + * JOINACK to regarding that node as
4912 +#define DEFAULT_HELLO_TIMER 5 /* Period between HELLO messages */
4913 +#define DEFAULT_DEADNODE_TIMER 21 /* If we don't get a message from a
4914 + * node in this period kill it */
4915 +#define DEFAULT_TRANSITION_TIMER 15 /* Maximum time a state transition
4917 +#define DEFAULT_JOINCONF_TIMER 5 /* Time allowed to a node to respond to
4918 + * a JOINCONF message */
4919 +#define DEFAULT_MAX_NODES 128 /* Max allowed nodes */
4920 +#define DEFAULT_TRANSITION_RESTARTS 10 /* Maximum number of transition
4921 + * restarts before we die */
4922 +#define DEFAULT_SM_DEBUG_SIZE 256 /* Size in bytes of SM debug buffer */
4924 +#define DEFAULT_NEWCLUSTER_TIMEOUT 16 /* Time to send NEWCLUSTER messages */
4926 +struct config_info cman_config = {
4927 + .joinwait_timeout = DEFAULT_JOIN_WAIT_TIME,
4928 + .joinconf_timeout = DEFAULT_JOINCONF_TIMER,
4929 + .join_timeout = DEFAULT_JOIN_TIMEOUT,
4930 + .hello_timer = DEFAULT_HELLO_TIMER,
4931 + .deadnode_timeout = DEFAULT_DEADNODE_TIMER,
4932 + .transition_timeout = DEFAULT_TRANSITION_TIMER,
4933 + .transition_restarts = DEFAULT_TRANSITION_RESTARTS,
4934 + .max_nodes = DEFAULT_MAX_NODES,
4935 + .sm_debug_size = DEFAULT_SM_DEBUG_SIZE,
4936 + .newcluster_timeout = DEFAULT_NEWCLUSTER_TIMEOUT,
4938 diff -urN linux-orig/cluster/cman/config.h linux-patched/cluster/cman/config.h
4939 --- linux-orig/cluster/cman/config.h 1970-01-01 07:30:00.000000000 +0730
4940 +++ linux-patched/cluster/cman/config.h 2004-11-03 11:37:37.000000000 +0800
4942 +/******************************************************************************
4943 +*******************************************************************************
4945 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4946 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4948 +** This copyrighted material is made available to anyone wishing to use,
4949 +** modify, copy, or redistribute it subject to the terms and conditions
4950 +** of the GNU General Public License v.2.
4952 +*******************************************************************************
4953 +******************************************************************************/
4955 +#ifndef __CONFIG_DOT_H__
4956 +#define __CONFIG_DOT_H__
4958 +struct config_info {
4959 + int joinwait_timeout;
4960 + int joinconf_timeout;
4963 + int deadnode_timeout;
4964 + int transition_timeout;
4965 + int transition_restarts;
4967 + int sm_debug_size;
4968 + int newcluster_timeout;
4971 +extern struct config_info cman_config;
4973 +#endif /* __CONFIG_DOT_H__ */
4974 diff -urN linux-orig/cluster/cman/kjoin.c linux-patched/cluster/cman/kjoin.c
4975 --- linux-orig/cluster/cman/kjoin.c 1970-01-01 07:30:00.000000000 +0730
4976 +++ linux-patched/cluster/cman/kjoin.c 2004-11-03 11:37:37.000000000 +0800
4978 +/******************************************************************************
4979 +*******************************************************************************
4981 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4982 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4984 +** This copyrighted material is made available to anyone wishing to use,
4985 +** modify, copy, or redistribute it subject to the terms and conditions
4986 +** of the GNU General Public License v.2.
4988 +*******************************************************************************
4989 +******************************************************************************/
4991 +#include <linux/socket.h>
4992 +#include <net/sock.h>
4993 +#include <linux/list.h>
4994 +#include <cluster/cnxman.h>
4995 +#include <linux/in.h>
4997 +#include "cnxman-private.h"
4999 +static struct socket *mcast_sock;
5000 +static struct socket *recv_sock;
5001 +static struct socket *cluster_sock;
5003 +extern short cluster_id;
5004 +extern int join_count;
5005 +extern struct semaphore join_count_lock;
5006 +extern atomic_t cnxman_running;
5008 +int kcl_join_cluster(struct cl_join_cluster_info *join_info)
5011 + int one = 1, error;
5012 + unsigned int ipaddr = join_info->ipaddr, brdaddr = join_info->brdaddr;
5013 + unsigned short port = join_info->port;
5015 + struct sockaddr_in saddr;
5016 + struct kcl_multicast_sock mcast_info;
5018 + down(&join_count_lock);
5019 + if (atomic_read(&cnxman_running))
5022 + if (join_info->cluster_id == cluster_id)
5026 + up(&join_count_lock);
5029 + up(&join_count_lock);
5031 + result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &mcast_sock);
5034 + printk(KERN_ERR CMAN_NAME ": Can't create Multicast socket\n");
5038 + result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &recv_sock);
5041 + printk(KERN_ERR CMAN_NAME ": Can't create Receive socket\n");
5048 + if ((error = sock_setsockopt(mcast_sock, SOL_SOCKET, SO_BROADCAST,
5049 + (void *) &one, sizeof (int))))
5052 + printk("Error %d Setting master socket to SO_BROADCAST\n",
5054 + sock_release(mcast_sock);
5059 + /* Bind the multicast socket */
5060 + saddr.sin_family = AF_INET;
5061 + saddr.sin_port = htons(port);
5062 + saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
5064 + mcast_sock->ops->bind(mcast_sock, (struct sockaddr *) &saddr,
5068 + printk(KERN_ERR CMAN_NAME ": Can't bind multicast socket\n");
5069 + sock_release(mcast_sock);
5070 + sock_release(recv_sock);
5074 + /* Bind the receive socket to our IP address */
5075 + saddr.sin_family = AF_INET;
5076 + saddr.sin_port = htons(port);
5077 + saddr.sin_addr.s_addr = cpu_to_be32(ipaddr);
5079 + recv_sock->ops->bind(recv_sock, (struct sockaddr *) &saddr,
5083 + printk(KERN_ERR CMAN_NAME ": Can't bind receive socket\n");
5084 + sock_release(mcast_sock);
5085 + sock_release(recv_sock);
5089 + /* Create the cluster master socket */
5091 + sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER, &cluster_sock);
5094 + printk(KERN_ERR CMAN_NAME
5095 + ": Can't create cluster master socket\n");
5096 + sock_release(mcast_sock);
5097 + sock_release(recv_sock);
5101 + /* This is the broadcast transmit address */
5102 + saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
5104 + /* Pass the multicast socket to kernel space */
5105 + mcast_info.sock = mcast_sock;
5106 + mcast_info.number = 1;
5111 + if ((error = cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
5112 + KCL_SET_MULTICAST,
5113 + (void *) &mcast_info,
5114 + sizeof (mcast_info))))
5118 + ": Unable to pass multicast socket to cnxman, %d\n",
5120 + sock_release(mcast_sock);
5121 + sock_release(recv_sock);
5122 + sock_release(cluster_sock);
5126 + mcast_info.sock = recv_sock;
5128 + cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
5130 + (void *) &mcast_info,
5131 + sizeof (mcast_info))))
5135 + ": Unable to pass receive socket to cnxman, %d\n",
5137 + sock_release(mcast_sock);
5138 + sock_release(recv_sock);
5139 + sock_release(cluster_sock);
5143 + /* This setsockopt expects usermode variables */
5145 + if (cluster_sock->ops->
5146 + setsockopt(cluster_sock, CLPROTO_MASTER, CLU_JOIN_CLUSTER,
5147 + (void *) join_info,
5148 + sizeof (struct cl_join_cluster_info)))
5152 + printk(CMAN_NAME ": Unable to join cluster\n");
5153 + sock_release(mcast_sock);
5154 + sock_release(recv_sock);
5155 + sock_release(cluster_sock);
5163 +int kcl_leave_cluster(int remove)
5168 + struct socket *shutdown_sock = cluster_sock;
5170 + cluster_sock = NULL;
5172 + if (!shutdown_sock)
5174 + /* Create the cluster master socket */
5176 + sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER,
5180 + printk(KERN_ERR CMAN_NAME
5181 + ": Can't create cluster master socket\n");
5182 + sock_release(mcast_sock);
5183 + sock_release(recv_sock);
5192 + shutdown_sock->ops->setsockopt(shutdown_sock, CLPROTO_MASTER,
5193 + CLU_LEAVE_CLUSTER, (void *) &rem,
5196 + printk(KERN_ERR CMAN_NAME ": Unable to leave cluster, %d\n",
5201 + sock_release(shutdown_sock);
5207 + * Overrides for Emacs so that we follow Linus's tabbing style.
5208 + * Emacs will notice this stuff at the end of the file and automatically
5209 + * adjust the settings for this buffer only. This must remain at the end
5211 + * ---------------------------------------------------------------------------
5212 + * Local variables:
5213 + * c-file-style: "linux"
5216 diff -urN linux-orig/cluster/cman/membership.c linux-patched/cluster/cman/membership.c
5217 --- linux-orig/cluster/cman/membership.c 1970-01-01 07:30:00.000000000 +0730
5218 +++ linux-patched/cluster/cman/membership.c 2004-11-03 11:37:37.000000000 +0800
5220 +/******************************************************************************
5221 +*******************************************************************************
5223 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5224 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5226 +** This copyrighted material is made available to anyone wishing to use,
5227 +** modify, copy, or redistribute it subject to the terms and conditions
5228 +** of the GNU General Public License v.2.
5230 +*******************************************************************************
5231 +******************************************************************************/
5233 +#include <linux/socket.h>
5234 +#include <net/sock.h>
5235 +#include <linux/slab.h>
5236 +#include <linux/spinlock.h>
5237 +#include <linux/vmalloc.h>
5238 +#include <asm/uaccess.h>
5239 +#include <linux/list.h>
5240 +#include <cluster/cnxman.h>
5242 +#include "cnxman-private.h"
5243 +#include "config.h"
5244 +#include "sm_control.h"
5250 +/* Barrier name for membership transitions. %d is the cluster generation number
5252 +#define MEMBERSHIP_BARRIER_NAME "TRANSITION.%d"
5254 +/* Variables also used by connection manager */
5255 +struct list_head cluster_members_list;
5256 +struct semaphore cluster_members_lock;
5257 +int cluster_members; /* Number of ACTIVE members, not a count of
5258 + * nodes in the list */
5259 +int we_are_a_cluster_member;
5260 +int cluster_is_quorate;
5262 +struct task_struct *membership_task;
5263 +struct cluster_node *us;
5265 +static struct task_struct *hello_task;
5266 +static struct semaphore hello_task_lock;
5268 +/* Variables that belong to the connection manager */
5269 +extern wait_queue_head_t cnxman_waitq;
5270 +extern struct completion member_thread_comp;
5271 +extern struct cluster_node *quorum_device;
5272 +extern unsigned short two_node;
5273 +extern char cluster_name[];
5274 +extern unsigned int config_version;
5275 +extern unsigned int address_length;
5277 +static struct socket *mem_socket;
5278 +static pid_t kcluster_pid;
5280 +static char iobuf[MAX_CLUSTER_MESSAGE];
5281 +static char scratchbuf[MAX_CLUSTER_MESSAGE + 100];
5283 +/* Our node name, usually system_utsname.nodename, but can be overridden */
5284 +char nodename[MAX_CLUSTER_MEMBER_NAME_LEN + 1];
5286 +/* Node ID that we want. defaults of zero means
5287 + * it will be allocated by the cluster join mechanism
5291 +static spinlock_t members_by_nodeid_lock;
5292 +static int sizeof_members_array; /* Can dynamically increase (vmalloc
5294 +static struct cluster_node **members_by_nodeid;
5296 +#define MEMBER_INCREMENT_SIZE 10
5298 +static int votes = 1; /* Votes this node has */
5299 +static int expected_votes = 1; /* Total expected votes in the cluster */
5300 +static unsigned int quorum; /* Quorum, fewer votes than this and we stop
5302 +static int leavereason; /* Saved for the duration of a state transition */
5303 +static int transitionreason; /* Reason this transition was initiated */
5304 +static unsigned int highest_nodeid; /* Highest node ID known to the cluster */
5305 +static struct timer_list transition_timer; /* Kicks in if the transition
5306 + * doesn't complete in a
5307 + * reasonable time */
5308 +static struct timer_list hello_timer; /* Timer to send HELLOs on */
5309 +static unsigned long join_time; /* The time that we got our JOIN-ACK */
5310 +static unsigned long start_time; /* The time that we were started */
5311 +static int joinconf_count; /* Number of JOINCONF messages we have sent to
5313 +static unsigned long wake_flags;/* Reason we were woken */
5315 +/* Flags in above */
5316 +#define WAKE_FLAG_DEADNODE 1
5317 +#define WAKE_FLAG_TRANSTIMER 2
5319 +/* The time the transition finished */
5320 +static unsigned long transition_end_time;
5322 +/* A list of nodes that cnxman tells us are dead. I hope this never has more
5323 + * than one element in it but I can't take that chance. only non-static so it
5324 + * can be initialised in module_load. */
5325 +struct list_head new_dead_node_list;
5326 +struct semaphore new_dead_node_lock;
5328 +static int do_membership_packet(struct msghdr *msg, char *buf, int len);
5329 +static int do_process_joinreq(struct msghdr *msg, char *buf, int len);
5330 +static int do_process_joinack(struct msghdr *msg, char *buf, int len);
5331 +static int do_process_joinconf(struct msghdr *msg, char *buf, int len);
5332 +static int do_process_leave(struct msghdr *msg, char *buf, int len);
5333 +static int do_process_hello(struct msghdr *msg, char *buf, int len);
5334 +static int do_process_kill(struct msghdr *msg, char *buf, int len);
5335 +static int do_process_reconfig(struct msghdr *msg, char *buf, int len);
5336 +static int do_process_starttrans(struct msghdr *msg, char *buf, int len);
5337 +static int do_process_masterview(struct msghdr *msg, char *buf, int len);
5338 +static int do_process_endtrans(struct msghdr *msg, char *buf, int len);
5339 +static int do_process_viewack(struct msghdr *msg, char *buf, int len);
5340 +static int do_process_startack(struct msghdr *msg, char *buf, int len);
5341 +static int do_process_newcluster(struct msghdr *msg, char *buf, int len);
5342 +static int do_process_nominate(struct msghdr *msg, char *buf, int len);
5343 +static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
5344 + unsigned int flags, unsigned int flags2);
5345 +static int send_joinreq(struct sockaddr_cl *addr, int addr_len);
5346 +static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id);
5347 +static int send_hello(void);
5348 +static int send_master_hello(void);
5349 +static int send_newcluster(void);
5350 +static int end_transition(void);
5351 +static int dispatch_messages(struct socket *mem_socket);
5352 +static void check_for_dead_nodes(void);
5353 +static void confirm_joiner(void);
5354 +static void reset_hello_time(void);
5355 +static int add_us(void);
5356 +static int send_joinconf(void);
5357 +static int init_membership_services(void);
5358 +static int elect_master(struct cluster_node **);
5359 +static void trans_timer_expired(unsigned long arg);
5360 +static void hello_timer_expired(unsigned long arg);
5361 +static void join_or_form_cluster(void);
5362 +static int do_timer_wakeup(void);
5363 +static int start_transition(unsigned char reason, struct cluster_node *node);
5364 +static uint32_t low32_of_ip(void);
5365 +int send_leave(unsigned char);
5366 +int send_reconfigure(int, unsigned int);
5369 +static char *msgname(int msg);
5370 +static int debug_sendmsg(struct socket *sock, void *buf, int size,
5371 + struct sockaddr_cl *caddr, int addr_len,
5372 + unsigned int flags)
5374 + P_MEMB("%ld: sending %s, len=%d\n", jiffies, msgname(((char *) buf)[0]),
5376 + return kcl_sendmsg(sock, buf, size, caddr, addr_len, flags);
5379 +#define kcl_sendmsg debug_sendmsg
5382 +/* State of the node */
5383 +static enum { STARTING, NEWCLUSTER, JOINING, JOINWAIT, JOINACK, TRANSITION,
5384 + TRANSITION_COMPLETE, MEMBER, REJECTED, LEFT_CLUSTER, MASTER
5385 +} node_state = LEFT_CLUSTER;
5387 +/* Sub-state when we are MASTER */
5388 +static enum { MASTER_START, MASTER_COLLECT, MASTER_CONFIRM,
5389 + MASTER_COMPLETE } master_state;
5391 +/* Number of responses collected while a master controlling a state transition */
5392 +static int responses_collected;
5393 +static int responses_expected;
5395 +/* Current cluster generation number */
5396 +int cluster_generation = 1;
5398 +/* When another node initiates a transtion then store it's pointer in here so
5399 + * we can check for other nodes trying to spoof us */
5400 +static struct cluster_node *master_node = NULL;
5402 +/* Struct the node wanting to join us */
5403 +static struct cluster_node *joining_node = NULL;
5404 +static int joining_temp_nodeid;
5406 +/* Last time a HELLO message was sent */
5407 +unsigned long last_hello;
5409 +/* When we got our JOINWAIT or NEWCLUSTER */
5410 +unsigned long joinwait_time;
5412 +/* Number of times a transition has restarted when we were master */
5413 +int transition_restarts;
5415 +/* Variables used by the master to collect cluster status during a transition */
5416 +static int agreeing_nodes;
5417 +static int dissenting_nodes;
5418 +static uint8_t *node_opinion = NULL;
5419 +#define OPINION_AGREE 1
5420 +#define OPINION_DISAGREE 2
5422 +/* Set node id of a node, also add it to the members array and expand the array
5424 +static inline void set_nodeid(struct cluster_node *node, int nodeid)
5429 + node->node_id = nodeid;
5430 + if (nodeid >= sizeof_members_array) {
5431 + int new_size = sizeof_members_array + MEMBER_INCREMENT_SIZE;
5432 + struct cluster_node **new_array;
5434 + if (new_size < nodeid)
5435 + new_size = nodeid + MEMBER_INCREMENT_SIZE;
5437 + new_array = vmalloc((new_size) * sizeof (struct cluster_node *));
5439 + spin_lock(&members_by_nodeid_lock);
5440 + memcpy(new_array, members_by_nodeid,
5441 + sizeof_members_array *
5442 + sizeof (struct cluster_node *));
5443 + memset(&new_array[sizeof_members_array], 0,
5444 + (new_size - sizeof_members_array) *
5445 + sizeof (struct cluster_node *));
5446 + vfree(members_by_nodeid);
5448 + members_by_nodeid = new_array;
5449 + sizeof_members_array = new_size;
5450 + spin_unlock(&members_by_nodeid_lock);
5453 + panic("No memory for more nodes");
5456 + notify_kernel_listeners(NEWNODE, (long) nodeid);
5458 + spin_lock(&members_by_nodeid_lock);
5459 + members_by_nodeid[nodeid] = node;
5460 + spin_unlock(&members_by_nodeid_lock);
5463 +static int hello_kthread(void *unused)
5465 + struct task_struct *tsk = current;
5468 + daemonize("cman_hbeat");
5470 + /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
5471 + siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
5472 + sigprocmask(SIG_BLOCK, &tmpsig, NULL);
5474 + down(&hello_task_lock);
5476 + up(&hello_task_lock);
5478 + set_user_nice(current, -6);
5480 + while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
5482 + /* Scan the nodes list for dead nodes */
5483 + if (node_state == MEMBER)
5484 + check_for_dead_nodes();
5486 + set_task_state(current, TASK_INTERRUPTIBLE);
5488 + set_task_state(current, TASK_RUNNING);
5490 + if (node_state != REJECTED && node_state != LEFT_CLUSTER)
5493 + down(&hello_task_lock);
5494 + hello_task = NULL;
5495 + up(&hello_task_lock);
5496 + P_MEMB("heartbeat closing down\n");
5500 +/* This is the membership "daemon". A client of cnxman (but symbiotic with it)
5501 + * that keeps track of and controls cluster membership. */
5502 +static int membership_kthread(void *unused)
5504 + struct task_struct *tsk = current;
5507 + daemonize("cman_memb");
5509 + /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
5510 + siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
5511 + sigprocmask(SIG_BLOCK, &tmpsig, NULL);
5513 + membership_task = tsk;
5514 + set_user_nice(current, -5);
5516 + /* Open the socket */
5517 + if (init_membership_services())
5521 + joining_node = us;
5523 + init_timer(&hello_timer);
5524 + hello_timer.function = hello_timer_expired;
5525 + hello_timer.data = 0L;
5527 + /* Do joining stuff */
5528 + join_or_form_cluster();
5530 + transition_end_time = jiffies;
5533 + while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
5535 + struct task_struct *tsk = current;
5537 + DECLARE_WAITQUEUE(wait, tsk);
5539 + tsk->state = TASK_INTERRUPTIBLE;
5540 + add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5542 + if (!skb_peek(&mem_socket->sk->sk_receive_queue) &&
5543 + wake_flags == 0) {
5544 + if (node_state == JOINACK ||
5545 + node_state == JOINWAIT)
5546 + schedule_timeout(HZ);
5551 + tsk->state = TASK_RUNNING;
5552 + remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5554 + /* Are we being shut down? */
5555 + if (node_state == LEFT_CLUSTER || quit_threads ||
5556 + signal_pending(current))
5559 + /* Were we woken by a dead node passed down from cnxman ? */
5560 + if (test_and_clear_bit(WAKE_FLAG_DEADNODE, &wake_flags)) {
5561 + struct list_head *nodelist, *tmp;
5562 + struct cl_new_dead_node *deadnode;
5564 + down(&new_dead_node_lock);
5565 + list_for_each_safe(nodelist, tmp, &new_dead_node_list) {
5567 + list_entry(nodelist,
5568 + struct cl_new_dead_node, list);
5570 + if (deadnode->node->state == NODESTATE_MEMBER)
5571 + a_node_just_died(deadnode->node);
5572 + list_del(&deadnode->list);
5575 + up(&new_dead_node_lock);
5578 + /* Process received messages. If dispatch_message() returns an
5579 + * error then we shut down */
5580 + if (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5581 + if (dispatch_messages(mem_socket) < 0)
5582 + goto leave_cluster;
5586 + /* Were we woken by the transition timer firing ? */
5587 + if (test_and_clear_bit(WAKE_FLAG_TRANSTIMER, &wake_flags)) {
5588 + switch (do_timer_wakeup()) {
5594 + goto leave_cluster;
5598 + /* Got a JOINACK but no JOIN-CONF, start waiting for HELLO
5599 + * messages again */
5600 + if (node_state == JOINACK &&
5601 + time_after(jiffies,
5602 + join_time + cman_config.join_timeout * HZ)) {
5604 + ("Waited a long time for a join-conf, going back to JOINWAIT state\n");
5605 + node_state = JOINWAIT;
5606 + joinwait_time = jiffies;
5609 + /* Have we had an ACK for our JOINREQ message ? */
5610 + if (node_state == JOINING &&
5611 + time_after(jiffies,
5612 + join_time + cman_config.join_timeout * HZ)) {
5613 + P_MEMB("didn't get JOINACK, going back to JOINWAIT\n");
5614 + node_state = JOINWAIT;
5615 + joinwait_time = jiffies;
5618 + /* Have we been in joinwait for too long... */
5619 + if (node_state == JOINWAIT &&
5620 + time_after(jiffies,
5621 + joinwait_time + cman_config.joinwait_timeout * HZ)) {
5623 + ": Been in JOINWAIT for too long - giving up\n");
5624 + goto leave_cluster;
5630 + /* Wake up the heartbeat thread so it can exit */
5631 + down(&hello_task_lock);
5633 + wake_up_process(hello_task);
5634 + up(&hello_task_lock);
5636 + if (timer_pending(&hello_timer))
5637 + del_timer(&hello_timer);
5639 + if (timer_pending(&transition_timer))
5640 + del_timer(&transition_timer);
5642 + node_state = LEFT_CLUSTER;
5643 + P_MEMB("closing down\n");
5644 + quit_threads = 1; /* force other thread to exit too */
5646 + send_leave(us->leave_reason);
5647 + sock_release(mem_socket);
5648 + highest_nodeid = 0;
5649 + complete(&member_thread_comp);
5653 +/* Things to do in the main thread when the transition timer has woken us.
5654 + * Usually this happens when a transition is taking too long and we need to
5655 + * take remedial action.
5657 + * returns: -1 continue; 0 carry on processing +1 leave cluster; */
5658 +static int do_timer_wakeup()
5660 + P_MEMB("Timer wakeup - checking for dead master node %ld\n", jiffies);
5662 + /* Resend JOINCONF if it got lost on the wire */
5663 + if (node_state == MASTER && master_state == MASTER_CONFIRM) {
5664 + mod_timer(&transition_timer,
5665 + jiffies + cman_config.joinconf_timeout * HZ);
5666 + if (++joinconf_count < MAX_RETRIES) {
5667 + P_MEMB("Resending JOINCONF\n");
5671 + P_MEMB("JOINCONF not acked, cancelling transition\n");
5677 + /* A joining node probably died */
5678 + if (cluster_members == 1) {
5683 + /* See if the master is still there */
5684 + if (node_state == TRANSITION || node_state == TRANSITION_COMPLETE) {
5686 + /* If we are in transition and master_node is NULL then we are
5687 + * waiting for ENDTRANS after JOIN-CONF */
5688 + if (!master_node) {
5689 + /* Hmmm. master died after sending JOINCONF, we'll have
5690 + * to die as we are in mid-transition */
5691 + printk(KERN_INFO CMAN_NAME
5692 + ": Master died after JOINCONF, we must leave the cluster\n");
5697 + /* No messages from the master - see if it's stil there */
5698 + if (master_node->state == NODESTATE_MEMBER) {
5699 + send_master_hello();
5700 + mod_timer(&transition_timer,
5702 + cman_config.transition_timeout * HZ);
5705 + /* If the master is dead then elect a new one */
5706 + if (master_node->state == NODESTATE_DEAD) {
5708 + struct cluster_node *node;
5710 + P_MEMB("Master node is dead...Election!\n");
5711 + if (elect_master(&node)) {
5713 + /* We are master now, all kneel */
5714 + start_transition(TRANS_DEADMASTER, master_node);
5717 + /* Leave the job to someone on more pay */
5718 + master_node = node;
5719 + mod_timer(&transition_timer,
5721 + cman_config.transition_timeout * HZ);
5726 + /* If we are the master node then restart the transition */
5727 + if (node_state == MASTER) {
5728 + start_transition(TRANS_RESTART, us);
5734 +static void form_cluster(void)
5736 + printk(KERN_INFO CMAN_NAME ": forming a new cluster\n");
5737 + node_state = MEMBER;
5738 + we_are_a_cluster_member = TRUE;
5739 + us->state = NODESTATE_MEMBER;
5740 + if (wanted_nodeid)
5741 + set_nodeid(us, wanted_nodeid);
5743 + set_nodeid(us, 1);
5744 + recalculate_quorum(0);
5745 + sm_member_update(cluster_is_quorate);
5747 + kernel_thread(hello_kthread, NULL, 0);
5748 + mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
5751 +/* This does the initial JOIN part of the membership process. Actually most of
5752 + * is done in the message processing routines but this is the main loop that
5753 + * controls it. The side-effect of this routine is "node_state" which tells the
5754 + * real main loop (in the kernel thread routine) what to do next */
5755 +static void join_or_form_cluster()
5757 + start_time = jiffies;
5759 + printk(KERN_INFO CMAN_NAME
5760 + ": Waiting to join or form a Linux-cluster\n");
5764 + start_time = jiffies;
5765 + joinwait_time = jiffies;
5768 + /* Listen for HELLO or NEWCLUSTER messages */
5770 + DECLARE_WAITQUEUE(wait, current);
5771 + set_task_state(current, TASK_INTERRUPTIBLE);
5772 + add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5774 + if (!skb_peek(&mem_socket->sk->sk_receive_queue))
5775 + schedule_timeout((cman_config.joinwait_timeout * HZ) /
5778 + set_task_state(current, TASK_RUNNING);
5779 + remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5781 + while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5782 + dispatch_messages(mem_socket);
5785 + node_state = LEFT_CLUSTER;
5788 + while (time_before(jiffies, start_time + cman_config.joinwait_timeout * HZ) &&
5789 + node_state == STARTING);
5791 + if (node_state == STARTING) {
5792 + start_time = jiffies;
5793 + joinwait_time = jiffies;
5794 + node_state = NEWCLUSTER;
5797 + /* If we didn't hear any HELLO messages then start sending NEWCLUSTER messages */
5798 + while (time_before(jiffies, start_time + cman_config.newcluster_timeout * HZ) &&
5799 + node_state == NEWCLUSTER) {
5801 + DECLARE_WAITQUEUE(wait, current);
5803 + send_newcluster();
5805 + set_task_state(current, TASK_INTERRUPTIBLE);
5806 + add_wait_queue(mem_socket->sk->sk_sleep, &wait);
5808 + if (!skb_peek(&mem_socket->sk->sk_receive_queue))
5809 + schedule_timeout((cman_config.joinwait_timeout * HZ) /
5812 + set_task_state(current, TASK_RUNNING);
5813 + remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
5815 + while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
5816 + dispatch_messages(mem_socket);
5818 + /* Did we get a lower "NEWCLUSTER" message ? */
5819 + if (node_state == STARTING) {
5820 + P_MEMB("NEWCLUSTER: restarting joinwait\n");
5821 + goto restart_joinwait;
5825 + node_state = LEFT_CLUSTER;
5830 + /* If we didn't hear any HELLO messages then form a new cluster */
5831 + if (node_state == NEWCLUSTER) {
5835 + last_hello = jiffies;
5839 +int start_membership_services(pid_t cluster_pid)
5841 + kcluster_pid = cluster_pid;
5843 + init_timer(&transition_timer);
5844 + transition_timer.function = trans_timer_expired;
5845 + transition_timer.data = 0L;
5847 + /* Start the thread */
5848 + return kernel_thread(membership_kthread, NULL, 0);
5851 +static int init_membership_services()
5854 + struct sockaddr_cl saddr;
5855 + struct socket *sock;
5857 + init_MUTEX(&hello_task_lock);
5858 + /* Create a socket to communicate with */
5859 + result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
5861 + printk(KERN_ERR CMAN_NAME
5862 + ": Can't create cluster socket for membership services\n");
5865 + mem_socket = sock;
5867 + /* Bind to our port */
5868 + saddr.scl_family = AF_CLUSTER;
5869 + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5871 + sock->ops->bind(sock, (struct sockaddr *) &saddr, sizeof (saddr));
5873 + printk(KERN_ERR CMAN_NAME
5874 + ": Can't bind to cluster membership services port\n");
5875 + sock_release(sock);
5879 + node_state = STARTING;
5883 +static int send_joinconf()
5885 + struct sockaddr_cl saddr;
5888 + if (joining_temp_nodeid == 0) {
5892 + master_state = MASTER_CONFIRM;
5893 + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
5894 + saddr.scl_family = AF_CLUSTER;
5895 + saddr.scl_nodeid = joining_temp_nodeid;
5896 + status = send_cluster_view(CLUSTER_MEM_JOINCONF, &saddr,
5900 + printk("Error %d sending JOINCONF, aborting transition\n", status);
5906 +static int send_joinreq(struct sockaddr_cl *addr, int addr_len)
5908 + char *msgbuf = scratchbuf;
5909 + struct list_head *addrlist;
5910 + int ptr = sizeof (struct cl_mem_join_msg);
5911 + unsigned short num_addr = 0;
5912 + struct cluster_node_addr *nodeaddr;
5913 + struct cl_mem_join_msg *msg = (struct cl_mem_join_msg *) msgbuf;
5915 + msg->cmd = CLUSTER_MEM_JOINREQ;
5916 + msg->votes = votes;
5917 + msg->expected_votes = cpu_to_le32(expected_votes);
5918 + msg->nodeid = cpu_to_le32(wanted_nodeid);
5919 + msg->major_version = cpu_to_le32(CNXMAN_MAJOR_VERSION);
5920 + msg->minor_version = cpu_to_le32(CNXMAN_MINOR_VERSION);
5921 + msg->patch_version = cpu_to_le32(CNXMAN_PATCH_VERSION);
5922 + msg->config_version = cpu_to_le32(config_version);
5923 + msg->addr_len = cpu_to_le32(address_length);
5924 + strcpy(msg->clustername, cluster_name);
5926 + /* Add our addresses */
5927 + list_for_each(addrlist, &us->addr_list) {
5928 + nodeaddr = list_entry(addrlist, struct cluster_node_addr, list);
5930 + memcpy(msgbuf + ptr, nodeaddr->addr, address_length);
5931 + ptr += address_length;
5934 + msg->num_addr = cpu_to_le16(num_addr);
5936 + /* And our name */
5937 + strcpy(msgbuf + ptr, nodename);
5938 + ptr += strlen(nodename) + 1;
5940 + return kcl_sendmsg(mem_socket, msgbuf, ptr,
5941 + addr, addr_len, MSG_NOACK);
5944 +static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id)
5946 + struct cl_mem_startack_msg msg;
5948 + msg.cmd = CLUSTER_MEM_STARTACK;
5949 + msg.generation = cpu_to_le32(cluster_generation);
5950 + msg.node_id = cpu_to_le32(node_id);
5951 + msg.highest_node_id = cpu_to_le32(get_highest_nodeid());
5953 + return kcl_sendmsg(mem_socket, &msg, sizeof (msg), addr, addr_len, MSG_REPLYEXP);
5956 +static int send_newcluster()
5961 + buf[0] = CLUSTER_MEM_NEWCLUSTER;
5962 + lowip = cpu_to_le32(low32_of_ip());
5963 + memcpy(&buf[1], &lowip, sizeof(lowip));
5965 + return kcl_sendmsg(mem_socket, buf, sizeof(uint32_t)+1,
5970 +static int send_hello()
5972 + struct cl_mem_hello_msg hello_msg;
5975 + hello_msg.cmd = CLUSTER_MEM_HELLO;
5976 + hello_msg.members = cpu_to_le16(cluster_members);
5977 + hello_msg.flags = cluster_is_quorate ? HELLO_FLAG_QUORATE : 0;
5978 + hello_msg.generation = cpu_to_le32(cluster_generation);
5980 + status = kcl_sendmsg(mem_socket, &hello_msg,
5981 + sizeof(struct cl_mem_hello_msg),
5982 + NULL, 0, MSG_NOACK | MSG_ALLINT);
5984 + last_hello = jiffies;
5989 +/* This is a special HELLO message that requires an ACK. clients in transition
5990 + * send these to the master to check it is still alive. If it does not ACK then
5991 + * cnxman will signal it dead and we can restart the transition */
5992 +static int send_master_hello()
5994 + struct cl_mem_hello_msg hello_msg;
5996 + struct sockaddr_cl saddr;
5998 + hello_msg.cmd = CLUSTER_MEM_HELLO;
5999 + hello_msg.members = cpu_to_le16(cluster_members);
6000 + hello_msg.flags = HELLO_FLAG_MASTER |
6001 + (cluster_is_quorate ? HELLO_FLAG_QUORATE : 0);
6002 + hello_msg.generation = cpu_to_le32(cluster_generation);
6004 + saddr.scl_family = AF_CLUSTER;
6005 + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
6006 + saddr.scl_nodeid = master_node->node_id;
6008 + status = kcl_sendmsg(mem_socket, &hello_msg,
6009 + sizeof(struct cl_mem_hello_msg),
6010 + &saddr, sizeof (saddr), 0);
6012 + last_hello = jiffies;
6017 +/* Called when the transition timer has expired, meaning we sent a transition
6018 + * message that was not ACKed */
6019 +static void trans_timer_expired(unsigned long arg)
6021 + P_MEMB("Transition timer fired %ld\n", jiffies);
6023 + set_bit(WAKE_FLAG_TRANSTIMER, &wake_flags);
6024 + wake_up_process(membership_task);
6027 +static void hello_timer_expired(unsigned long arg)
6029 + P_MEMB("Hello timer fired %ld\n", jiffies);
6031 + mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
6033 + if (node_state >= TRANSITION) {
6034 + wake_up_process(hello_task);
6038 +static int wait_for_completion_barrier(void)
6041 + char barriername[MAX_BARRIER_NAME_LEN];
6043 + sprintf(barriername, MEMBERSHIP_BARRIER_NAME, cluster_generation);
6045 + /* Make sure we all complete together */
6046 + P_MEMB("Waiting for completion barrier: %d members\n", cluster_members);
6048 + kcl_barrier_register(barriername, 0, cluster_members)) < 0) {
6049 + printk(CMAN_NAME ": Error registering barrier: %d\n", status);
6052 + kcl_barrier_setattr(barriername, BARRIER_SETATTR_TIMEOUT,
6053 + cman_config.transition_timeout);
6054 + status = kcl_barrier_wait(barriername);
6055 + kcl_barrier_delete(barriername);
6057 + P_MEMB("Completion barrier reached : status = %d\n", status);
6061 +/* Called at the end of a state transition when we are the master */
6062 +static int end_transition()
6064 + struct cl_mem_endtrans_msg msg;
6068 + /* Cancel the timer */
6069 + del_timer(&transition_timer);
6073 + quorum = calculate_quorum(leavereason, 0, &total_votes);
6075 + msg.cmd = CLUSTER_MEM_ENDTRANS;
6076 + msg.quorum = cpu_to_le32(quorum);
6077 + msg.generation = cpu_to_le32(++cluster_generation);
6078 + msg.total_votes = cpu_to_le32(total_votes);
6079 + if (joining_node && transitionreason == TRANS_NEWNODE) {
6080 + msg.new_node_id = cpu_to_le32(joining_node->node_id);
6083 + msg.new_node_id = 0;
6085 + status = kcl_sendmsg(mem_socket, &msg, sizeof (msg), NULL, 0, 0);
6087 + /* When that's all settled down, do the transition completion barrier */
6088 + kcl_wait_for_all_acks();
6090 + if (wait_for_completion_barrier() != 0) {
6091 + P_MEMB("Barrier timed out - restart\n");
6092 + start_transition(TRANS_RESTART, us);
6096 + joining_temp_nodeid = 0;
6097 + purge_temp_nodeids();
6099 + set_quorate(total_votes);
6101 + notify_listeners();
6102 + reset_hello_time();
6104 + /* Tell any waiting barriers that we had a transition */
6105 + check_barrier_returns();
6108 + node_state = MEMBER;
6109 + transition_end_time = jiffies;
6111 + sm_member_update(cluster_is_quorate);
6116 +int send_reconfigure(int param, unsigned int value)
6119 + struct cl_mem_reconfig_msg *msg =
6120 + (struct cl_mem_reconfig_msg *) &msgbuf;
6122 + if (param == RECONFIG_PARAM_EXPECTED_VOTES && expected_votes > value)
6123 + expected_votes = value;
6125 + msg->cmd = CLUSTER_MEM_RECONFIG;
6126 + msg->param = param;
6127 + msg->value = cpu_to_le32(value);
6129 + return kcl_sendmsg(mem_socket, &msgbuf, sizeof (*msg), NULL, 0, 0);
6132 +static int send_joinack(char *addr, int addr_len, unsigned char acktype)
6134 + struct cl_mem_joinack_msg msg;
6136 + msg.cmd = CLUSTER_MEM_JOINACK;
6137 + msg.acktype = acktype;
6139 + return kcl_sendmsg(mem_socket, &msg, sizeof (msg),
6140 + (struct sockaddr_cl *)addr, addr_len, MSG_NOACK);
6143 +/* Only send a leave message to one node in the cluster so that it can master
6144 + * the state transition, otherwise we get a "thundering herd" of potential
6145 + * masters fighting it out */
6146 +int send_leave(unsigned char flags)
6148 + unsigned char msg[2];
6149 + struct sockaddr_cl saddr;
6150 + struct cluster_node *node = NULL;
6156 + saddr.scl_family = AF_CLUSTER;
6157 + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
6159 + /* If we are in transition then use the current master */
6160 + if (node_state == TRANSITION) {
6161 + node = master_node;
6164 + /* If we are the master or not in transition then pick a node
6165 + * almost at random */
6166 + struct list_head *nodelist;
6168 + down(&cluster_members_lock);
6169 + list_for_each(nodelist, &cluster_members_list) {
6170 + node = list_entry(nodelist, struct cluster_node, list);
6172 + if (node->state == NODESTATE_MEMBER && !node->us)
6175 + up(&cluster_members_lock);
6178 + /* we are the only member of the cluster - there is no-one to tell */
6179 + if (node && !node->us) {
6180 + saddr.scl_nodeid = node->node_id;
6182 + P_MEMB("Sending LEAVE to %s\n", node->name);
6183 + msg[0] = CLUSTER_MEM_LEAVE;
6185 + status = kcl_sendmsg(mem_socket, msg, 2,
6186 + &saddr, sizeof (saddr),
6193 + node_state = LEFT_CLUSTER;
6194 + wake_up_process(membership_task);
6198 +int send_kill(int nodeid)
6201 + struct sockaddr_cl saddr;
6203 + killmsg = CLUSTER_MEM_KILL;
6205 + saddr.scl_family = AF_CLUSTER;
6206 + saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
6207 + saddr.scl_nodeid = nodeid;
6208 + return kcl_sendmsg(mem_socket, &killmsg, 1, &saddr,
6209 + sizeof (struct sockaddr_cl), MSG_NOACK);
6212 +/* Process a message */
6213 +static int do_membership_packet(struct msghdr *msg, char *buf, int len)
6216 + struct sockaddr_cl *saddr = msg->msg_name;
6217 + struct cluster_node *node;
6219 + node = find_node_by_nodeid(saddr->scl_nodeid);
6221 + P_MEMB("got membership message : %s, from (%d) %s, len = %d\n",
6222 + msgname(*buf), saddr->scl_nodeid, node ? node->name : "unknown", len);
6225 + case CLUSTER_MEM_JOINREQ:
6226 + result = do_process_joinreq(msg, buf, len);
6229 + case CLUSTER_MEM_LEAVE:
6230 + if (we_are_a_cluster_member)
6231 + result = do_process_leave(msg, buf, len);
6234 + case CLUSTER_MEM_HELLO:
6235 + result = do_process_hello(msg, buf, len);
6238 + case CLUSTER_MEM_KILL:
6239 + if (we_are_a_cluster_member)
6240 + result = do_process_kill(msg, buf, len);
6243 + case CLUSTER_MEM_JOINCONF:
6244 + if (node_state == JOINACK) {
6245 + do_process_joinconf(msg, buf, len);
6249 + case CLUSTER_MEM_CONFACK:
6250 + if (node_state == MASTER && master_state == MASTER_CONFIRM) {
6255 + case CLUSTER_MEM_MASTERVIEW:
6256 + if (node_state == TRANSITION)
6257 + do_process_masterview(msg, buf, len);
6260 + case CLUSTER_MEM_JOINACK:
6261 + if (node_state == JOINING || node_state == JOINWAIT ||
6262 + node_state == JOINACK) {
6263 + do_process_joinack(msg, buf, len);
6266 + case CLUSTER_MEM_RECONFIG:
6267 + if (we_are_a_cluster_member) {
6268 + do_process_reconfig(msg, buf, len);
6272 + case CLUSTER_MEM_STARTTRANS:
6273 + result = do_process_starttrans(msg, buf, len);
6276 + case CLUSTER_MEM_ENDTRANS:
6277 + result = do_process_endtrans(msg, buf, len);
6280 + case CLUSTER_MEM_VIEWACK:
6281 + if (node_state == MASTER && master_state == MASTER_COLLECT)
6282 + result = do_process_viewack(msg, buf, len);
6285 + case CLUSTER_MEM_STARTACK:
6286 + if (node_state == MASTER)
6287 + result = do_process_startack(msg, buf, len);
6290 + case CLUSTER_MEM_NEWCLUSTER:
6291 + result = do_process_newcluster(msg, buf, len);
6294 + case CLUSTER_MEM_NOMINATE:
6295 + if (node_state != MASTER)
6296 + result = do_process_nominate(msg, buf, len);
6300 + printk(KERN_ERR CMAN_NAME
6301 + ": Unknown membership services message %d received from node %d port %d\n",
6302 + *buf, saddr->scl_nodeid, saddr->scl_port);
6309 +/* Returns -ve to reject membership of the cluster, 0 to accept membership, +ve
6310 + * to ignore request (node already joining) */
6311 +static int check_duplicate_node(char *name, struct msghdr *msg, int len)
6313 + struct cluster_node *node;
6314 + struct sockaddr_cl *saddr = (struct sockaddr_cl *)msg->msg_name;
6315 + char addr[address_length];
6318 + if (strlen(name) >= MAX_CLUSTER_MEMBER_NAME_LEN)
6321 + /* See if we already have a cluster member with that name... */
6322 + node = find_node_by_name(name);
6323 + if (node && node->state != NODESTATE_DEAD) {
6325 + if (node->state == NODESTATE_JOINING)
6328 + printk(KERN_WARNING CMAN_NAME
6329 + ": Rejecting cluster membership application from %s - already have a node with that name\n",
6335 + /* Need to check the node's address too */
6336 + if (get_addr_from_temp_nodeid(saddr->scl_nodeid, addr, &addrlen) &&
6337 + (node = find_node_by_addr(addr, addrlen)) &&
6338 + node->state != NODESTATE_DEAD) {
6340 + if (node->state == NODESTATE_JOINING)
6343 + printk(KERN_WARNING CMAN_NAME
6344 + ": Rejecting cluster membership application from %s - already have a node with that address\n",
6351 +/* Start the state transition */
6352 +static int start_transition(unsigned char reason, struct cluster_node *node)
6354 + char *startbuf = scratchbuf;
6355 + struct cl_mem_starttrans_msg *msg =
6356 + (struct cl_mem_starttrans_msg *) startbuf;
6358 + P_MEMB("Start transition - reason = %d\n", reason);
6360 + /* If this is a restart then zero the counters */
6361 + if (reason == TRANS_RESTART) {
6362 + agreeing_nodes = 0;
6363 + dissenting_nodes = 0;
6364 + if (node_opinion) {
6365 + kfree(node_opinion);
6366 + node_opinion = NULL;
6368 + responses_collected = 0;
6371 + /* If we have timed out too many times then just die */
6372 + if (reason == TRANS_RESTART
6373 + && ++transition_restarts > cman_config.transition_restarts) {
6374 + printk(KERN_WARNING CMAN_NAME
6375 + ": too many transition restarts - will die\n");
6376 + us->leave_reason = CLUSTER_LEAVEFLAG_INCONSISTENT;
6377 + node_state = LEFT_CLUSTER;
6379 + wake_up_process(membership_task);
6380 + wake_up_interruptible(&cnxman_waitq);
6383 + if (reason != TRANS_RESTART)
6384 + transition_restarts = 0;
6386 + /* Only keep the original state transition reason in the global
6388 + if (reason != TRANS_ANOTHERREMNODE && reason != TRANS_NEWMASTER &&
6389 + reason != TRANS_RESTART && reason != TRANS_DEADMASTER)
6390 + transitionreason = reason;
6392 + /* Save the info of the requesting node */
6393 + if (reason == TRANS_NEWNODE)
6394 + joining_node = node;
6396 + node_state = MASTER;
6397 + master_state = MASTER_START;
6398 + responses_collected = 0;
6399 + responses_expected = cluster_members - 1;
6401 + /* If we are on our own then just do it */
6402 + if (responses_expected == 0) {
6403 + P_MEMB("We are on our own...lonely here\n");
6404 + responses_collected--;
6405 + do_process_startack(NULL, NULL, 0);
6408 + int ptr = sizeof (struct cl_mem_starttrans_msg);
6409 + struct list_head *addrlist;
6410 + unsigned short num_addrs = 0;
6411 + int flags = MSG_REPLYEXP;
6413 + /* Send the STARTTRANS message */
6414 + msg->cmd = CLUSTER_MEM_STARTTRANS;
6415 + msg->reason = reason;
6416 + msg->votes = node->votes;
6417 + msg->expected_votes = cpu_to_le32(node->expected_votes);
6418 + msg->generation = cpu_to_le32(++cluster_generation);
6419 + msg->nodeid = cpu_to_le32(node->node_id);
6421 + if (reason == TRANS_NEWNODE) {
6422 + /* Add the addresses */
6423 + list_for_each(addrlist, &node->addr_list) {
6424 + struct cluster_node_addr *nodeaddr =
6425 + list_entry(addrlist,
6426 + struct cluster_node_addr, list);
6428 + memcpy(startbuf + ptr, nodeaddr->addr,
6430 + ptr += address_length;
6434 + /* And the name */
6435 + strcpy(startbuf + ptr, node->name);
6436 + ptr += strlen(node->name) + 1;
6439 + /* If another node died then we must queue the STARTTRANS
6440 + * messages so that membershipd can carry on processing the
6441 + * other replies */
6442 + if (reason == TRANS_ANOTHERREMNODE)
6443 + flags |= MSG_QUEUE;
6445 + msg->num_addrs = cpu_to_le16(num_addrs);
6446 + kcl_sendmsg(mem_socket, msg, ptr, NULL, 0, flags);
6448 + /* Set a timer in case we don't get 'em all back */
6449 + mod_timer(&transition_timer,
6450 + jiffies + cman_config.transition_timeout * HZ);
6454 +/* A node has died - decide what to do */
6455 +void a_node_just_died(struct cluster_node *node)
6457 + /* If we are not in the context of kmembershipd then stick it on the
6458 + * list and wake it */
6459 + if (current != membership_task) {
6460 + struct cl_new_dead_node *newnode =
6461 + kmalloc(sizeof (struct cl_new_dead_node), GFP_KERNEL);
6464 + newnode->node = node;
6465 + down(&new_dead_node_lock);
6466 + list_add_tail(&newnode->list, &new_dead_node_list);
6467 + set_bit(WAKE_FLAG_DEADNODE, &wake_flags);
6468 + up(&new_dead_node_lock);
6469 + wake_up_process(membership_task);
6470 + P_MEMB("Passing dead node %s onto kmembershipd\n", node->name);
6475 + down(&cluster_members_lock);
6476 + if (node->state == NODESTATE_MEMBER)
6477 + cluster_members--;
6478 + node->state = NODESTATE_DEAD;
6479 + up(&cluster_members_lock);
6481 + /* Notify listeners */
6482 + notify_kernel_listeners(DIED, (long) node->node_id);
6484 + /* If we are in normal operation then become master and initiate a
6485 + * state-transition */
6486 + if (node_state == MEMBER) {
6487 + start_transition(TRANS_REMNODE, node);
6491 + /* If we are a slave in transition then see if it's the master that has
6492 + * failed. If not then ignore it. If it /is/ the master then elect a
6494 + if (node_state == TRANSITION) {
6495 + if (master_node == node) {
6496 + if (elect_master(&node)) {
6497 + del_timer(&transition_timer);
6498 + node_state = MASTER;
6500 + start_transition(TRANS_DEADMASTER, master_node);
6503 + /* Someone else can be in charge - phew! */
6509 + /* If we are the master then we need to start the transition all over
6511 + if (node_state == MASTER) {
6512 + /* Cancel timer */
6513 + del_timer(&transition_timer);
6515 + /* Restart the transition */
6516 + start_transition(TRANS_ANOTHERREMNODE, node);
6517 + transition_restarts = 0;
6523 + * Build up and send a set of messages consisting of the whole cluster view.
6524 + * The first byte is the command (cmd as passed in), the second is a flag byte:
6525 + * bit 0 is set in the first message, bit 1 in the last (NOTE both may be set if
6526 + * this is the only message sent). The rest is a set of packed node entries, which
6527 + * are NOT split over packets. */
6528 +static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
6529 + unsigned int flags, unsigned int flags2)
6534 + int last_node_start = 2;
6535 + unsigned char first_packet_flag = 1;
6536 + struct list_head *nodelist;
6537 + struct list_head *temp;
6538 + struct cluster_node *node;
6539 + char *message = scratchbuf;
6543 + down(&cluster_members_lock);
6544 + list_for_each_safe(nodelist, temp, &cluster_members_list) {
6545 + node = list_entry(nodelist, struct cluster_node, list);
6547 + if (node->state == NODESTATE_MEMBER || node->state == NODESTATE_DEAD) {
6548 + unsigned int evotes;
6549 + unsigned int node_id;
6550 + unsigned short num_addrs = 0;
6551 + unsigned short num_addrs_le;
6552 + struct list_head *addrlist;
6554 + last_node_start = ptr;
6556 + message[ptr++] = len = strlen(node->name);
6557 + strcpy(&message[ptr], node->name);
6560 + message[ptr++] = node->state;
6562 + /* Count the number of addresses this node has */
6563 + list_for_each(addrlist, &node->addr_list) {
6567 + num_addrs_le = cpu_to_le16(num_addrs);
6568 + memcpy(&message[ptr], &num_addrs_le, sizeof (short));
6569 + ptr += sizeof (short);
6572 + list_for_each(addrlist, &node->addr_list) {
6574 + struct cluster_node_addr *nodeaddr =
6575 + list_entry(addrlist,
6576 + struct cluster_node_addr, list);
6578 + memcpy(&message[ptr], nodeaddr->addr,
6580 + ptr += address_length;
6583 + message[ptr++] = node->votes;
6585 + evotes = cpu_to_le32(node->expected_votes);
6586 + memcpy(&message[ptr], &evotes, sizeof (int));
6587 + ptr += sizeof (int);
6589 + node_id = cpu_to_le32(node->node_id);
6590 + memcpy(&message[ptr], &node_id, sizeof (int));
6591 + ptr += sizeof (int);
6593 + /* If the block is full then send it */
6594 + if (ptr > MAX_CLUSTER_MESSAGE) {
6595 + message[1] = first_packet_flag;
6597 + up(&cluster_members_lock);
6598 + status = kcl_sendmsg(mem_socket, message,
6599 + last_node_start, saddr,
6600 + saddr ? sizeof (struct sockaddr_cl) : 0,
6606 + down(&cluster_members_lock);
6608 + first_packet_flag = 0;
6609 + /* Copy the overflow back to the start of the
6610 + * buffer for the next send */
6611 + memcpy(&message[2], &message[last_node_start],
6612 + ptr - last_node_start);
6613 + ptr = ptr - last_node_start + 2;
6618 + up(&cluster_members_lock);
6620 + message[1] = first_packet_flag | 2; /* The last may also be first */
6621 + status = kcl_sendmsg(mem_socket, message, ptr,
6622 + saddr, saddr ? sizeof (struct sockaddr_cl) : 0,
6629 +/* Make the JOINING node into a MEMBER */
6630 +static void confirm_joiner()
6632 + if (joining_node && joining_node->state == NODESTATE_JOINING) {
6633 + down(&cluster_members_lock);
6634 + joining_node->state = NODESTATE_MEMBER;
6635 + cluster_members++;
6636 + up(&cluster_members_lock);
6640 +/* Reset HELLO timers for all nodes. We do this after a state-transition as we
6641 + * have had HELLOS disabled during the transition and if we don't do this the
6642 + * nodes will go on an uncontrolled culling-spree afterwards */
6643 +static void reset_hello_time()
6645 + struct list_head *nodelist;
6646 + struct cluster_node *node;
6648 + down(&cluster_members_lock);
6649 + list_for_each(nodelist, &cluster_members_list) {
6650 + node = list_entry(nodelist, struct cluster_node, list);
6652 + if (node->state == NODESTATE_MEMBER) {
6653 + node->last_hello = jiffies;
6657 + up(&cluster_members_lock);
6660 +/* Calculate the new quorum and return the value. do *not* set it in here as
6661 + * cnxman calls this to check if a new expected_votes value is valid. It
6662 + * (optionally) returns the total number of votes in the cluster */
6663 +int calculate_quorum(int allow_decrease, int max_expected, int *ret_total_votes)
6665 + struct list_head *nodelist;
6666 + struct cluster_node *node;
6667 + unsigned int total_votes = 0;
6668 + unsigned int highest_expected = 0;
6669 + unsigned int newquorum, q1, q2;
6671 + down(&cluster_members_lock);
6672 + list_for_each(nodelist, &cluster_members_list) {
6673 + node = list_entry(nodelist, struct cluster_node, list);
6675 + if (node->state == NODESTATE_MEMBER) {
6676 + highest_expected =
6677 + max(highest_expected, node->expected_votes);
6678 + total_votes += node->votes;
6681 + up(&cluster_members_lock);
6682 + if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
6683 + total_votes += quorum_device->votes;
6685 + if (max_expected > 0)
6686 + highest_expected = max_expected;
6688 + /* This quorum calculation is taken from the OpenVMS Cluster Systems
6689 + * manual, but, then, you guessed that didn't you */
6690 + q1 = (highest_expected + 2) / 2;
6691 + q2 = (total_votes + 2) / 2;
6692 + newquorum = max(q1, q2);
6694 + /* Normally quorum never decreases but the system administrator can
6695 + * force it down by setting expected votes to a maximum value */
6696 + if (!allow_decrease)
6697 + newquorum = max(quorum, newquorum);
6699 + /* The special two_node mode allows each of the two nodes to retain
6700 + * quorum if the other fails. Only one of the two should live past
6701 + * fencing (as both nodes try to fence each other in split-brain.) */
6705 + if (ret_total_votes)
6706 + *ret_total_votes = total_votes;
6710 +/* Recalculate cluster quorum, set quorate and notify changes */
6711 +void recalculate_quorum(int allow_decrease)
6715 + quorum = calculate_quorum(allow_decrease, 0, &total_votes);
6716 + set_quorate(total_votes);
6717 + notify_listeners();
6720 +/* Add new node address to an existing node */
6721 +int add_node_address(struct cluster_node *node, unsigned char *addr, int len)
6723 + struct cluster_node_addr *newaddr;
6725 + newaddr = kmalloc(sizeof (struct cluster_node_addr), GFP_KERNEL);
6729 + memcpy(newaddr->addr, addr, len);
6730 + newaddr->addr_len = len;
6731 + list_add_tail(&newaddr->list, &node->addr_list);
6736 +static struct cluster_node *add_new_node(char *name, unsigned char votes,
6737 + unsigned int expected_votes,
6738 + int node_id, int state)
6740 + struct cluster_node *newnode;
6742 + /* Look for a dead node with this name */
6743 + newnode = find_node_by_name(name);
6745 + /* Is it already joining */
6746 + if (newnode && newnode->state == NODESTATE_JOINING)
6749 + /* Update existing information */
6750 + if (newnode && newnode->state == NODESTATE_DEAD) {
6751 + newnode->last_hello = jiffies;
6752 + newnode->votes = votes;
6753 + newnode->expected_votes = expected_votes;
6754 + newnode->state = state;
6756 + newnode->leave_reason = 0;
6757 + newnode->last_seq_recv = 0;
6758 + newnode->last_seq_acked = 0;
6759 + newnode->last_seq_sent = 0;
6760 + newnode->incarnation++;
6761 + do_gettimeofday(&newnode->join_time);
6762 + /* Don't overwrite the node ID */
6764 + if (state == NODESTATE_MEMBER) {
6765 + down(&cluster_members_lock);
6766 + cluster_members++;
6767 + up(&cluster_members_lock);
6770 + printk(KERN_INFO CMAN_NAME ": node %s rejoining\n", name);
6774 + newnode = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
6778 + memset(newnode, 0, sizeof (struct cluster_node));
6779 + newnode->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
6780 + if (!newnode->name)
6783 + strcpy(newnode->name, name);
6784 + newnode->last_hello = jiffies;
6785 + newnode->votes = votes;
6786 + newnode->expected_votes = expected_votes;
6787 + newnode->state = state;
6788 + newnode->node_id = node_id;
6790 + newnode->leave_reason = 0;
6791 + newnode->last_seq_recv = 0;
6792 + newnode->last_seq_acked = 0;
6793 + newnode->last_seq_sent = 0;
6794 + newnode->incarnation = 0;
6795 + do_gettimeofday(&newnode->join_time);
6796 + INIT_LIST_HEAD(&newnode->addr_list);
6797 + set_nodeid(newnode, node_id);
6799 + /* Add the new node to the list */
6800 + down(&cluster_members_lock);
6801 + list_add(&newnode->list, &cluster_members_list);
6802 + if (state == NODESTATE_MEMBER)
6803 + cluster_members++;
6804 + up(&cluster_members_lock);
6806 + printk(KERN_INFO CMAN_NAME ": got node %s\n", name);
6812 + send_leave(CLUSTER_LEAVEFLAG_PANIC);
6814 + printk(KERN_CRIT CMAN_NAME
6815 + ": Cannot allocate memory for new cluster node %s\n", name);
6817 + panic("cluster memory allocation failed");
6822 +/* Remove node from a STARTTRANS message */
6823 +static struct cluster_node *remove_node(int nodeid)
6825 + struct cluster_node *node = find_node_by_nodeid(nodeid);
6827 + if (node && node->state == NODESTATE_MEMBER) {
6828 + P_MEMB("starttrans removes node %s\n", node->name);
6829 + down(&cluster_members_lock);
6830 + node->state = NODESTATE_DEAD;
6831 + cluster_members--;
6832 + up(&cluster_members_lock);
6834 + notify_kernel_listeners(DIED, (long) nodeid);
6836 + /* If this node is us then go quietly */
6838 + printk(KERN_INFO CMAN_NAME
6839 + ": killed by STARTTRANS or NOMINATE\n");
6840 + node_state = LEFT_CLUSTER;
6842 + wake_up_process(membership_task);
6843 + wake_up_interruptible(&cnxman_waitq);
6849 +/* Add a node from a STARTTRANS or NOMINATE message */
6850 +static void add_node_from_starttrans(struct msghdr *msg, char *buf, int len)
6852 + /* Add the new node but don't fill in the ID until the master has
6854 + struct cl_mem_starttrans_msg *startmsg =
6855 + (struct cl_mem_starttrans_msg *)buf;
6856 + int ptr = sizeof (struct cl_mem_starttrans_msg);
6858 + char *name = buf + ptr + le16_to_cpu(startmsg->num_addrs) * address_length;
6859 + char *nodeaddr = buf + sizeof(struct cl_mem_starttrans_msg);
6861 + joining_node = add_new_node(name, startmsg->votes,
6862 + le32_to_cpu(startmsg->expected_votes),
6863 + 0, NODESTATE_JOINING);
6865 + /* add_new_node returns NULL if the node already exists */
6866 + if (!joining_node)
6867 + joining_node = find_node_by_name(name);
6869 + /* Add the node's addresses */
6870 + if (list_empty(&joining_node->addr_list)) {
6871 + for (i = 0; i < le16_to_cpu(startmsg->num_addrs); i++) {
6872 + add_node_address(joining_node, buf + ptr, address_length);
6873 + ptr += address_length;
6877 + /* Make sure we have a temp nodeid for the new node in case we
6879 + joining_temp_nodeid = new_temp_nodeid(nodeaddr,
6883 +/* We have been nominated as master for a transition */
6884 +static int do_process_nominate(struct msghdr *msg, char *buf, int len)
6886 + struct cl_mem_starttrans_msg *startmsg =
6887 + (struct cl_mem_starttrans_msg *)buf;
6888 + struct cluster_node *node = NULL;
6890 + P_MEMB("nominate reason is %d\n", startmsg->reason);
6892 + if (startmsg->reason == TRANS_REMNODE) {
6893 + node = remove_node(le32_to_cpu(startmsg->nodeid));
6896 + if (startmsg->reason == TRANS_NEWNODE) {
6897 + add_node_from_starttrans(msg, buf, len);
6898 + node = joining_node;
6901 + /* This should be a TRANS_CHECK but start_transition needs some node
6905 + start_transition(startmsg->reason, node);
6909 +/* Got a STARTACK response from a node */
6910 +static int do_process_startack(struct msghdr *msg, char *buf, int len)
6912 + if (node_state != MASTER && master_state != MASTER_START) {
6913 + P_MEMB("Got StartACK when not in MASTER_STARTING substate\n");
6917 + /* buf is NULL if we are called directly from start_transition */
6919 + struct cl_mem_startack_msg *ackmsg =
6920 + (struct cl_mem_startack_msg *)buf;
6922 + /* Ignore any messages with old generation numbers in them */
6923 + if (le32_to_cpu(ackmsg->generation) != cluster_generation) {
6924 + P_MEMB("Got old generation START-ACK msg - ignoring\n");
6929 + /* If the node_id is non-zero then use it. */
6930 + if (transitionreason == TRANS_NEWNODE && joining_node && msg) {
6931 + struct cl_mem_startack_msg *ackmsg =
6932 + (struct cl_mem_startack_msg *)buf;
6934 + if (ackmsg->node_id) {
6935 + set_nodeid(joining_node, le32_to_cpu(ackmsg->node_id));
6938 + max(highest_nodeid, le32_to_cpu(ackmsg->highest_node_id));
6939 + P_MEMB("Node id = %d, highest node id = %d\n",
6940 + le32_to_cpu(ackmsg->node_id),
6941 + le32_to_cpu(ackmsg->highest_node_id));
6944 + /* If we have all the responses in then move to the next stage */
6945 + if (++responses_collected == responses_expected) {
6947 + /* If the new node has no node_id (ie nobody in the cluster has
6948 + * heard of it before) then assign it a new one */
6949 + if (transitionreason == TRANS_NEWNODE && joining_node) {
6951 + max(highest_nodeid, get_highest_nodeid());
6952 + if (joining_node->node_id == 0) {
6953 + set_nodeid(joining_node, ++highest_nodeid);
6955 + P_MEMB("nodeIDs: new node: %d, highest: %d\n",
6956 + joining_node->node_id, highest_nodeid);
6959 + /* Behave a little differently if we are on our own */
6960 + if (cluster_members == 1) {
6961 + if (transitionreason == TRANS_NEWNODE) {
6962 + /* If the cluster is just us then confirm at
6964 + joinconf_count = 0;
6965 + mod_timer(&transition_timer,
6967 + cman_config.joinconf_timeout * HZ);
6971 + else { /* Node leaving the cluster */
6972 + recalculate_quorum(leavereason);
6974 + node_state = MEMBER;
6978 + master_state = MASTER_COLLECT;
6979 + responses_collected = 0;
6980 + responses_expected = cluster_members - 1;
6981 + P_MEMB("Sending MASTERVIEW: expecting %d responses\n",
6982 + responses_expected);
6984 + send_cluster_view(CLUSTER_MEM_MASTERVIEW, NULL, 0, MSG_REPLYEXP);
6986 + /* Set a timer in case we don't get 'em all back */
6987 + mod_timer(&transition_timer,
6989 + cman_config.transition_timeout * HZ);
6995 +/* Got a VIEWACK response from a node */
6996 +static int do_process_viewack(struct msghdr *msg, char *reply, int len)
6998 + struct sockaddr_cl *saddr = msg->msg_name;
7000 + if (node_opinion == NULL) {
7002 + kmalloc((1 + highest_nodeid) * sizeof (uint8_t), GFP_KERNEL);
7003 + if (!node_opinion) {
7004 + panic(": malloc agree/dissent failed\n");
7006 + memset(node_opinion, 0, (1 + highest_nodeid) * sizeof (uint8_t));
7009 + /* Keep a list of agreeing and dissenting nodes */
7010 + if (reply[1] == 1) {
7011 + /* ACK - remote node agrees with me */
7012 + P_MEMB("Node agrees\n");
7013 + node_opinion[saddr->scl_nodeid] = OPINION_AGREE;
7017 + /* Remote node disagrees */
7018 + P_MEMB("Node disagrees\n");
7019 + node_opinion[saddr->scl_nodeid] = OPINION_DISAGREE;
7020 + dissenting_nodes++;
7023 + P_MEMB("got %d responses, expected %d\n", responses_collected + 1,
7024 + responses_expected);
7026 + /* Are all the results in yet ? */
7027 + if (++responses_collected == responses_expected) {
7028 + del_timer(&transition_timer);
7030 + P_MEMB("The results are in: %d agree, %d dissent\n",
7031 + agreeing_nodes, dissenting_nodes);
7033 + if (agreeing_nodes > dissenting_nodes) {
7034 + /* Kill dissenting nodes */
7037 + for (i = 1; i <= responses_collected; i++) {
7038 + if (node_opinion[i] == OPINION_DISAGREE)
7043 + /* We must leave the cluster as we are in a minority,
7044 + * the rest of them can fight it out amongst
7046 + us->leave_reason = CLUSTER_LEAVEFLAG_INCONSISTENT;
7047 + agreeing_nodes = 0;
7048 + dissenting_nodes = 0;
7049 + kfree(node_opinion);
7050 + node_opinion = NULL;
7051 + node_state = LEFT_CLUSTER;
7053 + wake_up_process(membership_task);
7054 + wake_up_interruptible(&cnxman_waitq);
7058 + /* Reset counters */
7059 + agreeing_nodes = 0;
7060 + dissenting_nodes = 0;
7061 + kfree(node_opinion);
7062 + node_opinion = NULL;
7064 + /* Confirm new node */
7065 + if (transitionreason == TRANS_NEWNODE) {
7066 + mod_timer(&transition_timer,
7067 + jiffies + cman_config.joinconf_timeout * HZ);
7068 + joinconf_count = 0;
7073 + master_state = MASTER_COMPLETE;
7081 +/* Got an ENDTRANS message */
7082 +static int do_process_endtrans(struct msghdr *msg, char *buf, int len)
7084 + struct cl_mem_endtrans_msg *endmsg =
7085 + (struct cl_mem_endtrans_msg *)buf;
7086 + struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
7088 + /* Someone else's state transition */
7089 + if (node_state != TRANSITION && node_state != JOINACK)
7092 + /* Check we got it from the MASTER node */
7093 + if (master_node && master_node->node_id != saddr->scl_nodeid) {
7095 + "Got ENDTRANS from a node not the master: master: %d, sender: %d\n",
7096 + master_node->node_id, saddr->scl_nodeid);
7100 + del_timer(&transition_timer);
7102 + /* Set node ID on new node */
7103 + if (endmsg->new_node_id) {
7104 + set_nodeid(joining_node, le32_to_cpu(endmsg->new_node_id));
7105 + P_MEMB("new node %s has ID %d\n", joining_node->name,
7106 + joining_node->node_id);
7109 + node_state = TRANSITION_COMPLETE;
7111 + /* Need to set this here or the barrier code will reject us if we've
7113 + we_are_a_cluster_member = TRUE;
7116 + cluster_generation = le32_to_cpu(endmsg->generation);
7118 + if (wait_for_completion_barrier() != 0) {
7119 + P_MEMB("Barrier timed out - restart\n");
7120 + node_state = TRANSITION;
7121 + mod_timer(&transition_timer,
7122 + jiffies + cman_config.transition_timeout * HZ);
7126 + quorum = le32_to_cpu(endmsg->quorum);
7127 + set_quorate(le32_to_cpu(endmsg->total_votes));
7128 + highest_nodeid = get_highest_nodeid();
7130 + /* Tell any waiting barriers that we had a transition */
7131 + check_barrier_returns();
7133 + purge_temp_nodeids();
7135 + /* Clear the master node */
7136 + master_node = NULL;
7138 + node_state = MEMBER;
7140 + /* Notify other listeners that transition has completed */
7141 + notify_listeners();
7142 + reset_hello_time();
7143 + transition_end_time = jiffies;
7145 + sm_member_update(cluster_is_quorate);
7149 +/* Turn a STARTTRANS message into NOMINATE and send it to the new master */
7150 +static int send_nominate(struct cl_mem_starttrans_msg *startmsg, int msglen,
7153 + struct sockaddr_cl maddr;
7155 + maddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
7156 + maddr.scl_family = AF_CLUSTER;
7157 + maddr.scl_nodeid = nodeid;
7159 + startmsg->cmd = CLUSTER_MEM_NOMINATE;
7160 + return kcl_sendmsg(mem_socket, startmsg, msglen,
7161 + &maddr, sizeof (maddr), 0);
7164 +/* Got a STARTTRANS message */
7165 +static int do_process_starttrans(struct msghdr *msg, char *buf, int len)
7167 + struct cl_mem_starttrans_msg *startmsg =
7168 + (struct cl_mem_starttrans_msg *)buf;
7169 + struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
7170 + struct cluster_node *node;
7171 + unsigned int newgen = le32_to_cpu(startmsg->generation);
7173 + /* Got a WHAT from WHOM? */
7174 + node = find_node_by_nodeid(saddr->scl_nodeid);
7175 + if (!node || node->state != NODESTATE_MEMBER)
7178 + /* Someone else's state transition */
7179 + if (node_state != MEMBER &&
7180 + node_state != TRANSITION && node_state != MASTER)
7183 + /* Ignore old generation STARTTRANS messages */
7184 + if ((newgen < cluster_generation) ||
7185 + (newgen == 0xFFFFFFFF && cluster_generation == 0)) {
7186 + P_MEMB("Ignoring STARTTRANS with old generation number\n");
7190 + P_MEMB("Got starttrans: newgen = %d, oldgen = %d, reason = %d\n",
7191 + newgen, cluster_generation, startmsg->reason);
7193 + /* Up the generation number */
7194 + cluster_generation = newgen;
7196 + /* If we are also a master then decide between us */
7197 + if (node_state == MASTER) {
7199 + /* See if we really want the responsibility of being master */
7200 + if (elect_master(&node)) {
7202 + /* I reluctantly accept this position of responsibility
7204 + P_MEMB("I elected myself master\n");
7206 + /* start_transition will re-establish this */
7207 + del_timer(&transition_timer);
7209 + start_transition(TRANS_NEWMASTER, node);
7214 + P_MEMB("Backing down from MASTER status\n");
7215 + master_node = node;
7216 + node_state = MEMBER;
7218 + /* If we were bringing a new node into the cluster then
7219 + * we will have to abandon that now and tell the new
7220 + * node to try again later */
7221 + if (transitionreason == TRANS_NEWNODE && joining_node) {
7222 + struct cluster_node_addr *first_addr =
7223 + (struct cluster_node_addr *) joining_node->
7226 + P_MEMB("Postponing membership of node %s\n",
7227 + joining_node->name);
7228 + send_joinack(first_addr->addr, address_length,
7229 + JOINACK_TYPE_WAIT);
7231 + /* Not dead, just sleeping */
7232 + joining_node->state = NODESTATE_DEAD;
7233 + joining_node = NULL;
7236 + /* If the new master is not us OR the node we just got
7237 + * the STARTTRANS from then make sure it knows it has
7239 + if (saddr->scl_nodeid != node->node_id) {
7240 + send_nominate(startmsg, len, node->node_id);
7244 + /* Fall through into MEMBER code below if we are
7245 + * obeying the STARTTRANS we just received */
7249 + /* Do non-MASTER STARTTRANS bits */
7250 + if (node_state == MEMBER) {
7251 + int ptr = sizeof (struct cl_mem_starttrans_msg);
7254 + P_MEMB("Normal transition start\n");
7256 + /* If the master is adding a new node and we know it's node ID
7257 + * then ACK with it. */
7258 + if (startmsg->reason == TRANS_NEWNODE) {
7259 + struct cluster_node *node =
7260 + find_node_by_addr((char *) startmsg + ptr,
7263 + node_id = node->node_id;
7266 + /* Save the master info */
7267 + master_node = find_node_by_nodeid(saddr->scl_nodeid);
7268 + node_state = TRANSITION;
7270 + if (startmsg->reason == TRANS_NEWNODE) {
7271 + add_node_from_starttrans(msg, buf, len);
7274 + if (startmsg->reason == TRANS_REMNODE ||
7275 + startmsg->reason == TRANS_ANOTHERREMNODE) {
7276 + remove_node(le32_to_cpu(startmsg->nodeid));
7279 + send_startack(saddr, msg->msg_namelen,
7282 + /* Establish timer in case the master dies */
7283 + mod_timer(&transition_timer,
7284 + jiffies + cman_config.transition_timeout * HZ);
7289 + /* We are in transition but this may be a restart */
7290 + if (node_state == TRANSITION) {
7292 + master_node = find_node_by_nodeid(saddr->scl_nodeid);
7293 + send_startack(saddr, msg->msg_namelen, 0);
7295 + /* Is it a new joining node ? This happens if a master is
7297 + if (startmsg->reason == TRANS_NEWNODE) {
7298 + struct cluster_node *oldjoin = joining_node;
7300 + add_node_from_starttrans(msg, buf, len);
7302 + /* If this is a different node joining than the one we
7303 + * were previously joining (probably cos the master is
7304 + * a nominated one) then mark our "old" joiner as DEAD.
7305 + * The original master will already have told the node
7306 + * to go back into JOINWAIT state */
7307 + if (oldjoin && oldjoin != joining_node
7308 + && oldjoin->state == NODESTATE_JOINING)
7309 + oldjoin->state = NODESTATE_DEAD;
7312 + /* Is it a new master node? */
7313 + if (startmsg->reason == TRANS_NEWMASTER ||
7314 + startmsg->reason == TRANS_DEADMASTER) {
7315 + P_MEMB("starttrans %s, node=%d\n",
7316 + startmsg->reason ==
7317 + TRANS_NEWMASTER ? "NEWMASTER" : "DEADMASTER",
7318 + le32_to_cpu(startmsg->nodeid));
7320 + /* If the old master has died then remove it */
7321 + if (startmsg->reason == TRANS_DEADMASTER) {
7322 + remove_node(le32_to_cpu(startmsg->nodeid));
7325 + /* Store new master */
7326 + master_node = find_node_by_nodeid(saddr->scl_nodeid);
7329 + /* Another node has died (or been killed) */
7330 + if (startmsg->reason == TRANS_ANOTHERREMNODE) {
7331 + /* Remove new dead node */
7332 + remove_node(le32_to_cpu(startmsg->nodeid));
7334 + /* Restart the timer */
7335 + del_timer(&transition_timer);
7336 + mod_timer(&transition_timer,
7337 + jiffies + cman_config.transition_timeout * HZ);
7343 +/* Change a cluster parameter */
7344 +static int do_process_reconfig(struct msghdr *msg, char *buf, int len)
7346 + struct cl_mem_reconfig_msg *confmsg;
7347 + struct sockaddr_cl *saddr = msg->msg_name;
7348 + struct cluster_node *node;
7351 + if (len < sizeof(struct cl_mem_reconfig_msg))
7354 + confmsg = (struct cl_mem_reconfig_msg *)buf;
7355 + val = le32_to_cpu(confmsg->value);
7357 + switch (confmsg->param) {
7359 + case RECONFIG_PARAM_EXPECTED_VOTES:
7360 + /* Set any nodes with expected_votes higher than the new value
7363 + struct cluster_node *node;
7365 + down(&cluster_members_lock);
7366 + list_for_each_entry(node, &cluster_members_list, list) {
7367 + if (node->state == NODESTATE_MEMBER &&
7368 + node->expected_votes > val) {
7369 + node->expected_votes = val;
7372 + up(&cluster_members_lock);
7373 + if (expected_votes > val)
7374 + expected_votes = val;
7376 + recalculate_quorum(1); /* Allow decrease */
7377 + sm_member_update(cluster_is_quorate);
7380 + case RECONFIG_PARAM_NODE_VOTES:
7381 + node = find_node_by_nodeid(saddr->scl_nodeid);
7382 + node->votes = val;
7383 + recalculate_quorum(1); /* Allow decrease */
7384 + sm_member_update(cluster_is_quorate);
7387 + case RECONFIG_PARAM_CONFIG_VERSION:
7388 + config_version = val;
7392 + printk(KERN_INFO CMAN_NAME
7393 + ": got unknown parameter in reconfigure message. %d\n",
7400 +/* Response from master node */
7401 +static int do_process_joinack(struct msghdr *msg, char *buf, int len)
7403 + struct cl_mem_joinack_msg *ackmsg =
7404 + (struct cl_mem_joinack_msg *)buf;
7406 + join_time = jiffies;
7407 + if (ackmsg->acktype == JOINACK_TYPE_OK) {
7408 + node_state = JOINACK;
7411 + if (ackmsg->acktype == JOINACK_TYPE_NAK) {
7412 + printk(KERN_WARNING CMAN_NAME
7413 + ": Cluster membership rejected\n");
7414 + P_MEMB("Got JOINACK NACK\n");
7415 + node_state = REJECTED;
7418 + if (ackmsg->acktype == JOINACK_TYPE_WAIT) {
7419 + P_MEMB("Got JOINACK WAIT\n");
7420 + node_state = JOINWAIT;
7421 + joinwait_time = jiffies;
7427 +/* Check a JOINREQ message for validity,
7428 + return -1 if we can't let the node join our cluster */
7429 +static int validate_joinmsg(struct cl_mem_join_msg *joinmsg, int len)
7431 + struct cluster_node *node;
7433 + /* Check version number */
7434 + if (le32_to_cpu(joinmsg->major_version) == CNXMAN_MAJOR_VERSION) {
7435 + char *ptr = (char *) joinmsg;
7438 + ptr += sizeof (*joinmsg);
7439 + name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
7441 + /* Sanity-check the num_addrs field otherwise we could oops */
7442 + if (le16_to_cpu(joinmsg->num_addr) * address_length > len) {
7443 + printk(KERN_WARNING CMAN_NAME
7444 + ": num_addr in JOIN-REQ message is rubbish: %d\n",
7445 + le16_to_cpu(joinmsg->num_addr));
7449 + /* Check the cluster name matches */
7450 + if (strcmp(cluster_name, joinmsg->clustername)) {
7451 + printk(KERN_WARNING CMAN_NAME
7452 + ": attempt to join with cluster name '%s' refused\n",
7453 + joinmsg->clustername);
7457 + /* Check we are not exceeding the maximum number of nodes */
7458 + if (cluster_members >= cman_config.max_nodes) {
7459 + printk(KERN_WARNING CMAN_NAME
7460 + ": Join request from %s rejected, exceeds maximum number of nodes\n",
7465 + /* Check that we don't exceed the two_node limit, if applicable */
7466 + if (two_node && cluster_members == 2) {
7467 + printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7468 + "rejected, exceeds two node limit\n", name);
7472 + if (le32_to_cpu(joinmsg->config_version) != config_version) {
7473 + printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7474 + "rejected, config version local %u remote %u\n",
7475 + name, config_version,
7476 + le32_to_cpu(joinmsg->config_version));
7480 + /* Validate requested static node ID */
7481 + if (joinmsg->nodeid &&
7482 + (node = find_node_by_nodeid(le32_to_cpu(joinmsg->nodeid))) &&
7483 + (node->state != NODESTATE_DEAD ||
7484 + (strcmp(node->name, name)))) {
7485 + printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7486 + "rejected, node ID %d already in use by %s\n",
7487 + name, node->node_id, node->name);
7490 + if (joinmsg->nodeid &&
7491 + (node = find_node_by_name(name)) &&
7492 + (node->state != NODESTATE_DEAD ||
7493 + node->node_id != le32_to_cpu(joinmsg->nodeid))) {
7494 + printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7495 + "rejected, wanted node %d but previously had %d\n",
7496 + name, le32_to_cpu(joinmsg->nodeid), node->node_id);
7500 + /* If these don't match then I don't know how the message
7501 + arrived! However, I can't take the chance */
7502 + if (le32_to_cpu(joinmsg->addr_len) != address_length) {
7503 + printk(KERN_WARNING CMAN_NAME ": Join request from %s "
7504 + "rejected, address length local: %u remote %u\n",
7505 + name, address_length,
7506 + le32_to_cpu(joinmsg->addr_len));
7511 + /* Version number mismatch, don't use any part of the message
7512 + * other than the version numbers as things may have moved */
7513 + printk(KERN_INFO CMAN_NAME
7514 + ": Got join message from node running incompatible software. (us: %d.%d.%d, them: %d.%d.%d)\n",
7515 + CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
7516 + CNXMAN_PATCH_VERSION,
7517 + le32_to_cpu(joinmsg->major_version),
7518 + le32_to_cpu(joinmsg->minor_version),
7519 + le32_to_cpu(joinmsg->patch_version));
7526 +/* Request to join the cluster. This makes us the master for this state
7528 +static int do_process_joinreq(struct msghdr *msg, char *buf, int len)
7530 + static unsigned long last_joinreq = 0;
7531 + static char last_name[MAX_CLUSTER_MEMBER_NAME_LEN];
7532 + struct cl_mem_join_msg *joinmsg = (struct cl_mem_join_msg *)buf;
7533 + struct cluster_node *node;
7534 + char *ptr = (char *) joinmsg;
7537 + struct sockaddr_cl *addr = msg->msg_name;
7539 + ptr += sizeof (*joinmsg);
7540 + name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
7542 + /* If we are in a state transition then tell the new node to wait a bit
7544 + if (node_state != MEMBER) {
7545 + if (node_state == MASTER || node_state == TRANSITION) {
7546 + send_joinack(msg->msg_name, msg->msg_namelen,
7547 + JOINACK_TYPE_WAIT);
7552 + /* Reject application if message is invalid for any reason */
7553 + if (validate_joinmsg(joinmsg, len)) {
7554 + send_joinack(msg->msg_name, msg->msg_namelen,
7555 + JOINACK_TYPE_NAK);
7559 + /* Do we already know about this node? */
7560 + if (check_duplicate_node(name, msg, len) < 0) {
7561 + send_joinack(msg->msg_name, msg->msg_namelen,
7562 + JOINACK_TYPE_NAK);
7566 + /* Duplicate checking: Because joining messages do not have
7567 + * sequence numbers we may get as many JOINREQ messages as we
7568 + * have interfaces. This bit of code here just checks for
7569 + * JOINREQ messages that come in from the same node in a small
7570 + * period of time and removes the duplicates */
7571 + if (time_before(jiffies, last_joinreq + 10 * HZ)
7572 + && strcmp(name, last_name) == 0) {
7576 + /* OK, you can be in my gang */
7577 + last_joinreq = jiffies;
7578 + strcpy(last_name, name);
7580 + node = add_new_node(name, joinmsg->votes,
7581 + le32_to_cpu(joinmsg->expected_votes),
7582 + le32_to_cpu(joinmsg->nodeid),
7583 + NODESTATE_JOINING);
7585 + /* Add the node's addresses */
7586 + if (list_empty(&node->addr_list)) {
7587 + for (i = 0; i < le16_to_cpu(joinmsg->num_addr);
7589 + add_node_address(node, ptr, address_length);
7590 + ptr += address_length;
7593 + send_joinack(msg->msg_name, msg->msg_namelen,
7595 + joining_node = node;
7596 + joining_temp_nodeid = addr->scl_nodeid;
7598 + /* Start the state transition */
7599 + start_transition(TRANS_NEWNODE, node);
7604 +/* A simple function to invent a small number based
7605 + on the node name */
7606 +static int node_hash(void)
7611 + for (i=0; i<strlen(nodename); i++) {
7612 + value += nodename[i];
7614 + return (value & 0xF) + 1;
7618 +/* Return the low 32 bits of our IP address */
7619 +static uint32_t low32_of_ip()
7621 + struct cluster_node_addr *addr;
7624 + addr = list_entry(us->addr_list.next, struct cluster_node_addr, list);
7625 + memcpy(&lowip, addr->addr+address_length-sizeof(uint32_t), sizeof(uint32_t));
7627 + memcpy(&lowip, addr->addr - sizeof(uint32_t)*2, sizeof(uint32_t));
7632 +/* A new node has stated its intent to form a new cluster. we may have
7633 + * something to say about that... */
7634 +static int do_process_newcluster(struct msghdr *msg, char *buf, int len)
7636 + /* If we are also in STARTING state then back down for a random period
7638 + if (node_state == STARTING) {
7639 + P_MEMB("got NEWCLUSTER, backing down for %d seconds\n", node_hash());
7640 + start_time = jiffies + node_hash() * HZ;
7643 + if (node_state == NEWCLUSTER) {
7646 + memcpy(&otherip, buf+1, sizeof(otherip));
7647 + otherip = le32_to_cpu(otherip);
7648 + P_MEMB("got NEWCLUSTER, remote ip = %x, us = %x\n", otherip, low32_of_ip());
7649 + if (otherip < low32_of_ip())
7650 + node_state = STARTING;
7653 + if (node_state == MEMBER)
7659 +/* Called for each node by the node-message unpacker. Returns -1 if there is a
7660 + * mismatch and the caller will stop processing */
7661 +static int check_node(struct cluster_node *newnode, char *addrs,
7662 + unsigned short num_addr)
7664 + struct cluster_node *node = find_node_by_name(newnode->name);
7666 + P_MEMB("check_node: %s", newnode->name);
7669 + C_MEMB(" - not found\n");
7673 + if (node->votes != newnode->votes ||
7674 + node->node_id != newnode->node_id ||
7675 + node->state != newnode->state) {
7676 + C_MEMB(" - wrong info: votes=%d(exp: %d) id=%d(exp: %d) state = %d\n",
7677 + node->votes, newnode->votes, node->node_id,
7678 + newnode->node_id, node->state);
7681 + C_MEMB(" - OK\n");
7685 +/* Called for each new node found in a JOINCONF message. Create a new node
7687 +static int add_node(struct cluster_node *node, char *addrs,
7688 + unsigned short num_addr)
7690 + P_MEMB("add_node: %s, v:%d, e:%d, i:%d\n", node->name, node->votes,
7691 + node->expected_votes, node->node_id);
7693 + if (!find_node_by_name(node->name)) {
7694 + struct cluster_node *newnode;
7698 + add_new_node(node->name, node->votes, node->expected_votes,
7699 + node->node_id, node->state)) == NULL) {
7700 + P_MEMB("Error adding node\n");
7703 + if (list_empty(&newnode->addr_list)) {
7704 + for (i = 0; i < num_addr; i++) {
7705 + add_node_address(newnode,
7706 + addrs + i * address_length, address_length);
7712 + P_MEMB("Already got node with name %s\n", node->name);
7717 +/* Call a specified routine for each node unpacked from the message. Return
7718 + * either the number of nodes found or -1 for an error */
7719 +static int unpack_nodes(unsigned char *buf, int len,
7720 + int (*routine) (struct cluster_node *, char *,
7724 + int num_nodes = 0;
7725 + char nodename[MAX_CLUSTER_MEMBER_NAME_LEN];
7726 + struct cluster_node node;
7728 + node.name = nodename;
7730 + while (ptr < len) {
7731 + int namelen = buf[ptr++];
7732 + unsigned int evotes;
7733 + unsigned int node_id;
7734 + unsigned short num_addr;
7735 + unsigned char *addrs;
7737 + memcpy(nodename, &buf[ptr], namelen);
7738 + nodename[namelen] = '\0';
7741 + node.state = buf[ptr++];
7743 + memcpy(&num_addr, &buf[ptr], sizeof (short));
7744 + num_addr = le16_to_cpu(num_addr);
7745 + ptr += sizeof (short);
7747 + /* Just make a note of the addrs "array" */
7748 + addrs = &buf[ptr];
7749 + ptr += num_addr * address_length;
7751 + node.votes = buf[ptr++];
7753 + memcpy(&evotes, &buf[ptr], sizeof (int));
7754 + node.expected_votes = le32_to_cpu(evotes);
7755 + ptr += sizeof (int);
7757 + memcpy(&node_id, &buf[ptr], sizeof (int));
7758 + node.node_id = le32_to_cpu(node_id);
7759 + ptr += sizeof (int);
7761 + /* Call the callback routine */
7762 + if (routine(&node, addrs, num_addr) < 0)
7765 + /* Return the number of MEMBER nodes */
7766 + if (node.state == NODESTATE_MEMBER)
7772 +/* Got join confirmation from a master node. This message contains a list of
7773 + * cluster nodes which we unpack and build into our cluster nodes list. When we
7774 + * have the last message we can go into TRANSITION state */
7775 +static int do_process_joinconf(struct msghdr *msg, char *buf, int len)
7777 + if (unpack_nodes(buf + 2, len - 2, add_node) < 0) {
7779 + ": Error procssing joinconf message - giving up on cluster join\n");
7780 + us->leave_reason = CLUSTER_LEAVEFLAG_PANIC;
7781 + node_state = LEFT_CLUSTER;
7785 + /* Last message in the list? */
7788 + struct sockaddr_cl *addr = msg->msg_name;
7790 + us->state = NODESTATE_MEMBER;
7791 + node_state = TRANSITION;
7792 + we_are_a_cluster_member = TRUE;
7794 + ackmsg = CLUSTER_MEM_CONFACK;
7795 + kcl_sendmsg(mem_socket, &ackmsg, 1, addr,
7796 + sizeof (struct sockaddr_cl),
7798 + kernel_thread(hello_kthread, NULL, 0);
7799 + mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
7804 +/* Got the master's view of the cluster - compare it with ours and tell it the
7806 +static int do_process_masterview(struct msghdr *msg, char *buf, int len)
7808 + char reply[2] = { CLUSTER_MEM_VIEWACK, 0 };
7809 + static int num_nodes;
7811 + /* Someone else's state transition */
7812 + if (node_state != MEMBER &&
7813 + node_state != TRANSITION && node_state != MASTER)
7816 + /* First message, zero the counter */
7820 + num_nodes += unpack_nodes(buf + 2, len - 2, check_node);
7822 + /* Last message, check the count and reply */
7824 + if (num_nodes == cluster_members) {
7830 + ("Got %d nodes in MASTERVIEW message, we think there s/b %d\n",
7831 + num_nodes, cluster_members);
7835 + kcl_sendmsg(mem_socket, reply, 2, msg->msg_name,
7836 + msg->msg_namelen, 0);
7841 +static int do_process_leave(struct msghdr *msg, char *buf, int len)
7843 + struct cluster_node *node;
7844 + struct sockaddr_cl *saddr = msg->msg_name;
7845 + unsigned char *leavemsg = (unsigned char *)buf;
7847 + if ((node = find_node_by_nodeid(saddr->scl_nodeid))) {
7848 + unsigned char reason = leavemsg[1];
7850 + if (node->state != NODESTATE_DEAD) {
7851 + printk(KERN_INFO CMAN_NAME
7852 + ": Node %s is leaving the cluster, %s\n",
7853 + node->name, leave_string(reason));
7855 + node->leave_reason = reason;
7857 + leavereason = (reason == CLUSTER_LEAVEFLAG_REMOVED ? 1 : 0);
7859 + a_node_just_died(node);
7864 +static int do_process_hello(struct msghdr *msg, char *buf, int len)
7866 + struct cluster_node *node;
7867 + struct cl_mem_hello_msg *hellomsg =
7868 + (struct cl_mem_hello_msg *)buf;
7869 + struct sockaddr_cl *saddr = msg->msg_name;
7871 + /* We are starting up. Send a join message to the node whose HELLO we
7872 + * just received */
7873 + if (node_state == STARTING || node_state == JOINWAIT ||
7874 + node_state == JOINING || node_state == NEWCLUSTER) {
7875 + struct sockaddr_cl *addr = msg->msg_name;
7877 + printk(KERN_INFO CMAN_NAME ": sending membership request\n");
7879 + send_joinreq(addr, msg->msg_namelen);
7880 + join_time = jiffies;
7881 + node_state = JOINING;
7885 + /* Only process HELLOs if we are not in transition */
7886 + if (node_state == MEMBER) {
7888 + node = find_node_by_nodeid(saddr->scl_nodeid);
7889 + if (node && node->state != NODESTATE_DEAD) {
7891 + /* Check the cluster generation in the HELLO message.
7892 + * NOTE: this may be different if the message crossed
7893 + * on the wire with an END-TRANS so we allow a period
7894 + * of grace in which this is allowable */
7895 + if (cluster_generation !=
7896 + le32_to_cpu(hellomsg->generation)
7897 + && node_state == MEMBER
7898 + && time_after(jiffies,
7899 + cman_config.hello_timer * HZ +
7900 + transition_end_time)) {
7902 + printk(KERN_INFO CMAN_NAME
7903 + ": bad generation number %d in HELLO message, expected %d\n",
7904 + le32_to_cpu(hellomsg->generation),
7905 + cluster_generation);
7907 + notify_kernel_listeners(DIED,
7908 + (long) node->node_id);
7910 + send_kill(node->node_id);
7914 + if (cluster_members != le16_to_cpu(hellomsg->members)
7915 + && node_state == MEMBER) {
7916 + printk(KERN_INFO CMAN_NAME
7917 + ": nmembers in HELLO message does not match our view (got %d, exp %d)\n",
7918 + le16_to_cpu(hellomsg->members), cluster_members);
7919 + start_transition(TRANS_CHECK, node);
7922 + /* The message is OK - save the time */
7923 + node->last_hello = jiffies;
7926 + /* This node is a danger to our valid cluster */
7927 + if (cluster_is_quorate) {
7928 + send_kill(saddr->scl_nodeid);
7937 +static int do_process_kill(struct msghdr *msg, char *buf, int len)
7939 + struct sockaddr_cl *saddr = msg->msg_name;
7940 + struct cluster_node *node;
7942 + node = find_node_by_nodeid(saddr->scl_nodeid);
7943 + if (node && node->state == NODESTATE_MEMBER) {
7945 + printk(KERN_INFO CMAN_NAME
7946 + ": Being told to leave the cluster by node %d\n",
7947 + saddr->scl_nodeid);
7949 + node_state = LEFT_CLUSTER;
7951 + wake_up_process(membership_task);
7952 + wake_up_interruptible(&cnxman_waitq);
7955 + P_MEMB("Asked to leave the cluster by a non-member. What a nerve!\n");
7960 +/* Some cluster membership utility functions */
7961 +struct cluster_node *find_node_by_name(char *name)
7963 + struct list_head *nodelist;
7964 + struct cluster_node *node;
7966 + down(&cluster_members_lock);
7967 + list_for_each(nodelist, &cluster_members_list) {
7968 + node = list_entry(nodelist, struct cluster_node, list);
7970 + if (strcmp(node->name, name) == 0) {
7971 + up(&cluster_members_lock);
7975 + up(&cluster_members_lock);
7979 +/* Try to avoid using this as it's slow and holds the members lock */
7980 +struct cluster_node *find_node_by_addr(unsigned char *addr, int addr_len)
7982 + struct list_head *nodelist;
7983 + struct list_head *addrlist;
7984 + struct cluster_node *node;
7985 + struct cluster_node_addr *nodeaddr;
7987 + down(&cluster_members_lock);
7989 + list_for_each(nodelist, &cluster_members_list) {
7990 + node = list_entry(nodelist, struct cluster_node, list);
7992 + list_for_each(addrlist, &node->addr_list) {
7994 + list_entry(addrlist, struct cluster_node_addr,
7997 + if (memcmp(nodeaddr->addr+2, addr+2, address_length-2) == 0) {
7998 + up(&cluster_members_lock);
8004 + up(&cluster_members_lock);
8008 +/* This is the quick way to find a node */
8009 +struct cluster_node *find_node_by_nodeid(unsigned int id)
8011 + struct cluster_node *node;
8013 + if (id > sizeof_members_array)
8016 + spin_lock(&members_by_nodeid_lock);
8017 + node = members_by_nodeid[id];
8018 + spin_unlock(&members_by_nodeid_lock);
8022 +static int dispatch_messages(struct socket *mem_socket)
8026 + while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
8027 + struct msghdr msg;
8029 + struct sockaddr_cl sin;
8032 + memset(&sin, 0, sizeof (sin));
8034 + msg.msg_control = NULL;
8035 + msg.msg_controllen = 0;
8036 + msg.msg_name = &sin;
8037 + msg.msg_namelen = sizeof (sin);
8038 + msg.msg_flags = 0;
8040 + vec.iov_len = MAX_CLUSTER_MESSAGE;
8041 + vec.iov_base = iobuf;
8043 + len = kernel_recvmsg(mem_socket, &msg, &vec, 1,
8044 + MAX_CLUSTER_MESSAGE,
8047 + msg.msg_name = &sin;
8048 + do_membership_packet(&msg, iobuf, len);
8051 + if (len == -EAGAIN)
8061 +/* Scan the nodes list for dead nodes */
8062 +static void check_for_dead_nodes()
8064 + struct list_head *nodelist;
8065 + struct cluster_node *node;
8067 + down(&cluster_members_lock);
8068 + list_for_each(nodelist, &cluster_members_list) {
8069 + node = list_entry(nodelist, struct cluster_node, list);
8071 + if (node->state != NODESTATE_DEAD &&
8072 + time_after(jiffies,
8073 + node->last_hello +
8074 + cman_config.deadnode_timeout * HZ) && !node->us) {
8076 + up(&cluster_members_lock);
8078 + printk(KERN_WARNING CMAN_NAME
8079 + ": no HELLO from %s, removing from the cluster\n",
8082 + P_MEMB("last hello was %ld, current time is %ld\n",
8083 + node->last_hello, jiffies);
8085 + node->leave_reason = CLUSTER_LEAVEFLAG_DEAD;
8088 + /* This is unlikely to work but it's worth a try! */
8089 + send_kill(node->node_id);
8091 + /* Start state transition */
8092 + a_node_just_died(node);
8096 + up(&cluster_members_lock);
8098 + /* Also check for a dead quorum device */
8099 + if (quorum_device) {
8100 + if (quorum_device->state == NODESTATE_MEMBER &&
8101 + time_after(jiffies,
8102 + quorum_device->last_hello +
8103 + cman_config.deadnode_timeout * HZ)) {
8104 + quorum_device->state = NODESTATE_DEAD;
8105 + printk(KERN_WARNING CMAN_NAME
8106 + ": Quorum device %s timed out\n",
8107 + quorum_device->name);
8108 + recalculate_quorum(0);
8115 +/* add "us" as a node in the cluster */
8116 +static int add_us()
8118 + struct cluster_node *newnode =
8119 + kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
8122 + /* Oh shit, we have to commit hara kiri here for the greater
8123 + * good of the cluster */
8124 + send_leave(CLUSTER_LEAVEFLAG_PANIC);
8126 + printk(KERN_CRIT CMAN_NAME
8127 + ": Cannot allocate memory for our node structure\n");
8128 + panic("Must die");
8133 + memset(newnode, 0, sizeof (struct cluster_node));
8134 + newnode->name = kmalloc(strlen(nodename) + 1, GFP_KERNEL);
8135 + if (!newnode->name) {
8136 + send_leave(CLUSTER_LEAVEFLAG_PANIC);
8138 + printk(KERN_CRIT CMAN_NAME
8139 + ": Cannot allocate memory for node name\n");
8142 + panic("Must die");
8147 + strcpy(newnode->name, nodename);
8148 + newnode->last_hello = jiffies;
8149 + newnode->votes = votes;
8150 + newnode->expected_votes = expected_votes;
8151 + newnode->state = NODESTATE_JOINING;
8152 + newnode->node_id = 0; /* Will get filled in by ENDTRANS message */
8154 + newnode->leave_reason = 0;
8155 + INIT_LIST_HEAD(&newnode->addr_list);
8156 + get_local_addresses(newnode); /* Get from cnxman socket info */
8158 + /* Add the new node to the list */
8159 + down(&cluster_members_lock);
8160 + list_add(&newnode->list, &cluster_members_list);
8161 + cluster_members++;
8162 + up(&cluster_members_lock);
8168 +/* Return the highest known node_id */
8169 +unsigned int get_highest_nodeid()
8171 + struct list_head *nodelist;
8172 + struct cluster_node *node = NULL;
8173 + unsigned int highest = 0;
8175 + down(&cluster_members_lock);
8176 + list_for_each(nodelist, &cluster_members_list) {
8177 + node = list_entry(nodelist, struct cluster_node, list);
8179 + if (node->node_id > highest)
8180 + highest = node->node_id;
8182 + up(&cluster_members_lock);
8187 +/* Elect a new master if there is a clash. Returns 1 if we are the new master,
8188 + * the master's struct will also be returned. This, rather primitively, uses
8189 + * the lowest node ID */
8190 +static int elect_master(struct cluster_node **master_node)
8194 + for (i = 1; i < sizeof_members_array; i++) {
8195 + if (members_by_nodeid[i]
8196 + && members_by_nodeid[i]->state == NODESTATE_MEMBER) {
8197 + *master_node = members_by_nodeid[i];
8198 + P_MEMB("Elected master is %s\n", (*master_node)->name);
8199 + return (*master_node)->us;
8206 +/* Called by node_cleanup in cnxman when we have left the cluster */
8207 +void free_nodeid_array()
8209 + vfree(members_by_nodeid);
8210 + members_by_nodeid = NULL;
8211 + sizeof_members_array = 0;
8214 +int allocate_nodeid_array()
8216 + /* Allocate space for the nodeid lookup array */
8217 + if (!members_by_nodeid) {
8218 + spin_lock_init(&members_by_nodeid_lock);
8219 + members_by_nodeid =
8220 + vmalloc(cman_config.max_nodes *
8221 + sizeof (struct cluster_member *));
8224 + if (!members_by_nodeid) {
8225 + printk(KERN_WARNING
8226 + "Unable to allocate members array for %d members\n",
8227 + cman_config.max_nodes);
8230 + memset(members_by_nodeid, 0,
8231 + cman_config.max_nodes * sizeof (struct cluster_member *));
8232 + sizeof_members_array = cman_config.max_nodes;
8237 +/* Set the votes & expected_votes variables */
8238 +void set_votes(int v, int e)
8241 + expected_votes = e;
8249 +/* Called by cnxman to see if activity should be blocked because we are in a
8250 + * state transition */
8251 +int in_transition()
8253 + return node_state == TRANSITION ||
8254 + node_state == TRANSITION_COMPLETE || node_state == MASTER;
8257 +/* Return the current membership state as a string for the main line to put
8258 + * into /proc . I really should be using snprintf rather than sprintf but it's
8259 + * not exported... */
8260 +char *membership_state(char *buf, int buflen)
8262 + switch (node_state) {
8264 + strncpy(buf, "Starting", buflen);
8267 + strncpy(buf, "New-Cluster?", buflen);
8270 + strncpy(buf, "Joining", buflen);
8273 + strncpy(buf, "Join-Wait", buflen);
8276 + strncpy(buf, "Join-Ack", buflen);
8279 + sprintf(buf, "State-Transition: Master is %s",
8280 + master_node ? master_node->name : "Unknown");
8283 + strncpy(buf, "Cluster-Member", buflen);
8286 + strncpy(buf, "Rejected", buflen);
8288 + case LEFT_CLUSTER:
8289 + strncpy(buf, "Not-in-Cluster", buflen);
8291 + case TRANSITION_COMPLETE:
8292 + strncpy(buf, "Transition-Complete", buflen);
8295 + strncpy(buf, "Transition-Master", buflen);
8298 + sprintf(buf, "Unknown: code=%d", node_state);
8305 +char *leave_string(int reason)
8307 + static char msg[32];
8310 + case CLUSTER_LEAVEFLAG_DOWN:
8311 + return "Shutdown";
8312 + case CLUSTER_LEAVEFLAG_KILLED:
8313 + return "Killed by another node";
8314 + case CLUSTER_LEAVEFLAG_PANIC:
8316 + case CLUSTER_LEAVEFLAG_REMOVED:
8318 + case CLUSTER_LEAVEFLAG_REJECTED:
8319 + return "Membership rejected";
8321 + sprintf(msg, "Reason is %d\n", reason);
8327 +static char *msgname(int msg)
8330 + case CLUSTER_MEM_JOINCONF:
8331 + return "JOINCONF";
8332 + case CLUSTER_MEM_JOINREQ:
8334 + case CLUSTER_MEM_LEAVE:
8336 + case CLUSTER_MEM_HELLO:
8338 + case CLUSTER_MEM_KILL:
8340 + case CLUSTER_MEM_JOINACK:
8342 + case CLUSTER_MEM_ENDTRANS:
8343 + return "ENDTRANS";
8344 + case CLUSTER_MEM_RECONFIG:
8345 + return "RECONFIG";
8346 + case CLUSTER_MEM_MASTERVIEW:
8347 + return "MASTERVIEW";
8348 + case CLUSTER_MEM_STARTTRANS:
8349 + return "STARTTRANS";
8350 + case CLUSTER_MEM_JOINREJ:
8352 + case CLUSTER_MEM_VIEWACK:
8354 + case CLUSTER_MEM_STARTACK:
8355 + return "STARTACK";
8356 + case CLUSTER_MEM_NEWCLUSTER:
8357 + return "NEWCLUSTER";
8358 + case CLUSTER_MEM_CONFACK:
8360 + case CLUSTER_MEM_NOMINATE:
8361 + return "NOMINATE";
8364 + return "??UNKNOWN??";
8371 + * Overrides for Emacs so that we follow Linus's tabbing style.
8372 + * Emacs will notice this stuff at the end of the file and automatically
8373 + * adjust the settings for this buffer only. This must remain at the end
8375 + * ---------------------------------------------------------------------------
8376 + * Local variables:
8377 + * c-file-style: "linux"
8380 diff -urN linux-orig/cluster/cman/proc.c linux-patched/cluster/cman/proc.c
8381 --- linux-orig/cluster/cman/proc.c 1970-01-01 07:30:00.000000000 +0730
8382 +++ linux-patched/cluster/cman/proc.c 2004-11-03 11:37:37.000000000 +0800
8384 +/******************************************************************************
8385 +*******************************************************************************
8387 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8388 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8390 +** This copyrighted material is made available to anyone wishing to use,
8391 +** modify, copy, or redistribute it subject to the terms and conditions
8392 +** of the GNU General Public License v.2.
8394 +*******************************************************************************
8395 +******************************************************************************/
8397 +#include <linux/init.h>
8398 +#include <linux/socket.h>
8399 +#include <linux/kernel.h>
8400 +#include <linux/sched.h>
8401 +#include <linux/file.h>
8402 +#include <linux/proc_fs.h>
8403 +#include <linux/seq_file.h>
8404 +#include <linux/list.h>
8405 +#include <linux/in.h>
8406 +#include <net/sock.h>
8407 +#include <cluster/cnxman.h>
8408 +#include <cluster/service.h>
8410 +#include "cnxman-private.h"
8411 +#include "config.h"
8413 +extern int cluster_members;
8414 +extern struct list_head cluster_members_list;
8415 +extern struct semaphore cluster_members_lock;
8416 +extern struct cluster_node *quorum_device;
8417 +extern int we_are_a_cluster_member;
8418 +extern int cluster_is_quorate;
8419 +extern uint16_t cluster_id;
8420 +extern atomic_t use_count;
8421 +extern unsigned int address_length;
8422 +extern unsigned int config_version;
8423 +extern char cluster_name[];
8424 +extern struct cluster_node *us;
8425 +static struct seq_operations cluster_info_op;
8427 +int sm_proc_open(struct inode *inode, struct file *file);
8428 +int sm_debug_info(char *b, char **start, off_t offset, int length);
8430 +/* /proc interface to the configuration struct */
8431 +static struct config_proc_info {
8434 +} config_proc[] = {
8436 + .name = "joinwait_timeout",
8437 + .value = &cman_config.joinwait_timeout,
8440 + .name = "joinconf_timeout",
8441 + .value = &cman_config.joinconf_timeout,
8444 + .name = "join_timeout",
8445 + .value = &cman_config.join_timeout,
8448 + .name = "hello_timer",
8449 + .value = &cman_config.hello_timer,
8452 + .name = "deadnode_timeout",
8453 + .value = &cman_config.deadnode_timeout,
8456 + .name = "transition_timeout",
8457 + .value = &cman_config.transition_timeout,
8460 + .name = "transition_restarts",
8461 + .value = &cman_config.transition_restarts,
8464 + .name = "max_nodes",
8465 + .value = &cman_config.max_nodes,
8468 + .name = "sm_debug_size",
8469 + .value = &cman_config.sm_debug_size,
8472 + .name = "newcluster_timeout",
8473 + .value = &cman_config.newcluster_timeout,
8478 +static int proc_cluster_status(char *b, char **start, off_t offset, int length)
8480 + struct list_head *nodelist;
8481 + struct cluster_node *node;
8482 + struct cluster_node_addr *node_addr;
8483 + unsigned int total_votes = 0;
8484 + unsigned int max_expected = 0;
8486 + char node_buf[MAX_CLUSTER_MEMBER_NAME_LEN];
8489 + "Version: %d.%d.%d\n",
8490 + CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
8491 + CNXMAN_PATCH_VERSION);
8494 + "Config version: %d\nCluster name: %s\nCluster ID: %d\nMembership state: %s\n",
8496 + cluster_name, cluster_id,
8497 + membership_state(node_buf, sizeof (node_buf)));
8499 + if (!we_are_a_cluster_member)
8502 + /* Total the votes */
8503 + down(&cluster_members_lock);
8504 + list_for_each(nodelist, &cluster_members_list) {
8505 + node = list_entry(nodelist, struct cluster_node, list);
8506 + if (node->state == NODESTATE_MEMBER) {
8507 + total_votes += node->votes;
8509 + max(max_expected, node->expected_votes);
8512 + up(&cluster_members_lock);
8514 + if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
8515 + total_votes += quorum_device->votes;
8518 + "Nodes: %d\nExpected_votes: %d\nTotal_votes: %d\nQuorum: %d %s\n",
8519 + cluster_members, max_expected, total_votes,
8521 + cluster_is_quorate ? " " : "Activity blocked");
8522 + c += sprintf(b+c, "Active subsystems: %d\n",
8523 + atomic_read(&use_count));
8526 + c += sprintf(b+c, "Node addresses: ");
8527 + list_for_each_entry(node_addr, &us->addr_list, list) {
8528 + struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)node_addr->addr;
8529 + if (saddr->sin6_family == AF_INET6) {
8530 + c += sprintf(b+c, "%x:%x:%x:%x:%x:%x:%x:%x ",
8531 + be16_to_cpu(saddr->sin6_addr.s6_addr16[0]),
8532 + be16_to_cpu(saddr->sin6_addr.s6_addr16[1]),
8533 + be16_to_cpu(saddr->sin6_addr.s6_addr16[2]),
8534 + be16_to_cpu(saddr->sin6_addr.s6_addr16[3]),
8535 + be16_to_cpu(saddr->sin6_addr.s6_addr16[4]),
8536 + be16_to_cpu(saddr->sin6_addr.s6_addr16[5]),
8537 + be16_to_cpu(saddr->sin6_addr.s6_addr16[6]),
8538 + be16_to_cpu(saddr->sin6_addr.s6_addr16[7]));
8541 + struct sockaddr_in *saddr4 = (struct sockaddr_in *)saddr;
8542 + uint8_t *addr = (uint8_t *)&saddr4->sin_addr;
8543 + c+= sprintf(b+c, "%u.%u.%u.%u ",
8544 + addr[0], addr[1], addr[2], addr[3]);
8547 + c += sprintf(b+c, "\n\n");
8552 +/* Allocate one of these for /proc/cluster/nodes so we can keep a track of where
8554 +struct cluster_seq_info {
8556 + int highest_nodeid;
8559 +static int cluster_open(struct inode *inode, struct file *file)
8561 + return seq_open(file, &cluster_info_op);
8564 +static void *cluster_seq_start(struct seq_file *m, loff_t * pos)
8566 + struct cluster_seq_info *csi =
8567 + kmalloc(sizeof (struct cluster_seq_info), GFP_KERNEL);
8572 + /* Keep highest_nodeid here so we don't need to keep traversing the
8573 + * list to find it */
8574 + csi->nodeid = *pos;
8575 + csi->highest_nodeid = get_highest_nodeid();
8577 + /* Print the header */
8579 + seq_printf(m, "Node Votes Exp Sts Name\n");
8584 +static void *cluster_seq_next(struct seq_file *m, void *p, loff_t * pos)
8586 + struct cluster_seq_info *csi = p;
8588 + *pos = ++csi->nodeid;
8589 + if (csi->nodeid > csi->highest_nodeid)
8595 +static int cluster_seq_show(struct seq_file *m, void *p)
8598 + struct cluster_node *node;
8599 + struct cluster_seq_info *csi = p;
8602 + * If we have "0" here then display the quorum device if
8605 + if (csi->nodeid == 0)
8606 + node = quorum_device;
8608 + node = find_node_by_nodeid(csi->nodeid);
8613 + /* Make state printable */
8614 + switch (node->state) {
8615 + case NODESTATE_MEMBER:
8618 + case NODESTATE_JOINING:
8621 + case NODESTATE_DEAD:
8625 + seq_printf(m, "%4d %3d %3d %c %s\n",
8628 + node->expected_votes,
8635 +static void cluster_seq_stop(struct seq_file *m, void *p)
8640 +static struct seq_operations cluster_info_op = {
8641 + .start = cluster_seq_start,
8642 + .next = cluster_seq_next,
8643 + .stop = cluster_seq_stop,
8644 + .show = cluster_seq_show
8647 +static struct file_operations cluster_fops = {
8648 + .open = cluster_open,
8650 + .llseek = seq_lseek,
8651 + .release = seq_release,
8652 + .owner = THIS_MODULE,
8655 +static struct file_operations service_fops = {
8656 + .open = sm_proc_open,
8658 + .llseek = seq_lseek,
8659 + .release = seq_release,
8660 + .owner = THIS_MODULE,
8663 +static int cman_config_read_proc(char *page, char **start, off_t off, int count,
8664 + int *eof, void *data)
8666 + struct config_proc_info *cinfo = data;
8668 + return snprintf(page, count, "%d\n", *cinfo->value);
8671 +static int cman_config_write_proc(struct file *file, const char *buffer,
8672 + unsigned long count, void *data)
8674 + struct config_proc_info *cinfo = data;
8678 + value = simple_strtoul(buffer, &end, 10);
8680 + *cinfo->value = value;
8685 +/* Base of the config directory for cman */
8686 +static struct proc_dir_entry *proc_cman_config;
8687 +void create_proc_entries(void)
8689 + struct proc_dir_entry *procentry;
8690 + struct proc_dir_entry *proc_cluster;
8693 + proc_cluster = proc_mkdir("cluster", 0);
8694 + if (!proc_cluster)
8696 + proc_cluster->owner = THIS_MODULE;
8698 + /* Config dir filled in by us and others */
8699 + if (!proc_mkdir("cluster/config", 0))
8702 + /* Don't much care if this fails, it's hardly vital */
8703 + procentry = create_proc_entry("cluster/nodes", S_IRUGO, NULL);
8705 + procentry->proc_fops = &cluster_fops;
8707 + procentry = create_proc_entry("cluster/status", S_IRUGO, NULL);
8709 + procentry->get_info = proc_cluster_status;
8711 + procentry = create_proc_entry("cluster/services", S_IRUGO, NULL);
8713 + procentry->proc_fops = &service_fops;
8715 + /* Config entries */
8716 + proc_cman_config = proc_mkdir("cluster/config/cman", 0);
8717 + if (!proc_cman_config)
8720 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
8721 + procentry = create_proc_entry(config_proc[i].name, 0660,
8722 + proc_cman_config);
8724 + procentry->data = &config_proc[i];
8725 + procentry->write_proc = cman_config_write_proc;
8726 + procentry->read_proc = cman_config_read_proc;
8730 + procentry = create_proc_entry("cluster/sm_debug", S_IRUGO, NULL);
8732 + procentry->get_info = sm_debug_info;
8735 +void cleanup_proc_entries(void)
8737 + int i, config_count;
8739 + remove_proc_entry("cluster/sm_debug", NULL);
8741 + config_count = sizeof(config_proc) / sizeof(struct config_proc_info);
8743 + if (proc_cman_config) {
8744 + for (i=0; i<config_count; i++)
8745 + remove_proc_entry(config_proc[i].name, proc_cman_config);
8747 + remove_proc_entry("cluster/config/cman", NULL);
8748 + remove_proc_entry("cluster/config", NULL);
8750 + remove_proc_entry("cluster/nodes", NULL);
8751 + remove_proc_entry("cluster/status", NULL);
8752 + remove_proc_entry("cluster/services", NULL);
8753 + remove_proc_entry("cluster/config", NULL);
8754 + remove_proc_entry("cluster", NULL);
8756 diff -urN linux-orig/cluster/cman/sm.h linux-patched/cluster/cman/sm.h
8757 --- linux-orig/cluster/cman/sm.h 1970-01-01 07:30:00.000000000 +0730
8758 +++ linux-patched/cluster/cman/sm.h 2004-11-03 11:37:37.000000000 +0800
8760 +/******************************************************************************
8761 +*******************************************************************************
8763 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8764 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8766 +** This copyrighted material is made available to anyone wishing to use,
8767 +** modify, copy, or redistribute it subject to the terms and conditions
8768 +** of the GNU General Public License v.2.
8770 +*******************************************************************************
8771 +******************************************************************************/
8773 +#ifndef __SM_DOT_H__
8774 +#define __SM_DOT_H__
8777 + * This is the main header file to be included in each Service Manager source
8781 +#include <linux/list.h>
8782 +#include <linux/socket.h>
8783 +#include <linux/kernel.h>
8784 +#include <linux/sched.h>
8785 +#include <linux/file.h>
8786 +#include <linux/kthread.h>
8787 +#include <net/sock.h>
8789 +#include <cluster/cnxman.h>
8790 +#include <cluster/service.h>
8792 +#define SG_LEVELS (4)
8794 +#include "sm_internal.h"
8795 +#include "sm_barrier.h"
8796 +#include "sm_control.h"
8797 +#include "sm_daemon.h"
8798 +#include "sm_joinleave.h"
8799 +#include "sm_membership.h"
8800 +#include "sm_message.h"
8801 +#include "sm_misc.h"
8802 +#include "sm_recover.h"
8803 +#include "sm_services.h"
8805 +extern struct list_head sm_sg[SG_LEVELS];
8806 +extern struct semaphore sm_sglock;
8816 +#define SM_ASSERT(x, do) \
8820 + printk("\nSM: Assertion failed on line %d of file %s\n" \
8821 + "SM: assertion: \"%s\"\n" \
8822 + "SM: time = %lu\n", \
8823 + __LINE__, __FILE__, #x, jiffies); \
8826 + panic("SM: Record message above and reboot.\n"); \
8830 +#define SM_RETRY(do_this, until_this) \
8833 + do { do_this; } while (0); \
8836 + printk("SM: out of memory: %s, %u\n", __FILE__, __LINE__); \
8841 +#define log_print(fmt, args...) printk("SM: "fmt"\n", ##args)
8843 +#define log_error(sg, fmt, args...) \
8844 + printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
8847 +#define SM_DEBUG_LOG
8849 +#ifdef SM_DEBUG_CONSOLE
8850 +#define log_debug(sg, fmt, args...) \
8851 + printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
8854 +#ifdef SM_DEBUG_LOG
8855 +#define log_debug(sg, fmt, args...) sm_debug_log(sg, fmt, ##args);
8858 +#ifdef SM_DEBUG_ALL
8859 +#define log_debug(sg, fmt, args...) \
8862 + printk("SM: %08x "fmt"\n", (sg)->global_id, ##args); \
8863 + sm_debug_log(sg, fmt, ##args); \
8868 +#endif /* __SM_DOT_H__ */
8869 diff -urN linux-orig/cluster/cman/sm_barrier.c linux-patched/cluster/cman/sm_barrier.c
8870 --- linux-orig/cluster/cman/sm_barrier.c 1970-01-01 07:30:00.000000000 +0730
8871 +++ linux-patched/cluster/cman/sm_barrier.c 2004-11-03 11:37:37.000000000 +0800
8873 +/******************************************************************************
8874 +*******************************************************************************
8876 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8877 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8879 +** This copyrighted material is made available to anyone wishing to use,
8880 +** modify, copy, or redistribute it subject to the terms and conditions
8881 +** of the GNU General Public License v.2.
8883 +*******************************************************************************
8884 +******************************************************************************/
8888 +static struct list_head barriers;
8889 +static spinlock_t barriers_lock;
8892 + struct list_head list;
8897 +typedef struct bc_entry bc_entry_t;
8899 +void init_barriers(void)
8901 + INIT_LIST_HEAD(&barriers);
8902 + spin_lock_init(&barriers_lock);
8905 +static int atoi(char *c)
8909 + while ('0' <= *c && *c <= '9') {
8910 + x = x * 10 + (*c - '0');
8916 +static void add_barrier_callback(char *name, int status, int type)
8922 + /* an ESRCH callback just means there was a cnxman transition */
8923 + if (status == -ESRCH)
8926 + /* extract global id of SG from barrier name */
8927 + p = strstr(name, "sm.");
8929 + SM_ASSERT(p, printk("name=\"%s\" status=%d\n", name, status););
8931 + p += strlen("sm.");
8934 + SM_RETRY(be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC), be);
8937 + be->status = status;
8940 + spin_lock(&barriers_lock);
8941 + list_add_tail(&be->list, &barriers);
8942 + spin_unlock(&barriers_lock);
8944 + wake_serviced(DO_BARRIERS);
8947 +static void callback_recovery_barrier(char *name, int status)
8949 + add_barrier_callback(name, status, SM_BARRIER_RECOVERY);
8952 +static void callback_startdone_barrier_new(char *name, int status)
8954 + add_barrier_callback(name, status, SM_BARRIER_STARTDONE_NEW);
8957 +static void callback_startdone_barrier(char *name, int status)
8959 + add_barrier_callback(name, status, SM_BARRIER_STARTDONE);
8962 +int sm_barrier(char *name, int count, int type)
8965 + unsigned long fn = 0;
8968 + case SM_BARRIER_STARTDONE:
8969 + fn = (unsigned long) callback_startdone_barrier;
8971 + case SM_BARRIER_STARTDONE_NEW:
8972 + fn = (unsigned long) callback_startdone_barrier_new;
8974 + case SM_BARRIER_RECOVERY:
8975 + fn = (unsigned long) callback_recovery_barrier;
8979 + error = kcl_barrier_register(name, 0, count);
8981 + log_print("barrier register error %d", error);
8985 + error = kcl_barrier_setattr(name, BARRIER_SETATTR_AUTODELETE, TRUE);
8987 + log_print("barrier setattr autodel error %d", error);
8991 + error = kcl_barrier_setattr(name, BARRIER_SETATTR_CALLBACK, fn);
8993 + log_print("barrier setattr cb error %d", error);
8997 + error = kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, TRUE);
8999 + log_print("barrier setattr enabled error %d", error);
9006 + kcl_barrier_delete(name);
9011 +void process_startdone_barrier_new(sm_group_t *sg, int status)
9013 + sm_sevent_t *sev = sg->sevent;
9015 + if (!test_and_clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags)) {
9016 + log_debug(sev->se_sg, "ignore barrier cb status %d", status);
9020 + sev->se_barrier_status = status;
9021 + sev->se_state = SEST_BARRIER_DONE;
9022 + set_bit(SEFL_CHECK, &sev->se_flags);
9023 + wake_serviced(DO_JOINLEAVE);
9026 +void process_startdone_barrier(sm_group_t *sg, int status)
9028 + sm_uevent_t *uev = &sg->uevent;
9030 + if (!test_and_clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags)) {
9031 + log_debug(sg, "ignore barrier cb status %d", status);
9035 + uev->ue_barrier_status = status;
9036 + uev->ue_state = UEST_BARRIER_DONE;
9037 + set_bit(UEFL_CHECK, &uev->ue_flags);
9038 + wake_serviced(DO_MEMBERSHIP);
9041 +void process_recovery_barrier(sm_group_t *sg, int status)
9044 + log_error(sg, "process_recovery_barrier status=%d", status);
9048 + if (sg->state != SGST_RECOVER ||
9049 + sg->recover_state != RECOVER_BARRIERWAIT) {
9050 + log_error(sg, "process_recovery_barrier state %d recover %d",
9051 + sg->state, sg->recover_state);
9055 + if (!sg->recover_stop)
9056 + sg->recover_state = RECOVER_STOP;
9058 + sg->recover_state = RECOVER_BARRIERDONE;
9060 + wake_serviced(DO_RECOVERIES);
9063 +void process_barriers(void)
9071 + spin_lock(&barriers_lock);
9072 + if (!list_empty(&barriers)) {
9073 + be = list_entry(barriers.next, bc_entry_t, list);
9074 + list_del(&be->list);
9076 + spin_unlock(&barriers_lock);
9081 + sg = sm_global_id_to_sg(be->gid);
9083 + log_print("process_barriers: no sg %08x", be->gid);
9087 + switch (be->type) {
9088 + case SM_BARRIER_STARTDONE_NEW:
9089 + process_startdone_barrier_new(sg, be->status);
9092 + case SM_BARRIER_STARTDONE:
9093 + process_startdone_barrier(sg, be->status);
9096 + case SM_BARRIER_RECOVERY:
9097 + process_recovery_barrier(sg, be->status);
9105 diff -urN linux-orig/cluster/cman/sm_barrier.h linux-patched/cluster/cman/sm_barrier.h
9106 --- linux-orig/cluster/cman/sm_barrier.h 1970-01-01 07:30:00.000000000 +0730
9107 +++ linux-patched/cluster/cman/sm_barrier.h 2004-11-03 11:37:37.000000000 +0800
9109 +/******************************************************************************
9110 +*******************************************************************************
9112 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9113 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9115 +** This copyrighted material is made available to anyone wishing to use,
9116 +** modify, copy, or redistribute it subject to the terms and conditions
9117 +** of the GNU General Public License v.2.
9119 +*******************************************************************************
9120 +******************************************************************************/
9122 +#ifndef __SM_BARRIER_DOT_H__
9123 +#define __SM_BARRIER_DOT_H__
9125 +#define SM_BARRIER_STARTDONE (0)
9126 +#define SM_BARRIER_STARTDONE_NEW (1)
9127 +#define SM_BARRIER_RECOVERY (2)
9128 +#define SM_BARRIER_RESET (3)
9130 +void init_barriers(void);
9131 +void process_barriers(void);
9132 +int sm_barrier(char *name, int count, int type);
9133 +void process_startdone_barrier(sm_group_t *sg, int status);
9134 +void process_startdone_barrier_new(sm_group_t *sg, int status);
9135 +void process_recovery_barrier(sm_group_t *sg, int status);
9138 diff -urN linux-orig/cluster/cman/sm_control.c linux-patched/cluster/cman/sm_control.c
9139 --- linux-orig/cluster/cman/sm_control.c 1970-01-01 07:30:00.000000000 +0730
9140 +++ linux-patched/cluster/cman/sm_control.c 2004-11-03 11:37:37.000000000 +0800
9142 +/******************************************************************************
9143 +*******************************************************************************
9145 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9146 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9148 +** This copyrighted material is made available to anyone wishing to use,
9149 +** modify, copy, or redistribute it subject to the terms and conditions
9150 +** of the GNU General Public License v.2.
9152 +*******************************************************************************
9153 +******************************************************************************/
9156 +#include "config.h"
9158 +struct socket * sm_socket;
9159 +uint32_t * sm_new_nodeids;
9160 +uint32_t sm_our_nodeid;
9161 +int sm_quorum, sm_quorum_next;
9162 +struct list_head sm_members;
9163 +int sm_member_count;
9168 + * Called by cnxman when it has a new member list.
9171 +void sm_member_update(int quorate)
9173 + sm_quorum_next = quorate;
9174 + wake_serviced(DO_START_RECOVERY);
9179 + * Called when module is loaded.
9185 + sm_new_nodeids = NULL;
9187 + sm_quorum_next = 0;
9188 + sm_our_nodeid = 0;
9189 + INIT_LIST_HEAD(&sm_members);
9190 + sm_member_count = 0;
9203 + * Called at beginning of cluster join procedure.
9206 +void sm_start(void)
9208 + struct sockaddr_cl saddr;
9209 + struct socket *sock;
9212 + /* Create a communication channel among service managers */
9214 + result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
9216 + log_print("can't create socket %d", result);
9222 + saddr.scl_family = AF_CLUSTER;
9223 + saddr.scl_port = CLUSTER_PORT_SERVICES;
9225 + result = sock->ops->bind(sock, (struct sockaddr *) &saddr,
9228 + log_print("can't bind socket %d", result);
9229 + goto fail_release;
9232 + result = kcl_register_read_callback(sm_socket, sm_cluster_message);
9234 + log_print("can't register read callback %d", result);
9235 + goto fail_release;
9238 + sm_new_nodeids = (uint32_t *) kmalloc(cman_config.max_nodes *
9243 + /* cnxman should call sm_member_update() once we've joined - then we
9244 + * can get our first list of members and our own nodeid */
9249 + sock_release(sm_socket);
9258 + * Called before cnxman leaves the cluster. If this returns an error to cman,
9259 + * cman should not leave the cluster but return EBUSY.
9260 + * If force is set we go away anyway. cman knows best in this case
9263 +int sm_stop(int force)
9265 + struct list_head *head;
9268 + int i, busy = FALSE, error = -EBUSY;
9270 + for (i = 0; i < SG_LEVELS; i++) {
9271 + if (!list_empty(&sm_sg[i])) {
9272 + sg = list_entry(sm_sg[i].next, sm_group_t, list);
9273 + log_error(sg, "sm_stop: SG still joined");
9278 + if (!busy || force) {
9282 + sock_release(sm_socket);
9284 + head = &sm_members;
9285 + while (!list_empty(head)) {
9286 + node = list_entry(head->next, sm_node_t, list);
9287 + list_del(&node->list);
9288 + sm_member_count--;
9292 + kfree(sm_new_nodeids);
9298 diff -urN linux-orig/cluster/cman/sm_control.h linux-patched/cluster/cman/sm_control.h
9299 --- linux-orig/cluster/cman/sm_control.h 1970-01-01 07:30:00.000000000 +0730
9300 +++ linux-patched/cluster/cman/sm_control.h 2004-11-03 11:37:37.000000000 +0800
9302 +/******************************************************************************
9303 +*******************************************************************************
9305 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9306 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9308 +** This copyrighted material is made available to anyone wishing to use,
9309 +** modify, copy, or redistribute it subject to the terms and conditions
9310 +** of the GNU General Public License v.2.
9312 +*******************************************************************************
9313 +******************************************************************************/
9315 +#ifndef __SM_CONTROL_DOT_H__
9316 +#define __SM_CONTROL_DOT_H__
9318 +void sm_init(void);
9319 +void sm_start(void);
9320 +int sm_stop(int force);
9321 +void sm_member_update(int quorate);
9324 diff -urN linux-orig/cluster/cman/sm_daemon.c linux-patched/cluster/cman/sm_daemon.c
9325 --- linux-orig/cluster/cman/sm_daemon.c 1970-01-01 07:30:00.000000000 +0730
9326 +++ linux-patched/cluster/cman/sm_daemon.c 2004-11-03 11:37:37.000000000 +0800
9328 +/******************************************************************************
9329 +*******************************************************************************
9331 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9332 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9334 +** This copyrighted material is made available to anyone wishing to use,
9335 +** modify, copy, or redistribute it subject to the terms and conditions
9336 +** of the GNU General Public License v.2.
9338 +*******************************************************************************
9339 +******************************************************************************/
9343 +static unsigned long daemon_flags;
9344 +static struct task_struct * daemon_task;
9345 +extern int sm_quorum;
9347 +void init_serviced(void)
9350 + daemon_task = NULL;
9353 +void wake_serviced(int do_flag)
9355 + set_bit(do_flag, &daemon_flags);
9356 + wake_up_process(daemon_task);
9359 +static inline int got_work(void)
9363 + rv = (test_bit(DO_START_RECOVERY, &daemon_flags) ||
9364 + test_bit(DO_MESSAGES, &daemon_flags) ||
9365 + test_bit(DO_BARRIERS, &daemon_flags) ||
9366 + test_bit(DO_CALLBACKS, &daemon_flags));
9368 + if (sm_quorum && !rv)
9369 + rv = (test_bit(DO_JOINLEAVE, &daemon_flags) ||
9370 + test_bit(DO_RECOVERIES, &daemon_flags) ||
9371 + test_bit(DO_MEMBERSHIP, &daemon_flags));
9375 +static int serviced(void *arg)
9377 + while (!kthread_should_stop()) {
9378 + if (test_and_clear_bit(DO_START_RECOVERY, &daemon_flags))
9379 + process_nodechange();
9381 + if (test_and_clear_bit(DO_MESSAGES, &daemon_flags))
9382 + process_messages();
9384 + if (test_and_clear_bit(DO_BARRIERS, &daemon_flags))
9385 + process_barriers();
9387 + if (test_and_clear_bit(DO_CALLBACKS, &daemon_flags))
9388 + process_callbacks();
9391 + if (test_and_clear_bit(DO_RECOVERIES, &daemon_flags))
9392 + process_recoveries();
9394 + if (test_and_clear_bit(DO_JOINLEAVE, &daemon_flags))
9395 + process_joinleave();
9397 + if (test_and_clear_bit(DO_MEMBERSHIP, &daemon_flags))
9398 + process_membership();
9401 + set_current_state(TASK_INTERRUPTIBLE);
9404 + set_current_state(TASK_RUNNING);
9410 +int start_serviced(void)
9412 + struct task_struct *p;
9414 + p = kthread_run(serviced, NULL, 0, "cman_serviced");
9416 + printk("can't start cman_serviced daemon");
9417 + return (IS_ERR(p));
9424 +void stop_serviced(void)
9426 + kthread_stop(daemon_task);
9428 diff -urN linux-orig/cluster/cman/sm_daemon.h linux-patched/cluster/cman/sm_daemon.h
9429 --- linux-orig/cluster/cman/sm_daemon.h 1970-01-01 07:30:00.000000000 +0730
9430 +++ linux-patched/cluster/cman/sm_daemon.h 2004-11-03 11:37:37.000000000 +0800
9432 +/******************************************************************************
9433 +*******************************************************************************
9435 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9436 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9438 +** This copyrighted material is made available to anyone wishing to use,
9439 +** modify, copy, or redistribute it subject to the terms and conditions
9440 +** of the GNU General Public License v.2.
9442 +*******************************************************************************
9443 +******************************************************************************/
9445 +#ifndef __SM_DAEMON_DOT_H__
9446 +#define __SM_DAEMON_DOT_H__
9449 +#define DO_START_RECOVERY (1)
9450 +#define DO_MESSAGES (2)
9451 +#define DO_BARRIERS (3)
9452 +#define DO_CALLBACKS (4)
9453 +#define DO_JOINLEAVE (5)
9454 +#define DO_RECOVERIES (6)
9455 +#define DO_MEMBERSHIP (7)
9456 +#define DO_RESET (8)
9458 +void init_serviced(void);
9459 +void wake_serviced(int do_flag);
9460 +void stop_serviced(void);
9461 +int start_serviced(void);
9464 diff -urN linux-orig/cluster/cman/sm_internal.h linux-patched/cluster/cman/sm_internal.h
9465 --- linux-orig/cluster/cman/sm_internal.h 1970-01-01 07:30:00.000000000 +0730
9466 +++ linux-patched/cluster/cman/sm_internal.h 2004-11-03 11:37:37.000000000 +0800
9468 +/******************************************************************************
9469 +*******************************************************************************
9471 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9472 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9474 +** This copyrighted material is made available to anyone wishing to use,
9475 +** modify, copy, or redistribute it subject to the terms and conditions
9476 +** of the GNU General Public License v.2.
9478 +*******************************************************************************
9479 +******************************************************************************/
9481 +#ifndef __SM_INTERNAL_DOT_H__
9482 +#define __SM_INTERNAL_DOT_H__
9485 + * Any header files needed by this file should be included before it in sm.h.
9486 + * This file should only be included by sm.h.
9495 +typedef struct sm_group sm_group_t;
9496 +typedef struct sm_sevent sm_sevent_t;
9497 +typedef struct sm_uevent sm_uevent_t;
9498 +typedef struct sm_node sm_node_t;
9499 +typedef struct sm_msg sm_msg_t;
9503 + * Number of seconds to wait before trying again to join or leave an SG
9505 +#define RETRY_DELAY (2)
9509 + * Service Event - what a node uses to join or leave an sg
9513 +#define SEFL_CHECK (0)
9514 +#define SEFL_ALLOW_JOIN (1)
9515 +#define SEFL_ALLOW_JSTOP (2)
9516 +#define SEFL_ALLOW_LEAVE (3)
9517 +#define SEFL_ALLOW_LSTOP (4)
9518 +#define SEFL_ALLOW_STARTDONE (5)
9519 +#define SEFL_ALLOW_BARRIER (6)
9520 +#define SEFL_DELAY (7)
9521 +#define SEFL_DELAY_RECOVERY (8)
9522 +#define SEFL_LEAVE (9)
9523 +#define SEFL_CANCEL (10)
9526 +#define SEST_JOIN_BEGIN (1)
9527 +#define SEST_JOIN_ACKWAIT (2)
9528 +#define SEST_JOIN_ACKED (3)
9529 +#define SEST_JSTOP_ACKWAIT (4)
9530 +#define SEST_JSTOP_ACKED (5)
9531 +#define SEST_JSTART_SERVICEWAIT (6)
9532 +#define SEST_JSTART_SERVICEDONE (7)
9533 +#define SEST_BARRIER_WAIT (8)
9534 +#define SEST_BARRIER_DONE (9)
9535 +#define SEST_LEAVE_BEGIN (10)
9536 +#define SEST_LEAVE_ACKWAIT (11)
9537 +#define SEST_LEAVE_ACKED (12)
9538 +#define SEST_LSTOP_ACKWAIT (13)
9539 +#define SEST_LSTOP_ACKED (14)
9540 +#define SEST_LSTART_WAITREMOTE (15)
9541 +#define SEST_LSTART_REMOTEDONE (16)
9544 + struct list_head se_list;
9545 + unsigned int se_id;
9546 + sm_group_t * se_sg;
9547 + unsigned long se_flags;
9548 + unsigned int se_state;
9550 + int se_node_count;
9551 + int se_memb_count;
9552 + int se_reply_count;
9554 + uint32_t * se_node_ids;
9555 + char * se_node_status;
9556 + int se_len_ids; /* length of node_ids */
9557 + int se_len_status; /* length of node_status */
9559 + int se_barrier_status;
9560 + struct timer_list se_restart_timer;
9564 + * Update Event - what an sg member uses to respond to an sevent
9568 +#define UEFL_ALLOW_STARTDONE (0)
9569 +#define UEFL_ALLOW_BARRIER (1)
9570 +#define UEFL_CANCEL (2)
9571 +#define UEFL_LEAVE (3)
9572 +#define UEFL_CHECK (4)
9575 +#define UEST_JSTOP (1)
9576 +#define UEST_JSTART_WAITCMD (2)
9577 +#define UEST_JSTART (3)
9578 +#define UEST_JSTART_SERVICEWAIT (4)
9579 +#define UEST_JSTART_SERVICEDONE (5)
9580 +#define UEST_BARRIER_WAIT (6)
9581 +#define UEST_BARRIER_DONE (7)
9582 +#define UEST_LSTOP (8)
9583 +#define UEST_LSTART_WAITCMD (9)
9584 +#define UEST_LSTART (10)
9585 +#define UEST_LSTART_SERVICEWAIT (11)
9586 +#define UEST_LSTART_SERVICEDONE (12)
9589 + unsigned int ue_state;
9590 + unsigned long ue_flags;
9592 + uint32_t ue_nodeid;
9594 + int ue_barrier_status;
9595 + uint16_t ue_remote_seid;
9602 +#define RECOVER_NONE (0)
9603 +#define RECOVER_STOP (1)
9604 +#define RECOVER_START (2)
9605 +#define RECOVER_STARTDONE (3)
9606 +#define RECOVER_BARRIERWAIT (4)
9607 +#define RECOVER_BARRIERDONE (5)
9610 +#define SGFL_SEVENT (1)
9611 +#define SGFL_UEVENT (2)
9612 +#define SGFL_NEED_RECOVERY (3)
9615 +#define SGST_NONE (0)
9616 +#define SGST_JOIN (1)
9617 +#define SGST_RUN (2)
9618 +#define SGST_RECOVER (3)
9619 +#define SGST_UEVENT (4)
9622 + struct list_head list; /* list of sg's */
9624 + uint32_t local_id;
9625 + uint32_t global_id;
9626 + unsigned long flags;
9628 + int refcount; /* references from reg/unreg */
9629 + void * service_data; /* data from the service */
9630 + struct kcl_service_ops *ops; /* ops from the service */
9631 + struct completion event_comp;
9633 + struct list_head memb; /* Membership List for RC */
9634 + int memb_count; /* number of nodes in memb */
9635 + struct list_head joining; /* nodes joining the sg */
9636 + sm_sevent_t * sevent;
9637 + sm_uevent_t uevent;
9639 + int recover_state;
9641 + struct list_head recover_list; /* recovery event list */
9642 + void * recover_data;
9643 + char recover_barrier[MAX_BARRIER_NAME_LEN];
9646 + char name[1]; /* must be last field */
9654 +#define SMSG_JOIN_REQ (1)
9655 +#define SMSG_JOIN_REP (2)
9656 +#define SMSG_JSTOP_REQ (3)
9657 +#define SMSG_JSTOP_REP (4)
9658 +#define SMSG_JSTART_CMD (5)
9659 +#define SMSG_LEAVE_REQ (6)
9660 +#define SMSG_LEAVE_REP (7)
9661 +#define SMSG_LSTOP_REQ (8)
9662 +#define SMSG_LSTOP_REP (9)
9663 +#define SMSG_LSTART_CMD (10)
9664 +#define SMSG_LSTART_DONE (11)
9665 +#define SMSG_RECOVER (12)
9668 +#define STATUS_POS (1)
9669 +#define STATUS_NEG (2)
9670 +#define STATUS_WAIT (3)
9674 + uint8_t ms_status;
9675 + uint16_t ms_sevent_id;
9676 + uint32_t ms_global_sgid;
9677 + uint32_t ms_global_lastid;
9678 + uint16_t ms_sglevel;
9679 + uint16_t ms_length;
9680 + /* buf of ms_length bytes follows */
9687 +#define SNFL_NEED_RECOVERY (0)
9688 +#define SNFL_CLUSTER_MEMBER (1)
9689 +#define SNFL_LEAVING (2)
9692 + struct list_head list;
9693 + uint32_t id; /* node id from cnxman */
9694 + unsigned long flags;
9695 + int incarnation; /* node incarnation number */
9698 +#endif /* __SM_INTERNAL_DOT_H__ */
9699 diff -urN linux-orig/cluster/cman/sm_joinleave.c linux-patched/cluster/cman/sm_joinleave.c
9700 --- linux-orig/cluster/cman/sm_joinleave.c 1970-01-01 07:30:00.000000000 +0730
9701 +++ linux-patched/cluster/cman/sm_joinleave.c 2004-11-03 11:37:37.000000000 +0800
9703 +/******************************************************************************
9704 +*******************************************************************************
9706 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9707 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9709 +** This copyrighted material is made available to anyone wishing to use,
9710 +** modify, copy, or redistribute it subject to the terms and conditions
9711 +** of the GNU General Public License v.2.
9713 +*******************************************************************************
9714 +******************************************************************************/
9719 + * Routines used by nodes that are joining or leaving a SG. These "sevent"
9720 + * routines initiate membership changes to a SG. Existing SG members respond
9721 + * using the "uevent" membership update routines.
9724 +extern uint32_t sm_our_nodeid;
9725 +extern struct list_head sm_members;
9726 +static struct list_head new_event;
9727 +static spinlock_t new_event_lock;
9728 +static struct list_head joinleave_events;
9730 +void init_joinleave(void)
9732 + INIT_LIST_HEAD(&new_event);
9733 + spin_lock_init(&new_event_lock);
9734 + INIT_LIST_HEAD(&joinleave_events);
9737 +void new_joinleave(sm_sevent_t *sev)
9739 + spin_lock(&new_event_lock);
9740 + list_add_tail(&sev->se_list, &new_event);
9741 + spin_unlock(&new_event_lock);
9742 + wake_serviced(DO_JOINLEAVE);
9745 +sm_sevent_t *find_sevent(unsigned int id)
9749 + list_for_each_entry(sev, &joinleave_events, se_list) {
9750 + if (sev->se_id == id)
9756 +static void release_sevent(sm_sevent_t *sev)
9758 + if (sev->se_len_ids) {
9759 + kfree(sev->se_node_ids);
9760 + sev->se_node_ids = NULL;
9763 + if (sev->se_len_status) {
9764 + kfree(sev->se_node_status);
9765 + sev->se_node_status = NULL;
9768 + sev->se_node_count = 0;
9769 + sev->se_memb_count = 0;
9770 + sev->se_reply_count = 0;
9773 +static int init_sevent(sm_sevent_t *sev)
9776 + int len1, len2, count, cluster_members = 0;
9778 + /* clear state from any previous attempt */
9779 + release_sevent(sev);
9781 + list_for_each_entry(node, &sm_members, list) {
9782 + if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
9783 + cluster_members++;
9786 + sev->se_node_count = cluster_members;
9787 + sev->se_memb_count = sev->se_sg->memb_count;
9790 + * When joining, we need a node array the size of the entire cluster
9791 + * member list because we get responses from all nodes. When leaving,
9792 + * we only get responses from SG members, so the node array need only
9796 + if (sev->se_state < SEST_LEAVE_BEGIN)
9797 + count = sev->se_node_count;
9799 + count = sev->se_memb_count;
9801 + len1 = count * sizeof(uint32_t);
9802 + sev->se_len_ids = len1;
9804 + sev->se_node_ids = (uint32_t *) kmalloc(len1, GFP_KERNEL);
9805 + if (!sev->se_node_ids)
9808 + len2 = count * sizeof (char);
9809 + sev->se_len_status = len2;
9811 + sev->se_node_status = (char *) kmalloc(len2, GFP_KERNEL);
9812 + if (!sev->se_node_status)
9815 + memset(sev->se_node_status, 0, len2);
9816 + memset(sev->se_node_ids, 0, len1);
9821 + kfree(sev->se_node_ids);
9822 + sev->se_node_ids = NULL;
9823 + sev->se_len_ids = 0;
9829 +/* Context: timer */
9831 +static void sev_restart(unsigned long data)
9833 + sm_sevent_t *sev = (sm_sevent_t *) data;
9835 + clear_bit(SEFL_DELAY, &sev->se_flags);
9836 + set_bit(SEFL_CHECK, &sev->se_flags);
9837 + wake_serviced(DO_JOINLEAVE);
9840 +static void schedule_sev_restart(sm_sevent_t *sev)
9842 + init_timer(&sev->se_restart_timer);
9843 + sev->se_restart_timer.function = sev_restart;
9844 + sev->se_restart_timer.data = (long) sev;
9845 + mod_timer(&sev->se_restart_timer, jiffies + (RETRY_DELAY * HZ));
9848 +void free_sg_memb(sm_group_t *sg)
9852 + while (!list_empty(&sg->memb)) {
9853 + node = list_entry(sg->memb.next, sm_node_t, list);
9854 + list_del(&node->list);
9857 + sg->memb_count = 0;
9861 + * 1. First step in joining a SG - send a message to all nodes in the cluster
9862 + * asking to join the named SG. If any nodes are members they will reply with
9863 + * a POS, or a WAIT (wait means try again, only one node can join at a time).
9864 + * If no one knows about this SG, they all send NEG replies which means we form
9865 + * the SG with just ourself as a member.
9868 +static int send_join_notice(sm_sevent_t *sev)
9870 + sm_group_t *sg = sev->se_sg;
9873 + int i = 0, error, namelen, len = 0;
9876 + * Create node array from member list in which to collect responses.
9879 + error = init_sevent(sev);
9883 + list_for_each_entry(node, &sm_members, list) {
9884 + if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
9885 + sev->se_node_ids[i++] = node->id;
9889 + * Create and send a join request message.
9891 + * Other nodes then run process_join_request and reply to us; we
9892 + * collect the responses in process_reply and check them in
9893 + * check_join_notice.
9896 + namelen = sg->namelen;
9897 + msg = create_smsg(sg, SMSG_JOIN_REQ, namelen, &len, sev);
9898 + memcpy(msg + sizeof(sm_msg_t), sg->name, namelen);
9900 + error = send_broadcast_message_sev(msg, len, sev);
9907 + * 2. Second step in joining a SG - after we collect all replies to our join
9908 + * request, we look at them. If anyone told us to wait, we'll wait a while, go
9909 + * back and start at step 1 again.
9912 +static int check_join_notice(sm_sevent_t *sev)
9914 + int pos = 0, wait = 0, neg = 0, restart = 0, i, error = 0;
9916 + for (i = 0; i < sev->se_node_count; i++) {
9917 + switch (sev->se_node_status[i]) {
9919 + /* this node is in the SG and will be in new proposed
9925 + /* this node is in the SG but something else is
9926 + * happening with it at the moment. */
9931 + /* this node has no record of the SG we're interested
9935 + if (sev->se_node_ids[i] == sm_our_nodeid)
9936 + sev->se_node_status[i] = STATUS_POS;
9940 + /* we didn't get a valid response from this node,
9941 + * restart the entire sev. */
9947 + if (pos && !wait && !restart) {
9948 + /* all current members of this sg pos'ed our entry */
9949 + } else if (!pos && !wait && !restart && neg) {
9950 + /* we're the first in the cluster to join this sg */
9951 + sev->se_sg->global_id = sm_new_global_id(sev->se_sg->level);
9959 + * 3. Third step in joining the SG - tell the nodes that are already members
9960 + * to "stop" the service. We stop them so that everyone can restart with the
9961 + * new member (us!) added.
9964 +static int send_join_stop(sm_sevent_t *sev)
9966 + sm_group_t *sg = sev->se_sg;
9969 + uint32_t be_count;
9970 + int i, len = 0, error = 0;
9973 + * Form the SG memb list with us in it.
9976 + for (i = 0; i < sev->se_node_count; i++) {
9977 + if (sev->se_node_status[i] != STATUS_POS)
9980 + node = sm_new_node(sev->se_node_ids[i]);
9984 + list_add_tail(&node->list, &sg->memb);
9989 + * Re-init the node vector in which to collect responses again.
9992 + sev->se_memb_count = sg->memb_count;
9994 + memset(sev->se_node_status, 0, sev->se_len_status);
9995 + memset(sev->se_node_ids, 0, sev->se_len_ids);
9998 + list_for_each_entry(node, &sg->memb, list)
9999 + sev->se_node_ids[i++] = node->id;
10002 + * Create and send a stop message.
10004 + * Other nodes then run process_stop_request and process_join_stop and
10005 + * reply to us. They stop the sg we're trying to join if they agree.
10006 + * We collect responses in process_reply and check them in
10007 + * check_join_stop.
10010 + msg = create_smsg(sg, SMSG_JSTOP_REQ, sizeof(uint32_t), &len, sev);
10011 + be_count = cpu_to_be32(sg->memb_count);
10012 + memcpy(msg + sizeof(sm_msg_t), &be_count, sizeof(uint32_t));
10014 + error = send_members_message_sev(sg, msg, len, sev);
10021 + free_sg_memb(sg);
10026 + * 4. Fourth step in joining the SG - after we collect replies to our stop
10027 + * request, we look at them. Everyone sending POS agrees with us joining and
10028 + * has stopped their SG. If some nodes sent NEG, something is wrong and we
10029 + * don't have a good way to address that yet since some nodes may have sent
10032 + * FIXME: even nodes replying with NEG should stop their SG so we can send an
10033 + * abort and have everyone at the same place to start from again.
10036 +static int check_join_stop(sm_sevent_t *sev)
10038 + sm_group_t *sg = sev->se_sg;
10039 + int i, pos = 0, neg = 0;
10041 + for (i = 0; i < sev->se_memb_count; i++) {
10042 + switch (sev->se_node_status[i]) {
10048 + log_error(sg, "check_join_stop: neg from nodeid %u "
10049 + "(%d, %d, %u)", sev->se_node_ids[i],
10050 + pos, neg, sev->se_memb_count);
10055 + log_error(sg, "check_join_stop: unknown status=%u "
10056 + "nodeid=%u", sev->se_node_status[i],
10057 + sev->se_node_ids[i]);
10063 + if (pos == sg->memb_count)
10066 + free_sg_memb(sg);
10071 + * 5. Fifth step in joining the SG - everyone has stopped their service and we
10072 + * all now start the service with us, the new member, added to the SG member
10073 + * list. We send start to our own service here and send a message to the other
10074 + * members that they should also start their service.
10077 +static int send_join_start(sm_sevent_t *sev)
10079 + sm_group_t *sg = sev->se_sg;
10083 + int error, count = 0, len = 0;
10086 + * Create a start message and send it.
10089 + msg = create_smsg(sg, SMSG_JSTART_CMD, 0, &len, sev);
10091 + error = send_members_message(sg, msg, len);
10096 + * Start the service ourself. The chunk of memory with the member ids
10097 + * must be freed by the service when it is done with it.
10100 + SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
10103 + list_for_each_entry(node, &sg->memb, list)
10104 + memb[count++] = node->id;
10106 + set_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
10108 + sg->ops->start(sg->service_data, memb, count, sev->se_id,
10109 + SERVICE_NODE_JOIN);
10113 + free_sg_memb(sg);
10118 + * 6. Sixth step in joining the SG - once the service has completed its start,
10119 + * it does a kcl_start_done() to signal us that it's done. That gets us here
10120 + * and we do a barrier with all other members which join the barrier when their
10121 + * service is done starting.
10124 +static int startdone_barrier_new(sm_sevent_t *sev)
10126 + sm_group_t *sg = sev->se_sg;
10127 + char bname[MAX_BARRIER_NAME_LEN];
10130 + memset(bname, 0, MAX_BARRIER_NAME_LEN);
10131 + sev->se_barrier_status = -1;
10133 + set_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
10135 + /* If we're the only member, skip the barrier */
10136 + if (sg->memb_count == 1) {
10137 + process_startdone_barrier_new(sg, 0);
10141 + snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
10142 + sg->global_id, sm_our_nodeid, sev->se_id, sg->memb_count);
10144 + error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE_NEW);
10151 + clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
10152 + sg->ops->stop(sg->service_data);
10153 + free_sg_memb(sg);
10158 + * 7. Seventh step in joining the SG - check that the barrier we joined with
10159 + * all other members returned with a successful status.
10162 +static int check_startdone_barrier_new(sm_sevent_t *sev)
10164 + sm_group_t *sg = sev->se_sg;
10165 + int error = sev->se_barrier_status;
10168 + sg->ops->stop(sg->service_data);
10169 + free_sg_memb(sg);
10175 + * 8. Eigth step in joining the SG - send the service a "finish" indicating
10176 + * that all members have successfully started the service.
10179 +static void do_finish_new(sm_sevent_t *sev)
10181 + sm_group_t *sg = sev->se_sg;
10183 + sg->state = SGST_RUN;
10184 + sg->sevent = NULL;
10185 + clear_bit(SGFL_SEVENT, &sg->flags);
10187 + sg->ops->finish(sg->service_data, sev->se_id);
10191 + * 9. Ninth step in joining the SG - it's done so get rid of the sevent stuff
10192 + * and tell the process which initiated the join that it's done.
10195 +static void sevent_done(sm_sevent_t *sev)
10197 + sm_group_t *sg = sev->se_sg;
10199 + list_del(&sev->se_list);
10200 + release_sevent(sev);
10202 + complete(&sg->event_comp);
10206 + * Move through the steps of a join. Summary:
10208 + * 1. Send a join notice to all cluster members.
10209 + * 2. Collect and check replies to the join notice.
10210 + * 3. Send a stop message to all SG members.
10211 + * 4. Collect and check replies to the stop message.
10212 + * 5. Send a start message to all SG members and start service ourself.
10213 + * 6. Use barrier to wait for all nodes to complete the start.
10214 + * 7. Check that all SG members joined the barrier.
10215 + * 8. Send finish to the service indicating that all nodes started it.
10216 + * 9. Clean up sevent and signal completion to the process that started the join
10219 +static void process_join_sevent(sm_sevent_t *sev)
10224 + * We may cancel the current join attempt if another node is also
10225 + * attempting to join or leave. (Only a single node can join or leave
10226 + * at once.) If cancelled, 0ur join attempt will be restarted later.
10229 + if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
10234 + log_debug(sev->se_sg, "sevent state %u", sev->se_state);
10236 + switch (sev->se_state) {
10239 + * An sevent is created in kcl_join_service with a state of
10243 + case SEST_JOIN_BEGIN:
10244 + sev->se_state = SEST_JOIN_ACKWAIT;
10245 + error = send_join_notice(sev);
10249 + * se_state is changed from JOIN_ACKWAIT to JOIN_ACKED in
10250 + * process_reply (when all the replies have been received)
10253 + case SEST_JOIN_ACKED:
10254 + error = check_join_notice(sev);
10258 + sev->se_state = SEST_JSTOP_ACKWAIT;
10259 + error = send_join_stop(sev);
10263 + * se_state is changed from JSTOP_ACKWAIT to JSTOP_ACKED in
10264 + * proces_reply (when all the replies have been received)
10267 + case SEST_JSTOP_ACKED:
10268 + error = check_join_stop(sev);
10272 + sev->se_state = SEST_JSTART_SERVICEWAIT;
10273 + error = send_join_start(sev);
10277 + * se_state is changed from JSTART_SERVICEWAIT to
10278 + * JSTART_SERVICEDONE in kcl_start_done
10281 + case SEST_JSTART_SERVICEDONE:
10282 + sev->se_state = SEST_BARRIER_WAIT;
10283 + error = startdone_barrier_new(sev);
10287 + * se_state is changed from BARRIER_WAIT to BARRIER_DONE in
10288 + * process_startdone_barrier_new
10291 + case SEST_BARRIER_DONE:
10292 + error = check_startdone_barrier_new(sev);
10296 + do_finish_new(sev);
10297 + sevent_done(sev);
10301 + log_error(sev->se_sg, "no join processing for state %u",
10307 + /* restart the sevent from the beginning */
10308 + log_debug(sev->se_sg, "process_join error %d %lx", error,
10310 + sev->se_state = SEST_JOIN_BEGIN;
10311 + sev->se_sg->global_id = 0;
10312 + set_bit(SEFL_DELAY, &sev->se_flags);
10313 + schedule_sev_restart(sev);
10318 + * 1. First step in leaving an SG - send a message to other SG members asking
10319 + * to leave the SG. Nodes that don't have another active sevent or uevent for
10320 + * this SG will return POS.
10323 +static int send_leave_notice(sm_sevent_t *sev)
10325 + sm_group_t *sg = sev->se_sg;
10328 + int i = 0, error = -1, len = 0;
10331 + * Create a node array from member list in which to collect responses.
10334 + error = init_sevent(sev);
10338 + list_for_each_entry(node, &sg->memb, list)
10339 + sev->se_node_ids[i++] = node->id;
10342 + * Create and send a leave request message.
10345 + msg = create_smsg(sg, SMSG_LEAVE_REQ, 0, &len, sev);
10347 + error = send_members_message_sev(sg, msg, len, sev);
10354 + * 2. Second step in leaving an SG - after we collect all replies to our leave
10355 + * request, we look at them. If anyone replied with WAIT, we abort our attempt
10356 + * at leaving and try again in a bit.
10359 +static int check_leave_notice(sm_sevent_t *sev)
10361 + int pos = 0, wait = 0, neg = 0, restart = 0, i;
10363 + for (i = 0; i < sev->se_memb_count; i++) {
10364 + switch (sev->se_node_status[i]) {
10369 + case STATUS_WAIT:
10378 + /* we didn't get a valid response from this node,
10379 + * restart the entire sev. */
10385 + /* all members approve */
10386 + if (pos && !wait && !restart)
10393 + * 3. Third step in leaving the SG - tell the member nodes to "stop" the SG.
10394 + * They must be stopped in order to restart without us as a member.
10397 +static int send_leave_stop(sm_sevent_t *sev)
10399 + sm_group_t *sg = sev->se_sg;
10401 + int error, len = 0;
10404 + * Re-init the status vector in which to collect responses.
10407 + memset(sev->se_node_status, 0, sev->se_len_status);
10410 + * Create and send a stop message.
10413 + msg = create_smsg(sg, SMSG_LSTOP_REQ, 0, &len, sev);
10415 + error = send_members_message_sev(sg, msg, len, sev);
10420 + * we and all others stop the SG now
10423 + sg->ops->stop(sg->service_data);
10430 + * 4. Fourth step in leaving the SG - check the replies to our stop request.
10431 + * Same problem with getting different replies as check_join_stop.
10434 +static int check_leave_stop(sm_sevent_t *sev)
10436 + sm_group_t *sg = sev->se_sg;
10437 + int i, pos = 0, neg = 0;
10439 + for (i = 0; i < sev->se_memb_count; i++) {
10440 + switch (sev->se_node_status[i]) {
10446 + log_error(sg, "check_leave_stop: fail from nodeid %u "
10447 + "(%d, %d, %u)", sev->se_node_ids[i],
10448 + pos, neg, sev->se_memb_count);
10453 + log_error(sg, "check_leave_stop: status %u nodeid %u",
10454 + sev->se_node_status[i], sev->se_node_ids[i]);
10460 + if (pos == sg->memb_count)
10467 + * 5. Fifth step in leaving the SG - tell the other SG members to restart the
10468 + * service without us. We, of course, don't start our own stopped service. If
10469 + * we're the last SG member and leaving, we jump right to the next step.
10472 +static int send_leave_start(sm_sevent_t *sev)
10474 + sm_group_t *sg = sev->se_sg;
10476 + int error = 0, len = 0;
10478 + if (sg->memb_count == 1) {
10479 + sev->se_state = SEST_LSTART_REMOTEDONE;
10480 + set_bit(SEFL_CHECK, &sev->se_flags);
10481 + wake_serviced(DO_JOINLEAVE);
10483 + msg = create_smsg(sg, SMSG_LSTART_CMD, 0, &len, sev);
10484 + error = send_members_message(sg, msg, len);
10490 + * Move through the steps of a leave. Summary:
10492 + * 1. Send a leave notice to all SG members.
10493 + * 2. Collect and check replies to the leave notice.
10494 + * 3. Send a stop message to all SG members and stop our own SG.
10495 + * 4. Collect and check replies to the stop message.
10496 + * 5. Send a start message to SG members.
10497 + * 6. Clean up sevent and signal completion to the process that
10498 + * started the leave.
10501 +static void process_leave_sevent(sm_sevent_t *sev)
10506 + * We may cancel the current leave attempt if another node is also
10507 + * attempting to join or leave. (Only a single node can join or leave
10508 + * at once.) Our leave attempt will be restarted after being
10512 + if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
10517 + if (test_bit(SGFL_UEVENT, &sev->se_sg->flags)) {
10522 + if (!list_empty(&sev->se_sg->joining)) {
10527 + log_debug(sev->se_sg, "sevent state %u", sev->se_state);
10529 + switch (sev->se_state) {
10532 + * An sevent is created in kcl_leave_service with a state of
10536 + case SEST_LEAVE_BEGIN:
10537 + sev->se_state = SEST_LEAVE_ACKWAIT;
10538 + error = send_leave_notice(sev);
10542 + * se_state is changed from LEAVE_ACKWAIT to LEAVE_ACKED in
10543 + * process_reply (when all the replies have been received)
10546 + case SEST_LEAVE_ACKED:
10547 + error = check_leave_notice(sev);
10551 + sev->se_state = SEST_LSTOP_ACKWAIT;
10552 + error = send_leave_stop(sev);
10556 + * se_state is changed from LSTOP_ACKWAIT to LSTOP_ACKED in
10560 + case SEST_LSTOP_ACKED:
10561 + error = check_leave_stop(sev);
10565 + sev->se_state = SEST_LSTART_WAITREMOTE;
10566 + error = send_leave_start(sev);
10570 + * se_state is changed from LSTART_WAITREMOTE to
10571 + * LSTART_REMOTEDONE in process_leave_done
10574 + case SEST_LSTART_REMOTEDONE:
10575 + sevent_done(sev);
10579 + log_error(sev->se_sg, "process_leave_sevent state=%u",
10585 + log_debug(sev->se_sg, "process_leave error %d %lx", error,
10587 + /* restart the sevent from the beginning */
10588 + sev->se_state = SEST_LEAVE_BEGIN;
10589 + set_bit(SEFL_DELAY, &sev->se_flags);
10590 + schedule_sev_restart(sev);
10595 + * Sevent backout code. Take appropriate steps when a recovery occurs while
10596 + * we're in the midst of an sevent. The recovery may or may not affect the
10597 + * sevent. If it does, it usually means cancelling the sevent and restarting
10598 + * it from the beginning once the recovery processing is done.
10602 + * If any of the nodes that replied with OK is dead, we give up on the current
10603 + * join attempt and restart. Otherwise, this sevent can continue.
10606 +static int backout_join_acked(sm_sevent_t *sev)
10611 + for (i = 0; i < sev->se_node_count; i++) {
10612 + if (sev->se_node_status[i] != STATUS_POS)
10615 + list_for_each_entry(node, &sm_members, list) {
10616 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags) &&
10617 + (node->id == sev->se_node_ids[i]))
10625 + * In this state our sg member list exists and mark_affected_sgs() will have
10626 + * set NEED_RECOVERY if any of the nodes in the sg we're joining is dead. We
10627 + * restart the join process if this is the case, otherwise this sevent can
10631 +static int backout_jstop_ackwait(sm_sevent_t *sev)
10633 + sm_group_t *sg = sev->se_sg;
10635 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10638 + clear_bit(SEFL_ALLOW_JSTOP, &sev->se_flags);
10639 + free_sg_memb(sg);
10644 + * Same as previous.
10647 +static int backout_jstop_acked(sm_sevent_t *sev)
10649 + return backout_jstop_ackwait(sev);
10653 + * If NEED_RECOVERY is set a member of the sg we're joining died while we were
10654 + * starting our service. The recovery process will restart the service on all
10655 + * the prior sg members (not including those that died or us). We will
10656 + * reattempt our join which should be accepted once the nodes are done with
10660 +static int backout_jstart_servicewait(sm_sevent_t *sev)
10662 + sm_group_t *sg = sev->se_sg;
10664 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10667 + clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
10668 + sg->ops->stop(sg->service_data);
10669 + free_sg_memb(sg);
10674 + * Same as previous.
10677 +static int backout_jstart_servicedone(sm_sevent_t *sev)
10679 + return backout_jstart_servicewait(sev);
10683 + * If NEED_RECOVERY is set a member of the sg we're joining died while we were
10684 + * waiting on the "all done" barrier. Stop our service that we just started
10685 + * and cancel the barrier. The recovery process will restart the service on
10686 + * all the prior sg members (not including those that died or us). We will
10687 + * reattempt our join which should be accepted once the nodes are done with
10691 +static int backout_barrier_wait(sm_sevent_t *sev)
10693 + sm_group_t *sg = sev->se_sg;
10694 + char bname[MAX_BARRIER_NAME_LEN];
10696 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10699 + clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
10701 + sg->ops->stop(sg->service_data);
10703 + memset(bname, 0, MAX_BARRIER_NAME_LEN);
10704 + snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
10705 + sg->global_id, sm_our_nodeid, sev->se_id,
10707 + kcl_barrier_cancel(bname);
10709 + free_sg_memb(sg);
10714 + * If NEED_RECOVERY is set, a member of the sg we just joined has failed. The
10715 + * recovery began after the barrier callback. If the result in the callback is
10716 + * "success" then we are joined, this sevent is finished and we'll process the
10717 + * sg within the forthcoming recovery with the other members.
10719 + * We rely upon cnxman to guarantee that once all nodes have joined a barrier,
10720 + * all nodes will receive the corresponding barrier callback *before any*
10721 + * receive an sm_member_update() due to one of those nodes failing just after
10722 + * joining the barrier. If some nodes receive the sm_member_update() before
10723 + * the barrier callback and others receive the barrier callback before the
10724 + * sm_member_update() then they will disagree as to whether the node joining/
10725 + * leaving is in/out of the sg.
10728 +static int backout_barrier_done(sm_sevent_t *sev)
10730 + sm_group_t *sg = sev->se_sg;
10732 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10735 + if (!sev->se_barrier_status) {
10736 + do_finish_new(sev);
10737 + sevent_done(sev);
10740 + sg->ops->stop(sg->service_data);
10741 + free_sg_memb(sg);
10747 + * We've done nothing yet, just restart when recovery is done (if sg is flagged
10748 + * with recovery.)
10751 +static int backout_leave_begin(sm_sevent_t *sev)
10753 + sm_group_t *sg = sev->se_sg;
10755 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10762 + * Ignore any replies to our leave notice and restart when recovery is done (if
10763 + * sg is flagged with recovery.)
10766 +static int backout_leave_ackwait(sm_sevent_t *sev)
10768 + sm_group_t *sg = sev->se_sg;
10770 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10773 + clear_bit(SEFL_ALLOW_LEAVE, &sev->se_flags);
10779 + * Same as previous.
10782 +static int backout_leave_acked(sm_sevent_t *sev)
10784 + return backout_leave_ackwait(sev);
10788 + * Ignore any stop replies. All the members will be stopped anyway to do the
10789 + * recovery. Let that happen and restart our leave when done.
10792 +static int backout_lstop_ackwait(sm_sevent_t *sev)
10794 + sm_group_t *sg = sev->se_sg;
10796 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10799 + clear_bit(SEFL_ALLOW_LSTOP, &sev->se_flags);
10805 + * Same as previous.
10808 +static int backout_lstop_acked(sm_sevent_t *sev)
10810 + return backout_lstop_ackwait(sev);
10814 + * All members will be stopped due to recovery and restarted by recovery
10815 + * processing. That includes us, we have to retry the leave once the recovery
10819 +static int backout_lstart_waitremote(sm_sevent_t *sev)
10821 + sm_group_t *sg = sev->se_sg;
10823 + if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
10830 + * Reset an sevent to its beginning so it can be restarted. This is necessary
10831 + * when recovery affects an SG while we're trying to join or leave (ie. a node
10832 + * in the SG fails).
10835 +void backout_sevents(void)
10837 + sm_sevent_t *sev, *safe;
10840 + list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
10844 + log_debug(sev->se_sg, "backout sevent state %u", sev->se_state);
10846 + switch (sev->se_state) {
10848 + /* backout after kcl_join_service and before
10849 + * send_join_notice */
10850 + case SEST_JOIN_BEGIN:
10853 + /* backout after send_join_notice and before final
10854 + * process_reply */
10855 + case SEST_JOIN_ACKWAIT:
10856 + clear_bit(SEFL_ALLOW_JOIN, &sev->se_flags);
10857 + sev->se_state = SEST_JOIN_BEGIN;
10858 + set_bit(SEFL_CHECK, &sev->se_flags);
10859 + wake_serviced(DO_JOINLEAVE);
10862 + /* backout after final process_reply and before
10863 + * check_join_notice */
10864 + case SEST_JOIN_ACKED:
10865 + delay = backout_join_acked(sev);
10868 + /* backout after send_join_stop and before final
10869 + * process_reply */
10870 + case SEST_JSTOP_ACKWAIT:
10871 + delay = backout_jstop_ackwait(sev);
10874 + /* backout after final process_reply and before
10875 + * check_join_stop */
10876 + case SEST_JSTOP_ACKED:
10877 + delay = backout_jstop_acked(sev);
10880 + /* backout after send_join_start and before
10881 + * kcl_start_done */
10882 + case SEST_JSTART_SERVICEWAIT:
10883 + delay = backout_jstart_servicewait(sev);
10886 + /* backout after kcl_start_done and before
10887 + * startdone_barrier_new */
10888 + case SEST_JSTART_SERVICEDONE:
10889 + delay = backout_jstart_servicedone(sev);
10892 + /* backout after startdone_barrier_new and before
10893 + * callback_startdone_barrier_new */
10894 + case SEST_BARRIER_WAIT:
10895 + delay = backout_barrier_wait(sev);
10898 + /* backout after callback_startdone_barrier_new and
10899 + * before check_startdone_barrier_new */
10900 + case SEST_BARRIER_DONE:
10901 + delay = backout_barrier_done(sev);
10904 + /* backout after kcl_leave_service and before
10905 + * send_leave_notice */
10906 + case SEST_LEAVE_BEGIN:
10907 + delay = backout_leave_begin(sev);
10910 + /* backout after send_leave_notice and before final
10911 + * process_reply */
10912 + case SEST_LEAVE_ACKWAIT:
10913 + delay = backout_leave_ackwait(sev);
10916 + /* backout after final process_reply and before
10917 + * check_leave_notice */
10918 + case SEST_LEAVE_ACKED:
10919 + delay = backout_leave_acked(sev);
10922 + /* backout after send_leave_stop and before final
10923 + * process_reply */
10924 + case SEST_LSTOP_ACKWAIT:
10925 + delay = backout_lstop_ackwait(sev);
10928 + /* backout after final process_reply and before
10929 + * check_leave_stop */
10930 + case SEST_LSTOP_ACKED:
10931 + delay = backout_lstop_acked(sev);
10934 + /* backout after send_leave_start and before
10935 + * process_lstart_done */
10936 + case SEST_LSTART_WAITREMOTE:
10937 + delay = backout_lstart_waitremote(sev);
10940 + /* backout after process_lstart_done and before
10941 + * process_leave_sevent */
10942 + case SEST_LSTART_REMOTEDONE:
10943 + sevent_done(sev);
10948 + log_error(sev->se_sg, "backout_sevents: bad state %d",
10953 + if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
10954 + sev->se_state = SEST_LEAVE_BEGIN;
10955 + set_bit(SEFL_DELAY_RECOVERY, &sev->se_flags);
10956 + set_bit(SEFL_CHECK, &sev->se_flags);
10957 + wake_serviced(DO_JOINLEAVE);
10959 + sev->se_state = SEST_JOIN_BEGIN;
10960 + set_bit(SEFL_CHECK, &sev->se_flags);
10961 + wake_serviced(DO_JOINLEAVE);
10967 +void process_joinleave(void)
10969 + sm_sevent_t *sev = NULL, *safe;
10971 + spin_lock(&new_event_lock);
10972 + if (!list_empty(&new_event)) {
10973 + sev = list_entry(new_event.next, sm_sevent_t, se_list);
10974 + list_del(&sev->se_list);
10975 + list_add_tail(&sev->se_list, &joinleave_events);
10976 + set_bit(SEFL_CHECK, &sev->se_flags);
10978 + spin_unlock(&new_event_lock);
10980 + list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
10981 + if (!test_and_clear_bit(SEFL_CHECK, &sev->se_flags))
10984 + if (test_bit(SEFL_DELAY, &sev->se_flags) ||
10985 + test_bit(SEFL_DELAY_RECOVERY, &sev->se_flags))
10988 + if (sev->se_state < SEST_LEAVE_BEGIN)
10989 + process_join_sevent(sev);
10991 + process_leave_sevent(sev);
10994 diff -urN linux-orig/cluster/cman/sm_joinleave.h linux-patched/cluster/cman/sm_joinleave.h
10995 --- linux-orig/cluster/cman/sm_joinleave.h 1970-01-01 07:30:00.000000000 +0730
10996 +++ linux-patched/cluster/cman/sm_joinleave.h 2004-11-03 11:37:37.000000000 +0800
10998 +/******************************************************************************
10999 +*******************************************************************************
11001 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11002 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11004 +** This copyrighted material is made available to anyone wishing to use,
11005 +** modify, copy, or redistribute it subject to the terms and conditions
11006 +** of the GNU General Public License v.2.
11008 +*******************************************************************************
11009 +******************************************************************************/
11011 +#ifndef __SM_JOINLEAVE_DOT_H__
11012 +#define __SM_JOINLEAVE_DOT_H__
11014 +void init_joinleave(void);
11015 +void new_joinleave(sm_sevent_t *sev);
11016 +void process_joinleave(void);
11017 +void backout_sevents(void);
11018 +sm_sevent_t *find_sevent(unsigned int id);
11021 diff -urN linux-orig/cluster/cman/sm_membership.c linux-patched/cluster/cman/sm_membership.c
11022 --- linux-orig/cluster/cman/sm_membership.c 1970-01-01 07:30:00.000000000 +0730
11023 +++ linux-patched/cluster/cman/sm_membership.c 2004-11-03 11:37:37.000000000 +0800
11025 +/******************************************************************************
11026 +*******************************************************************************
11028 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11029 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11031 +** This copyrighted material is made available to anyone wishing to use,
11032 +** modify, copy, or redistribute it subject to the terms and conditions
11033 +** of the GNU General Public License v.2.
11035 +*******************************************************************************
11036 +******************************************************************************/
11040 +extern struct list_head sm_members;
11043 + * Routines for SG members to handle other nodes joining or leaving the SG.
11044 + * These "uevent" membership update routines are the response to an "sevent" on
11045 + * a joining/leaving node.
11048 +static void del_memb_node(sm_group_t *sg, uint32_t nodeid)
11052 + list_for_each_entry(node, &sg->memb, list) {
11053 + if (node->id != nodeid)
11055 + list_del(&node->list);
11057 + sg->memb_count--;
11058 + log_debug(sg, "del node %u count %d", nodeid, sg->memb_count);
11063 +static void add_memb_node(sm_group_t *sg, sm_node_t *node)
11065 + list_add_tail(&node->list, &sg->memb);
11066 + sg->memb_count++;
11067 + log_debug(sg, "add node %u count %d", node->id, sg->memb_count);
11071 + * Join 1. The receive end of send_join_stop() from a node requesting to join
11072 + * the SG. We stop the service so it can be restarted with the new node.
11075 +static int process_join_stop(sm_group_t *sg)
11077 + sm_uevent_t *uev = &sg->uevent;
11082 + if (uev->ue_num_nodes != sg->memb_count + 1) {
11083 + log_error(sg, "process_join_stop: bad num nodes %u %u",
11084 + uev->ue_num_nodes, sg->memb_count);
11088 + sm_set_event_id(&uev->ue_id);
11090 + node = sm_find_joiner(sg, uev->ue_nodeid);
11091 + SM_ASSERT(node,);
11093 + sg->state = SGST_UEVENT;
11094 + sg->ops->stop(sg->service_data);
11096 + reply.ms_type = SMSG_JSTOP_REP;
11097 + reply.ms_status = STATUS_POS;
11098 + reply.ms_sevent_id = uev->ue_remote_seid;
11099 + smsg_bswap_out(&reply);
11101 + error = send_nodeid_message((char *) &reply, sizeof(reply),
11109 + * Join 2. The receive end of send_join_start() from a node joining the SG.
11110 + * We are re-starting the service with the new member added.
11113 +static int process_join_start(sm_group_t *sg)
11115 + sm_uevent_t *uev = &sg->uevent;
11120 + /* this memory is passed to the service which must free it */
11122 + kmalloc((sg->memb_count + 1) * sizeof(uint32_t), GFP_KERNEL),
11125 + /* transfer joining node from joining list to member list */
11126 + node = sm_find_joiner(sg, uev->ue_nodeid);
11127 + SM_ASSERT(node, printk("nodeid=%u\n", uev->ue_nodeid););
11128 + list_del(&node->list);
11129 + add_memb_node(sg, node);
11131 + /* the new member list for the service */
11132 + list_for_each_entry(node, &sg->memb, list)
11133 + memb[count++] = node->id;
11135 + set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11137 + sg->ops->start(sg->service_data, memb, count, uev->ue_id,
11138 + SERVICE_NODE_JOIN);
11143 + * Join 3. When done starting their local service, every previous SG member
11144 + * calls startdone_barrier() and the new/joining member calls
11145 + * startdone_barrier_new(). The barrier returns when everyone has started
11146 + * their service and joined the barrier.
11149 +static int startdone_barrier(sm_group_t *sg)
11151 + sm_uevent_t *uev = &sg->uevent;
11152 + char bname[MAX_BARRIER_NAME_LEN];
11155 + memset(bname, 0, MAX_BARRIER_NAME_LEN);
11156 + uev->ue_barrier_status = -1;
11158 + set_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
11160 + /* If we're the only member, skip the barrier */
11161 + if (sg->memb_count == 1) {
11162 + process_startdone_barrier(sg, 0);
11166 + snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
11167 + sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
11170 + error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE);
11176 + * Join 4. Check that the "all started" barrier returned a successful status.
11177 + * The newly joined member calls check_startdone_barrier_new().
11180 +static int check_startdone_barrier(sm_group_t *sg)
11182 + int error = sg->uevent.ue_barrier_status;
11187 + * Join 5. Send the service a "finish" indicating that all members have
11188 + * successfully started. The newly joined member calls do_finish_new().
11191 +static void do_finish(sm_group_t *sg)
11193 + sg->state = SGST_RUN;
11194 + clear_bit(SGFL_UEVENT, &sg->flags);
11195 + sg->ops->finish(sg->service_data, sg->uevent.ue_id);
11199 + * Join 6. The uevent is done. If this was a uevent for a node leaving the
11200 + * SG, then send a final message to the departed node signalling that the
11201 + * remaining nodes have restarted since it left.
11204 +static void uevent_done(sm_group_t *sg)
11206 + sm_uevent_t *uev = &sg->uevent;
11209 + if (test_bit(UEFL_LEAVE, &uev->ue_flags)) {
11210 + reply.ms_type = SMSG_LSTART_DONE;
11211 + reply.ms_status = STATUS_POS;
11212 + reply.ms_sevent_id = uev->ue_remote_seid;
11213 + smsg_bswap_out(&reply);
11214 + send_nodeid_message((char *) &reply, sizeof(reply),
11217 + memset(&sg->uevent, 0, sizeof(sm_uevent_t));
11221 + * Leave 1. The receive end of send_leave_stop() from a node leaving the SG.
11224 +static int process_leave_stop(sm_group_t *sg)
11226 + sm_uevent_t *uev = &sg->uevent;
11230 + sm_set_event_id(&uev->ue_id);
11232 + sg->state = SGST_UEVENT;
11233 + sg->ops->stop(sg->service_data);
11235 + reply.ms_type = SMSG_LSTOP_REP;
11236 + reply.ms_status = STATUS_POS;
11237 + reply.ms_sevent_id = uev->ue_remote_seid;
11238 + smsg_bswap_out(&reply);
11240 + error = send_nodeid_message((char *) &reply, sizeof(reply),
11248 + * Leave 2. The receive end of send_leave_start() from a node leaving the SG.
11249 + * We are re-starting the service (without the node that's left naturally.)
11252 +static int process_leave_start(sm_group_t *sg)
11254 + sm_uevent_t *uev = &sg->uevent;
11259 + SM_ASSERT(sg->memb_count > 1,
11260 + printk("memb_count=%u\n", sg->memb_count););
11262 + /* this memory is passed to the service which must free it */
11264 + kmalloc((sg->memb_count - 1) * sizeof(uint32_t), GFP_KERNEL),
11267 + /* remove departed member from sg member list */
11268 + del_memb_node(sg, uev->ue_nodeid);
11270 + /* build member list to pass to service */
11271 + list_for_each_entry(node, &sg->memb, list)
11272 + memb[count++] = node->id;
11274 + /* allow us to accept the start_done callback for this start */
11275 + set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11277 + sg->ops->start(sg->service_data, memb, count, uev->ue_id,
11278 + SERVICE_NODE_LEAVE);
11283 + * Move through the steps of another node joining or leaving the SG.
11286 +static void process_one_uevent(sm_group_t *sg)
11288 + sm_uevent_t *uev = &sg->uevent;
11291 + log_debug(sg, "uevent state %u node %u", uev->ue_state, uev->ue_nodeid);
11293 + switch (uev->ue_state) {
11296 + * a uevent is initialized with state JSTOP in
11297 + * process_stop_request
11301 + uev->ue_state = UEST_JSTART_WAITCMD;
11302 + error = process_join_stop(sg);
11306 + * ue_state is changed from JSTART_WAITCMD to JSTART in
11307 + * process_start_request
11310 + case UEST_JSTART:
11311 + uev->ue_state = UEST_JSTART_SERVICEWAIT;
11312 + error = process_join_start(sg);
11316 + * ue_state is changed from JSTART_SERVICEWAIT to
11317 + * JSTART_SERVICEDONE in kcl_start_done
11320 + case UEST_JSTART_SERVICEDONE:
11321 + uev->ue_state = UEST_BARRIER_WAIT;
11322 + error = startdone_barrier(sg);
11326 + * ue_state is changed from BARRIER_WAIT to BARRIER_DONE in
11327 + * process_startdone_barrier
11330 + case UEST_BARRIER_DONE:
11331 + error = check_startdone_barrier(sg);
11340 + * a uevent is initialized with state LSTOP in
11341 + * process_stop_request
11345 + uev->ue_state = UEST_LSTART_WAITCMD;
11346 + error = process_leave_stop(sg);
11350 + * a uevent is changed from LSTART_WAITCMD to LSTART in
11351 + * process_start_request
11354 + case UEST_LSTART:
11355 + uev->ue_state = UEST_LSTART_SERVICEWAIT;
11356 + error = process_leave_start(sg);
11360 + * a uevent is changed from LSTART_SERVICEWAIT to
11361 + * LSTART_SERVICEDONE in kcl_start_done
11364 + case UEST_LSTART_SERVICEDONE:
11365 + uev->ue_state = UEST_BARRIER_WAIT;
11366 + error = startdone_barrier(sg);
11373 + /* If we encounter an error during these routines, we do nothing,
11374 + expecting that a node failure related to this sg will cause a
11375 + recovery event to arrive and call cancel_one_uevent(). */
11378 + log_error(sg, "process_one_uevent error %d state %u",
11379 + error, uev->ue_state);
11382 +static sm_node_t *failed_memb(sm_group_t *sg, int *count)
11384 + sm_node_t *node, *sm_node, *failed_uev_node = NULL;
11386 + list_for_each_entry(node, &sg->memb, list) {
11388 + sm_node = sm_find_member(node->id);
11389 + SM_ASSERT(sm_node, );
11391 + if (test_bit(SNFL_NEED_RECOVERY, &sm_node->flags)) {
11393 + if (node->id == sg->uevent.ue_nodeid)
11394 + failed_uev_node = sm_node;
11397 + return failed_uev_node;
11400 +static void send_recover_msg(sm_group_t *sg)
11404 + msg = create_smsg(sg, SMSG_RECOVER, 0, &len, NULL);
11405 + send_members_message(sg, msg, len);
11408 +static void cancel_barrier(sm_group_t *sg)
11410 + sm_uevent_t *uev = &sg->uevent;
11411 + char bname[MAX_BARRIER_NAME_LEN];
11413 + clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
11415 + memset(bname, 0, MAX_BARRIER_NAME_LEN);
11416 + snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
11417 + sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
11419 + kcl_barrier_cancel(bname);
11422 +static void cancel_one_uevent(sm_group_t *sg, int *effected)
11424 + sm_uevent_t *uev = &sg->uevent;
11425 + int failed_count;
11426 + sm_node_t *node, *failed_joiner, *failed_leaver;
11428 + log_debug(sg, "cancel uevent state %u node %u", uev->ue_state,
11431 + switch (uev->ue_state) {
11434 + case UEST_JSTART_WAITCMD:
11435 + case UEST_JSTART:
11437 + sg->ops->stop(sg->service_data);
11439 + failed_count = 0;
11440 + failed_joiner = failed_memb(sg, &failed_count);
11441 + SM_ASSERT(!failed_joiner, );
11443 + node = sm_find_member(uev->ue_nodeid);
11444 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11445 + failed_joiner = node;
11447 + if (!failed_count) {
11448 + /* only joining node failed */
11449 + SM_ASSERT(failed_joiner, );
11450 + SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11451 + set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11453 + /* some nodes may not have gotten a JSTOP message
11454 + in which case this will tell them to begin
11455 + recovery for this sg. */
11456 + send_recover_msg(sg);
11459 + /* a member node failed (and possibly joining node, it
11460 + doesn't matter) */
11461 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11464 + clear_bit(SGFL_UEVENT, &sg->flags);
11465 + memset(uev, 0, sizeof(sm_uevent_t));
11469 + case UEST_JSTART_SERVICEWAIT:
11470 + case UEST_JSTART_SERVICEDONE:
11472 + clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11473 + sg->ops->stop(sg->service_data);
11475 + failed_count = 0;
11476 + failed_joiner = failed_memb(sg, &failed_count);
11477 + SM_ASSERT(failed_count, );
11478 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11480 + if (failed_count == 1 && failed_joiner) {
11481 + /* only joining node failed */
11483 + } else if (failed_count && failed_joiner) {
11484 + /* joining node and another member failed */
11487 + /* other member failed, joining node still alive */
11488 + SM_ASSERT(!failed_joiner, );
11489 + del_memb_node(sg, uev->ue_nodeid);
11492 + clear_bit(SGFL_UEVENT, &sg->flags);
11493 + memset(uev, 0, sizeof(sm_uevent_t));
11498 + case UEST_LSTART_WAITCMD:
11499 + case UEST_LSTART:
11501 + sg->ops->stop(sg->service_data);
11503 + failed_count = 0;
11504 + failed_leaver = failed_memb(sg, &failed_count);
11505 + SM_ASSERT(failed_count, );
11506 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11508 + if (failed_count == 1 && failed_leaver) {
11509 + /* only leaving node failed */
11511 + } else if (failed_count && failed_leaver) {
11512 + /* leaving node and another member failed */
11515 + /* other member failed, leaving node still alive */
11516 + SM_ASSERT(!failed_leaver, );
11519 + clear_bit(SGFL_UEVENT, &sg->flags);
11520 + memset(uev, 0, sizeof(sm_uevent_t));
11524 + case UEST_LSTART_SERVICEWAIT:
11525 + case UEST_LSTART_SERVICEDONE:
11527 + clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
11528 + sg->ops->stop(sg->service_data);
11530 + failed_count = 0;
11531 + failed_leaver = failed_memb(sg, &failed_count);
11532 + SM_ASSERT(!failed_leaver, );
11534 + node = sm_find_member(uev->ue_nodeid);
11535 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11536 + failed_leaver = node;
11538 + if (!failed_count) {
11539 + /* only leaving node failed */
11540 + SM_ASSERT(failed_leaver, );
11541 + SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11542 + set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11545 + } else if (failed_count && failed_leaver) {
11546 + /* leaving node and another member failed */
11547 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11550 + /* other member failed, leaving node still alive */
11551 + SM_ASSERT(failed_count, );
11552 + SM_ASSERT(!failed_leaver, );
11553 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11554 + node = sm_new_node(sg->uevent.ue_nodeid);
11555 + add_memb_node(sg, node);
11558 + clear_bit(SGFL_UEVENT, &sg->flags);
11559 + memset(uev, 0, sizeof(sm_uevent_t));
11563 + case UEST_BARRIER_WAIT:
11565 + if (test_bit(UEFL_LEAVE, &uev->ue_flags))
11566 + goto barrier_wait_leave;
11568 + sg->ops->stop(sg->service_data);
11569 + cancel_barrier(sg);
11571 + barrier_wait_join:
11573 + failed_count = 0;
11574 + failed_joiner = failed_memb(sg, &failed_count);
11575 + SM_ASSERT(failed_count, );
11576 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11578 + if (failed_count == 1 && failed_joiner) {
11579 + /* only joining node failed */
11581 + } else if (failed_count && failed_joiner) {
11582 + /* joining node and another member failed */
11585 + /* other member failed, joining node still alive */
11586 + SM_ASSERT(!failed_joiner, );
11587 + del_memb_node(sg, uev->ue_nodeid);
11590 + clear_bit(SGFL_UEVENT, &sg->flags);
11591 + memset(uev, 0, sizeof(sm_uevent_t));
11594 + barrier_wait_leave:
11596 + failed_count = 0;
11597 + failed_leaver = failed_memb(sg, &failed_count);
11598 + SM_ASSERT(!failed_leaver, );
11600 + node = sm_find_member(uev->ue_nodeid);
11601 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11602 + failed_leaver = node;
11604 + if (!failed_count) {
11605 + /* only leaving node failed */
11606 + SM_ASSERT(failed_leaver, );
11607 + SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11608 + set_bit(SGFL_NEED_RECOVERY, &sg->flags);
11611 + } else if (failed_count && failed_leaver) {
11612 + /* leaving node and another member failed */
11613 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11616 + /* other member failed, leaving node still alive */
11617 + SM_ASSERT(failed_count, );
11618 + SM_ASSERT(!failed_leaver, );
11619 + SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
11620 + node = sm_new_node(sg->uevent.ue_nodeid);
11621 + add_memb_node(sg, node);
11624 + clear_bit(SGFL_UEVENT, &sg->flags);
11625 + memset(uev, 0, sizeof(sm_uevent_t));
11629 + case UEST_BARRIER_DONE:
11631 + if (!uev->ue_barrier_status) {
11637 + if (test_bit(UEFL_LEAVE, &uev->ue_flags))
11638 + goto barrier_wait_leave;
11640 + goto barrier_wait_join;
11644 + log_error(sg, "cancel_one_uevent: state %d", uev->ue_state);
11648 +void cancel_uevents(int *effected)
11651 + sm_node_t *node, *sgnode;
11654 + list_for_each_entry(node, &sm_members, list) {
11655 + if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
11659 + * Clear this dead node from the "interested in joining" list
11660 + * of any SG. The node is added to this list before the uevent
11664 + for (i = 0; i < SG_LEVELS; i++) {
11665 + list_for_each_entry(sg, &sm_sg[i], list) {
11666 + sgnode = sm_find_joiner(sg, node->id);
11668 + log_debug(sg, "clear joining node %u",
11670 + list_del(&sgnode->list);
11677 + /* Adjust any uevents in sg's effected by the failed node(s) */
11679 + for (i = 0; i < SG_LEVELS; i++) {
11680 + list_for_each_entry(sg, &sm_sg[i], list) {
11681 + if (!test_bit(SGFL_UEVENT, &sg->flags))
11684 + /* We may have some cancelling to do if this sg is
11685 + flagged as having a failed member, or if a joining
11686 + or leaving node has died. */
11688 + if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
11689 + cancel_one_uevent(sg, effected);
11690 + else if (sg->uevent.ue_nodeid) {
11691 + node = sm_find_member(sg->uevent.ue_nodeid);
11692 + SM_ASSERT(node, );
11693 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
11694 + cancel_one_uevent(sg, effected);
11700 +void process_membership(void)
11705 + down(&sm_sglock);
11707 + for (i = 0; i < SG_LEVELS; i++) {
11708 + list_for_each_entry(sg, &sm_sg[i], list) {
11709 + if (!test_bit(SGFL_UEVENT, &sg->flags))
11712 + if (!test_and_clear_bit(UEFL_CHECK,
11713 + &sg->uevent.ue_flags))
11716 + process_one_uevent(sg);
11721 diff -urN linux-orig/cluster/cman/sm_membership.h linux-patched/cluster/cman/sm_membership.h
11722 --- linux-orig/cluster/cman/sm_membership.h 1970-01-01 07:30:00.000000000 +0730
11723 +++ linux-patched/cluster/cman/sm_membership.h 2004-11-03 11:37:37.000000000 +0800
11725 +/******************************************************************************
11726 +*******************************************************************************
11728 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11729 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11731 +** This copyrighted material is made available to anyone wishing to use,
11732 +** modify, copy, or redistribute it subject to the terms and conditions
11733 +** of the GNU General Public License v.2.
11735 +*******************************************************************************
11736 +******************************************************************************/
11738 +#ifndef __SM_MEMBERSHIP_DOT_H__
11739 +#define __SM_MEMBERSHIP_DOT_H__
11741 +void process_membership(void);
11742 +void cancel_uevents(int *effected);
11745 diff -urN linux-orig/cluster/cman/sm_message.c linux-patched/cluster/cman/sm_message.c
11746 --- linux-orig/cluster/cman/sm_message.c 1970-01-01 07:30:00.000000000 +0730
11747 +++ linux-patched/cluster/cman/sm_message.c 2004-11-03 11:37:37.000000000 +0800
11749 +/******************************************************************************
11750 +*******************************************************************************
11752 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11753 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11755 +** This copyrighted material is made available to anyone wishing to use,
11756 +** modify, copy, or redistribute it subject to the terms and conditions
11757 +** of the GNU General Public License v.2.
11759 +*******************************************************************************
11760 +******************************************************************************/
11764 +#define SMSG_BUF_SIZE (sizeof(sm_msg_t) + MAX_SERVICE_NAME_LEN + 1)
11766 +extern struct socket * sm_socket;
11767 +extern uint32_t sm_our_nodeid;
11768 +static uint32_t global_last_id;
11769 +static struct list_head messages;
11770 +static spinlock_t message_lock;
11771 +static char smsg_buf[SMSG_BUF_SIZE];
11773 +int send_nodeid_message(char *msg, int len, uint32_t nodeid);
11776 + struct list_head list;
11781 +typedef struct rq_entry rq_entry_t;
11783 +void init_messages(void)
11785 + global_last_id = 1;
11786 + INIT_LIST_HEAD(&messages);
11787 + spin_lock_init(&message_lock);
11790 +uint32_t sm_new_global_id(int level)
11792 + uint32_t id = global_last_id++;
11793 + uint8_t l = (uint8_t) level;
11798 + if (id > 0x00FFFFFF)
11805 +static void smsg_copy_in(char *msg, sm_msg_t *smsg)
11807 + sm_msg_t *in = (sm_msg_t *) msg;
11809 + smsg->ms_type = in->ms_type;
11810 + smsg->ms_status = in->ms_status;
11811 + smsg->ms_sevent_id = le16_to_cpu(in->ms_sevent_id);
11812 + smsg->ms_global_sgid = le32_to_cpu(in->ms_global_sgid);
11813 + smsg->ms_global_lastid = le32_to_cpu(in->ms_global_lastid);
11814 + smsg->ms_sglevel = le16_to_cpu(in->ms_sglevel);
11815 + smsg->ms_length = le16_to_cpu(in->ms_length);
11818 +/* swapping bytes in place is an easy source of errors - be careful not to
11819 + * access the fields after calling this */
11821 +void smsg_bswap_out(sm_msg_t *smsg)
11823 + smsg->ms_sevent_id = cpu_to_le16(smsg->ms_sevent_id);
11824 + smsg->ms_global_sgid = cpu_to_le32(smsg->ms_global_sgid);
11825 + smsg->ms_global_lastid = cpu_to_le32(smsg->ms_global_lastid);
11826 + smsg->ms_sglevel = cpu_to_le16(smsg->ms_sglevel);
11827 + smsg->ms_length = cpu_to_le16(smsg->ms_length);
11830 +char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
11831 + sm_sevent_t *sev)
11835 + int fulllen = sizeof(sm_msg_t) + datalen;
11838 + memset(smsg_buf, 0, SMSG_BUF_SIZE);
11839 + SM_ASSERT(fulllen <= SMSG_BUF_SIZE,);
11841 + smsg = (sm_msg_t *) msg;
11842 + smsg->ms_type = type;
11843 + smsg->ms_global_sgid = sg->global_id;
11844 + smsg->ms_sglevel = sg->level;
11845 + smsg->ms_length = datalen;
11846 + smsg->ms_sevent_id = sev ? sev->se_id : 0;
11848 + smsg_bswap_out(smsg);
11849 + *msglen = fulllen;
11853 +static unsigned int msgtype_to_flag(int type)
11855 + unsigned int flag;
11858 + case SMSG_JOIN_REP:
11859 + case SMSG_JOIN_REQ:
11860 + flag = SEFL_ALLOW_JOIN;
11863 + case SMSG_JSTOP_REP:
11864 + case SMSG_JSTOP_REQ:
11865 + flag = SEFL_ALLOW_JSTOP;
11868 + case SMSG_LEAVE_REP:
11869 + case SMSG_LEAVE_REQ:
11870 + flag = SEFL_ALLOW_LEAVE;
11873 + case SMSG_LSTOP_REP:
11874 + case SMSG_LSTOP_REQ:
11875 + flag = SEFL_ALLOW_LSTOP;
11879 + SM_ASSERT(0, printk("msgtype_to_flag bad type %d\n", type););
11884 +static int test_allowed_msgtype(sm_sevent_t *sev, int type)
11886 + unsigned int flag = msgtype_to_flag(type);
11888 + return test_bit(flag, &sev->se_flags);
11891 +static void clear_allowed_msgtype(sm_sevent_t *sev, int type)
11893 + unsigned int flag = msgtype_to_flag(type);
11895 + clear_bit(flag, &sev->se_flags);
11898 +static void set_allowed_msgtype(sm_sevent_t *sev, int type)
11900 + unsigned int flag = msgtype_to_flag(type);
11902 + set_bit(flag, &sev->se_flags);
11905 +static int save_global_id(sm_sevent_t *sev, sm_msg_t *smsg)
11907 + sm_group_t *sg = sev->se_sg;
11909 + if (!smsg->ms_global_sgid) {
11910 + log_error(sg, "save_global_id: zero sg id");
11914 + if (!sg->global_id)
11915 + sg->global_id = smsg->ms_global_sgid;
11917 + if (sg->global_id != smsg->ms_global_sgid) {
11918 + log_error(sg, "save_global_id: id %x", smsg->ms_global_sgid);
11924 +static void save_lastid(sm_msg_t *smsg)
11926 + uint32_t gid = smsg->ms_global_lastid & 0x00FFFFFF;
11929 + * Keep track of the highest SG id which has been used
11930 + * in the cluster in case we need to choose a new SG id.
11933 + if (gid > global_last_id)
11934 + global_last_id = gid;
11937 +static int next_sev_state(int msg_type, int cur_state)
11941 + switch (msg_type) {
11942 + case SMSG_JOIN_REP:
11943 + SM_ASSERT(cur_state == SEST_JOIN_ACKWAIT,);
11944 + next = SEST_JOIN_ACKED;
11947 + case SMSG_JSTOP_REP:
11948 + SM_ASSERT(cur_state == SEST_JSTOP_ACKWAIT,);
11949 + next = SEST_JSTOP_ACKED;
11952 + case SMSG_LEAVE_REP:
11953 + SM_ASSERT(cur_state == SEST_LEAVE_ACKWAIT,);
11954 + next = SEST_LEAVE_ACKED;
11957 + case SMSG_LSTOP_REP:
11958 + SM_ASSERT(cur_state == SEST_LSTOP_ACKWAIT,);
11959 + next = SEST_LSTOP_ACKED;
11966 + * Functions in sevent.c send messages to other nodes and then expect replies.
11967 + * This function collects the replies for the sevent messages and moves the
11968 + * sevent to the next stage when all the expected replies have been received.
11971 +static void process_reply(sm_msg_t *smsg, uint32_t nodeid)
11973 + sm_sevent_t *sev;
11974 + int i, expected, type = smsg->ms_type;
11977 + * Find the relevant sevent.
11980 + sev = find_sevent(smsg->ms_sevent_id);
11982 + log_print("process_reply invalid id=%u nodeid=%u",
11983 + smsg->ms_sevent_id, nodeid);
11988 + * Check if this message type is what this sevent is waiting for.
11991 + if (!test_allowed_msgtype(sev, type)) {
11992 + log_debug(sev->se_sg, "process_reply ignored type=%u nodeid=%u " "id=%u", type, nodeid, sev->se_id);
11997 + (type == SMSG_JOIN_REP) ? sev->se_node_count : sev->se_memb_count;
11999 + SM_ASSERT(expected * sizeof(uint32_t) <= sev->se_len_ids,
12000 + printk("type=%d expected=%d len_ids=%d node_count=%d "
12001 + "memb_count=%d\n", type, expected, sev->se_len_ids,
12002 + sev->se_node_count, sev->se_memb_count););
12004 + SM_ASSERT(expected * sizeof(char) <= sev->se_len_status,
12005 + printk("type=%d expected=%d len_status=%d node_count=%d "
12006 + "memb_count=%d\n", type, expected, sev->se_len_status,
12007 + sev->se_node_count, sev->se_memb_count););
12009 + for (i = 0; i < expected; i++) {
12010 + if (sev->se_node_ids[i] == nodeid) {
12012 + * Save the status from the replying node
12015 + if (!sev->se_node_status[i])
12016 + sev->se_node_status[i] = smsg->ms_status;
12018 + log_error(sev->se_sg, "process_reply duplicate"
12019 + "id=%u nodeid=%u %u/%u",
12020 + sev->se_id, nodeid,
12021 + sev->se_node_status[i],
12022 + smsg->ms_status);
12026 + if (type == SMSG_JOIN_REP) {
12027 + save_lastid(smsg);
12029 + if (smsg->ms_status == STATUS_POS)
12030 + save_global_id(sev, smsg);
12034 + * Signal sm if we have all replies
12037 + if (++sev->se_reply_count == expected) {
12038 + clear_allowed_msgtype(sev, type);
12039 + sev->se_state = next_sev_state(type,
12041 + set_bit(SEFL_CHECK, &sev->se_flags);
12042 + wake_serviced(DO_JOINLEAVE);
12054 + * A node wants to join an SG and has run send_join_notice. If we know nothing
12055 + * about the SG , then we have no objection - send back STATUS_POS. If we're a
12056 + * member of the SG, then send back STATUS_POS (go ahead and join) if there's
12057 + * no sevent or uevent of higher priority in progress (only a single join or
12058 + * leave is permitted for the SG at once). If there happens to be a higher
12059 + * priority sevent/uevent in progress, send back STATUS_WAIT to defer the
12060 + * requested join for a bit.
12063 +static void process_join_request(sm_msg_t *smsg, uint32_t nodeid, char *name)
12065 + sm_group_t *sg = NULL;
12066 + sm_sevent_t *sev = NULL;
12068 + int found = FALSE;
12069 + int level = smsg->ms_sglevel;
12072 + memset(&reply, 0, sizeof(reply));
12074 + down(&sm_sglock);
12076 + if (nodeid == sm_our_nodeid)
12080 + * search SG list for an SG with given name/len
12083 + list_for_each_entry(sg, &sm_sg[level], list) {
12084 + if ((sg->namelen != smsg->ms_length) ||
12085 + memcmp(sg->name, name, sg->namelen))
12092 + * build reply message
12098 + reply.ms_type = SMSG_JOIN_REP;
12099 + reply.ms_status = STATUS_NEG;
12100 + reply.ms_global_lastid = global_last_id;
12101 + reply.ms_sevent_id = smsg->ms_sevent_id;
12103 + reply.ms_type = SMSG_JOIN_REP;
12104 + reply.ms_status = STATUS_POS;
12105 + reply.ms_sevent_id = smsg->ms_sevent_id;
12106 + reply.ms_global_sgid = sg->global_id;
12107 + reply.ms_global_lastid = global_last_id;
12110 + * The node trying to join should wait and try again until
12111 + * we're done with recovery.
12114 + if (sg->state == SGST_RECOVER) {
12115 + reply.ms_status = STATUS_WAIT;
12120 + * An sevent node trying to join may have gotten as far as
12121 + * creating a uevent with us and then backed out. That node
12122 + * will retry joining from the beginning so we should not turn
12123 + * them away. If we're handling a uevent for another node,
12124 + * tell the joining node to wait.
12127 + if (test_bit(SGFL_UEVENT, &sg->flags)) {
12128 + if (sg->uevent.ue_nodeid != nodeid)
12129 + reply.ms_status = STATUS_WAIT;
12134 + * We're trying to join or leave the SG at the moment.
12137 + if (test_bit(SGFL_SEVENT, &sg->flags)) {
12138 + sev = sg->sevent;
12141 + * We're trying to leave. Make the join wait until
12142 + * we've left if we're beyond LEAVE_ACKWAIT.
12145 + if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
12146 + if (sev->se_state > SEST_LEAVE_ACKED)
12147 + reply.ms_status = STATUS_WAIT;
12149 + reply.ms_status = STATUS_POS;
12150 + clear_bit(SEFL_ALLOW_LEAVE,
12152 + set_bit(SEFL_CANCEL, &sev->se_flags);
12157 + * We're trying to join. Make the other join wait
12158 + * until we're joined if we're beyond JOIN_ACKWAIT or
12159 + * if we have a lower id. (Send NEG to allow the other
12160 + * node to go ahead because we're not in the SG.)
12164 + if (sev->se_state > SEST_JOIN_ACKED)
12165 + reply.ms_status = STATUS_WAIT;
12166 + else if (sm_our_nodeid < nodeid)
12167 + reply.ms_status = STATUS_WAIT;
12169 + reply.ms_status = STATUS_NEG;
12170 + clear_bit(SEFL_ALLOW_JOIN,
12172 + set_bit(SEFL_CANCEL, &sev->se_flags);
12176 + if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
12177 + set_bit(SEFL_CHECK, &sev->se_flags);
12178 + wake_serviced(DO_JOINLEAVE);
12183 + /* no r,u,s event, stick with STATUS_POS */
12188 + if (reply.ms_status == STATUS_POS) {
12189 + node = sm_find_joiner(sg, nodeid);
12191 + node = sm_new_node(nodeid);
12192 + list_add_tail(&node->list, &sg->joining);
12197 + smsg_bswap_out(&reply);
12198 + send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12202 + * Another node wants us to stop a service so it can join or leave the SG. We
12203 + * do this by saving the request info in a uevent and having the sm thread do
12204 + * the processing and then replying.
12207 +static void process_stop_request(sm_msg_t *smsg, uint32_t nodeid,
12208 + uint32_t *msgbuf)
12211 + sm_uevent_t *uev;
12213 + int type = smsg->ms_type;
12215 + if (nodeid == sm_our_nodeid)
12218 + sg = sm_global_id_to_sg(smsg->ms_global_sgid);
12220 + log_print("process_stop_request: unknown sg id %x",
12221 + smsg->ms_global_sgid);
12226 + * We shouldn't get here with uevent already set.
12229 + if (test_and_set_bit(SGFL_UEVENT, &sg->flags)) {
12230 + log_error(sg, "process_stop_request: uevent already set");
12234 + uev = &sg->uevent;
12235 + uev->ue_nodeid = nodeid;
12236 + uev->ue_remote_seid = smsg->ms_sevent_id;
12237 + uev->ue_state = (type == SMSG_JSTOP_REQ) ? UEST_JSTOP : UEST_LSTOP;
12239 + if (type == SMSG_JSTOP_REQ)
12240 + uev->ue_num_nodes = be32_to_cpu(*msgbuf);
12242 + set_bit(UEFL_LEAVE, &uev->ue_flags);
12245 + * Do process_join_stop() or process_leave_stop().
12248 + set_bit(UEFL_CHECK, &uev->ue_flags);
12249 + wake_serviced(DO_MEMBERSHIP);
12253 + reply.ms_status = STATUS_POS;
12255 + (type == SMSG_JSTOP_REQ) ? SMSG_JSTOP_REP : SMSG_LSTOP_REP;
12256 + reply.ms_sevent_id = smsg->ms_sevent_id;
12257 + smsg_bswap_out(&reply);
12258 + send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12261 +static void process_start_request(sm_msg_t *smsg, uint32_t nodeid)
12264 + sm_uevent_t *uev;
12265 + int type = smsg->ms_type;
12267 + if (nodeid == sm_our_nodeid)
12270 + sg = sm_global_id_to_sg(smsg->ms_global_sgid);
12272 + log_print("process_start_request: unknown sg id %x",
12273 + smsg->ms_global_sgid);
12277 + if (!test_bit(SGFL_UEVENT, &sg->flags)) {
12278 + log_error(sg, "process_start_request: no uevent");
12282 + uev = &sg->uevent;
12284 + if (type == SMSG_JSTART_CMD)
12285 + uev->ue_state = UEST_JSTART;
12287 + uev->ue_state = UEST_LSTART;
12289 + set_bit(UEFL_CHECK, &uev->ue_flags);
12290 + wake_serviced(DO_MEMBERSHIP);
12293 +static void process_leave_request(sm_msg_t *smsg, uint32_t nodeid)
12298 + sm_sevent_t *sev;
12299 + int found = FALSE;
12301 + sg = sm_global_id_to_sg(smsg->ms_global_sgid);
12303 + if (nodeid == sm_our_nodeid)
12306 + list_for_each_entry(node, &sg->memb, list) {
12307 + if (node->id != nodeid)
12309 + set_bit(SNFL_LEAVING, &node->flags);
12317 + reply.ms_type = SMSG_LEAVE_REP;
12318 + reply.ms_status = STATUS_NEG;
12319 + reply.ms_sevent_id = smsg->ms_sevent_id;
12321 + reply.ms_type = SMSG_LEAVE_REP;
12322 + reply.ms_status = STATUS_POS;
12323 + reply.ms_sevent_id = smsg->ms_sevent_id;
12325 + if (sg->state == SGST_RECOVER)
12326 + reply.ms_status = STATUS_WAIT;
12328 + else if (test_bit(SGFL_SEVENT, &sg->flags) &&
12329 + nodeid != sm_our_nodeid) {
12330 + sev = sg->sevent;
12333 + * We're trying to join or leave at the moment. If
12334 + * we're past JOIN/LEAVE_ACKWAIT, we make the requestor
12335 + * wait. Otherwise, if joining we'll cancel to let the
12336 + * leave happen first, or if we're leaving allow the
12337 + * lower nodeid to leave first.
12340 + if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
12341 + if (sev->se_state > SEST_LEAVE_ACKWAIT)
12342 + reply.ms_status = STATUS_WAIT;
12343 + else if (sm_our_nodeid < nodeid)
12344 + reply.ms_status = STATUS_WAIT;
12346 + reply.ms_status = STATUS_POS;
12347 + clear_bit(SEFL_ALLOW_LEAVE,
12349 + set_bit(SEFL_CANCEL, &sev->se_flags);
12352 + if (sev->se_state > SEST_JOIN_ACKWAIT)
12353 + reply.ms_status = STATUS_WAIT;
12355 + reply.ms_status = STATUS_NEG;
12356 + clear_bit(SEFL_ALLOW_JOIN,
12358 + set_bit(SEFL_CANCEL, &sev->se_flags);
12362 + if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
12363 + set_bit(SEFL_CHECK, &sev->se_flags);
12364 + wake_serviced(DO_JOINLEAVE);
12368 + else if (test_bit(SGFL_UEVENT, &sg->flags)) {
12369 + if (sg->uevent.ue_nodeid != nodeid)
12370 + reply.ms_status = STATUS_WAIT;
12375 + smsg_bswap_out(&reply);
12376 + send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
12380 + * Each remaining node will send us a done message. We quit when we get the
12381 + * first. The subsequent done messages for the finished sevent get here and
12385 +static void process_lstart_done(sm_msg_t *smsg, uint32_t nodeid)
12387 + sm_sevent_t *sev;
12389 + sev = find_sevent(smsg->ms_sevent_id);
12393 + if (sev->se_state != SEST_LSTART_WAITREMOTE)
12396 + sev->se_state = SEST_LSTART_REMOTEDONE;
12397 + set_bit(SEFL_CHECK, &sev->se_flags);
12398 + wake_serviced(DO_JOINLEAVE);
12402 + * This function and everything it calls always runs in sm context.
12405 +static void process_message(char *msg, uint32_t nodeid)
12409 + smsg_copy_in(msg, &smsg);
12411 + switch (smsg.ms_type) {
12412 + case SMSG_JOIN_REQ:
12413 + process_join_request(&smsg, nodeid, msg + sizeof(sm_msg_t));
12416 + case SMSG_JSTOP_REQ:
12417 + process_stop_request(&smsg, nodeid,
12418 + (uint32_t *) (msg + sizeof(sm_msg_t)));
12421 + case SMSG_LEAVE_REQ:
12422 + process_leave_request(&smsg, nodeid);
12425 + case SMSG_LSTOP_REQ:
12426 + process_stop_request(&smsg, nodeid, NULL);
12429 + case SMSG_JSTART_CMD:
12430 + case SMSG_LSTART_CMD:
12431 + process_start_request(&smsg, nodeid);
12434 + case SMSG_LSTART_DONE:
12435 + process_lstart_done(&smsg, nodeid);
12438 + case SMSG_JOIN_REP:
12439 + case SMSG_JSTOP_REP:
12440 + case SMSG_LEAVE_REP:
12441 + case SMSG_LSTOP_REP:
12442 + process_reply(&smsg, nodeid);
12445 + case SMSG_RECOVER:
12446 + process_recover_msg(&smsg, nodeid);
12450 + log_print("process_message: unknown type %u nodeid %u",
12451 + smsg.ms_type, nodeid);
12456 + * Always called from sm context.
12459 +void process_messages(void)
12466 + spin_lock(&message_lock);
12467 + if (!list_empty(&messages)) {
12468 + re = list_entry(messages.next, rq_entry_t, list);
12469 + list_del(&re->list);
12471 + spin_unlock(&message_lock);
12475 + process_message(re->msg, re->nodeid);
12483 + * Context: cnxman and sm
12486 +static int add_to_recvqueue(char *msg, int len, uint32_t nodeid)
12490 + SM_RETRY(re = (rq_entry_t *) kmalloc(sizeof(rq_entry_t), GFP_KERNEL),
12492 + SM_RETRY(re->msg = (char *) kmalloc(len, GFP_KERNEL), re->msg);
12494 + memcpy(re->msg, msg, len);
12496 + re->nodeid = nodeid;
12498 + spin_lock(&message_lock);
12499 + list_add_tail(&re->list, &messages);
12500 + spin_unlock(&message_lock);
12502 + wake_serviced(DO_MESSAGES);
12507 + * Context: cnxman
12508 + * Called by cnxman when a service manager message arrives.
12511 +int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12512 + unsigned int node_id)
12516 + return add_to_recvqueue(msg, len, node_id);
12520 + * These send routines are used by sm and are always called from sm context.
12523 +int send_nodeid_message(char *msg, int len, uint32_t nodeid)
12526 + struct sockaddr_cl saddr;
12528 + if (nodeid == sm_our_nodeid) {
12529 + add_to_recvqueue(msg, len, nodeid);
12533 + saddr.scl_family = AF_CLUSTER;
12534 + saddr.scl_port = CLUSTER_PORT_SERVICES;
12535 + saddr.scl_nodeid = nodeid;
12536 + error = kcl_sendmsg(sm_socket, msg, len, &saddr, sizeof(saddr), 0);
12541 + log_print("send_nodeid_message error %d to %u", error, nodeid);
12546 +int send_broadcast_message(char *msg, int len)
12550 + error = kcl_sendmsg(sm_socket, msg, len, NULL, 0, 0);
12554 + add_to_recvqueue(msg, len, sm_our_nodeid);
12557 + log_print("send_broadcast_message error %d", error);
12562 +int send_members_message(sm_group_t *sg, char *msg, int len)
12567 + list_for_each_entry(node, &sg->memb, list) {
12568 + error = send_nodeid_message(msg, len, node->id);
12575 +int send_members_message_sev(sm_group_t *sg, char *msg, int len,
12576 + sm_sevent_t * sev)
12579 + sm_msg_t *smsg = (sm_msg_t *) msg;
12581 + set_allowed_msgtype(sev, smsg->ms_type);
12582 + sev->se_reply_count = 0;
12584 + error = send_members_message(sg, msg, len);
12586 + clear_allowed_msgtype(sev, smsg->ms_type);
12591 +int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev)
12594 + sm_msg_t *smsg = (sm_msg_t *) msg;
12596 + set_allowed_msgtype(sev, smsg->ms_type);
12597 + sev->se_reply_count = 0;
12599 + error = send_broadcast_message(msg, len);
12601 + clear_allowed_msgtype(sev, smsg->ms_type);
12605 diff -urN linux-orig/cluster/cman/sm_message.h linux-patched/cluster/cman/sm_message.h
12606 --- linux-orig/cluster/cman/sm_message.h 1970-01-01 07:30:00.000000000 +0730
12607 +++ linux-patched/cluster/cman/sm_message.h 2004-11-03 11:37:37.000000000 +0800
12609 +/******************************************************************************
12610 +*******************************************************************************
12612 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12613 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12615 +** This copyrighted material is made available to anyone wishing to use,
12616 +** modify, copy, or redistribute it subject to the terms and conditions
12617 +** of the GNU General Public License v.2.
12619 +*******************************************************************************
12620 +******************************************************************************/
12622 +#ifndef __SM_MESSAGE_DOT_H__
12623 +#define __SM_MESSAGE_DOT_H__
12625 +void init_messages(void);
12626 +uint32_t sm_new_global_id(int level);
12627 +void smsg_bswap_out(sm_msg_t * smsg);
12628 +char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
12629 + sm_sevent_t *sev);
12630 +void process_messages(void);
12631 +int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12632 + unsigned int node_id);
12633 +int send_nodeid_message(char *msg, int len, uint32_t nodeid);
12634 +int send_broadcast_message(char *msg, int len);
12635 +int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev);
12636 +int send_members_message(sm_group_t *sg, char *msg, int len);
12637 +int send_members_message_sev(sm_group_t *sg, char *msg, int len,
12638 + sm_sevent_t * sev);
12639 +int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
12640 + unsigned int node_id);
12643 diff -urN linux-orig/cluster/cman/sm_misc.c linux-patched/cluster/cman/sm_misc.c
12644 --- linux-orig/cluster/cman/sm_misc.c 1970-01-01 07:30:00.000000000 +0730
12645 +++ linux-patched/cluster/cman/sm_misc.c 2004-11-03 11:37:37.000000000 +0800
12647 +/******************************************************************************
12648 +*******************************************************************************
12650 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12651 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12653 +** This copyrighted material is made available to anyone wishing to use,
12654 +** modify, copy, or redistribute it subject to the terms and conditions
12655 +** of the GNU General Public License v.2.
12657 +*******************************************************************************
12658 +******************************************************************************/
12661 +#include "config.h"
12662 +#include <linux/seq_file.h>
12664 +#define MAX_DEBUG_MSG_LEN (40)
12666 +extern struct list_head sm_members;
12667 +static uint32_t local_ids;
12668 +static uint32_t event_id;
12669 +static spinlock_t event_id_lock;
12670 +static char * debug_buf;
12671 +static unsigned int debug_size;
12672 +static unsigned int debug_point;
12673 +static int debug_wrap;
12674 +static spinlock_t debug_lock;
12677 +void init_sm_misc(void)
12681 + spin_lock_init(&event_id_lock);
12682 + debug_buf = NULL;
12686 + spin_lock_init(&debug_lock);
12688 + sm_debug_setup(cman_config.sm_debug_size);
12691 +sm_node_t *sm_new_node(uint32_t nodeid)
12693 + struct kcl_cluster_node kclnode;
12697 + error = kcl_get_node_by_nodeid(nodeid, &kclnode);
12698 + SM_ASSERT(!error,);
12700 + SM_RETRY(node = (sm_node_t *) kmalloc(sizeof(sm_node_t), GFP_KERNEL),
12703 + memset(node, 0, sizeof(sm_node_t));
12704 + node->id = nodeid;
12705 + node->incarnation = kclnode.incarnation;
12709 +sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid)
12713 + list_for_each_entry(node, &sg->joining, list) {
12714 + if (node->id == nodeid)
12720 +sm_node_t *sm_find_member(uint32_t nodeid)
12724 + list_for_each_entry(node, &sm_members, list) {
12725 + if (node->id == nodeid)
12731 +uint32_t sm_new_local_id(int level)
12733 + uint32_t id = local_ids++;
12734 + uint8_t l = (uint8_t) level;
12736 + if (level > 0xFF)
12739 + if (id > 0x00FFFFFF)
12746 +int sm_id_to_level(uint32_t id)
12748 + uint8_t l = (id & 0xFF000000) >> 24;
12753 +void sm_set_event_id(int *id)
12755 + spin_lock(&event_id_lock);
12756 + *id = event_id++;
12757 + spin_unlock(&event_id_lock);
12760 +sm_group_t *sm_local_id_to_sg(int id)
12763 + int level = sm_id_to_level(id);
12764 + int found = FALSE;
12766 + down(&sm_sglock);
12768 + list_for_each_entry(sg, &sm_sg[level], list) {
12769 + if (sg->local_id == id) {
12780 +sm_group_t *sm_global_id_to_sg(int id)
12783 + int level = sm_id_to_level(id);
12784 + int found = FALSE;
12786 + down(&sm_sglock);
12788 + list_for_each_entry(sg, &sm_sg[level], list) {
12789 + if (sg->global_id == id) {
12800 +void sm_debug_log(sm_group_t *sg, const char *fmt, ...)
12803 + int i, n, size, len;
12804 + char buf[MAX_DEBUG_MSG_LEN+1];
12806 + spin_lock(&debug_lock);
12811 + size = MAX_DEBUG_MSG_LEN;
12812 + memset(buf, 0, size+1);
12814 + n = snprintf(buf, size, "%08x ", sg->global_id);
12817 + va_start(va, fmt);
12818 + vsnprintf(buf+n, size, fmt, va);
12821 + len = strlen(buf);
12822 + if (len > MAX_DEBUG_MSG_LEN-1)
12823 + len = MAX_DEBUG_MSG_LEN-1;
12825 + buf[len+1] = '\0';
12827 + for (i = 0; i < strlen(buf); i++) {
12828 + debug_buf[debug_point++] = buf[i];
12830 + if (debug_point == debug_size) {
12836 + spin_unlock(&debug_lock);
12839 +void sm_debug_setup(int size)
12841 + char *b = kmalloc(size, GFP_KERNEL);
12843 + spin_lock(&debug_lock);
12845 + kfree(debug_buf);
12847 + if (size > PAGE_SIZE)
12848 + size = PAGE_SIZE;
12849 + debug_size = size;
12853 + memset(debug_buf, 0, debug_size);
12854 + spin_unlock(&debug_lock);
12857 +#ifdef CONFIG_PROC_FS
12858 +static struct seq_operations sm_info_op;
12860 +struct sm_seq_info
12867 +int sm_debug_info(char *b, char **start, off_t offset, int length)
12871 + spin_lock(&debug_lock);
12873 + if (debug_wrap) {
12874 + for (i = debug_point; i < debug_size; i++)
12875 + n += sprintf(b + n, "%c", debug_buf[i]);
12877 + for (i = 0; i < debug_point; i++)
12878 + n += sprintf(b + n, "%c", debug_buf[i]);
12880 + spin_unlock(&debug_lock);
12887 +static sm_group_t *sm_walk(loff_t offset, int *rlevel)
12893 + down(&sm_sglock);
12895 + for (level = 0; level < SG_LEVELS; level++) {
12896 + list_for_each_entry(sg, &sm_sg[level], list) {
12897 + if (++n == offset)
12898 + goto walk_finish;
12911 +static void *sm_seq_start(struct seq_file *m, loff_t * pos)
12913 + struct sm_seq_info *ssi =
12914 + kmalloc(sizeof (struct sm_seq_info), GFP_KERNEL);
12923 + /* Print the header */
12926 + "Service Name GID LID State Code\n");
12931 +static void *sm_seq_next(struct seq_file *m, void *p, loff_t * pos)
12933 + struct sm_seq_info *ssi = p;
12935 + *pos = ++ssi->pos;
12937 + if ( !(ssi->sg = sm_walk(ssi->pos, &ssi->level)) )
12943 +/* Called from /proc when /proc/cluster/services is opened */
12944 +int sm_proc_open(struct inode *inode, struct file *file)
12946 + return seq_open(file, &sm_info_op);
12949 +static int sm_seq_show(struct seq_file *s, void *p)
12951 + struct sm_seq_info *ssi = p;
12955 + if (!ssi || !ssi->sg)
12959 + * Cluster Service
12962 + switch (ssi->level) {
12963 + case SERVICE_LEVEL_FENCE:
12964 + seq_printf(s, "Fence Domain: ");
12966 + case SERVICE_LEVEL_GDLM:
12967 + seq_printf(s, "DLM Lock Space: ");
12969 + case SERVICE_LEVEL_GFS:
12970 + seq_printf(s, "GFS Mount Group: ");
12972 + case SERVICE_LEVEL_USER:
12973 + seq_printf(s, "User: ");
12981 + seq_printf(s, "\"");
12982 + for (i = 0; i < ssi->sg->namelen; i++)
12983 + seq_printf(s, "%c", ssi->sg->name[i]);
12984 + seq_printf(s, "\"");
12986 + for (; i < MAX_SERVICE_NAME_LEN-1; i++)
12987 + seq_printf(s, " ");
12990 + * GID LID (sans level from top byte)
12993 + seq_printf(s, "%3u %3u ",
12994 + (ssi->sg->global_id & 0x00FFFFFF),
12995 + (ssi->sg->local_id & 0x00FFFFFF));
13001 + switch (ssi->sg->state) {
13003 + seq_printf(s, "none ");
13006 + seq_printf(s, "join ");
13009 + seq_printf(s, "run ");
13011 + case SGST_RECOVER:
13012 + seq_printf(s, "recover %u ",
13013 + ssi->sg->recover_state);
13015 + case SGST_UEVENT:
13016 + seq_printf(s, "update ");
13024 + if (test_bit(SGFL_SEVENT, &ssi->sg->flags))
13025 + seq_printf(s, "S");
13026 + if (test_bit(SGFL_UEVENT, &ssi->sg->flags))
13027 + seq_printf(s, "U");
13028 + if (test_bit(SGFL_NEED_RECOVERY, &ssi->sg->flags))
13029 + seq_printf(s, "N");
13031 + seq_printf(s, "-");
13033 + if (test_bit(SGFL_SEVENT, &ssi->sg->flags)
13034 + && ssi->sg->sevent) {
13035 + seq_printf(s, "%u,%lx,%u",
13036 + ssi->sg->sevent->se_state,
13037 + ssi->sg->sevent->se_flags,
13038 + ssi->sg->sevent->se_reply_count);
13041 + if (test_bit(SGFL_UEVENT, &ssi->sg->flags)) {
13042 + seq_printf(s, "%u,%lx,%u",
13043 + ssi->sg->uevent.ue_state,
13044 + ssi->sg->uevent.ue_flags,
13045 + ssi->sg->uevent.ue_nodeid);
13048 + seq_printf(s, "\n");
13056 + seq_printf(s, "[");
13058 + list_for_each_entry(node, &ssi->sg->memb, list) {
13059 + if (i && !(i % 24))
13060 + seq_printf(s, "\n");
13063 + seq_printf(s, " ");
13065 + seq_printf(s, "%u", node->id);
13069 + seq_printf(s, "]\n\n");
13074 +static void sm_seq_stop(struct seq_file *m, void *p)
13080 +static struct seq_operations sm_info_op = {
13081 + .start = sm_seq_start,
13082 + .next = sm_seq_next,
13083 + .stop = sm_seq_stop,
13084 + .show = sm_seq_show
13089 diff -urN linux-orig/cluster/cman/sm_misc.h linux-patched/cluster/cman/sm_misc.h
13090 --- linux-orig/cluster/cman/sm_misc.h 1970-01-01 07:30:00.000000000 +0730
13091 +++ linux-patched/cluster/cman/sm_misc.h 2004-11-03 11:37:37.000000000 +0800
13093 +/******************************************************************************
13094 +*******************************************************************************
13096 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13097 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13099 +** This copyrighted material is made available to anyone wishing to use,
13100 +** modify, copy, or redistribute it subject to the terms and conditions
13101 +** of the GNU General Public License v.2.
13103 +*******************************************************************************
13104 +******************************************************************************/
13106 +#ifndef __SM_MISC_DOT_H__
13107 +#define __SM_MISC_DOT_H__
13109 +void init_sm_misc(void);
13110 +sm_node_t *sm_new_node(uint32_t nodeid);
13111 +sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid);
13112 +sm_node_t *sm_find_member(uint32_t nodeid);
13113 +uint32_t sm_new_local_id(int level);
13114 +int sm_id_to_level(uint32_t id);
13115 +void sm_set_event_id(int *id);
13116 +sm_group_t *sm_local_id_to_sg(int id);
13117 +sm_group_t *sm_global_id_to_sg(int id);
13118 +void sm_debug_log(sm_group_t *sg, const char *fmt, ...);
13119 +void sm_debug_setup(int size);
13122 diff -urN linux-orig/cluster/cman/sm_recover.c linux-patched/cluster/cman/sm_recover.c
13123 --- linux-orig/cluster/cman/sm_recover.c 1970-01-01 07:30:00.000000000 +0730
13124 +++ linux-patched/cluster/cman/sm_recover.c 2004-11-03 11:37:37.000000000 +0800
13126 +/******************************************************************************
13127 +*******************************************************************************
13129 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13130 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13132 +** This copyrighted material is made available to anyone wishing to use,
13133 +** modify, copy, or redistribute it subject to the terms and conditions
13134 +** of the GNU General Public License v.2.
13136 +*******************************************************************************
13137 +******************************************************************************/
13140 +#include "config.h"
13143 + * A collection of sg's which need to be recovered due to a failed member.
13144 + * These sg's are recovered in order of level. An sg subject to cascading
13145 + * failures is moved from one of these structs to a newer one.
13149 + struct list_head list; /* list of current re's */
13150 + struct list_head sgs[SG_LEVELS]; /* lists of sg's by level */
13151 + int event_id; /* event id */
13154 +typedef struct recover recover_t;
13157 +extern uint32_t * sm_new_nodeids;
13158 +extern int sm_quorum, sm_quorum_next;
13159 +extern uint32_t sm_our_nodeid;
13160 +extern struct list_head sm_members;
13161 +extern int sm_member_count;
13162 +static struct list_head recoveries;
13165 +void init_recovery(void)
13167 + INIT_LIST_HEAD(&recoveries);
13171 + * This is the first thing called when a change is announced in cluster
13172 + * membership. Nodes are marked as being a CLUSTER_MEMBER or not. SM adds new
13173 + * nodes to its sm_members list which it's not seen before. Nodes which were
13174 + * alive but are now gone are marked as "need recovery".
13176 + * The "need recovery" status of nodes is propagated to the node's SG's in
13177 + * mark_effected_sgs. The effected SG's are themselves marked as needing
13178 + * recovery and in new_recovery the dead nodes are removed from the SG's
13179 + * individual member lists. The "need recovery" status of nodes is cleared in
13180 + * adjust_members_done().
13183 +static int adjust_members(void)
13186 + struct kcl_cluster_node knode;
13187 + int i, error, num_nodes, sub = 0, add = 0, found;
13190 + * Get list of current members from cnxman
13193 + memset(sm_new_nodeids, 0, cman_config.max_nodes * sizeof(uint32_t));
13194 + num_nodes = kcl_get_member_ids(sm_new_nodeids, cman_config.max_nodes);
13197 + * Determine who's gone
13200 + list_for_each_entry(node, &sm_members, list) {
13202 + for (i = 0; i < num_nodes; i++) {
13203 + if (node->id == sm_new_nodeids[i]) {
13205 + sm_new_nodeids[i] = 0;
13211 + error = kcl_get_node_by_nodeid(node->id, &knode);
13212 + SM_ASSERT(!error, printk("error=%d\n", error););
13214 + if (!test_bit(SNFL_CLUSTER_MEMBER, &node->flags)) {
13215 + /* former member is back */
13216 + set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
13217 + node->incarnation = knode.incarnation;
13220 + /* current member is still alive - if the
13221 + * incarnation number is different it died and
13222 + * returned between checks */
13223 + if (node->incarnation != knode.incarnation) {
13224 + set_bit(SNFL_NEED_RECOVERY,
13226 + node->incarnation = knode.incarnation;
13231 + /* current member has died */
13232 + if (test_and_clear_bit(SNFL_CLUSTER_MEMBER,
13234 + set_bit(SNFL_NEED_RECOVERY, &node->flags);
13241 + * Look for new nodes
13244 + for (i = 0; i < num_nodes; i++) {
13245 + if (sm_new_nodeids[i]) {
13246 + node = sm_new_node(sm_new_nodeids[i]);
13247 + set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
13249 + list_add_tail(&node->list, &sm_members);
13250 + sm_member_count++;
13255 + * Get our own nodeid
13258 + if (!sm_our_nodeid) {
13259 + list_for_each_entry(node, &sm_members, list) {
13260 + error = kcl_get_node_by_nodeid(node->id, &knode);
13261 + SM_ASSERT(!error, printk("error=%d\n", error););
13264 + sm_our_nodeid = knode.node_id;
13274 + * Given some number of dead nodes, flag SG's the dead nodes were part of.
13275 + * This requires a number of loops because each node structure does not keep a
13276 + * list of SG's it's in.
13279 +static int mark_effected_sgs(void)
13282 + sm_node_t *node, *sgnode;
13283 + uint32_t dead_id;
13284 + int i, effected = 0;
13286 + down(&sm_sglock);
13288 + list_for_each_entry(node, &sm_members, list) {
13289 + if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
13292 + dead_id = node->id;
13294 + for (i = 0; i < SG_LEVELS; i++) {
13295 + list_for_each_entry(sg, &sm_sg[i], list) {
13296 + /* check if dead node is among sg's members */
13297 + list_for_each_entry(sgnode, &sg->memb, list) {
13298 + if (sgnode->id == dead_id) {
13299 + set_bit(SGFL_NEED_RECOVERY,
13313 +static recover_t *alloc_recover(void)
13318 + SM_RETRY(rev = kmalloc(sizeof(recover_t), GFP_KERNEL), rev);
13320 + memset(rev, 0, sizeof(recover_t));
13322 + sm_set_event_id(&rev->event_id);
13324 + for (i = 0; i < SG_LEVELS; i++) {
13325 + INIT_LIST_HEAD(&rev->sgs[i]);
13332 + * An in-progress revent re-start for an SG is interrupted by another node
13333 + * failure in the SG. Cancel an outstanding barrier if there is one. The SG
13334 + * will be moved to the new revent and re-started as part of that.
13337 +static void cancel_prev_recovery(sm_group_t *sg)
13341 + if (sg->recover_state == RECOVER_BARRIERWAIT) {
13342 + error = kcl_barrier_cancel(sg->recover_barrier);
13344 + log_error(sg, "cancel_prev_recovery: error %d", error);
13348 +static void pre_recover_sg(sm_group_t *sg, recover_t *rev)
13350 + if (sg->state == SGST_RECOVER) {
13351 + cancel_prev_recovery(sg);
13352 + list_del(&sg->recover_list);
13355 + sg->ops->stop(sg->service_data);
13356 + sg->state = SGST_RECOVER;
13357 + sg->recover_state = RECOVER_NONE;
13358 + sg->recover_data = rev;
13359 + list_add(&sg->recover_list, &rev->sgs[sg->level]);
13363 + * When adjust_members finds that some nodes are dead and mark_effected_sgs
13364 + * finds that some SG's are effected by departed nodes, this is called to
13365 + * collect together the SG's which need to be recovered. An revent (recovery
13366 + * event) is the group of effected SG's.
13369 +static int new_recovery(void)
13373 + sm_node_t *node, *sgnode, *safe;
13376 + rev = alloc_recover();
13377 + list_add_tail(&rev->list, &recoveries);
13379 + down(&sm_sglock);
13382 + * Stop effected SG's and add them to the rev
13385 + for (i = 0; i < SG_LEVELS; i++) {
13386 + list_for_each_entry(sg, &sm_sg[i], list) {
13387 + if (test_and_clear_bit(SGFL_NEED_RECOVERY, &sg->flags)){
13388 + if (sg->state == SGST_JOIN)
13390 + pre_recover_sg(sg, rev);
13396 + * For an SG needing recovery, remove dead nodes from sg->memb list
13399 + for (i = 0; i < SG_LEVELS; i++) {
13400 + list_for_each_entry(sg, &rev->sgs[i], recover_list) {
13402 + /* Remove dead members from SG's member list */
13403 + list_for_each_entry_safe(sgnode, safe, &sg->memb, list){
13405 + node = sm_find_member(sgnode->id);
13406 + SM_ASSERT(node, printk("id %u\n", sgnode->id););
13408 + if (test_bit(SNFL_NEED_RECOVERY, &node->flags)){
13409 + list_del(&sgnode->list);
13411 + sg->memb_count--;
13412 + log_debug(sg, "remove node %u count %d",
13413 + sgnode->id, sg->memb_count);
13420 + rev->cur_level = 0;
13425 + * The NEED_RECOVERY bit on MML nodes is set in adjust_members() and is used in
13426 + * mark_effected_sgs() and add_revent(). After that, we're done using the bit
13427 + * and we clear it here.
13430 +static void adjust_members_done(void)
13434 + list_for_each_entry(node, &sm_members, list)
13435 + clear_bit(SNFL_NEED_RECOVERY, &node->flags);
13439 + * Start the service of the given SG. The service must be given an array of
13440 + * nodeids specifying the new sg membership. The service is responsible to
13441 + * free this chunk of memory when done with it.
13444 +static void start_sg(sm_group_t *sg, uint32_t event_id)
13450 + SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
13453 + list_for_each_entry(node, &sg->memb, list)
13454 + memb[count++] = node->id;
13456 + sg->ops->start(sg->service_data, memb, count, event_id,
13457 + SERVICE_NODE_FAILED);
13460 +static void recovery_barrier(sm_group_t *sg)
13462 + char bname[MAX_BARRIER_NAME_LEN];
13465 + memset(bname, 0, MAX_BARRIER_NAME_LEN);
13467 + /* bypass the barrier if we're the only member */
13468 + if (sg->memb_count == 1) {
13469 + process_recovery_barrier(sg, 0);
13473 + len = snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.RECOV.%u",
13474 + sg->global_id, sg->recover_stop, sg->memb_count);
13476 + /* We save this barrier name so we can cancel it if needed. */
13477 + memset(sg->recover_barrier, 0, MAX_BARRIER_NAME_LEN);
13478 + memcpy(sg->recover_barrier, bname, len);
13480 + error = sm_barrier(bname, sg->memb_count, SM_BARRIER_RECOVERY);
13482 + log_error(sg, "recovery_barrier error %d: %s", error, bname);
13485 +static void recover_sg(sm_group_t *sg, int event_id)
13487 + log_debug(sg, "recover state %d", sg->recover_state);
13489 + switch (sg->recover_state) {
13491 + case RECOVER_NONE:
13492 + /* must wait for recovery to stop sg on all nodes */
13493 + sg->recover_state = RECOVER_BARRIERWAIT;
13494 + sg->recover_stop = 0;
13495 + recovery_barrier(sg);
13498 + case RECOVER_BARRIERWAIT:
13501 + case RECOVER_STOP:
13502 + /* barrier callback sets state STOP */
13503 + sg->recover_stop = 1;
13504 + sg->recover_state = RECOVER_START;
13505 + start_sg(sg, event_id);
13508 + case RECOVER_START:
13511 + case RECOVER_STARTDONE:
13512 + /* service callback sets state STARTDONE */
13513 + sg->recover_state = RECOVER_BARRIERWAIT;
13514 + recovery_barrier(sg);
13517 + case RECOVER_BARRIERDONE:
13518 + /* barrier callback sets state BARRIERDONE */
13519 + sg->ops->finish(sg->service_data, event_id);
13520 + list_del(&sg->recover_list);
13521 + sg->recover_state = RECOVER_NONE;
13522 + sg->state = SGST_RUN;
13524 + /* Continue a previous, interrupted attempt to leave the sg */
13525 + if (sg->sevent) {
13526 + sm_sevent_t *sev = sg->sevent;
13527 + log_debug(sg, "restart leave %lx", sev->se_flags);
13528 + clear_bit(SEFL_DELAY_RECOVERY, &sev->se_flags);
13529 + set_bit(SEFL_CHECK, &sev->se_flags);
13530 + wake_serviced(DO_JOINLEAVE);
13535 + log_error(sg, "invalid recover_state %u", sg->recover_state);
13539 +static void recover_level(recover_t *rev, int level)
13541 + sm_group_t *sg, *safe;
13543 + list_for_each_entry_safe(sg, safe, &rev->sgs[level], recover_list)
13544 + recover_sg(sg, rev->event_id);
13547 +static void recover_levels(recover_t *rev)
13550 + recover_level(rev, rev->cur_level);
13552 + if (list_empty(&rev->sgs[rev->cur_level])) {
13553 + if (rev->cur_level == SG_LEVELS - 1) {
13554 + list_del(&rev->list);
13558 + rev->cur_level++;
13566 + * Called by SM thread when the cluster is quorate. It restarts
13567 + * SG's that were stopped in new_recovery() due to a member death.
13568 + * It waits for all SG's at level N to complete restart before
13569 + * restarting SG's at level N+1.
13572 +void process_recoveries(void)
13574 + recover_t *rev, *safe;
13576 + down(&sm_sglock);
13577 + list_for_each_entry_safe(rev, safe, &recoveries, list)
13578 + recover_levels(rev);
13583 + * The cnxman membership has changed. Check if there's still quorum and
13584 + * whether any nodes have died. If nodes have died, initiate recovery on any
13585 + * SG's they were in. This begins immediately if the cluster remains quorate;
13586 + * if not this waits until the cluster regains quorum.
13589 +void process_nodechange(void)
13591 + int gone, effected;
13593 + if ((sm_quorum = sm_quorum_next))
13594 + wake_serviced(DO_RUN);
13596 + gone = adjust_members();
13598 + effected = mark_effected_sgs();
13600 + backout_sevents();
13601 + cancel_uevents(&effected);
13603 + if (effected > 0) {
13605 + wake_serviced(DO_RECOVERIES);
13608 + adjust_members_done();
13611 +int check_recovery(sm_group_t *sg, int event_id)
13613 + if (sg->state == SGST_RECOVER) {
13614 + recover_t *rev = (recover_t *) sg->recover_data;
13615 + if (rev && rev->event_id == event_id)
13621 +void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid)
13626 + sg = sm_global_id_to_sg(smsg->ms_global_sgid);
13628 + log_print("process_recover_msg: unknown sg id %x",
13629 + smsg->ms_global_sgid);
13633 + /* we already know about the recovery and can ignore the msg */
13634 + if (sg->state == SGST_RECOVER)
13637 + if (test_bit(SGFL_UEVENT, &sg->flags)) {
13638 + /* we will initiate recovery on our own if we know about the
13639 + uevent so we can ignore this */
13640 + log_debug(sg, "process_recover_msg: ignore from %u", nodeid);
13644 + log_debug(sg, "recovery initiated by msg from %u", nodeid);
13645 + rev = alloc_recover();
13646 + list_add_tail(&rev->list, &recoveries);
13647 + pre_recover_sg(sg, rev);
13648 + wake_serviced(DO_RECOVERIES);
13650 diff -urN linux-orig/cluster/cman/sm_recover.h linux-patched/cluster/cman/sm_recover.h
13651 --- linux-orig/cluster/cman/sm_recover.h 1970-01-01 07:30:00.000000000 +0730
13652 +++ linux-patched/cluster/cman/sm_recover.h 2004-11-03 11:37:37.000000000 +0800
13654 +/******************************************************************************
13655 +*******************************************************************************
13657 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13658 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13660 +** This copyrighted material is made available to anyone wishing to use,
13661 +** modify, copy, or redistribute it subject to the terms and conditions
13662 +** of the GNU General Public License v.2.
13664 +*******************************************************************************
13665 +******************************************************************************/
13667 +#ifndef __SM_RECOVER_DOT_H__
13668 +#define __SM_RECOVER_DOT_H__
13670 +void init_recovery(void);
13671 +void process_recoveries(void);
13672 +void process_nodechange(void);
13673 +int check_recovery(sm_group_t *sg, int event_id);
13674 +void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid);
13677 diff -urN linux-orig/cluster/cman/sm_services.c linux-patched/cluster/cman/sm_services.c
13678 --- linux-orig/cluster/cman/sm_services.c 1970-01-01 07:30:00.000000000 +0730
13679 +++ linux-patched/cluster/cman/sm_services.c 2004-11-03 11:37:37.000000000 +0800
13681 +/******************************************************************************
13682 +*******************************************************************************
13684 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13685 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13687 +** This copyrighted material is made available to anyone wishing to use,
13688 +** modify, copy, or redistribute it subject to the terms and conditions
13689 +** of the GNU General Public License v.2.
13691 +*******************************************************************************
13692 +******************************************************************************/
13696 +static struct list_head callbacks;
13697 +static spinlock_t callback_lock;
13698 +static struct list_head sg_registered[SG_LEVELS];
13701 + * These are the functions to register, join, leave, unregister, callback
13702 + * with/to the sm.
13706 + struct list_head list;
13707 + uint32_t local_id;
13710 +typedef struct sc_entry sc_entry_t;
13712 +void init_services(void)
13716 + INIT_LIST_HEAD(&callbacks);
13717 + spin_lock_init(&callback_lock);
13719 + for (i = 0; i < SG_LEVELS; i++) {
13720 + INIT_LIST_HEAD(&sm_sg[i]);
13721 + INIT_LIST_HEAD(&sg_registered[i]);
13723 + init_MUTEX(&sm_sglock);
13726 +/* Context: service */
13728 +int kcl_register_service(char *name, int namelen, int level,
13729 + struct kcl_service_ops *ops, int unique,
13730 + void *servicedata, uint32_t *service_id)
13733 + int found = FALSE;
13734 + int error = -EINVAL;
13736 + if (level > SG_LEVELS - 1)
13739 + if (namelen > MAX_SERVICE_NAME_LEN)
13742 + error = kcl_addref_cluster();
13746 + down(&sm_sglock);
13748 + list_for_each_entry(sg, &sm_sg[level], list) {
13749 + if ((sg->namelen == namelen) &&
13750 + (!strncmp(sg->name, name, namelen))) {
13756 + list_for_each_entry(sg, &sg_registered[level], list) {
13757 + if ((sg->namelen == namelen) &&
13758 + (!strncmp(sg->name, name, namelen))) {
13766 + if (found && unique) {
13768 + goto fail_unlock;
13776 + sg = (sm_group_t *) kmalloc(sizeof(sm_group_t) + namelen, GFP_KERNEL);
13779 + goto fail_unlock;
13781 + memset(sg, 0, sizeof(sm_group_t) + namelen);
13783 + sg->refcount = 1;
13784 + sg->service_data = servicedata;
13786 + sg->level = level;
13787 + sg->namelen = namelen;
13788 + memcpy(sg->name, name, namelen);
13789 + sg->local_id = sm_new_local_id(level);
13790 + sg->state = SGST_NONE;
13791 + INIT_LIST_HEAD(&sg->memb);
13792 + INIT_LIST_HEAD(&sg->joining);
13793 + init_completion(&sg->event_comp);
13795 + list_add_tail(&sg->list, &sg_registered[level]);
13798 + *service_id = sg->local_id;
13804 + kcl_releaseref_cluster();
13809 +/* Context: service */
13811 +void kcl_unregister_service(uint32_t local_id)
13814 + int level = sm_id_to_level(local_id);
13816 + down(&sm_sglock);
13818 + list_for_each_entry(sg, &sg_registered[level], list) {
13819 + if (sg->local_id == local_id) {
13820 + SM_ASSERT(sg->refcount,);
13823 + if (!sg->refcount) {
13824 + list_del(&sg->list);
13827 + kcl_releaseref_cluster();
13834 +/* Context: service */
13836 +int kcl_join_service(uint32_t local_id)
13839 + sm_sevent_t *sev;
13840 + int level = sm_id_to_level(local_id);
13841 + int error, found = FALSE;
13843 + down(&sm_sglock);
13845 + list_for_each_entry(sg, &sg_registered[level], list) {
13846 + if (sg->local_id == local_id) {
13858 + if (sg->state != SGST_NONE) {
13864 + sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
13871 + memset(sev, 0, sizeof (sm_sevent_t));
13872 + sev->se_state = SEST_JOIN_BEGIN;
13873 + sm_set_event_id(&sev->se_id);
13875 + sg->sevent = sev;
13876 + sg->state = SGST_JOIN;
13877 + set_bit(SGFL_SEVENT, &sg->flags);
13878 + list_del(&sg->list);
13879 + list_add_tail(&sg->list, &sm_sg[sg->level]);
13884 + * The join is a service event which will be processed asynchronously.
13887 + new_joinleave(sev);
13888 + wait_for_completion(&sg->event_comp);
13895 +/* Context: service */
13897 +int kcl_leave_service(uint32_t local_id)
13899 + sm_group_t *sg = NULL;
13900 + sm_sevent_t *sev;
13904 + sg = sm_local_id_to_sg(local_id);
13908 + /* sg was never joined */
13910 + if (sg->state == SGST_NONE)
13913 + down(&sm_sglock);
13915 + /* may still be joining */
13916 + if (test_and_set_bit(SGFL_SEVENT, &sg->flags)) {
13922 + sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
13929 + memset(sev, 0, sizeof (sm_sevent_t));
13930 + sev->se_state = SEST_LEAVE_BEGIN;
13931 + sm_set_event_id(&sev->se_id);
13932 + set_bit(SEFL_LEAVE, &sev->se_flags);
13934 + sg->sevent = sev;
13938 + new_joinleave(sev);
13939 + wait_for_completion(&sg->event_comp);
13942 + down(&sm_sglock);
13943 + list_del(&sg->list);
13944 + list_add_tail(&sg->list, &sg_registered[sg->level]);
13951 +static void process_callback(uint32_t local_id, int event_id)
13954 + sm_sevent_t *sev;
13955 + sm_uevent_t *uev;
13957 + sg = sm_local_id_to_sg(local_id);
13961 + if (sg->state == SGST_RECOVER) {
13962 + if (!check_recovery(sg, event_id)) {
13963 + log_error(sg, "process_callback invalid recover "
13964 + "event id %d", event_id);
13968 + if (sg->recover_state == RECOVER_START)
13969 + sg->recover_state = RECOVER_STARTDONE;
13971 + log_error(sg, "process_callback recover state %u",
13972 + sg->recover_state);
13973 + wake_serviced(DO_RECOVERIES);
13976 + else if (test_bit(SGFL_SEVENT, &sg->flags) && sg->sevent &&
13977 + (sg->sevent->se_id == event_id)) {
13978 + sev = sg->sevent;
13980 + if (test_and_clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags) &&
13981 + (sev->se_state == SEST_JSTART_SERVICEWAIT))
13982 + sev->se_state = SEST_JSTART_SERVICEDONE;
13984 + set_bit(SEFL_CHECK, &sev->se_flags);
13985 + wake_serviced(DO_JOINLEAVE);
13988 + else if (test_bit(SGFL_UEVENT, &sg->flags) &&
13989 + (sg->uevent.ue_id == event_id)) {
13990 + uev = &sg->uevent;
13992 + if (test_and_clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags)) {
13993 + if (uev->ue_state == UEST_JSTART_SERVICEWAIT)
13994 + uev->ue_state = UEST_JSTART_SERVICEDONE;
13995 + else if (uev->ue_state == UEST_LSTART_SERVICEWAIT)
13996 + uev->ue_state = UEST_LSTART_SERVICEDONE;
13998 + set_bit(UEFL_CHECK, &uev->ue_flags);
13999 + wake_serviced(DO_MEMBERSHIP);
14003 + log_error(sg, "ignoring service callback id=%x event=%u",
14004 + local_id, event_id);
14007 +void process_callbacks(void)
14014 + spin_lock(&callback_lock);
14015 + if (!list_empty(&callbacks)) {
14016 + se = list_entry(callbacks.next, sc_entry_t, list);
14017 + list_del(&se->list);
14019 + spin_unlock(&callback_lock);
14023 + process_callback(se->local_id, se->event_id);
14029 +/* Context: service */
14031 +void kcl_start_done(uint32_t local_id, int event_id)
14035 + SM_RETRY(se = kmalloc(sizeof(sc_entry_t), GFP_KERNEL), se);
14037 + se->local_id = local_id;
14038 + se->event_id = event_id;
14040 + spin_lock(&callback_lock);
14041 + list_add_tail(&se->list, &callbacks);
14042 + spin_unlock(&callback_lock);
14044 + wake_serviced(DO_CALLBACKS);
14047 +/* Context: service */
14049 +void kcl_global_service_id(uint32_t local_id, uint32_t *global_id)
14051 + sm_group_t *sg = sm_local_id_to_sg(local_id);
14054 + log_print("kcl_global_service_id: can't find %x", local_id);
14056 + *global_id = sg->global_id;
14059 +static void copy_to_service(sm_group_t *sg, struct kcl_service *s)
14061 + s->level = sg->level;
14062 + s->local_id = sg->local_id;
14063 + s->global_id = sg->global_id;
14064 + s->node_count = sg->memb_count;
14065 + strcpy(s->name, sg->name);
14068 +int kcl_get_services(struct list_head *head, int level)
14071 + struct kcl_service *s;
14072 + int error = -ENOMEM, count = 0;
14074 + down(&sm_sglock);
14076 + list_for_each_entry(sg, &sg_registered[level], list) {
14078 + s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
14081 + copy_to_service(sg, s);
14082 + list_add(&s->list, head);
14087 + list_for_each_entry(sg, &sm_sg[level], list) {
14089 + s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
14092 + copy_to_service(sg, s);
14093 + list_add(&s->list, head);
14104 +/* These three global variables are listed in extern form in sm.h. */
14105 +struct list_head sm_sg[SG_LEVELS];
14106 +struct semaphore sm_sglock;
14107 diff -urN linux-orig/cluster/cman/sm_services.h linux-patched/cluster/cman/sm_services.h
14108 --- linux-orig/cluster/cman/sm_services.h 1970-01-01 07:30:00.000000000 +0730
14109 +++ linux-patched/cluster/cman/sm_services.h 2004-11-03 11:37:37.000000000 +0800
14111 +/******************************************************************************
14112 +*******************************************************************************
14114 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14115 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14117 +** This copyrighted material is made available to anyone wishing to use,
14118 +** modify, copy, or redistribute it subject to the terms and conditions
14119 +** of the GNU General Public License v.2.
14121 +*******************************************************************************
14122 +******************************************************************************/
14124 +#ifndef __SM_SERVICES_DOT_H__
14125 +#define __SM_SERVICES_DOT_H__
14127 +void init_services(void);
14128 +void process_callbacks(void);
14131 diff -urN linux-orig/cluster/cman/sm_user.c linux-patched/cluster/cman/sm_user.c
14132 --- linux-orig/cluster/cman/sm_user.c 1970-01-01 07:30:00.000000000 +0730
14133 +++ linux-patched/cluster/cman/sm_user.c 2004-11-03 11:37:37.000000000 +0800
14135 +/******************************************************************************
14136 +*******************************************************************************
14138 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14139 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14141 +** This copyrighted material is made available to anyone wishing to use,
14142 +** modify, copy, or redistribute it subject to the terms and conditions
14143 +** of the GNU General Public License v.2.
14145 +*******************************************************************************
14146 +******************************************************************************/
14149 +#include "cnxman-private.h"
14151 +void copy_to_usernode(struct cluster_node *node, struct cl_cluster_node *unode);
14153 +#define UST_REGISTER 1
14154 +#define UST_UNREGISTER 2
14155 +#define UST_JOIN 3
14156 +#define UST_LEAVE 4
14157 +#define UST_JOINED 5
14160 + struct list_head list;
14161 + service_event_t type;
14162 + service_start_t start_type;
14163 + unsigned int event_id;
14164 + unsigned int last_stop;
14165 + unsigned int last_start;
14166 + unsigned int last_finish;
14167 + unsigned int node_count;
14168 + uint32_t * nodeids;
14170 +typedef struct event event_t;
14172 +struct user_service {
14173 + uint32_t local_id;
14176 + struct socket * sock;
14179 + struct semaphore lock;
14180 + struct list_head events;
14181 + spinlock_t event_lock;
14182 + unsigned int last_stop;
14183 + unsigned int last_start;
14184 + unsigned int last_finish;
14185 + unsigned int need_startdone;
14186 + unsigned int node_count;
14187 + uint32_t * nodeids;
14189 + char name[MAX_SERVICE_NAME_LEN];
14191 +typedef struct user_service user_service_t;
14194 +static void add_event(user_service_t *us, event_t *ev)
14196 + spin_lock(&us->event_lock);
14197 + list_add_tail(&ev->list, &us->events);
14199 + switch(ev->type) {
14200 + case SERVICE_EVENT_STOP:
14201 + us->last_stop = us->last_start;
14203 + case SERVICE_EVENT_START:
14204 + us->last_start = ev->event_id;
14206 + case SERVICE_EVENT_FINISH:
14207 + us->last_finish = ev->event_id;
14209 + case SERVICE_EVENT_LEAVEDONE:
14212 + spin_unlock(&us->event_lock);
14215 +static event_t *get_event(user_service_t *us)
14217 + event_t *ev = NULL;
14219 + spin_lock(&us->event_lock);
14220 + if (!list_empty(&us->events)) {
14221 + ev = list_entry(us->events.next, event_t, list);
14222 + ev->last_stop = us->last_stop;
14223 + ev->last_start = us->last_start;
14224 + ev->last_finish = us->last_finish;
14226 + spin_unlock(&us->event_lock);
14230 +static void del_event(user_service_t *us, event_t *ev)
14232 + spin_lock(&us->event_lock);
14233 + list_del(&ev->list);
14234 + spin_unlock(&us->event_lock);
14237 +static event_t *alloc_event(void)
14240 + SM_RETRY(ev = (event_t *) kmalloc(sizeof(event_t), GFP_KERNEL), ev);
14241 + memset(ev, 0, sizeof(event_t));
14245 +/* us->lock must be held before calling */
14246 +static void user_notify(user_service_t *us)
14249 + queue_oob_skb(us->sock, CLUSTER_OOB_MSG_SERVICEEVENT);
14250 + if (us->pid && us->signal)
14251 + kill_proc(us->pid, us->signal, 0);
14254 +static service_start_t start_type(int type)
14257 + case SERVICE_NODE_FAILED:
14258 + return SERVICE_START_FAILED;
14259 + case SERVICE_NODE_JOIN:
14260 + return SERVICE_START_JOIN;
14261 + case SERVICE_NODE_LEAVE:
14262 + return SERVICE_START_LEAVE;
14267 +static int user_stop(void *servicedata)
14269 + user_service_t *us = (user_service_t *) servicedata;
14276 + ev = alloc_event();
14277 + ev->type = SERVICE_EVENT_STOP;
14279 + add_event(us, ev);
14286 +static int user_start(void *servicedata, uint32_t *nodeids, int count,
14287 + int event_id, int type)
14289 + user_service_t *us = (user_service_t *) servicedata;
14294 + kcl_start_done(us->local_id, event_id);
14298 + us->need_startdone = event_id;
14300 + ev = alloc_event();
14301 + ev->type = SERVICE_EVENT_START;
14302 + ev->node_count = count;
14303 + ev->start_type = start_type(type);
14304 + ev->event_id = event_id;
14305 + ev->nodeids = nodeids;
14307 + add_event(us, ev);
14314 +static void user_finish(void *servicedata, int event_id)
14316 + user_service_t *us = (user_service_t *) servicedata;
14323 + ev = alloc_event();
14324 + ev->type = SERVICE_EVENT_FINISH;
14325 + ev->event_id = event_id;
14327 + add_event(us, ev);
14333 +struct kcl_service_ops user_service_ops = {
14334 + .stop = user_stop,
14335 + .start = user_start,
14336 + .finish = user_finish
14339 +static int user_register(char *u_name, user_service_t **us_data)
14341 + user_service_t *us;
14342 + char name[MAX_SERVICE_NAME_LEN+1];
14345 + memset(name, 0, MAX_SERVICE_NAME_LEN+1);
14347 + if (copy_from_user(&name, u_name, MAX_SERVICE_NAME_LEN))
14350 + len = strlen(name);
14351 + if (len > MAX_SERVICE_NAME_LEN)
14352 + return -ENAMETOOLONG;
14356 + us = kmalloc(sizeof(user_service_t), GFP_KERNEL);
14359 + memset(us, 0, sizeof(user_service_t));
14360 + us->nodeids = NULL;
14361 + INIT_LIST_HEAD(&us->events);
14362 + spin_lock_init(&us->event_lock);
14363 + init_MUTEX(&us->lock);
14364 + us->name_len = len;
14365 + memcpy(us->name, name, len);
14367 + error = kcl_register_service(name, len, SERVICE_LEVEL_USER,
14368 + &user_service_ops, TRUE, (void *) us,
14378 +static void user_unregister(user_service_t *us)
14382 + kcl_unregister_service(us->local_id);
14385 + kfree(us->nodeids);
14387 + while ((ev = get_event(us))) {
14388 + del_event(us, ev);
14390 + kfree(ev->nodeids);
14395 +static int user_join_async(void *arg)
14397 + user_service_t *us = arg;
14398 + int user_gone = 0;
14400 + daemonize("cman_userjoin");
14402 + kcl_join_service(us->local_id);
14405 + us->state = UST_JOINED;
14408 + if (us->need_startdone)
14409 + kcl_start_done(us->local_id, us->need_startdone);
14415 + kcl_leave_service(us->local_id);
14416 + user_unregister(us);
14422 +static int user_leave_async(void *arg)
14424 + user_service_t *us = arg;
14426 + daemonize("cman_userleave");
14428 + kcl_leave_service(us->local_id);
14433 + user_unregister(us);
14436 + event_t *ev = alloc_event();
14437 + ev->type = SERVICE_EVENT_LEAVEDONE;
14438 + add_event(us, ev);
14446 +static int user_join(user_service_t *us, int wait)
14451 + error = kcl_join_service(us->local_id);
14452 + us->state = UST_JOINED;
14456 + kernel_thread(user_join_async, us, 0);
14462 +static void user_leave(user_service_t *us, int wait)
14465 + kcl_leave_service(us->local_id);
14468 + kernel_thread(user_leave_async, us, 0);
14472 +static int user_start_done(user_service_t *us, unsigned int event_id)
14474 + if (!us->need_startdone)
14476 + if (us->need_startdone == event_id)
14477 + us->need_startdone = 0;
14478 + kcl_start_done(us->local_id, event_id);
14482 +static void user_set_signal(user_service_t *us, int signal)
14484 + us->pid = current->pid;
14485 + us->signal = signal;
14488 +static int user_get_event(user_service_t *us,
14489 + struct cl_service_event *user_event)
14492 + struct cl_service_event event;
14494 + ev = get_event(us);
14498 + event.type = ev->type;
14499 + event.start_type = ev->start_type;
14500 + event.event_id = ev->event_id;
14501 + event.last_stop = ev->last_stop;
14502 + event.last_start = ev->last_start;
14503 + event.last_finish = ev->last_finish;
14504 + event.node_count = ev->node_count;
14506 + if (copy_to_user(user_event, &event, sizeof(struct cl_service_event)))
14509 + del_event(us, ev);
14511 + if (ev->type == SERVICE_EVENT_START) {
14513 + kfree(us->nodeids);
14514 + us->nodeids = ev->nodeids;
14515 + us->node_count = ev->node_count;
14522 +static int user_get_members(user_service_t *us,
14523 + struct cl_cluster_nodelist *u_nodelist)
14525 + struct cl_cluster_nodelist user_nodelist;
14526 + struct cl_cluster_node user_node, *u_node;
14527 + struct cluster_node *node;
14529 + int num_nodes = 0;
14532 + return us->node_count;
14534 + if (copy_from_user(&user_nodelist, (void __user *) u_nodelist,
14535 + sizeof(struct cl_cluster_nodelist)))
14538 + if (user_nodelist.max_members < us->node_count)
14541 + u_node = user_nodelist.nodes;
14543 + for (i = 0; i < us->node_count; i++) {
14544 + node = find_node_by_nodeid(us->nodeids[i]);
14548 + copy_to_usernode(node, &user_node);
14549 + if (copy_to_user(u_node, &user_node,
14550 + sizeof(struct cl_cluster_node)))
14556 + return num_nodes;
14559 +static int user_global_id(user_service_t *us, uint32_t *id)
14561 + uint32_t gid = 0;
14563 + if (us->state != UST_JOINED)
14566 + kcl_global_service_id(us->local_id, &gid);
14568 + if (copy_to_user(id, &gid, sizeof(uint32_t)))
14573 +static int user_set_level(user_service_t *us, int level)
14575 + int prev_id = us->local_id;
14578 + if (us->state != UST_REGISTER)
14581 + error = kcl_register_service(us->name, us->name_len, level,
14582 + &user_service_ops, TRUE, (void *) us,
14587 + kcl_unregister_service(prev_id);
14591 +int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
14593 + struct cluster_sock *c = cluster_sk(sock->sk);
14594 + user_service_t *us = c->service_data;
14597 + if (!us && cmd != SIOCCLUSTER_SERVICE_REGISTER)
14601 + case SIOCCLUSTER_SERVICE_REGISTER:
14602 + error = user_register((char *) arg, &us);
14604 + us->state = UST_REGISTER;
14606 + c->service_data = us;
14610 + case SIOCCLUSTER_SERVICE_UNREGISTER:
14612 + us->state = UST_UNREGISTER;
14613 + user_unregister(us);
14617 + case SIOCCLUSTER_SERVICE_JOIN:
14618 + us->state = UST_JOIN;
14619 + user_join(us, 0);
14622 + case SIOCCLUSTER_SERVICE_LEAVE:
14624 + if (us->state != UST_JOINED) {
14628 + us->state = UST_LEAVE;
14630 + user_leave(us, 0);
14634 + case SIOCCLUSTER_SERVICE_SETSIGNAL:
14635 + user_set_signal(us, (int) arg);
14638 + case SIOCCLUSTER_SERVICE_STARTDONE:
14639 + error = user_start_done(us, (unsigned int) arg);
14642 + case SIOCCLUSTER_SERVICE_GETEVENT:
14643 + error = user_get_event(us, (struct cl_service_event *) arg);
14646 + case SIOCCLUSTER_SERVICE_GETMEMBERS:
14647 + error = user_get_members(us, (struct cl_cluster_nodelist *)arg);
14650 + case SIOCCLUSTER_SERVICE_GLOBALID:
14651 + error = user_global_id(us, (uint32_t *) arg);
14654 + case SIOCCLUSTER_SERVICE_SETLEVEL:
14655 + error = user_set_level(us, (int) arg);
14665 +void sm_sock_release(struct socket *sock)
14667 + struct cluster_sock *c = cluster_sk(sock->sk);
14668 + user_service_t *us = c->service_data;
14676 + c->service_data = NULL;
14678 + if (us->need_startdone)
14679 + kcl_start_done(us->local_id, us->need_startdone);
14682 + /* async thread will clean up before exiting */
14686 + state = us->state;
14693 + user_leave(us, 1);
14694 + /* fall through */
14696 + case UST_REGISTER:
14697 + user_unregister(us);
14698 + /* fall through */
14699 + case UST_UNREGISTER:
14704 diff -urN linux-orig/cluster/cman/sm_user.h linux-patched/cluster/cman/sm_user.h
14705 --- linux-orig/cluster/cman/sm_user.h 1970-01-01 07:30:00.000000000 +0730
14706 +++ linux-patched/cluster/cman/sm_user.h 2004-11-03 11:37:37.000000000 +0800
14708 +/******************************************************************************
14709 +*******************************************************************************
14711 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14712 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14714 +** This copyrighted material is made available to anyone wishing to use,
14715 +** modify, copy, or redistribute it subject to the terms and conditions
14716 +** of the GNU General Public License v.2.
14718 +*******************************************************************************
14719 +******************************************************************************/
14721 +#ifndef __SM_USER_DOT_H__
14722 +#define __SM_USER_DOT_H__
14724 +int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
14725 +void sm_sock_release(struct socket *sock);
14726 +void sm_sock_bind(struct socket *sock);
14729 diff -urN linux-orig/include/cluster/cnxman-socket.h linux-patched/include/cluster/cnxman-socket.h
14730 --- linux-orig/include/cluster/cnxman-socket.h 1970-01-01 07:30:00.000000000 +0730
14731 +++ linux-patched/include/cluster/cnxman-socket.h 2004-11-03 11:37:37.000000000 +0800
14733 +/******************************************************************************
14734 +*******************************************************************************
14736 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14737 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14739 +** This copyrighted material is made available to anyone wishing to use,
14740 +** modify, copy, or redistribute it subject to the terms and conditions
14741 +** of the GNU General Public License v.2.
14743 +*******************************************************************************
14744 +******************************************************************************/
14746 +/* CMAN socket interface header,
14747 + may be included by user or kernel code */
14749 +#ifndef __CNXMAN_SOCKET_H
14750 +#define __CNXMAN_SOCKET_H
14752 +/* A currently unused number. TIPC also uses this number and you're unlikely
14753 + to be using both.
14755 +#define AF_CLUSTER 30
14756 +#define PF_CLUSTER AF_CLUSTER
14758 +/* Protocol(socket) types */
14759 +#define CLPROTO_MASTER 2
14760 +#define CLPROTO_CLIENT 3
14762 +/* ioctls -- should register these properly */
14763 +#define SIOCCLUSTER_NOTIFY _IOW('x', 0x01, int)
14764 +#define SIOCCLUSTER_REMOVENOTIFY _IO( 'x', 0x02)
14765 +#define SIOCCLUSTER_GETMEMBERS _IOR('x', 0x03, struct cl_cluster_nodelist)
14766 +#define SIOCCLUSTER_SETEXPECTED_VOTES _IOW('x', 0x04, int)
14767 +#define SIOCCLUSTER_ISQUORATE _IO( 'x', 0x05)
14768 +#define SIOCCLUSTER_ISLISTENING _IOW('x', 0x06, struct cl_listen_request)
14769 +#define SIOCCLUSTER_GETALLMEMBERS _IOR('x', 0x07, struct cl_cluster_nodelist)
14770 +#define SIOCCLUSTER_SET_VOTES _IOW('x', 0x08, int)
14771 +#define SIOCCLUSTER_GET_VERSION _IOR('x', 0x09, struct cl_version)
14772 +#define SIOCCLUSTER_SET_VERSION _IOW('x', 0x0a, struct cl_version)
14773 +#define SIOCCLUSTER_ISACTIVE _IO( 'x', 0x0b)
14774 +#define SIOCCLUSTER_KILLNODE _IOW('x', 0x0c, int)
14775 +#define SIOCCLUSTER_GET_JOINCOUNT _IO( 'x', 0x0d)
14776 +#define SIOCCLUSTER_SERVICE_REGISTER _IOW('x', 0x0e, char)
14777 +#define SIOCCLUSTER_SERVICE_UNREGISTER _IO('x', 0x0f)
14778 +#define SIOCCLUSTER_SERVICE_JOIN _IO( 'x', 0x10)
14779 +#define SIOCCLUSTER_SERVICE_LEAVE _IO( 'x', 0x20)
14780 +#define SIOCCLUSTER_SERVICE_SETSIGNAL _IOW('x', 0x30, int)
14781 +#define SIOCCLUSTER_SERVICE_STARTDONE _IOW('x', 0x40, unsigned int)
14782 +#define SIOCCLUSTER_SERVICE_GETEVENT _IOR('x', 0x50, struct cl_service_event)
14783 +#define SIOCCLUSTER_SERVICE_GETMEMBERS _IOR('x', 0x60, struct cl_cluster_nodelist)
14784 +#define SIOCCLUSTER_SERVICE_GLOBALID _IOR('x', 0x70, uint32_t)
14785 +#define SIOCCLUSTER_SERVICE_SETLEVEL _IOR('x', 0x80, int)
14786 +#define SIOCCLUSTER_GETNODE _IOWR('x', 0x90, struct cl_cluster_node)
14787 +#define SIOCCLUSTER_GETCLUSTER _IOWR('x', 0x91, struct cl_cluster_info)
14788 +#define SIOCCLUSTER_BARRIER _IOW('x', 0x0a0, struct cl_barrier_info)
14790 +/* These were setsockopts */
14791 +#define SIOCCLUSTER_PASS_SOCKET _IOW('x', 0x0b0, struct cl_passed_sock)
14792 +#define SIOCCLUSTER_SET_NODENAME _IOW('x', 0x0b1, char *)
14793 +#define SIOCCLUSTER_SET_NODEID _IOW('x', 0x0b2, int)
14794 +#define SIOCCLUSTER_JOIN_CLUSTER _IOW('x', 0x0b3, struct cl_join_cluster_info)
14795 +#define SIOCCLUSTER_LEAVE_CLUSTER _IOW('x', 0x0b4, int)
14798 +/* Maximum size of a cluster message */
14799 +#define MAX_CLUSTER_MESSAGE 1500
14800 +#define MAX_CLUSTER_MEMBER_NAME_LEN 255
14801 +#define MAX_BARRIER_NAME_LEN 33
14802 +#define MAX_SA_ADDR_LEN 12
14803 +#define MAX_CLUSTER_NAME_LEN 16
14805 +/* Well-known cluster port numbers */
14806 +#define CLUSTER_PORT_MEMBERSHIP 1 /* Mustn't block during cluster
14807 + * transitions! */
14808 +#define CLUSTER_PORT_SERVICES 2
14809 +#define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */
14810 +#define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */
14811 +#define CLUSTER_PORT_SLM 12 /* LVM SLM (simple lock manager) */
14813 +/* Port numbers above this will be blocked when the cluster is inquorate or in
14815 +#define HIGH_PROTECTED_PORT 9
14817 +/* Reasons for leaving the cluster */
14818 +#define CLUSTER_LEAVEFLAG_DOWN 0 /* Normal shutdown */
14819 +#define CLUSTER_LEAVEFLAG_KILLED 1
14820 +#define CLUSTER_LEAVEFLAG_PANIC 2
14821 +#define CLUSTER_LEAVEFLAG_REMOVED 3 /* This one can reduce quorum */
14822 +#define CLUSTER_LEAVEFLAG_REJECTED 4 /* Not allowed into the cluster in the
14824 +#define CLUSTER_LEAVEFLAG_INCONSISTENT 5 /* Our view of the cluster is
14825 + * in a minority */
14826 +#define CLUSTER_LEAVEFLAG_DEAD 6 /* Discovered to be dead */
14827 +#define CLUSTER_LEAVEFLAG_FORCE 0x10 /* Forced by command-line */
14829 +/* OOB messages sent to a local socket */
14830 +#define CLUSTER_OOB_MSG_PORTCLOSED 1
14831 +#define CLUSTER_OOB_MSG_STATECHANGE 2
14832 +#define CLUSTER_OOB_MSG_SERVICEEVENT 3
14834 +/* Sendmsg flags, these are above the normal sendmsg flags so they don't
14836 +#define MSG_NOACK 0x010000 /* Don't need an ACK for this message */
14837 +#define MSG_QUEUE 0x020000 /* Queue the message for sending later */
14838 +#define MSG_MULTICAST 0x080000 /* Message was sent to all nodes in the cluster
14840 +#define MSG_ALLINT 0x100000 /* Send out of all interfaces */
14841 +#define MSG_REPLYEXP 0x200000 /* Reply is expected */
14842 +#define MSG_BCASTSELF 0x400000 /* Broadcast message also gets sent to us */
14844 +typedef enum { NODESTATE_JOINING=1, NODESTATE_MEMBER,
14845 + NODESTATE_DEAD } nodestate_t;
14848 +struct sockaddr_cl {
14849 + unsigned short scl_family;
14850 + unsigned char scl_flags;
14851 + unsigned char scl_port;
14856 + * This is how we pass the multicast & receive sockets into kernel space.
14858 +struct cl_passed_sock {
14859 + int fd; /* FD of master socket to do multicast on */
14860 + int number; /* Socket number, to match up recvonly & bcast
14862 + int multicast; /* Is it multicast or receive ? */
14865 +/* Cluster configuration info passed when we join the cluster */
14866 +struct cl_join_cluster_info {
14867 + unsigned char votes;
14868 + unsigned int expected_votes;
14869 + unsigned int two_node;
14870 + unsigned int config_version;
14872 + char cluster_name[17];
14876 +/* This is the structure, per node, returned from the membership ioctl */
14877 +struct cl_cluster_node {
14878 + unsigned int size;
14879 + unsigned int node_id;
14881 + unsigned int leave_reason;
14882 + unsigned int incarnation;
14883 + nodestate_t state;
14884 + char name[MAX_CLUSTER_MEMBER_NAME_LEN];
14885 + unsigned char votes;
14888 +/* The struct passed to the membership ioctls */
14889 +struct cl_cluster_nodelist {
14890 + uint32_t max_members;
14891 + struct cl_cluster_node *nodes;
14894 +/* Structure passed to SIOCCLUSTER_ISLISTENING */
14895 +struct cl_listen_request {
14896 + unsigned char port;
14900 +/* A Cluster PORTCLOSED message - received by a local user as an OOB message */
14901 +struct cl_portclosed_oob {
14902 + unsigned char cmd; /* CLUSTER_OOB_MSG_PORTCLOSED */
14903 + unsigned char port;
14906 +/* Get all version numbers or set the config version */
14907 +struct cl_version {
14908 + unsigned int major;
14909 + unsigned int minor;
14910 + unsigned int patch;
14911 + unsigned int config;
14914 +/* structure passed to barrier ioctls */
14915 +struct cl_barrier_info {
14917 + char name[MAX_BARRIER_NAME_LEN];
14918 + unsigned int flags;
14919 + unsigned long arg;
14922 +struct cl_cluster_info {
14923 + char name[MAX_CLUSTER_NAME_LEN+1];
14927 +typedef enum { SERVICE_EVENT_STOP, SERVICE_EVENT_START, SERVICE_EVENT_FINISH,
14928 + SERVICE_EVENT_LEAVEDONE } service_event_t;
14930 +typedef enum { SERVICE_START_FAILED, SERVICE_START_JOIN, SERVICE_START_LEAVE }
14933 +struct cl_service_event {
14934 + service_event_t type;
14935 + service_start_t start_type;
14936 + unsigned int event_id;
14937 + unsigned int last_stop;
14938 + unsigned int last_start;
14939 + unsigned int last_finish;
14940 + unsigned int node_count;
14944 +/* Commands to the barrier ioctl */
14945 +#define BARRIER_IOCTL_REGISTER 1
14946 +#define BARRIER_IOCTL_CHANGE 2
14947 +#define BARRIER_IOCTL_DELETE 3
14948 +#define BARRIER_IOCTL_WAIT 4
14950 +/* Attributes of a barrier - bitmask */
14951 +#define BARRIER_ATTR_AUTODELETE 1
14952 +#define BARRIER_ATTR_MULTISTEP 2
14953 +#define BARRIER_ATTR_MANUAL 4
14954 +#define BARRIER_ATTR_ENABLED 8
14955 +#define BARRIER_ATTR_CALLBACK 16
14957 +/* Attribute setting commands */
14958 +#define BARRIER_SETATTR_AUTODELETE 1
14959 +#define BARRIER_SETATTR_MULTISTEP 2
14960 +#define BARRIER_SETATTR_ENABLED 3
14961 +#define BARRIER_SETATTR_NODES 4
14962 +#define BARRIER_SETATTR_CALLBACK 5
14963 +#define BARRIER_SETATTR_TIMEOUT 6
14966 diff -urN linux-orig/include/cluster/cnxman.h linux-patched/include/cluster/cnxman.h
14967 --- linux-orig/include/cluster/cnxman.h 1970-01-01 07:30:00.000000000 +0730
14968 +++ linux-patched/include/cluster/cnxman.h 2004-11-03 11:37:37.000000000 +0800
14970 +/******************************************************************************
14971 +*******************************************************************************
14973 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14974 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14976 +** This copyrighted material is made available to anyone wishing to use,
14977 +** modify, copy, or redistribute it subject to the terms and conditions
14978 +** of the GNU General Public License v.2.
14980 +*******************************************************************************
14981 +******************************************************************************/
14983 +#ifndef __CNXMAN_H
14984 +#define __CNXMAN_H
14986 +#include "linux/in6.h"
14987 +#include "cluster/cnxman-socket.h"
14989 +/* In-kernel API */
14991 +/* This is the structure, per node, returned from the membership request */
14992 +struct kcl_cluster_node {
14993 + unsigned int size;
14994 + unsigned int node_id;
14996 + unsigned int leave_reason;
14997 + unsigned int incarnation;
14998 + nodestate_t state;
14999 + struct list_head list;
15000 + char name[MAX_CLUSTER_MEMBER_NAME_LEN];
15001 + unsigned char votes;
15004 +struct cluster_node_addr {
15005 + struct list_head list;
15006 + unsigned char addr[sizeof(struct sockaddr_in6)];/* A large sockaddr */
15011 +/* Reasons for a kernel membership callback */
15012 +typedef enum { CLUSTER_RECONFIG, DIED, LEAVING, NEWNODE } kcl_callback_reason;
15014 +/* Kernel version of above, the void *sock is a struct socket */
15015 +struct kcl_multicast_sock {
15017 + int number; /* Socket number, to match up recvonly & bcast
15021 +extern int kcl_sendmsg(struct socket *sock, void *buf, int size,
15022 + struct sockaddr_cl *caddr, int addr_len,
15023 + unsigned int flags);
15024 +extern int kcl_register_read_callback(struct socket *sock,
15025 + int (*routine) (char *, int, char *, int,
15027 +extern int kcl_add_callback(void (*callback) (kcl_callback_reason, long));
15028 +extern int kcl_remove_callback(void (*callback) (kcl_callback_reason, long));
15029 +extern int kcl_get_members(struct list_head *list);
15030 +extern int kcl_get_member_ids(uint32_t * idbuf, int size);
15031 +extern int kcl_get_all_members(struct list_head *list);
15032 +extern int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
15033 + struct kcl_cluster_node *n);
15034 +extern int kcl_get_node_by_name(unsigned char *name,
15035 + struct kcl_cluster_node *n);
15036 +extern int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n);
15037 +extern int kcl_is_quorate(void);
15038 +extern int kcl_addref_cluster(void);
15039 +extern int kcl_releaseref_cluster(void);
15040 +extern int kcl_cluster_name(char **cname);
15041 +extern int kcl_get_current_interface(void);
15042 +extern struct list_head *kcl_get_node_addresses(int nodeid);
15044 +extern int kcl_barrier_register(char *name, unsigned int flags,
15045 + unsigned int nodes);
15046 +extern int kcl_barrier_setattr(char *name, unsigned int attr,
15047 + unsigned long arg);
15048 +extern int kcl_barrier_delete(char *name);
15049 +extern int kcl_barrier_wait(char *name);
15050 +extern int kcl_barrier_cancel(char *name);
15052 +extern int kcl_register_quorum_device(char *name, int votes);
15053 +extern int kcl_unregister_quorum_device(void);
15054 +extern int kcl_quorum_device_available(int yesno);
15057 diff -urN linux-orig/include/cluster/service.h linux-patched/include/cluster/service.h
15058 --- linux-orig/include/cluster/service.h 1970-01-01 07:30:00.000000000 +0730
15059 +++ linux-patched/include/cluster/service.h 2004-11-03 11:37:37.000000000 +0800
15061 +/******************************************************************************
15062 +*******************************************************************************
15064 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
15065 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
15067 +** This copyrighted material is made available to anyone wishing to use,
15068 +** modify, copy, or redistribute it subject to the terms and conditions
15069 +** of the GNU General Public License v.2.
15071 +*******************************************************************************
15072 +******************************************************************************/
15074 +#ifndef __SERVICE_DOT_H__
15075 +#define __SERVICE_DOT_H__
15078 + * Interface between service manager and services
15082 + * Service levels are started in order from lowest, so level 0 is started on
15083 + * all nodes before level 1 is started.
15086 +#define SERVICE_LEVEL_FENCE (0)
15087 +#define SERVICE_LEVEL_GDLM (1)
15088 +#define SERVICE_LEVEL_GFS (2)
15089 +#define SERVICE_LEVEL_USER (3)
15091 +#define MAX_SERVICE_NAME_LEN (33)
15094 + * The type of start a service receives. The start (and preceding stop) may be
15095 + * due to a node joining or leaving the SG or due to a node having failed.
15098 +#define SERVICE_NODE_FAILED (1)
15099 +#define SERVICE_NODE_JOIN (2)
15100 +#define SERVICE_NODE_LEAVE (3)
15103 +struct kcl_service {
15104 + struct list_head list;
15106 + uint32_t local_id;
15107 + uint32_t global_id;
15109 + char name[MAX_SERVICE_NAME_LEN];
15110 +};
15112 +int kcl_get_services(struct list_head *list, int level);
15116 + * These routines which run in CMAN context must return quickly and cannot
15120 +struct kcl_service_ops {
15121 + int (*stop) (void *servicedata);
15122 + int (*start) (void *servicedata, uint32_t *nodeids, int count,
15123 + int event_id, int type);
15124 + void (*finish) (void *servicedata, int event_id);
15125 +};
15128 + * Register will cause CMAN to create a Service Group (SG) for the named
15129 + * instance of the service. A local ID is returned which is used to join,
15130 + * leave and unregister the service.
15133 +int kcl_register_service(char *name, int namelen, int level,
15134 + struct kcl_service_ops *ops, int unique,
15135 + void *servicedata, uint32_t *local_id);
15137 +void kcl_unregister_service(uint32_t local_id);
15140 + * Once a service is joined it will be managed by CMAN and receive start, stop,
15141 + * and finish calls. After leave is called the service is no longer managed by
15142 + * CMAN. The first start for a service may arrive before kcl_join_service()
15146 +int kcl_join_service(uint32_t local_id);
15147 +int kcl_leave_service(uint32_t local_id);
15150 + * After a service is started, it can ask for its cluster-wide unique ID.
15153 +void kcl_global_service_id(uint32_t local_id, uint32_t * global_id);
15156 + * Called by a service when it's done with a start(). Cannot be called from
15157 + * the start function.
15160 +void kcl_start_done(uint32_t local_id, int event_id);