--- /dev/null
+# Add CMAN to build system
+diff -urN -p linux-2.6.7/Makefile linux/Makefile
+--- linux-2.6.7/Makefile 2004-06-16 13:19:37.000000000 +0800
++++ linux/Makefile 2004-06-17 14:55:06.000000000 +0800
+@@ -418,7 +418,7 @@ all: vmlinux
+
+ # Objects we will link into vmlinux / subdirs we need to visit
+ init-y := init/
+-drivers-y := drivers/ sound/
++drivers-y := drivers/ sound/ cluster/
+ net-y := net/
+ libs-y := lib/
+ core-y := usr/
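+# Adding cluster/ to drivers-y makes kbuild descend into the new directory
+# and link its objects into vmlinux; the per-arch "source cluster/Kconfig"
+# hunks below make the new menu visible to each architecture's config.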
+diff -urN -p linux-2.6.7/arch/alpha/Kconfig linux/arch/alpha/Kconfig
+--- linux-2.6.7/arch/alpha/Kconfig 2004-06-16 13:19:44.000000000 +0800
++++ linux/arch/alpha/Kconfig 2004-06-17 14:55:06.000000000 +0800
+@@ -698,3 +698,4 @@ source "crypto/Kconfig"
+
+ source "lib/Kconfig"
+
++source "cluster/Kconfig"
+diff -urN -p linux-2.6.7/arch/i386/Kconfig linux/arch/i386/Kconfig
+--- linux-2.6.7/arch/i386/Kconfig 2004-06-16 13:18:59.000000000 +0800
++++ linux/arch/i386/Kconfig 2004-06-17 14:55:06.000000000 +0800
+@@ -1315,6 +1315,8 @@ source "crypto/Kconfig"
+
+ source "lib/Kconfig"
+
++source "cluster/Kconfig"
++
+ config X86_SMP
+ bool
+ depends on SMP && !X86_VOYAGER
+diff -urN -p linux-2.6.7/arch/parisc/Kconfig linux/arch/parisc/Kconfig
+--- linux-2.6.7/arch/parisc/Kconfig 2004-06-16 13:19:36.000000000 +0800
++++ linux/arch/parisc/Kconfig 2004-06-17 14:55:06.000000000 +0800
+@@ -229,3 +229,4 @@ source "crypto/Kconfig"
+
+ source "lib/Kconfig"
+
++source "cluster/Kconfig"
+diff -urN -p linux-2.6.7/arch/sparc64/Kconfig linux/arch/sparc64/Kconfig
+--- linux-2.6.7/arch/sparc64/Kconfig 2004-06-16 13:19:52.000000000 +0800
++++ linux/arch/sparc64/Kconfig 2004-06-17 14:55:06.000000000 +0800
+@@ -713,3 +713,4 @@ source "crypto/Kconfig"
+
+ source "lib/Kconfig"
+
++source "cluster/Kconfig"
+diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig
+--- linux-2.6.7/cluster/Kconfig 1970-01-01 07:30:00.000000000 +0730
++++ linux/cluster/Kconfig 2004-06-17 14:55:06.000000000 +0800
+@@ -0,0 +1,13 @@
++menu "Cluster Support"
++
++config CLUSTER
++ tristate "Cluster support"
++ ---help---
++ Enable clustering support. This is not the high-performance clustering
++ made famous by Beowulf. It is a high-availability cluster often using
++ shared storage.
++ The cluster manager is the heart(beat) of the cluster system. It is
++ needed by all the other components. It provides membership services
++ for those other subsystems.
++
++endmenu
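+# CLUSTER is a tristate, so CMAN can be built into the kernel or as a
+# module. An illustrative .config fragment for the modular build:
+#
+#   CONFIG_CLUSTER=m
+#
+# With CONFIG_CLUSTER=y the same code is linked straight into vmlinux.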
+diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile
+--- linux-2.6.7/cluster/Makefile 1970-01-01 07:30:00.000000000 +0730
++++ linux/cluster/Makefile 2004-06-17 14:55:06.000000000 +0800
+@@ -0,0 +1,3 @@
++obj-y := nocluster.o
++
++obj-$(CONFIG_CLUSTER) += cman/
+diff -urN -p linux-2.6.7/cluster/cman/Makefile linux/cluster/cman/Makefile
+--- linux-2.6.7/cluster/cman/Makefile 1970-01-01 07:30:00.000000000 +0730
++++ linux/cluster/cman/Makefile 2004-06-17 14:55:06.000000000 +0800
+@@ -0,0 +1,6 @@
++cman-objs := cnxman.o config.o membership.o proc.o\
++ sm_barrier.o sm_control.o sm_daemon.o sm_joinleave.o\
++ sm_membership.o sm_message.o sm_misc.o sm_recover.o sm_services.o \
++ sm_user.o
++
++obj-$(CONFIG_CLUSTER) := cman.o
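+# This uses the standard kbuild composite-object idiom: the files listed in
+# cman-objs are compiled and linked into a single object, cman.o (cman.ko
+# when CONFIG_CLUSTER=m). A hypothetical extra source file would be added
+# with:
+#
+#   cman-objs += newfile.o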
+diff -urN -p linux-2.6.7/cluster/nocluster.c linux/cluster/nocluster.c
+--- linux-2.6.7/cluster/nocluster.c 1970-01-01 07:30:00.000000000 +0730
++++ linux/cluster/nocluster.c 2004-06-17 14:55:06.000000000 +0800
+@@ -0,0 +1,20 @@
++/*
++ * cluster/nocluster.c
++ *
++ * Copied from net/nonet.c.
++ * Dummy functions to allow us to configure cluster support entirely
++ * out of the kernel.
++ *
++ * Distributed under the terms of the GNU GPL version 2.
++ * Copyright (c) Matthew Wilcox 2003
++ */
++
++#include <linux/module.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++
++void __init nocluster_init(void)
++{
++}
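+# nocluster.o is built unconditionally (obj-y), so cluster/ always produces
+# a built-in object even with CONFIG_CLUSTER=n, mirroring the net/nonet.c
+# trick it was copied from; nocluster_init() is deliberately an empty stub.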
+diff -urN linux-orig/cluster/cman/cnxman-private.h linux-patched/cluster/cman/cnxman-private.h
+--- linux-orig/cluster/cman/cnxman-private.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/cnxman-private.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,427 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __CNXMAN_PRIVATE_H
++#define __CNXMAN_PRIVATE_H
++
++/* Version triplet */
++#define CNXMAN_MAJOR_VERSION 2
++#define CNXMAN_MINOR_VERSION 0
++#define CNXMAN_PATCH_VERSION 1
++
++#define MAX_RETRIES 3 /* Maximum number of send retries */
++#define CAP_CLUSTER CAP_SYS_ADMIN /* Capability needed to manage the
++ * cluster */
++#ifdef __KERNEL__
++
++/* How we announce ourselves in console messages */
++#define CMAN_NAME "CMAN"
++
++/* One of these per AF_CLUSTER socket */
++struct cluster_sock {
++ /* WARNING: sk has to be the first member */
++ struct sock sk;
++
++ unsigned char port; /* Bound port or zero */
++ int (*kernel_callback) (char *, int, char *, int, unsigned int);
++ void *service_data;
++};
++
++#define cluster_sk(__sk) ((struct cluster_sock *)__sk)
++
++/* We have one of these for each socket we use for communications */
++struct cl_comms_socket {
++ struct socket *sock;
++ int broadcast; /* This is a broadcast socket */
++ int recv_only; /* This is the unicast receive end of a
++ * multicast socket */
++ struct sockaddr_in6 saddr; /* Socket address, contains the sockaddr for
++ * the remote end(s) */
++ int addr_len; /* Length of above */
++ int number; /* Internal socket number, used to cycle around
++ * sockets in case of network errors */
++ struct file *file; /* file pointer for user-passed in sockets */
++
++ wait_queue_t wait;
++
++ /* The socket list */
++ struct list_head list;
++
++ /* On here when it has something to say */
++ struct list_head active_list;
++ unsigned long active;
++};
++
++/* A client socket. We keep a list of these so we can notify clients of cluster
++ * events */
++struct cl_client_socket {
++ struct socket *sock;
++ struct list_head list;
++};
++
++/* This structure is tacked onto the start of a cluster message packet for our
++ * own nefarious purposes. */
++struct cl_protheader {
++ unsigned char port;
++ unsigned char flags;
++ unsigned short cluster; /* Our cluster number, little-endian */
++ unsigned short seq; /* Packet sequence number, little-endian */
++ int srcid; /* Node ID of the sender */
++ int tgtid; /* Node ID of the target or 0 for multicast
++ * messages */
++};
++
++/* A cluster internal protocol message - port number 0 */
++struct cl_protmsg {
++ struct cl_protheader header;
++ unsigned char cmd;
++};
++
++/* A Cluster ACK message */
++struct cl_ackmsg {
++ struct cl_protheader header;
++ unsigned char cmd; /* Always CLUSTER_CMD_ACK */
++	unsigned char remport;	/* Remote port number the original message was
++ * for */
++ unsigned char aflags; /* ACK flags 0=OK, 1=No listener */
++ unsigned char pad;
++ unsigned short seq; /* Sequence number we are acking */
++};
++
++/* A Cluster LISTENREQ/LISTENRESP message */
++struct cl_listenmsg {
++ unsigned char cmd; /* CLUSTER_CMD_LISTENRESP/REQ */
++ unsigned char target_port; /* Port to probe */
++ unsigned char listening; /* Always 0 for LISTENREQ */
++ unsigned char pad;
++ unsigned short tag; /* PID of remote waiting process */
++};
++
++/* A Cluster PORTCLOSED message */
++struct cl_closemsg {
++ unsigned char cmd; /* CLUSTER_CMD_PORTCLOSED */
++ unsigned char port;
++};
++
++/* Structure of a newly dead node, passed from cnxman to kmembershipd */
++struct cl_new_dead_node {
++ struct list_head list;
++ struct cluster_node *node;
++};
++
++/* Subcommands for BARRIER message */
++#define BARRIER_REGISTER 1
++#define BARRIER_CHANGE 2
++#define BARRIER_WAIT 4
++#define BARRIER_COMPLETE 5
++
++/* A Cluster BARRIER message */
++struct cl_barriermsg {
++ unsigned char cmd; /* CLUSTER_CMD_BARRIER */
++ unsigned char subcmd; /* BARRIER sub command */
++ unsigned short pad;
++ unsigned int flags;
++ unsigned int nodes;
++ char name[MAX_BARRIER_NAME_LEN];
++};
++
++/* Membership services messages, the cl_protheader is added transparently */
++struct cl_mem_hello_msg {
++ unsigned char cmd;
++ unsigned char flags;
++ unsigned short members; /* Number of nodes in the cluster,
++ * little-endian */
++ unsigned int generation; /* Current cluster generation number */
++};
++
++struct cl_mem_endtrans_msg {
++ unsigned char cmd;
++ unsigned char pad1;
++ unsigned short pad2;
++ unsigned int quorum;
++ unsigned int total_votes;
++ unsigned int generation; /* Current cluster generation number */
++ unsigned int new_node_id; /* If reason is a new node joining */
++};
++
++/* ACK types for JOINACK message */
++#define JOINACK_TYPE_OK 1 /* You can join */
++#define JOINACK_TYPE_NAK 2 /* You can NOT join */
++#define JOINACK_TYPE_WAIT 3 /* Wait a bit longer - cluster is in transition
++ * already */
++
++struct cl_mem_joinack_msg {
++ unsigned char cmd;
++ unsigned char acktype;
++};
++
++/* This is used by JOINREQ message */
++struct cl_mem_join_msg {
++ unsigned char cmd;
++ unsigned char votes;
++ unsigned short num_addr; /* Number of addresses for this node */
++ unsigned int expected_votes;
++ unsigned int members; /* Number of nodes in the cluster,
++ * little-endian */
++ unsigned int major_version; /* Not backwards compatible */
++ unsigned int minor_version; /* Backwards compatible */
++ unsigned int patch_version; /* Backwards/forwards compatible */
++ unsigned int config_version;
++ unsigned int addr_len; /* length of node addresses */
++ char clustername[16];
++ /* Followed by <num_addr> addresses of `address_length` bytes and a
++ * NUL-terminated node name */
++};
++
++/* State transition start reasons: */
++#define TRANS_NEWNODE 1 /* A new node is joining the cluster */
++#define TRANS_REMNODE 2 /* A node has left the cluster */
++#define TRANS_ANOTHERREMNODE 3 /* A node left the cluster while we were in
++ * transition */
++#define TRANS_NEWMASTER 4 /* We have had an election and I am the new
++ * master */
++#define TRANS_CHECK 5 /* A consistency check was called for */
++#define TRANS_RESTART 6 /* Transition restarted because of a previous
++ * timeout */
++#define TRANS_DEADMASTER 7 /* The master died during transition and I have
++ * taken over */
++
++/* This is used to start a state transition */
++struct cl_mem_starttrans_msg {
++ unsigned char cmd;
++ unsigned char reason; /* Why a start transition was started - see
++ * above */
++ unsigned char flags;
++ unsigned char votes;
++ unsigned int expected_votes;
++ unsigned int generation; /* Incremented for each STARTTRANS sent
++ */
++ int nodeid; /* Node to be removed */
++ unsigned short num_addrs;
++ /* If reason == TRANS_NEWNODE: Followed by <num_addr> addresses of
++ * `address_length` bytes and a NUL-terminated node name */
++};
++
++struct cl_mem_startack_msg {
++ unsigned char cmd;
++ unsigned char reason;
++ unsigned short pad;
++ unsigned int generation;
++ unsigned int node_id; /* node_id we think new node should have */
++ unsigned int highest_node_id; /* highest node_id on this system */
++};
++
++/* Reconfigure a cluster parameter */
++struct cl_mem_reconfig_msg {
++ unsigned char cmd;
++ unsigned char param;
++ unsigned short pad;
++ unsigned int value;
++};
++
++/* Structure containing information about an outstanding listen request */
++struct cl_waiting_listen_request {
++ wait_queue_head_t waitq;
++ int result;
++ int waiting;
++ unsigned short tag;
++ int nodeid;
++ struct list_head list;
++};
++
++/* Messages from membership services */
++#define CLUSTER_MEM_JOINCONF 1
++#define CLUSTER_MEM_JOINREQ 2
++#define CLUSTER_MEM_LEAVE 3
++#define CLUSTER_MEM_HELLO 4
++#define CLUSTER_MEM_KILL 5
++#define CLUSTER_MEM_JOINACK 6
++#define CLUSTER_MEM_ENDTRANS 7
++#define CLUSTER_MEM_RECONFIG 8
++#define CLUSTER_MEM_MASTERVIEW 9
++#define CLUSTER_MEM_STARTTRANS 10
++#define CLUSTER_MEM_JOINREJ 11
++#define CLUSTER_MEM_VIEWACK 12
++#define CLUSTER_MEM_STARTACK 13
++#define CLUSTER_MEM_TRANSITION 14
++#define CLUSTER_MEM_NEWCLUSTER 15
++#define CLUSTER_MEM_CONFACK 16
++#define CLUSTER_MEM_NOMINATE 17
++
++/* Parameters for RECONFIG command */
++#define RECONFIG_PARAM_EXPECTED_VOTES 1
++#define RECONFIG_PARAM_NODE_VOTES 2
++#define RECONFIG_PARAM_CONFIG_VERSION 3
++
++/* Data associated with an outgoing socket */
++struct cl_socket {
++ struct file *file; /* The real file */
++ struct socket *socket; /* The real sock */
++ struct cl_multicast_sock multicast_info;
++ int num_nodes; /* On this link */
++ int retransmit_count;
++};
++
++/* There's one of these for each node in the cluster */
++struct cluster_node {
++ struct list_head list;
++ char *name; /* Node/host name of node */
++ struct list_head addr_list;
++ int us; /* This node is us */
++ unsigned int node_id; /* Unique node ID */
++ nodestate_t state;
++ unsigned short last_seq_recv;
++ unsigned short last_seq_acked;
++ unsigned short last_seq_sent;
++ unsigned int votes;
++ unsigned int expected_votes;
++ unsigned int leave_reason;
++ unsigned int incarnation; /* Incremented each time a node joins
++ * the cluster */
++ unsigned long last_hello; /* Jiffies */
++};
++
++/* This is how we keep a list of user processes that are listening for cluster
++ * membership events */
++struct notify_struct {
++ struct list_head list;
++ pid_t pid;
++ int signal;
++};
++
++/* This is how we keep a list of kernel callbacks that are registered for
++ * cluster membership events */
++struct kernel_notify_struct {
++ struct list_head list;
++ void (*callback) (kcl_callback_reason, long arg);
++};
++
++/* A message waiting to be sent */
++struct queued_message {
++ struct list_head list;
++
++ struct socket *socket;
++ struct sockaddr_cl addr;
++ int addr_len;
++ int msg_len;
++ unsigned char port;
++ unsigned int flags;
++ char msg_buffer[MAX_CLUSTER_MESSAGE];
++};
++
++/* A barrier */
++struct cl_barrier {
++ struct list_head list;
++
++ char name[MAX_BARRIER_NAME_LEN];
++ unsigned int flags;
++ enum { BARRIER_STATE_WAITING, BARRIER_STATE_INACTIVE,
++ BARRIER_STATE_COMPLETE } state;
++ unsigned int expected_nodes;
++ unsigned int registered_nodes;
++ atomic_t got_nodes;
++ atomic_t completed_nodes;
++ unsigned int inuse;
++ unsigned int waitsent;
++ unsigned int phase; /* Completion phase */
++ unsigned int endreason; /* Reason we were woken, usually 0 */
++ unsigned long timeout; /* In seconds */
++
++ void (*callback) (char *name, int status);
++ wait_queue_head_t waitq;
++ struct semaphore lock; /* To synch with cnxman messages */
++ spinlock_t phase2_spinlock; /* Need to synchronise with timer
++ * interrupts */
++ struct timer_list timer;
++};
++
++/* Cluster protocol commands sent to port 0 */
++#define CLUSTER_CMD_ACK 1
++#define CLUSTER_CMD_LISTENREQ 2
++#define CLUSTER_CMD_LISTENRESP 3
++#define CLUSTER_CMD_PORTCLOSED 4
++#define CLUSTER_CMD_BARRIER 5
++
++extern struct cluster_node *find_node_by_addr(unsigned char *addr,
++ int addr_len);
++extern struct cluster_node *find_node_by_nodeid(unsigned int id);
++extern struct cluster_node *find_node_by_name(char *name);
++extern void set_quorate(int);
++extern void notify_kernel_listeners(kcl_callback_reason reason, long arg);
++extern void notify_listeners(void);
++extern void free_nodeid_array(void);
++extern int send_reconfigure(int param, unsigned int value);
++extern int calculate_quorum(int, int, int *);
++extern void recalculate_quorum(int);
++extern int send_leave(unsigned char);
++extern int get_quorum(void);
++extern void set_votes(int, int);
++extern void kcl_wait_for_all_acks(void);
++extern char *membership_state(char *, int);
++extern void a_node_just_died(struct cluster_node *node);
++extern void check_barrier_returns(void);
++extern int in_transition(void);
++extern void get_local_addresses(struct cluster_node *node);
++extern int add_node_address(struct cluster_node *node, unsigned char *addr, int len);
++extern void create_proc_entries(void);
++extern void cleanup_proc_entries(void);
++extern unsigned int get_highest_nodeid(void);
++extern int allocate_nodeid_array(void);
++extern void queue_oob_skb(struct socket *sock, int cmd);
++extern int new_temp_nodeid(char *addr, int addrlen);
++extern int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen);
++extern void remove_temp_nodeid(int nodeid);
++extern inline char *print_addr(unsigned char *addr, int len, char *buf)
++{
++ int i;
++ int ptr = 0;
++
++ for (i = 0; i < len; i++)
++ ptr += sprintf(buf + ptr, "%02x ", addr[i]);
++
++ return buf;
++}
++
++#define MAX_ADDR_PRINTED_LEN (address_length*3 + 1)
++
++/* Debug enabling macros. Sorry about the C++ comments but they're easier to
++ * get rid of than C ones... */
++
++// #define DEBUG_MEMB
++// #define DEBUG_COMMS
++// #define DEBUG_BARRIER
++
++/* Debug macros */
++#ifdef DEBUG_COMMS
++#define P_COMMS(fmt, args...) printk(KERN_DEBUG "cman comms: " fmt, ## args)
++#else
++#define P_COMMS(fmt, args...)
++#endif
++
++#ifdef DEBUG_BARRIER
++#define P_BARRIER(fmt, args...) printk(KERN_DEBUG "cman barrier: " fmt, ## args)
++#else
++#define P_BARRIER(fmt, args...)
++#endif
++
++#ifdef DEBUG_MEMB
++#define P_MEMB(fmt, args...) printk(KERN_DEBUG "cman memb: " fmt, ## args)
++#define C_MEMB(fmt, args...) printk(fmt, ## args)
++#else
++#define P_MEMB(fmt, args...)
++#define C_MEMB(fmt, args...)
++#endif
++
++#endif /* __KERNEL__ */
++
++#endif
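+# The P_COMMS/P_MEMB/P_BARRIER macros above compile away to nothing unless
+# the matching DEBUG_* define is uncommented. For example, with DEBUG_COMMS
+# defined,
+#
+#   P_COMMS("got %d bytes from %s\n", len, node->name);
+#
+# expands to printk(KERN_DEBUG "cman comms: got %d bytes from %s\n", ...);
+# the "args..." / "## args" form also allows calls with no arguments, such
+# as P_COMMS("closing down\n").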
+diff -urN linux-orig/cluster/cman/cnxman.c linux-patched/cluster/cman/cnxman.c
+--- linux-orig/cluster/cman/cnxman.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/cnxman.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,4080 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#define EXPORT_SYMTAB
++#include <linux/init.h>
++#include <linux/socket.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/file.h>
++#include <linux/utsname.h>
++#include <net/sock.h>
++#include <linux/proc_fs.h>
++#include <linux/poll.h>
++#include <linux/module.h>
++#include <linux/list.h>
++#include <cluster/cnxman.h>
++#include <cluster/service.h>
++
++#include "cnxman-private.h"
++#include "sm_control.h"
++#include "sm_user.h"
++#include "config.h"
++
++#define CMAN_RELEASE_NAME "<CVS>"
++
++static int __cl_setsockopt(struct socket *sock, int level, int optname,
++ char *optval, int optlen, int flags);
++static int __cl_getsockopt(struct socket *sock, int level, int optname,
++ char *optval, int *optlen, int flags);
++static void send_to_userport(struct cl_comms_socket *csock, char *data, int len,
++ char *addr, int addrlen);
++static int cl_sendack(struct cl_comms_socket *sock, unsigned short seq,
++ int addr_len, char *addr, unsigned char remport,
++ unsigned char flag);
++static void send_listen_request(int nodeid, unsigned char port);
++static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
++ unsigned char port, unsigned short tag);
++static void resend_last_message(void);
++static void start_ack_timer(void);
++static int send_queued_message(struct queued_message *qmsg);
++static void send_port_close_oob(unsigned char port);
++static void post_close_oob(unsigned char port, int nodeid);
++static void process_barrier_msg(struct cl_barriermsg *msg,
++ struct cluster_node *node);
++static struct cl_barrier *find_barrier(char *name);
++static void node_shutdown(void);
++static void node_cleanup(void);
++static int send_or_queue_message(void *buf, int len, struct sockaddr_cl *caddr,
++ unsigned char port);
++static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur);
++static void check_for_unacked_nodes(void);
++static void free_cluster_sockets(void);
++static uint16_t generate_cluster_id(char *name);
++
++static int is_valid_temp_nodeid(int nodeid);
++
++extern int start_membership_services(pid_t);
++extern int kcl_leave_cluster(int remove);
++extern int send_kill(int nodeid);
++
++static struct proto_ops cl_proto_ops;
++static struct sock *master_sock;
++static kmem_cache_t *cluster_sk_cachep;
++
++/* Pointer to the pseudo node that maintains quorum in a two-node system */
++struct cluster_node *quorum_device = NULL;
++
++/* Array of "ports" allocated. This is just a list of pointers to the sock that
++ * has this port bound. Speed is a major issue here so 1-2K of allocated
++ * storage is worth sacrificing. Port 0 is reserved for protocol messages */
++static struct sock *port_array[256];
++static struct semaphore port_array_lock;
++
++/* Our cluster name & number */
++unsigned short cluster_id;
++char cluster_name[MAX_CLUSTER_NAME_LEN+1];
++
++/* Two-node mode: causes cluster to remain quorate if one of two nodes fails.
++ * No more than two nodes are permitted to join the cluster. */
++unsigned short two_node;
++
++/* Cluster configuration version that must be the same among members. */
++unsigned int config_version;
++
++/* Reference counting for cluster applications */
++atomic_t use_count;
++
++/* Length of sockaddr address for our comms protocol */
++unsigned int address_length;
++
++/* Message sending */
++static unsigned short cur_seq; /* Last message sent */
++static unsigned int ack_count; /* Number of acks received for message
++ * 'cur_seq' */
++static unsigned int acks_expected; /* Number of acks we expect to receive */
++static struct semaphore send_lock;
++static struct timer_list ack_timer;
++
++/* Saved packet information in case we need to resend it */
++static char saved_msg_buffer[MAX_CLUSTER_MESSAGE];
++static int saved_msg_len;
++static int retry_count;
++
++/* Task variables */
++static pid_t kcluster_pid;
++static pid_t membership_pid;
++extern int quit_threads;
++
++wait_queue_head_t cnxman_waitq;
++
++/* Variables owned by membership services */
++extern int cluster_members;
++extern struct list_head cluster_members_list;
++extern struct semaphore cluster_members_lock;
++extern int we_are_a_cluster_member;
++extern int cluster_is_quorate;
++extern struct cluster_node *us;
++extern struct list_head new_dead_node_list;
++extern struct semaphore new_dead_node_lock;
++extern char nodename[];
++
++/* A list of processes listening for membership events */
++static struct list_head event_listener_list;
++static struct semaphore event_listener_lock;
++
++/* A list of kernel callbacks listening for membership events */
++static struct list_head kernel_listener_list;
++static struct semaphore kernel_listener_lock;
++
++/* A list of sockets we are listening on (and can transmit on...later) */
++static struct list_head socket_list;
++
++/* A list of all open cluster client sockets */
++static struct list_head client_socket_list;
++static struct semaphore client_socket_lock;
++
++/* A list of all current barriers */
++static struct list_head barrier_list;
++static struct semaphore barrier_list_lock;
++
++/* When a socket is ready for reading it goes on this queue */
++static spinlock_t active_socket_lock;
++static struct list_head active_socket_list;
++
++/* If the cnxman process is running and available for work */
++atomic_t cnxman_running;
++
++/* Flags set by timers etc. for the main loop to detect and act upon */
++static unsigned long mainloop_flags;
++
++#define ACK_TIMEOUT 1
++#define RESEND_NEEDED 2
++
++/* A queue of messages waiting to be sent. If kcl_sendmsg is called outside of
++ * process context then the messages get put in here */
++static struct list_head messages_list;
++static struct semaphore messages_list_lock;
++
++static struct semaphore start_thread_sem;
++
++/* List of outstanding ISLISTENING requests */
++static struct list_head listenreq_list;
++static struct semaphore listenreq_lock;
++
++/* Any sending processes wait on this queue if necessary (e.g. inquorate,
++ * waiting for an ACK) */
++static DECLARE_WAIT_QUEUE_HEAD(socket_waitq);
++
++/* Wait for thread to exit properly */
++struct completion cluster_thread_comp;
++struct completion member_thread_comp;
++
++/* The resend delay to use, in seconds. We increase this geometrically each
++ * time a send is delayed (see start_short_timer) */
++static int resend_delay = 1;
++
++/* Highest numbered interface and the current default */
++static int num_interfaces = 0;
++static struct cl_comms_socket *current_interface = NULL;
++
++struct temp_node
++{
++ int nodeid;
++ char addr[sizeof(struct sockaddr_in6)];
++ int addrlen;
++ struct list_head list;
++};
++static struct list_head tempnode_list;
++static struct semaphore tempnode_lock;
++
++/* Wake up any processes that are waiting to send. This is usually called when
++ * all the ACKs have been gathered up or when a node has left the cluster
++ * unexpectedly and we reckon there are no more acks to collect */
++static void unjam(void)
++{
++ wake_up_interruptible(&socket_waitq);
++ wake_up_interruptible(&cnxman_waitq);
++}
++
++/* Used by the data_ready routine to locate a connection given the socket */
++static inline struct cl_comms_socket *find_comms_by_sock(struct sock *sk)
++{
++ struct list_head *conlist;
++
++ list_for_each(conlist, &socket_list) {
++ struct cl_comms_socket *clsock =
++ list_entry(conlist, struct cl_comms_socket, list);
++ if (clsock->sock->sk == sk) {
++ return clsock;
++ }
++ }
++ return NULL;
++}
++
++/* Data available on socket */
++static void cnxman_data_ready(struct sock *sk, int count_unused)
++{
++ struct cl_comms_socket *clsock = find_comms_by_sock(sk);
++
++ if (clsock == NULL) /* ASSERT ?? */
++ return;
++
++ /* If we're already on the list then don't do it again */
++ if (test_and_set_bit(1, &clsock->active))
++ return;
++
++ spin_lock_irq(&active_socket_lock);
++ list_add(&clsock->active_list, &active_socket_list);
++ spin_unlock_irq(&active_socket_lock);
++
++ wake_up_interruptible(&cnxman_waitq);
++}
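++
++/* Receive-path summary: cnxman_data_ready() is called when data arrives on
++ * one of the comms sockets, so it does no real work itself - it queues the
++ * socket on active_socket_list and wakes cluster_kthread(), which calls
++ * receive_message()/sock_recvmsg() in process context. The
++ * test_and_set_bit() on ->active keeps a socket from being queued twice
++ * before the kthread has drained it. */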
++
++static int receive_message(struct cl_comms_socket *csock, char *iobuf)
++{
++ struct msghdr msg;
++ struct iovec iov;
++ struct sockaddr_in6 sin;
++ int len;
++ mm_segment_t fs;
++
++ memset(&sin, 0, sizeof (sin));
++
++ msg.msg_control = NULL;
++ msg.msg_controllen = 0;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = &iov;
++ msg.msg_name = &sin;
++ msg.msg_namelen = sizeof (sin);
++ msg.msg_flags = 0;
++
++ iov.iov_len = MAX_CLUSTER_MESSAGE;
++ iov.iov_base = iobuf;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ len = sock_recvmsg(csock->sock, &msg, MAX_CLUSTER_MESSAGE, MSG_DONTWAIT);
++ set_fs(fs);
++
++ if (len > 0) {
++ if (len > MAX_CLUSTER_MESSAGE) {
++ printk(KERN_CRIT CMAN_NAME
++ ": %d byte message far too big\n", len);
++ return 0;
++ }
++ send_to_userport(csock, iobuf, len, msg.msg_name, msg.msg_namelen);
++ }
++ else {
++ if (len != -EAGAIN)
++ printk(KERN_CRIT CMAN_NAME ": recvmsg failed: %d\n",
++ len);
++ }
++ return len;
++}
++
++static int cluster_kthread(void *unused)
++{
++ int len;
++ char *iobuf;
++ struct list_head *socklist;
++ struct cl_comms_socket *csock;
++ wait_queue_t cnxman_waitq_head;
++ sigset_t tmpsig;
++
++ daemonize("cman_comms");
++
++ /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
++	siginitsetinv(&tmpsig, sigmask(SIGKILL) | sigmask(SIGSTOP) |
++		      sigmask(SIGTERM));
++	sigprocmask(SIG_SETMASK, &tmpsig, NULL);
++
++ /* This is the waitq we can wake the process up with */
++ init_waitqueue_head(&cnxman_waitq);
++ init_waitqueue_entry(&cnxman_waitq_head, current);
++ add_wait_queue(&cnxman_waitq, &cnxman_waitq_head);
++
++ set_user_nice(current, -6);
++
++ /* Allow the sockets to start receiving */
++ list_for_each(socklist, &socket_list) {
++ csock = list_entry(socklist, struct cl_comms_socket, list);
++
++ clear_bit(1, &csock->active);
++ }
++
++ iobuf = kmalloc(MAX_CLUSTER_MESSAGE, GFP_KERNEL);
++ if (!iobuf) {
++ printk(KERN_CRIT CMAN_NAME
++ ": Cannot allocate receive buffer for cluster comms\n");
++ return -1;
++ }
++
++ complete(&cluster_thread_comp);
++
++ for (;;) {
++ struct list_head *temp;
++
++ /* Wait for activity on any of the sockets */
++ set_task_state(current, TASK_INTERRUPTIBLE);
++
++ if (list_empty(&active_socket_list))
++ schedule();
++ set_task_state(current, TASK_RUNNING);
++
++ if (quit_threads)
++ break;
++
++ if (test_and_clear_bit(ACK_TIMEOUT, &mainloop_flags)) {
++ check_for_unacked_nodes();
++ }
++
++ /* Now receive any messages waiting for us */
++ spin_lock_irq(&active_socket_lock);
++ list_for_each_safe(socklist, temp, &active_socket_list) {
++ csock =
++ list_entry(socklist, struct cl_comms_socket,
++ active_list);
++
++ list_del(&csock->active_list);
++ clear_bit(1, &csock->active);
++
++ spin_unlock_irq(&active_socket_lock);
++
++ do {
++ len = receive_message(csock, iobuf);
++ }
++ while (len > 0);
++
++ spin_lock_irq(&active_socket_lock);
++
++ if (len == 0)
++ break; /* EOF on socket */
++ }
++ spin_unlock_irq(&active_socket_lock);
++
++ /* Resend any unacked messages */
++ if (test_and_clear_bit(RESEND_NEEDED, &mainloop_flags)
++ && acks_expected) {
++ resend_last_message();
++ }
++
++ /* Send any queued messages */
++ if (acks_expected == 0) {
++ struct list_head *temp;
++ struct list_head *msglist;
++
++ down(&messages_list_lock);
++ list_for_each_safe(msglist, temp, &messages_list) {
++ struct queued_message *qmsg =
++ list_entry(msglist, struct queued_message,
++ list);
++ int status = send_queued_message(qmsg);
++
++ if (status >= 0) {
++				/* Succeeded, remove it from the queue */
++ list_del(&qmsg->list);
++ kfree(qmsg);
++ }
++ /* Did it fail horribly ?? */
++ if (status < 0 && status != -EAGAIN) {
++ printk(KERN_INFO CMAN_NAME
++ ": send_queued_message failed, error %d\n",
++ status);
++ list_del(&qmsg->list);
++ kfree(qmsg);
++ }
++ break; /* Only send one message at a time */
++ }
++ up(&messages_list_lock);
++ }
++
++ if (signal_pending(current))
++ break;
++ }
++ P_COMMS("closing down\n");
++
++ if (we_are_a_cluster_member)
++ send_leave(us->leave_reason);
++
++ kfree(iobuf);
++ quit_threads = 1; /* force other thread to die too */
++ node_shutdown();
++
++ if (timer_pending(&ack_timer))
++ del_timer(&ack_timer);
++
++ /* Wait for membership thread to die */
++ wait_for_completion(&member_thread_comp);
++
++ node_cleanup();
++
++ complete(&cluster_thread_comp);
++ return 0;
++}
++
++void notify_kernel_listeners(kcl_callback_reason reason, long arg)
++{
++ struct kernel_notify_struct *knotify;
++ struct list_head *proclist;
++
++ down(&kernel_listener_lock);
++ list_for_each(proclist, &kernel_listener_list) {
++ knotify =
++ list_entry(proclist, struct kernel_notify_struct, list);
++ knotify->callback(reason, arg);
++ }
++ up(&kernel_listener_lock);
++}
++
++static void check_for_unacked_nodes(void)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++
++ clear_bit(RESEND_NEEDED, &mainloop_flags);
++ retry_count = 0;
++
++ P_COMMS("Retry count exceeded -- looking for dead node\n");
++
++ /* Node did not ACK a message after <n> tries, remove it from the
++ * cluster */
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ P_COMMS
++ ("checking node %s: last_acked = %d, last_seq_sent = %d\n",
++ node->name, node->last_seq_acked, node->last_seq_sent);
++ if (node->state != NODESTATE_DEAD
++ && node->last_seq_acked != node->last_seq_sent && !node->us) {
++ printk(KERN_WARNING CMAN_NAME
++ ": node %s is not responding - removing from the cluster\n",
++ node->name);
++
++ /* Start a state transition */
++ a_node_just_died(node);
++ }
++ }
++ up(&cluster_members_lock);
++ acks_expected = ack_count = 0;
++ unjam();
++ return;
++}
++
++static void ack_timer_fn(unsigned long arg)
++{
++ P_COMMS("%ld: ack_timer fired, retries=%d\n", jiffies, retry_count);
++
++ /* Too many retries ? */
++ if (++retry_count > MAX_RETRIES) {
++ set_bit(ACK_TIMEOUT, &mainloop_flags);
++ wake_up_interruptible(&cnxman_waitq);
++ }
++ else {
++ /* Resend last message */
++ set_bit(RESEND_NEEDED, &mainloop_flags);
++ wake_up_interruptible(&cnxman_waitq);
++ }
++}
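++
++/* Reliable-send summary: a message that expects ACKs sets acks_expected
++ * and arms ack_timer for one second via start_ack_timer(). Each new ACK
++ * (duplicates from multipathed hosts are filtered on last_seq_acked)
++ * bumps ack_count; once they have all arrived the timer is cancelled and
++ * blocked senders are released with unjam(). If instead the timer fires
++ * more than MAX_RETRIES times, check_for_unacked_nodes() declares the
++ * silent node dead and starts a state transition. */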
++
++/* Called to resend a packet if sock_sendmsg was busy */
++static void short_timer_fn(unsigned long arg)
++{
++ P_COMMS("short_timer fired\n");
++
++ /* Resend last message */
++ resend_delay <<= 1;
++ set_bit(RESEND_NEEDED, &mainloop_flags);
++ wake_up_interruptible(&cnxman_waitq);
++}
++
++static void start_ack_timer(void)
++{
++ ack_timer.function = ack_timer_fn;
++ ack_timer.data = 0L;
++ mod_timer(&ack_timer, jiffies + HZ);
++}
++
++static void start_short_timer(void)
++{
++ ack_timer.function = short_timer_fn;
++ ack_timer.data = 0L;
++ mod_timer(&ack_timer, jiffies + (resend_delay * HZ));
++}
++
++
++static struct cl_waiting_listen_request *find_listen_request(unsigned short tag)
++{
++ struct list_head *llist;
++ struct cl_waiting_listen_request *listener;
++
++ down(&listenreq_lock);
++ list_for_each(llist, &listenreq_list) {
++ listener =
++ list_entry(llist, struct cl_waiting_listen_request, list);
++ if (listener->tag == tag) {
++ up(&listenreq_lock);
++ return listener;
++ }
++ }
++ up(&listenreq_lock);
++ return NULL;
++}
++
++static void process_cnxman_message(struct cl_comms_socket *csock, char *data,
++ int len, char *addr, int addrlen,
++ struct cluster_node *rem_node)
++{
++ struct cl_protmsg *msg = (struct cl_protmsg *) data;
++ struct cl_protheader *header = (struct cl_protheader *) data;
++ struct cl_ackmsg *ackmsg;
++ struct cl_listenmsg *listenmsg;
++ struct cl_closemsg *closemsg;
++ struct cl_barriermsg *barriermsg;
++ struct cl_waiting_listen_request *listen_request;
++
++ P_COMMS("Message on port 0 is %d\n", msg->cmd);
++ switch (msg->cmd) {
++ case CLUSTER_CMD_ACK:
++ ackmsg = (struct cl_ackmsg *) data;
++
++		if (ackmsg->aflags & 1) {
++			if (net_ratelimit())
++				printk(KERN_INFO CMAN_NAME
++				       ": WARNING no listener for port %d on node %s\n",
++				       ackmsg->remport,
++				       rem_node ? rem_node->name : "Unknown");
++		}
++ P_COMMS("Got ACK from %s. seq=%d (cur=%d)\n",
++ rem_node ? rem_node->name : "Unknown",
++ le16_to_cpu(ackmsg->seq), cur_seq);
++
++ if (rem_node && rem_node->state != NODESTATE_DEAD) {
++ /* This copes with duplicate acks from a multipathed
++ * host */
++ if (rem_node->last_seq_acked !=
++ le16_to_cpu(ackmsg->seq)) {
++ rem_node->last_seq_acked =
++ le16_to_cpu(ackmsg->seq);
++
++ /* Got em all */
++ if (++ack_count >= acks_expected) {
++
++ /* Cancel the timer */
++ del_timer(&ack_timer);
++ acks_expected = 0;
++ unjam();
++ }
++ }
++ }
++ else {
++ if (cluster_members) {
++#ifdef DEBUG_COMMS
++ char buf[MAX_ADDR_PRINTED_LEN];
++
++ printk(KERN_INFO CMAN_NAME
++ ": got ack from unknown or dead node: %s\n",
++ print_addr(addr, addrlen, buf));
++#endif
++ }
++ }
++ break;
++
++ /* Return 1 if we have a listener on this port, 0 if not */
++ case CLUSTER_CMD_LISTENREQ:
++ listenmsg =
++ (struct cl_listenmsg *) (data +
++ sizeof (struct cl_protheader));
++ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
++ send_listen_response(csock, le32_to_cpu(header->srcid),
++ listenmsg->target_port, listenmsg->tag);
++ break;
++
++ case CLUSTER_CMD_LISTENRESP:
++ /* Wake up process waiting for listen response */
++ listenmsg =
++ (struct cl_listenmsg *) (data +
++ sizeof (struct cl_protheader));
++ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
++ listen_request = find_listen_request(listenmsg->tag);
++ if (listen_request) {
++ listen_request->result = listenmsg->listening;
++ listen_request->waiting = 0;
++ wake_up_interruptible(&listen_request->waitq);
++ }
++ break;
++
++ case CLUSTER_CMD_PORTCLOSED:
++ closemsg =
++ (struct cl_closemsg *) (data +
++ sizeof (struct cl_protheader));
++ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
++ post_close_oob(closemsg->port, le32_to_cpu(header->srcid));
++ break;
++
++ case CLUSTER_CMD_BARRIER:
++ barriermsg =
++ (struct cl_barriermsg *) (data +
++ sizeof (struct cl_protheader));
++ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
++ process_barrier_msg(barriermsg, rem_node);
++ break;
++
++ default:
++ printk(KERN_ERR CMAN_NAME
++ ": Unknown protocol message %d received\n", msg->cmd);
++ break;
++
++ }
++ return;
++}
++
++static void send_to_userport(struct cl_comms_socket *csock, char *data, int len,
++ char *addr, int addrlen)
++{
++ int err;
++ struct cl_protheader *header = (struct cl_protheader *) data;
++ struct cluster_node *rem_node =
++ find_node_by_nodeid(le32_to_cpu(header->srcid));
++ struct sk_buff *skb = NULL;
++
++ P_COMMS
++ ("seen message, from %d for %d, sequence num = %d, rem_node=%p, state=%d\n",
++ le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
++ le16_to_cpu(header->seq), rem_node,
++ rem_node ? rem_node->state : -1);
++
++ /* If the remote end is being coy about its node ID then look it up by
++ * address */
++ if (!rem_node && header->srcid == 0) {
++ rem_node = find_node_by_addr(addr, addrlen);
++ }
++
++ /* If this node is an ex-member then treat it as unknown */
++ if (rem_node && rem_node->state != NODESTATE_MEMBER
++ && rem_node->state != NODESTATE_JOINING)
++ rem_node = NULL;
++
++ /* Ignore messages not for our cluster */
++ if (le16_to_cpu(header->cluster) != cluster_id) {
++ P_COMMS("Dumping message - wrong cluster ID (us=%d, msg=%d)\n",
++ cluster_id, header->cluster);
++ goto userport_finish;
++ }
++
++ /* If the message is from us then just dump it */
++ if (rem_node && rem_node->us)
++ goto userport_finish;
++
++ /* If we can't find the nodeid then check for our own messages the hard
++ * way - this only happens during joining */
++ if (!rem_node) {
++ struct list_head *socklist;
++ struct cl_comms_socket *clsock;
++
++ list_for_each(socklist, &socket_list) {
++ clsock =
++ list_entry(socklist, struct cl_comms_socket, list);
++
++ if (clsock->recv_only) {
++
++ if (memcmp(addr, &clsock->saddr, address_length) == 0) {
++ goto userport_finish;
++ }
++ }
++ }
++
++ }
++
++ /* Ignore messages not for us */
++ if (le32_to_cpu(header->tgtid) > 0 && us
++ && le32_to_cpu(header->tgtid) != us->node_id) {
++ goto userport_finish;
++ }
++
++ P_COMMS("got message, from %d for %d, sequence num = %d\n",
++ le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
++ le16_to_cpu(header->seq));
++
++ /* Have we received this message before ? If so just ignore it, it's a
++ * resend for someone else's benefit */
++ if (!(header->flags & (MSG_NOACK >> 16)) &&
++ rem_node && le16_to_cpu(header->seq) == rem_node->last_seq_recv) {
++ P_COMMS
++ ("Discarding message - Already seen this sequence number %d\n",
++ rem_node->last_seq_recv);
++ /* Still need to ACK it though, in case it was the ACK that got
++ * lost */
++ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
++ goto userport_finish;
++ }
++
++ /* If it's a new node then assign it a temporary node ID */
++ if (!rem_node)
++ header->srcid = cpu_to_le32(new_temp_nodeid(addr, addrlen));
++
++ P_COMMS("Got message: flags = %x, port = %d, we_are_a_member = %d\n",
++ header->flags, header->port, we_are_a_cluster_member);
++
++
++ /* If we are not part of the cluster then ignore multicast messages
++ * that need an ACK as we will confuse the sender who is only expecting
++ * ACKS from bona fide members */
++ if (header->flags & (MSG_MULTICAST >> 16) &&
++ !(header->flags & (MSG_NOACK >> 16)) && !we_are_a_cluster_member) {
++ P_COMMS
++ ("Discarding message - multicast and we are not a cluster member. port=%d flags=%x\n",
++ header->port, header->flags);
++ goto userport_finish;
++ }
++
++ /* Save the sequence number of this message so we can ignore duplicates
++ * (above) */
++ if (!(header->flags & (MSG_NOACK >> 16)) && rem_node) {
++ P_COMMS("Saving seq %d for node %s\n", le16_to_cpu(header->seq),
++ rem_node->name);
++ rem_node->last_seq_recv = le16_to_cpu(header->seq);
++ }
++
++ /* Is it a protocol message? */
++ if (header->port == 0) {
++ process_cnxman_message(csock, data, len, addr, addrlen,
++ rem_node);
++ goto userport_finish;
++ }
++
++ /* Skip past the header to the data */
++ data += sizeof (struct cl_protheader);
++ len -= sizeof (struct cl_protheader);
++
++ /* Get the port number and look for a listener */
++ down(&port_array_lock);
++ if (port_array[header->port]) {
++ int native_srcid;
++ struct cluster_sock *c = cluster_sk(port_array[header->port]);
++
++ /* ACK it */
++ if (!(header->flags & (MSG_NOACK >> 16)))
++ cl_sendack(csock, header->seq, addrlen, addr,
++ header->port, 0);
++
++ /* Call a callback if there is one */
++ if (c->kernel_callback) {
++ up(&port_array_lock);
++ c->kernel_callback(data, len, addr, addrlen,
++ le32_to_cpu(header->srcid));
++ goto userport_finish;
++ }
++
++ /* Otherwise put it into an SKB and pass it onto the recvmsg
++ * mechanism */
++ skb = alloc_skb(len, GFP_KERNEL);
++ if (!skb) {
++ up(&port_array_lock);
++ printk(KERN_INFO CMAN_NAME
++ ": Failed to allocate skb\n");
++ return;
++ }
++
++ skb_put(skb, len);
++ memcpy(skb->data, data, len);
++
++ /* Put the nodeid into cb so we can pass it to the clients */
++ skb->cb[0] = 0; /* Clear flags */
++ native_srcid = le32_to_cpu(header->srcid);
++ memcpy(skb->cb + 1, &native_srcid, sizeof(int));
++
++ if ((err =
++ sock_queue_rcv_skb(port_array[header->port], skb)) < 0) {
++
++ printk(KERN_INFO CMAN_NAME
++ ": Error queueing request to port %d: %d\n",
++ header->port, err);
++ kfree_skb(skb);
++
++ /* If the port was MEMBERSHIP then we have to die */
++ if (header->port == CLUSTER_PORT_MEMBERSHIP) {
++ up(&port_array_lock);
++ send_leave(CLUSTER_LEAVEFLAG_PANIC);
++ panic("membership stopped responding");
++ }
++ }
++ up(&port_array_lock);
++
++ }
++ else {
++ /* ACK it, but set the flag bit so remote end knows no-one
++ * caught it */
++ if (!(header->flags & (MSG_NOACK >> 16)))
++ cl_sendack(csock, header->seq, addrlen, addr,
++ header->port, 1);
++
++ /* Nobody listening, drop it */
++ up(&port_array_lock);
++ }
++
++ userport_finish:
++ return;
++}
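++
++/* Dispatch summary: port 0 traffic is consumed in-kernel by
++ * process_cnxman_message(); anything else is matched against port_array[]
++ * and either handed to the bound socket's kernel_callback or queued as an
++ * skb for recvmsg(), with the sender's nodeid converted from little-endian
++ * and stashed in skb->cb so clients can see who sent it. */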
++
++static struct sock *cl_alloc_sock(struct socket *sock, int gfp)
++{
++ struct sock *sk;
++ struct cluster_sock *c;
++
++ if ((sk =
++ sk_alloc(AF_CLUSTER, gfp, sizeof (struct cluster_sock),
++ cluster_sk_cachep)) == NULL)
++ goto no_sock;
++
++ if (sock) {
++ sock->ops = &cl_proto_ops;
++ }
++ sock_init_data(sock, sk);
++
++ sk->sk_destruct = NULL;
++ sk->sk_no_check = 1;
++ sk->sk_family = PF_CLUSTER;
++ sk->sk_allocation = gfp;
++
++ c = cluster_sk(sk);
++ c->port = 0;
++ c->service_data = NULL;
++
++ return sk;
++ no_sock:
++ return NULL;
++}
++
++static int cl_release(struct socket *sock)
++{
++ struct sock *sk = sock->sk;
++ struct cl_client_socket *csock;
++ struct list_head *socklist;
++ struct list_head *tmp;
++
++ down(&client_socket_lock);
++ if (sk) {
++ /* Remove port allocations if it's a bound socket */
++ struct cluster_sock *c = cluster_sk(sk);
++
++ down(&port_array_lock);
++ if (c->port) {
++ port_array[c->port] = NULL;
++ }
++ up(&port_array_lock);
++
++ /* Tell other nodes in the cluster that this listener is going
++ * away */
++ if (atomic_read(&cnxman_running) && c->port)
++ send_port_close_oob(c->port);
++
++ if (c->service_data)
++ sm_sock_release(sock);
++
++ /* Master socket released ? */
++ if (sk->sk_protocol == CLPROTO_MASTER) {
++ master_sock = NULL;
++
++ /* If this socket is being freed and cnxman is not
++ * started then free all the comms sockets as either
++ * the userland "join" process has crashed or the
++ * join failed.
++ */
++ if (!atomic_read(&cnxman_running)) {
++ quit_threads = 1;
++ free_cluster_sockets();
++ }
++ }
++
++ sock_orphan(sk);
++ sock_hold(sk);
++ lock_sock(sk);
++ release_sock(sk);
++ sock_put(sk);
++ sock_put(sk);
++ sock->sk = NULL;
++ }
++
++ /* Remove it from the list of clients */
++ list_for_each_safe(socklist, tmp, &client_socket_list) {
++ csock = list_entry(socklist, struct cl_client_socket, list);
++
++ if (csock->sock == sock) {
++ list_del(&csock->list);
++ kfree(csock);
++ break;
++ }
++ }
++ up(&client_socket_lock);
++
++ return 0;
++}
++
++static int cl_create(struct socket *sock, int protocol)
++{
++ struct sock *sk;
++
++ /* All are datagrams */
++ if (sock->type != SOCK_DGRAM)
++ return -ESOCKTNOSUPPORT;
++
++ if (protocol == CLPROTO_MASTER && !capable(CAP_CLUSTER))
++ return -EPERM;
++
++ /* Can only have one master socket */
++ if (master_sock && protocol == CLPROTO_MASTER)
++ return -EBUSY;
++
++ /* cnxman not running and a client was requested */
++ if (!atomic_read(&cnxman_running) && protocol != CLPROTO_MASTER)
++ return -ENETDOWN;
++
++ if ((sk = cl_alloc_sock(sock, GFP_KERNEL)) == NULL)
++ return -ENOBUFS;
++
++ sk->sk_protocol = protocol;
++
++ if (protocol == CLPROTO_MASTER)
++ master_sock = sk;
++
++ /* Add client sockets to the list */
++ if (protocol == CLPROTO_CLIENT) {
++ struct cl_client_socket *clsock =
++ kmalloc(sizeof (struct cl_client_socket), GFP_KERNEL);
++ if (!clsock) {
++ cl_release(sock);
++ return -ENOMEM;
++ }
++ clsock->sock = sock;
++ down(&client_socket_lock);
++ list_add(&clsock->list, &client_socket_list);
++ up(&client_socket_lock);
++ }
++
++ return 0;
++}
++
++static int cl_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
++{
++ struct sock *sk = sock->sk;
++ struct sockaddr_cl *saddr = (struct sockaddr_cl *) uaddr;
++ struct cluster_sock *c = cluster_sk(sk);
++
++ if (!capable(CAP_NET_BIND_SERVICE))
++ return -EPERM;
++
++ if (sk->sk_zapped == 0)
++ return -EINVAL;
++
++ if (addr_len != sizeof (struct sockaddr_cl))
++ return -EINVAL;
++
++ if (saddr->scl_family != AF_CLUSTER)
++ return -EINVAL;
++
++ if (saddr->scl_port == 0)
++ return -EINVAL; /* Port 0 is reserved for protocol messages */
++
++ down(&port_array_lock);
++
++ if (port_array[saddr->scl_port]) {
++ up(&port_array_lock);
++ return -EADDRINUSE;
++ }
++
++ port_array[saddr->scl_port] = sk;
++
++ up(&port_array_lock);
++
++ c->port = saddr->scl_port;
++ sk->sk_zapped = 0;
++
++ /* If we are not a cluster member yet then make the client wait until
++ * we are, this allows nodes to start cluster clients at the same time
++ * as cluster services but they will wait until membership is achieved.
++ * This looks odd in bind() (open would seem more obvious) but we need
++ * to know which port number is being used so that things like
++ * membership services don't get blocked
++ */
++
++ if (saddr->scl_port > HIGH_PROTECTED_PORT)
++ while (!we_are_a_cluster_member || !cluster_is_quorate
++ || in_transition()) {
++ DECLARE_WAITQUEUE(wq, current);
++ struct task_struct *tsk = current;
++
++ set_task_state(tsk, TASK_INTERRUPTIBLE);
++ add_wait_queue(&socket_waitq, &wq);
++
++ if (!we_are_a_cluster_member || !cluster_is_quorate
++ || in_transition())
++ schedule();
++
++ set_task_state(tsk, TASK_RUNNING);
++ remove_wait_queue(&socket_waitq, &wq);
++
++ /* We were woken up because the cluster is going down,
++ * ...and we never got a chance to do any work! (sob) */
++ if (atomic_read(&cnxman_running) == 0 || quit_threads) {
++ return -ENOTCONN;
++ }
++ }
++
++ return 0;
++}
++
++static int cl_getname(struct socket *sock, struct sockaddr *uaddr,
++ int *uaddr_len, int peer)
++{
++ struct sockaddr_cl *sa = (struct sockaddr_cl *) uaddr;
++ struct sock *sk = sock->sk;
++ struct cluster_sock *c = cluster_sk(sk);
++
++ *uaddr_len = sizeof (struct sockaddr_cl);
++
++ lock_sock(sk);
++
++ sa->scl_port = c->port;
++ sa->scl_flags = 0;
++ sa->scl_family = AF_CLUSTER;
++
++ release_sock(sk);
++
++ return 0;
++}
++
++static unsigned int cl_poll(struct file *file, struct socket *sock,
++ poll_table * wait)
++{
++ return datagram_poll(file, sock, wait);
++}
++
++/* Copy internal node format to userland format */
++void copy_to_usernode(struct cluster_node *node,
++ struct cl_cluster_node *unode)
++{
++ strcpy(unode->name, node->name);
++ unode->size = sizeof (struct cl_cluster_node);
++ unode->votes = node->votes;
++ unode->state = node->state;
++ unode->us = node->us;
++ unode->node_id = node->node_id;
++ unode->leave_reason = node->leave_reason;
++ unode->incarnation = node->incarnation;
++}
++
++/* ioctl processing functions */
++
++static int do_ioctl_set_version(unsigned long arg)
++{
++ struct cl_version version, *u_version;
++
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++ if (arg == 0)
++ return -EINVAL;
++
++ u_version = (struct cl_version *) arg;
++
++ if (copy_from_user(&version, u_version, sizeof(struct cl_version)))
++ return -EFAULT;
++
++ if (version.major != CNXMAN_MAJOR_VERSION ||
++ version.minor != CNXMAN_MINOR_VERSION ||
++ version.patch != CNXMAN_PATCH_VERSION)
++ return -EINVAL;
++
++ if (config_version == version.config)
++ return 0;
++
++ config_version = version.config;
++ send_reconfigure(RECONFIG_PARAM_CONFIG_VERSION, config_version);
++ return 0;
++}
++
++static int do_ioctl_get_members(unsigned long arg)
++{
++ struct cluster_node *node;
++ /* Kernel copies */
++ struct cl_cluster_node user_format_node;
++ struct cl_cluster_nodelist user_format_nodelist;
++ /* User space array ptr */
++ struct cl_cluster_node *user_node;
++ struct list_head *nodelist;
++ int num_nodes = 0;
++
++ if (arg == 0)
++ return cluster_members;
++
++ if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
++ return -EFAULT;
++
++ down(&cluster_members_lock);
++
++ if (user_format_nodelist.max_members < cluster_members) {
++ up(&cluster_members_lock);
++ return -E2BIG;
++ }
++
++ user_node = user_format_nodelist.nodes;
++
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++ if (node->state == NODESTATE_MEMBER) {
++ copy_to_usernode(node, &user_format_node);
++ if (copy_to_user(user_node, &user_format_node,
++ sizeof (struct cl_cluster_node))) {
++ up(&cluster_members_lock);
++ return -EFAULT;
++ }
++ user_node++;
++ num_nodes++;
++ }
++ }
++ up(&cluster_members_lock);
++
++ return num_nodes;
++}
++
++static int do_ioctl_get_all_members(unsigned long arg)
++{
++ struct cluster_node *node;
++ /* Kernel copies */
++ struct cl_cluster_node user_format_node;
++ struct cl_cluster_nodelist user_format_nodelist;
++ /* User space array ptr*/
++ struct cl_cluster_node *user_node;
++ struct list_head *nodelist;
++ int num_nodes = 0;
++
++ if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
++ return -EFAULT;
++
++ down(&cluster_members_lock);
++
++ user_node = user_format_nodelist.nodes;
++
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++		if (arg) {
++			/* Check space first so we never write past the end
++			 * of the user's array */
++			if (--user_format_nodelist.max_members < 0) {
++				num_nodes = -E2BIG;
++				goto err_exit;
++			}
++
++			copy_to_usernode(node, &user_format_node);
++
++			if (copy_to_user(user_node, &user_format_node,
++					 sizeof (struct cl_cluster_node))) {
++				up(&cluster_members_lock);
++				return -EFAULT;
++			}
++			user_node++;
++		}
++ num_nodes++;
++ }
++ err_exit:
++ up(&cluster_members_lock);
++
++ return num_nodes;
++}
++
++static int do_ioctl_get_node(unsigned long arg)
++{
++ struct cluster_node *node;
++ struct cl_cluster_node k_node, *u_node;
++
++ u_node = (struct cl_cluster_node *) arg;
++
++ if (copy_from_user(&k_node, u_node, sizeof(struct cl_cluster_node)))
++ return -EFAULT;
++
++ if (k_node.node_id)
++ node = find_node_by_nodeid(k_node.node_id);
++ else
++ node = find_node_by_name(k_node.name);
++
++ if (!node)
++ return -ENOENT;
++
++ copy_to_usernode(node, &k_node);
++
++ if (copy_to_user(u_node, &k_node, sizeof(struct cl_cluster_node)))
++ return -EFAULT;
++
++ return 0;
++}
++
++static int do_ioctl_set_expected(unsigned long arg)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ unsigned int total_votes;
++ unsigned int newquorum;
++
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++ if (arg == 0)
++ return -EINVAL;
++
++ newquorum = calculate_quorum(1, arg, &total_votes);
++
++ if (newquorum < total_votes / 2
++ || newquorum > total_votes) {
++ return -EINVAL;
++ }
++
++ /* Now do it */
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++ if (node->state == NODESTATE_MEMBER
++ && node->expected_votes > arg) {
++ node->expected_votes = arg;
++ }
++ }
++ up(&cluster_members_lock);
++
++ recalculate_quorum(1);
++
++ send_reconfigure(RECONFIG_PARAM_EXPECTED_VOTES, arg);
++ sm_member_update(cluster_is_quorate);
++
++ return 0;
++}
++
++static int do_ioctl_kill_node(unsigned long arg)
++{
++ struct cluster_node *node;
++
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++
++ if ((node = find_node_by_nodeid(arg)) == NULL)
++ return -EINVAL;
++
++ /* Can't kill us */
++ if (node->us)
++ return -EINVAL;
++
++ if (node->state != NODESTATE_MEMBER)
++ return -EINVAL;
++
++ /* Just in case it is alive, send a KILL message */
++ send_kill(arg);
++
++ node->leave_reason = CLUSTER_LEAVEFLAG_KILLED;
++ a_node_just_died(node);
++
++ return 0;
++}
++
++static int do_ioctl_barrier(unsigned long arg)
++{
++ struct cl_barrier_info info;
++
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ if (copy_from_user(&info, (void *)arg, sizeof(info)) != 0)
++ return -EFAULT;
++
++ switch (info.cmd) {
++ case BARRIER_IOCTL_REGISTER:
++ return kcl_barrier_register(info.name,
++ info.flags,
++ info.arg);
++ case BARRIER_IOCTL_CHANGE:
++ return kcl_barrier_setattr(info.name,
++ info.flags,
++ info.arg);
++ case BARRIER_IOCTL_WAIT:
++ return kcl_barrier_wait(info.name);
++ case BARRIER_IOCTL_DELETE:
++ return kcl_barrier_delete(info.name);
++ default:
++ return -EINVAL;
++ }
++}
++
++static int do_ioctl_islistening(unsigned long arg)
++{
++ DECLARE_WAITQUEUE(wq, current);
++ struct cl_listen_request rq;
++ struct cluster_node *rem_node;
++ int nodeid;
++ int result;
++ struct cl_waiting_listen_request *listen_request;
++
++ if (!arg)
++ return -EINVAL;
++
++ if (copy_from_user(&rq, (void *) arg, sizeof (rq)) != 0)
++ return -EFAULT;
++
++ nodeid = rq.nodeid;
++
++ rem_node = find_node_by_nodeid(nodeid);
++
++ /* Node not in the cluster */
++ if (!rem_node)
++ return -ENOENT;
++
++ if (rem_node->state != NODESTATE_MEMBER)
++ return -ENOTCONN;
++
++ /* If the request is for us then just look in the ports
++ * array */
++ if (nodeid == us->node_id)
++ return (port_array[rq.port] != 0) ? 1 : 0;
++
++ /* For a remote node we need to send a request out */
++
++ /* If we are in transition then wait until we are not */
++ while (in_transition()) {
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ add_wait_queue(&socket_waitq, &wq);
++
++ if (in_transition())
++ schedule();
++
++ set_task_state(current, TASK_RUNNING);
++ remove_wait_queue(&socket_waitq, &wq);
++
++ if (signal_pending(current))
++ return -EINTR;
++ }
++
++ /* Were we shut down before it completed ? */
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++
++ listen_request =
++ kmalloc(sizeof (struct cl_waiting_listen_request),
++ GFP_KERNEL);
++ if (!listen_request)
++ return -ENOMEM;
++
++ /* Build the request */
++ listen_request->waiting = 1;
++ listen_request->result = 0;
++ listen_request->tag = current->pid;
++ listen_request->nodeid = nodeid;
++ init_waitqueue_head(&listen_request->waitq);
++
++ down(&listenreq_lock);
++ list_add(&listen_request->list, &listenreq_list);
++ up(&listenreq_lock);
++
++ /* Now wait for the response to come back */
++ send_listen_request(rq.nodeid, rq.port);
++
++ while (listen_request->waiting) {
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ add_wait_queue(&listen_request->waitq, &wq);
++
++ if (listen_request->waiting)
++ schedule();
++
++ set_task_state(current, TASK_RUNNING);
++ remove_wait_queue(&listen_request->waitq, &wq);
++
++ if (signal_pending(current)) {
++ list_del(&listen_request->list);
++ kfree(listen_request);
++ return -ERESTARTSYS;
++ }
++ }
++ result = listen_request->result;
++ list_del(&listen_request->list);
++ kfree(listen_request);
++ return result;
++}
++
++static int do_ioctl_set_votes(unsigned long arg)
++{
++ unsigned int total_votes;
++ unsigned int newquorum;
++ int saved_votes;
++
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ /* Check votes is valid */
++ saved_votes = us->votes;
++ us->votes = arg;
++
++ newquorum = calculate_quorum(1, 0, &total_votes);
++
++ if (newquorum < total_votes / 2 || newquorum > total_votes) {
++ us->votes = saved_votes;
++ return -EINVAL;
++ }
++
++ recalculate_quorum(1);
++
++ send_reconfigure(RECONFIG_PARAM_NODE_VOTES, arg);
++
++ return 0;
++}
++
++static int cl_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
++{
++ int err = -EOPNOTSUPP;
++ struct list_head *proclist;
++ struct list_head *tmp;
++ struct notify_struct *notify;
++ struct cl_version cnxman_version;
++
++ switch (cmd) {
++ /* Process requests notification of cluster events */
++ case SIOCCLUSTER_NOTIFY:
++ notify = kmalloc(sizeof (struct notify_struct), GFP_KERNEL);
++ if (!notify)
++ return -ENOMEM;
++ notify->pid = current->pid;
++ notify->signal = arg;
++ down(&event_listener_lock);
++		list_add(&notify->list, &event_listener_list);
++ up(&event_listener_lock);
++ err = 0;
++ break;
++
++	/* Process is no longer interested in cluster events */
++	case SIOCCLUSTER_REMOVENOTIFY:
++		err = -EINVAL;
++
++ down(&event_listener_lock);
++ list_for_each_safe(proclist, tmp, &event_listener_list) {
++ notify =
++ list_entry(proclist, struct notify_struct, list);
++ if (notify->pid == current->pid) {
++				list_del(&notify->list);
++ kfree(notify);
++ err = 0;
++ }
++ }
++ up(&event_listener_lock);
++ break;
++
++ /* Return the cnxman version number */
++ case SIOCCLUSTER_GET_VERSION:
++ if (!arg)
++ return -EINVAL;
++ err = 0;
++ cnxman_version.major = CNXMAN_MAJOR_VERSION;
++ cnxman_version.minor = CNXMAN_MINOR_VERSION;
++ cnxman_version.patch = CNXMAN_PATCH_VERSION;
++ if (copy_to_user((void *) arg, &cnxman_version,
++ sizeof (struct cl_version))) {
++ return -EFAULT;
++ }
++ break;
++
++ /* Set the cnxman config version number */
++ case SIOCCLUSTER_SET_VERSION:
++ err = do_ioctl_set_version(arg);
++ break;
++
++ /* Return the active membership list */
++ case SIOCCLUSTER_GETMEMBERS:
++ err = do_ioctl_get_members(arg);
++ break;
++
++	/* Return the full membership list, including dead nodes */
++ case SIOCCLUSTER_GETALLMEMBERS:
++ err = do_ioctl_get_all_members(arg);
++ break;
++
++ case SIOCCLUSTER_GETNODE:
++ err = do_ioctl_get_node(arg);
++ break;
++
++ case SIOCCLUSTER_ISQUORATE:
++ return cluster_is_quorate;
++
++ case SIOCCLUSTER_ISACTIVE:
++ return atomic_read(&cnxman_running);
++
++ case SIOCCLUSTER_SETEXPECTED_VOTES:
++ err = do_ioctl_set_expected(arg);
++ break;
++
++ /* Change the number of votes for this node */
++ case SIOCCLUSTER_SET_VOTES:
++ err = do_ioctl_set_votes(arg);
++ break;
++
++ /* Return 1 if the specified node is listening on a given port */
++ case SIOCCLUSTER_ISLISTENING:
++ err = do_ioctl_islistening(arg);
++ break;
++
++ /* Forcibly kill a node */
++ case SIOCCLUSTER_KILLNODE:
++ err = do_ioctl_kill_node(arg);
++ break;
++
++ case SIOCCLUSTER_GET_JOINCOUNT:
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++ else
++ return atomic_read(&use_count);
++
++ /* ioctl interface to the barrier system */
++ case SIOCCLUSTER_BARRIER:
++ err = do_ioctl_barrier(arg);
++ break;
++
++ default:
++ err = sm_ioctl(sock, cmd, arg);
++ }
++ return err;
++}
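++
++/*
++ * Illustrative userspace sketch (not part of this patch): ask cnxman to
++ * raise SIGUSR1 in this process on cluster events, then cancel the
++ * notification.  The fd is hypothetical; the semantics follow the
++ * SIOCCLUSTER_NOTIFY/REMOVENOTIFY cases above.
++ *
++ *	ioctl(cluster_fd, SIOCCLUSTER_NOTIFY, SIGUSR1);
++ *	...
++ *	ioctl(cluster_fd, SIOCCLUSTER_REMOVENOTIFY, 0);
++ */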
++
++static int cl_shutdown(struct socket *sock, int how)
++{
++ struct sock *sk = sock->sk;
++ int err = -ENOTCONN;
++
++ lock_sock(sk);
++
++ if (sock->state == SS_UNCONNECTED)
++ goto out;
++
++ err = 0;
++ if (sock->state == SS_DISCONNECTING)
++ goto out;
++
++ err = -EINVAL;
++
++ if (how != SHUTDOWN_MASK)
++ goto out;
++
++ sk->sk_shutdown = how;
++ err = 0;
++
++ out:
++ release_sock(sk);
++
++ return err;
++}
++
++static int cl_setsockopt(struct socket *sock, int level, int optname,
++ char *optval, int optlen)
++{
++ struct sock *sk = sock->sk;
++ int err;
++
++ if (sk != master_sock)
++ return -EPERM;
++
++ lock_sock(sk);
++ err = __cl_setsockopt(sock, level, optname, optval, optlen, 0);
++ release_sock(sk);
++
++ return err;
++}
++
++static int add_clsock(int broadcast, int number, struct socket *sock,
++ struct file *file)
++{
++ struct cl_comms_socket *newsock =
++ kmalloc(sizeof (struct cl_comms_socket), GFP_KERNEL);
++ if (!newsock)
++ return -ENOMEM;
++
++ memset(newsock, 0, sizeof (*newsock));
++ newsock->number = number;
++ newsock->sock = sock;
++ if (broadcast) {
++ newsock->broadcast = 1;
++ newsock->recv_only = 0;
++ }
++ else {
++ newsock->broadcast = 0;
++ newsock->recv_only = 1;
++ }
++
++ newsock->file = file;
++ newsock->addr_len = sizeof(struct sockaddr_in6);
++
++ /* Mark it active until cnxman thread is running and ready to process
++ * messages */
++ set_bit(1, &newsock->active);
++
++ /* Find out what it's bound to */
++ newsock->sock->ops->getname(newsock->sock,
++ (struct sockaddr *)&newsock->saddr,
++ &newsock->addr_len, 0);
++
++ num_interfaces = max(num_interfaces, newsock->number);
++ if (!current_interface && newsock->broadcast)
++ current_interface = newsock;
++
++ /* Hook data_ready */
++ newsock->sock->sk->sk_data_ready = cnxman_data_ready;
++
++ /* Make an attempt to keep them in order */
++ list_add_tail(&newsock->list, &socket_list);
++
++ address_length = newsock->addr_len;
++ return 0;
++}
++
++static int __cl_setsockopt(struct socket *sock, int level, int optname,
++ char *optval, int optlen, int flags)
++{
++ struct file *file;
++ struct cl_join_cluster_info join_info;
++ int error;
++ int leave_flags;
++ struct cl_multicast_sock multicast_info;
++
++ if (optlen && !optval)
++ return -EINVAL;
++
++ switch (optname) {
++ case CLU_SET_MULTICAST:
++ case CLU_SET_RCVONLY:
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ if (optlen != sizeof (struct cl_multicast_sock))
++ return -EINVAL;
++
++ if (atomic_read(&cnxman_running))
++ return -EINVAL;
++
++ error = -EBADF;
++
++ if (copy_from_user(&multicast_info, optval, optlen))
++ return -EFAULT;
++
++ file = fget(multicast_info.fd);
++ if (file) {
++ struct inode *inode = file->f_dentry->d_inode;
++
++ error =
++ add_clsock(optname == CLU_SET_MULTICAST,
++ multicast_info.number, SOCKET_I(inode),
++ file);
++ if (error)
++ fput(file);
++ }
++ return error;
++
++ case CLU_SET_NODENAME:
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ if (atomic_read(&cnxman_running))
++ return -EINVAL;
++
++ if (optlen > MAX_CLUSTER_MEMBER_NAME_LEN)
++ return -EINVAL;
++
++ if (copy_from_user(nodename, optval, optlen))
++ return -EFAULT;
++ break;
++
++ case CLU_JOIN_CLUSTER:
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ if (atomic_read(&cnxman_running))
++ return -EALREADY;
++
++ if (optlen != sizeof (struct cl_join_cluster_info))
++ return -EINVAL;
++
++ if (copy_from_user(&join_info, optval, optlen))
++ return -EFAULT;
++
++ if (strlen(join_info.cluster_name) > MAX_CLUSTER_NAME_LEN)
++ return -EINVAL;
++
++ if (list_empty(&socket_list))
++ return -ENOTCONN;
++
++ set_votes(join_info.votes, join_info.expected_votes);
++ cluster_id = generate_cluster_id(join_info.cluster_name);
++ strncpy(cluster_name, join_info.cluster_name, MAX_CLUSTER_NAME_LEN);
++ two_node = join_info.two_node;
++ config_version = join_info.config_version;
++
++ quit_threads = 0;
++ acks_expected = 0;
++ init_completion(&cluster_thread_comp);
++ init_completion(&member_thread_comp);
++ if (allocate_nodeid_array())
++ return -ENOMEM;
++
++ kcluster_pid = kernel_thread(cluster_kthread, NULL, 0);
++ if (kcluster_pid < 0)
++ return kcluster_pid;
++
++ wait_for_completion(&cluster_thread_comp);
++ init_completion(&cluster_thread_comp);
++
++ atomic_set(&cnxman_running, 1);
++
++ /* Make sure we have a node name */
++ if (nodename[0] == '\0')
++ strcpy(nodename, system_utsname.nodename);
++
++ membership_pid = start_membership_services(kcluster_pid);
++ if (membership_pid < 0) {
++ quit_threads = 1;
++ wait_for_completion(&cluster_thread_comp);
++ init_completion(&member_thread_comp);
++ return membership_pid;
++ }
++
++ sm_start();
++ break;
++
++ case CLU_LEAVE_CLUSTER:
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ if (optlen != sizeof (int))
++ return -EINVAL;
++
++ if (copy_from_user(&leave_flags, optval, optlen))
++ return -EFAULT;
++
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++
++ if (in_transition())
++ return -EBUSY;
++
++ /* Ignore the use count if FORCE is set */
++ if (!(leave_flags & CLUSTER_LEAVEFLAG_FORCE)) {
++ if (atomic_read(&use_count))
++ return -ENOTCONN;
++ }
++
++ us->leave_reason = leave_flags;
++ quit_threads = 1;
++ wake_up_interruptible(&cnxman_waitq);
++
++ wait_for_completion(&cluster_thread_comp);
++ break;
++
++ default:
++ return -ENOPROTOOPT;
++ }
++
++ return 0;
++}
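++
++/*
++ * The expected calling sequence for the options above, as a hedged
++ * userspace sketch (fds, names and vote counts are hypothetical; the
++ * level argument is unused by cl_setsockopt() so 0 is passed): hand a
++ * bound UDP socket to cnxman, name the node, then join.
++ *
++ *	struct cl_multicast_sock mcast = { .fd = udp_fd, .number = 1 };
++ *	struct cl_join_cluster_info ji;
++ *
++ *	setsockopt(cluster_fd, 0, CLU_SET_MULTICAST, &mcast, sizeof(mcast));
++ *	setsockopt(cluster_fd, 0, CLU_SET_NODENAME, "node1", 6);
++ *
++ *	memset(&ji, 0, sizeof(ji));
++ *	strcpy(ji.cluster_name, "testcluster");
++ *	ji.votes = 1;
++ *	ji.expected_votes = 3;
++ *	setsockopt(cluster_fd, 0, CLU_JOIN_CLUSTER, &ji, sizeof(ji));
++ */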
++
++static int cl_getsockopt(struct socket *sock, int level, int optname,
++ char *optval, int *optlen)
++{
++ struct sock *sk = sock->sk;
++ int err;
++
++ lock_sock(sk);
++ err = __cl_getsockopt(sock, level, optname, optval, optlen, 0);
++ release_sock(sk);
++
++ return err;
++}
++
++static int __cl_getsockopt(struct socket *sock, int level, int optname,
++ char *optval, int *optlen, int flags)
++{
++
++ switch (optname) {
++ default:
++ return -ENOPROTOOPT;
++ }
++
++ return 0;
++}
++
++/* We'll be giving out reward points next... */
++/* Send the packet and save a copy in case someone loses theirs. Should be
++ * protected by the send mutexphore */
++static int __send_and_save(struct cl_comms_socket *csock, struct msghdr *msg,
++ int size, int needack)
++{
++ mm_segment_t fs;
++ int result;
++ struct iovec save_vectors[msg->msg_iovlen];
++
++ /* Save a copy of the IO vectors as send_msg mucks around with them and
++ * we may want to send the same stuff out more than once (for different
++ * interfaces)
++ */
++ memcpy(save_vectors, msg->msg_iov,
++ sizeof (struct iovec) * msg->msg_iovlen);
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ result = sock_sendmsg(csock->sock, msg, size);
++
++ set_fs(fs);
++
++ if (result >= 0 && acks_expected && needack) {
++
++ /* Start retransmit timer if it didn't go */
++ if (result == 0) {
++ start_short_timer();
++ }
++ else {
++ resend_delay = 1;
++ }
++ }
++
++ /* Restore IOVs */
++ memcpy(msg->msg_iov, save_vectors,
++ sizeof (struct iovec) * msg->msg_iovlen);
++
++ return result;
++}
++
++static void resend_last_message(void)
++{
++ struct msghdr msg;
++ struct iovec vec[1];
++ mm_segment_t fs;
++ int result;
++
++ P_COMMS("%ld resending last message: %d bytes: port=%d, cmd=%d\n",
++ jiffies, saved_msg_len, saved_msg_buffer[0],
++ saved_msg_buffer[6]);
++
++ /* Assume there is something wrong with the last interface */
++ current_interface = get_next_interface(current_interface);
++ if (num_interfaces > 1)
++ printk(KERN_WARNING CMAN_NAME ": Now using interface %d\n",
++ current_interface->number);
++
++ vec[0].iov_base = saved_msg_buffer;
++ vec[0].iov_len = saved_msg_len;
++
++ memset(&msg, 0, sizeof (msg));
++	msg.msg_name = &current_interface->saddr;
++ msg.msg_namelen = current_interface->addr_len;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = vec;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ result = sock_sendmsg(current_interface->sock, &msg, saved_msg_len);
++
++ set_fs(fs);
++
++ if (result < 0)
++ printk(KERN_ERR CMAN_NAME ": resend failed: %d\n", result);
++
++ /* Try indefinitely to send this, the backlog must die down eventually
++ * !? */
++ if (result == 0)
++ start_short_timer();
++
++ /* Send succeeded, continue waiting for ACKS */
++ if (result > 0)
++ start_ack_timer();
++
++}
++
++static int cl_recvmsg(struct kiocb *iocb, struct socket *sock,
++ struct msghdr *msg, size_t size, int flags)
++{
++ struct sock *sk = sock->sk;
++ struct sockaddr_cl *sin = (struct sockaddr_cl *) msg->msg_name;
++ struct cluster_sock *c = cluster_sk(sk);
++ struct sk_buff *skb;
++ int copied, err = 0;
++ int isoob = 0;
++
++ /* Socket was notified of shutdown, remove any pending skbs and return
++ * EOF */
++ if (!atomic_read(&cnxman_running)) {
++ while ((skb = skb_recv_datagram(sk, flags, MSG_DONTWAIT, &err)))
++ skb_free_datagram(sk, skb);
++ return 0; /* cnxman has left the building */
++ }
++
++ /* Generic datagram code does most of the work. If the user is not
++ * interested in OOB messages then ignore them */
++ do {
++ skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
++ if (!skb)
++ goto out;
++
++ /* Is it OOB */
++ if (skb->cb[0] & 0x80)
++ isoob = 1;
++ else
++ isoob = 0;
++
++ /* If it is and the user doesn't want it, then throw it away. */
++ if (isoob && !(flags & MSG_OOB)) {
++ skb_free_datagram(sk, skb);
++
++			/* If we peeked at an OOB message but the user doesn't
++			   want it then we need to discard it or we'll loop
++			   forever */
++ if (flags & MSG_PEEK) {
++ skb = skb_recv_datagram(sk, flags & ~MSG_PEEK,
++ MSG_DONTWAIT, &err);
++ if (skb)
++ skb_free_datagram(sk, skb);
++ }
++ }
++ }
++ while (isoob && !(flags & MSG_OOB));
++
++ copied = skb->len;
++ if (copied > size) {
++ copied = size;
++ msg->msg_flags |= MSG_TRUNC;
++ }
++ err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
++
++ if (err)
++ goto out_free;
++
++ if (msg->msg_name && msg->msg_namelen) {
++ memset(msg->msg_name, 0, msg->msg_namelen);
++
++ if (msg->msg_namelen >= sizeof (struct sockaddr_cl)) {
++
++ /* Nodeid is in native byte order - anything else is just
++ * perverse */
++ memcpy(&sin->scl_nodeid, skb->cb + 1, sizeof(int));
++ }
++ msg->msg_namelen = sizeof (struct sockaddr_cl);
++ sin->scl_port = c->port;
++ }
++
++ /* Top bit set in cb[0] means this is an OOB message */
++ if (skb->cb[0] & 0x80) {
++ msg->msg_flags |= MSG_OOB;
++ }
++
++ sock_recv_timestamp(msg, sk, skb);
++
++ err = copied;
++
++ out_free:
++ skb_free_datagram(sk, skb);
++
++ out:
++ return err;
++}
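++
++/*
++ * Receive-side sketch (illustrative, not part of this patch): a reader
++ * distinguishes OOB notifications from data by checking MSG_OOB in the
++ * returned msg_flags, and finds the sender in the sockaddr_cl that
++ * cl_recvmsg() writes back.
++ *
++ *	struct sockaddr_cl saddr;
++ *	struct msghdr msg;
++ *
++ *	msg.msg_name = &saddr;
++ *	msg.msg_namelen = sizeof(saddr);
++ *	... set up msg_iov ...
++ *	len = recvmsg(cluster_fd, &msg, 0);
++ *	if (msg.msg_flags & MSG_OOB)
++ *		handle_oob_event(saddr.scl_nodeid);	 (hypothetical)
++ */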
++
++/* Send a message out on all interfaces */
++static int send_to_all_ints(int nodeid, struct msghdr *our_msg, int size, int flags)
++{
++ struct sockaddr_in6 daddr;
++ struct cl_comms_socket *clsock;
++ int result = 0;
++
++ our_msg->msg_name = &daddr;
++
++ list_for_each_entry(clsock, &socket_list, list) {
++
++ /* Don't send out a recv-only socket */
++ if (!clsock->recv_only) {
++
++ /* For temporary node IDs send to the node's real IP address */
++ if (nodeid < 0) {
++ get_addr_from_temp_nodeid(nodeid, (char *)&daddr, &our_msg->msg_namelen);
++ }
++ else {
++ memcpy(&daddr, &clsock->saddr, clsock->addr_len);
++ our_msg->msg_namelen = clsock->addr_len;
++ }
++
++ result = __send_and_save(clsock, our_msg,
++ size + sizeof (struct cl_protheader),
++ !(flags & MSG_NOACK));
++ }
++ }
++ return result;
++}
++
++
++/* Internal common send message routine */
++static int __sendmsg(struct socket *sock, struct msghdr *msg, int size,
++ unsigned char port)
++{
++ int result = 0, i;
++ int flags = msg->msg_flags;
++ struct msghdr our_msg;
++ struct sockaddr_cl *caddr = msg->msg_name;
++ struct cl_protheader header;
++ struct iovec vectors[msg->msg_iovlen + 1];
++ int nodeid = 0;
++
++ if (size > MAX_CLUSTER_MESSAGE)
++ return -EINVAL;
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++
++ if (caddr)
++ nodeid = caddr->scl_nodeid;
++
++ /* Check that the node id (if present) is valid */
++ if (msg->msg_namelen && (!find_node_by_nodeid(nodeid) &&
++ !is_valid_temp_nodeid(nodeid))) {
++ return -ENOTCONN;
++ }
++
++ /* We can only have one send outstanding at a time so we might as well
++ * lock the whole send mechanism */
++ down(&send_lock);
++
++ while ((port > HIGH_PROTECTED_PORT
++ && (!cluster_is_quorate || in_transition()))
++ || (acks_expected > 0 && !(msg->msg_flags & MSG_NOACK))) {
++
++ DECLARE_WAITQUEUE(wq, current);
++ struct task_struct *tsk = current;
++
++ if (flags & MSG_DONTWAIT) {
++ up(&send_lock);
++ return -EAGAIN;
++ }
++
++ if (current->pid == kcluster_pid) {
++ P_COMMS
++ ("Tried to make kclusterd wait, port=%d, acks_count=%d, expected=%d\n",
++ port, ack_count, acks_expected);
++ up(&send_lock);
++ return -EAGAIN;
++ }
++
++ P_COMMS("%s process waiting. acks=%d, expected=%d\n", tsk->comm,
++ ack_count, acks_expected);
++
++ set_task_state(tsk, TASK_INTERRUPTIBLE);
++ add_wait_queue(&socket_waitq, &wq);
++
++ if ((port > HIGH_PROTECTED_PORT
++ && (!cluster_is_quorate || in_transition()))
++ || (acks_expected > 0)) {
++
++ up(&send_lock);
++ schedule();
++ down(&send_lock);
++ }
++
++ /* Going down */
++ if (quit_threads) {
++ up(&send_lock);
++ return -ENOTCONN;
++ }
++
++ set_task_state(tsk, TASK_RUNNING);
++ remove_wait_queue(&socket_waitq, &wq);
++
++ if (signal_pending(current)) {
++ up(&send_lock);
++ return -ERESTARTSYS;
++ }
++
++ /* Were we shut down in the meantime ? */
++ if (!atomic_read(&cnxman_running)) {
++ up(&send_lock);
++ return -ENOTCONN;
++ }
++
++ }
++
++ memset(&our_msg, 0, sizeof (our_msg));
++
++ /* Build the header */
++ header.port = port;
++ header.flags = msg->msg_flags >> 16;
++ header.cluster = cpu_to_le16(cluster_id);
++ header.srcid = us ? cpu_to_le32(us->node_id) : 0;
++ header.tgtid = caddr ? cpu_to_le32(nodeid) : 0;
++
++ ++cur_seq;
++ header.seq = cpu_to_le16(cur_seq);
++
++ /* Set the MULTICAST flag on messages with no particular destination */
++ if (!msg->msg_namelen) {
++ header.flags |= MSG_MULTICAST >> 16;
++ header.tgtid = 0;
++ }
++
++ /* Copy the existing iovecs into our array and add the header on at the
++ * beginning */
++ vectors[0].iov_base = &header;
++ vectors[0].iov_len = sizeof (header);
++ for (i = 0; i < msg->msg_iovlen; i++) {
++ vectors[i + 1] = msg->msg_iov[i];
++ }
++
++ our_msg.msg_iovlen = msg->msg_iovlen + 1;
++ our_msg.msg_iov = vectors;
++
++ /* Work out how many ACKS are wanted - *don't* reset acks_expected to
++ * zero if no acks are required as an ACK-needed message may still be
++ * outstanding */
++ if (!(msg->msg_flags & MSG_NOACK)) {
++ if (msg->msg_namelen)
++ acks_expected = 1; /* Unicast */
++ else
++ acks_expected = max(cluster_members - 1, 0);
++
++ }
++
++ P_COMMS
++ ("Sending message - tgt=%d port %d required %d acks, seq=%d, flags=%x\n",
++ nodeid, header.port,
++ (msg->msg_flags & MSG_NOACK) ? 0 : acks_expected,
++ le16_to_cpu(header.seq), header.flags);
++
++ /* Don't include temp nodeids in the message itself */
++ if (header.tgtid < 0)
++ header.tgtid = 0;
++
++ /* For non-member sends we use all the interfaces */
++ if ((nodeid < 0) || (flags & MSG_ALLINT)) {
++
++ result = send_to_all_ints(nodeid, &our_msg, size, msg->msg_flags);
++ }
++ else {
++ /* Send to only the current socket - resends will use the
++ * others if necessary */
++		our_msg.msg_name = &current_interface->saddr;
++ our_msg.msg_namelen = current_interface->addr_len;
++
++ result =
++ __send_and_save(current_interface, &our_msg,
++ size + sizeof (header),
++ !(msg->msg_flags & MSG_NOACK));
++ }
++
++	/* Make a note in each node's structure that it has been sent a message
++	 * so we can see which ones went astray */
++ if (!(flags & MSG_NOACK) && nodeid >= 0) {
++ if (msg->msg_namelen) {
++ struct cluster_node *node;
++
++ node = find_node_by_nodeid(le32_to_cpu(header.tgtid));
++ if (node)
++ node->last_seq_sent = cur_seq;
++ }
++ else {
++ struct cluster_node *node;
++ struct list_head *nodelist;
++
++ list_for_each(nodelist, &cluster_members_list) {
++ node =
++ list_entry(nodelist, struct cluster_node,
++ list);
++ if (node->state == NODESTATE_MEMBER) {
++ node->last_seq_sent = cur_seq;
++ }
++ }
++ }
++ }
++
++ /* Save a copy of the message if we're expecting an ACK */
++ if (!(flags & MSG_NOACK) && acks_expected) {
++ mm_segment_t fs;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ memcpy_fromiovec(saved_msg_buffer, our_msg.msg_iov,
++ size + sizeof (header));
++ set_fs(fs);
++
++ saved_msg_len = size + sizeof (header);
++ retry_count = ack_count = 0;
++ clear_bit(RESEND_NEEDED, &mainloop_flags);
++
++ start_ack_timer();
++ }
++
++ up(&send_lock);
++ return result;
++}
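++
++/*
++ * To summarise __sendmsg(): every datagram on the wire is a cl_protheader
++ * followed by the caller's data.  The header carries the destination port,
++ * the 16-bit cluster id, little-endian source/target node ids and a
++ * sequence number; a message with no explicit destination is flagged
++ * MSG_MULTICAST and goes to all members.  Unless MSG_NOACK is set we
++ * expect one ACK for a unicast, or cluster_members - 1 for a multicast,
++ * and keep a copy in saved_msg_buffer so resend_last_message() can
++ * retransmit on another interface if the ACKs don't arrive.
++ */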
++
++static int queue_message(void *buf, int len, struct sockaddr_cl *caddr,
++ unsigned char port, int flags)
++{
++ struct queued_message *qmsg;
++
++ qmsg = kmalloc(sizeof (struct queued_message),
++ (in_atomic()
++ || irqs_disabled())? GFP_ATOMIC : GFP_KERNEL);
++ if (qmsg == NULL)
++ return -1;
++
++ memcpy(qmsg->msg_buffer, buf, len);
++ qmsg->msg_len = len;
++ if (caddr) {
++ memcpy(&qmsg->addr, caddr, sizeof (struct sockaddr_cl));
++ qmsg->addr_len = sizeof (struct sockaddr_cl);
++ }
++ else {
++ qmsg->addr_len = 0;
++ }
++ qmsg->flags = flags;
++ qmsg->port = port;
++ qmsg->socket = NULL;
++
++ down(&messages_list_lock);
++ list_add_tail(&qmsg->list, &messages_list);
++ up(&messages_list_lock);
++
++ wake_up_interruptible(&cnxman_waitq);
++
++ return 0;
++}
++
++static int cl_sendmsg(struct kiocb *iocb, struct socket *sock,
++ struct msghdr *msg, size_t size)
++{
++ struct cluster_sock *c = cluster_sk(sock->sk);
++ char *buffer;
++ int status;
++ int saved_iovlen;
++ uint8_t port;
++ struct iovec iov;
++ struct iovec *saved_iov;
++ struct sockaddr_cl *caddr = msg->msg_name;
++
++ if (sock->sk->sk_protocol == CLPROTO_MASTER)
++ return -EOPNOTSUPP;
++
++ port = c->port;
++
++ /* Only capable users can override the port number */
++ if (caddr && capable(CAP_CLUSTER) && caddr->scl_port)
++ port = caddr->scl_port;
++
++ if (port == 0)
++ return -EDESTADDRREQ;
++
++ /* Hmmm. On machines with segmented user/kernel space (sparc64, hppa &
++ * m68k AFAICT) we can't mix user and kernel space addresses in the
++ * IOV. This stymies __sendmsg a little as it tries to add a header to
++ * what could possibly be a userspace iov. So, here (where all the
++ * userspace sends come) we copy it to a kernel space buffer first. If
++ * performance is a big problem here then I might #ifdef it for the
++ * affected architectures but for now I think it will probably be OK */
++ buffer = kmalloc(size, GFP_KERNEL);
++ if (!buffer)
++ return -ENOMEM;
++
++ memcpy_fromiovec(buffer, msg->msg_iov, size);
++ iov.iov_len = size;
++ iov.iov_base = buffer;
++
++ saved_iov = msg->msg_iov;
++ saved_iovlen = msg->msg_iovlen;
++ msg->msg_iov = &iov;
++ msg->msg_iovlen = 1;
++
++ status = __sendmsg(sock, msg, size, port);
++ msg->msg_iov = saved_iov;
++ msg->msg_iovlen = saved_iovlen;
++
++ kfree(buffer);
++
++ return status;
++}
++
++/* Kernel call to sendmsg */
++int kcl_sendmsg(struct socket *sock, void *buf, int size,
++ struct sockaddr_cl *caddr, int addr_len, unsigned int flags)
++{
++ struct iovec iovecs[1];
++ struct msghdr msg;
++ struct cluster_sock *c = cluster_sk(sock->sk);
++ unsigned char port;
++
++ if (size > MAX_CLUSTER_MESSAGE)
++ return -EINVAL;
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++
++ port = c->port;
++ if (caddr && caddr->scl_port)
++ port = caddr->scl_port;
++
++ if (port == 0)
++ return -EDESTADDRREQ;
++
++ /* If we have no process context then queue it up for kclusterd to
++ * send. */
++ if (in_interrupt() || flags & MSG_QUEUE) {
++ return queue_message(buf, size, caddr, port,
++ flags & ~MSG_QUEUE);
++ }
++
++ iovecs[0].iov_base = buf;
++ iovecs[0].iov_len = size;
++
++ memset(&msg, 0, sizeof (msg));
++ msg.msg_name = caddr;
++ msg.msg_namelen = addr_len;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = iovecs;
++ msg.msg_flags = flags;
++
++ return __sendmsg(sock, &msg, size, port);
++}
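++
++/*
++ * Minimal in-kernel caller, as a hedged sketch (the port number, buffer
++ * and target are hypothetical).  Addressing follows the same sockaddr_cl
++ * convention used by send_listen_request() below.
++ *
++ *	struct sockaddr_cl caddr;
++ *
++ *	memset(&caddr, 0, sizeof(caddr));
++ *	caddr.scl_family = AF_CLUSTER;
++ *	caddr.scl_port = 12;
++ *	caddr.scl_nodeid = target_nodeid;
++ *	error = kcl_sendmsg(sock, buf, buflen, &caddr, sizeof(caddr), 0);
++ */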
++
++static int send_queued_message(struct queued_message *qmsg)
++{
++ struct iovec iovecs[1];
++ struct msghdr msg;
++
++ /* Don't send blocked messages */
++ if (qmsg->port > HIGH_PROTECTED_PORT
++ && (!cluster_is_quorate || in_transition()))
++ return -EAGAIN;
++
++ iovecs[0].iov_base = qmsg->msg_buffer;
++ iovecs[0].iov_len = qmsg->msg_len;
++
++ memset(&msg, 0, sizeof (msg));
++ msg.msg_name = qmsg->addr_len ? &qmsg->addr : NULL;
++ msg.msg_namelen = qmsg->addr_len;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = iovecs;
++ msg.msg_flags = qmsg->flags;
++
++ return __sendmsg(qmsg->socket, &msg, qmsg->msg_len, qmsg->port);
++}
++
++int kcl_register_read_callback(struct socket *sock,
++ int (*routine) (char *, int, char *, int,
++ unsigned int))
++{
++ struct cluster_sock *c = cluster_sk(sock->sk);
++
++ c->kernel_callback = routine;
++
++ return 0;
++}
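++
++/*
++ * Usage sketch for the hook above.  The argument meanings are an
++ * assumption inferred from the signature (data, data length, sender
++ * address, address length, sender node id):
++ *
++ *	static int my_data_ready(char *buf, int len, char *addr,
++ *				 int addrlen, unsigned int nodeid)
++ *	{
++ *		(consume the message in kernel context)
++ *		return 0;
++ *	}
++ *
++ *	kcl_register_read_callback(sock, my_data_ready);
++ */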
++
++/* Used where we are in kclusterd context and we can't allow the task to wait
++ * as we are also responsible for processing the ACKs that do the wake-up. Try
++ * to send the message immediately and queue it if that's not possible */
++static int send_or_queue_message(void *buf, int len, struct sockaddr_cl *caddr,
++ unsigned char port)
++{
++ struct iovec iovecs[1];
++ struct msghdr msg;
++
++ int status;
++
++ /* Don't send blocked messages */
++ if (port > HIGH_PROTECTED_PORT
++ && (!cluster_is_quorate || in_transition())) {
++ return queue_message(buf, len, caddr, port, 0);
++ }
++
++ iovecs[0].iov_base = buf;
++ iovecs[0].iov_len = len;
++
++ memset(&msg, 0, sizeof (msg));
++ msg.msg_name = caddr;
++ msg.msg_namelen = caddr ? sizeof (struct sockaddr_cl) : 0;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = iovecs;
++ msg.msg_flags = MSG_DONTWAIT;
++
++ status = __sendmsg(NULL, &msg, len, port);
++
++ /* Did it work ? */
++ if (status > 0) {
++ return 0;
++ }
++
++ /* Failure other than EAGAIN is fatal */
++ if (status != -EAGAIN) {
++ return status;
++ }
++
++ return queue_message(buf, len, caddr, port, 0);
++}
++
++/* Send a listen request to a node */
++static void send_listen_request(int nodeid, unsigned char port)
++{
++ struct cl_listenmsg listenmsg;
++ struct sockaddr_cl caddr;
++
++ memset(&caddr, 0, sizeof (caddr));
++
++ /* Build the header */
++ listenmsg.cmd = CLUSTER_CMD_LISTENREQ;
++ listenmsg.target_port = port;
++ listenmsg.listening = 0;
++ listenmsg.tag = current->pid;
++
++ caddr.scl_family = AF_CLUSTER;
++ caddr.scl_port = 0;
++ caddr.scl_nodeid = nodeid;
++
++ send_or_queue_message(&listenmsg, sizeof(listenmsg), &caddr, 0);
++ return;
++}
++
++/* Reply to a listen request: tell the asking node whether we have a listener
++ * (1) or not (0) on the requested port */
++static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
++ unsigned char port, unsigned short tag)
++{
++ struct cl_listenmsg listenmsg;
++ struct sockaddr_cl caddr;
++ int status;
++
++ memset(&caddr, 0, sizeof (caddr));
++
++ /* Build the message */
++ listenmsg.cmd = CLUSTER_CMD_LISTENRESP;
++ listenmsg.target_port = port;
++ listenmsg.tag = tag;
++ listenmsg.listening = (port_array[port] != 0) ? 1 : 0;
++
++ caddr.scl_family = AF_CLUSTER;
++ caddr.scl_port = 0;
++ caddr.scl_nodeid = nodeid;
++
++ status = send_or_queue_message(&listenmsg,
++ sizeof (listenmsg),
++ &caddr, 0);
++
++ return;
++}
++
++/* Send an ACK */
++static int cl_sendack(struct cl_comms_socket *csock, unsigned short seq,
++ int addr_len, char *addr, unsigned char remport,
++ unsigned char flag)
++{
++ mm_segment_t fs;
++ struct iovec vec;
++ struct cl_ackmsg ackmsg;
++ struct msghdr msg;
++ struct sockaddr_in6 daddr;
++ int result;
++
++#ifdef DEBUG_COMMS
++ char buf[MAX_ADDR_PRINTED_LEN];
++
++ P_COMMS("Sending ACK to %s, seq=%d\n",
++ print_addr(addr, address_length, buf), le16_to_cpu(seq));
++#endif
++
++ if (addr) {
++ memcpy(&daddr, addr, addr_len);
++ }
++ else {
++ memcpy(&daddr, &csock->saddr, csock->addr_len);
++ addr_len = csock->addr_len;
++ }
++
++ /* Build the header */
++ ackmsg.header.port = 0; /* Protocol port */
++ ackmsg.header.seq = 0;
++ ackmsg.header.flags = MSG_NOACK >> 16;
++ ackmsg.header.cluster = cpu_to_le16(cluster_id);
++ ackmsg.header.srcid = us ? cpu_to_le32(us->node_id) : 0;
++ ackmsg.header.tgtid = 0; /* ACKS are unicast so we don't bother
++ * to look this up */
++ ackmsg.cmd = CLUSTER_CMD_ACK;
++ ackmsg.remport = remport;
++ ackmsg.aflags = flag;
++ ackmsg.seq = seq; /* Already in LE order */
++ vec.iov_base = &ackmsg;
++ vec.iov_len = sizeof (ackmsg);
++
++ memset(&msg, 0, sizeof (msg));
++ msg.msg_name = &daddr;
++ msg.msg_namelen = addr_len;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = &vec;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ result = sock_sendmsg(csock->sock, &msg, sizeof (ackmsg));
++
++ set_fs(fs);
++
++ if (result < 0)
++ printk(KERN_CRIT CMAN_NAME ": error sending ACK: %d\n", result);
++
++ return result;
++
++}
++
++/* Wait for all ACKS to be gathered */
++void kcl_wait_for_all_acks(void)
++{
++ while (ack_count < acks_expected) {
++
++ DECLARE_WAITQUEUE(wq, current);
++ struct task_struct *tsk = current;
++
++ set_task_state(tsk, TASK_INTERRUPTIBLE);
++ add_wait_queue(&socket_waitq, &wq);
++
++ if (ack_count < acks_expected) {
++ schedule();
++ }
++
++ set_task_state(tsk, TASK_RUNNING);
++ remove_wait_queue(&socket_waitq, &wq);
++ }
++}
++
++/* Send a closedown OOB message to all cluster nodes - this tells them that a
++ * port listener has gone away */
++static void send_port_close_oob(unsigned char port)
++{
++ struct cl_closemsg closemsg;
++
++ /* Build the header */
++ closemsg.cmd = CLUSTER_CMD_PORTCLOSED;
++ closemsg.port = port;
++
++ send_or_queue_message(&closemsg, sizeof (closemsg), NULL, 0);
++ return;
++}
++
++/* A remote port has been closed - post an OOB message to the local listener
++ * on that port (if there is one) */
++static void post_close_oob(unsigned char port, int nodeid)
++{
++ struct cl_portclosed_oob *oobmsg;
++ struct sk_buff *skb;
++ struct sock *sock = port_array[port];
++
++ if (!sock) {
++ return; /* No-one listening */
++ }
++
++ skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
++ if (!skb)
++ return;
++
++ skb_put(skb, sizeof (*oobmsg));
++ oobmsg = (struct cl_portclosed_oob *) skb->data;
++ oobmsg->port = port;
++ oobmsg->cmd = CLUSTER_OOB_MSG_PORTCLOSED;
++ skb->cb[0] = 0x80;
++ memcpy(skb->cb + 1, &nodeid, sizeof(int));
++
++ sock_queue_rcv_skb(sock, skb);
++
++}
++
++/* Leave the cluster */
++static void node_shutdown(void)
++{
++ struct cl_barrier *barrier;
++ struct list_head *blist;
++ struct list_head *temp;
++ struct list_head *socklist;
++ struct cl_client_socket *csock;
++ struct sk_buff *null_skb;
++
++ printk(KERN_INFO CMAN_NAME ": we are leaving the cluster\n");
++
++ atomic_set(&cnxman_running, 0);
++ unjam();
++
++ /* Notify kernel listeners first */
++ notify_kernel_listeners(LEAVING, 0);
++
++ /* Notify client sockets */
++ down(&client_socket_lock);
++ list_for_each_safe(socklist, temp, &client_socket_list) {
++ csock = list_entry(socklist, struct cl_client_socket, list);
++
++ null_skb = alloc_skb(0, GFP_KERNEL);
++ if (null_skb)
++ sock_queue_rcv_skb(csock->sock->sk, null_skb);
++ list_del(&csock->list);
++ kfree(csock);
++ }
++ up(&client_socket_lock);
++ we_are_a_cluster_member = 0;
++
++ sm_stop(1);
++
++ /* Wake up any processes waiting for barriers */
++ down(&barrier_list_lock);
++ list_for_each(blist, &barrier_list) {
++ barrier = list_entry(blist, struct cl_barrier, list);
++
++ /* Cancel any timers */
++ if (timer_pending(&barrier->timer))
++ del_timer(&barrier->timer);
++
++ /* Force it to be auto-delete so it discards itself */
++ if (barrier->state == BARRIER_STATE_WAITING) {
++ barrier->flags |= BARRIER_ATTR_AUTODELETE;
++ wake_up_interruptible(&barrier->waitq);
++ }
++ else {
++ if (barrier->callback) {
++ barrier->callback(barrier->name, -ENOTCONN);
++ barrier->callback = NULL;
++ }
++ }
++ }
++ up(&barrier_list_lock);
++
++ /* Wake up any processes waiting for ISLISTENING requests */
++ down(&listenreq_lock);
++ list_for_each(blist, &listenreq_list) {
++ struct cl_waiting_listen_request *lrequest =
++ list_entry(blist, struct cl_waiting_listen_request, list);
++
++ if (lrequest->waiting)
++ wake_up_interruptible(&lrequest->waitq);
++ }
++ up(&listenreq_lock);
++}
++
++static void free_cluster_sockets(void)
++{
++ struct list_head *socklist;
++ struct cl_comms_socket *sock;
++ struct list_head *temp;
++
++ list_for_each_safe(socklist, temp, &socket_list) {
++ sock = list_entry(socklist, struct cl_comms_socket, list);
++
++ list_del(&sock->list);
++ fput(sock->file);
++ kfree(sock);
++ }
++ num_interfaces = 0;
++ current_interface = NULL;
++}
++
++/* Tidy up after all the rest of the cluster bits have shut down */
++static void node_cleanup(void)
++{
++ struct list_head *nodelist;
++ struct list_head *proclist;
++ struct list_head *temp;
++ struct list_head *socklist;
++ struct list_head *blist;
++ struct cl_comms_socket *sock;
++ struct kernel_notify_struct *knotify;
++
++ /* Free list of kernel listeners */
++ list_for_each_safe(proclist, temp, &kernel_listener_list) {
++ knotify =
++ list_entry(proclist, struct kernel_notify_struct, list);
++ list_del(&knotify->list);
++ kfree(knotify);
++ }
++
++ /* Mark the sockets as busy so they don't get added to the active
++ * sockets list in the next few lines of code before we free them */
++ list_for_each_safe(socklist, temp, &socket_list) {
++ sock = list_entry(socklist, struct cl_comms_socket, list);
++
++ set_bit(1, &sock->active);
++ }
++
++ /* Tidy the active sockets list */
++ list_for_each_safe(socklist, temp, &active_socket_list) {
++ sock =
++ list_entry(socklist, struct cl_comms_socket, active_list);
++ list_del(&sock->active_list);
++ }
++
++ /* Free the memory allocated to cluster nodes */
++ free_nodeid_array();
++ down(&cluster_members_lock);
++ us = NULL;
++ list_for_each_safe(nodelist, temp, &cluster_members_list) {
++
++ struct list_head *addrlist;
++ struct list_head *addrtemp;
++ struct cluster_node *node;
++ struct cluster_node_addr *nodeaddr;
++
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ list_for_each_safe(addrlist, addrtemp, &node->addr_list) {
++ nodeaddr =
++ list_entry(addrlist, struct cluster_node_addr,
++ list);
++
++ list_del(&nodeaddr->list);
++ kfree(nodeaddr);
++ }
++ list_del(&node->list);
++ kfree(node->name);
++ kfree(node);
++ }
++ cluster_members = 0;
++ up(&cluster_members_lock);
++
++ /* Free the memory allocated to the outgoing sockets */
++ free_cluster_sockets();
++
++ /* Make sure that all the barriers are deleted */
++ down(&barrier_list_lock);
++ list_for_each_safe(blist, temp, &barrier_list) {
++ struct cl_barrier *barrier =
++ list_entry(blist, struct cl_barrier, list);
++
++ list_del(&barrier->list);
++ kfree(barrier);
++ }
++ up(&barrier_list_lock);
++
++ kcluster_pid = 0;
++ clear_bit(RESEND_NEEDED, &mainloop_flags);
++ acks_expected = 0;
++}
++
++/* If "cluster_is_quorate" is 0 then all activity apart from protected ports is
++ * blocked. */
++void set_quorate(int total_votes)
++{
++ int quorate;
++
++ if (get_quorum() > total_votes) {
++ quorate = 0;
++ }
++ else {
++ quorate = 1;
++ }
++
++ /* Hide messages during startup state transition */
++ if (we_are_a_cluster_member) {
++ if (cluster_is_quorate && !quorate)
++ printk(KERN_CRIT CMAN_NAME
++ ": quorum lost, blocking activity\n");
++ if (!cluster_is_quorate && quorate)
++ printk(KERN_CRIT CMAN_NAME
++ ": quorum regained, resuming activity\n");
++ }
++ cluster_is_quorate = quorate;
++
++ /* Wake up any sleeping processes */
++ if (cluster_is_quorate) {
++ unjam();
++ }
++
++}
++
++void queue_oob_skb(struct socket *sock, int cmd)
++{
++ struct sk_buff *skb;
++ struct cl_portclosed_oob *oobmsg;
++
++ skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
++ if (!skb)
++ return;
++
++ skb_put(skb, sizeof (*oobmsg));
++ oobmsg = (struct cl_portclosed_oob *) skb->data;
++ oobmsg->port = 0;
++ oobmsg->cmd = cmd;
++
++ /* There is no remote node associated with this so
++ clear out the field to avoid any accidents */
++ memset(skb->cb, 0, sizeof(int));
++ skb->cb[0] = 0x80;
++
++ sock_queue_rcv_skb(sock->sk, skb);
++}
++
++/* Notify interested parties that the cluster configuration has changed */
++void notify_listeners(void)
++{
++ struct notify_struct *notify;
++ struct list_head *proclist;
++ struct list_head *socklist;
++ struct list_head *temp;
++
++ /* Do kernel listeners first */
++ notify_kernel_listeners(CLUSTER_RECONFIG, 0);
++
++ /* Now we deign to tell userspace */
++ down(&event_listener_lock);
++ list_for_each_safe(proclist, temp, &event_listener_list) {
++ notify = list_entry(proclist, struct notify_struct, list);
++
++ /* If the kill fails then remove the process from the list */
++ if (kill_proc(notify->pid, notify->signal, 0) == -ESRCH) {
++			list_del(&notify->list);
++ kfree(notify);
++ }
++ }
++ up(&event_listener_lock);
++
++ /* Tell userspace processes which want OOB messages */
++ down(&client_socket_lock);
++ list_for_each(socklist, &client_socket_list) {
++ struct cl_client_socket *csock;
++ csock = list_entry(socklist, struct cl_client_socket, list);
++ queue_oob_skb(csock->sock, CLUSTER_OOB_MSG_STATECHANGE);
++ }
++ up(&client_socket_lock);
++}
++
++/* This fills in the list of all addresses for the local node */
++void get_local_addresses(struct cluster_node *node)
++{
++ struct list_head *socklist;
++ struct cl_comms_socket *sock;
++
++ list_for_each(socklist, &socket_list) {
++ sock = list_entry(socklist, struct cl_comms_socket, list);
++
++ if (sock->recv_only) {
++ add_node_address(node, (char *) &sock->saddr, address_length);
++ }
++ }
++}
++
++
++static uint16_t generate_cluster_id(char *name)
++{
++	int i;
++	int len = strlen(name);
++	int value = 0;
++
++	for (i = 0; i < len; i++) {
++		value <<= 1;
++		value += name[i];
++	}
++ return value & 0xFFFF;
++}
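++
++/*
++ * Worked example: for the name "ab" the loop computes
++ * ((0 << 1) + 'a') = 97, then (97 << 1) + 'b' = 292, so the cluster id is
++ * 292.  This shift-and-add hash is deliberately cheap: long names lose
++ * their early bits to the shifts and distinct names can collide in the
++ * low 16 bits, so matching ids are a hint, not proof of the same name.
++ */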
++
++/* Return the next comms socket we can use. */
++static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur)
++{
++ int next;
++ struct list_head *socklist;
++
++ /* Fast path for single interface systems */
++ if (num_interfaces <= 1)
++ return cur;
++
++ /* Next number */
++ next = cur->number + 1;
++ if (next > num_interfaces)
++ next = 1;
++
++ /* Find the socket with this number, I could optimise this by starting
++ * at the current i/f but most systems are going to have a small number
++ * of them anyway */
++ list_for_each(socklist, &socket_list) {
++ struct cl_comms_socket *sock;
++ sock = list_entry(socklist, struct cl_comms_socket, list);
++
++ if (!sock->recv_only && sock->number == next)
++ return sock;
++ }
++
++ BUG();
++ return NULL;
++}
++
++/* MUST be called with the barrier list lock held */
++static struct cl_barrier *find_barrier(char *name)
++{
++ struct list_head *blist;
++ struct cl_barrier *bar;
++
++ list_for_each(blist, &barrier_list) {
++ bar = list_entry(blist, struct cl_barrier, list);
++
++ if (strcmp(name, bar->name) == 0)
++ return bar;
++ }
++ return NULL;
++}
++
++/* Do the stuff we need to do when the barrier has completed phase 1 */
++static void check_barrier_complete_phase1(struct cl_barrier *barrier)
++{
++ if (atomic_read(&barrier->got_nodes) == ((barrier->expected_nodes != 0)
++ ? barrier->expected_nodes :
++ cluster_members)) {
++
++ struct cl_barriermsg bmsg;
++
++ atomic_inc(&barrier->completed_nodes); /* We have completed */
++ barrier->phase = 2; /* Wait for complete phase II */
++
++ /* Send completion message, remember: we are in cnxman context
++ * and must not block */
++ bmsg.cmd = CLUSTER_CMD_BARRIER;
++ bmsg.subcmd = BARRIER_COMPLETE;
++ bmsg.flags = 0;
++ strcpy(bmsg.name, barrier->name);
++
++ P_BARRIER("Sending COMPLETE for %s\n", barrier->name);
++ queue_message((char *) &bmsg, sizeof (bmsg), NULL, 0, 0);
++ }
++}
++
++/* Do the stuff we need to do when the barrier has been reached */
++/* Return 1 if we deleted the barrier */
++static int check_barrier_complete_phase2(struct cl_barrier *barrier, int status)
++{
++ spin_lock_irq(&barrier->phase2_spinlock);
++
++ if (barrier->state != BARRIER_STATE_COMPLETE &&
++ (status == -ETIMEDOUT ||
++ atomic_read(&barrier->completed_nodes) ==
++ ((barrier->expected_nodes != 0)
++ ? barrier->expected_nodes : cluster_members))) {
++
++ if (status == 0 && barrier->timeout)
++ del_timer(&barrier->timer);
++ barrier->endreason = status;
++
++ /* Wake up listener */
++ if (barrier->state == BARRIER_STATE_WAITING) {
++ wake_up_interruptible(&barrier->waitq);
++ }
++ else {
++ /* Additional tasks we have to do if the user was not
++ * waiting... */
++ /* Call the callback */
++ if (barrier->callback) {
++ barrier->callback(barrier->name, 0);
++ barrier->callback = NULL;
++ }
++ /* Remove it if it's AUTO-DELETE */
++ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
++ list_del(&barrier->list);
++ spin_unlock_irq(&barrier->phase2_spinlock);
++ kfree(barrier);
++ return 1;
++ }
++ }
++ barrier->state = BARRIER_STATE_COMPLETE;
++ }
++ spin_unlock_irq(&barrier->phase2_spinlock);
++ return 0;
++}
++
++/* Called if a barrier timeout happens */
++static void barrier_timer_fn(unsigned long arg)
++{
++ struct cl_barrier *barrier = (struct cl_barrier *) arg;
++
++	/* Ignore any further messages, they are too late. */
++ barrier->phase = 0;
++
++ /* and cause it to timeout */
++ check_barrier_complete_phase2(barrier, -ETIMEDOUT);
++}
++
++/* Process BARRIER messages from other nodes */
++static void process_barrier_msg(struct cl_barriermsg *msg,
++ struct cluster_node *node)
++{
++ struct cl_barrier *barrier;
++
++ down(&barrier_list_lock);
++ barrier = find_barrier(msg->name);
++ up(&barrier_list_lock);
++
++	/* Ignore other people's messages; in_transition() is needed here so
++	 * that joining nodes will see their barrier messages before
++	 * we_are_a_cluster_member is set */
++ if (!we_are_a_cluster_member && !in_transition())
++ return;
++ if (!barrier)
++ return;
++
++ P_BARRIER("Got %d for %s, from node %s\n", msg->subcmd, msg->name,
++ node ? node->name : "unknown");
++
++ switch (msg->subcmd) {
++ case BARRIER_WAIT:
++ down(&barrier->lock);
++ if (barrier->phase == 0)
++ barrier->phase = 1;
++
++ if (barrier->phase == 1) {
++ atomic_inc(&barrier->got_nodes);
++ check_barrier_complete_phase1(barrier);
++ }
++ else {
++ printk(KERN_WARNING CMAN_NAME
++ ": got WAIT barrier not in phase 1 %s (%d)\n",
++ msg->name, barrier->phase);
++
++ }
++ up(&barrier->lock);
++ break;
++
++ case BARRIER_COMPLETE:
++ down(&barrier->lock);
++ atomic_inc(&barrier->completed_nodes);
++
++ /* First node to get all the WAIT messages sends COMPLETE, so
++ * we all complete */
++ if (barrier->phase == 1) {
++ atomic_set(&barrier->got_nodes,
++ barrier->expected_nodes);
++ check_barrier_complete_phase1(barrier);
++ }
++
++ if (barrier->phase == 2) {
++ /* If it was deleted (ret==1) then no need to unlock
++ * the mutex */
++ if (check_barrier_complete_phase2(barrier, 0) == 1)
++ return;
++ }
++ up(&barrier->lock);
++ break;
++ }
++}
++
++/* In-kernel membership API */
++int kcl_add_callback(void (*callback) (kcl_callback_reason, long arg))
++{
++ struct kernel_notify_struct *notify;
++
++ notify = kmalloc(sizeof (struct kernel_notify_struct), GFP_KERNEL);
++ if (!notify)
++ return -ENOMEM;
++ notify->callback = callback;
++
++ down(&kernel_listener_lock);
++	list_add(&notify->list, &kernel_listener_list);
++ up(&kernel_listener_lock);
++
++ return 0;
++}
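++
++/*
++ * Sketch of a subsystem registering for membership events (the callback
++ * body and helper are hypothetical; LEAVING and CLUSTER_RECONFIG are the
++ * reasons passed to notify_kernel_listeners() elsewhere in this file):
++ *
++ *	static void my_member_event(kcl_callback_reason reason, long arg)
++ *	{
++ *		if (reason == CLUSTER_RECONFIG)
++ *			recheck_members();	 (hypothetical helper)
++ *	}
++ *
++ *	kcl_add_callback(my_member_event);
++ */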
++
++int kcl_remove_callback(void (*callback) (kcl_callback_reason, long arg))
++{
++ struct list_head *calllist;
++ struct list_head *temp;
++ struct kernel_notify_struct *notify;
++
++ down(&kernel_listener_lock);
++ list_for_each_safe(calllist, temp, &kernel_listener_list) {
++ notify = list_entry(calllist, struct kernel_notify_struct, list);
++ if (notify->callback == callback){
++			list_del(&notify->list);
++ kfree(notify);
++ up(&kernel_listener_lock);
++ return 0;
++ }
++ }
++ up(&kernel_listener_lock);
++ return -EINVAL;
++}
++
++/* Return quorate status */
++int kcl_is_quorate(void)
++{
++ return cluster_is_quorate;
++}
++
++/* Return the address list for a node */
++struct list_head *kcl_get_node_addresses(int nodeid)
++{
++ struct cluster_node *node = find_node_by_nodeid(nodeid);
++
++ if (node)
++ return &node->addr_list;
++ else
++ return NULL;
++}
++
++static void copy_to_kclnode(struct cluster_node *node,
++ struct kcl_cluster_node *knode)
++{
++ strcpy(knode->name, node->name);
++ knode->size = sizeof (struct kcl_cluster_node);
++ knode->votes = node->votes;
++ knode->state = node->state;
++ knode->node_id = node->node_id;
++ knode->us = node->us;
++ knode->leave_reason = node->leave_reason;
++ knode->incarnation = node->incarnation;
++}
++
++/* Return the info for a node given its address. If addr is NULL then return
++ * OUR info */
++int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
++ struct kcl_cluster_node *n)
++{
++ struct cluster_node *node;
++
++ /* They want us */
++ if (addr == NULL) {
++ node = us;
++ }
++ else {
++ node = find_node_by_addr(addr, addr_len);
++ if (!node)
++ return -1;
++ }
++
++ /* Copy to user's buffer */
++ copy_to_kclnode(node, n);
++ return 0;
++}
++
++int kcl_get_node_by_name(unsigned char *name, struct kcl_cluster_node *n)
++{
++ struct cluster_node *node;
++
++ /* They want us */
++ if (name == NULL) {
++ node = us;
++ if (node == NULL)
++ return -1;
++ }
++ else {
++ node = find_node_by_name(name);
++ if (!node)
++ return -1;
++ }
++
++ /* Copy to user's buffer */
++ copy_to_kclnode(node, n);
++ return 0;
++}
++
++/* As above but by node id. MUCH faster */
++int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n)
++{
++ struct cluster_node *node;
++
++ /* They want us */
++ if (nodeid == 0) {
++ node = us;
++ if (node == NULL)
++ return -1;
++ }
++ else {
++ node = find_node_by_nodeid(nodeid);
++ if (!node)
++ return -1;
++ }
++
++ /* Copy to user's buffer */
++ copy_to_kclnode(node, n);
++ return 0;
++}
++
++/* Return a list of all cluster members ever */
++int kcl_get_all_members(struct list_head *list)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ struct kcl_cluster_node *newnode;
++ int num_nodes = 0;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ if (list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++ newnode =
++ kmalloc(sizeof (struct kcl_cluster_node),
++ GFP_KERNEL);
++ if (newnode) {
++ copy_to_kclnode(node, newnode);
++ list_add(&newnode->list, list);
++ num_nodes++;
++ }
++ }
++ else {
++ num_nodes++;
++ }
++ }
++ up(&cluster_members_lock);
++
++ return num_nodes;
++}
++
++/* Return a list of cluster members */
++int kcl_get_members(struct list_head *list)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ struct kcl_cluster_node *newnode;
++ int num_nodes = 0;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER) {
++ if (list) {
++ newnode =
++ kmalloc(sizeof (struct kcl_cluster_node),
++ GFP_KERNEL);
++ if (newnode) {
++ copy_to_kclnode(node, newnode);
++ list_add(&newnode->list, list);
++ num_nodes++;
++ }
++ }
++ else {
++ num_nodes++;
++ }
++ }
++ }
++ up(&cluster_members_lock);
++
++ return num_nodes;
++}
++
++/* Copy the current members' nodeids into the supplied buffer */
++int kcl_get_member_ids(uint32_t *idbuf, int size)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ int num_nodes = 0;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER) {
++ if (idbuf && size) {
++ idbuf[num_nodes] = node->node_id;
++ num_nodes++;
++ size--;
++ }
++ else {
++ num_nodes++;
++ }
++ }
++ }
++ up(&cluster_members_lock);
++
++ return num_nodes;
++}
++
++/* Barrier API */
++int kcl_barrier_register(char *name, unsigned int flags, unsigned int nodes)
++{
++ struct cl_barrier *barrier;
++
++ /* We are not joined to a cluster */
++ if (!we_are_a_cluster_member)
++ return -ENOTCONN;
++
++ /* Must have a valid name */
++ if (name == NULL || strlen(name) > MAX_BARRIER_NAME_LEN - 1)
++ return -EINVAL;
++
++ /* We don't do this yet */
++ if (flags & BARRIER_ATTR_MULTISTEP)
++ return -ENOTSUPP;
++
++ down(&barrier_list_lock);
++
++	/* See if it already exists */
++	if ((barrier = find_barrier(name))) {
++		/* Drop the lock exactly once on every path out */
++		up(&barrier_list_lock);
++		if (nodes != barrier->expected_nodes) {
++			printk(KERN_WARNING CMAN_NAME
++			       ": Barrier registration failed for '%s', expected nodes=%d, requested=%d\n",
++			       name, barrier->expected_nodes, nodes);
++			return -EINVAL;
++		}
++		return 0;
++	}
++
++ /* Build a new struct and add it to the list */
++ barrier = kmalloc(sizeof (struct cl_barrier), GFP_KERNEL);
++ if (barrier == NULL) {
++ up(&barrier_list_lock);
++ return -ENOMEM;
++ }
++ memset(barrier, 0, sizeof (*barrier));
++
++ strcpy(barrier->name, name);
++ barrier->flags = flags;
++ barrier->expected_nodes = nodes;
++ atomic_set(&barrier->got_nodes, 0);
++ atomic_set(&barrier->completed_nodes, 0);
++ barrier->endreason = 0;
++ barrier->registered_nodes = 1;
++ spin_lock_init(&barrier->phase2_spinlock);
++ barrier->state = BARRIER_STATE_INACTIVE;
++ init_MUTEX(&barrier->lock);
++
++ list_add(&barrier->list, &barrier_list);
++ up(&barrier_list_lock);
++
++ return 0;
++}
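++
++/*
++ * Typical in-kernel barrier usage, as a hedged sketch (the barrier name
++ * and node count are hypothetical): register, optionally arm a timeout in
++ * seconds, then wait for every expected node to arrive.
++ *
++ *	error = kcl_barrier_register("EXSVC.startup", 0, 3);
++ *	if (!error)
++ *		kcl_barrier_setattr("EXSVC.startup",
++ *				    BARRIER_SETATTR_TIMEOUT, 30);
++ *	error = kcl_barrier_wait("EXSVC.startup");
++ */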
++
++static int barrier_setattr_enabled(struct cl_barrier *barrier,
++ unsigned int attr, unsigned long arg)
++{
++ int status;
++
++ /* Can't disable a barrier */
++ if (!arg) {
++ up(&barrier->lock);
++ return -EINVAL;
++ }
++
++ /* We need to send WAIT now because the user may not
++ * actually call kcl_barrier_wait() */
++ if (!barrier->waitsent) {
++ struct cl_barriermsg bmsg;
++
++ /* Send it to the rest of the cluster */
++ bmsg.cmd = CLUSTER_CMD_BARRIER;
++ bmsg.subcmd = BARRIER_WAIT;
++ strcpy(bmsg.name, barrier->name);
++
++ barrier->waitsent = 1;
++ barrier->phase = 1;
++
++ atomic_inc(&barrier->got_nodes);
++
++ /* Start the timer if one was wanted */
++ if (barrier->timeout) {
++ init_timer(&barrier->timer);
++ barrier->timer.function = barrier_timer_fn;
++ barrier->timer.data = (long) barrier;
++ mod_timer(&barrier->timer, jiffies + (barrier->timeout * HZ));
++ }
++
++ /* Barrier WAIT and COMPLETE messages are
++ * always queued - that way they always get
++ * sent out in the right order. If we don't do
++ * this then one can get sent out in the
++ * context of the user process and the other in
++ * cnxman and COMPLETE may /just/ slide in
++ * before WAIT if its in the queue
++ */
++		P_BARRIER("Sending WAIT for %s\n", barrier->name);
++ status = queue_message(&bmsg, sizeof (bmsg), NULL, 0, 0);
++ if (status < 0) {
++ up(&barrier->lock);
++ return status;
++ }
++
++ /* It might have been reached now */
++ if (barrier
++ && barrier->state != BARRIER_STATE_COMPLETE
++ && barrier->phase == 1)
++ check_barrier_complete_phase1(barrier);
++ }
++ if (barrier && barrier->state == BARRIER_STATE_COMPLETE) {
++ up(&barrier->lock);
++ return barrier->endreason;
++ }
++ up(&barrier->lock);
++	return 0;		/* Nothing to propagate */
++}
++
++int kcl_barrier_setattr(char *name, unsigned int attr, unsigned long arg)
++{
++ struct cl_barrier *barrier;
++
++ /* See if it already exists */
++ down(&barrier_list_lock);
++ if (!(barrier = find_barrier(name))) {
++ up(&barrier_list_lock);
++ return -ENOENT;
++ }
++ up(&barrier_list_lock);
++
++ down(&barrier->lock);
++ if (barrier->state == BARRIER_STATE_COMPLETE) {
++ up(&barrier->lock);
++ return 0;
++ }
++
++ switch (attr) {
++ case BARRIER_SETATTR_AUTODELETE:
++ if (arg)
++ barrier->flags |= BARRIER_ATTR_AUTODELETE;
++ else
++ barrier->flags &= ~BARRIER_ATTR_AUTODELETE;
++ up(&barrier->lock);
++ return 0;
++
++ case BARRIER_SETATTR_TIMEOUT:
++		/* Can only change the timeout of an inactive barrier */
++ if (barrier->state == BARRIER_STATE_WAITING
++ || barrier->waitsent) {
++ up(&barrier->lock);
++ return -EINVAL;
++ }
++ barrier->timeout = arg;
++ up(&barrier->lock);
++ return 0;
++
++ case BARRIER_SETATTR_MULTISTEP:
++ up(&barrier->lock);
++ return -ENOTSUPP;
++
++ case BARRIER_SETATTR_ENABLED:
++ return barrier_setattr_enabled(barrier, attr, arg);
++
++ case BARRIER_SETATTR_NODES:
++ /* Can only change the expected node count of an inactive
++ * barrier */
++		if (barrier->state == BARRIER_STATE_WAITING
++		    || barrier->waitsent) {
++			up(&barrier->lock);
++			return -EINVAL;
++		}
++		barrier->expected_nodes = arg;
++		break;
++
++	case BARRIER_SETATTR_CALLBACK:
++		if (barrier->state == BARRIER_STATE_WAITING
++		    || barrier->waitsent) {
++			up(&barrier->lock);
++			return -EINVAL;
++		}
++		barrier->callback = (void (*)(char *, int)) arg;
++		up(&barrier->lock);
++		return 0;	/* Don't propagate this to other nodes */
++ }
++
++ up(&barrier->lock);
++ return 0;
++}
++
++int kcl_barrier_delete(char *name)
++{
++ struct cl_barrier *barrier;
++
++ down(&barrier_list_lock);
++ /* See if it exists */
++ if (!(barrier = find_barrier(name))) {
++ up(&barrier_list_lock);
++ return -ENOENT;
++ }
++
++ /* Delete it */
++ list_del(&barrier->list);
++ kfree(barrier);
++
++ up(&barrier_list_lock);
++
++ return 0;
++}
++
++int kcl_barrier_cancel(char *name)
++{
++ struct cl_barrier *barrier;
++
++ /* See if it exists */
++ down(&barrier_list_lock);
++ if (!(barrier = find_barrier(name))) {
++ up(&barrier_list_lock);
++ return -ENOENT;
++ }
++ down(&barrier->lock);
++
++ barrier->endreason = -ENOTCONN;
++
++ if (barrier->callback) {
++ barrier->callback(barrier->name, -ECONNRESET);
++ barrier->callback = NULL;
++ }
++
++ if (barrier->timeout)
++ del_timer(&barrier->timer);
++
++ /* Remove it if it's AUTO-DELETE */
++ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
++ list_del(&barrier->list);
++ up(&barrier->lock);
++ kfree(barrier);
++ up(&barrier_list_lock);
++ return 0;
++ }
++
++ if (barrier->state == BARRIER_STATE_WAITING)
++ wake_up_interruptible(&barrier->waitq);
++
++ up(&barrier->lock);
++ up(&barrier_list_lock);
++ return 0;
++}
++
++int kcl_barrier_wait(char *name)
++{
++ struct cl_barrier *barrier;
++ int ret;
++
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++
++ /* Enable it */
++ kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, 1L);
++
++ down(&barrier_list_lock);
++
++ /* See if it still exists - enable may have deleted it! */
++ if (!(barrier = find_barrier(name))) {
++ up(&barrier_list_lock);
++ return -ENOENT;
++ }
++
++ down(&barrier->lock);
++
++ up(&barrier_list_lock);
++
++ /* If it has already completed then return the status */
++ if (barrier->state == BARRIER_STATE_COMPLETE) {
++ up(&barrier->lock);
++ return barrier->endreason;
++ }
++
++ barrier->state = BARRIER_STATE_WAITING;
++
++ /* Have we all reached the barrier? */
++ while (atomic_read(&barrier->completed_nodes) !=
++ ((barrier->expected_nodes == 0)
++ ? cluster_members : barrier->expected_nodes)
++ && barrier->endreason == 0) {
++
++ wait_queue_t wq;
++
++ init_waitqueue_entry(&wq, current);
++ init_waitqueue_head(&barrier->waitq);
++
++ /* Wait for em all */
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ add_wait_queue(&barrier->waitq, &wq);
++
++ if (atomic_read(&barrier->completed_nodes) !=
++ ((barrier->expected_nodes ==
++ 0) ? cluster_members : barrier->expected_nodes)
++ && barrier->endreason == 0) {
++ up(&barrier->lock);
++ schedule();
++ down(&barrier->lock);
++ }
++
++ remove_wait_queue(&barrier->waitq, &wq);
++ set_task_state(current, TASK_RUNNING);
++
++ if (signal_pending(current)) {
++ barrier->endreason = -EINTR;
++ break;
++ }
++ }
++ barrier->state = BARRIER_STATE_INACTIVE;
++
++ if (barrier->timeout)
++ del_timer(&barrier->timer);
++
++ /* Barrier has been reached on all nodes, call the callback */
++ if (barrier->callback) {
++ barrier->callback(barrier->name, barrier->endreason);
++ barrier->callback = NULL;
++ }
++
++ atomic_set(&barrier->got_nodes, 0);
++
++ /* Return the reason we were woken */
++ ret = barrier->endreason;
++
++ /* Remove it if it's AUTO-DELETE */
++ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
++ down(&barrier_list_lock);
++ list_del(&barrier->list);
++ up(&barrier_list_lock);
++ up(&barrier->lock);
++ kfree(barrier);
++ }
++ else {
++ up(&barrier->lock);
++ }
++
++ /* We were woken up because the node left the cluster ? */
++ if (!atomic_read(&cnxman_running))
++ ret = -ENOTCONN;
++
++ return ret;
++}
++
++/* This is called from membership services when a node has left the cluster -
++ * we signal all waiting barriers with -ESRCH so they know to do something
++ * else. If the expected number of nodes is 0 then we compare the new number
++ * of nodes in the cluster with the number registered at the barrier and
++ * return 0 (success) when they match */
++void check_barrier_returns(void)
++{
++ struct list_head *blist;
++ struct list_head *llist;
++ struct cl_barrier *barrier;
++ int status = 0;
++
++ down(&barrier_list_lock);
++ list_for_each(blist, &barrier_list) {
++ barrier = list_entry(blist, struct cl_barrier, list);
++
++ if (barrier->waitsent) {
++ int wakeit = 0;
++
++ /* Check for a dynamic member barrier */
++ if (barrier->expected_nodes == 0) {
++ if (barrier->registered_nodes ==
++ cluster_members) {
++ status = 0;
++ wakeit = 1;
++ }
++ }
++ else {
++ status = -ESRCH;
++ wakeit = 1;
++ }
++
++ /* Do we need to tell the barrier? */
++ if (wakeit) {
++ if (barrier->state == BARRIER_STATE_WAITING) {
++ barrier->endreason = status;
++ wake_up_interruptible(&barrier->waitq);
++ }
++ else {
++ if (barrier->callback) {
++ barrier->callback(barrier->name,
++ status);
++ }
++ }
++ }
++ }
++ }
++ up(&barrier_list_lock);
++
++	/* Part 2: check for outstanding listen requests for dead nodes and
++	 * cancel them */
++ down(&listenreq_lock);
++ list_for_each(llist, &listenreq_list) {
++ struct cl_waiting_listen_request *lrequest =
++ list_entry(llist, struct cl_waiting_listen_request, list);
++ struct cluster_node *node =
++ find_node_by_nodeid(lrequest->nodeid);
++
++ if (node && node->state != NODESTATE_MEMBER) {
++ lrequest->result = -ENOTCONN;
++ lrequest->waiting = 0;
++ wake_up_interruptible(&lrequest->waitq);
++ }
++ }
++ up(&listenreq_lock);
++}
++
++int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen)
++{
++ struct temp_node *tn;
++ int err = 1; /* true */
++#ifdef DEBUG_COMMS
++ char buf[MAX_ADDR_PRINTED_LEN];
++#endif
++
++ down(&tempnode_lock);
++
++ list_for_each_entry(tn, &tempnode_list, list) {
++ if (tn->nodeid == nodeid) {
++ memcpy(addr, tn->addr, tn->addrlen);
++ *addrlen = tn->addrlen;
++ P_COMMS("get_temp_nodeid. id %d:\n: %s\n",
++ tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
++
++ goto out;
++ }
++ }
++ err = 0;
++
++ out:
++ up(&tempnode_lock);
++ return err;
++}
++
++/* Create a new temporary node ID. This list will only ever be very small
++ (usually only 1 item) but I can't take the risk that someone won't try to
++ boot 128 nodes all at exactly the same time. */
++int new_temp_nodeid(char *addr, int addrlen)
++{
++ struct temp_node *tn;
++ int err = -1;
++ int try_nodeid = 0;
++#ifdef DEBUG_COMMS
++ char buf[MAX_ADDR_PRINTED_LEN];
++#endif
++
++ P_COMMS("new_temp_nodeid needed for\n: %s\n",
++ print_addr(addr, addrlen, buf));
++
++ down(&tempnode_lock);
++
++ /* First see if we already know about this node */
++ list_for_each_entry(tn, &tempnode_list, list) {
++
++ P_COMMS("new_temp_nodeid list. id %d:\n: %s\n",
++ tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
++
++ /* We're already in here... */
++ if (tn->addrlen == addrlen &&
++ memcmp(tn->addr, addr, addrlen) == 0) {
++ P_COMMS("reused temp node ID %d\n", tn->nodeid);
++ err = tn->nodeid;
++ goto out;
++ }
++ }
++
++ /* Nope, OK, invent a suitable number */
++ retry:
++ try_nodeid -= 1;
++ list_for_each_entry(tn, &tempnode_list, list) {
++
++ if (tn->nodeid == try_nodeid)
++ goto retry;
++ }
++
++ tn = kmalloc(sizeof(struct temp_node), GFP_KERNEL);
++ if (!tn)
++ goto out;
++
++ memcpy(tn->addr, addr, addrlen);
++ tn->addrlen = addrlen;
++ tn->nodeid = try_nodeid;
++ list_add_tail(&tn->list, &tempnode_list);
++ err = try_nodeid;
++ P_COMMS("new temp nodeid = %d\n", try_nodeid);
++ out:
++ up(&tempnode_lock);
++ return err;
++}
++
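++/* An illustrative sketch of the temporary node ID lifecycle implied by the
++ * functions above: a joining node gets a negative temporary ID so it can
++ * never clash with a real (positive) node ID, the ID maps back to the node's
++ * address while the join is in progress, and it is discarded once the node
++ * becomes a full member. The buffer size here is invented for the example. */
++#if 0
++static void example_temp_nodeid_lifecycle(char *addr, int addrlen)
++{
++ char stored_addr[64]; /* assumed large enough for one address */
++ int stored_len;
++ int tmp_id;
++
++ /* Invent (or re-use) a negative ID for this address */
++ tmp_id = new_temp_nodeid(addr, addrlen);
++
++ /* While the join is in progress the ID maps back to the address */
++ if (get_addr_from_temp_nodeid(tmp_id, stored_addr, &stored_len)) {
++ /* ... talk to the joining node using stored_addr ... */
++ }
++
++ /* Once the node is a full member the temporary ID is dropped */
++ remove_temp_nodeid(tmp_id);
++}
++#endif
++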
++static int is_valid_temp_nodeid(int nodeid)
++{
++ struct temp_node *tn;
++ int err = 1; /* true */
++
++ down(&tempnode_lock);
++
++ list_for_each_entry(tn, &tempnode_list, list) {
++ if (tn->nodeid == nodeid)
++ goto out;
++ }
++ err = 0;
++
++ out:
++ P_COMMS("is_valid_temp_nodeid. %d = %d\n", nodeid, err);
++ up(&tempnode_lock);
++ return err;
++}
++
++/* TODO: This needs to clean the list more fully of nodes that are now
++ full members but whose transition we did not master */
++void remove_temp_nodeid(int nodeid)
++{
++ struct temp_node *tn;
++ struct temp_node *tmp;
++
++ down(&tempnode_lock);
++
++ list_for_each_entry_safe(tn, tmp, &tempnode_list, list) {
++ if (nodeid == tn->nodeid) {
++ list_del(&tn->list);
++ kfree(tn);
++ up(&tempnode_lock);
++ return;
++ }
++ }
++
++ up(&tempnode_lock);
++}
++
++/* Quorum device functions */
++int kcl_register_quorum_device(char *name, int votes)
++{
++ if (quorum_device)
++ return -EBUSY;
++
++ if (find_node_by_name(name))
++ return -EINVAL;
++
++ quorum_device = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
++ if (!quorum_device)
++ return -ENOMEM;
++ memset(quorum_device, 0, sizeof (struct cluster_node));
++
++ quorum_device->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
++ if (!quorum_device->name) {
++ kfree(quorum_device);
++ quorum_device = NULL;
++ return -ENOMEM;
++ }
++
++ strcpy(quorum_device->name, name);
++ quorum_device->votes = votes;
++ quorum_device->state = NODESTATE_DEAD;
++
++ /* Keep this list valid so it doesn't confuse other code */
++ INIT_LIST_HEAD(&quorum_device->addr_list);
++
++ return 0;
++}
++
++int kcl_unregister_quorum_device(void)
++{
++ if (!quorum_device)
++ return -EINVAL;
++ if (quorum_device->state == NODESTATE_MEMBER)
++ return -EINVAL;
++
++ /* Don't leak the node struct we allocated at registration time */
++ kfree(quorum_device->name);
++ kfree(quorum_device);
++ quorum_device = NULL;
++
++ return 0;
++}
++
++int kcl_quorum_device_available(int yesno)
++{
++ if (!quorum_device)
++ return -EINVAL;
++
++ if (yesno) {
++ quorum_device->last_hello = jiffies;
++ if (quorum_device->state == NODESTATE_DEAD) {
++ quorum_device->state = NODESTATE_MEMBER;
++ recalculate_quorum(0);
++ }
++ }
++ else {
++ if (quorum_device->state == NODESTATE_MEMBER) {
++ quorum_device->state = NODESTATE_DEAD;
++ recalculate_quorum(0);
++ }
++ }
++
++ return 0;
++}
++
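++/* An illustrative sketch of how a quorum-device driver might use the API
++ * above: register a pseudo-node worth some votes, then report it available
++ * or unavailable as its polling succeeds or fails. The device name, vote
++ * count and poll function are invented for the example. */
++#if 0
++static int example_qdisk_init(void)
++{
++ /* One extra vote counts towards quorum while the device is alive */
++ return kcl_register_quorum_device("example-qdisk", 1);
++}
++
++static int example_qdisk_poll(void)
++{
++ int alive = example_read_heartbeat_sector(); /* hypothetical */
++
++ /* Tell cnxman whether our votes should count towards quorum */
++ return kcl_quorum_device_available(alive);
++}
++#endif
++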
++/* APIs for cluster ref counting. */
++int kcl_addref_cluster()
++{
++ int ret = -ENOTCONN;
++
++ if (!atomic_read(&cnxman_running))
++ goto addref_ret;
++
++ if (try_module_get(THIS_MODULE)) {
++ atomic_inc(&use_count);
++ ret = 0;
++ }
++
++ addref_ret:
++ return ret;
++}
++
++int kcl_releaseref_cluster()
++{
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++ atomic_dec(&use_count);
++ module_put(THIS_MODULE);
++ return 0;
++}
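++
++/* An illustrative sketch of the reference counting contract above: a
++ * subsystem takes a reference for as long as it depends on cluster
++ * services, and errors out if the cluster is not running. */
++#if 0
++static int example_use_cluster(void)
++{
++ int ret = kcl_addref_cluster();
++ if (ret)
++ return ret; /* cluster not running */
++
++ /* ... use cluster services (kcl_sendmsg, barriers, ...) ... */
++
++ kcl_releaseref_cluster();
++ return 0;
++}
++#endif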
++
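++/* Note for callers: the name returned in *cname is allocated with kmalloc()
++ * and must be freed with kfree() when no longer needed. */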
++int kcl_cluster_name(char **cname)
++{
++ char *name;
++
++ name = kmalloc(strlen(cluster_name) + 1, GFP_KERNEL);
++ if (!name)
++ return -ENOMEM;
++
++ strcpy(name, cluster_name);
++ *cname = name;
++ return 0;
++}
++
++int kcl_get_current_interface(void)
++{
++ return current_interface->number;
++}
++
++/* Socket registration stuff */
++static struct net_proto_family cl_family_ops = {
++ .family = AF_CLUSTER,
++ .create = cl_create
++};
++
++static struct proto_ops cl_proto_ops = {
++ .family = AF_CLUSTER,
++
++ .release = cl_release,
++ .bind = cl_bind,
++ .connect = sock_no_connect,
++ .socketpair = sock_no_socketpair,
++ .accept = sock_no_accept,
++ .getname = cl_getname,
++ .poll = cl_poll,
++ .ioctl = cl_ioctl,
++ .listen = sock_no_listen,
++ .shutdown = cl_shutdown,
++ .setsockopt = cl_setsockopt,
++ .getsockopt = cl_getsockopt,
++ .sendmsg = cl_sendmsg,
++ .recvmsg = cl_recvmsg,
++ .mmap = sock_no_mmap,
++ .sendpage = sock_no_sendpage,
++};
++
++#ifdef MODULE
++MODULE_DESCRIPTION("Cluster Connection and Service Manager");
++MODULE_AUTHOR("Red Hat, Inc");
++MODULE_LICENSE("GPL");
++#endif
++
++static int __init cluster_init(void)
++{
++ printk("CMAN %s (built %s %s) installed\n",
++ CMAN_RELEASE_NAME, __DATE__, __TIME__);
++
++ /* Allocate our sock slab cache */
++ cluster_sk_cachep = kmem_cache_create("cluster_sock",
++ sizeof (struct cluster_sock), 0,
++ SLAB_HWCACHE_ALIGN, NULL, NULL);
++ if (!cluster_sk_cachep) {
++ printk(KERN_CRIT
++ "cluster_init: Cannot create cluster_sock SLAB cache\n");
++ return -ENOMEM;
++ }
++
++ if (sock_register(&cl_family_ops)) {
++ printk(KERN_INFO "Unable to register cluster socket type\n");
++ kmem_cache_destroy(cluster_sk_cachep);
++ return -1;
++ }
++
++#ifdef CONFIG_PROC_FS
++ create_proc_entries();
++#endif
++
++ init_MUTEX(&start_thread_sem);
++ init_MUTEX(&send_lock);
++ init_MUTEX(&barrier_list_lock);
++ init_MUTEX(&cluster_members_lock);
++ init_MUTEX(&port_array_lock);
++ init_MUTEX(&messages_list_lock);
++ init_MUTEX(&listenreq_lock);
++ init_MUTEX(&client_socket_lock);
++ init_MUTEX(&new_dead_node_lock);
++ init_MUTEX(&event_listener_lock);
++ init_MUTEX(&kernel_listener_lock);
++ init_MUTEX(&tempnode_lock);
++ spin_lock_init(&active_socket_lock);
++ init_timer(&ack_timer);
++
++ INIT_LIST_HEAD(&event_listener_list);
++ INIT_LIST_HEAD(&kernel_listener_list);
++ INIT_LIST_HEAD(&socket_list);
++ INIT_LIST_HEAD(&client_socket_list);
++ INIT_LIST_HEAD(&active_socket_list);
++ INIT_LIST_HEAD(&barrier_list);
++ INIT_LIST_HEAD(&messages_list);
++ INIT_LIST_HEAD(&listenreq_list);
++ INIT_LIST_HEAD(&cluster_members_list);
++ INIT_LIST_HEAD(&new_dead_node_list);
++ INIT_LIST_HEAD(&tempnode_list);
++
++ atomic_set(&cnxman_running, 0);
++
++ sm_init();
++
++ return 0;
++}
++
++static void __exit cluster_exit(void)
++{
++#ifdef CONFIG_PROC_FS
++ cleanup_proc_entries();
++#endif
++
++ sock_unregister(AF_CLUSTER);
++ kmem_cache_destroy(cluster_sk_cachep);
++}
++
++module_init(cluster_init);
++module_exit(cluster_exit);
++
++EXPORT_SYMBOL(kcl_sendmsg);
++EXPORT_SYMBOL(kcl_register_read_callback);
++EXPORT_SYMBOL(kcl_add_callback);
++EXPORT_SYMBOL(kcl_remove_callback);
++EXPORT_SYMBOL(kcl_get_members);
++EXPORT_SYMBOL(kcl_get_member_ids);
++EXPORT_SYMBOL(kcl_get_all_members);
++EXPORT_SYMBOL(kcl_is_quorate);
++EXPORT_SYMBOL(kcl_get_node_by_addr);
++EXPORT_SYMBOL(kcl_get_node_by_name);
++EXPORT_SYMBOL(kcl_get_node_by_nodeid);
++EXPORT_SYMBOL(kcl_get_node_addresses);
++EXPORT_SYMBOL(kcl_addref_cluster);
++EXPORT_SYMBOL(kcl_releaseref_cluster);
++EXPORT_SYMBOL(kcl_cluster_name);
++
++EXPORT_SYMBOL(kcl_barrier_register);
++EXPORT_SYMBOL(kcl_barrier_setattr);
++EXPORT_SYMBOL(kcl_barrier_delete);
++EXPORT_SYMBOL(kcl_barrier_wait);
++EXPORT_SYMBOL(kcl_barrier_cancel);
++
++EXPORT_SYMBOL(kcl_register_quorum_device);
++EXPORT_SYMBOL(kcl_unregister_quorum_device);
++EXPORT_SYMBOL(kcl_quorum_device_available);
++
++EXPORT_SYMBOL(kcl_register_service);
++EXPORT_SYMBOL(kcl_unregister_service);
++EXPORT_SYMBOL(kcl_join_service);
++EXPORT_SYMBOL(kcl_leave_service);
++EXPORT_SYMBOL(kcl_global_service_id);
++EXPORT_SYMBOL(kcl_start_done);
++EXPORT_SYMBOL(kcl_get_services);
++EXPORT_SYMBOL(kcl_get_current_interface);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -urN linux-orig/cluster/cman/config.c linux-patched/cluster/cman/config.c
+--- linux-orig/cluster/cman/config.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/config.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,46 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "config.h"
++
++/* Config file defaults */
++
++#define DEFAULT_JOIN_WAIT_TIME 11 /* Time to wait while sending JOINREQ
++ * messages. Should be at least twice
++ * the HELLO timer */
++#define DEFAULT_JOIN_TIMEOUT 30 /* How long we wait after getting a
++ * JOINACK before giving up on the
++ * join */
++#define DEFAULT_HELLO_TIMER 5 /* Period between HELLO messages */
++#define DEFAULT_DEADNODE_TIMER 21 /* If we don't get a message from a
++ * node in this period kill it */
++#define DEFAULT_TRANSITION_TIMER 15 /* Maximum time a state transition
++ * should take */
++#define DEFAULT_JOINCONF_TIMER 5 /* Time allowed to a node to respond to
++ * a JOINCONF message */
++#define DEFAULT_MAX_NODES 128 /* Max allowed nodes */
++#define DEFAULT_TRANSITION_RESTARTS 10 /* Maximum number of transition
++ * restarts before we die */
++#define DEFAULT_SM_DEBUG_SIZE 256 /* Size in bytes of SM debug buffer */
++
++struct config_info cman_config = {
++ .joinwait_timeout = DEFAULT_JOIN_WAIT_TIME,
++ .joinconf_timeout = DEFAULT_JOINCONF_TIMER,
++ .join_timeout = DEFAULT_JOIN_TIMEOUT,
++ .hello_timer = DEFAULT_HELLO_TIMER,
++ .deadnode_timeout = DEFAULT_DEADNODE_TIMER,
++ .transition_timeout = DEFAULT_TRANSITION_TIMER,
++ .transition_restarts = DEFAULT_TRANSITION_RESTARTS,
++ .max_nodes = DEFAULT_MAX_NODES,
++ .sm_debug_size = DEFAULT_SM_DEBUG_SIZE,
++};
+diff -urN linux-orig/cluster/cman/config.h linux-patched/cluster/cman/config.h
+--- linux-orig/cluster/cman/config.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/config.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,31 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __CONFIG_DOT_H__
++#define __CONFIG_DOT_H__
++
++struct config_info {
++ int joinwait_timeout;
++ int joinconf_timeout;
++ int join_timeout;
++ int hello_timer;
++ int deadnode_timeout;
++ int transition_timeout;
++ int transition_restarts;
++ int max_nodes;
++ int sm_debug_size;
++};
++
++extern struct config_info cman_config;
++
++#endif /* __CONFIG_DOT_H__ */
+diff -urN linux-orig/cluster/cman/kjoin.c linux-patched/cluster/cman/kjoin.c
+--- linux-orig/cluster/cman/kjoin.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/kjoin.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,238 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/socket.h>
++#include <net/sock.h>
++#include <linux/list.h>
++#include <cluster/cnxman.h>
++#include <linux/in.h>
++
++#include "cnxman-private.h"
++
++static struct socket *mcast_sock;
++static struct socket *recv_sock;
++static struct socket *cluster_sock;
++
++extern short cluster_id;
++extern int join_count;
++extern struct semaphore join_count_lock;
++extern atomic_t cnxman_running;
++
++int kcl_join_cluster(struct cl_join_cluster_info *join_info)
++{
++ int result;
++ int one = 1, error;
++ unsigned int ipaddr = join_info->ipaddr, brdaddr = join_info->brdaddr;
++ unsigned short port = join_info->port;
++ mm_segment_t fs;
++ struct sockaddr_in saddr;
++ struct kcl_multicast_sock mcast_info;
++
++ down(&join_count_lock);
++ if (atomic_read(&cnxman_running))
++ {
++ error = 0;
++ if (join_info->cluster_id == cluster_id)
++ join_count++;
++ else
++ error = -EINVAL;
++ up(&join_count_lock);
++ return error;
++ }
++ up(&join_count_lock);
++
++ result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &mcast_sock);
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME ": Can't create Multicast socket\n");
++ return result;
++ }
++
++ result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &recv_sock);
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME ": Can't create Receive socket\n");
++ sock_release(mcast_sock);
++ return result;
++ }
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ if ((error = sock_setsockopt(mcast_sock, SOL_SOCKET, SO_BROADCAST,
++ (void *) &one, sizeof (int))))
++ {
++ set_fs(fs);
++ printk("Error %d Setting master socket to SO_BROADCAST\n",
++ error);
++ sock_release(mcast_sock);
++ return -1;
++ }
++ set_fs(fs);
++
++ /* Bind the multicast socket */
++ saddr.sin_family = AF_INET;
++ saddr.sin_port = htons(port);
++ saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
++ result =
++ mcast_sock->ops->bind(mcast_sock, (struct sockaddr *) &saddr,
++ sizeof (saddr));
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME ": Can't bind multicast socket\n");
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ return result;
++ }
++
++ /* Bind the receive socket to our IP address */
++ saddr.sin_family = AF_INET;
++ saddr.sin_port = htons(port);
++ saddr.sin_addr.s_addr = cpu_to_be32(ipaddr);
++ result =
++ recv_sock->ops->bind(recv_sock, (struct sockaddr *) &saddr,
++ sizeof (saddr));
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME ": Can't bind receive socket\n");
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ return result;
++ }
++
++ /* Create the cluster master socket */
++ result =
++ sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER, &cluster_sock);
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME
++ ": Can't create cluster master socket\n");
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ return result;
++ }
++
++ /* This is the broadcast transmit address */
++ saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
++
++ /* Pass the multicast socket to kernel space */
++ mcast_info.sock = mcast_sock;
++ mcast_info.number = 1;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ if ((error = cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
++ KCL_SET_MULTICAST,
++ (void *) &mcast_info,
++ sizeof (mcast_info))))
++ {
++ set_fs(fs);
++ printk(CMAN_NAME
++ ": Unable to pass multicast socket to cnxman, %d\n",
++ error);
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ sock_release(cluster_sock);
++ return -1;
++ }
++
++ mcast_info.sock = recv_sock;
++ if ((error =
++ cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
++ KCL_SET_RCVONLY,
++ (void *) &mcast_info,
++ sizeof (mcast_info))))
++ {
++ set_fs(fs);
++ printk(CMAN_NAME
++ ": Unable to pass receive socket to cnxman, %d\n",
++ error);
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ sock_release(cluster_sock);
++ return -1;
++ }
++
++ /* This setsockopt expects usermode variables */
++
++ if (cluster_sock->ops->
++ setsockopt(cluster_sock, CLPROTO_MASTER, CLU_JOIN_CLUSTER,
++ (void *) join_info,
++ sizeof (struct cl_join_cluster_info)))
++
++ {
++ set_fs(fs);
++ printk(CMAN_NAME ": Unable to join cluster\n");
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ sock_release(cluster_sock);
++ return -1;
++ }
++ set_fs(fs);
++
++ return 0;
++}
++
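++/* An illustrative sketch of an in-kernel caller joining the cluster via the
++ * function above. The addresses, port and cluster ID are invented for the
++ * example; only fields referenced by kcl_join_cluster() are shown, and the
++ * full struct is defined in cnxman.h. */
++#if 0
++static int example_join(void)
++{
++ struct cl_join_cluster_info join_info;
++
++ memset(&join_info, 0, sizeof (join_info));
++ join_info.ipaddr = 0xc0a80001; /* 192.168.0.1, host byte order */
++ join_info.brdaddr = 0xc0a800ff; /* 192.168.0.255 */
++ join_info.port = 6809; /* hypothetical port number */
++ join_info.cluster_id = 1;
++
++ return kcl_join_cluster(&join_info);
++}
++#endif
++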
++int kcl_leave_cluster(int remove)
++{
++ mm_segment_t fs;
++ int rem = remove;
++ int ret = 0;
++ struct socket *shutdown_sock = cluster_sock;
++
++ cluster_sock = NULL;
++
++ if (!shutdown_sock)
++ {
++ /* Create the cluster master socket */
++ int result =
++ sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER,
++ &shutdown_sock);
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME
++ ": Can't create cluster master socket\n");
++ return result;
++ }
++ }
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ if ((ret =
++ shutdown_sock->ops->setsockopt(shutdown_sock, CLPROTO_MASTER,
++ CLU_LEAVE_CLUSTER, (void *) &rem,
++ sizeof (int))))
++ {
++ printk(KERN_ERR CMAN_NAME ": Unable to leave cluster, %d\n",
++ ret);
++ }
++ set_fs(fs);
++
++ sock_release(shutdown_sock);
++
++ return ret;
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -urN linux-orig/cluster/cman/membership.c linux-patched/cluster/cman/membership.c
+--- linux-orig/cluster/cman/membership.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/membership.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,3069 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/socket.h>
++#include <net/sock.h>
++#include <linux/slab.h>
++#include <linux/spinlock.h>
++#include <linux/vmalloc.h>
++#include <asm/uaccess.h>
++#include <linux/list.h>
++#include <cluster/cnxman.h>
++
++#include "cnxman-private.h"
++#include "config.h"
++#include "sm_control.h"
++
++#ifndef TRUE
++#define TRUE 1
++#endif
++
++/* Barrier name for membership transitions. %d is the cluster generation number
++ */
++#define MEMBERSHIP_BARRIER_NAME "TRANSITION.%d"
++
++/* Variables also used by connection manager */
++struct list_head cluster_members_list;
++struct semaphore cluster_members_lock;
++int cluster_members; /* Number of ACTIVE members, not a count of
++ * nodes in the list */
++int we_are_a_cluster_member = 0;
++int cluster_is_quorate;
++int quit_threads = 0;
++struct task_struct *membership_task;
++struct cluster_node *us;
++
++static struct task_struct *hello_task;
++static struct semaphore hello_task_lock;
++
++/* Variables that belong to the connection manager */
++extern wait_queue_head_t cnxman_waitq;
++extern struct completion member_thread_comp;
++extern struct cluster_node *quorum_device;
++extern unsigned short two_node;
++extern char cluster_name[];
++extern unsigned int config_version;
++extern unsigned int address_length;
++
++static struct socket *mem_socket;
++static pid_t kcluster_pid;
++
++static char iobuf[MAX_CLUSTER_MESSAGE];
++static char scratchbuf[MAX_CLUSTER_MESSAGE + 100];
++
++/* Our node name, usually system_utsname.nodename, but can be overridden */
++char nodename[MAX_CLUSTER_MEMBER_NAME_LEN + 1];
++
++static spinlock_t members_by_nodeid_lock;
++static int sizeof_members_array = 0; /* Can dynamically increase (vmalloc
++ * permitting) */
++static struct cluster_node **members_by_nodeid;
++
++#define MEMBER_INCREMENT_SIZE 10
++
++static int votes = 1; /* Votes this node has */
++static int expected_votes = 1; /* Total expected votes in the cluster */
++static unsigned int quorum; /* Quorum, fewer votes than this and we stop
++ * work */
++static int leavereason; /* Saved for the duration of a state transition */
++static int transitionreason; /* Reason this transition was initiated */
++static unsigned int highest_nodeid; /* Highest node ID known to the cluster */
++static struct timer_list transition_timer; /* Kicks in if the transition
++ * doesn't complete in a
++ * reasonable time */
++static struct timer_list hello_timer; /* Timer to send HELLOs on */
++static unsigned long join_time; /* The time that we got our JOIN-ACK */
++static unsigned long start_time; /* The time that we were started */
++static int joinconf_count; /* Number of JOINCONF messages we have sent to
++ * a new node */
++static unsigned long wake_flags;/* Reason we were woken */
++
++/* Flags in above */
++#define WAKE_FLAG_DEADNODE 1
++#define WAKE_FLAG_TRANSTIMER 2
++
++/* The time the transition finished */
++static unsigned long transition_end_time;
++
++/* A list of nodes that cnxman tells us are dead. I hope this never has more
++ * than one element in it but I can't take that chance. Only non-static so it
++ * can be initialised at module load. */
++struct list_head new_dead_node_list;
++struct semaphore new_dead_node_lock;
++
++static int do_membership_packet(struct msghdr *msg, int len);
++static int do_process_joinreq(struct msghdr *msg, int len);
++static int do_process_joinack(struct msghdr *msg, int len);
++static int do_process_joinconf(struct msghdr *msg, int len);
++static int do_process_leave(struct msghdr *msg, int len);
++static int do_process_hello(struct msghdr *msg, int len);
++static int do_process_kill(struct msghdr *msg, int len);
++static int do_process_reconfig(struct msghdr *msg, int len);
++static int do_process_starttrans(struct msghdr *msg, int len);
++static int do_process_masterview(struct msghdr *msg, int len);
++static int do_process_endtrans(struct msghdr *msg, int len);
++static int do_process_viewack(struct msghdr *msg, int len);
++static int do_process_startack(struct msghdr *msg, int len);
++static int do_process_newcluster(struct msghdr *msg, int len);
++static int do_process_nominate(struct msghdr *msg, int len);
++static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
++ unsigned int flags);
++static int send_joinreq(struct sockaddr_cl *addr, int addr_len);
++static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id);
++static int send_hello(void);
++static int send_master_hello(void);
++static int send_newcluster(void);
++static int end_transition(void);
++static int dispatch_messages(struct socket *mem_socket);
++static void check_for_dead_nodes(void);
++static void confirm_joiner(void);
++static void reset_hello_time(void);
++static int add_us(void);
++static int send_joinconf(void);
++static int init_membership_services(void);
++static int elect_master(struct cluster_node **);
++static void trans_timer_expired(unsigned long arg);
++static void hello_timer_expired(unsigned long arg);
++static void join_or_form_cluster(void);
++static int do_timer_wakeup(void);
++static int start_transition(unsigned char reason, struct cluster_node *node);
++int send_leave(unsigned char);
++int send_reconfigure(int, unsigned int);
++
++#ifdef DEBUG_MEMB
++static char *msgname(int msg);
++static int debug_sendmsg(struct socket *sock, void *buf, int size,
++ struct sockaddr_cl *caddr, int addr_len,
++ unsigned int flags)
++{
++ P_MEMB("%ld: sending %s, len=%d\n", jiffies, msgname(((char *) buf)[0]),
++ size);
++ return kcl_sendmsg(sock, buf, size, caddr, addr_len, flags);
++}
++
++#define kcl_sendmsg debug_sendmsg
++#endif
++
++/* State of the node */
++static enum { STARTING, JOINING, JOINWAIT, JOINACK, TRANSITION,
++ TRANSITION_COMPLETE, MEMBER, REJECTED, LEFT_CLUSTER, MASTER
++} node_state = STARTING;
++
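++/* Roughly, a joining node moves through these states (see the message
++ * handlers below): STARTING until another cluster is heard from; JOINWAIT
++ * and JOINING while the JOINREQ is sent; JOINACK once the master
++ * acknowledges it; TRANSITION and TRANSITION_COMPLETE while the membership
++ * transition runs; and finally MEMBER. A node that hears nothing at all
++ * forms its own cluster and goes straight to MEMBER. */
++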
++/* Sub-state when we are MASTER */
++static enum { MASTER_START, MASTER_COLLECT, MASTER_CONFIRM,
++ MASTER_COMPLETE } master_state;
++
++/* Number of responses collected while we are the master controlling a
++ * state transition */
++static int responses_collected;
++static int responses_expected;
++
++/* Current cluster generation number */
++static int cluster_generation = 1;
++
++/* When another node initiates a transition, store its pointer here so we can
++ * check for other nodes trying to spoof us */
++static struct cluster_node *master_node = NULL;
++
++/* The node that is currently trying to join us */
++static struct cluster_node *joining_node = NULL;
++static int joining_temp_nodeid = 0;
++
++/* Last time a HELLO message was sent */
++unsigned long last_hello = 0;
++
++/* When we got our JOINWAIT or NEWCLUSTER */
++unsigned long joinwait_time = 0;
++
++/* Number of times a transition has restarted when we were master */
++int transition_restarts = 0;
++
++/* Variables used by the master to collect cluster status during a transition */
++static int agreeing_nodes = 0;
++static int dissenting_nodes = 0;
++static uint8_t *node_opinion = NULL;
++#define OPINION_AGREE 1
++#define OPINION_DISAGREE 2
++
++/* Set node id of a node, also add it to the members array and expand the array
++ * if necessary */
++static inline void set_nodeid(struct cluster_node *node, int nodeid)
++{
++ if (!nodeid)
++ return;
++
++ node->node_id = nodeid;
++ if (nodeid >= sizeof_members_array) {
++ /* The array only has sizeof_members_array entries, so
++ * storing index nodeid needs new_size > nodeid */
++ int new_size = sizeof_members_array + MEMBER_INCREMENT_SIZE;
++ struct cluster_node **new_array;
++
++ if (nodeid >= new_size)
++ new_size = nodeid + MEMBER_INCREMENT_SIZE;
++
++ new_array = vmalloc(new_size * sizeof (struct cluster_node *));
++ if (new_array) {
++ spin_lock(&members_by_nodeid_lock);
++ memcpy(new_array, members_by_nodeid,
++ sizeof_members_array *
++ sizeof (struct cluster_node *));
++ memset(&new_array[sizeof_members_array], 0,
++ (new_size - sizeof_members_array) *
++ sizeof (struct cluster_node *));
++ vfree(members_by_nodeid);
++ members_by_nodeid = new_array;
++ sizeof_members_array = new_size;
++ spin_unlock(&members_by_nodeid_lock);
++ }
++ else {
++ panic("No memory for more nodes");
++ }
++ }
++ notify_kernel_listeners(NEWNODE, (long) nodeid);
++
++ spin_lock(&members_by_nodeid_lock);
++ members_by_nodeid[nodeid] = node;
++ spin_unlock(&members_by_nodeid_lock);
++}
++
++static int hello_kthread(void *unused)
++{
++ struct task_struct *tsk = current;
++ sigset_t tmpsig;
++
++ daemonize("cman_hbeat");
++
++ /* Allow only SIGKILL/SIGSTOP/SIGTERM to reach this thread; note
++ * that sigmask() is needed to turn signal numbers into mask bits */
++ siginitsetinv(&tmpsig, sigmask(SIGKILL) | sigmask(SIGSTOP) |
++ sigmask(SIGTERM));
++ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
++
++ down(&hello_task_lock);
++ hello_task = tsk;
++ up(&hello_task_lock);
++
++ set_user_nice(current, -6);
++
++ while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
++ send_hello();
++
++ /* Scan the nodes list for dead nodes */
++ if (node_state == MEMBER)
++ check_for_dead_nodes();
++
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ schedule();
++ set_task_state(current, TASK_RUNNING);
++ }
++ down(&hello_task_lock);
++ hello_task = NULL;
++ up(&hello_task_lock);
++ P_MEMB("heartbeat closing down\n");
++ return 0;
++}
++
++/* This is the membership "daemon". A client of cnxman (but symbiotic with it)
++ * that keeps track of and controls cluster membership. */
++static int membership_kthread(void *unused)
++{
++ struct task_struct *tsk = current;
++ struct socket *tmp_socket;
++ sigset_t tmpsig;
++
++ daemonize("cman_memb");
++
++ /* Allow only SIGKILL/SIGSTOP/SIGTERM to reach this thread */
++ siginitsetinv(&tmpsig, sigmask(SIGKILL) | sigmask(SIGSTOP) |
++ sigmask(SIGTERM));
++ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
++
++ membership_task = tsk;
++ set_user_nice(current, -5);
++
++ /* Open the socket */
++ if (init_membership_services())
++ return -1;
++
++ add_us();
++ joining_node = us;
++
++ init_timer(&hello_timer);
++ hello_timer.function = hello_timer_expired;
++ hello_timer.data = 0L;
++
++ /* Do joining stuff */
++ join_or_form_cluster();
++
++ transition_end_time = jiffies;
++
++ /* Main loop */
++ while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
++
++ DECLARE_WAITQUEUE(wait, current);
++
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ add_wait_queue(mem_socket->sk->sk_sleep, &wait);
++
++ if (!skb_peek(&mem_socket->sk->sk_receive_queue) &&
++ wake_flags == 0) {
++ if (node_state == JOINACK ||
++ node_state == JOINWAIT)
++ schedule_timeout(HZ);
++ else
++ schedule();
++ }
++
++ set_task_state(current, TASK_RUNNING);
++ remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
++
++ /* Are we being shut down? */
++ if (node_state == LEFT_CLUSTER || quit_threads ||
++ signal_pending(current))
++ break;
++
++ /* Were we woken by a dead node passed down from cnxman ? */
++ if (test_and_clear_bit(WAKE_FLAG_DEADNODE, &wake_flags)) {
++ struct list_head *nodelist, *tmp;
++ struct cl_new_dead_node *deadnode;
++
++ down(&new_dead_node_lock);
++ list_for_each_safe(nodelist, tmp, &new_dead_node_list) {
++ deadnode =
++ list_entry(nodelist,
++ struct cl_new_dead_node, list);
++
++ if (deadnode->node->state == NODESTATE_MEMBER)
++ a_node_just_died(deadnode->node);
++ list_del(&deadnode->list);
++ kfree(deadnode);
++ }
++ up(&new_dead_node_lock);
++ }
++
++ /* Process received messages. If dispatch_message() returns an
++ * error then we shut down */
++ if (skb_peek(&mem_socket->sk->sk_receive_queue)) {
++ if (dispatch_messages(mem_socket) < 0)
++ goto leave_cluster;
++
++ }
++
++ /* Were we woken by the transition timer firing ? */
++ if (test_and_clear_bit(WAKE_FLAG_TRANSTIMER, &wake_flags)) {
++ switch (do_timer_wakeup()) {
++ case -1:
++ continue;
++ case 0:
++ break;
++ case +1:
++ goto leave_cluster;
++ }
++ }
++
++ /* Got a JOINACK but no JOIN-CONF, start waiting for HELLO
++ * messages again */
++ if (node_state == JOINACK
++ && time_after(jiffies,
++ join_time + cman_config.join_timeout * HZ)) {
++ P_MEMB
++ ("Waited a long time for a join-conf, going back to JOINWAIT state\n");
++ node_state = JOINWAIT;
++ joinwait_time = jiffies;
++ }
++
++ /* Have we been in joinwait for too long... */
++ if (node_state == JOINWAIT
++ && time_after(jiffies, joinwait_time +
++ cman_config.join_timeout * HZ)) {
++ printk(CMAN_NAME
++ ": Been in JOINWAIT for too long - giving up\n");
++ goto leave_cluster;
++ }
++ }
++
++ leave_cluster:
++
++ /* Wake up the heartbeat thread so it can exit */
++ down(&hello_task_lock);
++ if (hello_task)
++ wake_up_process(hello_task);
++ up(&hello_task_lock);
++
++ if (timer_pending(&hello_timer))
++ del_timer(&hello_timer);
++
++ if (timer_pending(&transition_timer))
++ del_timer(&transition_timer);
++
++ node_state = LEFT_CLUSTER;
++ P_MEMB("closing down\n");
++ quit_threads = 1; /* force other thread to exit too */
++
++ /* Close the socket, NULL the pointer first so it doesn't get used
++ * by send_leave()
++ */
++ tmp_socket = mem_socket;
++ mem_socket = NULL;
++ sock_release(tmp_socket);
++ highest_nodeid = 0;
++ complete(&member_thread_comp);
++ return 0;
++}
++
++/* Things to do in the main thread when the transition timer has woken us.
++ * Usually this happens when a transition is taking too long and we need to
++ * take remedial action.
++ *
++ * returns: -1 continue; 0 carry on processing; +1 leave cluster */
++static int do_timer_wakeup()
++{
++ P_MEMB("Timer wakeup - checking for dead master node %ld\n", jiffies);
++
++ /* Resend JOINCONF if it got lost on the wire */
++ if (node_state == MASTER && master_state == MASTER_CONFIRM) {
++ mod_timer(&transition_timer,
++ jiffies + cman_config.joinconf_timeout * HZ);
++ if (++joinconf_count < MAX_RETRIES) {
++ P_MEMB("Resending JOINCONF\n");
++ send_joinconf();
++ }
++ else {
++ P_MEMB("JOINCONF not acked, cancelling transition\n");
++ end_transition();
++ }
++ return -1;
++ }
++
++ /* A joining node probably died */
++ if (cluster_members == 1) {
++ end_transition();
++ return -1;
++ }
++
++ /* See if the master is still there */
++ if (node_state == TRANSITION || node_state == TRANSITION_COMPLETE) {
++
++ /* If we are in transition and master_node is NULL then we are
++ * waiting for ENDTRANS after JOIN-CONF */
++ if (!master_node) {
++ /* Hmmm. The master died after sending JOINCONF; we'll
++ * have to die as we are in mid-transition */
++ printk(KERN_INFO CMAN_NAME
++ ": Master died after JOINCONF, we must leave the cluster\n");
++ quit_threads = 1;
++ return +1;
++ }
++
++ /* No messages from the master - see if it's still there */
++ if (master_node->state == NODESTATE_MEMBER) {
++ send_master_hello();
++ mod_timer(&transition_timer,
++ jiffies +
++ cman_config.transition_timeout * HZ);
++ }
++
++ /* If the master is dead then elect a new one */
++ if (master_node->state == NODESTATE_DEAD) {
++
++ struct cluster_node *node;
++
++ P_MEMB("Master node is dead...Election!\n");
++ if (elect_master(&node)) {
++
++ /* We are master now, all kneel */
++ start_transition(TRANS_DEADMASTER, master_node);
++ }
++ else {
++ /* Leave the job to someone on more pay */
++ master_node = node;
++ mod_timer(&transition_timer,
++ jiffies +
++ cman_config.transition_timeout * HZ);
++ }
++ }
++ }
++
++ /* If we are the master node then restart the transition */
++ if (node_state == MASTER) {
++ start_transition(TRANS_RESTART, us);
++ }
++
++ return 0;
++}
++
++static void form_cluster(void)
++{
++ printk(KERN_INFO CMAN_NAME ": forming a new cluster\n");
++ node_state = MEMBER;
++ we_are_a_cluster_member = TRUE;
++ us->node_id = 1;
++ us->state = NODESTATE_MEMBER;
++ set_nodeid(us, 1);
++ recalculate_quorum(0);
++ sm_member_update(cluster_is_quorate);
++ send_hello();
++ kernel_thread(hello_kthread, NULL, 0);
++ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
++}
++
++/* This does the initial JOIN part of the membership process. Actually most of
++ * it is done in the message processing routines but this is the main loop that
++ * controls it. The side-effect of this routine is "node_state" which tells the
++ * real main loop (in the kernel thread routine) what to do next */
++static void join_or_form_cluster()
++{
++ printk(KERN_INFO CMAN_NAME
++ ": Waiting to join or form a Linux-cluster\n");
++ join_time = 0;
++ start_time = jiffies;
++ joinwait_time = jiffies;
++ last_hello = 0;
++ send_newcluster();
++
++ /* Listen for a reply */
++ do {
++ DECLARE_WAITQUEUE(wait, current);
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ add_wait_queue(mem_socket->sk->sk_sleep, &wait);
++
++ if (!skb_peek(&mem_socket->sk->sk_receive_queue))
++ schedule_timeout((cman_config.joinwait_timeout * HZ) /
++ 5);
++
++ set_task_state(current, TASK_RUNNING);
++ remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
++
++ while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
++ dispatch_messages(mem_socket);
++ }
++ if (quit_threads)
++ node_state = LEFT_CLUSTER;
++
++ }
++ while (time_before(jiffies, start_time + cman_config.joinwait_timeout * HZ) &&
++ node_state == STARTING);
++
++ /* If we didn't hear any HELLO messages then form a new cluster */
++ if (node_state == STARTING) {
++ form_cluster();
++ }
++ else
++ last_hello = jiffies;
++
++}
++
++int start_membership_services(pid_t cluster_pid)
++{
++ kcluster_pid = cluster_pid;
++
++ init_timer(&transition_timer);
++ transition_timer.function = trans_timer_expired;
++ transition_timer.data = 0L;
++
++ /* Start the thread */
++ return kernel_thread(membership_kthread, NULL, 0);
++}
++
++static int init_membership_services()
++{
++ int result;
++ struct sockaddr_cl saddr;
++ struct socket *sock;
++
++ init_MUTEX(&hello_task_lock);
++ /* Create a socket to communicate with */
++ result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
++ if (result < 0) {
++ printk(KERN_ERR CMAN_NAME
++ ": Can't create cluster socket for membership services\n");
++ return result;
++ }
++ mem_socket = sock;
++
++ /* Bind to our port */
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++ result =
++ sock->ops->bind(sock, (struct sockaddr *) &saddr, sizeof (saddr));
++ if (result < 0) {
++ printk(KERN_ERR CMAN_NAME
++ ": Can't bind to cluster membership services port\n");
++ sock_release(sock);
++ return result;
++ }
++
++ node_state = STARTING;
++ return 0;
++}
++
++static int send_joinconf()
++{
++ struct sockaddr_cl saddr;
++ int status;
++
++ if (joining_temp_nodeid == 0) {
++ BUG();
++ }
++
++ master_state = MASTER_CONFIRM;
++ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_nodeid = joining_temp_nodeid;
++ status = send_cluster_view(CLUSTER_MEM_JOINCONF, &saddr,
++ MSG_NOACK);
++
++ if (status < 0) {
++ printk("Error %d sending JOINCONF, aborting transition\n", status);
++ end_transition();
++ }
++ return status;
++}
++
++static int send_joinreq(struct sockaddr_cl *addr, int addr_len)
++{
++ char *msgbuf = scratchbuf;
++ struct list_head *addrlist;
++ int ptr = sizeof (struct cl_mem_join_msg);
++ unsigned short num_addr = 0;
++ struct cluster_node_addr *nodeaddr;
++ struct cl_mem_join_msg *msg = (struct cl_mem_join_msg *) msgbuf;
++
++ msg->cmd = CLUSTER_MEM_JOINREQ;
++ msg->votes = votes;
++ msg->expected_votes = cpu_to_le32(expected_votes);
++ msg->major_version = cpu_to_le32(CNXMAN_MAJOR_VERSION);
++ msg->minor_version = cpu_to_le32(CNXMAN_MINOR_VERSION);
++ msg->patch_version = cpu_to_le32(CNXMAN_PATCH_VERSION);
++ msg->config_version = cpu_to_le32(config_version);
++ msg->addr_len = cpu_to_le32(address_length);
++ strcpy(msg->clustername, cluster_name);
++
++ /* Add our addresses */
++ list_for_each(addrlist, &us->addr_list) {
++ nodeaddr = list_entry(addrlist, struct cluster_node_addr, list);
++
++ memcpy(msgbuf + ptr, nodeaddr->addr, address_length);
++ ptr += address_length;
++ num_addr++;
++ }
++ msg->num_addr = cpu_to_le16(num_addr);
++
++ /* And our name */
++ strcpy(msgbuf + ptr, nodename);
++ ptr += strlen(nodename) + 1;
++
++ return kcl_sendmsg(mem_socket, msgbuf, ptr,
++ addr, addr_len, MSG_NOACK);
++}
++
++static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id)
++{
++ struct cl_mem_startack_msg msg;
++
++ msg.cmd = CLUSTER_MEM_STARTACK;
++ msg.generation = cpu_to_le32(cluster_generation);
++ msg.node_id = cpu_to_le32(node_id);
++ msg.highest_node_id = cpu_to_le32(get_highest_nodeid());
++
++ return kcl_sendmsg(mem_socket, &msg, sizeof (msg), addr, addr_len, 0);
++}
++
++static int send_newcluster()
++{
++ char buf[1];
++
++ buf[0] = CLUSTER_MEM_NEWCLUSTER;
++
++ return kcl_sendmsg(mem_socket, buf, 1, NULL, 0,
++ MSG_NOACK);
++}
++
++static int send_hello()
++{
++ struct cl_mem_hello_msg hello_msg;
++ int status;
++
++ hello_msg.cmd = CLUSTER_MEM_HELLO;
++ hello_msg.members = cpu_to_le16(cluster_members);
++ hello_msg.flags = 0;
++ hello_msg.generation = cpu_to_le32(cluster_generation);
++
++ status =
++ kcl_sendmsg(mem_socket, &hello_msg, sizeof (hello_msg), NULL, 0,
++ MSG_NOACK | MSG_ALLINT);
++
++ last_hello = jiffies;
++
++ return status;
++}
++
++/* This is a special HELLO message that requires an ACK. Clients in transition
++ * send these to the master to check it is still alive; if it does not ACK
++ * then cnxman will signal it dead and we can restart the transition */
++static int send_master_hello()
++{
++ struct cl_mem_hello_msg hello_msg;
++ int status;
++ struct sockaddr_cl saddr;
++
++ hello_msg.cmd = CLUSTER_MEM_HELLO;
++ hello_msg.members = cpu_to_le16(cluster_members);
++ hello_msg.flags = 1;
++ hello_msg.generation = cpu_to_le32(cluster_generation);
++
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++ saddr.scl_nodeid = master_node->node_id;
++ status =
++ kcl_sendmsg(mem_socket, &hello_msg, sizeof (hello_msg),
++ &saddr, sizeof (saddr), 0);
++
++ last_hello = jiffies;
++
++ return status;
++}
++
++/* Called when the transition timer has expired, meaning we sent a transition
++ * message that was not ACKed */
++static void trans_timer_expired(unsigned long arg)
++{
++ P_MEMB("Transition timer fired %ld\n", jiffies);
++
++ set_bit(WAKE_FLAG_TRANSTIMER, &wake_flags);
++ wake_up_process(membership_task);
++}
++
++static void hello_timer_expired(unsigned long arg)
++{
++ P_MEMB("Hello timer fired %ld\n", jiffies);
++
++ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
++
++ if (node_state >= TRANSITION) {
++ wake_up_process(hello_task);
++ }
++}
++
++static int wait_for_completion_barrier(void)
++{
++ int status;
++ char barriername[MAX_BARRIER_NAME_LEN];
++
++ sprintf(barriername, MEMBERSHIP_BARRIER_NAME, cluster_generation);
++
++ /* Make sure we all complete together */
++ P_MEMB("Waiting for completion barrier: %d members\n", cluster_members);
++ if ((status =
++ kcl_barrier_register(barriername, 0, cluster_members)) < 0) {
++ printk(CMAN_NAME ": Error registering barrier: %d\n", status);
++ return -1;
++ }
++ kcl_barrier_setattr(barriername, BARRIER_SETATTR_TIMEOUT,
++ cman_config.transition_timeout);
++ status = kcl_barrier_wait(barriername);
++ kcl_barrier_delete(barriername);
++
++ P_MEMB("Completion barrier reached : status = %d\n", status);
++ return status;
++}
++
++/* Called at the end of a state transition when we are the master */
++static int end_transition()
++{
++ struct cl_mem_endtrans_msg msg;
++ int total_votes;
++ int status;
++
++ /* Cancel the timer */
++ del_timer(&transition_timer);
++
++ confirm_joiner();
++
++ quorum = calculate_quorum(leavereason, 0, &total_votes);
++
++ msg.cmd = CLUSTER_MEM_ENDTRANS;
++ msg.quorum = cpu_to_le32(quorum);
++ msg.generation = cpu_to_le32(++cluster_generation);
++ msg.total_votes = cpu_to_le32(total_votes);
++ if (joining_node && transitionreason == TRANS_NEWNODE) {
++ msg.new_node_id = cpu_to_le32(joining_node->node_id);
++ }
++ else {
++ msg.new_node_id = 0;
++ }
++ status = kcl_sendmsg(mem_socket, &msg, sizeof (msg), NULL, 0, 0);
++
++ /* When that's all settled down, do the transition completion barrier */
++ kcl_wait_for_all_acks();
++
++ if (wait_for_completion_barrier() != 0) {
++ P_MEMB("Barrier timed out - restart\n");
++ start_transition(TRANS_RESTART, us);
++ return 0;
++ }
++
++ set_quorate(total_votes);
++
++ notify_listeners();
++ reset_hello_time();
++
++ /* Tell any waiting barriers that we had a transition */
++ check_barrier_returns();
++
++ leavereason = 0;
++ node_state = MEMBER;
++ transition_end_time = jiffies;
++
++ sm_member_update(cluster_is_quorate);
++
++ return 0;
++}
++
++int send_reconfigure(int param, unsigned int value)
++{
++ char msgbuf[66];
++ struct cl_mem_reconfig_msg *msg =
++ (struct cl_mem_reconfig_msg *) &msgbuf;
++
++ if (param == RECONFIG_PARAM_EXPECTED_VOTES && expected_votes > value)
++ expected_votes = value;
++
++ msg->cmd = CLUSTER_MEM_RECONFIG;
++ msg->param = param;
++ msg->value = cpu_to_le32(value);
++
++ return kcl_sendmsg(mem_socket, msgbuf, sizeof (*msg), NULL, 0, 0);
++}
++
++static int send_joinack(char *addr, int addr_len, unsigned char acktype)
++{
++ struct cl_mem_joinack_msg msg;
++
++ msg.cmd = CLUSTER_MEM_JOINACK;
++ msg.acktype = acktype;
++
++ return kcl_sendmsg(mem_socket, &msg, sizeof (msg),
++ (struct sockaddr_cl *)addr, addr_len, MSG_NOACK);
++}
++
++/* Only send a leave message to one node in the cluster so that it can master
++ * the state transition, otherwise we get a "thundering herd" of potential
++ * masters fighting it out */
++int send_leave(unsigned char flags)
++{
++ unsigned char msg[2];
++ struct sockaddr_cl saddr;
++ struct cluster_node *node = NULL;
++ int status;
++
++ if (!mem_socket)
++ return 0;
++
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++
++ /* If we are in transition then use the current master */
++ if (node_state == TRANSITION) {
++ node = master_node;
++ }
++ if (!node) {
++ /* If we are the master or not in transition then pick a node
++ * almost at random */
++ struct list_head *nodelist;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER && !node->us)
++ break;
++ }
++ up(&cluster_members_lock);
++ }
++
++ /* Tell the chosen node, unless we are the only member of the
++ * cluster, in which case there is no-one to tell */
++ if (node && !node->us) {
++ saddr.scl_nodeid = node->node_id;
++
++ P_MEMB("Sending LEAVE to %s\n", node->name);
++ msg[0] = CLUSTER_MEM_LEAVE;
++ msg[1] = flags;
++ status =
++ kcl_sendmsg(mem_socket, msg, 2,
++ &saddr, sizeof (saddr),
++ MSG_NOACK);
++
++ if (status < 0)
++ return status;
++ }
++
++ /* And exit */
++ node_state = LEFT_CLUSTER;
++ wake_up_process(membership_task);
++ return 0;
++}
++
++int send_kill(int nodeid)
++{
++ char killmsg;
++ struct sockaddr_cl saddr;
++
++ killmsg = CLUSTER_MEM_KILL;
++
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++ saddr.scl_nodeid = nodeid;
++ return kcl_sendmsg(mem_socket, &killmsg, 1, &saddr,
++ sizeof (struct sockaddr_cl), MSG_NOACK);
++}
++
++/* Process a message */
++static int do_membership_packet(struct msghdr *msg, int len)
++{
++ int result = -1;
++ unsigned char *buf = msg->msg_iov->iov_base;
++ struct sockaddr_cl *saddr = msg->msg_name;
++ struct cluster_node *node;
++
++ node = find_node_by_nodeid(saddr->scl_nodeid);
++
++ P_MEMB("got membership message : %s, from (%d) %s, len = %d\n",
++ msgname(*buf), saddr->scl_nodeid, node ? node->name : "unknown", len);
++
++ switch (*buf) {
++ case CLUSTER_MEM_JOINREQ:
++ result = do_process_joinreq(msg, len);
++ break;
++
++ case CLUSTER_MEM_LEAVE:
++ if (we_are_a_cluster_member)
++ result = do_process_leave(msg, len);
++ break;
++
++ case CLUSTER_MEM_HELLO:
++ result = do_process_hello(msg, len);
++ break;
++
++ case CLUSTER_MEM_KILL:
++ if (we_are_a_cluster_member)
++ result = do_process_kill(msg, len);
++ break;
++
++ case CLUSTER_MEM_JOINCONF:
++ if (node_state == JOINACK) {
++ do_process_joinconf(msg, len);
++ }
++ break;
++
++ case CLUSTER_MEM_CONFACK:
++ if (node_state == MASTER && master_state == MASTER_CONFIRM) {
++ end_transition();
++ }
++ break;
++
++ case CLUSTER_MEM_MASTERVIEW:
++ if (node_state == TRANSITION)
++ do_process_masterview(msg, len);
++ break;
++
++ case CLUSTER_MEM_JOINACK:
++ if (node_state == JOINING || node_state == JOINWAIT) {
++ do_process_joinack(msg, len);
++ }
++ break;
++ case CLUSTER_MEM_RECONFIG:
++ if (we_are_a_cluster_member) {
++ do_process_reconfig(msg, len);
++ }
++ break;
++
++ case CLUSTER_MEM_STARTTRANS:
++ result = do_process_starttrans(msg, len);
++ break;
++
++ case CLUSTER_MEM_ENDTRANS:
++ result = do_process_endtrans(msg, len);
++ break;
++
++ case CLUSTER_MEM_VIEWACK:
++ result = do_process_viewack(msg, len);
++ break;
++
++ case CLUSTER_MEM_STARTACK:
++ if (node_state == MASTER)
++ result = do_process_startack(msg, len);
++ break;
++
++ case CLUSTER_MEM_NEWCLUSTER:
++ result = do_process_newcluster(msg, len);
++ break;
++
++ case CLUSTER_MEM_NOMINATE:
++ if (node_state != MASTER)
++ result = do_process_nominate(msg, len);
++ break;
++
++ default:
++ printk(KERN_ERR CMAN_NAME
++ ": Unknown membership services message %d received\n",
++ *buf);
++ break;
++
++ }
++ return result;
++}
++
++/* Returns -ve to reject membership of the cluster, 0 to accept membership,
++ * +ve to ignore the request (node is already joining) */
++static int check_duplicate_node(char *name, struct msghdr *msg, int len)
++{
++ struct cluster_node *node;
++ struct sockaddr_cl *saddr = (struct sockaddr_cl *)msg->msg_name;
++ char addr[address_length];
++ int addrlen;
++
++ if (strlen(name) >= MAX_CLUSTER_MEMBER_NAME_LEN)
++ return -3;
++
++ /* See if we already have a cluster member with that name... */
++ node = find_node_by_name(name);
++ if (node && node->state != NODESTATE_DEAD) {
++
++ if ((node->state == NODESTATE_JOINING ||
++ node->state == NODESTATE_REMOTEMEMBER))
++ return +1;
++
++ printk(KERN_WARNING CMAN_NAME
++ ": Rejecting cluster membership application from %s - already have a node with that name\n",
++ name);
++ return -1;
++
++ }
++
++ /* Need to check the node's address too */
++ if (get_addr_from_temp_nodeid(saddr->scl_nodeid, addr, &addrlen) &&
++ (node = find_node_by_addr(addr, addrlen)) &&
++ node->state != NODESTATE_DEAD) {
++
++ if ((node->state == NODESTATE_JOINING ||
++ node->state == NODESTATE_REMOTEMEMBER))
++ return +1;
++
++ printk(KERN_WARNING CMAN_NAME
++ ": Rejecting cluster membership application from %s - already have a node with that address\n",
++ name);
++ return -1;
++ }
++ return 0;
++}
++
++/* Start the state transition */
++static int start_transition(unsigned char reason, struct cluster_node *node)
++{
++ char *startbuf = scratchbuf;
++ struct cl_mem_starttrans_msg *msg =
++ (struct cl_mem_starttrans_msg *) startbuf;
++
++ P_MEMB("Start transition - reason = %d\n", reason);
++
++ /* If this is a restart then zero the counters */
++ if (reason == TRANS_RESTART) {
++ agreeing_nodes = 0;
++ dissenting_nodes = 0;
++ if (node_opinion) {
++ kfree(node_opinion);
++ node_opinion = NULL;
++ }
++ responses_collected = 0;
++ }
++
++ /* If we have timed out too many times then just die */
++ if (reason == TRANS_RESTART
++ && ++transition_restarts > cman_config.transition_restarts) {
++ printk(KERN_WARNING CMAN_NAME
++ ": too many transition restarts - will die\n");
++ send_leave(CLUSTER_LEAVEFLAG_INCONSISTENT);
++ node_state = LEFT_CLUSTER;
++ quit_threads = 1;
++ wake_up_process(membership_task);
++ wake_up_interruptible(&cnxman_waitq);
++ return 0;
++ }
++ if (reason != TRANS_RESTART)
++ transition_restarts = 0;
++
++ /* Only keep the original state transition reason in the global
++ * variable. */
++ if (reason != TRANS_ANOTHERREMNODE && reason != TRANS_NEWMASTER &&
++ reason != TRANS_RESTART && reason != TRANS_DEADMASTER)
++ transitionreason = reason;
++
++ /* Save the info of the requesting node */
++ if (reason == TRANS_NEWNODE)
++ joining_node = node;
++
++ node_state = MASTER;
++ master_state = MASTER_START;
++ responses_collected = 0;
++ responses_expected = cluster_members - 1;
++
++ /* If we are on our own then just do it */
++ if (responses_expected == 0) {
++ P_MEMB("We are on our own...lonely here\n");
++ responses_collected--;
++ do_process_startack(NULL, 0);
++ }
++ else {
++ int ptr = sizeof (struct cl_mem_starttrans_msg);
++ struct list_head *addrlist;
++ unsigned short num_addrs = 0;
++ int flags = 0;
++
++ /* Send the STARTTRANS message */
++ msg->cmd = CLUSTER_MEM_STARTTRANS;
++ msg->reason = reason;
++ msg->votes = node->votes;
++ msg->expected_votes = cpu_to_le32(node->expected_votes);
++ msg->generation = cpu_to_le32(++cluster_generation);
++ msg->nodeid = cpu_to_le32(node->node_id);
++
++ if (reason == TRANS_NEWNODE) {
++ /* Add the addresses */
++ list_for_each(addrlist, &node->addr_list) {
++ struct cluster_node_addr *nodeaddr =
++ list_entry(addrlist,
++ struct cluster_node_addr, list);
++
++ memcpy(startbuf + ptr, nodeaddr->addr,
++ address_length);
++ ptr += address_length;
++ num_addrs++;
++ }
++
++ /* And the name */
++ strcpy(startbuf + ptr, node->name);
++ ptr += strlen(node->name) + 1;
++ }
++
++ /* If another node died then we must queue the STARTTRANS
++ * messages so that membershipd can carry on processing the
++ * other replies */
++ if (reason == TRANS_ANOTHERREMNODE)
++ flags |= MSG_QUEUE;
++
++ msg->num_addrs = cpu_to_le16(num_addrs);
++ kcl_sendmsg(mem_socket, msg, ptr, NULL, 0, flags);
++ }
++ /* Set a timer in case we don't get 'em all back */
++ mod_timer(&transition_timer,
++ jiffies + cman_config.transition_timeout * HZ);
++ return 0;
++}
++
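++/* Roughly, the exchange a master drives from start_transition() above is:
++ * STARTTRANS out to all members and a STARTACK back from each; the master
++ * then circulates its view of the cluster (MASTERVIEW, answered by
++ * VIEWACK); for a new node, JOINCONF goes to the joiner and CONFACK comes
++ * back; finally ENDTRANS goes to everyone, followed by the completion
++ * barrier in end_transition(). */
++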
++/* A node has died - decide what to do */
++void a_node_just_died(struct cluster_node *node)
++{
++ /* If we are not in the context of kmembershipd then stick it on the
++ * list and wake it */
++ if (current != membership_task) {
++ struct cl_new_dead_node *newnode =
++ kmalloc(sizeof (struct cl_new_dead_node), GFP_KERNEL);
++ if (!newnode)
++ return;
++ newnode->node = node;
++ down(&new_dead_node_lock);
++ list_add_tail(&newnode->list, &new_dead_node_list);
++ set_bit(WAKE_FLAG_DEADNODE, &wake_flags);
++ up(&new_dead_node_lock);
++ wake_up_process(membership_task);
++ P_MEMB("Passing dead node %s onto kmembershipd\n", node->name);
++ return;
++ }
++
++ /* Remove it */
++ down(&cluster_members_lock);
++ if (node->state == NODESTATE_MEMBER)
++ cluster_members--;
++ node->state = NODESTATE_DEAD;
++ up(&cluster_members_lock);
++
++ /* Notify listeners */
++ notify_kernel_listeners(DIED, (long) node->node_id);
++
++ /* If we are in normal operation then become master and initiate a
++ * state-transition */
++ if (node_state == MEMBER) {
++ start_transition(TRANS_REMNODE, node);
++ return;
++ }
++
++ /* If we are a slave in transition then see if it's the master that has
++ * failed. If not then ignore it. If it /is/ the master then elect a
++ * new one */
++ if (node_state == TRANSITION) {
++ if (master_node == node) {
++ if (elect_master(&node)) {
++ del_timer(&transition_timer);
++ node_state = MASTER;
++
++ start_transition(TRANS_DEADMASTER, master_node);
++ }
++ else {
++ /* Someone else can be in charge - phew! */
++ }
++ }
++ return;
++ }
++
++ /* If we are the master then we need to start the transition all over
++ * again */
++ if (node_state == MASTER) {
++ /* Cancel timer */
++ del_timer(&transition_timer);
++
++ /* Restart the transition */
++ start_transition(TRANS_ANOTHERREMNODE, node);
++ transition_restarts = 0;
++ return;
++ }
++}
++
++/*
++ * Build up and send a set of messages consisting of the whole cluster view.
++ * The first byte is the command (cmd as passed in), the second is a flag byte:
++ * bit 0 is set in the first message, bit 1 in the last (NOTE both may be set
++ * if this is the only message sent). The rest is a set of packed node entries,
++ * which are NOT split over packets. */
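++/* For reference, each packed node entry built below is laid out as: 1 byte
++ * of name length, the name bytes, a little-endian 16-bit address count, the
++ * addresses themselves (address_length bytes each), 1 byte of votes, then a
++ * little-endian 32-bit expected_votes and a little-endian 32-bit node_id. */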
++static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
++ unsigned int flags)
++{
++ int ptr = 2;
++ int len;
++ int status = 0;
++ int last_node_start = 2;
++ unsigned char first_packet_flag = 1;
++ struct list_head *nodelist;
++ struct list_head *temp;
++ struct cluster_node *node;
++ char *message = scratchbuf;
++
++ message[0] = cmd;
++
++ down(&cluster_members_lock);
++ list_for_each_safe(nodelist, temp, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER) {
++ unsigned int evotes;
++ unsigned int node_id;
++ unsigned short num_addrs = 0;
++ unsigned short num_addrs_le;
++ struct list_head *addrlist;
++
++ last_node_start = ptr;
++
++ message[ptr++] = len = strlen(node->name);
++ strcpy(&message[ptr], node->name);
++ ptr += len;
++
++ /* Count the number of addresses this node has */
++ list_for_each(addrlist, &node->addr_list) {
++ num_addrs++;
++ }
++
++ num_addrs_le = cpu_to_le16(num_addrs);
++ memcpy(&message[ptr], &num_addrs_le, sizeof (short));
++ ptr += sizeof (short);
++
++ /* Pack em in */
++ list_for_each(addrlist, &node->addr_list) {
++
++ struct cluster_node_addr *nodeaddr =
++ list_entry(addrlist,
++ struct cluster_node_addr, list);
++
++ memcpy(&message[ptr], nodeaddr->addr,
++ address_length);
++ ptr += address_length;
++ }
++
++ message[ptr++] = node->votes;
++
++ evotes = cpu_to_le32(node->expected_votes);
++ memcpy(&message[ptr], &evotes, sizeof (int));
++ ptr += sizeof (int);
++
++ node_id = cpu_to_le32(node->node_id);
++ memcpy(&message[ptr], &node_id, sizeof (int));
++ ptr += sizeof (int);
++
++ /* If the block is full then send it */
++ if (ptr > MAX_CLUSTER_MESSAGE) {
++ message[1] = first_packet_flag;
++
++ up(&cluster_members_lock);
++ status =
++ kcl_sendmsg(mem_socket, message,
++ last_node_start, saddr,
++ saddr ? sizeof (struct sockaddr_cl) : 0,
++ flags);
++
++ if (status < 0)
++ goto send_fail;
++
++ down(&cluster_members_lock);
++
++ first_packet_flag = 0;
++ /* Copy the overflow back to the start of the
++ * buffer for the next send */
++ memcpy(&message[2], &message[last_node_start],
++ ptr - last_node_start);
++ ptr = ptr - last_node_start + 2;
++ }
++ }
++ }
++
++ up(&cluster_members_lock);
++
++ message[1] = first_packet_flag | 2; /* The last may also be first */
++ status = kcl_sendmsg(mem_socket, message, ptr,
++ saddr, saddr ? sizeof (struct sockaddr_cl) : 0,
++ flags);
++ send_fail:
++
++ return status;
++}
++
++/* Make the JOINING node into a MEMBER */
++static void confirm_joiner(void)
++{
++ if (joining_node && joining_node->state == NODESTATE_JOINING) {
++ down(&cluster_members_lock);
++ joining_node->state = NODESTATE_MEMBER;
++ cluster_members++;
++ up(&cluster_members_lock);
++ }
++ remove_temp_nodeid(joining_temp_nodeid);
++ joining_temp_nodeid = 0;
++}
++
++/* Reset HELLO timers for all nodes. We do this after a state transition as we
++ * have had HELLOs disabled during the transition; if we don't do this the
++ * nodes will go on an uncontrolled culling-spree afterwards */
++static void reset_hello_time(void)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER) {
++ node->last_hello = jiffies;
++ }
++
++ }
++ up(&cluster_members_lock);
++}
++
++/* Calculate the new quorum and return the value. Do *not* set it in here, as
++ * cnxman calls this to check whether a new expected_votes value is valid. It
++ * (optionally) returns the total number of votes in the cluster */
++int calculate_quorum(int allow_decrease, int max_expected, int *ret_total_votes)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ unsigned int total_votes = 0;
++ unsigned int highest_expected = 0;
++ unsigned int newquorum, q1, q2;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER) {
++ highest_expected =
++ max(highest_expected, node->expected_votes);
++ total_votes += node->votes;
++ }
++ }
++ up(&cluster_members_lock);
++ if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
++ total_votes += quorum_device->votes;
++
++ if (max_expected > 0)
++ highest_expected = max_expected;
++
++ /* This quorum calculation is taken from the OpenVMS Cluster Systems
++ * manual, but, then, you guessed that, didn't you? */
++ q1 = (highest_expected + 2) / 2;
++ q2 = (total_votes + 2) / 2;
++ newquorum = max(q1, q2);
++
++ /* Normally quorum never decreases but the system administrator can
++ * force it down by setting expected votes to a maximum value */
++ if (!allow_decrease)
++ newquorum = max(quorum, newquorum);
++
++ /* The special two_node mode allows each of the two nodes to retain
++ * quorum if the other fails. Only one of the two should live past
++ * fencing (as both nodes try to fence each other in split-brain.) */
++ if (two_node)
++ newquorum = 1;
++
++ if (ret_total_votes)
++ *ret_total_votes = total_votes;
++ return newquorum;
++}
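++
++/* A worked example (illustrative numbers, not from the original source): with
++ * five members each holding one vote and expected_votes of 5 everywhere,
++ * q1 = (5 + 2) / 2 = 3 and q2 = (5 + 2) / 2 = 3, so quorum is 3 and the
++ * cluster stays quorate after losing up to two nodes. */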
++
++/* Recalculate cluster quorum, set quorate and notify changes */
++void recalculate_quorum(int allow_decrease)
++{
++ int total_votes;
++
++ quorum = calculate_quorum(allow_decrease, 0, &total_votes);
++ set_quorate(total_votes);
++ notify_listeners();
++}
++
++/* Add new node address to an existing node */
++int add_node_address(struct cluster_node *node, unsigned char *addr, int len)
++{
++ struct cluster_node_addr *newaddr;
++
++ newaddr = kmalloc(sizeof (struct cluster_node_addr), GFP_KERNEL);
++ if (!newaddr)
++ return -1;
++
++ memcpy(newaddr->addr, addr, len);
++ newaddr->addr_len = len;
++ list_add_tail(&newaddr->list, &node->addr_list);
++
++ return 0;
++}
++
++static struct cluster_node *add_new_node(char *name, unsigned char votes,
++ unsigned int expected_votes,
++ int node_id, int state)
++{
++ struct cluster_node *newnode;
++
++ /* Look for a dead node with this name */
++ newnode = find_node_by_name(name);
++
++ /* Is it already joining? */
++ if (newnode && newnode->state == NODESTATE_JOINING)
++ return NULL;
++
++ /* Update existing information */
++ if (newnode && newnode->state == NODESTATE_DEAD) {
++ newnode->last_hello = jiffies;
++ newnode->votes = votes;
++ newnode->expected_votes = expected_votes;
++ newnode->state = state;
++ newnode->us = 0;
++ newnode->leave_reason = 0;
++ newnode->last_seq_recv = 0;
++ newnode->last_seq_acked = 0;
++ newnode->last_seq_sent = 0;
++ newnode->incarnation++;
++ /* Don't overwrite the node ID */
++
++ if (state == NODESTATE_MEMBER) {
++ down(&cluster_members_lock);
++ cluster_members++;
++ up(&cluster_members_lock);
++ }
++
++ printk(KERN_INFO CMAN_NAME ": node %s rejoining\n", name);
++ return newnode;
++ }
++
++ newnode = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
++ if (!newnode)
++ goto alloc_err;
++
++ memset(newnode, 0, sizeof (struct cluster_node));
++ newnode->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
++ if (!newnode->name)
++ goto alloc_err1;
++
++ strcpy(newnode->name, name);
++ newnode->last_hello = jiffies;
++ newnode->votes = votes;
++ newnode->expected_votes = expected_votes;
++ newnode->state = state;
++ newnode->node_id = node_id;
++ newnode->us = 0;
++ newnode->leave_reason = 0;
++ newnode->last_seq_recv = 0;
++ newnode->last_seq_acked = 0;
++ newnode->last_seq_sent = 0;
++ newnode->incarnation = 0;
++ INIT_LIST_HEAD(&newnode->addr_list);
++ set_nodeid(newnode, node_id);
++
++ /* Add the new node to the list */
++ down(&cluster_members_lock);
++ list_add(&newnode->list, &cluster_members_list);
++ if (state == NODESTATE_MEMBER)
++ cluster_members++;
++ up(&cluster_members_lock);
++
++ printk(KERN_INFO CMAN_NAME ": got node %s\n", name);
++ return newnode;
++
++ alloc_err1:
++ kfree(newnode);
++ alloc_err:
++ send_leave(CLUSTER_LEAVEFLAG_PANIC);
++
++ printk(KERN_CRIT CMAN_NAME
++ ": Cannot allocate memory for new cluster node %s\n", name);
++
++ panic("cluster memory allocation failed");
++
++ return NULL;
++}
++
++/* Remove node from a STARTTRANS message */
++static struct cluster_node *remove_node(int nodeid)
++{
++ struct cluster_node *node = find_node_by_nodeid(nodeid);
++
++ if (node && node->state == NODESTATE_MEMBER) {
++ P_MEMB("starttrans removes node %s\n", node->name);
++ down(&cluster_members_lock);
++ node->state = NODESTATE_DEAD;
++ cluster_members--;
++ up(&cluster_members_lock);
++
++ notify_kernel_listeners(DIED, (long) nodeid);
++
++ /* If this node is us then go quietly */
++ if (node->us) {
++ printk(KERN_INFO CMAN_NAME
++ ": killed by STARTTRANS or NOMINATE\n");
++ quit_threads = 1;
++ wake_up_process(membership_task);
++ wake_up_interruptible(&cnxman_waitq);
++ }
++ }
++ return node;
++}
++
++/* Add a node from a STARTTRANS or NOMINATE message */
++static void add_node_from_starttrans(struct msghdr *msg, int len)
++{
++ /* Add the new node but don't fill in the ID until the master has
++ * confirmed it */
++ struct cl_mem_starttrans_msg *startmsg =
++ (struct cl_mem_starttrans_msg *) msg->msg_iov->iov_base;
++ char *msgbuf = (char *) msg->msg_iov->iov_base;
++ int ptr = sizeof (struct cl_mem_starttrans_msg);
++ char *name =
++ msgbuf + ptr + le16_to_cpu(startmsg->num_addrs) * address_length;
++ int i;
++
++ joining_node = add_new_node(name, startmsg->votes,
++ le32_to_cpu(startmsg->expected_votes),
++ 0, NODESTATE_JOINING);
++
++ /* add_new_node returns NULL if the node already exists */
++ if (!joining_node)
++ joining_node = find_node_by_name(name);
++
++ /* Add the node's addresses */
++ if (list_empty(&joining_node->addr_list)) {
++ for (i = 0; i < le16_to_cpu(startmsg->num_addrs); i++) {
++ add_node_address(joining_node, msgbuf + ptr, address_length);
++ ptr += address_length;
++ }
++ }
++}
++
++/* We have been nominated as master for a transition */
++static int do_process_nominate(struct msghdr *msg, int len)
++{
++ struct cl_mem_starttrans_msg *startmsg =
++ (struct cl_mem_starttrans_msg *)msg->msg_iov->iov_base;
++ struct cluster_node *node = NULL;
++ char *nodeaddr = msg->msg_iov->iov_base + sizeof(struct cl_mem_starttrans_msg);
++
++ P_MEMB("nominate reason is %d\n", startmsg->reason);
++
++ if (startmsg->reason == TRANS_REMNODE) {
++ node = remove_node(le32_to_cpu(startmsg->nodeid));
++ }
++
++ if (startmsg->reason == TRANS_NEWNODE) {
++ add_node_from_starttrans(msg, len);
++ node = joining_node;
++ /* Make sure we have a temp nodeid for the new node */
++ joining_temp_nodeid = new_temp_nodeid(nodeaddr,
++ address_length);
++ }
++
++ /* This should be a TRANS_CHECK but start_transition needs some node
++ * info */
++ if (node == NULL)
++ node = us;
++ start_transition(startmsg->reason, node);
++ return 0;
++}
++
++/* Got a STARTACK response from a node */
++static int do_process_startack(struct msghdr *msg, int len)
++{
++ if (node_state != MASTER && master_state != MASTER_START) {
++ P_MEMB("Got StartACK when not in MASTER_STARTING substate\n");
++ return 0;
++ }
++
++ /* msg is NULL if we are called directly from start_transition */
++ if (msg) {
++ struct cl_mem_startack_msg *ackmsg = msg->msg_iov->iov_base;
++
++ /* Ignore any messages with old generation numbers in them */
++ if (le32_to_cpu(ackmsg->generation) != cluster_generation) {
++ P_MEMB("Got old generation START-ACK msg - ignoring\n");
++ return 0;
++ }
++ }
++
++ /* If the node_id is non-zero then use it. */
++ if (transitionreason == TRANS_NEWNODE && joining_node && msg) {
++ struct cl_mem_startack_msg *ackmsg = msg->msg_iov->iov_base;
++
++ if (ackmsg->node_id) {
++ set_nodeid(joining_node, le32_to_cpu(ackmsg->node_id));
++ }
++ highest_nodeid =
++ max(highest_nodeid, le32_to_cpu(ackmsg->highest_node_id));
++ P_MEMB("Node id = %d, highest node id = %d\n",
++ le32_to_cpu(ackmsg->node_id),
++ le32_to_cpu(ackmsg->highest_node_id));
++ }
++
++ /* If we have all the responses in then move to the next stage */
++ if (++responses_collected == responses_expected) {
++
++ /* If the new node has no node_id (i.e. nobody in the cluster has
++ * heard of it before) then assign it a new one */
++ if (transitionreason == TRANS_NEWNODE && joining_node) {
++ highest_nodeid =
++ max(highest_nodeid, get_highest_nodeid());
++ if (joining_node->node_id == 0) {
++ set_nodeid(joining_node, ++highest_nodeid);
++ }
++ P_MEMB("nodeIDs: new node: %d, highest: %d\n",
++ joining_node->node_id, highest_nodeid);
++ }
++
++ /* Behave a little differently if we are on our own */
++ if (cluster_members == 1) {
++ if (transitionreason == TRANS_NEWNODE) {
++ /* If the cluster is just us then confirm at
++ * once */
++ joinconf_count = 0;
++ mod_timer(&transition_timer,
++ jiffies +
++ cman_config.joinconf_timeout * HZ);
++ send_joinconf();
++ return 0;
++ }
++ else { /* Node leaving the cluster */
++ recalculate_quorum(leavereason);
++ leavereason = 0;
++ node_state = MEMBER;
++ }
++ }
++ else {
++ master_state = MASTER_COLLECT;
++ responses_collected = 0;
++ responses_expected = cluster_members - 1;
++ P_MEMB("Sending MASTERVIEW: expecting %d responses\n",
++ responses_expected);
++
++ send_cluster_view(CLUSTER_MEM_MASTERVIEW, NULL, 0);
++
++ /* Set a timer in case we don't get 'em all back */
++ mod_timer(&transition_timer,
++ jiffies +
++ cman_config.transition_timeout * HZ);
++ }
++ }
++ return 0;
++}
++
++/* Got a VIEWACK response from a node */
++static int do_process_viewack(struct msghdr *msg, int len)
++{
++ char *reply = msg->msg_iov->iov_base;
++ struct sockaddr_cl *saddr = msg->msg_name;
++
++ if (master_state != MASTER_COLLECT) {
++ printk(KERN_INFO CMAN_NAME
++ ": got VIEWACK while not in state transition\n");
++ return 0;
++ }
++
++ if (node_opinion == NULL) {
++ node_opinion =
++ kmalloc((1 + highest_nodeid) * sizeof (uint8_t), GFP_KERNEL);
++ if (!node_opinion) {
++ panic(": malloc agree/dissent failed\n");
++ }
++ memset(node_opinion, 0, (1 + highest_nodeid) * sizeof (uint8_t));
++ }
++
++ /* Keep a list of agreeing and dissenting nodes */
++ if (reply[1] == 1) {
++ /* ACK - remote node agrees with me */
++ P_MEMB("Node agrees\n");
++ node_opinion[saddr->scl_nodeid] = OPINION_AGREE;
++ agreeing_nodes++;
++ }
++ else {
++ /* Remote node disagrees */
++ P_MEMB("Node disagrees\n");
++ node_opinion[saddr->scl_nodeid] = OPINION_DISAGREE;
++ dissenting_nodes++;
++ }
++
++ P_MEMB("got %d responses, expected %d\n", responses_collected + 1,
++ responses_expected);
++
++ /* Are all the results in yet ? */
++ if (++responses_collected == responses_expected) {
++ del_timer(&transition_timer);
++
++ P_MEMB("The results are in: %d agree, %d dissent\n",
++ agreeing_nodes, dissenting_nodes);
++
++ if (agreeing_nodes > dissenting_nodes) {
++ /* Kill dissenting nodes */
++ int i;
++
++ /* node_opinion[] is indexed by node ID, which can
++ * exceed the number of responses collected */
++ for (i = 1; i <= highest_nodeid; i++) {
++ if (node_opinion[i] == OPINION_DISAGREE)
++ send_kill(i);
++ }
++ }
++ else {
++ /* We must leave the cluster as we are in a minority,
++ * the rest of them can fight it out amongst
++ * themselves. */
++ send_leave(CLUSTER_LEAVEFLAG_INCONSISTENT);
++
++ agreeing_nodes = 0;
++ dissenting_nodes = 0;
++ kfree(node_opinion);
++ node_opinion = NULL;
++ node_state = LEFT_CLUSTER;
++ quit_threads = 1;
++ wake_up_process(membership_task);
++ wake_up_interruptible(&cnxman_waitq);
++ return -1;
++ }
++
++ /* Reset counters */
++ agreeing_nodes = 0;
++ dissenting_nodes = 0;
++ kfree(node_opinion);
++ node_opinion = NULL;
++
++ /* Confirm new node */
++ if (transitionreason == TRANS_NEWNODE) {
++ mod_timer(&transition_timer,
++ jiffies + cman_config.joinconf_timeout * HZ);
++ joinconf_count = 0;
++ send_joinconf();
++ return 0;
++ }
++
++ master_state = MASTER_COMPLETE;
++
++ end_transition();
++ }
++
++ return 0;
++}
++
++/* Got an ENDTRANS message */
++static int do_process_endtrans(struct msghdr *msg, int len)
++{
++ struct cl_mem_endtrans_msg *endmsg =
++ (struct cl_mem_endtrans_msg *) msg->msg_iov->iov_base;
++ struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
++
++ /* Someone else's state transition */
++ if (node_state != TRANSITION && node_state != JOINACK)
++ return 0;
++
++ /* Check we got it from the MASTER node */
++ if (master_node && master_node->node_id != saddr->scl_nodeid) {
++ printk(KERN_INFO CMAN_NAME
++ ": Got ENDTRANS from a node not the master: master: %d, sender: %d\n",
++ master_node->node_id, saddr->scl_nodeid);
++ return 0;
++ }
++
++ del_timer(&transition_timer);
++
++ /* Set node ID on new node */
++ if (endmsg->new_node_id) {
++ set_nodeid(joining_node, le32_to_cpu(endmsg->new_node_id));
++ P_MEMB("new node %s has ID %d\n", joining_node->name,
++ joining_node->node_id);
++ }
++
++ node_state = TRANSITION_COMPLETE;
++
++ /* Need to set this here or the barrier code will reject us if we've
++ * just joined */
++ we_are_a_cluster_member = TRUE;
++
++ confirm_joiner();
++ cluster_generation = le32_to_cpu(endmsg->generation);
++
++ if (wait_for_completion_barrier() != 0) {
++ P_MEMB("Barrier timed out - restart\n");
++ node_state = TRANSITION;
++ mod_timer(&transition_timer,
++ jiffies + cman_config.transition_timeout * HZ);
++ return 0;
++ }
++
++ quorum = le32_to_cpu(endmsg->quorum);
++ set_quorate(le32_to_cpu(endmsg->total_votes));
++
++ /* Tell any waiting barriers that we had a transition */
++ check_barrier_returns();
++
++ /* Clear the master node */
++ master_node = NULL;
++
++ node_state = MEMBER;
++
++ /* Notify other listeners that transition has completed */
++ notify_listeners();
++ reset_hello_time();
++ transition_end_time = jiffies;
++
++ sm_member_update(cluster_is_quorate);
++ return 0;
++}
++
++/* Turn a STARTTRANS message into NOMINATE and send it to the new master */
++static int send_nominate(struct cl_mem_starttrans_msg *startmsg, int msglen,
++ int nodeid)
++{
++ struct sockaddr_cl maddr;
++
++ maddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++ maddr.scl_family = AF_CLUSTER;
++ maddr.scl_nodeid = nodeid;
++
++ startmsg->cmd = CLUSTER_MEM_NOMINATE;
++ return kcl_sendmsg(mem_socket, startmsg, msglen,
++ &maddr, sizeof (maddr), 0);
++}
++
++/* Got a STARTTRANS message */
++static int do_process_starttrans(struct msghdr *msg, int len)
++{
++ struct cl_mem_starttrans_msg *startmsg =
++ (struct cl_mem_starttrans_msg *) msg->msg_iov->iov_base;
++ struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
++ struct cluster_node *node;
++ unsigned int newgen = le32_to_cpu(startmsg->generation);
++
++ /* Got a WHAT from WHOM? */
++ node = find_node_by_nodeid(saddr->scl_nodeid);
++ if (!node || node->state != NODESTATE_MEMBER)
++ return 0;
++
++ /* Someone else's state transition */
++ if (node_state != MEMBER &&
++ node_state != TRANSITION && node_state != MASTER)
++ return 0;
++
++ /* Ignore old generation STARTTRANS messages */
++ if ((newgen < cluster_generation) ||
++ (newgen == 0xFFFFFFFF && cluster_generation == 0)) {
++ P_MEMB("Ignoring STARTTRANS with old generation number\n");
++ return 0;
++ }
++
++ P_MEMB("Got starttrans: newgen = %d, oldgen = %d, reason = %d\n",
++ newgen, cluster_generation, startmsg->reason);
++
++ /* Up the generation number */
++ cluster_generation = newgen;
++
++ /* If we are also a master then decide between us */
++ if (node_state == MASTER) {
++
++ /* See if we really want the responsibility of being master */
++ if (elect_master(&node)) {
++
++ /* I reluctantly accept this position of responsibility
++ */
++ P_MEMB("I elected myself master\n");
++
++ /* start_transition will re-establish this */
++ del_timer(&transition_timer);
++
++ start_transition(TRANS_NEWMASTER, node);
++ return 0;
++ }
++ else {
++ /* Back down */
++ P_MEMB("Backing down from MASTER status\n");
++ master_node = node;
++ node_state = MEMBER;
++
++ /* If we were bringing a new node into the cluster then
++ * we will have to abandon that now and tell the new
++ * node to try again later */
++ if (transitionreason == TRANS_NEWNODE && joining_node) {
++ struct cluster_node_addr *first_addr =
++ (struct cluster_node_addr *) joining_node->
++ addr_list.next;
++
++ P_MEMB("Postponing membership of node %s\n",
++ joining_node->name);
++ send_joinack(first_addr->addr, address_length,
++ JOINACK_TYPE_WAIT);
++
++ /* Not dead, just sleeping */
++ joining_node->state = NODESTATE_DEAD;
++ joining_node = NULL;
++ }
++
++ /* If the new master is not the node we just got the
++ * STARTTRANS from (it cannot be us here, since we lost
++ * the election) then make sure it knows it has to be
++ * master */
++ if (saddr->scl_nodeid != node->node_id) {
++ send_nominate(startmsg, len, node->node_id);
++ return 0;
++ }
++
++ /* Fall through into MEMBER code below if we are
++ * obeying the STARTTRANS we just received */
++ }
++ }
++
++ /* Do non-MASTER STARTTRANS bits */
++ if (node_state == MEMBER) {
++ int ptr = sizeof (struct cl_mem_starttrans_msg);
++ int node_id = 0;
++
++ P_MEMB("Normal transition start\n");
++
++ /* If the master is adding a new node and we know its node ID
++ * then ACK with it. */
++ if (startmsg->reason == TRANS_NEWNODE) {
++ struct cluster_node *node =
++ find_node_by_addr((char *) startmsg + ptr,
++ address_length);
++ if (node)
++ node_id = node->node_id;
++ }
++
++ /* Save the master info */
++ master_node = find_node_by_nodeid(saddr->scl_nodeid);
++ node_state = TRANSITION;
++
++ if (startmsg->reason == TRANS_NEWNODE) {
++ add_node_from_starttrans(msg, len);
++ }
++
++ if (startmsg->reason == TRANS_REMNODE ||
++ startmsg->reason == TRANS_ANOTHERREMNODE) {
++ remove_node(le32_to_cpu(startmsg->nodeid));
++ }
++
++ send_startack(saddr, msg->msg_namelen,
++ node_id);
++
++ /* Establish timer in case the master dies */
++ mod_timer(&transition_timer,
++ jiffies + cman_config.transition_timeout * HZ);
++
++ return 0;
++ }
++
++ /* We are in transition but this may be a restart */
++ if (node_state == TRANSITION) {
++
++ master_node = find_node_by_nodeid(saddr->scl_nodeid);
++ send_startack(saddr, msg->msg_namelen, 0);
++
++ /* Is it a new joining node? This happens if a master is
++ * usurped */
++ if (startmsg->reason == TRANS_NEWNODE) {
++ struct cluster_node *oldjoin = joining_node;
++
++ add_node_from_starttrans(msg, len);
++
++ /* If this is a different node joining than the one we
++ * were previously joining (probably because the master is
++ * a nominated one) then mark our "old" joiner as DEAD.
++ * The original master will already have told the node
++ * to go back into JOINWAIT state */
++ if (oldjoin && oldjoin != joining_node
++ && oldjoin->state == NODESTATE_JOINING)
++ oldjoin->state = NODESTATE_DEAD;
++ }
++
++ /* Is it a new master node? */
++ if (startmsg->reason == TRANS_NEWMASTER ||
++ startmsg->reason == TRANS_DEADMASTER) {
++ P_MEMB("starttrans %s, node=%d\n",
++ startmsg->reason ==
++ TRANS_NEWMASTER ? "NEWMASTER" : "DEADMASTER",
++ le32_to_cpu(startmsg->nodeid));
++
++ /* If the old master has died then remove it */
++ node =
++ find_node_by_nodeid(le32_to_cpu(startmsg->nodeid));
++
++ if (startmsg->reason == TRANS_DEADMASTER &&
++ node && node->state == NODESTATE_MEMBER) {
++ down(&cluster_members_lock);
++ node->state = NODESTATE_DEAD;
++ cluster_members--;
++ up(&cluster_members_lock);
++ }
++
++ /* Store new master */
++ master_node = find_node_by_nodeid(saddr->scl_nodeid);
++ }
++
++ /* Another node has died (or been killed) */
++ if (startmsg->reason == TRANS_ANOTHERREMNODE) {
++ /* Remove new dead node */
++ node =
++ find_node_by_nodeid(le32_to_cpu(startmsg->nodeid));
++ if (node && node->state == NODESTATE_MEMBER) {
++ down(&cluster_members_lock);
++ node->state = NODESTATE_DEAD;
++ cluster_members--;
++ up(&cluster_members_lock);
++ }
++ }
++ /* Restart the timer */
++ del_timer(&transition_timer);
++ mod_timer(&transition_timer,
++ jiffies + cman_config.transition_timeout * HZ);
++ }
++
++ return 0;
++}
++
++/* Change a cluster parameter */
++static int do_process_reconfig(struct msghdr *msg, int len)
++{
++ struct cl_mem_reconfig_msg *confmsg;
++ struct sockaddr_cl *saddr = msg->msg_name;
++ struct cluster_node *node;
++ unsigned int val;
++
++ if (len < sizeof(struct cl_mem_reconfig_msg))
++ return -1;
++
++ confmsg = (struct cl_mem_reconfig_msg *) msg->msg_iov->iov_base;
++ val = le32_to_cpu(confmsg->value);
++
++ switch (confmsg->param) {
++
++ case RECONFIG_PARAM_EXPECTED_VOTES:
++ /* Clamp the expected_votes of any node that is higher
++ * than the new value */
++ if (val > 0) {
++ struct cluster_node *node;
++
++ down(&cluster_members_lock);
++ list_for_each_entry(node, &cluster_members_list, list) {
++ if (node->state == NODESTATE_MEMBER &&
++ node->expected_votes > val) {
++ node->expected_votes = val;
++ }
++ }
++ up(&cluster_members_lock);
++ if (expected_votes > val)
++ expected_votes = val;
++ }
++ recalculate_quorum(1); /* Allow decrease */
++ sm_member_update(cluster_is_quorate);
++ break;
++
++ case RECONFIG_PARAM_NODE_VOTES:
++ node = find_node_by_nodeid(saddr->scl_nodeid);
++ if (node)
++ node->votes = val;
++ recalculate_quorum(1); /* Allow decrease */
++ sm_member_update(cluster_is_quorate);
++ break;
++
++ case RECONFIG_PARAM_CONFIG_VERSION:
++ config_version = val;
++ break;
++
++ default:
++ printk(KERN_INFO CMAN_NAME
++ ": got unknown parameter in reconfigure message. %d\n",
++ confmsg->param);
++ break;
++ }
++ return 0;
++}
++
++/* Response from master node */
++static int do_process_joinack(struct msghdr *msg, int len)
++{
++ struct cl_mem_joinack_msg *ackmsg = msg->msg_iov->iov_base;
++
++ join_time = jiffies;
++ if (ackmsg->acktype == JOINACK_TYPE_OK) {
++ node_state = JOINACK;
++ }
++
++ if (ackmsg->acktype == JOINACK_TYPE_NAK) {
++ printk(KERN_WARNING CMAN_NAME
++ ": Cluster membership rejected\n");
++ P_MEMB("Got JOINACK NACK\n");
++ node_state = REJECTED;
++ }
++
++ if (ackmsg->acktype == JOINACK_TYPE_WAIT) {
++ P_MEMB("Got JOINACK WAIT\n");
++ node_state = JOINWAIT;
++ joinwait_time = jiffies;
++ }
++
++ return 0;
++}
++
++/* Request to join the cluster. This makes us the master for this state
++ * transition */
++static int do_process_joinreq(struct msghdr *msg, int len)
++{
++ int status;
++ static unsigned long last_joinreq = 0;
++ static char last_name[MAX_CLUSTER_MEMBER_NAME_LEN];
++ struct cl_mem_join_msg *joinmsg = msg->msg_iov->iov_base;
++ struct cluster_node *node;
++
++ /* If we are in a state transition then tell the new node to wait a bit
++ * longer */
++ if (node_state != MEMBER) {
++ if (node_state == MASTER || node_state == TRANSITION) {
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_WAIT);
++ }
++ return 0;
++ }
++
++ /* Check version number */
++ if (le32_to_cpu(joinmsg->major_version) == CNXMAN_MAJOR_VERSION) {
++ char *ptr = (char *) joinmsg;
++ char *name;
++
++ /* Sanity-check the num_addrs field otherwise we could oops */
++ if (le16_to_cpu(joinmsg->num_addr) * address_length > len) {
++ printk(KERN_WARNING CMAN_NAME
++ ": num_addr in JOIN-REQ message is rubbish: %d\n",
++ le16_to_cpu(joinmsg->num_addr));
++ return 0;
++ }
++
++ /* Check the cluster name matches */
++ if (strcmp(cluster_name, joinmsg->clustername)) {
++ printk(KERN_WARNING CMAN_NAME
++ ": attempt to join with cluster name '%s' refused\n",
++ joinmsg->clustername);
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ ptr += sizeof (*joinmsg);
++ name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
++
++ /* Check we are not exceeding the maximum number of nodes */
++ if (cluster_members > cman_config.max_nodes) {
++ printk(KERN_WARNING CMAN_NAME
++ ": Join request from %s rejected, exceeds maximum number of nodes\n",
++ name);
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ /* Check that we don't exceed the two_node limit */
++ if (two_node && cluster_members == 2) {
++ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
++ "rejected, exceeds two node limit\n", name);
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ if (le16_to_cpu(joinmsg->config_version) != config_version) {
++ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
++ "rejected, config version local %u remote %u\n",
++ name, config_version,
++ le16_to_cpu(joinmsg->config_version));
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ /* If these don't match then I don't know how the message
++ arrived! However, I can't take the chance */
++ if (le32_to_cpu(joinmsg->addr_len) != address_length) {
++ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
++ "rejected, address length local: %u remote %u\n",
++ name, address_length,
++ le32_to_cpu(joinmsg->addr_len));
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ /* Duplicate checking: Because joining messages do not have
++ * sequence numbers we may get as many JOINREQ messages as we
++ * have interfaces. This bit of code here just checks for
++ * JOINREQ messages that come in from the same node in a small
++ * period of time and removes the duplicates */
++ if (time_before(jiffies, last_joinreq + 10 * HZ)
++ && strcmp(name, last_name) == 0) {
++ return 0;
++ }
++
++ /* Do we already know about this node? */
++ status = check_duplicate_node(name, msg, len);
++
++ if (status < 0) {
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ /* OK, you can be in my gang */
++ if (status == 0) {
++ int i;
++ struct sockaddr_cl *addr = msg->msg_name;
++
++ last_joinreq = jiffies;
++ strcpy(last_name, name);
++
++ node =
++ add_new_node(name, joinmsg->votes,
++ le32_to_cpu(joinmsg->expected_votes),
++ 0, NODESTATE_JOINING);
++
++ /* Add the node's addresses */
++ if (list_empty(&node->addr_list)) {
++ for (i = 0; i < le16_to_cpu(joinmsg->num_addr);
++ i++) {
++ add_node_address(node, ptr, address_length);
++ ptr += address_length;
++ }
++ }
++
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_OK);
++ joining_node = node;
++ joining_temp_nodeid = addr->scl_nodeid;
++
++ /* Start the state transition */
++ start_transition(TRANS_NEWNODE, node);
++ }
++ }
++ else {
++ /* Version number mismatch, don't use any part of the message
++ * other than the version numbers as things may have moved */
++ char buf[MAX_ADDR_PRINTED_LEN];
++
++ printk(KERN_INFO CMAN_NAME
++ ": Got join message from node running incompatible software. (us: %d.%d.%d, them: %d.%d.%d) addr: %s\n",
++ CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
++ CNXMAN_PATCH_VERSION,
++ le32_to_cpu(joinmsg->major_version),
++ le32_to_cpu(joinmsg->minor_version),
++ le32_to_cpu(joinmsg->patch_version),
++ print_addr(msg->msg_name, msg->msg_namelen, buf));
++
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ return 0;
++}
++
++/* A simple function to invent a small number based
++ on the node name */
++static int node_hash(void)
++{
++ int i;
++ int value = 0;
++
++ for (i=0; i<strlen(nodename); i++) {
++ value += nodename[i];
++ }
++ return value & 0xF;
++}
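++
++/* For example (hypothetical node name, arithmetic only): "node1" sums to
++ * 110+111+100+101+49 = 471, and 471 & 0xF = 7, so that node would back
++ * down for 7 seconds. */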
++
++/* A new node has stated its intent to form a new cluster. We may have
++ * something to say about that... */
++static int do_process_newcluster(struct msghdr *msg, int len)
++{
++ /* If we are also in STARTING state then back down for a random period
++ * of time */
++ if (node_state == STARTING) {
++ P_MEMB("got NEWCLUSTER, backing down for %d seconds\n", node_hash());
++ start_time = jiffies + node_hash() * HZ;
++ }
++
++ return 0;
++}
++
++/* Called for each node by the node-message unpacker. Returns -1 if there is a
++ * mismatch and the caller will stop processing */
++static int check_node(struct cluster_node *newnode, char *addrs,
++ unsigned short num_addr)
++{
++ struct cluster_node *node = find_node_by_name(newnode->name);
++
++ P_MEMB("check_node: %s", newnode->name);
++
++ if (!node) {
++ C_MEMB(" - not found\n");
++ return -1;
++ }
++
++ if (node->votes != newnode->votes ||
++ node->node_id != newnode->node_id ||
++ node->state != NODESTATE_MEMBER) {
++ C_MEMB(" - wrong info: votes=%d(exp: %d) id=%d(exp: %d) state = %d\n",
++ node->votes, newnode->votes, node->node_id,
++ newnode->node_id, node->state);
++ return -1;
++ }
++ C_MEMB(" - OK\n");
++ return 0;
++}
++
++/* Called for each new node found in a JOINCONF message. Create a new node
++ * entry */
++static int add_node(struct cluster_node *node, char *addrs,
++ unsigned short num_addr)
++{
++ P_MEMB("add_node: %s, v:%d, e:%d, i:%d\n", node->name, node->votes,
++ node->expected_votes, node->node_id);
++
++ if (!find_node_by_name(node->name)) {
++ struct cluster_node *newnode;
++ int i;
++
++ if ((newnode =
++ add_new_node(node->name, node->votes, node->expected_votes,
++ node->node_id, NODESTATE_MEMBER)) == NULL) {
++ P_MEMB("Error adding node\n");
++ return -1;
++ }
++ if (list_empty(&newnode->addr_list)) {
++ for (i = 0; i < num_addr; i++) {
++ add_node_address(newnode,
++ addrs + i * address_length, address_length);
++ }
++ }
++ return 0;
++ }
++ else {
++ P_MEMB("Already got node with name %s\n", node->name);
++ return -1;
++ }
++}
++
++/* Call a specified routine for each node unpacked from the message. Return
++ * either the number of nodes found or -1 for an error */
++static int unpack_nodes(unsigned char *buf, int len,
++ int (*routine) (struct cluster_node *, char *,
++ unsigned short))
++{
++ int ptr = 0;
++ int num_nodes = 0;
++ char nodename[MAX_CLUSTER_MEMBER_NAME_LEN];
++ struct cluster_node node;
++
++ node.name = nodename;
++
++ while (ptr < len) {
++ int namelen = buf[ptr++];
++ unsigned int evotes;
++ unsigned int node_id;
++ unsigned short num_addr;
++ unsigned char *addrs;
++
++ memcpy(nodename, &buf[ptr], namelen);
++ nodename[namelen] = '\0';
++ ptr += namelen;
++
++ memcpy(&num_addr, &buf[ptr], sizeof (short));
++ num_addr = le16_to_cpu(num_addr);
++ ptr += sizeof (short);
++
++ /* Just make a note of the addrs "array" */
++ addrs = &buf[ptr];
++ ptr += num_addr * address_length;
++
++ node.votes = buf[ptr++];
++
++ memcpy(&evotes, &buf[ptr], sizeof (int));
++ node.expected_votes = le32_to_cpu(evotes);
++ ptr += sizeof (int);
++
++ memcpy(&node_id, &buf[ptr], sizeof (int));
++ node.node_id = le32_to_cpu(node_id);
++ ptr += sizeof (int);
++
++ /* Call the callback routine */
++ if (routine(&node, addrs, num_addr) < 0)
++ return -1;
++ num_nodes++;
++ }
++ return num_nodes;
++}
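++
++/* Note: unpack_nodes() expects exactly the entry layout packed by
++ * send_cluster_view() above; since entries are never split across packets,
++ * ptr always lands on an entry boundary when the loop terminates. */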
++
++/* Got join confirmation from a master node. This message contains a list of
++ * cluster nodes which we unpack and build into our cluster nodes list. When we
++ * have the last message we can go into TRANSITION state */
++static int do_process_joinconf(struct msghdr *msg, int len)
++{
++ char *message = msg->msg_iov->iov_base;
++
++ if (unpack_nodes(message + 2, len - 2, add_node) < 0) {
++ printk(KERN_ERR CMAN_NAME
++ ": Error processing joinconf message - giving up on cluster join\n");
++ send_leave(CLUSTER_LEAVEFLAG_PANIC);
++ return -1;
++ }
++
++ /* Last message in the list? */
++ if (message[1] & 2) {
++ char ackmsg;
++ struct sockaddr_cl *addr = msg->msg_name;
++
++ us->state = NODESTATE_MEMBER;
++ node_state = TRANSITION;
++ we_are_a_cluster_member = TRUE;
++
++ ackmsg = CLUSTER_MEM_CONFACK;
++ kcl_sendmsg(mem_socket, &ackmsg, 1, addr,
++ sizeof (struct sockaddr_cl),
++ MSG_NOACK);
++ kernel_thread(hello_kthread, NULL, 0);
++ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
++ }
++ return 0;
++}
++
++/* Got the master's view of the cluster - compare it with ours and tell it the
++ * result */
++static int do_process_masterview(struct msghdr *msg, int len)
++{
++ char reply[2] = { CLUSTER_MEM_VIEWACK, 0 };
++ char *message = msg->msg_iov->iov_base;
++ static int num_nodes;
++
++ /* Someone else's state transition */
++ if (node_state != MEMBER &&
++ node_state != TRANSITION && node_state != MASTER)
++ return 0;
++
++ /* First message, zero the counter */
++ if (message[1] & 1)
++ num_nodes = 0;
++
++ num_nodes +=
++ unpack_nodes(msg->msg_iov->iov_base + 2, len - 2, check_node);
++
++ /* Last message, check the count and reply */
++ if (message[1] & 2) {
++ if (num_nodes == cluster_members) {
++ /* Send ACK */
++ reply[1] = 1;
++ }
++ else {
++ P_MEMB("Got %d nodes in MASTERVIEW message, we think there should be %d\n",
++ num_nodes, cluster_members);
++ /* Send NAK */
++ reply[1] = 0;
++ }
++ kcl_sendmsg(mem_socket, reply, 2, msg->msg_name,
++ msg->msg_namelen, 0);
++ }
++ return 0;
++}
++
++static int do_process_leave(struct msghdr *msg, int len)
++{
++ struct cluster_node *node;
++ struct sockaddr_cl *saddr = msg->msg_name;
++ unsigned char *leavemsg = (unsigned char *) msg->msg_iov->iov_base;
++
++ if ((node = find_node_by_nodeid(saddr->scl_nodeid))) {
++ unsigned char reason = leavemsg[1];
++
++ if (node->state != NODESTATE_DEAD) {
++ printk(KERN_INFO CMAN_NAME
++ ": Node %s is leaving the cluster, reason %d\n",
++ node->name, reason);
++
++ node->leave_reason = reason;
++ }
++ leavereason = (reason == CLUSTER_LEAVEFLAG_REMOVED ? 1 : 0);
++
++ a_node_just_died(node);
++
++ /* If it was the master node, then we have been nominated as
++ * the successor */
++ if (node == master_node) {
++ start_transition(TRANS_DEADMASTER, master_node);
++ }
++
++ }
++ return 0;
++}
++
++static int do_process_hello(struct msghdr *msg, int len)
++{
++ struct cluster_node *node;
++ struct cl_mem_hello_msg *hellomsg =
++ (struct cl_mem_hello_msg *) msg->msg_iov->iov_base;
++ struct sockaddr_cl *saddr = msg->msg_name;
++
++ /* We are starting up. Send a join message to the node whose HELLO we
++ * just received */
++ if (node_state == STARTING || node_state == JOINWAIT) {
++ struct sockaddr_cl *addr = msg->msg_name;
++
++ printk(KERN_INFO CMAN_NAME ": sending membership request\n");
++
++ send_joinreq(addr, msg->msg_namelen);
++ join_time = jiffies;
++ node_state = JOINING;
++ return 0;
++ }
++
++ /* Only process HELLOs if we are not in transition */
++ if (node_state == MEMBER) {
++ if (len < sizeof (struct cl_mem_hello_msg)) {
++ printk(KERN_ERR CMAN_NAME
++ ": short hello message from node %d\n",
++ saddr->scl_nodeid);
++ return -1;
++ }
++
++ node = find_node_by_nodeid(saddr->scl_nodeid);
++ if (node && node->state != NODESTATE_DEAD) {
++
++ /* Check the cluster generation in the HELLO message.
++ * NOTE: this may be different if the message crossed
++ * on the wire with an END-TRANS so we allow a period
++ * of grace in which this is allowable */
++ if (cluster_generation !=
++ le32_to_cpu(hellomsg->generation)
++ && node_state == MEMBER
++ && time_after(jiffies,
++ cman_config.hello_timer * HZ +
++ transition_end_time)) {
++ char killmsg;
++
++ printk(KERN_INFO CMAN_NAME
++ ": bad generation number %d in HELLO message, expected %d\n",
++ le32_to_cpu(hellomsg->generation),
++ cluster_generation);
++
++ notify_kernel_listeners(DIED,
++ (long) node->node_id);
++
++ killmsg = CLUSTER_MEM_KILL;
++ kcl_sendmsg(mem_socket, &killmsg, 1,
++ saddr, sizeof (struct sockaddr_cl),
++ MSG_NOACK);
++ return 0;
++ }
++
++ if (cluster_members != le16_to_cpu(hellomsg->members)
++ && node_state == MEMBER) {
++ printk(KERN_INFO CMAN_NAME
++ ": nmembers in HELLO message does not match our view\n");
++ start_transition(TRANS_CHECK, node);
++ return 0;
++ }
++ /* The message is OK - save the time */
++ node->last_hello = jiffies;
++
++ }
++ else {
++ struct sockaddr_cl *addr = msg->msg_name;
++
++ /* This node is a danger to our valid cluster */
++ if (cluster_is_quorate) {
++ char killmsg;
++
++ killmsg = CLUSTER_MEM_KILL;
++ kcl_sendmsg(mem_socket, &killmsg, 1, addr,
++ sizeof (struct sockaddr_cl),
++ MSG_NOACK);
++ }
++
++ }
++ }
++
++ return 0;
++
++}
++
++static int do_process_kill(struct msghdr *msg, int len)
++{
++ struct sockaddr_cl *saddr = msg->msg_name;
++ struct cluster_node *node;
++
++ node = find_node_by_nodeid(saddr->scl_nodeid);
++ if (node && node->state == NODESTATE_MEMBER) {
++
++ printk(KERN_INFO CMAN_NAME
++ ": Being told to leave the cluster by node %d\n",
++ saddr->scl_nodeid);
++
++ node_state = LEFT_CLUSTER;
++ quit_threads = 1;
++ wake_up_process(membership_task);
++ wake_up_interruptible(&cnxman_waitq);
++ }
++ else {
++ P_MEMB("Asked to leave the cluster by a non-member. What a nerve!\n");
++ }
++ return 0;
++}
++
++/* Some cluster membership utility functions */
++struct cluster_node *find_node_by_name(char *name)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (strcmp(node->name, name) == 0) {
++ up(&cluster_members_lock);
++ return node;
++ }
++ }
++ up(&cluster_members_lock);
++ return NULL;
++}
++
++/* Try to avoid using this as it's slow and holds the members lock */
++struct cluster_node *find_node_by_addr(unsigned char *addr, int addr_len)
++{
++ struct list_head *nodelist;
++ struct list_head *addrlist;
++ struct cluster_node *node;
++ struct cluster_node_addr *nodeaddr;
++
++ down(&cluster_members_lock);
++
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ list_for_each(addrlist, &node->addr_list) {
++ nodeaddr =
++ list_entry(addrlist, struct cluster_node_addr,
++ list);
++
++ if (memcmp(nodeaddr->addr, addr, address_length) == 0) {
++ up(&cluster_members_lock);
++ return node;
++ }
++ }
++ }
++
++ up(&cluster_members_lock);
++ return NULL;
++}
++
++/* This is the quick way to find a node */
++struct cluster_node *find_node_by_nodeid(unsigned int id)
++{
++ struct cluster_node *node;
++
++ if (id >= sizeof_members_array)
++ return NULL;
++
++ spin_lock(&members_by_nodeid_lock);
++ node = members_by_nodeid[id];
++ spin_unlock(&members_by_nodeid_lock);
++ return node;
++}
++
++static int dispatch_messages(struct socket *mem_socket)
++{
++ int err = 0;
++
++ while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
++ struct msghdr msg;
++ struct iovec iov;
++ struct sockaddr_cl sin;
++ int len;
++ mm_segment_t fs;
++
++ memset(&sin, 0, sizeof (sin));
++
++ msg.msg_control = NULL;
++ msg.msg_controllen = 0;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = &iov;
++ msg.msg_name = &sin;
++ msg.msg_namelen = sizeof (sin);
++ msg.msg_flags = 0;
++
++ iov.iov_len = MAX_CLUSTER_MESSAGE;
++ iov.iov_base = iobuf;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ len =
++ sock_recvmsg(mem_socket, &msg, MAX_CLUSTER_MESSAGE,
++ MSG_DONTWAIT);
++ set_fs(fs);
++ if (len > 0) {
++ iov.iov_base = iobuf; /* Reinstate pointer */
++ msg.msg_name = &sin;
++ do_membership_packet(&msg, len);
++ }
++ else {
++ if (len == -EAGAIN)
++ err = 0;
++ else
++ err = -1;
++ break;
++ }
++ }
++ return err;
++}
++
++/* Scan the nodes list for dead nodes */
++static void check_for_dead_nodes(void)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state != NODESTATE_DEAD &&
++ time_after(jiffies,
++ node->last_hello +
++ cman_config.deadnode_timeout * HZ) && !node->us) {
++
++ up(&cluster_members_lock);
++
++ printk(KERN_WARNING CMAN_NAME
++ ": no HELLO from %s, removing from the cluster\n",
++ node->name);
++
++ P_MEMB("last hello was %ld, current time is %ld\n",
++ node->last_hello, jiffies);
++
++ node->leave_reason = CLUSTER_LEAVEFLAG_DEAD;
++ leavereason = 0;
++
++ /* This is unlikely to work but it's worth a try! */
++ send_kill(node->node_id);
++
++ /* Start state transition */
++ a_node_just_died(node);
++ return;
++ }
++ }
++ up(&cluster_members_lock);
++
++ /* Also check for a dead quorum device */
++ if (quorum_device) {
++ if (quorum_device->state == NODESTATE_MEMBER &&
++ time_after(jiffies,
++ quorum_device->last_hello +
++ cman_config.deadnode_timeout * HZ)) {
++ quorum_device->state = NODESTATE_DEAD;
++ printk(KERN_WARNING CMAN_NAME
++ ": Quorum device %s timed out\n",
++ quorum_device->name);
++ recalculate_quorum(0);
++ }
++ }
++
++ return;
++}
++
++/* add "us" as a node in the cluster */
++static int add_us(void)
++{
++ struct cluster_node *newnode =
++ kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
++
++ if (!newnode) {
++ /* Oh shit, we have to commit hara kiri here for the greater
++ * good of the cluster */
++ send_leave(CLUSTER_LEAVEFLAG_PANIC);
++
++ printk(KERN_CRIT CMAN_NAME
++ ": Cannot allocate memory for our node structure\n");
++ panic("Must die");
++
++ return -1;
++ }
++
++ memset(newnode, 0, sizeof (struct cluster_node));
++ newnode->name = kmalloc(strlen(nodename) + 1, GFP_KERNEL);
++ if (!newnode->name) {
++ send_leave(CLUSTER_LEAVEFLAG_PANIC);
++
++ printk(KERN_CRIT CMAN_NAME
++ ": Cannot allocate memory for node name\n");
++ kfree(newnode);
++
++ panic("Must die");
++
++ return -1;
++ }
++
++ strcpy(newnode->name, nodename);
++ newnode->last_hello = jiffies;
++ newnode->votes = votes;
++ newnode->expected_votes = expected_votes;
++ newnode->state = NODESTATE_JOINING;
++ newnode->node_id = 0; /* Will get filled in by ENDTRANS message */
++ newnode->us = 1;
++ newnode->leave_reason = 0;
++ INIT_LIST_HEAD(&newnode->addr_list);
++ get_local_addresses(newnode); /* Get from cnxman socket info */
++
++ /* Add the new node to the list */
++ down(&cluster_members_lock);
++ list_add(&newnode->list, &cluster_members_list);
++ cluster_members++;
++ up(&cluster_members_lock);
++ us = newnode;
++
++ return 0;
++}
++
++/* Return the highest known node_id */
++unsigned int get_highest_nodeid(void)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node = NULL;
++ unsigned int highest = 0;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->node_id > highest)
++ highest = node->node_id;
++ }
++ up(&cluster_members_lock);
++
++ return highest;
++}
++
++/* Elect a new master if there is a clash. Returns 1 if we are the new master;
++ * the master's struct is also returned. This, rather primitively, uses
++ * the lowest node ID */
++static int elect_master(struct cluster_node **master_node)
++{
++ int i;
++
++ for (i = 1; i < sizeof_members_array; i++) {
++ if (members_by_nodeid[i]
++ && members_by_nodeid[i]->state == NODESTATE_MEMBER) {
++ *master_node = members_by_nodeid[i];
++ P_MEMB("Elected master is %s\n", (*master_node)->name);
++ return (*master_node)->us;
++ }
++ }
++ BUG();
++ return 0;
++}
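++
++/* Illustrative example (hypothetical IDs): with members at node IDs 2, 5 and
++ * 7, the node with ID 2 is elected; elect_master() returns 1 only on that
++ * node itself. */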
++
++/* Called by node_cleanup in cnxman when we have left the cluster */
++void free_nodeid_array(void)
++{
++ vfree(members_by_nodeid);
++ members_by_nodeid = NULL;
++ sizeof_members_array = 0;
++}
++
++int allocate_nodeid_array(void)
++{
++ /* Allocate space for the nodeid lookup array */
++ if (!members_by_nodeid) {
++ spin_lock_init(&members_by_nodeid_lock);
++ members_by_nodeid =
++ vmalloc(cman_config.max_nodes *
++ sizeof (struct cluster_node *));
++ }
++
++ if (!members_by_nodeid) {
++ printk(KERN_WARNING
++ "Unable to allocate members array for %d members\n",
++ cman_config.max_nodes);
++ return -ENOMEM;
++ }
++ memset(members_by_nodeid, 0,
++ cman_config.max_nodes * sizeof (struct cluster_node *));
++ sizeof_members_array = cman_config.max_nodes;
++
++ return 0;
++}
++
++/* Set the votes & expected_votes variables */
++void set_votes(int v, int e)
++{
++ votes = v;
++ expected_votes = e;
++}
++
++int get_quorum(void)
++{
++ return quorum;
++}
++
++/* Called by cnxman to see if activity should be blocked because we are in a
++ * state transition */
++int in_transition(void)
++{
++ return node_state == TRANSITION ||
++ node_state == TRANSITION_COMPLETE || node_state == MASTER;
++}
++
++/* Return the current membership state as a string for the main line to put
++ * into /proc. I really should be using snprintf rather than sprintf but it's
++ * not exported... */
++char *membership_state(char *buf, int buflen)
++{
++ switch (node_state) {
++ case STARTING:
++ strncpy(buf, "Starting", buflen);
++ break;
++ case JOINING:
++ strncpy(buf, "Joining", buflen);
++ break;
++ case JOINWAIT:
++ strncpy(buf, "Join-Wait", buflen);
++ break;
++ case JOINACK:
++ strncpy(buf, "Join-Ack", buflen);
++ break;
++ case TRANSITION:
++ sprintf(buf, "State-Transition: Master is %s",
++ master_node ? master_node->name : "Unknown");
++ break;
++ case MEMBER:
++ strncpy(buf, "Cluster-Member", buflen);
++ break;
++ case REJECTED:
++ strncpy(buf, "Rejected", buflen);
++ break;
++ case LEFT_CLUSTER:
++ strncpy(buf, "Left-Cluster", buflen);
++ break;
++ case TRANSITION_COMPLETE:
++ strncpy(buf, "Transition-Complete", buflen);
++ break;
++ case MASTER:
++ strncpy(buf, "Transition-Master", buflen);
++ break;
++ default:
++ sprintf(buf, "Unknown: code=%d", node_state);
++ break;
++ }
++
++ return buf;
++}
++
++#ifdef DEBUG_MEMB
++static char *msgname(int msg)
++{
++ switch (msg) {
++ case CLUSTER_MEM_JOINCONF:
++ return "JOINCONF";
++ case CLUSTER_MEM_JOINREQ:
++ return "JOINREQ";
++ case CLUSTER_MEM_LEAVE:
++ return "LEAVE";
++ case CLUSTER_MEM_HELLO:
++ return "HELLO";
++ case CLUSTER_MEM_KILL:
++ return "KILL";
++ case CLUSTER_MEM_JOINACK:
++ return "JOINACK";
++ case CLUSTER_MEM_ENDTRANS:
++ return "ENDTRANS";
++ case CLUSTER_MEM_RECONFIG:
++ return "RECONFIG";
++ case CLUSTER_MEM_MASTERVIEW:
++ return "MASTERVIEW";
++ case CLUSTER_MEM_STARTTRANS:
++ return "STARTTRANS";
++ case CLUSTER_MEM_JOINREJ:
++ return "JOINREJ";
++ case CLUSTER_MEM_VIEWACK:
++ return "VIEWACK";
++ case CLUSTER_MEM_STARTACK:
++ return "STARTACK";
++ case CLUSTER_MEM_NEWCLUSTER:
++ return "NEWCLUSTER";
++ case CLUSTER_MEM_CONFACK:
++ return "CONFACK";
++ case CLUSTER_MEM_NOMINATE:
++ return "NOMINATE";
++
++ default:
++ return "??UNKNOWN??";
++ }
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -urN linux-orig/cluster/cman/proc.c linux-patched/cluster/cman/proc.c
+--- linux-orig/cluster/cman/proc.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/proc.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,364 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/init.h>
++#include <linux/socket.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/file.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <linux/list.h>
++#include <linux/in.h>
++#include <net/sock.h>
++#include <cluster/cnxman.h>
++#include <cluster/service.h>
++
++#include "cnxman-private.h"
++#include "config.h"
++
++extern int cluster_members;
++extern struct list_head cluster_members_list;
++extern struct semaphore cluster_members_lock;
++extern struct cluster_node *quorum_device;
++extern int we_are_a_cluster_member;
++extern int cluster_is_quorate;
++extern unsigned short cluster_id;
++extern atomic_t use_count;
++extern unsigned int address_length;
++extern unsigned int config_version;
++extern char cluster_name[];
++extern struct cluster_node *us;
++static struct seq_operations cluster_info_op;
++
++int sm_procdata(char *b, char **start, off_t offset, int length);
++int sm_debug_info(char *b, char **start, off_t offset, int length);
++
++/* /proc interface to the configuration struct */
++static struct config_proc_info {
++ char *name;
++ int *value;
++} config_proc[] = {
++ {
++ .name = "joinwait_timeout",
++ .value = &cman_config.joinwait_timeout,
++ },
++ {
++ .name = "joinconf_timeout",
++ .value = &cman_config.joinconf_timeout,
++ },
++ {
++ .name = "join_timeout",
++ .value = &cman_config.join_timeout,
++ },
++ {
++ .name = "hello_timer",
++ .value = &cman_config.hello_timer,
++ },
++ {
++ .name = "deadnode_timeout",
++ .value = &cman_config.deadnode_timeout,
++ },
++ {
++ .name = "transition_timeout",
++ .value = &cman_config.transition_timeout,
++ },
++ {
++ .name = "transition_restarts",
++ .value = &cman_config.transition_restarts,
++ },
++ {
++ .name = "max_nodes",
++ .value = &cman_config.max_nodes,
++ },
++ {
++ .name = "sm_debug_size",
++ .value = &cman_config.sm_debug_size,
++ },
++};
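++
++/* Each of these tunables appears as /proc/cluster/config/cman/<name> and
++ * accepts a decimal write, e.g. (illustrative value):
++ *
++ * echo 20 > /proc/cluster/config/cman/deadnode_timeout
++ */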
++
++
++static int proc_cluster_status(char *b, char **start, off_t offset, int length)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ struct cluster_node_addr *node_addr;
++ unsigned int total_votes = 0;
++ unsigned int max_expected = 0;
++ int c = 0;
++ char node_buf[MAX_CLUSTER_MEMBER_NAME_LEN];
++
++ if (!we_are_a_cluster_member) {
++ c += sprintf(b+c, "Not a cluster member. State: %s\n",
++ membership_state(node_buf,
++ sizeof (node_buf)));
++ return c;
++ }
++
++ /* Total the votes */
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++ if (node->state == NODESTATE_MEMBER) {
++ total_votes += node->votes;
++ max_expected =
++ max(max_expected, node->expected_votes);
++ }
++ }
++ up(&cluster_members_lock);
++
++ if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
++ total_votes += quorum_device->votes;
++
++ c += sprintf(b+c,
++ "Version: %d.%d.%d\nConfig version: %d\nCluster name: %s\nCluster ID: %d\nMembership state: %s\n",
++ CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
++ CNXMAN_PATCH_VERSION,
++ config_version,
++ cluster_name, cluster_id,
++ membership_state(node_buf, sizeof (node_buf)));
++ c += sprintf(b+c,
++ "Nodes: %d\nExpected_votes: %d\nTotal_votes: %d\nQuorum: %d %s\n",
++ cluster_members, max_expected, total_votes,
++ get_quorum(),
++ cluster_is_quorate ? " " : "Activity blocked");
++ c += sprintf(b+c, "Active subsystems: %d\n",
++ atomic_read(&use_count));
++
++
++ c += sprintf(b+c, "Node addresses: ");
++ list_for_each_entry(node_addr, &us->addr_list, list) {
++ struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)node_addr->addr;
++ if (saddr->sin6_family == AF_INET6) {
++ c += sprintf(b+c, "%x:%x:%x:%x:%x:%x:%x:%x ",
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[0]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[1]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[2]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[3]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[4]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[5]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[6]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[7]));
++ }
++ else {
++ struct sockaddr_in *saddr4 = (struct sockaddr_in *)saddr;
++ uint8_t *addr = (uint8_t *)&saddr4->sin_addr;
++ c+= sprintf(b+c, "%u.%u.%u.%u ",
++ addr[0], addr[1], addr[2], addr[3]);
++ }
++ }
++ c += sprintf(b+c, "\n\n");
++ return c;
++}
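++
++/* Illustrative /proc/cluster/status output (field order from the sprintf
++ * calls above; all values hypothetical):
++ *
++ * Version: 2.0.1
++ * Config version: 1
++ * Cluster name: example
++ * Cluster ID: 1
++ * Membership state: Cluster-Member
++ * Nodes: 3
++ * Expected_votes: 3
++ * Total_votes: 3
++ * Quorum: 2
++ * Active subsystems: 1
++ * Node addresses: 10.0.0.1
++ */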
++
++
++/* Allocate one of these for /proc/cluster/nodes so we can keep a track of where
++ * we are */
++struct cluster_seq_info {
++ int nodeid;
++ int highest_nodeid;
++};
++
++static int cluster_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &cluster_info_op);
++}
++
++static void *cluster_seq_start(struct seq_file *m, loff_t * pos)
++{
++ struct cluster_seq_info *csi =
++ kmalloc(sizeof (struct cluster_seq_info), GFP_KERNEL);
++
++ if (!csi)
++ return NULL;
++
++ /* Keep highest_nodeid here so we don't need to keep traversing the
++ * list to find it */
++ csi->nodeid = *pos;
++ csi->highest_nodeid = get_highest_nodeid();
++
++ /* Print the header */
++ if (*pos == 0)
++ seq_printf(m,
++ "Node Votes Exp Sts Name\n");
++ return csi;
++}
++
++static void *cluster_seq_next(struct seq_file *m, void *p, loff_t * pos)
++{
++ struct cluster_seq_info *csi = p;
++
++ *pos = ++csi->nodeid;
++ if (csi->nodeid > csi->highest_nodeid)
++ return NULL;
++
++ return csi;
++}
++
++static int cluster_seq_show(struct seq_file *m, void *p)
++{
++ char state = '?';
++ struct cluster_node *node;
++ struct cluster_seq_info *csi = p;
++
++ /*
++ * If we have "0" here then display the quorum device if
++ * there is one.
++ */
++ if (csi->nodeid == 0)
++ node = quorum_device;
++ else
++ node = find_node_by_nodeid(csi->nodeid);
++
++ if (!node)
++ return 0;
++
++ /* Make state printable */
++ switch (node->state) {
++ case NODESTATE_MEMBER:
++ state = 'M';
++ break;
++ case NODESTATE_JOINING:
++ state = 'J';
++ break;
++ case NODESTATE_REMOTEMEMBER:
++ state = 'R';
++ break;
++ case NODESTATE_DEAD:
++ state = 'X';
++ break;
++ }
++ seq_printf(m, " %3d %3d %3d %c %s\n",
++ node->node_id,
++ node->votes,
++ node->expected_votes,
++ state,
++ node->name);
++
++ return 0;
++}
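++
++/* Illustrative /proc/cluster/nodes output (hypothetical nodes, format from
++ * the seq_printf calls above):
++ *
++ * Node Votes Exp Sts Name
++ * 1 1 3 M nodea
++ * 2 1 3 M nodeb
++ */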
++
++static void cluster_seq_stop(struct seq_file *m, void *p)
++{
++ kfree(p);
++}
++
++static struct seq_operations cluster_info_op = {
++ .start = cluster_seq_start,
++ .next = cluster_seq_next,
++ .stop = cluster_seq_stop,
++ .show = cluster_seq_show
++};
++
++static struct file_operations cluster_fops = {
++ .open = cluster_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
++};
++
++static int cman_config_read_proc(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct config_proc_info *cinfo = data;
++
++ return snprintf(page, count, "%d\n", *cinfo->value);
++}
++
++static int cman_config_write_proc(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ struct config_proc_info *cinfo = data;
++ int value;
++ char *end;
++ char buf[32];
++
++ /* The buffer comes from userspace, so copy it in before parsing */
++ if (count > sizeof(buf) - 1)
++ count = sizeof(buf) - 1;
++ if (copy_from_user(buf, buffer, count))
++ return -EFAULT;
++ buf[count] = '\0';
++
++ /* Only update the value if at least one digit was parsed */
++ value = simple_strtoul(buf, &end, 10);
++ if (end != buf)
++ *cinfo->value = value;
++ return count;
++}
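++
++/*
++ * These handlers back the integer tunables under /proc/cluster/config/cman/,
++ * so each value can be inspected or changed from userspace, e.g. (the entry
++ * name here is illustrative):
++ *
++ *   cat /proc/cluster/config/cman/max_nodes
++ *   echo 16 > /proc/cluster/config/cman/max_nodes
++ */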
++
++/* Base of the config directory for cman */
++static struct proc_dir_entry *proc_cman_config;
++void create_proc_entries(void)
++{
++ struct proc_dir_entry *procentry;
++ struct proc_dir_entry *proc_cluster;
++ int i;
++
++ proc_cluster = proc_mkdir("cluster", 0);
++ if (!proc_cluster)
++ return;
++ proc_cluster->owner = THIS_MODULE;
++
++ /* Config dir filled in by us and others */
++ if (!proc_mkdir("cluster/config", 0))
++ return;
++
++ /* Don't much care if this fails, it's hardly vital */
++ procentry = create_proc_entry("cluster/nodes", S_IRUGO, NULL);
++ if (procentry)
++ procentry->proc_fops = &cluster_fops;
++
++ procentry = create_proc_entry("cluster/status", S_IRUGO, NULL);
++ if (procentry)
++ procentry->get_info = proc_cluster_status;
++
++ procentry = create_proc_entry("cluster/services", S_IRUGO, NULL);
++ if (procentry)
++ procentry->get_info = sm_procdata;
++
++ /* Config entries */
++ proc_cman_config = proc_mkdir("cluster/config/cman", 0);
++ if (!proc_cman_config)
++ return;
++
++ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
++ procentry = create_proc_entry(config_proc[i].name, 0660,
++ proc_cman_config);
++ if (procentry) {
++ procentry->data = &config_proc[i];
++ procentry->write_proc = cman_config_write_proc;
++ procentry->read_proc = cman_config_read_proc;
++ }
++ }
++
++ procentry = create_proc_entry("cluster/sm_debug", S_IRUGO, NULL);
++ if (procentry)
++ procentry->get_info = sm_debug_info;
++}
++
++void cleanup_proc_entries(void)
++{
++ int i, config_count;
++
++ remove_proc_entry("cluster/sm_debug", NULL);
++
++ config_count = sizeof(config_proc) / sizeof(struct config_proc_info);
++
++ if (proc_cman_config) {
++ for (i=0; i<config_count; i++)
++ remove_proc_entry(config_proc[i].name, proc_cman_config);
++ }
++ remove_proc_entry("cluster/config/cman", NULL);
++ remove_proc_entry("cluster/config", NULL);
++
++ remove_proc_entry("cluster/nodes", NULL);
++ remove_proc_entry("cluster/status", NULL);
++ remove_proc_entry("cluster/services", NULL);
++ remove_proc_entry("cluster/config", NULL);
++ remove_proc_entry("cluster", NULL);
++}
+diff -urN linux-orig/cluster/cman/sm.h linux-patched/cluster/cman/sm.h
+--- linux-orig/cluster/cman/sm.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,108 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_DOT_H__
++#define __SM_DOT_H__
++
++/*
++ * This is the main header file to be included in each Service Manager source
++ * file.
++ */
++
++#include <linux/list.h>
++#include <linux/socket.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/file.h>
++#include <net/sock.h>
++
++#include <cluster/cnxman.h>
++#include <cluster/service.h>
++
++#define SG_LEVELS (4)
++
++#include "sm_internal.h"
++#include "sm_barrier.h"
++#include "sm_control.h"
++#include "sm_daemon.h"
++#include "sm_joinleave.h"
++#include "sm_membership.h"
++#include "sm_message.h"
++#include "sm_misc.h"
++#include "sm_recover.h"
++#include "sm_services.h"
++
++extern struct list_head sm_sg[SG_LEVELS];
++extern struct semaphore sm_sglock;
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#define SM_ASSERT(x, do) \
++{ \
++ if (!(x)) \
++ { \
++ printk("\nSM: Assertion failed on line %d of file %s\n" \
++ "SM: assertion: \"%s\"\n" \
++ "SM: time = %lu\n", \
++ __LINE__, __FILE__, #x, jiffies); \
++ {do} \
++ printk("\n"); \
++ panic("SM: Record message above and reboot.\n"); \
++ } \
++}
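++
++/*
++ * The second argument is a statement run just before the panic, typically
++ * used to dump extra context, as in sm_barrier.c:
++ *
++ *   SM_ASSERT(p, printk("name=\"%s\" status=%d\n", name, status););
++ */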
++
++#define SM_RETRY(do_this, until_this) \
++for (;;) \
++{ \
++ do { do_this; } while (0); \
++ if (until_this) \
++ break; \
++ printk("SM: out of memory: %s, %u\n", __FILE__, __LINE__); \
++ schedule();\
++}
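++
++/*
++ * SM_RETRY re-runs a statement until the condition is true, e.g. to retry
++ * a small allocation that must not fail (from sm_barrier.c):
++ *
++ *   SM_RETRY(be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC), be);
++ */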
++
++
++#define log_print(fmt, args...) printk("SM: "fmt"\n", ##args)
++
++#define log_error(sg, fmt, args...) \
++ printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
++
++
++#define SM_DEBUG_LOG
++
++#ifdef SM_DEBUG_CONSOLE
++#define log_debug(sg, fmt, args...) \
++ printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
++#endif
++
++#ifdef SM_DEBUG_LOG
++#define log_debug(sg, fmt, args...) sm_debug_log(sg, fmt, ##args)
++#endif
++
++#ifdef SM_DEBUG_ALL
++#define log_debug(sg, fmt, args...) \
++do \
++{ \
++ printk("SM: %08x "fmt"\n", (sg)->global_id, ##args); \
++ sm_debug_log(sg, fmt, ##args); \
++} \
++while (0)
++#endif
++
++#endif /* __SM_DOT_H__ */
+diff -urN linux-orig/cluster/cman/sm_barrier.c linux-patched/cluster/cman/sm_barrier.c
+--- linux-orig/cluster/cman/sm_barrier.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_barrier.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,232 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++static struct list_head barriers;
++static spinlock_t barriers_lock;
++
++struct bc_entry {
++ struct list_head list;
++ uint32_t gid;
++ int status;
++ char type;
++};
++typedef struct bc_entry bc_entry_t;
++
++void init_barriers(void)
++{
++ INIT_LIST_HEAD(&barriers);
++ spin_lock_init(&barriers_lock);
++}
++
++static int atoi(char *c)
++{
++ int x = 0;
++
++ while ('0' <= *c && *c <= '9') {
++ x = x * 10 + (*c - '0');
++ c++;
++ }
++ return x;
++}
++
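++/*
++ * Barrier names have the form "sm.<global_id>.<nodeid>.<event_id>.<count>"
++ * (see startdone_barrier_new in sm_joinleave.c), so the SG's global id can
++ * be recovered from the digits following the "sm." prefix.
++ */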
++static void add_barrier_callback(char *name, int status, int type)
++{
++ char *p;
++ uint32_t gid;
++ bc_entry_t *be;
++
++ /* an ESRCH callback just means there was a cnxman transition */
++ if (status == -ESRCH)
++ return;
++
++ /* extract global id of SG from barrier name */
++ p = strstr(name, "sm.");
++
++ SM_ASSERT(p, printk("name=\"%s\" status=%d\n", name, status););
++
++ p += strlen("sm.");
++ gid = atoi(p);
++
++ SM_RETRY(be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC), be);
++
++ be->gid = gid;
++ be->status = status;
++ be->type = type;
++
++ spin_lock(&barriers_lock);
++ list_add_tail(&be->list, &barriers);
++ spin_unlock(&barriers_lock);
++
++ wake_serviced(DO_BARRIERS);
++}
++
++static void callback_recovery_barrier(char *name, int status)
++{
++ add_barrier_callback(name, status, SM_BARRIER_RECOVERY);
++}
++
++static void callback_startdone_barrier_new(char *name, int status)
++{
++ add_barrier_callback(name, status, SM_BARRIER_STARTDONE_NEW);
++}
++
++static void callback_startdone_barrier(char *name, int status)
++{
++ add_barrier_callback(name, status, SM_BARRIER_STARTDONE);
++}
++
++int sm_barrier(char *name, int count, int type)
++{
++ int error;
++ unsigned long fn = 0;
++
++ switch (type) {
++ case SM_BARRIER_STARTDONE:
++ fn = (unsigned long) callback_startdone_barrier;
++ break;
++ case SM_BARRIER_STARTDONE_NEW:
++ fn = (unsigned long) callback_startdone_barrier_new;
++ break;
++ case SM_BARRIER_RECOVERY:
++ fn = (unsigned long) callback_recovery_barrier;
++ break;
++ }
++
++ error = kcl_barrier_register(name, 0, count);
++ if (error) {
++ log_print("barrier register error %d", error);
++ goto fail;
++ }
++
++ error = kcl_barrier_setattr(name, BARRIER_SETATTR_AUTODELETE, TRUE);
++ if (error) {
++ log_print("barrier setattr autodel error %d", error);
++ goto fail_bar;
++ }
++
++ error = kcl_barrier_setattr(name, BARRIER_SETATTR_CALLBACK, fn);
++ if (error) {
++ log_print("barrier setattr cb error %d", error);
++ goto fail_bar;
++ }
++
++ error = kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, TRUE);
++ if (error) {
++ log_print("barrier setattr enabled error %d", error);
++ goto fail_bar;
++ }
++
++ return 0;
++
++ fail_bar:
++ kcl_barrier_delete(name);
++ fail:
++ return error;
++}
++
++void process_startdone_barrier_new(sm_group_t *sg, int status)
++{
++ sm_sevent_t *sev = sg->sevent;
++
++ if (!test_and_clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags)) {
++ log_debug(sev->se_sg, "ignore barrier cb status %d", status);
++ return;
++ }
++
++ sev->se_barrier_status = status;
++ sev->se_state = SEST_BARRIER_DONE;
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++}
++
++void process_startdone_barrier(sm_group_t *sg, int status)
++{
++ sm_uevent_t *uev = &sg->uevent;
++
++ if (!test_and_clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags)) {
++ log_debug(sg, "ignore barrier cb status %d", status);
++ return;
++ }
++
++ uev->ue_barrier_status = status;
++ uev->ue_state = UEST_BARRIER_DONE;
++ set_bit(UEFL_CHECK, &uev->ue_flags);
++ wake_serviced(DO_MEMBERSHIP);
++}
++
++void process_recovery_barrier(sm_group_t *sg, int status)
++{
++ if (status) {
++ log_error(sg, "process_recovery_barrier status=%d", status);
++ return;
++ }
++
++ if (sg->state != SGST_RECOVER ||
++ sg->recover_state != RECOVER_BARRIERWAIT) {
++ log_error(sg, "process_recovery_barrier state %d recover %d",
++ sg->state, sg->recover_state);
++ return;
++ }
++
++ if (!sg->recover_stop)
++ sg->recover_state = RECOVER_STOP;
++ else
++ sg->recover_state = RECOVER_BARRIERDONE;
++
++ wake_serviced(DO_RECOVERIES);
++}
++
++void process_barriers(void)
++{
++ sm_group_t *sg;
++ bc_entry_t *be;
++
++ while (1) {
++ be = NULL;
++
++ spin_lock(&barriers_lock);
++ if (!list_empty(&barriers)) {
++ be = list_entry(barriers.next, bc_entry_t, list);
++ list_del(&be->list);
++ }
++ spin_unlock(&barriers_lock);
++
++ if (!be)
++ break;
++
++ sg = sm_global_id_to_sg(be->gid);
++ if (!sg) {
++ log_print("process_barriers: no sg %08x", be->gid);
++ kfree(be);
++ continue;
++ }
++
++ switch (be->type) {
++ case SM_BARRIER_STARTDONE_NEW:
++ process_startdone_barrier_new(sg, be->status);
++ break;
++
++ case SM_BARRIER_STARTDONE:
++ process_startdone_barrier(sg, be->status);
++ break;
++
++ case SM_BARRIER_RECOVERY:
++ process_recovery_barrier(sg, be->status);
++ break;
++ }
++
++ kfree(be);
++ schedule();
++ }
++}
+diff -urN linux-orig/cluster/cman/sm_barrier.h linux-patched/cluster/cman/sm_barrier.h
+--- linux-orig/cluster/cman/sm_barrier.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_barrier.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_BARRIER_DOT_H__
++#define __SM_BARRIER_DOT_H__
++
++#define SM_BARRIER_STARTDONE (0)
++#define SM_BARRIER_STARTDONE_NEW (1)
++#define SM_BARRIER_RECOVERY (2)
++#define SM_BARRIER_RESET (3)
++
++void init_barriers(void);
++void process_barriers(void);
++int sm_barrier(char *name, int count, int type);
++void process_startdone_barrier(sm_group_t *sg, int status);
++void process_startdone_barrier_new(sm_group_t *sg, int status);
++void process_recovery_barrier(sm_group_t *sg, int status);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_control.c linux-patched/cluster/cman/sm_control.c
+--- linux-orig/cluster/cman/sm_control.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_control.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,156 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++#include "config.h"
++
++struct socket * sm_socket;
++uint32_t * sm_new_nodeids;
++uint32_t sm_our_nodeid;
++int sm_quorum, sm_quorum_next;
++struct list_head sm_members;
++int sm_member_count;
++
++
++/*
++ * Context: cnxman
++ * Called by cnxman when it has a new member list.
++ */
++
++void sm_member_update(int quorate)
++{
++ sm_quorum_next = quorate;
++ wake_serviced(DO_START_RECOVERY);
++}
++
++/*
++ * Context: cnxman
++ * Called when module is loaded.
++ */
++
++void sm_init(void)
++{
++ sm_socket = NULL;
++ sm_new_nodeids = NULL;
++ sm_quorum = 0;
++ sm_quorum_next = 0;
++ sm_our_nodeid = 0;
++ INIT_LIST_HEAD(&sm_members);
++ sm_member_count = 0;
++
++ init_services();
++ init_messages();
++ init_barriers();
++ init_serviced();
++ init_recovery();
++ init_joinleave();
++ init_sm_misc();
++}
++
++/*
++ * Context: cnxman
++ * Called at beginning of cluster join procedure.
++ */
++
++void sm_start(void)
++{
++ struct sockaddr_cl saddr;
++ struct socket *sock;
++ int result;
++
++ /* Create a communication channel among service managers */
++
++ result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
++ if (result < 0) {
++ log_print("can't create socket %d", result);
++ goto fail;
++ }
++
++ sm_socket = sock;
++
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_SERVICES;
++
++ result = sock->ops->bind(sock, (struct sockaddr *) &saddr,
++ sizeof(saddr));
++ if (result < 0) {
++ log_print("can't bind socket %d", result);
++ goto fail_release;
++ }
++
++ result = kcl_register_read_callback(sm_socket, sm_cluster_message);
++ if (result < 0) {
++ log_print("can't register read callback %d", result);
++ goto fail_release;
++ }
++
++ sm_new_nodeids = (uint32_t *) kmalloc(cman_config.max_nodes *
++ sizeof(uint32_t),
++ GFP_KERNEL);
++ if (!sm_new_nodeids) {
++ log_print("can't allocate nodeid array");
++ goto fail_release;
++ }
++
++ start_serviced();
++
++ /* cnxman should call sm_member_update() once we've joined - then we
++ * can get our first list of members and our own nodeid */
++
++ return;
++
++ fail_release:
++ sock_release(sm_socket);
++ sm_socket = NULL;
++
++ fail:
++ return;
++}
++
++/*
++ * Context: cnxman
++ * Called before cnxman leaves the cluster. If this returns an error to cman,
++ * cman should not leave the cluster but return EBUSY.
++ * If force is set we go away anyway; cman knows best in this case.
++ */
++
++int sm_stop(int force)
++{
++ struct list_head *head;
++ sm_group_t *sg;
++ sm_node_t *node;
++ int i, busy = FALSE, error = -EBUSY;
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ if (!list_empty(&sm_sg[i])) {
++ sg = list_entry(sm_sg[i].next, sm_group_t, list);
++ log_error(sg, "sm_stop: SG still joined");
++ busy = TRUE;
++ }
++ }
++
++ if (!busy || force) {
++ stop_serviced();
++
++ if (sm_socket)
++ sock_release(sm_socket);
++
++ head = &sm_members;
++ while (!list_empty(head)) {
++ node = list_entry(head->next, sm_node_t, list);
++ list_del(&node->list);
++ sm_member_count--;
++ kfree(node);
++ }
++
++ kfree(sm_new_nodeids);
++ sm_init();
++ error = 0;
++ }
++ return error;
++}
+diff -urN linux-orig/cluster/cman/sm_control.h linux-patched/cluster/cman/sm_control.h
+--- linux-orig/cluster/cman/sm_control.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_control.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_CONTROL_DOT_H__
++#define __SM_CONTROL_DOT_H__
++
++void sm_init(void);
++void sm_start(void);
++int sm_stop(int force);
++void sm_member_update(int quorate);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_daemon.c linux-patched/cluster/cman/sm_daemon.c
+--- linux-orig/cluster/cman/sm_daemon.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_daemon.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,120 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++static unsigned long daemon_flags;
++static struct task_struct * daemon_task;
++static struct completion daemon_done;
++static wait_queue_head_t daemon_wait;
++extern int sm_quorum;
++
++void init_serviced(void)
++{
++ daemon_flags = 0;
++ daemon_task = NULL;
++ init_completion(&daemon_done);
++ init_waitqueue_head(&daemon_wait);
++}
++
++void wake_serviced(int do_flag)
++{
++ set_bit(do_flag, &daemon_flags);
++ wake_up(&daemon_wait);
++}
++
++static inline int got_work(void)
++{
++ int rv = 0;
++
++ rv = (test_bit(DO_START_RECOVERY, &daemon_flags) ||
++ test_bit(DO_MESSAGES, &daemon_flags) ||
++ test_bit(DO_BARRIERS, &daemon_flags) ||
++ test_bit(DO_CALLBACKS, &daemon_flags));
++
++ if (sm_quorum && !rv)
++ rv = (test_bit(DO_JOINLEAVE, &daemon_flags) ||
++ test_bit(DO_RECOVERIES, &daemon_flags) ||
++ test_bit(DO_MEMBERSHIP, &daemon_flags));
++ return rv;
++}
++
++static int serviced(void *arg)
++{
++ DECLARE_WAITQUEUE(wait, current);
++
++ daemonize("cman_serviced");
++ daemon_task = current;
++ set_bit(DO_RUN, &daemon_flags);
++ complete(&daemon_done);
++
++ for (;;) {
++ if (test_and_clear_bit(DO_START_RECOVERY, &daemon_flags))
++ process_nodechange();
++
++ if (test_and_clear_bit(DO_MESSAGES, &daemon_flags))
++ process_messages();
++
++ if (test_and_clear_bit(DO_BARRIERS, &daemon_flags))
++ process_barriers();
++
++ if (test_and_clear_bit(DO_CALLBACKS, &daemon_flags))
++ process_callbacks();
++
++ if (sm_quorum) {
++ if (test_and_clear_bit(DO_RECOVERIES, &daemon_flags))
++ process_recoveries();
++
++ if (test_and_clear_bit(DO_JOINLEAVE, &daemon_flags))
++ process_joinleave();
++
++ if (test_and_clear_bit(DO_MEMBERSHIP, &daemon_flags))
++ process_membership();
++ }
++
++ if (!test_bit(DO_RUN, &daemon_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ add_wait_queue(&daemon_wait, &wait);
++ if (!got_work() && test_bit(DO_RUN, &daemon_flags))
++ schedule();
++ remove_wait_queue(&daemon_wait, &wait);
++ current->state = TASK_RUNNING;
++ }
++
++ complete(&daemon_done);
++ return 0;
++}
++
++int start_serviced(void)
++{
++ int error;
++
++ error = kernel_thread(serviced, NULL, 0);
++ if (error < 0)
++ goto out;
++
++ error = 0;
++ wait_for_completion(&daemon_done);
++
++ out:
++ return error;
++}
++
++void stop_serviced(void)
++{
++ clear_bit(DO_RUN, &daemon_flags);
++ wake_up(&daemon_wait);
++ wait_for_completion(&daemon_done);
++}
+diff -urN linux-orig/cluster/cman/sm_daemon.h linux-patched/cluster/cman/sm_daemon.h
+--- linux-orig/cluster/cman/sm_daemon.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_daemon.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,32 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_DAEMON_DOT_H__
++#define __SM_DAEMON_DOT_H__
++
++#define DO_RUN (0)
++#define DO_START_RECOVERY (1)
++#define DO_MESSAGES (2)
++#define DO_BARRIERS (3)
++#define DO_CALLBACKS (4)
++#define DO_JOINLEAVE (5)
++#define DO_RECOVERIES (6)
++#define DO_MEMBERSHIP (7)
++#define DO_RESET (8)
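++
++/*
++ * Each flag wakes the serviced daemon to run the corresponding handler,
++ * e.g. DO_BARRIERS -> process_barriers(), DO_JOINLEAVE ->
++ * process_joinleave() (see serviced() in sm_daemon.c).
++ */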
++
++void init_serviced(void);
++void wake_serviced(int do_flag);
++void stop_serviced(void);
++int start_serviced(void);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_internal.h linux-patched/cluster/cman/sm_internal.h
+--- linux-orig/cluster/cman/sm_internal.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_internal.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,230 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_INTERNAL_DOT_H__
++#define __SM_INTERNAL_DOT_H__
++
++/*
++ * Any header files needed by this file should be included before it in sm.h.
++ * This file should only be included by sm.h.
++ */
++
++struct sm_group;
++struct sm_sevent;
++struct sm_uevent;
++struct sm_node;
++struct sm_msg;
++
++typedef struct sm_group sm_group_t;
++typedef struct sm_sevent sm_sevent_t;
++typedef struct sm_uevent sm_uevent_t;
++typedef struct sm_node sm_node_t;
++typedef struct sm_msg sm_msg_t;
++
++
++/*
++ * Number of seconds to wait before trying again to join or leave an SG
++ */
++#define RETRY_DELAY (2)
++
++
++/*
++ * Service Event - what a node uses to join or leave an sg
++ */
++
++/* SE Flags */
++#define SEFL_CHECK (0)
++#define SEFL_ALLOW_JOIN (1)
++#define SEFL_ALLOW_JSTOP (2)
++#define SEFL_ALLOW_LEAVE (3)
++#define SEFL_ALLOW_LSTOP (4)
++#define SEFL_ALLOW_STARTDONE (5)
++#define SEFL_ALLOW_BARRIER (6)
++#define SEFL_DELAY (7)
++#define SEFL_LEAVE (8)
++#define SEFL_CANCEL (9)
++
++/* SE States */
++#define SEST_JOIN_BEGIN (1)
++#define SEST_JOIN_ACKWAIT (2)
++#define SEST_JOIN_ACKED (3)
++#define SEST_JSTOP_ACKWAIT (4)
++#define SEST_JSTOP_ACKED (5)
++#define SEST_JSTART_SERVICEWAIT (6)
++#define SEST_JSTART_SERVICEDONE (7)
++#define SEST_BARRIER_WAIT (8)
++#define SEST_BARRIER_DONE (9)
++#define SEST_LEAVE_BEGIN (10)
++#define SEST_LEAVE_ACKWAIT (11)
++#define SEST_LEAVE_ACKED (12)
++#define SEST_LSTOP_ACKWAIT (13)
++#define SEST_LSTOP_ACKED (14)
++#define SEST_LSTART_WAITREMOTE (15)
++#define SEST_LSTART_REMOTEDONE (16)
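++
++/*
++ * A successful join walks these states roughly in order:
++ *
++ *   JOIN_BEGIN -> JOIN_ACKWAIT -> JOIN_ACKED -> JSTOP_ACKWAIT ->
++ *   JSTOP_ACKED -> JSTART_SERVICEWAIT -> JSTART_SERVICEDONE ->
++ *   BARRIER_WAIT -> BARRIER_DONE
++ *
++ * (driven by process_join_sevent in sm_joinleave.c).
++ */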
++
++struct sm_sevent {
++ struct list_head se_list;
++ unsigned int se_id;
++ sm_group_t * se_sg;
++ unsigned long se_flags;
++ unsigned int se_state;
++
++ int se_node_count;
++ int se_memb_count;
++ int se_reply_count;
++
++ uint32_t * se_node_ids;
++ char * se_node_status;
++ int se_len_ids; /* length of node_ids */
++ int se_len_status; /* length of node_status */
++
++ int se_barrier_status;
++ struct timer_list se_restart_timer;
++};
++
++/*
++ * Update Event - what an sg member uses to respond to an sevent
++ */
++
++/* UE Flags */
++#define UEFL_ALLOW_STARTDONE (0)
++#define UEFL_ALLOW_BARRIER (1)
++#define UEFL_CANCEL (2)
++#define UEFL_LEAVE (3)
++#define UEFL_CHECK (4)
++
++/* UE States */
++#define UEST_JSTOP (1)
++#define UEST_JSTART_WAITCMD (2)
++#define UEST_JSTART (3)
++#define UEST_JSTART_SERVICEWAIT (4)
++#define UEST_JSTART_SERVICEDONE (5)
++#define UEST_BARRIER_WAIT (6)
++#define UEST_BARRIER_DONE (7)
++#define UEST_LSTOP (8)
++#define UEST_LSTART_WAITCMD (9)
++#define UEST_LSTART (10)
++#define UEST_LSTART_SERVICEWAIT (11)
++#define UEST_LSTART_SERVICEDONE (12)
++
++struct sm_uevent {
++ unsigned int ue_state;
++ unsigned long ue_flags;
++ uint32_t ue_id;
++ uint32_t ue_nodeid;
++ int ue_num_nodes;
++ int ue_barrier_status;
++ uint16_t ue_remote_seid;
++};
++
++/*
++ * Service Group
++ */
++
++#define RECOVER_NONE (0)
++#define RECOVER_STOP (1)
++#define RECOVER_START (2)
++#define RECOVER_STARTDONE (3)
++#define RECOVER_BARRIERWAIT (4)
++#define RECOVER_BARRIERDONE (5)
++
++/* SG Flags */
++#define SGFL_SEVENT (1)
++#define SGFL_UEVENT (2)
++#define SGFL_NEED_RECOVERY (3)
++
++/* SG States */
++#define SGST_NONE (0)
++#define SGST_JOIN (1)
++#define SGST_RUN (2)
++#define SGST_RECOVER (3)
++#define SGST_UEVENT (4)
++
++struct sm_group {
++ struct list_head list; /* list of sg's */
++ uint16_t level;
++ uint32_t local_id;
++ uint32_t global_id;
++ unsigned long flags;
++ int state;
++ int refcount; /* references from reg/unreg */
++ void * service_data; /* data from the service */
++ struct kcl_service_ops *ops; /* ops from the service */
++ struct completion event_comp;
++
++ struct list_head memb; /* Membership List for RC */
++ int memb_count; /* number of nodes in memb */
++ struct list_head joining; /* nodes joining the sg */
++ sm_sevent_t * sevent;
++ sm_uevent_t uevent;
++
++ int recover_state;
++ int recover_stop;
++ struct list_head recover_list; /* recovery event list */
++ void * recover_data;
++ char recover_barrier[MAX_BARRIER_NAME_LEN];
++
++ int namelen;
++ char name[1]; /* must be last field */
++};
++
++/*
++ * Service Message
++ */
++
++/* SMSG Type */
++#define SMSG_JOIN_REQ (1)
++#define SMSG_JOIN_REP (2)
++#define SMSG_JSTOP_REQ (3)
++#define SMSG_JSTOP_REP (4)
++#define SMSG_JSTART_CMD (5)
++#define SMSG_LEAVE_REQ (6)
++#define SMSG_LEAVE_REP (7)
++#define SMSG_LSTOP_REQ (8)
++#define SMSG_LSTOP_REP (9)
++#define SMSG_LSTART_CMD (10)
++#define SMSG_LSTART_DONE (11)
++#define SMSG_RECOVER (12)
++
++/* SMSG Status */
++#define STATUS_POS (1)
++#define STATUS_NEG (2)
++#define STATUS_WAIT (3)
++
++struct sm_msg {
++ uint8_t ms_type;
++ uint8_t ms_status;
++ uint16_t ms_sevent_id;
++ uint32_t ms_global_sgid;
++ uint32_t ms_global_lastid;
++ uint16_t ms_sglevel;
++ uint16_t ms_length;
++ /* buf of ms_length bytes follows */
++};
++
++/*
++ * Node structure
++ */
++
++#define SNFL_NEED_RECOVERY (0)
++#define SNFL_CLUSTER_MEMBER (1)
++#define SNFL_LEAVING (2)
++
++struct sm_node {
++ struct list_head list;
++ uint32_t id; /* node id from cnxman */
++ unsigned long flags;
++ int incarnation; /* node incarnation number */
++};
++
++#endif /* __SM_INTERNAL_DOT_H__ */
+diff -urN linux-orig/cluster/cman/sm_joinleave.c linux-patched/cluster/cman/sm_joinleave.c
+--- linux-orig/cluster/cman/sm_joinleave.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_joinleave.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,1286 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++/*
++ * Routines used by nodes that are joining or leaving a SG. These "sevent"
++ * routines initiate membership changes to a SG. Existing SG members respond
++ * using the "uevent" membership update routines.
++ */
++
++extern uint32_t sm_our_nodeid;
++extern struct list_head sm_members;
++static struct list_head new_event;
++static spinlock_t new_event_lock;
++static struct list_head joinleave_events;
++
++void init_joinleave(void)
++{
++ INIT_LIST_HEAD(&new_event);
++ spin_lock_init(&new_event_lock);
++ INIT_LIST_HEAD(&joinleave_events);
++}
++
++void new_joinleave(sm_sevent_t *sev)
++{
++ spin_lock(&new_event_lock);
++ list_add_tail(&sev->se_list, &new_event);
++ spin_unlock(&new_event_lock);
++ wake_serviced(DO_JOINLEAVE);
++}
++
++sm_sevent_t *find_sevent(unsigned int id)
++{
++ sm_sevent_t *sev;
++
++ list_for_each_entry(sev, &joinleave_events, se_list) {
++ if (sev->se_id == id)
++ return sev;
++ }
++ return NULL;
++}
++
++static void release_sevent(sm_sevent_t *sev)
++{
++ if (sev->se_len_ids) {
++ kfree(sev->se_node_ids);
++ sev->se_node_ids = NULL;
++ }
++
++ if (sev->se_len_status) {
++ kfree(sev->se_node_status);
++ sev->se_node_status = NULL;
++ }
++
++ sev->se_node_count = 0;
++ sev->se_memb_count = 0;
++ sev->se_reply_count = 0;
++}
++
++static int init_sevent(sm_sevent_t *sev)
++{
++ sm_node_t *node;
++ int len1, len2, count, cluster_members = 0;
++
++ /* clear state from any previous attempt */
++ release_sevent(sev);
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
++ cluster_members++;
++ }
++
++ sev->se_node_count = cluster_members;
++ sev->se_memb_count = sev->se_sg->memb_count;
++
++ /*
++ * When joining, we need a node array the size of the entire cluster
++ * member list because we get responses from all nodes. When leaving,
++ * we only get responses from SG members, so the node array need only
++ * be that large.
++ */
++
++ if (sev->se_state < SEST_LEAVE_BEGIN)
++ count = sev->se_node_count;
++ else
++ count = sev->se_memb_count;
++
++ len1 = count * sizeof(uint32_t);
++ sev->se_len_ids = len1;
++
++ sev->se_node_ids = (uint32_t *) kmalloc(len1, GFP_KERNEL);
++ if (!sev->se_node_ids)
++ goto fail;
++
++ len2 = count * sizeof (char);
++ sev->se_len_status = len2;
++
++ sev->se_node_status = (char *) kmalloc(len2, GFP_KERNEL);
++ if (!sev->se_node_status)
++ goto fail_free;
++
++ memset(sev->se_node_status, 0, len2);
++ memset(sev->se_node_ids, 0, len1);
++
++ return 0;
++
++ fail_free:
++ kfree(sev->se_node_ids);
++ sev->se_node_ids = NULL;
++ sev->se_len_ids = 0;
++
++ fail:
++ return -ENOMEM;
++}
++
++/* Context: timer */
++
++static void sev_restart(unsigned long data)
++{
++ sm_sevent_t *sev = (sm_sevent_t *) data;
++
++ clear_bit(SEFL_DELAY, &sev->se_flags);
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++}
++
++static void schedule_sev_restart(sm_sevent_t *sev)
++{
++ init_timer(&sev->se_restart_timer);
++ sev->se_restart_timer.function = sev_restart;
++ sev->se_restart_timer.data = (long) sev;
++ mod_timer(&sev->se_restart_timer, jiffies + (RETRY_DELAY * HZ));
++}
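++
++/*
++ * A join or leave cancelled because another node is mid-join/leave is
++ * retried this way RETRY_DELAY (2) seconds later.
++ */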
++
++void free_sg_memb(sm_group_t *sg)
++{
++ sm_node_t *node;
++
++ while (!list_empty(&sg->memb)) {
++ node = list_entry(sg->memb.next, sm_node_t, list);
++ list_del(&node->list);
++ kfree(node);
++ }
++ sg->memb_count = 0;
++}
++
++/*
++ * 1. First step in joining a SG - send a message to all nodes in the cluster
++ * asking to join the named SG. If any nodes are members they will reply with
++ * a POS, or a WAIT (wait means try again, only one node can join at a time).
++ * If no one knows about this SG, they all send NEG replies which means we form
++ * the SG with just ourself as a member.
++ */
++
++static int send_join_notice(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ sm_node_t *node;
++ char *msg;
++ int i = 0, error, namelen, len = 0;
++
++ /*
++ * Create node array from member list in which to collect responses.
++ */
++
++ error = init_sevent(sev);
++ if (error)
++ goto out;
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
++ sev->se_node_ids[i++] = node->id;
++ }
++
++ /*
++ * Create and send a join request message.
++ *
++ * Other nodes then run process_join_request and reply to us; we
++ * collect the responses in process_reply and check them in
++ * check_join_notice.
++ */
++
++ namelen = sg->namelen;
++ msg = create_smsg(sg, SMSG_JOIN_REQ, namelen, &len, sev);
++ memcpy(msg + sizeof(sm_msg_t), sg->name, namelen);
++
++ error = send_broadcast_message_sev(msg, len, sev);
++
++ out:
++ return error;
++}
++
++/*
++ * 2. Second step in joining a SG - after we collect all replies to our join
++ * request, we look at them. If anyone told us to wait, we'll wait a while, go
++ * back and start at step 1 again.
++ */
++
++static int check_join_notice(sm_sevent_t *sev)
++{
++ int pos = 0, wait = 0, neg = 0, restart = 0, i, error = 0;
++
++ for (i = 0; i < sev->se_node_count; i++) {
++ switch (sev->se_node_status[i]) {
++ case STATUS_POS:
++ /* this node is in the SG and will be in new proposed
++ * memb list */
++ pos++;
++ break;
++
++ case STATUS_WAIT:
++ /* this node is in the SG but something else is
++ * happening with it at the moment. */
++ wait++;
++ break;
++
++ case STATUS_NEG:
++ /* this node has no record of the SG we're interested
++ * in */
++ neg++;
++
++ if (sev->se_node_ids[i] == sm_our_nodeid)
++ sev->se_node_status[i] = STATUS_POS;
++ break;
++
++ default:
++ /* we didn't get a valid response from this node,
++ * restart the entire sev. */
++ restart++;
++ break;
++ }
++ }
++
++ if (pos && !wait && !restart) {
++ /* all current members of this sg pos'ed our entry */
++ } else if (!pos && !wait && !restart && neg) {
++ /* we're the first in the cluster to join this sg */
++ sev->se_sg->global_id = sm_new_global_id(sev->se_sg->level);
++ } else
++ error = -1;
++
++ return error;
++}
++
++/*
++ * 3. Third step in joining the SG - tell the nodes that are already members
++ * to "stop" the service. We stop them so that everyone can restart with the
++ * new member (us!) added.
++ */
++
++static int send_join_stop(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ sm_node_t *node;
++ char *msg;
++ uint32_t be_count;
++ int i, len = 0, error = 0;
++
++ /*
++ * Form the SG memb list with us in it.
++ */
++
++ for (i = 0; i < sev->se_node_count; i++) {
++ if (sev->se_node_status[i] != STATUS_POS)
++ continue;
++
++ node = sm_new_node(sev->se_node_ids[i]);
++ if (!node)
++ goto fail;
++
++ list_add_tail(&node->list, &sg->memb);
++ sg->memb_count++;
++ }
++
++ /*
++ * Re-init the node vector in which to collect responses again.
++ */
++
++ sev->se_memb_count = sg->memb_count;
++
++ memset(sev->se_node_status, 0, sev->se_len_status);
++ memset(sev->se_node_ids, 0, sev->se_len_ids);
++ i = 0;
++
++ list_for_each_entry(node, &sg->memb, list)
++ sev->se_node_ids[i++] = node->id;
++
++ /*
++ * Create and send a stop message.
++ *
++ * Other nodes then run process_stop_request and process_join_stop and
++ * reply to us. They stop the sg we're trying to join if they agree.
++ * We collect responses in process_reply and check them in
++ * check_join_stop.
++ */
++
++ msg = create_smsg(sg, SMSG_JSTOP_REQ, sizeof(uint32_t), &len, sev);
++ be_count = cpu_to_be32(sg->memb_count);
++ memcpy(msg + sizeof(sm_msg_t), &be_count, sizeof(uint32_t));
++
++ error = send_members_message_sev(sg, msg, len, sev);
++ if (error < 0)
++ goto fail;
++
++ return 0;
++
++ fail:
++ free_sg_memb(sg);
++ return error;
++}
++
++/*
++ * 4. Fourth step in joining the SG - after we collect replies to our stop
++ * request, we look at them. Everyone sending POS agrees with us joining and
++ * has stopped their SG. If some nodes sent NEG, something is wrong and we
++ * don't have a good way to address that yet since some nodes may have sent
++ * POS.
++ *
++ * FIXME: even nodes replying with NEG should stop their SG so we can send an
++ * abort and have everyone at the same place to start from again.
++ */
++
++static int check_join_stop(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ int i, pos = 0, neg = 0;
++
++ for (i = 0; i < sev->se_memb_count; i++) {
++ switch (sev->se_node_status[i]) {
++ case STATUS_POS:
++ pos++;
++ break;
++
++ case STATUS_NEG:
++ log_error(sg, "check_join_stop: neg from nodeid %u "
++ "(%d, %d, %u)", sev->se_node_ids[i],
++ pos, neg, sev->se_memb_count);
++ neg++;
++ break;
++
++ default:
++ log_error(sg, "check_join_stop: unknown status=%u "
++ "nodeid=%u", sev->se_node_status[i],
++ sev->se_node_ids[i]);
++ neg++;
++ break;
++ }
++ }
++
++ if (pos == sg->memb_count)
++ return 0;
++
++ free_sg_memb(sg);
++ return -1;
++}
++
++/*
++ * 5. Fifth step in joining the SG - everyone has stopped their service and we
++ * all now start the service with us, the new member, added to the SG member
++ * list. We send start to our own service here and send a message to the other
++ * members that they should also start their service.
++ */
++
++static int send_join_start(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ sm_node_t *node;
++ uint32_t *memb;
++ char *msg;
++ int error, count = 0, len = 0;
++
++ /*
++ * Create a start message and send it.
++ */
++
++ msg = create_smsg(sg, SMSG_JSTART_CMD, 0, &len, sev);
++
++ error = send_members_message(sg, msg, len);
++ if (error < 0)
++ goto fail;
++
++ /*
++ * Start the service ourself. The chunk of memory with the member ids
++ * must be freed by the service when it is done with it.
++ */
++
++ SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
++ memb);
++
++ list_for_each_entry(node, &sg->memb, list)
++ memb[count++] = node->id;
++
++ set_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
++
++ sg->ops->start(sg->service_data, memb, count, sev->se_id,
++ SERVICE_NODE_JOIN);
++ return 0;
++
++ fail:
++ free_sg_memb(sg);
++ return error;
++}
++
++/*
++ * 6. Sixth step in joining the SG - once the service has completed its start,
++ * it does a kcl_start_done() to signal us that it's done. That gets us here
++ * and we do a barrier with all other members which join the barrier when their
++ * service is done starting.
++ */
++
++static int startdone_barrier_new(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ char bname[MAX_BARRIER_NAME_LEN];
++ int error;
++
++ memset(bname, 0, MAX_BARRIER_NAME_LEN);
++ sev->se_barrier_status = -1;
++
++ set_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
++
++ /* If we're the only member, skip the barrier */
++ if (sg->memb_count == 1) {
++ process_startdone_barrier_new(sg, 0);
++ return 0;
++ }
++
++ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
++ sg->global_id, sm_our_nodeid, sev->se_id, sg->memb_count);
++
++ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE_NEW);
++ if (error)
++ goto fail;
++
++ return 0;
++
++ fail:
++ clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
++ sg->ops->stop(sg->service_data);
++ free_sg_memb(sg);
++ return error;
++}
++
++/*
++ * 7. Seventh step in joining the SG - check that the barrier we joined with
++ * all other members returned with a successful status.
++ */
++
++static int check_startdone_barrier_new(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ int error = sev->se_barrier_status;
++
++ if (error) {
++ sg->ops->stop(sg->service_data);
++ free_sg_memb(sg);
++ }
++ return error;
++}
++
++/*
++ * 8. Eighth step in joining the SG - send the service a "finish" indicating
++ * that all members have successfully started the service.
++ */
++
++static void do_finish_new(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ sg->state = SGST_RUN;
++ sg->sevent = NULL;
++ clear_bit(SGFL_SEVENT, &sg->flags);
++
++ sg->ops->finish(sg->service_data, sev->se_id);
++}
++
++/*
++ * 9. Ninth step in joining the SG - it's done so get rid of the sevent stuff
++ * and tell the process which initiated the join that it's done.
++ */
++
++static void sevent_done(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ list_del(&sev->se_list);
++ release_sevent(sev);
++ kfree(sev);
++ complete(&sg->event_comp);
++}
++
++/*
++ * Move through the steps of a join. Summary:
++ *
++ * 1. Send a join notice to all cluster members.
++ * 2. Collect and check replies to the join notice.
++ * 3. Send a stop message to all SG members.
++ * 4. Collect and check replies to the stop message.
++ * 5. Send a start message to all SG members and start service ourself.
++ * 6. Use barrier to wait for all nodes to complete the start.
++ * 7. Check that all SG members joined the barrier.
++ * 8. Send finish to the service indicating that all nodes started it.
++ * 9. Clean up sevent and signal completion to the process that started the join
++ */
++
++static void process_join_sevent(sm_sevent_t *sev)
++{
++ int error = 0;
++
++ /*
++ * We may cancel the current join attempt if another node is also
++ * attempting to join or leave. (Only a single node can join or leave
++ * at once.) If cancelled, our join attempt will be restarted later.
++ */
++
++ if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
++ error = -1;
++ goto cancel;
++ }
++
++ log_debug(sev->se_sg, "sevent state %u", sev->se_state);
++
++ switch (sev->se_state) {
++
++ /*
++ * An sevent is created in kcl_join_service with a state of
++ * JOIN_BEGIN.
++ */
++
++ case SEST_JOIN_BEGIN:
++ sev->se_state = SEST_JOIN_ACKWAIT;
++ error = send_join_notice(sev);
++ break;
++
++ /*
++ * se_state is changed from JOIN_ACKWAIT to JOIN_ACKED in
++ * process_reply (when all the replies have been received)
++ */
++
++ case SEST_JOIN_ACKED:
++ error = check_join_notice(sev);
++ if (error)
++ break;
++
++ sev->se_state = SEST_JSTOP_ACKWAIT;
++ error = send_join_stop(sev);
++ break;
++
++ /*
++ * se_state is changed from JSTOP_ACKWAIT to JSTOP_ACKED in
++ * process_reply (when all the replies have been received)
++ */
++
++ case SEST_JSTOP_ACKED:
++ error = check_join_stop(sev);
++ if (error)
++ break;
++
++ sev->se_state = SEST_JSTART_SERVICEWAIT;
++ error = send_join_start(sev);
++ break;
++
++ /*
++ * se_state is changed from JSTART_SERVICEWAIT to
++ * JSTART_SERVICEDONE in kcl_start_done
++ */
++
++ case SEST_JSTART_SERVICEDONE:
++ sev->se_state = SEST_BARRIER_WAIT;
++ error = startdone_barrier_new(sev);
++ break;
++
++ /*
++ * se_state is changed from BARRIER_WAIT to BARRIER_DONE in
++ * process_startdone_barrier_new
++ */
++
++ case SEST_BARRIER_DONE:
++ error = check_startdone_barrier_new(sev);
++ if (error)
++ break;
++
++ do_finish_new(sev);
++ sevent_done(sev);
++ break;
++
++ default:
++ log_error(sev->se_sg, "no join processing for state %u",
++ sev->se_state);
++ }
++
++ cancel:
++ if (error) {
++ /* restart the sevent from the beginning */
++ sev->se_state = SEST_JOIN_BEGIN;
++ sev->se_sg->global_id = 0;
++ set_bit(SEFL_DELAY, &sev->se_flags);
++ schedule_sev_restart(sev);
++ }
++}
++
++/*
++ * 1. First step in leaving an SG - send a message to other SG members asking
++ * to leave the SG. Nodes that don't have another active sevent or uevent for
++ * this SG will return POS.
++ */
++
++static int send_leave_notice(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ sm_node_t *node;
++ char *msg;
++ int i = 0, error = -1, len = 0;
++
++ /*
++ * Create a node array from member list in which to collect responses.
++ */
++
++ error = init_sevent(sev);
++ if (error)
++ goto out;
++
++ list_for_each_entry(node, &sg->memb, list)
++ sev->se_node_ids[i++] = node->id;
++
++ /*
++ * Create and send a leave request message.
++ */
++
++ msg = create_smsg(sg, SMSG_LEAVE_REQ, 0, &len, sev);
++
++ error = send_members_message_sev(sg, msg, len, sev);
++
++ out:
++ return error;
++}
++
++/*
++ * 2. Second step in leaving an SG - after we collect all replies to our leave
++ * request, we look at them. If anyone replied with WAIT, we abort our attempt
++ * at leaving and try again in a bit.
++ */
++
++static int check_leave_notice(sm_sevent_t *sev)
++{
++ int pos = 0, wait = 0, neg = 0, restart = 0, i;
++
++ for (i = 0; i < sev->se_memb_count; i++) {
++ switch (sev->se_node_status[i]) {
++ case STATUS_POS:
++ pos++;
++ break;
++
++ case STATUS_WAIT:
++ wait++;
++ break;
++
++ case STATUS_NEG:
++ neg++;
++ break;
++
++ default:
++ /* we didn't get a valid response from this node,
++ * restart the entire sev. */
++ restart++;
++ break;
++ }
++ }
++
++ /* all members approve */
++ if (pos && !wait && !restart)
++ return 0;
++
++ return -1;
++}
++
++/*
++ * 3. Third step in leaving the SG - tell the member nodes to "stop" the SG.
++ * They must be stopped in order to restart without us as a member.
++ */
++
++static int send_leave_stop(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ char *msg;
++ int error, len = 0;
++
++ /*
++ * Re-init the status vector in which to collect responses.
++ */
++
++ memset(sev->se_node_status, 0, sev->se_len_status);
++
++ /*
++ * Create and send a stop message.
++ */
++
++ msg = create_smsg(sg, SMSG_LSTOP_REQ, 0, &len, sev);
++
++ error = send_members_message_sev(sg, msg, len, sev);
++ if (error < 0)
++ goto out;
++
++ /*
++ * we and all others stop the SG now
++ */
++
++ sg->ops->stop(sg->service_data);
++
++ out:
++ return error;
++}
++
++/*
++ * 4. Fourth step in leaving the SG - check the replies to our stop request.
++ * Same problem with getting different replies as check_join_stop.
++ */
++
++static int check_leave_stop(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ int i, pos = 0, neg = 0;
++
++ for (i = 0; i < sev->se_memb_count; i++) {
++ switch (sev->se_node_status[i]) {
++ case STATUS_POS:
++ pos++;
++ break;
++
++ case STATUS_NEG:
++ log_error(sg, "check_leave_stop: fail from nodeid %u "
++ "(%d, %d, %u)", sev->se_node_ids[i],
++ pos, neg, sev->se_memb_count);
++ neg++;
++ break;
++
++ default:
++ log_error(sg, "check_leave_stop: status %u nodeid %u",
++ sev->se_node_status[i], sev->se_node_ids[i]);
++ neg++;
++ break;
++ }
++ }
++
++ if (pos == sg->memb_count)
++ return 0;
++
++ return -1;
++}
++
++/*
++ * 5. Fifth step in leaving the SG - tell the other SG members to restart the
++ * service without us. We, of course, don't start our own stopped service. If
++ * we're the last SG member and leaving, we jump right to the next step.
++ */
++
++static int send_leave_start(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ char *msg;
++ int error = 0, len = 0;
++
++ if (sg->memb_count == 1) {
++ sev->se_state = SEST_LSTART_REMOTEDONE;
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ } else {
++ msg = create_smsg(sg, SMSG_LSTART_CMD, 0, &len, sev);
++ error = send_members_message(sg, msg, len);
++ }
++ return error;
++}
++
++/*
++ * Move through the steps of a leave. Summary:
++ *
++ * 1. Send a leave notice to all SG members.
++ * 2. Collect and check replies to the leave notice.
++ * 3. Send a stop message to all SG members and stop our own SG.
++ * 4. Collect and check replies to the stop message.
++ * 5. Send a start message to SG members.
++ * 6. Clean up sevent and signal completion to the process that
++ * started the leave.
++ */
++
++static void process_leave_sevent(sm_sevent_t *sev)
++{
++ int error = 0;
++
++ /*
++ * We may cancel the current leave attempt if another node is also
++ * attempting to join or leave. (Only a single node can join or leave
++ * at once.) Our leave attempt will be restarted after being
++ * cancelled.
++ */
++
++ if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
++ error = 1;
++ goto cancel;
++ }
++
++ if (test_bit(SGFL_UEVENT, &sev->se_sg->flags)) {
++ error = 2;
++ goto cancel;
++ }
++
++ if (!list_empty(&sev->se_sg->joining)) {
++ error = 3;
++ goto cancel;
++ }
++
++ log_debug(sev->se_sg, "sevent state %u", sev->se_state);
++
++ switch (sev->se_state) {
++
++ /*
++ * An sevent is created in kcl_leave_service with a state of
++ * LEAVE_BEGIN.
++ */
++
++ case SEST_LEAVE_BEGIN:
++ sev->se_state = SEST_LEAVE_ACKWAIT;
++ error = send_leave_notice(sev);
++ break;
++
++ /*
++ * se_state is changed from LEAVE_ACKWAIT to LEAVE_ACKED in
++ * process_reply (when all the replies have been received)
++ */
++
++ case SEST_LEAVE_ACKED:
++ error = check_leave_notice(sev);
++ if (error)
++ break;
++
++ sev->se_state = SEST_LSTOP_ACKWAIT;
++ error = send_leave_stop(sev);
++ break;
++
++ /*
++ * se_state is changed from LSTOP_ACKWAIT to LSTOP_ACKED in
++ * process_reply
++ */
++
++ case SEST_LSTOP_ACKED:
++ error = check_leave_stop(sev);
++ if (error)
++ break;
++
++ sev->se_state = SEST_LSTART_WAITREMOTE;
++ error = send_leave_start(sev);
++ break;
++
++ /*
++ * se_state is changed from LSTART_WAITREMOTE to
++ * LSTART_REMOTEDONE in process_leave_done
++ */
++
++ case SEST_LSTART_REMOTEDONE:
++ sevent_done(sev);
++ break;
++
++ default:
++ log_error(sev->se_sg, "process_leave_sevent state=%u\n",
++ sev->se_state);
++ }
++
++ cancel:
++ if (error) {
++ /* restart the sevent from the beginning */
++ sev->se_state = SEST_LEAVE_BEGIN;
++ set_bit(SEFL_DELAY, &sev->se_flags);
++ schedule_sev_restart(sev);
++ }
++}
++
++/*
++ * Sevent backout code. Take appropriate steps when a recovery occurs while
++ * we're in the midst of an sevent. The recovery may or may not affect the
++ * sevent. If it does, it usually means cancelling the sevent and restarting
++ * it from the beginning once the recovery processing is done.
++ */
++
++/*
++ * If any of the nodes that replied with OK is dead, we give up on the current
++ * join attempt and restart. Otherwise, this sevent can continue.
++ */
++
++static int backout_join_acked(sm_sevent_t *sev)
++{
++ sm_node_t *node;
++ int i;
++
++ for (i = 0; i < sev->se_node_count; i++) {
++ if (sev->se_node_status[i] != STATUS_POS)
++ continue;
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags) &&
++ (node->id == sev->se_node_ids[i]))
++ return TRUE;
++ }
++ }
++ return FALSE;
++}
++
++/*
++ * In this state our sg member list exists and mark_affected_sgs() will have
++ * set NEED_RECOVERY if any of the nodes in the sg we're joining is dead. We
++ * restart the join process if this is the case, otherwise this sevent can
++ * continue.
++ */
++
++static int backout_jstop_ackwait(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ clear_bit(SEFL_ALLOW_JSTOP, &sev->se_flags);
++ free_sg_memb(sg);
++ return TRUE;
++}
++
++/*
++ * Same as previous.
++ */
++
++static int backout_jstop_acked(sm_sevent_t *sev)
++{
++ return backout_jstop_ackwait(sev);
++}
++
++/*
++ * If NEED_RECOVERY is set a member of the sg we're joining died while we were
++ * starting our service. The recovery process will restart the service on all
++ * the prior sg members (not including those that died or us). We will
++ * reattempt our join which should be accepted once the nodes are done with
++ * recovery.
++ */
++
++static int backout_jstart_servicewait(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
++ sg->ops->stop(sg->service_data);
++ free_sg_memb(sg);
++ return TRUE;
++}
++
++/*
++ * Same as previous.
++ */
++
++static int backout_jstart_servicedone(sm_sevent_t *sev)
++{
++ return backout_jstart_servicewait(sev);
++}
++
++/*
++ * If NEED_RECOVERY is set a member of the sg we're joining died while we were
++ * waiting on the "all done" barrier. Stop our service that we just started
++ * and cancel the barrier. The recovery process will restart the service on
++ * all the prior sg members (not including those that died or us). We will
++ * reattempt our join which should be accepted once the nodes are done with
++ * recovery.
++ */
++
++static int backout_barrier_wait(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ char bname[MAX_BARRIER_NAME_LEN];
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
++
++ sg->ops->stop(sg->service_data);
++
++ memset(bname, 0, MAX_BARRIER_NAME_LEN);
++ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
++ sg->global_id, sm_our_nodeid, sev->se_id,
++ sg->memb_count);
++ kcl_barrier_cancel(bname);
++
++ free_sg_memb(sg);
++ return TRUE;
++}
++
++/*
++ * If NEED_RECOVERY is set, a member of the sg we just joined has failed. The
++ * recovery began after the barrier callback. If the result in the callback is
++ * "success" then we are joined, this sevent is finished and we'll process the
++ * sg within the forthcoming recovery with the other members.
++ *
++ * We rely upon cnxman to guarantee that once all nodes have joined a barrier,
++ * all nodes will receive the corresponding barrier callback *before any*
++ * receive an sm_member_update() due to one of those nodes failing just after
++ * joining the barrier. If some nodes receive the sm_member_update() before
++ * the barrier callback and others receive the barrier callback before the
++ * sm_member_update() then they will disagree as to whether the node joining/
++ * leaving is in/out of the sg.
++ */
++
++static int backout_barrier_done(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ if (!sev->se_barrier_status) {
++ do_finish_new(sev);
++ sevent_done(sev);
++ return FALSE;
++ } else {
++ sg->ops->stop(sg->service_data);
++ free_sg_memb(sg);
++ return TRUE;
++ }
++}
++
++/*
++ * We've done nothing yet, just restart when recovery is done (if sg is flagged
++ * with recovery.)
++ */
++
++static int backout_leave_begin(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ return TRUE;
++}
++
++/*
++ * Ignore any replies to our leave notice and restart when recovery is done (if
++ * sg is flagged with recovery.)
++ */
++
++static int backout_leave_ackwait(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ clear_bit(SEFL_ALLOW_LEAVE, &sev->se_flags);
++
++ return TRUE;
++}
++
++/*
++ * Same as previous.
++ */
++
++static int backout_leave_acked(sm_sevent_t *sev)
++{
++ return backout_leave_ackwait(sev);
++}
++
++/*
++ * Ignore any stop replies. All the members will be stopped anyway to do the
++ * recovery. Let that happen and restart our leave when done.
++ */
++
++static int backout_lstop_ackwait(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ clear_bit(SEFL_ALLOW_LSTOP, &sev->se_flags);
++
++ return TRUE;
++}
++
++/*
++ * Same as previous.
++ */
++
++static int backout_lstop_acked(sm_sevent_t *sev)
++{
++ return backout_lstop_ackwait(sev);
++}
++
++/*
++ * All members will be stopped due to recovery and restarted by recovery
++ * processing. That includes us, we have to retry the leave once the recovery
++ * is done.
++ */
++
++static int backout_lstart_waitremote(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ return TRUE;
++}
++
++/*
++ * Reset an sevent to its beginning so it can be restarted. This is necessary
++ * when recovery affects an SG while we're trying to join or leave (ie. a node
++ * in the SG fails).
++ */
++
++void backout_sevents(void)
++{
++ sm_sevent_t *sev, *safe;
++ int delay;
++
++ list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
++
++ delay = FALSE;
++
++ log_debug(sev->se_sg, "backout sevent state %u", sev->se_state);
++
++ switch (sev->se_state) {
++
++ /* backout after kcl_join_service and before
++ * send_join_notice */
++ case SEST_JOIN_BEGIN:
++ break;
++
++ /* backout after send_join_notice and before final
++ * process_reply */
++ case SEST_JOIN_ACKWAIT:
++ clear_bit(SEFL_ALLOW_JOIN, &sev->se_flags);
++ sev->se_state = SEST_JOIN_BEGIN;
++ schedule_sev_restart(sev);
++ break;
++
++ /* backout after final process_reply and before
++ * check_join_notice */
++ case SEST_JOIN_ACKED:
++ delay = backout_join_acked(sev);
++ break;
++
++ /* backout after send_join_stop and before final
++ * process_reply */
++ case SEST_JSTOP_ACKWAIT:
++ delay = backout_jstop_ackwait(sev);
++ break;
++
++ /* backout after final process_reply and before
++ * check_join_stop */
++ case SEST_JSTOP_ACKED:
++ delay = backout_jstop_acked(sev);
++ break;
++
++ /* backout after send_join_start and before
++ * kcl_start_done */
++ case SEST_JSTART_SERVICEWAIT:
++ delay = backout_jstart_servicewait(sev);
++ break;
++
++ /* backout after kcl_start_done and before
++ * startdone_barrier_new */
++ case SEST_JSTART_SERVICEDONE:
++ delay = backout_jstart_servicedone(sev);
++ break;
++
++ /* backout after startdone_barrier_new and before
++ * callback_startdone_barrier_new */
++ case SEST_BARRIER_WAIT:
++ delay = backout_barrier_wait(sev);
++ break;
++
++ /* backout after callback_startdone_barrier_new and
++ * before check_startdone_barrier_new */
++ case SEST_BARRIER_DONE:
++ delay = backout_barrier_done(sev);
++ break;
++
++ /* backout after kcl_leave_service and before
++ * send_leave_notice */
++ case SEST_LEAVE_BEGIN:
++ delay = backout_leave_begin(sev);
++ break;
++
++ /* backout after send_leave_notice and before final
++ * process_reply */
++ case SEST_LEAVE_ACKWAIT:
++ delay = backout_leave_ackwait(sev);
++ break;
++
++ /* backout after final process_reply and before
++ * check_leave_notice */
++ case SEST_LEAVE_ACKED:
++ delay = backout_leave_acked(sev);
++ break;
++
++ /* backout after send_leave_stop and before final
++ * process_reply */
++ case SEST_LSTOP_ACKWAIT:
++ delay = backout_lstop_ackwait(sev);
++ break;
++
++ /* backout after final process_reply and before
++ * check_leave_stop */
++ case SEST_LSTOP_ACKED:
++ delay = backout_lstop_acked(sev);
++ break;
++
++ /* backout after send_leave_start and before
++ * process_lstart_done */
++ case SEST_LSTART_WAITREMOTE:
++ delay = backout_lstart_waitremote(sev);
++ break;
++
++ /* backout after process_lstart_done and before
++ * process_leave_sevent */
++ case SEST_LSTART_REMOTEDONE:
++ sevent_done(sev);
++ delay = FALSE;
++ break;
++
++ default:
++ log_error(sev->se_sg, "backout_sevents: bad state %d",
++ sev->se_state);
++ }
++
++ if (delay) {
++ set_bit(SEFL_DELAY, &sev->se_flags);
++
++ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
++ sev->se_state = SEST_LEAVE_BEGIN;
++ /* The DELAY flag will be cleared once recovery
++ * is done allowing the leave to be retried. */
++ } else {
++ sev->se_state = SEST_JOIN_BEGIN;
++ /* restart timer function will clear DELAY */
++ schedule_sev_restart(sev);
++ }
++ }
++ }
++}
++
++void process_joinleave(void)
++{
++ sm_sevent_t *sev = NULL, *safe;
++
++ spin_lock(&new_event_lock);
++ if (!list_empty(&new_event)) {
++ sev = list_entry(new_event.next, sm_sevent_t, se_list);
++ list_del(&sev->se_list);
++ list_add_tail(&sev->se_list, &joinleave_events);
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ }
++ spin_unlock(&new_event_lock);
++
++ list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
++ if (!test_and_clear_bit(SEFL_CHECK, &sev->se_flags))
++ continue;
++
++ if (test_bit(SEFL_DELAY, &sev->se_flags))
++ continue;
++
++ if (sev->se_state < SEST_LEAVE_BEGIN)
++ process_join_sevent(sev);
++ else
++ process_leave_sevent(sev);
++ }
++}
+diff -urN linux-orig/cluster/cman/sm_joinleave.h linux-patched/cluster/cman/sm_joinleave.h
+--- linux-orig/cluster/cman/sm_joinleave.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_joinleave.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,23 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_JOINLEAVE_DOT_H__
++#define __SM_JOINLEAVE_DOT_H__
++
++void init_joinleave(void);
++void new_joinleave(sm_sevent_t *sev);
++void process_joinleave(void);
++void backout_sevents(void);
++sm_sevent_t *find_sevent(unsigned int id);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_membership.c linux-patched/cluster/cman/sm_membership.c
+--- linux-orig/cluster/cman/sm_membership.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_membership.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,696 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++extern struct list_head sm_members;
++
++/*
++ * Routines for SG members to handle other nodes joining or leaving the SG.
++ * These "uevent" membership update routines are the response to an "sevent" on
++ * a joining/leaving node.
++ */
++
++static void del_memb_node(sm_group_t *sg, uint32_t nodeid)
++{
++ sm_node_t *node;
++
++ list_for_each_entry(node, &sg->memb, list) {
++ if (node->id != nodeid)
++ continue;
++ list_del(&node->list);
++ kfree(node);
++ sg->memb_count--;
++ log_debug(sg, "del node %u count %d", nodeid, sg->memb_count);
++ break;
++ }
++}
++
++static void add_memb_node(sm_group_t *sg, sm_node_t *node)
++{
++ list_add_tail(&node->list, &sg->memb);
++ sg->memb_count++;
++ log_debug(sg, "add node %u count %d", node->id, sg->memb_count);
++}
++
++/*
++ * Join 1. The receive end of send_join_stop() from a node requesting to join
++ * the SG. We stop the service so it can be restarted with the new node.
++ */
++
++static int process_join_stop(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ sm_node_t *node;
++ sm_msg_t reply;
++ int error;
++
++ if (uev->ue_num_nodes != sg->memb_count + 1) {
++ log_error(sg, "process_join_stop: bad num nodes %u %u",
++ uev->ue_num_nodes, sg->memb_count);
++ return -1;
++ }
++
++ sm_set_event_id(&uev->ue_id);
++
++ node = sm_find_joiner(sg, uev->ue_nodeid);
++ SM_ASSERT(node,);
++
++ sg->state = SGST_UEVENT;
++ sg->ops->stop(sg->service_data);
++
++ memset(&reply, 0, sizeof(reply));
++ reply.ms_type = SMSG_JSTOP_REP;
++ reply.ms_status = STATUS_POS;
++ reply.ms_sevent_id = uev->ue_remote_seid;
++ smsg_bswap_out(&reply);
++
++ error = send_nodeid_message((char *) &reply, sizeof(reply),
++ uev->ue_nodeid);
++ if (error < 0)
++ return error;
++ return 0;
++}
++
++/*
++ * Join 2. The receive end of send_join_start() from a node joining the SG.
++ * We are re-starting the service with the new member added.
++ */
++
++static int process_join_start(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ sm_node_t *node;
++ uint32_t *memb;
++ int count = 0;
++
++ /* this memory is passed to the service which must free it */
++ SM_RETRY(memb =
++ kmalloc((sg->memb_count + 1) * sizeof(uint32_t), GFP_KERNEL),
++ memb);
++
++ /* transfer joining node from joining list to member list */
++ node = sm_find_joiner(sg, uev->ue_nodeid);
++ SM_ASSERT(node, printk("nodeid=%u\n", uev->ue_nodeid););
++ list_del(&node->list);
++ add_memb_node(sg, node);
++
++ /* the new member list for the service */
++ list_for_each_entry(node, &sg->memb, list)
++ memb[count++] = node->id;
++
++ set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
++
++ sg->ops->start(sg->service_data, memb, count, uev->ue_id,
++ SERVICE_NODE_JOIN);
++ return 0;
++}
++
++/*
++ * Join 3. When done starting their local service, every previous SG member
++ * calls startdone_barrier() and the new/joining member calls
++ * startdone_barrier_new(). The barrier returns when everyone has started
++ * their service and joined the barrier.
++ */
++
++static int startdone_barrier(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ char bname[MAX_BARRIER_NAME_LEN];
++ int error;
++
++ memset(bname, 0, MAX_BARRIER_NAME_LEN);
++ uev->ue_barrier_status = -1;
++
++ set_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
++
++ /* If we're the only member, skip the barrier */
++ if (sg->memb_count == 1) {
++ process_startdone_barrier(sg, 0);
++ return 0;
++ }
++
++ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
++ sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
++ sg->memb_count);
++
++ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE);
++
++ return error;
++}
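++
++/*
++ * The barrier name is built from the same four values on every node: the
++ * sg's global id, the joining/leaving node's id and its sevent id (here
++ * ue_nodeid and ue_remote_seid; sm_our_nodeid and se_id on that node
++ * itself), and the member count, so all members compute the same name,
++ * e.g. "sm.42.3.7.4" (values illustrative).
++ */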
++
++/*
++ * Join 4. Check that the "all started" barrier returned a successful status.
++ * The newly joined member calls check_startdone_barrier_new().
++ */
++
++static int check_startdone_barrier(sm_group_t *sg)
++{
++ int error = sg->uevent.ue_barrier_status;
++ return error;
++}
++
++/*
++ * Join 5. Send the service a "finish" indicating that all members have
++ * successfully started. The newly joined member calls do_finish_new().
++ */
++
++static void do_finish(sm_group_t *sg)
++{
++ sg->state = SGST_RUN;
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ sg->ops->finish(sg->service_data, sg->uevent.ue_id);
++}
++
++/*
++ * Join 6. The uevent is done. If this was a uevent for a node leaving the
++ * SG, then send a final message to the departed node signalling that the
++ * remaining nodes have restarted since it left.
++ */
++
++static void uevent_done(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ sm_msg_t reply;
++
++ if (test_bit(UEFL_LEAVE, &uev->ue_flags)) {
++ memset(&reply, 0, sizeof(reply));
++ reply.ms_type = SMSG_LSTART_DONE;
++ reply.ms_status = STATUS_POS;
++ reply.ms_sevent_id = uev->ue_remote_seid;
++ smsg_bswap_out(&reply);
++ send_nodeid_message((char *) &reply, sizeof(reply),
++ uev->ue_nodeid);
++ }
++ memset(&sg->uevent, 0, sizeof(sm_uevent_t));
++}
++
++/*
++ * Leave 1. The receive end of send_leave_stop() from a node leaving the SG.
++ */
++
++static int process_leave_stop(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ sm_msg_t reply;
++ int error;
++
++ sm_set_event_id(&uev->ue_id);
++
++ sg->state = SGST_UEVENT;
++ sg->ops->stop(sg->service_data);
++
++ memset(&reply, 0, sizeof(reply));
++ reply.ms_type = SMSG_LSTOP_REP;
++ reply.ms_status = STATUS_POS;
++ reply.ms_sevent_id = uev->ue_remote_seid;
++ smsg_bswap_out(&reply);
++
++ error = send_nodeid_message((char *) &reply, sizeof(reply),
++ uev->ue_nodeid);
++ if (error < 0)
++ return error;
++ return 0;
++}
++
++/*
++ * Leave 2. The receive end of send_leave_start() from a node leaving the SG.
++ * We are re-starting the service (naturally, without the node that left).
++ */
++
++static int process_leave_start(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ sm_node_t *node;
++ uint32_t *memb;
++ int count = 0;
++
++ SM_ASSERT(sg->memb_count > 1,
++ printk("memb_count=%u\n", sg->memb_count););
++
++ /* this memory is passed to the service which must free it */
++ SM_RETRY(memb =
++ kmalloc((sg->memb_count - 1) * sizeof(uint32_t), GFP_KERNEL),
++ memb);
++
++ /* remove departed member from sg member list */
++ del_memb_node(sg, uev->ue_nodeid);
++
++ /* build member list to pass to service */
++ list_for_each_entry(node, &sg->memb, list)
++ memb[count++] = node->id;
++
++ /* allow us to accept the start_done callback for this start */
++ set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
++
++ sg->ops->start(sg->service_data, memb, count, uev->ue_id,
++ SERVICE_NODE_LEAVE);
++ return 0;
++}
++
++/*
++ * Move through the steps of another node joining or leaving the SG.
++ */
++
++static void process_one_uevent(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ int error = 0;
++
++ log_debug(sg, "uevent state %u node %u", uev->ue_state, uev->ue_nodeid);
++
++ switch (uev->ue_state) {
++
++ /*
++ * a uevent is initialized with state JSTOP in
++ * process_stop_request
++ */
++
++ case UEST_JSTOP:
++ uev->ue_state = UEST_JSTART_WAITCMD;
++ error = process_join_stop(sg);
++ break;
++
++ /*
++ * ue_state is changed from JSTART_WAITCMD to JSTART in
++ * process_start_request
++ */
++
++ case UEST_JSTART:
++ uev->ue_state = UEST_JSTART_SERVICEWAIT;
++ error = process_join_start(sg);
++ break;
++
++ /*
++ * ue_state is changed from JSTART_SERVICEWAIT to
++ * JSTART_SERVICEDONE in kcl_start_done
++ */
++
++ case UEST_JSTART_SERVICEDONE:
++ uev->ue_state = UEST_BARRIER_WAIT;
++ error = startdone_barrier(sg);
++ break;
++
++ /*
++ * ue_state is changed from BARRIER_WAIT to BARRIER_DONE in
++ * process_startdone_barrier
++ */
++
++ case UEST_BARRIER_DONE:
++ error = check_startdone_barrier(sg);
++ if (error)
++ break;
++
++ do_finish(sg);
++ uevent_done(sg);
++ break;
++
++ /*
++ * a uevent is initialized with state LSTOP in
++ * process_stop_request
++ */
++
++ case UEST_LSTOP:
++ uev->ue_state = UEST_LSTART_WAITCMD;
++ error = process_leave_stop(sg);
++ break;
++
++ /*
++ * a uevent is changed from LSTART_WAITCMD to LSTART in
++ * process_start_request
++ */
++
++ case UEST_LSTART:
++ uev->ue_state = UEST_LSTART_SERVICEWAIT;
++ error = process_leave_start(sg);
++ break;
++
++ /*
++ * a uevent is changed from LSTART_SERVICEWAIT to
++ * LSTART_SERVICEDONE in kcl_start_done
++ */
++
++ case UEST_LSTART_SERVICEDONE:
++ uev->ue_state = UEST_BARRIER_WAIT;
++ error = startdone_barrier(sg);
++ break;
++
++ default:
++ error = -1;
++ }
++
++ /* If we encounter an error during these routines, we do nothing,
++ expecting that a node failure related to this sg will cause a
++ recovery event to arrive and call cancel_one_uevent(). */
++
++ if (error)
++ log_error(sg, "process_one_uevent error %d state %u",
++ error, uev->ue_state);
++}
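++
++/*
++ * Summary of the uevent states for a join, as driven above together with
++ * process_stop_request() and process_start_request() in sm_message.c:
++ *
++ *   JSTOP -> JSTART_WAITCMD -> JSTART -> JSTART_SERVICEWAIT
++ *         -> JSTART_SERVICEDONE -> BARRIER_WAIT -> BARRIER_DONE
++ *
++ * The leave path is identical with LSTOP/LSTART in place of JSTOP/JSTART.
++ */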
++
++static sm_node_t *failed_memb(sm_group_t *sg, int *count)
++{
++ sm_node_t *node, *sm_node, *failed_uev_node = NULL;
++
++ list_for_each_entry(node, &sg->memb, list) {
++
++ sm_node = sm_find_member(node->id);
++ SM_ASSERT(sm_node, );
++
++ if (test_bit(SNFL_NEED_RECOVERY, &sm_node->flags)) {
++ (*count)++;
++ if (node->id == sg->uevent.ue_nodeid)
++ failed_uev_node = sm_node;
++ }
++ }
++ return failed_uev_node;
++}
++
++static void send_recover_msg(sm_group_t *sg)
++{
++ char *msg;
++ int len = 0;
++ msg = create_smsg(sg, SMSG_RECOVER, 0, &len, NULL);
++ send_members_message(sg, msg, len);
++}
++
++static void cancel_barrier(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ char bname[MAX_BARRIER_NAME_LEN];
++
++ clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
++
++ memset(bname, 0, MAX_BARRIER_NAME_LEN);
++ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
++ sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
++ sg->memb_count);
++ kcl_barrier_cancel(bname);
++}
++
++static void cancel_one_uevent(sm_group_t *sg, int *effected)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ int failed_count;
++ sm_node_t *node, *failed_joiner, *failed_leaver;
++
++ log_debug(sg, "cancel uevent state %u node %u", uev->ue_state,
++ uev->ue_nodeid);
++
++ switch (uev->ue_state) {
++
++ case UEST_JSTOP:
++ case UEST_JSTART_WAITCMD:
++ case UEST_JSTART:
++
++ sg->ops->stop(sg->service_data);
++
++ failed_count = 0;
++ failed_joiner = failed_memb(sg, &failed_count);
++ SM_ASSERT(!failed_joiner, );
++
++ node = sm_find_member(uev->ue_nodeid);
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ failed_joiner = node;
++
++ if (!failed_count) {
++ /* only joining node failed */
++ SM_ASSERT(failed_joiner, );
++ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
++ (*effected)++;
++ /* some nodes may not have gotten a JSTOP message
++ in which case this will tell them to begin
++ recovery for this sg. */
++ send_recover_msg(sg);
++
++ } else {
++ /* a member node failed (and possibly joining node, it
++ doesn't matter) */
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++
++ case UEST_JSTART_SERVICEWAIT:
++ case UEST_JSTART_SERVICEDONE:
++
++ clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
++ sg->ops->stop(sg->service_data);
++
++ failed_count = 0;
++ failed_joiner = failed_memb(sg, &failed_count);
++ SM_ASSERT(failed_count, );
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++
++ if (failed_count == 1 && failed_joiner) {
++ /* only joining node failed */
++
++ } else if (failed_count && failed_joiner) {
++ /* joining node and another member failed */
++
++ } else {
++ /* other member failed, joining node still alive */
++ SM_ASSERT(!failed_joiner, );
++ del_memb_node(sg, uev->ue_nodeid);
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++
++ case UEST_LSTOP:
++ case UEST_LSTART_WAITCMD:
++ case UEST_LSTART:
++
++ sg->ops->stop(sg->service_data);
++
++ failed_count = 0;
++ failed_leaver = failed_memb(sg, &failed_count);
++ SM_ASSERT(failed_count, );
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++
++ if (failed_count == 1 && failed_leaver) {
++ /* only leaving node failed */
++
++ } else if (failed_count && failed_leaver) {
++ /* leaving node and another member failed */
++
++ } else {
++ /* other member failed, leaving node still alive */
++ SM_ASSERT(!failed_leaver, );
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++
++ case UEST_LSTART_SERVICEWAIT:
++ case UEST_LSTART_SERVICEDONE:
++
++ clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
++ sg->ops->stop(sg->service_data);
++
++ failed_count = 0;
++ failed_leaver = failed_memb(sg, &failed_count);
++ SM_ASSERT(!failed_leaver, );
++
++ node = sm_find_member(uev->ue_nodeid);
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ failed_leaver = node;
++
++ if (!failed_count) {
++ /* only leaving node failed */
++ SM_ASSERT(failed_leaver, );
++ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
++ (*effected)++;
++
++ } else if (failed_count && failed_leaver) {
++ /* leaving node and another member failed */
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++
++ } else {
++ /* other member failed, leaving node still alive */
++ SM_ASSERT(failed_count, );
++ SM_ASSERT(!failed_leaver, );
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ node = sm_new_node(sg->uevent.ue_nodeid);
++ add_memb_node(sg, node);
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++
++ case UEST_BARRIER_WAIT:
++
++ if (test_bit(UEFL_LEAVE, &uev->ue_flags))
++ goto barrier_wait_leave;
++
++ sg->ops->stop(sg->service_data);
++ cancel_barrier(sg);
++
++ barrier_wait_join:
++
++ failed_count = 0;
++ failed_joiner = failed_memb(sg, &failed_count);
++ SM_ASSERT(failed_count, );
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++
++ if (failed_count == 1 && failed_joiner) {
++ /* only joining node failed */
++
++ } else if (failed_count && failed_joiner) {
++ /* joining node and another member failed */
++
++ } else {
++ /* other member failed, joining node still alive */
++ SM_ASSERT(!failed_joiner, );
++ del_memb_node(sg, uev->ue_nodeid);
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++ barrier_wait_leave:
++
++ failed_count = 0;
++ failed_leaver = failed_memb(sg, &failed_count);
++ SM_ASSERT(!failed_leaver, );
++
++ node = sm_find_member(uev->ue_nodeid);
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ failed_leaver = node;
++
++ if (!failed_count) {
++ /* only leaving node failed */
++ SM_ASSERT(failed_leaver, );
++ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
++ (*effected)++;
++
++ } else if (failed_count && failed_leaver) {
++ /* leaving node and another member failed */
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++
++ } else {
++ /* other member failed, leaving node still alive */
++ SM_ASSERT(failed_count, );
++ SM_ASSERT(!failed_leaver, );
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ node = sm_new_node(sg->uevent.ue_nodeid);
++ add_memb_node(sg, node);
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++
++ case UEST_BARRIER_DONE:
++
++ if (!uev->ue_barrier_status) {
++ do_finish(sg);
++ uevent_done(sg);
++ break;
++ }
++
++ if (test_bit(UEFL_LEAVE, &uev->ue_flags))
++ goto barrier_wait_leave;
++ else
++ goto barrier_wait_join;
++
++
++ default:
++ log_error(sg, "cancel_one_uevent: state %d", uev->ue_state);
++ }
++}
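++
++/*
++ * Each case above separates the same three situations: only the
++ * joining/leaving node died, that node and an existing member died, or
++ * only an existing member died. When just the joining/leaving node died,
++ * the sg has no failed member of its own and was never flagged for
++ * recovery, so it is flagged here and *effected is bumped so the caller
++ * schedules recovery for it.
++ */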
++
++void cancel_uevents(int *effected)
++{
++ sm_group_t *sg;
++ sm_node_t *node, *sgnode;
++ int i;
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ continue;
++
++ /*
++ * Clear this dead node from the "interested in joining" list
++ * of any SG. The node is added to this list before the uevent
++ * begins.
++ */
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &sm_sg[i], list) {
++ sgnode = sm_find_joiner(sg, node->id);
++ if (sgnode) {
++ log_debug(sg, "clear joining node %u",
++ sgnode->id);
++ list_del(&sgnode->list);
++ kfree(sgnode);
++ }
++ }
++ }
++ }
++
++ /* Adjust any uevents in sgs affected by the failed node(s) */
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &sm_sg[i], list) {
++ if (!test_bit(SGFL_UEVENT, &sg->flags))
++ continue;
++
++ /* We may have some cancelling to do if this sg is
++ flagged as having a failed member, or if a joining
++ or leaving node has died. */
++
++ if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ cancel_one_uevent(sg, effected);
++ else if (sg->uevent.ue_nodeid) {
++ node = sm_find_member(sg->uevent.ue_nodeid);
++ SM_ASSERT(node, );
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ cancel_one_uevent(sg, effected);
++ }
++ }
++ }
++}
++
++void process_membership(void)
++{
++ sm_group_t *sg;
++ int i;
++
++ down(&sm_sglock);
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &sm_sg[i], list) {
++ if (!test_bit(SGFL_UEVENT, &sg->flags))
++ continue;
++
++ if (!test_and_clear_bit(UEFL_CHECK,
++ &sg->uevent.ue_flags))
++ continue;
++
++ process_one_uevent(sg);
++ }
++ }
++ up(&sm_sglock);
++}
+diff -urN linux-orig/cluster/cman/sm_membership.h linux-patched/cluster/cman/sm_membership.h
+--- linux-orig/cluster/cman/sm_membership.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_membership.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_MEMBERSHIP_DOT_H__
++#define __SM_MEMBERSHIP_DOT_H__
++
++void process_membership(void);
++void cancel_uevents(int *effected);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_message.c linux-patched/cluster/cman/sm_message.c
+--- linux-orig/cluster/cman/sm_message.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_message.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,867 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++#define SMSG_BUF_SIZE (sizeof(sm_msg_t) + MAX_SERVICE_NAME_LEN + 1)
++
++extern struct socket * sm_socket;
++extern uint32_t sm_our_nodeid;
++static uint32_t global_last_id;
++static struct list_head messages;
++static spinlock_t message_lock;
++static char smsg_buf[SMSG_BUF_SIZE];
++
++int send_nodeid_message(char *msg, int len, uint32_t nodeid);
++
++struct rq_entry {
++ struct list_head list;
++ char *msg;
++ int len;
++ uint32_t nodeid;
++};
++typedef struct rq_entry rq_entry_t;
++
++void init_messages(void)
++{
++ global_last_id = 1;
++ INIT_LIST_HEAD(&messages);
++ spin_lock_init(&message_lock);
++}
++
++uint32_t sm_new_global_id(int level)
++{
++ uint32_t id = global_last_id++;
++ uint8_t l = (uint8_t) level;
++
++ if (level > 255)
++ return 0;
++
++ if (id > 0x00FFFFFF)
++ return 0;
++
++ id |= (l << 24);
++ return id;
++}
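++
++/*
++ * Illustrative example: with global_last_id at 5, a level 2 SG gets
++ * id 5 | (2 << 24) = 0x02000005. The level rides in the top byte and
++ * sm_id_to_level() in sm_misc.c recovers it by shifting back down.
++ */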
++
++static void smsg_copy_in(char *msg, sm_msg_t *smsg)
++{
++ sm_msg_t *in = (sm_msg_t *) msg;
++
++ smsg->ms_type = in->ms_type;
++ smsg->ms_status = in->ms_status;
++ smsg->ms_sevent_id = le16_to_cpu(in->ms_sevent_id);
++ smsg->ms_global_sgid = le32_to_cpu(in->ms_global_sgid);
++ smsg->ms_global_lastid = le32_to_cpu(in->ms_global_lastid);
++ smsg->ms_sglevel = le16_to_cpu(in->ms_sglevel);
++ smsg->ms_length = le16_to_cpu(in->ms_length);
++}
++
++/* swapping bytes in place is an easy source of errors - be careful not to
++ * access the fields after calling this */
++
++void smsg_bswap_out(sm_msg_t *smsg)
++{
++ smsg->ms_sevent_id = cpu_to_le16(smsg->ms_sevent_id);
++ smsg->ms_global_sgid = cpu_to_le32(smsg->ms_global_sgid);
++ smsg->ms_global_lastid = cpu_to_le32(smsg->ms_global_lastid);
++ smsg->ms_sglevel = cpu_to_le16(smsg->ms_sglevel);
++ smsg->ms_length = cpu_to_le16(smsg->ms_length);
++}
++
++char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
++ sm_sevent_t *sev)
++{
++ char *msg;
++ sm_msg_t *smsg;
++ int fulllen = sizeof(sm_msg_t) + datalen;
++
++ msg = smsg_buf;
++ memset(smsg_buf, 0, SMSG_BUF_SIZE);
++ SM_ASSERT(fulllen <= SMSG_BUF_SIZE,);
++
++ smsg = (sm_msg_t *) msg;
++ smsg->ms_type = type;
++ smsg->ms_global_sgid = sg->global_id;
++ smsg->ms_sglevel = sg->level;
++ smsg->ms_length = datalen;
++ smsg->ms_sevent_id = sev ? sev->se_id : 0;
++
++ smsg_bswap_out(smsg);
++ *msglen = fulllen;
++ return msg;
++}
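++
++/*
++ * Note that create_smsg() returns the shared static smsg_buf, so each
++ * message must be sent before the next one is built. This is only safe
++ * because message building and sending happen from the single sm thread
++ * (see the "always called from sm context" notes below).
++ */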
++
++static unsigned int msgtype_to_flag(int type)
++{
++ unsigned int flag;
++
++ switch (type) {
++ case SMSG_JOIN_REP:
++ case SMSG_JOIN_REQ:
++ flag = SEFL_ALLOW_JOIN;
++ break;
++
++ case SMSG_JSTOP_REP:
++ case SMSG_JSTOP_REQ:
++ flag = SEFL_ALLOW_JSTOP;
++ break;
++
++ case SMSG_LEAVE_REP:
++ case SMSG_LEAVE_REQ:
++ flag = SEFL_ALLOW_LEAVE;
++ break;
++
++ case SMSG_LSTOP_REP:
++ case SMSG_LSTOP_REQ:
++ flag = SEFL_ALLOW_LSTOP;
++ break;
++
++ default:
++ SM_ASSERT(0, printk("msgtype_to_flag bad type %d\n", type););
++ }
++ return flag;
++}
++
++static int test_allowed_msgtype(sm_sevent_t * sev, int type)
++{
++ unsigned int flag = msgtype_to_flag(type);
++
++ return test_bit(flag, &sev->se_flags);
++}
++
++static void clear_allowed_msgtype(sm_sevent_t * sev, int type)
++{
++ unsigned int flag = msgtype_to_flag(type);
++
++ clear_bit(flag, &sev->se_flags);
++}
++
++static void set_allowed_msgtype(sm_sevent_t * sev, int type)
++{
++ unsigned int flag = msgtype_to_flag(type);
++
++ set_bit(flag, &sev->se_flags);
++}
++
++static int save_global_id(sm_sevent_t * sev, sm_msg_t * smsg)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!smsg->ms_global_sgid) {
++ log_error(sg, "save_global_id: zero sg id");
++ return -1;
++ }
++
++ if (!sg->global_id)
++ sg->global_id = smsg->ms_global_sgid;
++
++ if (sg->global_id != smsg->ms_global_sgid) {
++ log_error(sg, "save_global_id: id %x", smsg->ms_global_sgid);
++ return -1;
++ }
++ return 0;
++}
++
++static void save_lastid(sm_msg_t * smsg)
++{
++ uint32_t gid = smsg->ms_global_lastid & 0x00FFFFFF;
++
++ /*
++ * Keep track of the highest SG id which has been used
++ * in the cluster in case we need to choose a new SG id.
++ */
++
++ if (gid > global_last_id)
++ global_last_id = gid;
++}
++
++static int next_sev_state(int msg_type, int cur_state)
++{
++ int next = 0;
++
++ switch (msg_type) {
++ case SMSG_JOIN_REP:
++ SM_ASSERT(cur_state == SEST_JOIN_ACKWAIT,);
++ next = SEST_JOIN_ACKED;
++ break;
++
++ case SMSG_JSTOP_REP:
++ SM_ASSERT(cur_state == SEST_JSTOP_ACKWAIT,);
++ next = SEST_JSTOP_ACKED;
++ break;
++
++ case SMSG_LEAVE_REP:
++ SM_ASSERT(cur_state == SEST_LEAVE_ACKWAIT,);
++ next = SEST_LEAVE_ACKED;
++ break;
++
++ case SMSG_LSTOP_REP:
++ SM_ASSERT(cur_state == SEST_LSTOP_ACKWAIT,);
++ next = SEST_LSTOP_ACKED;
++ break;
++ }
++ return next;
++}
++
++/*
++ * Functions in sevent.c send messages to other nodes and then expect replies.
++ * This function collects the replies for the sevent messages and moves the
++ * sevent to the next stage when all the expected replies have been received.
++ */
++
++static void process_reply(sm_msg_t * smsg, uint32_t nodeid)
++{
++ sm_sevent_t *sev;
++ int i, expected, type = smsg->ms_type;
++
++ /*
++ * Find the relevant sevent.
++ */
++
++ sev = find_sevent(smsg->ms_sevent_id);
++ if (!sev) {
++ log_print("process_reply invalid id=%u nodeid=%u",
++ smsg->ms_sevent_id, nodeid);
++ goto out;
++ }
++
++ /*
++ * Check if this message type is what this sevent is waiting for.
++ */
++
++ if (!test_allowed_msgtype(sev, type)) {
++ log_debug(sev->se_sg, "process_reply ignored type=%u nodeid=%u "
++ "id=%u", type, nodeid, sev->se_id);
++ goto out;
++ }
++
++ expected =
++ (type == SMSG_JOIN_REP) ? sev->se_node_count : sev->se_memb_count;
++
++ SM_ASSERT(expected * sizeof(uint32_t) <= sev->se_len_ids,
++ printk("type=%d expected=%d len_ids=%d node_count=%d "
++ "memb_count=%d\n", type, expected, sev->se_len_ids,
++ sev->se_node_count, sev->se_memb_count););
++
++ SM_ASSERT(expected * sizeof(char) <= sev->se_len_status,
++ printk("type=%d expected=%d len_status=%d node_count=%d "
++ "memb_count=%d\n", type, expected, sev->se_len_status,
++ sev->se_node_count, sev->se_memb_count););
++
++ for (i = 0; i < expected; i++) {
++ if (sev->se_node_ids[i] == nodeid) {
++ /*
++ * Save the status from the replying node
++ */
++
++ if (!sev->se_node_status[i])
++ sev->se_node_status[i] = smsg->ms_status;
++ else {
++ log_error(sev->se_sg, "process_reply duplicate "
++ "id=%u nodeid=%u %u/%u",
++ sev->se_id, nodeid,
++ sev->se_node_status[i],
++ smsg->ms_status);
++ goto out;
++ }
++
++ if (type == SMSG_JOIN_REP) {
++ save_lastid(smsg);
++
++ if (smsg->ms_status == STATUS_POS)
++ save_global_id(sev, smsg);
++ }
++
++ /*
++ * Signal sm if we have all replies
++ */
++
++ if (++sev->se_reply_count == expected) {
++ clear_allowed_msgtype(sev, type);
++ sev->se_state = next_sev_state(type,
++ sev->se_state);
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ }
++
++ break;
++ }
++ }
++
++ out:
++ return;
++}
++
++/*
++ * A node wants to join an SG and has run send_join_notice. If we know nothing
++ * about the SG, then we're not a member and have no objection - send back
++ * STATUS_NEG. If we're a member of the SG, then send back STATUS_POS (go
++ * ahead and join) if there's
++ * no sevent or uevent of higher priority in progress (only a single join or
++ * leave is permitted for the SG at once). If there happens to be a higher
++ * priority sevent/uevent in progress, send back STATUS_WAIT to defer the
++ * requested join for a bit.
++ */
++
++static void process_join_request(sm_msg_t *smsg, uint32_t nodeid, char *name)
++{
++ sm_group_t *sg = NULL;
++ sm_sevent_t *sev = NULL;
++ sm_node_t *node;
++ int found = FALSE;
++ int level = smsg->ms_sglevel;
++ sm_msg_t reply;
++
++ memset(&reply, 0, sizeof(reply));
++
++ down(&sm_sglock);
++
++ if (nodeid == sm_our_nodeid)
++ goto next;
++
++ /*
++ * search SG list for an SG with given name/len
++ */
++
++ list_for_each_entry(sg, &sm_sg[level], list) {
++ if ((sg->namelen != smsg->ms_length) ||
++ memcmp(sg->name, name, sg->namelen))
++ continue;
++ found = TRUE;
++ break;
++ }
++
++ /*
++ * build reply message
++ */
++
++ next:
++
++ if (!found) {
++ reply.ms_type = SMSG_JOIN_REP;
++ reply.ms_status = STATUS_NEG;
++ reply.ms_global_lastid = global_last_id;
++ reply.ms_sevent_id = smsg->ms_sevent_id;
++ } else {
++ reply.ms_type = SMSG_JOIN_REP;
++ reply.ms_status = STATUS_POS;
++ reply.ms_sevent_id = smsg->ms_sevent_id;
++ reply.ms_global_sgid = sg->global_id;
++ reply.ms_global_lastid = global_last_id;
++
++ /*
++ * The node trying to join should wait and try again until
++ * we're done with recovery.
++ */
++
++ if (sg->state == SGST_RECOVER) {
++ reply.ms_status = STATUS_WAIT;
++ goto send;
++ }
++
++ /*
++ * An sevent node trying to join may have gotten as far as
++ * creating a uevent with us and then backed out. That node
++ * will retry joining from the beginning, so we should not
++ * turn it away. If we're handling a uevent for another node,
++ * tell the joining node to wait.
++ */
++
++ if (test_bit(SGFL_UEVENT, &sg->flags)) {
++ if (sg->uevent.ue_nodeid != nodeid)
++ reply.ms_status = STATUS_WAIT;
++ goto send;
++ }
++
++ /*
++ * We're trying to join or leave the SG at the moment.
++ */
++
++ if (test_bit(SGFL_SEVENT, &sg->flags)) {
++ sev = sg->sevent;
++
++ /*
++ * We're trying to leave. Make the join wait until
++ * we've left if we're beyond LEAVE_ACKWAIT.
++ */
++
++ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
++ if (sev->se_state > SEST_LEAVE_ACKED)
++ reply.ms_status = STATUS_WAIT;
++ else {
++ reply.ms_status = STATUS_POS;
++ clear_bit(SEFL_ALLOW_LEAVE,
++ &sev->se_flags);
++ set_bit(SEFL_CANCEL, &sev->se_flags);
++ }
++ }
++
++ /*
++ * We're trying to join. Make the other join wait
++ * until we're joined if we're beyond JOIN_ACKWAIT or
++ * if we have a lower id. (Send NEG to allow the other
++ * node to go ahead because we're not in the SG.)
++ */
++
++ else {
++ if (sev->se_state > SEST_JOIN_ACKED)
++ reply.ms_status = STATUS_WAIT;
++ else if (sm_our_nodeid < nodeid)
++ reply.ms_status = STATUS_WAIT;
++ else {
++ reply.ms_status = STATUS_NEG;
++ clear_bit(SEFL_ALLOW_JOIN,
++ &sev->se_flags);
++ set_bit(SEFL_CANCEL, &sev->se_flags);
++ }
++ }
++
++ if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ }
++ goto send;
++ }
++
++ /* no recovery, uevent or sevent in progress; stick with STATUS_POS */
++ }
++
++ send:
++
++ if (reply.ms_status == STATUS_POS) {
++ node = sm_find_joiner(sg, nodeid);
++ if (!node) {
++ node = sm_new_node(nodeid);
++ list_add_tail(&node->list, &sg->joining);
++ }
++ }
++
++ up(&sm_sglock);
++ smsg_bswap_out(&reply);
++ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
++}
++
++/*
++ * Another node wants us to stop a service so it can join or leave the SG. We
++ * do this by saving the request info in a uevent and having the sm thread do
++ * the processing and then replying.
++ */
++
++static void process_stop_request(sm_msg_t * smsg, uint32_t nodeid,
++ uint32_t * msgbuf)
++{
++ sm_group_t *sg;
++ sm_uevent_t *uev;
++ sm_msg_t reply;
++ int type = smsg->ms_type;
++
++ if (nodeid == sm_our_nodeid)
++ goto agree;
++
++ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
++ if (!sg) {
++ log_print("process_stop_request: unknown sg id %x",
++ smsg->ms_global_sgid);
++ return;
++ }
++
++ /*
++ * We shouldn't get here with uevent already set.
++ */
++
++ if (test_and_set_bit(SGFL_UEVENT, &sg->flags)) {
++ log_error(sg, "process_stop_request: uevent already set");
++ return;
++ }
++
++ uev = &sg->uevent;
++ uev->ue_nodeid = nodeid;
++ uev->ue_remote_seid = smsg->ms_sevent_id;
++ uev->ue_state = (type == SMSG_JSTOP_REQ) ? UEST_JSTOP : UEST_LSTOP;
++
++ if (type == SMSG_JSTOP_REQ)
++ uev->ue_num_nodes = be32_to_cpu(*msgbuf);
++ else
++ set_bit(UEFL_LEAVE, &uev->ue_flags);
++
++ /*
++ * Do process_join_stop() or process_leave_stop().
++ */
++
++ set_bit(UEFL_CHECK, &uev->ue_flags);
++ wake_serviced(DO_MEMBERSHIP);
++ return;
++
++ agree:
++ memset(&reply, 0, sizeof(reply));
++ reply.ms_status = STATUS_POS;
++ reply.ms_type =
++ (type == SMSG_JSTOP_REQ) ? SMSG_JSTOP_REP : SMSG_LSTOP_REP;
++ reply.ms_sevent_id = smsg->ms_sevent_id;
++ smsg_bswap_out(&reply);
++ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
++}
++
++static void process_start_request(sm_msg_t * smsg, uint32_t nodeid)
++{
++ sm_group_t *sg;
++ sm_uevent_t *uev;
++ int type = smsg->ms_type;
++
++ if (nodeid == sm_our_nodeid)
++ return;
++
++ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
++ if (!sg) {
++ log_print("process_start_request: unknown sg id %x",
++ smsg->ms_global_sgid);
++ return;
++ }
++
++ if (!test_bit(SGFL_UEVENT, &sg->flags)) {
++ log_error(sg, "process_start_request: no uevent");
++ return;
++ }
++
++ uev = &sg->uevent;
++
++ if (type == SMSG_JSTART_CMD)
++ uev->ue_state = UEST_JSTART;
++ else
++ uev->ue_state = UEST_LSTART;
++
++ set_bit(UEFL_CHECK, &uev->ue_flags);
++ wake_serviced(DO_MEMBERSHIP);
++}
++
++static void process_leave_request(sm_msg_t * smsg, uint32_t nodeid)
++{
++ sm_group_t *sg;
++ sm_node_t *node;
++ sm_msg_t reply;
++ sm_sevent_t *sev;
++ int found = FALSE;
++
++ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
++ if (sg) {
++ if (nodeid == sm_our_nodeid)
++ found = TRUE;
++ else {
++ list_for_each_entry(node, &sg->memb, list) {
++ if (node->id != nodeid)
++ continue;
++ set_bit(SNFL_LEAVING, &node->flags);
++ found = TRUE;
++ break;
++ }
++ }
++ }
++
++ memset(&reply, 0, sizeof(reply));
++
++ if (!found) {
++ reply.ms_type = SMSG_LEAVE_REP;
++ reply.ms_status = STATUS_NEG;
++ reply.ms_sevent_id = smsg->ms_sevent_id;
++ } else {
++ reply.ms_type = SMSG_LEAVE_REP;
++ reply.ms_status = STATUS_POS;
++ reply.ms_sevent_id = smsg->ms_sevent_id;
++
++ if (sg->state == SGST_RECOVER)
++ reply.ms_status = STATUS_WAIT;
++
++ else if (test_bit(SGFL_SEVENT, &sg->flags) &&
++ nodeid != sm_our_nodeid) {
++ sev = sg->sevent;
++
++ /*
++ * We're trying to join or leave at the moment. If
++ * we're past JOIN/LEAVE_ACKWAIT, we make the requestor
++ * wait. Otherwise, if joining we'll cancel to let the
++ * leave happen first, or if we're leaving allow the
++ * lower nodeid to leave first.
++ */
++
++ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
++ if (sev->se_state > SEST_LEAVE_ACKWAIT)
++ reply.ms_status = STATUS_WAIT;
++ else if (sm_our_nodeid < nodeid)
++ reply.ms_status = STATUS_WAIT;
++ else {
++ reply.ms_status = STATUS_POS;
++ clear_bit(SEFL_ALLOW_LEAVE,
++ &sev->se_flags);
++ set_bit(SEFL_CANCEL, &sev->se_flags);
++ }
++ } else {
++ if (sev->se_state > SEST_JOIN_ACKWAIT)
++ reply.ms_status = STATUS_WAIT;
++ else {
++ reply.ms_status = STATUS_NEG;
++ clear_bit(SEFL_ALLOW_JOIN,
++ &sev->se_flags);
++ set_bit(SEFL_CANCEL, &sev->se_flags);
++ }
++ }
++
++ if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ }
++ }
++
++ else if (test_bit(SGFL_UEVENT, &sg->flags)) {
++ if (sg->uevent.ue_nodeid != nodeid)
++ reply.ms_status = STATUS_WAIT;
++ }
++
++ }
++
++ smsg_bswap_out(&reply);
++ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
++}
++
++/*
++ * Each remaining node will send us a done message. We quit when we get the
++ * first. The subsequent done messages for the finished sevent get here and
++ * are ignored.
++ */
++
++static void process_lstart_done(sm_msg_t *smsg, uint32_t nodeid)
++{
++ sm_sevent_t *sev;
++
++ sev = find_sevent(smsg->ms_sevent_id);
++ if (!sev)
++ return;
++
++ if (sev->se_state != SEST_LSTART_WAITREMOTE)
++ return;
++
++ sev->se_state = SEST_LSTART_REMOTEDONE;
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++}
++
++/*
++ * This function and everything it calls always runs in sm context.
++ */
++
++static void process_message(char *msg, uint32_t nodeid)
++{
++ sm_msg_t smsg;
++
++ smsg_copy_in(msg, &smsg);
++
++ switch (smsg.ms_type) {
++ case SMSG_JOIN_REQ:
++ process_join_request(&smsg, nodeid, msg + sizeof(sm_msg_t));
++ break;
++
++ case SMSG_JSTOP_REQ:
++ process_stop_request(&smsg, nodeid,
++ (uint32_t *) (msg + sizeof(sm_msg_t)));
++ break;
++
++ case SMSG_LEAVE_REQ:
++ process_leave_request(&smsg, nodeid);
++ break;
++
++ case SMSG_LSTOP_REQ:
++ process_stop_request(&smsg, nodeid, NULL);
++ break;
++
++ case SMSG_JSTART_CMD:
++ case SMSG_LSTART_CMD:
++ process_start_request(&smsg, nodeid);
++ break;
++
++ case SMSG_LSTART_DONE:
++ process_lstart_done(&smsg, nodeid);
++ break;
++
++ case SMSG_JOIN_REP:
++ case SMSG_JSTOP_REP:
++ case SMSG_LEAVE_REP:
++ case SMSG_LSTOP_REP:
++ process_reply(&smsg, nodeid);
++ break;
++
++ case SMSG_RECOVER:
++ process_recover_msg(&smsg, nodeid);
++ break;
++
++ default:
++ log_print("process_message: unknown type %u nodeid %u",
++ smsg.ms_type, nodeid);
++ }
++}
++
++/*
++ * Always called from sm context.
++ */
++
++void process_messages(void)
++{
++ rq_entry_t *re;
++
++ while (1) {
++ re = NULL;
++
++ spin_lock(&message_lock);
++ if (!list_empty(&messages)) {
++ re = list_entry(messages.next, rq_entry_t, list);
++ list_del(&re->list);
++ }
++ spin_unlock(&message_lock);
++
++ if (!re)
++ break;
++ process_message(re->msg, re->nodeid);
++ kfree(re->msg);
++ kfree(re);
++ schedule();
++ }
++}
++
++/*
++ * Context: cnxman and sm
++ */
++
++static int add_to_recvqueue(char *msg, int len, uint32_t nodeid)
++{
++ rq_entry_t *re;
++
++ SM_RETRY(re = (rq_entry_t *) kmalloc(sizeof(rq_entry_t), GFP_KERNEL),
++ re);
++ SM_RETRY(re->msg = (char *) kmalloc(len, GFP_KERNEL), re->msg);
++
++ memcpy(re->msg, msg, len);
++ re->len = len;
++ re->nodeid = nodeid;
++
++ spin_lock(&message_lock);
++ list_add_tail(&re->list, &messages);
++ spin_unlock(&message_lock);
++
++ wake_serviced(DO_MESSAGES);
++ return 0;
++}
++
++/*
++ * Context: cnxman
++ * Called by cnxman when a service manager message arrives.
++ */
++
++int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
++ unsigned int node_id)
++{
++ struct kcl_cluster_node kclnode;
++ uint32_t nodeid = 0;
++ int error = 0;
++
++ if (!node_id) {
++ error = kcl_get_node_by_addr(addr, addr_len, &kclnode);
++ if (error)
++ return error;
++ nodeid = kclnode.node_id;
++ } else
++ nodeid = node_id;
++
++ return add_to_recvqueue(msg, len, nodeid);
++}
++
++/*
++ * These send routines are used by sm and are always called from sm context.
++ */
++
++int send_nodeid_message(char *msg, int len, uint32_t nodeid)
++{
++ int error = 0;
++ struct sockaddr_cl saddr;
++
++ if (nodeid == sm_our_nodeid) {
++ add_to_recvqueue(msg, len, nodeid);
++ goto out;
++ }
++
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_SERVICES;
++ saddr.scl_nodeid = nodeid;
++ error = kcl_sendmsg(sm_socket, msg, len, &saddr,
++ sizeof(saddr), 0);
++ if (error > 0)
++ error = 0;
++
++ if (error)
++ log_print("send_nodeid_message error %d to %u", error, nodeid);
++ out:
++ return error;
++}
++
++int send_broadcast_message(char *msg, int len)
++{
++ int error;
++
++ error = kcl_sendmsg(sm_socket, msg, len, NULL, 0, 0);
++ if (error > 0)
++ error = 0;
++
++ add_to_recvqueue(msg, len, sm_our_nodeid);
++
++ if (error)
++ log_print("send_broadcast_message error %d", error);
++
++ return error;
++}
++
++int send_members_message(sm_group_t *sg, char *msg, int len)
++{
++ sm_node_t *node;
++ int error = 0;
++
++ list_for_each_entry(node, &sg->memb, list) {
++ error = send_nodeid_message(msg, len, node->id);
++ if (error < 0)
++ break;
++ }
++ return error;
++}
++
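++/*
++ * The _sev variants below arm the sevent to accept replies of the matching
++ * message type (checked by process_reply() via test_allowed_msgtype) and
++ * zero the reply count before sending, so a reply arriving immediately
++ * after the send is not dropped.
++ */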
++int send_members_message_sev(sm_group_t *sg, char *msg, int len,
++ sm_sevent_t * sev)
++{
++ int error;
++ sm_msg_t *smsg = (sm_msg_t *) msg;
++
++ set_allowed_msgtype(sev, smsg->ms_type);
++ sev->se_reply_count = 0;
++
++ error = send_members_message(sg, msg, len);
++ if (error < 0)
++ clear_allowed_msgtype(sev, smsg->ms_type);
++
++ return error;
++}
++
++int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev)
++{
++ int error;
++ sm_msg_t *smsg = (sm_msg_t *) msg;
++
++ set_allowed_msgtype(sev, smsg->ms_type);
++ sev->se_reply_count = 0;
++
++ error = send_broadcast_message(msg, len);
++ if (error < 0)
++ clear_allowed_msgtype(sev, smsg->ms_type);
++
++ return error;
++}
+diff -urN linux-orig/cluster/cman/sm_message.h linux-patched/cluster/cman/sm_message.h
+--- linux-orig/cluster/cman/sm_message.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_message.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,34 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_MESSAGE_DOT_H__
++#define __SM_MESSAGE_DOT_H__
++
++void init_messages(void);
++uint32_t sm_new_global_id(int level);
++void smsg_bswap_out(sm_msg_t * smsg);
++char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
++ sm_sevent_t *sev);
++void process_messages(void);
++int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
++ unsigned int node_id);
++int send_nodeid_message(char *msg, int len, uint32_t nodeid);
++int send_broadcast_message(char *msg, int len);
++int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev);
++int send_members_message(sm_group_t *sg, char *msg, int len);
++int send_members_message_sev(sm_group_t *sg, char *msg, int len,
++ sm_sevent_t * sev);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_misc.c linux-patched/cluster/cman/sm_misc.c
+--- linux-orig/cluster/cman/sm_misc.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_misc.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,369 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++#include "config.h"
++
++#define MAX_DEBUG_MSG_LEN (40)
++
++extern struct list_head sm_members;
++static uint32_t local_ids;
++static uint32_t event_id;
++static spinlock_t event_id_lock;
++static char * debug_buf;
++static unsigned int debug_size;
++static unsigned int debug_point;
++static int debug_wrap;
++static spinlock_t debug_lock;
++
++
++void init_sm_misc(void)
++{
++ local_ids = 1;
++ event_id = 1;
++ spin_lock_init(&event_id_lock);
++ debug_buf = NULL;
++ debug_size = 0;
++ debug_point = 0;
++ debug_wrap = 0;
++ spin_lock_init(&debug_lock);
++
++ sm_debug_setup(cman_config.sm_debug_size);
++}
++
++sm_node_t *sm_new_node(uint32_t nodeid)
++{
++ struct kcl_cluster_node kclnode;
++ sm_node_t *node;
++ int error;
++
++ error = kcl_get_node_by_nodeid(nodeid, &kclnode);
++ SM_ASSERT(!error,);
++
++ SM_RETRY(node = (sm_node_t *) kmalloc(sizeof(sm_node_t), GFP_KERNEL),
++ node);
++
++ memset(node, 0, sizeof(sm_node_t));
++ node->id = nodeid;
++ node->incarnation = kclnode.incarnation;
++ return node;
++}
++
++sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid)
++{
++ sm_node_t *node;
++
++ list_for_each_entry(node, &sg->joining, list) {
++ if (node->id == nodeid)
++ return node;
++ }
++ return NULL;
++}
++
++sm_node_t *sm_find_member(uint32_t nodeid)
++{
++ sm_node_t *node;
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (node->id == nodeid)
++ return node;
++ }
++ return NULL;
++}
++
++uint32_t sm_new_local_id(int level)
++{
++ uint32_t id = local_ids++;
++ uint8_t l = (uint8_t) level;
++
++ if (level > 0xFF)
++ return 0;
++
++ if (id > 0x00FFFFFF)
++ return 0;
++
++ id |= (l << 24);
++ return id;
++}
++
++int sm_id_to_level(uint32_t id)
++{
++ uint8_t l = (id & 0xFF000000) >> 24;
++
++ return (int) l;
++}
++
++void sm_set_event_id(int *id)
++{
++ spin_lock(&event_id_lock);
++ *id = event_id++;
++ spin_unlock(&event_id_lock);
++}
++
++sm_group_t *sm_local_id_to_sg(int id)
++{
++ sm_group_t *sg;
++ int level = sm_id_to_level(id);
++ int found = FALSE;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sm_sg[level], list) {
++ if (sg->local_id == id) {
++ found = TRUE;
++ break;
++ }
++ }
++ up(&sm_sglock);
++ if (!found)
++ sg = NULL;
++ return sg;
++}
++
++sm_group_t *sm_global_id_to_sg(int id)
++{
++ sm_group_t *sg;
++ int level = sm_id_to_level(id);
++ int found = FALSE;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sm_sg[level], list) {
++ if (sg->global_id == id) {
++ found = TRUE;
++ break;
++ }
++ }
++ up(&sm_sglock);
++ if (!found)
++ sg = NULL;
++ return sg;
++}
++
++void sm_debug_log(sm_group_t *sg, const char *fmt, ...)
++{
++ va_list va;
++ int i, n, size, len;
++ char buf[MAX_DEBUG_MSG_LEN+1];
++
++ spin_lock(&debug_lock);
++
++ if (!debug_buf)
++ goto out;
++
++ size = MAX_DEBUG_MSG_LEN;
++ memset(buf, 0, size+1);
++
++ n = snprintf(buf, size, "%08x ", sg->global_id);
++ size -= n;
++
++ va_start(va, fmt);
++ vsnprintf(buf+n, size, fmt, va);
++ va_end(va);
++
++ len = strlen(buf);
++ if (len > MAX_DEBUG_MSG_LEN-1)
++ len = MAX_DEBUG_MSG_LEN-1;
++ buf[len] = '\n';
++ buf[len+1] = '\0';
++
++ for (i = 0; i < strlen(buf); i++) {
++ debug_buf[debug_point++] = buf[i];
++
++ if (debug_point == debug_size) {
++ debug_point = 0;
++ debug_wrap = 1;
++ }
++ }
++ out:
++ spin_unlock(&debug_lock);
++}
++
++void sm_debug_setup(int size)
++{
++ char *b;
++
++ /* clamp before allocating so debug_size matches the allocation */
++ if (size > PAGE_SIZE)
++ size = PAGE_SIZE;
++
++ b = kmalloc(size, GFP_KERNEL);
++ if (!b)
++ return;
++
++ spin_lock(&debug_lock);
++ if (debug_buf)
++ kfree(debug_buf);
++
++ debug_size = size;
++ debug_point = 0;
++ debug_wrap = 0;
++ debug_buf = b;
++ memset(debug_buf, 0, debug_size);
++ spin_unlock(&debug_lock);
++}
++
++#ifdef CONFIG_PROC_FS
++
++int sm_debug_info(char *b, char **start, off_t offset, int length)
++{
++ int i, n = 0;
++
++ spin_lock(&debug_lock);
++
++ if (debug_wrap) {
++ for (i = debug_point; i < debug_size; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++ }
++ for (i = 0; i < debug_point; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++
++ spin_unlock(&debug_lock);
++
++ return n;
++}
++
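++/*
++ * Illustrative output (values made up):
++ *
++ * Service          Name             GID LID State     Code
++ * Fence Domain:    "default"          1   1 run       -
++ * [1 2 3]
++ */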
++int sm_procdata(char *b, char **start, off_t offset, int length)
++{
++ sm_group_t *sg;
++ sm_node_t *node;
++ int n = 0, level, i;
++
++ n += sprintf(b + n, "\n");
++
++ /*
++ * Header
++ */
++
++ n += sprintf(b + n,
++ "Service Name GID LID State Code\n");
++
++ down(&sm_sglock);
++
++ for (level = 0; level < SG_LEVELS; level++) {
++ list_for_each_entry(sg, &sm_sg[level], list) {
++
++ /*
++ * Cluster Service
++ */
++
++ switch (level) {
++ case SERVICE_LEVEL_FENCE:
++ n += sprintf(b + n, "Fence Domain: ");
++ break;
++ case SERVICE_LEVEL_GDLM:
++ n += sprintf(b + n, "DLM Lock Space: ");
++ break;
++ case SERVICE_LEVEL_GFS:
++ n += sprintf(b + n, "GFS Mount Group: ");
++ break;
++ case SERVICE_LEVEL_USER:
++ n += sprintf(b + n, "User: ");
++ break;
++ }
++
++ /*
++ * Name
++ */
++
++ n += sprintf(b + n, "\"");
++ for (i = 0; i < sg->namelen; i++)
++ n += sprintf(b + n, "%c", sg->name[i]);
++ n += sprintf(b + n, "\"");
++
++ for (; i < MAX_SERVICE_NAME_LEN-1; i++)
++ n += sprintf(b + n, " ");
++
++ /*
++ * GID LID (sans level from top byte)
++ */
++
++ n += sprintf(b + n, "%3u %3u ",
++ (sg->global_id & 0x00FFFFFF),
++ (sg->local_id & 0x00FFFFFF));
++
++ /*
++ * State
++ */
++
++ switch (sg->state) {
++ case SGST_NONE:
++ n += sprintf(b + n, "none ");
++ break;
++ case SGST_JOIN:
++ n += sprintf(b + n, "join ");
++ break;
++ case SGST_RUN:
++ n += sprintf(b + n, "run ");
++ break;
++ case SGST_RECOVER:
++ n += sprintf(b + n, "recover %u ",
++ sg->recover_state);
++ break;
++ case SGST_UEVENT:
++ n += sprintf(b + n, "update ");
++ break;
++ }
++
++ /*
++ * Code
++ */
++
++ if (test_bit(SGFL_SEVENT, &sg->flags))
++ n += sprintf(b + n, "S");
++ if (test_bit(SGFL_UEVENT, &sg->flags))
++ n += sprintf(b + n, "U");
++ if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ n += sprintf(b + n, "N");
++
++ n += sprintf(b + n, "-");
++
++ if (test_bit(SGFL_SEVENT, &sg->flags)
++ && sg->sevent) {
++ n += sprintf(b + n, "%u,%lx,%u",
++ sg->sevent->se_state,
++ sg->sevent->se_flags,
++ sg->sevent->se_reply_count);
++ }
++
++ if (test_bit(SGFL_UEVENT, &sg->flags)) {
++ n += sprintf(b + n, "%u,%lx,%u",
++ sg->uevent.ue_state,
++ sg->uevent.ue_flags,
++ sg->uevent.ue_nodeid);
++ }
++
++ n += sprintf(b + n, "\n");
++
++ /*
++ * node list
++ */
++
++ i = 0;
++
++ n += sprintf(b + n, "[");
++
++ list_for_each_entry(node, &sg->memb, list) {
++ if (i && !(i % 24))
++ n += sprintf(b + n, "\n");
++
++ if (i)
++ n += sprintf(b + n, " ");
++
++ n += sprintf(b + n, "%u", node->id);
++ i++;
++ }
++
++ n += sprintf(b + n, "]\n\n");
++ }
++ }
++
++ up(&sm_sglock);
++
++ return n;
++}
++#endif
+diff -urN linux-orig/cluster/cman/sm_misc.h linux-patched/cluster/cman/sm_misc.h
+--- linux-orig/cluster/cman/sm_misc.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_misc.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_MISC_DOT_H__
++#define __SM_MISC_DOT_H__
++
++void init_sm_misc(void);
++sm_node_t *sm_new_node(uint32_t nodeid);
++sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid);
++sm_node_t *sm_find_member(uint32_t nodeid);
++uint32_t sm_new_local_id(int level);
++int sm_id_to_level(uint32_t id);
++void sm_set_event_id(int *id);
++sm_group_t *sm_local_id_to_sg(int id);
++sm_group_t *sm_global_id_to_sg(int id);
++void sm_debug_log(sm_group_t *sg, const char *fmt, ...);
++void sm_debug_setup(int size);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_recover.c linux-patched/cluster/cman/sm_recover.c
+--- linux-orig/cluster/cman/sm_recover.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_recover.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,522 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++#include "config.h"
++
++/*
++ * A collection of sg's which need to be recovered due to a failed member.
++ * These sg's are recovered in order of level. An sg subject to cascading
++ * failures is moved from one of these structs to a newer one.
++ */
++
++struct recover {
++ struct list_head list; /* list of current re's */
++ struct list_head sgs[SG_LEVELS]; /* lists of sg's by level */
++ int event_id; /* event id */
++ int cur_level;
++};
++typedef struct recover recover_t;
++
++
++extern uint32_t * sm_new_nodeids;
++extern int sm_quorum, sm_quorum_next;
++extern uint32_t sm_our_nodeid;
++extern struct list_head sm_members;
++extern int sm_member_count;
++static struct list_head recoveries;
++
++
++void init_recovery(void)
++{
++ INIT_LIST_HEAD(&recoveries);
++}
++
++/*
++ * This is the first thing called when a change is announced in cluster
++ * membership. Nodes are marked as being a CLUSTER_MEMBER or not. SM adds
++ * nodes that it has not seen before to its sm_members list. Nodes which were
++ * alive but are now gone are marked as "need recovery".
++ *
++ * The "need recovery" status of nodes is propagated to the node's SG's in
++ * mark_effected_sgs. The effected SG's are themselves marked as needing
++ * recovery and in new_recovery the dead nodes are removed from the SG's
++ * individual member lists. The "need recovery" status of nodes is cleared in
++ * adjust_members_done().
++ */
++
++static int adjust_members(void)
++{
++ sm_node_t *node;
++ struct kcl_cluster_node knode;
++ int i, error, num_nodes, sub = 0, add = 0, found;
++
++ /*
++ * Get list of current members from cnxman
++ */
++
++ memset(sm_new_nodeids, 0, cman_config.max_nodes * sizeof(uint32_t));
++ num_nodes = kcl_get_member_ids(sm_new_nodeids, cman_config.max_nodes);
++
++ /*
++ * Determine who's gone
++ */
++
++ list_for_each_entry(node, &sm_members, list) {
++ found = FALSE;
++ for (i = 0; i < num_nodes; i++) {
++ if (node->id == sm_new_nodeids[i]) {
++ found = TRUE;
++ sm_new_nodeids[i] = 0;
++ break;
++ }
++ }
++
++ if (found) {
++ error = kcl_get_node_by_nodeid(node->id, &knode);
++ SM_ASSERT(!error, printk("error=%d\n", error););
++
++ if (!test_bit(SNFL_CLUSTER_MEMBER, &node->flags)) {
++ /* former member is back */
++ set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
++ node->incarnation = knode.incarnation;
++ add++;
++ } else {
++ /* current member is still alive - if the
++ * incarnation number is different it died and
++ * returned between checks */
++ if (node->incarnation != knode.incarnation) {
++ set_bit(SNFL_NEED_RECOVERY,
++ &node->flags);
++ node->incarnation = knode.incarnation;
++ sub++;
++ }
++ }
++ } else {
++ /* current member has died */
++ if (test_and_clear_bit(SNFL_CLUSTER_MEMBER,
++ &node->flags)) {
++ set_bit(SNFL_NEED_RECOVERY, &node->flags);
++ sub++;
++ }
++ }
++ }
++
++ /*
++ * Look for new nodes
++ */
++
++ for (i = 0; i < num_nodes; i++) {
++ if (sm_new_nodeids[i]) {
++ node = sm_new_node(sm_new_nodeids[i]);
++ set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
++ add++;
++ list_add_tail(&node->list, &sm_members);
++ sm_member_count++;
++ }
++ }
++
++ /*
++ * Get our own nodeid
++ */
++
++ if (!sm_our_nodeid) {
++ list_for_each_entry(node, &sm_members, list) {
++ error = kcl_get_node_by_nodeid(node->id, &knode);
++ SM_ASSERT(!error, printk("error=%d\n", error););
++
++ if (knode.us) {
++ sm_our_nodeid = knode.node_id;
++ break;
++ }
++ }
++ }
++
++ return sub;
++}
++
++/*
++ * Given some number of dead nodes, flag the SG's the dead nodes were part of.
++ * This requires nested loops because each node structure does not keep a
++ * list of the SG's it belongs to.
++ */
++
++static int mark_effected_sgs(void)
++{
++ sm_group_t *sg;
++ sm_node_t *node, *sgnode;
++ uint32_t dead_id;
++ int i, effected = 0;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ continue;
++
++ dead_id = node->id;
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &sm_sg[i], list) {
++ /* check if dead node is among sg's members */
++ list_for_each_entry(sgnode, &sg->memb, list) {
++ if (sgnode->id == dead_id) {
++ set_bit(SGFL_NEED_RECOVERY,
++ &sg->flags);
++ effected++;
++ break;
++ }
++ }
++ }
++ }
++ }
++ up(&sm_sglock);
++
++ return effected;
++}
++
++static recover_t *alloc_recover(void)
++{
++ recover_t *rev;
++ int i;
++
++ SM_RETRY(rev = kmalloc(sizeof(recover_t), GFP_KERNEL), rev);
++
++ memset(rev, 0, sizeof(recover_t));
++
++ sm_set_event_id(&rev->event_id);
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ INIT_LIST_HEAD(&rev->sgs[i]);
++ }
++
++ return rev;
++}
++
++/*
++ * An in-progress revent restart of an SG has been interrupted by another
++ * node failure in the SG. Cancel any outstanding barrier; the SG will be
++ * moved to the new revent and restarted as part of that.
++ */
++
++static void cancel_prev_recovery(sm_group_t *sg)
++{
++ int error;
++
++ if (sg->recover_state == RECOVER_BARRIERWAIT) {
++ error = kcl_barrier_cancel(sg->recover_barrier);
++ if (error)
++ log_error(sg, "cancel_prev_recovery: error %d", error);
++ }
++}
++
++static void pre_recover_sg(sm_group_t *sg, recover_t *rev)
++{
++ if (sg->state == SGST_RECOVER) {
++ cancel_prev_recovery(sg);
++ list_del(&sg->recover_list);
++ }
++
++ sg->ops->stop(sg->service_data);
++ sg->state = SGST_RECOVER;
++ sg->recover_state = RECOVER_NONE;
++ sg->recover_data = rev;
++ list_add(&sg->recover_list, &rev->sgs[sg->level]);
++}
++
++/*
++ * When adjust_members finds that some nodes are dead and mark_effected_sgs
++ * finds that some SG's are affected by departed nodes, this is called to
++ * collect together the SG's which need to be recovered. A revent (recovery
++ * event) is the group of affected SG's.
++ */
++
++static int new_recovery(void)
++{
++ sm_group_t *sg;
++ recover_t *rev;
++ sm_node_t *node, *sgnode, *safe;
++ int i;
++
++ rev = alloc_recover();
++ list_add_tail(&rev->list, &recoveries);
++
++ down(&sm_sglock);
++
++ /*
++ * Stop effected SG's and add them to the rev
++ */
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &sm_sg[i], list) {
++ if (test_and_clear_bit(SGFL_NEED_RECOVERY, &sg->flags)){
++ if (sg->state == SGST_JOIN)
++ continue;
++ pre_recover_sg(sg, rev);
++ }
++ }
++ }
++
++ /*
++ * For an SG needing recovery, remove dead nodes from sg->memb list
++ */
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &rev->sgs[i], recover_list) {
++
++ /* Remove dead members from SG's member list */
++ list_for_each_entry_safe(sgnode, safe, &sg->memb, list){
++
++ node = sm_find_member(sgnode->id);
++ SM_ASSERT(node, printk("id %u\n", sgnode->id););
++
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags)){
++ list_del(&sgnode->list);
++ kfree(sgnode);
++ sg->memb_count--;
++ log_debug(sg, "remove node %u count %d",
++ sgnode->id, sg->memb_count);
++ }
++ }
++ }
++ }
++
++ up(&sm_sglock);
++ rev->cur_level = 0;
++ return 0;
++}
++
++/*
++ * The NEED_RECOVERY bit on MML nodes is set in adjust_members() and is used in
++ * mark_effected_sgs() and new_recovery(). After that, we're done using the bit
++ * and we clear it here.
++ */
++
++static void adjust_members_done(void)
++{
++ sm_node_t *node;
++
++ list_for_each_entry(node, &sm_members, list)
++ clear_bit(SNFL_NEED_RECOVERY, &node->flags);
++}
++
++/*
++ * Start the service of the given SG. The service must be given an array of
++ * nodeids specifying the new sg membership. The service is responsible for
++ * freeing this memory when done with it.
++ */
++
++static void start_sg(sm_group_t *sg, uint32_t event_id)
++{
++ sm_node_t *node;
++ uint32_t *memb;
++ int count = 0;
++
++ SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
++ memb);
++
++ list_for_each_entry(node, &sg->memb, list)
++ memb[count++] = node->id;
++
++ sg->ops->start(sg->service_data, memb, count, event_id,
++ SERVICE_NODE_FAILED);
++}
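++
++/*
++ * For illustration only: a service's start callback might consume the
++ * memb array built above along these lines. This is a sketch; "struct
++ * example" and its fields are invented, and only the ownership rule
++ * stated above is assumed.
++ *
++ *	static int example_start(void *servicedata, uint32_t *nodeids,
++ *				 int count, int event_id, int type)
++ *	{
++ *		struct example *ex = servicedata;
++ *
++ *		kfree(ex->nodeids);
++ *		ex->nodeids = nodeids;
++ *		ex->node_count = count;
++ *		ex->start_event = event_id;
++ *		return 0;
++ *	}
++ *
++ * The callback saves the array (freeing any previous one) and returns
++ * quickly; kcl_start_done() is called later from the service's own
++ * context once the restart work is complete, as sm_user.c does.
++ */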
++
++static void recovery_barrier(sm_group_t *sg)
++{
++ char bname[MAX_BARRIER_NAME_LEN];
++ int error, len;
++
++ memset(bname, 0, MAX_BARRIER_NAME_LEN);
++
++ /* bypass the barrier if we're the only member */
++ if (sg->memb_count == 1) {
++ process_recovery_barrier(sg, 0);
++ return;
++ }
++
++ len = snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.RECOV.%u",
++ sg->global_id, sg->recover_stop, sg->memb_count);
++
++ /* We save this barrier name so we can cancel it if needed. */
++ memset(sg->recover_barrier, 0, MAX_BARRIER_NAME_LEN);
++ memcpy(sg->recover_barrier, bname, len);
++
++ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_RECOVERY);
++ if (error)
++ log_error(sg, "recovery_barrier error %d: %s", error, bname);
++}
++
++static void recover_sg(sm_group_t *sg, int event_id)
++{
++ log_debug(sg, "recover state %d", sg->recover_state);
++
++ switch (sg->recover_state) {
++
++ case RECOVER_NONE:
++ /* must wait for recovery to stop sg on all nodes */
++ sg->recover_state = RECOVER_BARRIERWAIT;
++ sg->recover_stop = 0;
++ recovery_barrier(sg);
++ break;
++
++ case RECOVER_BARRIERWAIT:
++ break;
++
++ case RECOVER_STOP:
++ /* barrier callback sets state STOP */
++ sg->recover_stop = 1;
++ sg->recover_state = RECOVER_START;
++ start_sg(sg, event_id);
++ break;
++
++ case RECOVER_START:
++ break;
++
++ case RECOVER_STARTDONE:
++ /* service callback sets state STARTDONE */
++ sg->recover_state = RECOVER_BARRIERWAIT;
++ recovery_barrier(sg);
++ break;
++
++ case RECOVER_BARRIERDONE:
++ /* barrier callback sets state BARRIERDONE */
++ sg->ops->finish(sg->service_data, event_id);
++ list_del(&sg->recover_list);
++ sg->recover_state = RECOVER_NONE;
++ sg->state = SGST_RUN;
++
++ /* Continue a previous, interrupted attempt to leave the sg */
++ if (sg->sevent) {
++ clear_bit(SEFL_DELAY, &sg->sevent->se_flags);
++ set_bit(SEFL_CHECK, &sg->sevent->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ }
++ break;
++
++ default:
++ log_error(sg, "invalid recover_state %u", sg->recover_state);
++ }
++}
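++
++/*
++ * Taken together, the cases above (with the barrier and service callbacks
++ * that set STOP, STARTDONE and BARRIERDONE) drive a recovering sg through:
++ *
++ *	NONE -> BARRIERWAIT -> STOP -> START -> STARTDONE
++ *	     -> BARRIERWAIT -> BARRIERDONE -> (back to SGST_RUN)
++ *
++ * The first barrier ensures every node has stopped the sg before any node
++ * restarts it; the second ensures every node has finished restarting
++ * before finish() is called.
++ */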
++
++static void recover_level(recover_t *rev, int level)
++{
++ sm_group_t *sg, *safe;
++
++ list_for_each_entry_safe(sg, safe, &rev->sgs[level], recover_list)
++ recover_sg(sg, rev->event_id);
++}
++
++static void recover_levels(recover_t *rev)
++{
++ for (;;) {
++ recover_level(rev, rev->cur_level);
++
++ if (list_empty(&rev->sgs[rev->cur_level])) {
++ if (rev->cur_level == SG_LEVELS - 1) {
++ list_del(&rev->list);
++ kfree(rev);
++ return;
++ }
++ rev->cur_level++;
++ continue;
++ }
++ break;
++ }
++}
++
++/*
++ * Called by SM thread when the cluster is quorate. It restarts
++ * SG's that were stopped in new_recovery() due to a member death.
++ * It waits for all SG's at level N to complete restart before
++ * restarting SG's at level N+1.
++ */
++
++void process_recoveries(void)
++{
++ recover_t *rev, *safe;
++
++ down(&sm_sglock);
++ list_for_each_entry_safe(rev, safe, &recoveries, list)
++ recover_levels(rev);
++ up(&sm_sglock);
++}
++
++/*
++ * The cnxman membership has changed. Check if there's still quorum and
++ * whether any nodes have died. If nodes have died, initiate recovery on any
++ * SG's they were in. This begins immediately if the cluster remains quorate;
++ * if not, recovery waits until the cluster regains quorum.
++ */
++
++void process_nodechange(void)
++{
++ int gone, effected;
++
++ if ((sm_quorum = sm_quorum_next))
++ wake_serviced(DO_RUN);
++
++ gone = adjust_members();
++ if (gone > 0) {
++ effected = mark_effected_sgs();
++
++ backout_sevents();
++ cancel_uevents(&effected);
++
++ if (effected > 0) {
++ new_recovery();
++ wake_serviced(DO_RECOVERIES);
++ }
++ }
++ adjust_members_done();
++}
++
++int check_recovery(sm_group_t *sg, int event_id)
++{
++ if (sg->state == SGST_RECOVER) {
++ recover_t *rev = (recover_t *) sg->recover_data;
++ if (rev && rev->event_id == event_id)
++ return 1;
++ }
++ return 0;
++}
++
++void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid)
++{
++ sm_group_t *sg;
++ recover_t *rev;
++
++ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
++ if (!sg) {
++ log_print("process_recover_msg: unknown sg id %x",
++ smsg->ms_global_sgid);
++ return;
++ }
++
++ /* we already know about the recovery and can ignore the msg */
++ if (sg->state == SGST_RECOVER)
++ return;
++
++ if (test_bit(SGFL_UEVENT, &sg->flags)) {
++ /* we know about the uevent and will initiate recovery on our
++ own, so we can ignore this msg */
++ log_debug(sg, "process_recover_msg: ignore from %u", nodeid);
++ return;
++ }
++
++ log_debug(sg, "recovery initiated by msg from %u", nodeid);
++ rev = alloc_recover();
++ list_add_tail(&rev->list, &recoveries);
++ pre_recover_sg(sg, rev);
++ wake_serviced(DO_RECOVERIES);
++}
+diff -urN linux-orig/cluster/cman/sm_recover.h linux-patched/cluster/cman/sm_recover.h
+--- linux-orig/cluster/cman/sm_recover.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_recover.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,23 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_RECOVER_DOT_H__
++#define __SM_RECOVER_DOT_H__
++
++void init_recovery(void);
++void process_recoveries(void);
++void process_nodechange(void);
++int check_recovery(sm_group_t *sg, int event_id);
++void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_services.c linux-patched/cluster/cman/sm_services.c
+--- linux-orig/cluster/cman/sm_services.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_services.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,418 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++static struct list_head callbacks;
++static spinlock_t callback_lock;
++static struct list_head sg_registered[SG_LEVELS];
++
++/*
++ * These are the functions services use to register with, join, leave and
++ * unregister from the SM, and to receive callbacks from it.
++ */
++
++struct sc_entry {
++ struct list_head list;
++ uint32_t local_id;
++ int event_id;
++};
++typedef struct sc_entry sc_entry_t;
++
++void init_services(void)
++{
++ int i;
++
++ INIT_LIST_HEAD(&callbacks);
++ spin_lock_init(&callback_lock);
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ INIT_LIST_HEAD(&sm_sg[i]);
++ INIT_LIST_HEAD(&sg_registered[i]);
++ }
++ init_MUTEX(&sm_sglock);
++}
++
++/* Context: service */
++
++int kcl_register_service(char *name, int namelen, int level,
++ struct kcl_service_ops *ops, int unique,
++ void *servicedata, uint32_t *service_id)
++{
++ sm_group_t *sg;
++ int found = FALSE;
++ int error = -EINVAL;
++
++ if (level > SG_LEVELS - 1)
++ goto fail;
++
++ if (namelen > MAX_SERVICE_NAME_LEN)
++ goto fail;
++
++ error = kcl_addref_cluster();
++ if (error)
++ goto fail;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sm_sg[level], list) {
++ if ((sg->namelen == namelen) &&
++ (!strncmp(sg->name, name, namelen))) {
++ found = TRUE;
++ goto next;
++ }
++ }
++
++ list_for_each_entry(sg, &sg_registered[level], list) {
++ if ((sg->namelen == namelen) &&
++ (!strncmp(sg->name, name, namelen))) {
++ found = TRUE;
++ goto next;
++ }
++ }
++
++ next:
++
++ if (found && unique) {
++ error = -EEXIST;
++ goto fail_unlock;
++ }
++
++ if (found) {
++ sg->refcount++;
++ goto out;
++ }
++
++ sg = (sm_group_t *) kmalloc(sizeof(sm_group_t) + namelen, GFP_KERNEL);
++ if (!sg) {
++ error = -ENOMEM;
++ goto fail_unlock;
++ }
++ memset(sg, 0, sizeof(sm_group_t) + namelen);
++
++ sg->refcount = 1;
++ sg->service_data = servicedata;
++ sg->ops = ops;
++ sg->level = level;
++ sg->namelen = namelen;
++ memcpy(sg->name, name, namelen);
++ sg->local_id = sm_new_local_id(level);
++ sg->state = SGST_NONE;
++ INIT_LIST_HEAD(&sg->memb);
++ INIT_LIST_HEAD(&sg->joining);
++ init_completion(&sg->event_comp);
++
++ list_add_tail(&sg->list, &sg_registered[level]);
++
++ out:
++ *service_id = sg->local_id;
++ up(&sm_sglock);
++ return 0;
++
++ fail_unlock:
++ up(&sm_sglock);
++ kcl_releaseref_cluster();
++ fail:
++ return error;
++}
++
++/* Context: service */
++
++void kcl_unregister_service(uint32_t local_id)
++{
++ sm_group_t *sg;
++ int level = sm_id_to_level(local_id);
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sg_registered[level], list) {
++ if (sg->local_id == local_id) {
++ SM_ASSERT(sg->refcount,);
++ sg->refcount--;
++
++ if (!sg->refcount) {
++ list_del(&sg->list);
++ kfree(sg);
++ }
++ kcl_releaseref_cluster();
++ break;
++ }
++ }
++ up(&sm_sglock);
++}
++
++/* Context: service */
++
++int kcl_join_service(uint32_t local_id)
++{
++ sm_group_t *sg;
++ sm_sevent_t *sev;
++ int level = sm_id_to_level(local_id);
++ int error, found = FALSE;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sg_registered[level], list) {
++ if (sg->local_id == local_id) {
++ found = TRUE;
++ break;
++ }
++ }
++
++ if (!found) {
++ up(&sm_sglock);
++ error = -ENOENT;
++ goto out;
++ }
++
++ if (sg->state != SGST_NONE) {
++ up(&sm_sglock);
++ error = -EINVAL;
++ goto out;
++ }
++
++ sg->state = SGST_JOIN;
++ set_bit(SGFL_SEVENT, &sg->flags);
++ list_del(&sg->list);
++ list_add_tail(&sg->list, &sm_sg[sg->level]);
++
++ up(&sm_sglock);
++
++ /*
++ * The join is a service event which will be processed asynchronously.
++ */
++
++ sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
++ if (!sev) {
++ error = -ENOMEM;
++ goto out;
++ }
++
++ memset(sev, 0, sizeof (sm_sevent_t));
++ sev->se_state = SEST_JOIN_BEGIN;
++ sev->se_sg = sg;
++ sg->sevent = sev;
++ sm_set_event_id(&sev->se_id);
++
++ new_joinleave(sev);
++ wait_for_completion(&sg->event_comp);
++ error = 0;
++
++ out:
++ return error;
++}
++
++/* Context: service */
++
++int kcl_leave_service(uint32_t local_id)
++{
++ sm_group_t *sg = NULL;
++ sm_sevent_t *sev;
++ int error;
++
++ error = -ENOENT;
++ sg = sm_local_id_to_sg(local_id);
++ if (!sg)
++ goto out;
++
++ /* sg was never joined */
++ error = -EINVAL;
++ if (sg->state == SGST_NONE)
++ goto out;
++
++ /* may still be joining */
++ error = -EBUSY;
++ if (test_and_set_bit(SGFL_SEVENT, &sg->flags))
++ goto out;
++
++ error = -ENOMEM;
++ sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
++ if (!sev)
++ goto out;
++
++ memset(sev, 0, sizeof (sm_sevent_t));
++ sev->se_state = SEST_LEAVE_BEGIN;
++ set_bit(SEFL_LEAVE, &sev->se_flags);
++ sev->se_sg = sg;
++ sg->sevent = sev;
++ sm_set_event_id(&sev->se_id);
++
++ new_joinleave(sev);
++ wait_for_completion(&sg->event_comp);
++ error = 0;
++
++ down(&sm_sglock);
++ list_del(&sg->list);
++ list_add_tail(&sg->list, &sg_registered[sg->level]);
++ up(&sm_sglock);
++
++ out:
++ return error;
++}
++
++static void process_callback(uint32_t local_id, int event_id)
++{
++ sm_group_t *sg;
++ sm_sevent_t *sev;
++ sm_uevent_t *uev;
++
++ sg = sm_local_id_to_sg(local_id);
++ if (!sg)
++ return;
++
++ if (sg->state == SGST_RECOVER) {
++ if (!check_recovery(sg, event_id)) {
++ log_error(sg, "process_callback invalid recover "
++ "event id %d", event_id);
++ return;
++ }
++
++ if (sg->recover_state == RECOVER_START)
++ sg->recover_state = RECOVER_STARTDONE;
++ else
++ log_error(sg, "process_callback recover state %u",
++ sg->recover_state);
++ wake_serviced(DO_RECOVERIES);
++ }
++
++ else if (test_bit(SGFL_SEVENT, &sg->flags) && sg->sevent &&
++ (sg->sevent->se_id == event_id)) {
++ sev = sg->sevent;
++
++ if (test_and_clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags) &&
++ (sev->se_state == SEST_JSTART_SERVICEWAIT))
++ sev->se_state = SEST_JSTART_SERVICEDONE;
++
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ }
++
++ else if (test_bit(SGFL_UEVENT, &sg->flags) &&
++ (sg->uevent.ue_id == event_id)) {
++ uev = &sg->uevent;
++
++ if (test_and_clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags)) {
++ if (uev->ue_state == UEST_JSTART_SERVICEWAIT)
++ uev->ue_state = UEST_JSTART_SERVICEDONE;
++ else if (uev->ue_state == UEST_LSTART_SERVICEWAIT)
++ uev->ue_state = UEST_LSTART_SERVICEDONE;
++ }
++ set_bit(UEFL_CHECK, &uev->ue_flags);
++ wake_serviced(DO_MEMBERSHIP);
++ }
++
++ else
++ log_error(sg, "ignoring service callback id=%x event=%u",
++ local_id, event_id);
++}
++
++void process_callbacks(void)
++{
++ sc_entry_t *se;
++
++ while (1) {
++ se = NULL;
++
++ spin_lock(&callback_lock);
++ if (!list_empty(&callbacks)) {
++ se = list_entry(callbacks.next, sc_entry_t, list);
++ list_del(&se->list);
++ }
++ spin_unlock(&callback_lock);
++
++ if (!se)
++ break;
++ process_callback(se->local_id, se->event_id);
++ kfree(se);
++ schedule();
++ }
++}
++
++/* Context: service */
++
++void kcl_start_done(uint32_t local_id, int event_id)
++{
++ sc_entry_t *se;
++
++ SM_RETRY(se = kmalloc(sizeof(sc_entry_t), GFP_KERNEL), se);
++
++ se->local_id = local_id;
++ se->event_id = event_id;
++
++ spin_lock(&callback_lock);
++ list_add_tail(&se->list, &callbacks);
++ spin_unlock(&callback_lock);
++
++ wake_serviced(DO_CALLBACKS);
++}
++
++/* Context: service */
++
++void kcl_global_service_id(uint32_t local_id, uint32_t *global_id)
++{
++ sm_group_t *sg = sm_local_id_to_sg(local_id);
++
++ if (!sg)
++ log_print("kcl_global_service_id: can't find %x", local_id);
++ else
++ *global_id = sg->global_id;
++}
++
++static void copy_to_service(sm_group_t *sg, struct kcl_service *s)
++{
++ s->level = sg->level;
++ s->local_id = sg->local_id;
++ s->global_id = sg->global_id;
++ s->node_count = sg->memb_count;
++ strcpy(s->name, sg->name);
++}
++
++int kcl_get_services(struct list_head *head, int level)
++{
++ sm_group_t *sg;
++ struct kcl_service *s;
++ int error = -ENOMEM, count = 0;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sg_registered[level], list) {
++ if (head) {
++ s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
++ if (!s)
++ goto out;
++ copy_to_service(sg, s);
++ list_add(&s->list, head);
++ }
++ count++;
++ }
++
++ list_for_each_entry(sg, &sm_sg[level], list) {
++ if (head) {
++ s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
++ if (!s)
++ goto out;
++ copy_to_service(sg, s);
++ list_add(&s->list, head);
++ }
++ count++;
++ }
++
++ error = count;
++ out:
++ up(&sm_sglock);
++ return error;
++}
++
++/* These global variables are declared extern in sm.h. */
++struct list_head sm_sg[SG_LEVELS];
++struct semaphore sm_sglock;
+diff -urN linux-orig/cluster/cman/sm_services.h linux-patched/cluster/cman/sm_services.h
+--- linux-orig/cluster/cman/sm_services.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_services.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_SERVICES_DOT_H__
++#define __SM_SERVICES_DOT_H__
++
++void init_services(void);
++void process_callbacks(void);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_user.c linux-patched/cluster/cman/sm_user.c
+--- linux-orig/cluster/cman/sm_user.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_user.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,563 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++#include "cnxman-private.h"
++
++void copy_to_usernode(struct cluster_node *node, struct cl_cluster_node *unode);
++
++#define UST_REGISTER 1
++#define UST_UNREGISTER 2
++#define UST_JOIN 3
++#define UST_LEAVE 4
++#define UST_JOINED 5
++
++struct event {
++ struct list_head list;
++ service_event_t type;
++ service_start_t start_type;
++ unsigned int event_id;
++ unsigned int last_stop;
++ unsigned int last_start;
++ unsigned int last_finish;
++ unsigned int node_count;
++ uint32_t * nodeids;
++};
++typedef struct event event_t;
++
++struct user_service {
++ uint32_t local_id;
++ pid_t pid;
++ int signal;
++ struct socket * sock;
++ uint8_t state;
++ uint8_t async;
++ struct semaphore lock;
++ struct list_head events;
++ spinlock_t event_lock;
++ unsigned int last_stop;
++ unsigned int last_start;
++ unsigned int last_finish;
++ unsigned int need_startdone;
++ unsigned int node_count;
++ uint32_t * nodeids;
++ int name_len;
++ char name[MAX_SERVICE_NAME_LEN];
++};
++typedef struct user_service user_service_t;
++
++
++static void add_event(user_service_t *us, event_t *ev)
++{
++ spin_lock(&us->event_lock);
++ list_add_tail(&ev->list, &us->events);
++
++ switch(ev->type) {
++ case SERVICE_EVENT_STOP:
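++ /* a stop event carries no id of its own; record the id of
++ the start it stops */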
++ us->last_stop = us->last_start;
++ break;
++ case SERVICE_EVENT_START:
++ us->last_start = ev->event_id;
++ break;
++ case SERVICE_EVENT_FINISH:
++ us->last_finish = ev->event_id;
++ break;
++ case SERVICE_EVENT_LEAVEDONE:
++ break;
++ }
++ spin_unlock(&us->event_lock);
++}
++
++static event_t *get_event(user_service_t *us)
++{
++ event_t *ev = NULL;
++
++ spin_lock(&us->event_lock);
++ if (!list_empty(&us->events)) {
++ ev = list_entry(us->events.next, event_t, list);
++ ev->last_stop = us->last_stop;
++ ev->last_start = us->last_start;
++ ev->last_finish = us->last_finish;
++ }
++ spin_unlock(&us->event_lock);
++ return ev;
++}
++
++static void del_event(user_service_t *us, event_t *ev)
++{
++ spin_lock(&us->event_lock);
++ list_del(&ev->list);
++ spin_unlock(&us->event_lock);
++}
++
++static event_t *alloc_event(void)
++{
++ event_t *ev;
++ SM_RETRY(ev = (event_t *) kmalloc(sizeof(event_t), GFP_KERNEL), ev);
++ memset(ev, 0, sizeof(event_t));
++ return ev;
++}
++
++/* us->lock must be held before calling */
++static void user_notify(user_service_t *us)
++{
++ if (us->sock)
++ queue_oob_skb(us->sock, CLUSTER_OOB_MSG_SERVICEEVENT);
++ if (us->pid && us->signal)
++ kill_proc(us->pid, us->signal, 0);
++}
++
++static service_start_t start_type(int type)
++{
++ switch (type) {
++ case SERVICE_NODE_FAILED:
++ return SERVICE_START_FAILED;
++ case SERVICE_NODE_JOIN:
++ return SERVICE_START_JOIN;
++ case SERVICE_NODE_LEAVE:
++ return SERVICE_START_LEAVE;
++ }
++ return 0;
++}
++
++static int user_stop(void *servicedata)
++{
++ user_service_t *us = (user_service_t *) servicedata;
++ event_t *ev;
++
++ down(&us->lock);
++ if (!us->sock)
++ goto out;
++
++ ev = alloc_event();
++ ev->type = SERVICE_EVENT_STOP;
++
++ add_event(us, ev);
++ user_notify(us);
++ out:
++ up(&us->lock);
++ return 0;
++}
++
++static int user_start(void *servicedata, uint32_t *nodeids, int count,
++ int event_id, int type)
++{
++ user_service_t *us = (user_service_t *) servicedata;
++ event_t *ev;
++
++ down(&us->lock);
++ if (!us->sock) {
++ kcl_start_done(us->local_id, event_id);
++ goto out;
++ }
++
++ us->need_startdone = event_id;
++
++ ev = alloc_event();
++ ev->type = SERVICE_EVENT_START;
++ ev->node_count = count;
++ ev->start_type = start_type(type);
++ ev->event_id = event_id;
++ ev->nodeids = nodeids;
++
++ add_event(us, ev);
++ user_notify(us);
++ out:
++ up(&us->lock);
++ return 0;
++}
++
++static void user_finish(void *servicedata, int event_id)
++{
++ user_service_t *us = (user_service_t *) servicedata;
++ event_t *ev;
++
++ down(&us->lock);
++ if (!us->sock)
++ goto out;
++
++ ev = alloc_event();
++ ev->type = SERVICE_EVENT_FINISH;
++ ev->event_id = event_id;
++
++ add_event(us, ev);
++ user_notify(us);
++ out:
++ up(&us->lock);
++}
++
++struct kcl_service_ops user_service_ops = {
++ .stop = user_stop,
++ .start = user_start,
++ .finish = user_finish
++};
++
++static int user_register(char *name, user_service_t **us_data)
++{
++ user_service_t *us;
++ int len = strlen(name);
++ int error;
++
++ if (len > MAX_SERVICE_NAME_LEN - 1)
++ return -ENAMETOOLONG;
++ if (!len)
++ return -EINVAL;
++
++ us = kmalloc(sizeof(user_service_t), GFP_KERNEL);
++ if (!us)
++ return -ENOMEM;
++ memset(us, 0, sizeof(user_service_t));
++ us->nodeids = NULL;
++ INIT_LIST_HEAD(&us->events);
++ spin_lock_init(&us->event_lock);
++ init_MUTEX(&us->lock);
++ us->name_len = len;
++ memcpy(us->name, name, len);
++
++ error = kcl_register_service(name, len, SERVICE_LEVEL_USER,
++ &user_service_ops, TRUE, (void *) us,
++ &us->local_id);
++ if (error) {
++ kfree(us);
++ us = NULL;
++ }
++ *us_data = us;
++ return error;
++}
++
++static void user_unregister(user_service_t *us)
++{
++ event_t *ev;
++
++ kcl_unregister_service(us->local_id);
++
++ if (us->nodeids)
++ kfree(us->nodeids);
++
++ while ((ev = get_event(us))) {
++ del_event(us, ev);
++ if (ev->nodeids)
++ kfree(ev->nodeids);
++ kfree(ev);
++ }
++}
++
++static int user_join_async(void *arg)
++{
++ user_service_t *us = arg;
++ int user_gone = 0;
++
++ daemonize("cman_userjoin");
++
++ kcl_join_service(us->local_id);
++
++ down(&us->lock);
++ us->state = UST_JOINED;
++ us->async = 0;
++ if (!us->sock) {
++ if (us->need_startdone)
++ kcl_start_done(us->local_id, us->need_startdone);
++ user_gone = 1;
++ }
++ up(&us->lock);
++
++ if (user_gone) {
++ kcl_leave_service(us->local_id);
++ user_unregister(us);
++ kfree(us);
++ }
++ return 0;
++}
++
++static int user_leave_async(void *arg)
++{
++ user_service_t *us = arg;
++
++ daemonize("cman_userleave");
++
++ kcl_leave_service(us->local_id);
++
++ down(&us->lock);
++ us->async = 0;
++ if (!us->sock) {
++ user_unregister(us);
++ kfree(us);
++ } else {
++ event_t *ev = alloc_event();
++ ev->type = SERVICE_EVENT_LEAVEDONE;
++ add_event(us, ev);
++ user_notify(us);
++ up(&us->lock);
++ }
++
++ return 0;
++}
++
++static int user_join(user_service_t *us, int wait)
++{
++ int error = 0;
++
++ if (wait) {
++ error = kcl_join_service(us->local_id);
++ us->state = UST_JOINED;
++ }
++ else {
++ us->async = 1;
++ kernel_thread(user_join_async, us, 0);
++ }
++
++ return error;
++}
++
++static void user_leave(user_service_t *us, int wait)
++{
++ if (wait)
++ kcl_leave_service(us->local_id);
++ else {
++ us->async = 1;
++ kernel_thread(user_leave_async, us, 0);
++ }
++}
++
++static int user_start_done(user_service_t *us, unsigned int event_id)
++{
++ if (!us->need_startdone)
++ return -EINVAL;
++ if (us->need_startdone == event_id)
++ us->need_startdone = 0;
++ kcl_start_done(us->local_id, event_id);
++ return 0;
++}
++
++static void user_set_signal(user_service_t *us, int signal)
++{
++ us->pid = current->pid;
++ us->signal = signal;
++}
++
++static int user_get_event(user_service_t *us,
++ struct cl_service_event *user_event)
++{
++ event_t *ev;
++ struct cl_service_event event;
++
++ ev = get_event(us);
++ if (!ev)
++ return 0;
++
++ event.type = ev->type;
++ event.start_type = ev->start_type;
++ event.event_id = ev->event_id;
++ event.last_stop = ev->last_stop;
++ event.last_start = ev->last_start;
++ event.last_finish = ev->last_finish;
++ event.node_count = ev->node_count;
++
++ if (copy_to_user(user_event, &event, sizeof(struct cl_service_event)))
++ return -EFAULT;
++
++ del_event(us, ev);
++
++ if (ev->type == SERVICE_EVENT_START) {
++ if (us->nodeids)
++ kfree(us->nodeids);
++ us->nodeids = ev->nodeids;
++ us->node_count = ev->node_count;
++ }
++
++ kfree(ev);
++ return 1;
++}
++
++static int user_get_members(user_service_t *us,
++ struct cl_cluster_nodelist *u_nodelist)
++{
++ struct cl_cluster_nodelist user_nodelist;
++ struct cl_cluster_node user_node, *u_node;
++ struct cluster_node *node;
++ unsigned int i;
++ int num_nodes = 0;
++
++ if (!u_nodelist)
++ return us->node_count;
++
++ if (copy_from_user(&user_nodelist, (void __user *) u_nodelist,
++ sizeof(struct cl_cluster_nodelist)))
++ return -EFAULT;
++
++ if (user_nodelist.max_members < us->node_count)
++ return -E2BIG;
++
++ u_node = user_nodelist.nodes;
++
++ for (i = 0; i < us->node_count; i++) {
++ node = find_node_by_nodeid(us->nodeids[i]);
++ if (!node)
++ continue;
++
++ copy_to_usernode(node, &user_node);
++ if (copy_to_user(u_node, &user_node,
++ sizeof(struct cl_cluster_node)))
++ return -EFAULT;
++
++ u_node++;
++ num_nodes++;
++ }
++ return num_nodes;
++}
++
++static int user_global_id(user_service_t *us, uint32_t *id)
++{
++ uint32_t gid = 0;
++
++ if (us->state != UST_JOINED)
++ return -EINVAL;
++
++ kcl_global_service_id(us->local_id, &gid);
++
++ if (copy_to_user(id, &gid, sizeof(uint32_t)))
++ return -EFAULT;
++ return 0;
++}
++
++static int user_set_level(user_service_t *us, int level)
++{
++ int prev_id = us->local_id;
++ int error;
++
++ if (us->state != UST_REGISTER)
++ return -EINVAL;
++
++ error = kcl_register_service(us->name, us->name_len, level,
++ &user_service_ops, TRUE, (void *) us,
++ &us->local_id);
++ if (error)
++ return error;
++
++ kcl_unregister_service(prev_id);
++ return 0;
++}
++
++int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
++{
++ struct cluster_sock *c = cluster_sk(sock->sk);
++ user_service_t *us = c->service_data;
++ int error = 0;
++
++ if (!us && cmd != SIOCCLUSTER_SERVICE_REGISTER)
++ return -EINVAL;
++
++ switch (cmd) {
++ case SIOCCLUSTER_SERVICE_REGISTER:
++ error = user_register((char *) arg, &us);
++ if (!error) {
++ us->state = UST_REGISTER;
++ us->sock = sock;
++ c->service_data = us;
++ }
++ break;
++
++ case SIOCCLUSTER_SERVICE_UNREGISTER:
++ down(&us->lock);
++ us->state = UST_UNREGISTER;
++ user_unregister(us);
++ up(&us->lock);
++ break;
++
++ case SIOCCLUSTER_SERVICE_JOIN:
++ us->state = UST_JOIN;
++ user_join(us, 0);
++ break;
++
++ case SIOCCLUSTER_SERVICE_LEAVE:
++ down(&us->lock);
++ if (us->state != UST_JOINED) {
++ error = -EBUSY;
++ up(&us->lock);
++ } else {
++ us->state = UST_LEAVE;
++ up(&us->lock);
++ user_leave(us, 0);
++ }
++ break;
++
++ case SIOCCLUSTER_SERVICE_SETSIGNAL:
++ user_set_signal(us, (int) arg);
++ break;
++
++ case SIOCCLUSTER_SERVICE_STARTDONE:
++ error = user_start_done(us, (unsigned int) arg);
++ break;
++
++ case SIOCCLUSTER_SERVICE_GETEVENT:
++ error = user_get_event(us, (struct cl_service_event *) arg);
++ break;
++
++ case SIOCCLUSTER_SERVICE_GETMEMBERS:
++ error = user_get_members(us, (struct cl_cluster_nodelist *)arg);
++ break;
++
++ case SIOCCLUSTER_SERVICE_GLOBALID:
++ error = user_global_id(us, (uint32_t *) arg);
++ break;
++
++ case SIOCCLUSTER_SERVICE_SETLEVEL:
++ error = user_set_level(us, (int) arg);
++ break;
++
++ default:
++ error = -EINVAL;
++ }
++
++ return error;
++}
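++
++/*
++ * For illustration, the userland sequence these ioctls are designed for
++ * looks roughly like this. This is a sketch: the exact socket-type
++ * arguments and all error handling are assumptions, not part of this
++ * file.
++ *
++ *	int fd = socket(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT);
++ *
++ *	ioctl(fd, SIOCCLUSTER_SERVICE_REGISTER, "example");
++ *	ioctl(fd, SIOCCLUSTER_SERVICE_JOIN, 0);
++ *
++ *	for (;;) {
++ *		struct cl_service_event ev;
++ *
++ *		if (ioctl(fd, SIOCCLUSTER_SERVICE_GETEVENT, &ev) <= 0)
++ *			continue;
++ *		if (ev.type == SERVICE_EVENT_START)
++ *			ioctl(fd, SIOCCLUSTER_SERVICE_STARTDONE, ev.event_id);
++ *	}
++ *
++ * GETEVENT returns 0 when no event is queued, so a real client would wait
++ * for the CLUSTER_OOB_MSG_SERVICEEVENT notification or a signal rather
++ * than spin.
++ */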
++
++void sm_sock_release(struct socket *sock)
++{
++ struct cluster_sock *c = cluster_sk(sock->sk);
++ user_service_t *us = c->service_data;
++ int state;
++
++ if (!us)
++ return;
++
++ down(&us->lock);
++ us->sock = NULL;
++ c->service_data = NULL;
++
++ if (us->need_startdone)
++ kcl_start_done(us->local_id, us->need_startdone);
++
++ if (us->async) {
++ /* async thread will clean up before exiting */
++ up(&us->lock);
++ return;
++ }
++ state = us->state;
++ up(&us->lock);
++
++ switch (state) {
++ case UST_JOIN:
++ break;
++ case UST_JOINED:
++ user_leave(us, 1);
++ /* fall through */
++ case UST_LEAVE:
++ case UST_REGISTER:
++ user_unregister(us);
++ /* fall through */
++ case UST_UNREGISTER:
++ kfree(us);
++ break;
++ }
++}
+diff -urN linux-orig/cluster/cman/sm_user.h linux-patched/cluster/cman/sm_user.h
+--- linux-orig/cluster/cman/sm_user.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_user.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,21 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_USER_DOT_H__
++#define __SM_USER_DOT_H__
++
++int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
++void sm_sock_release(struct socket *sock);
++void sm_sock_bind(struct socket *sock);
++
++#endif
+diff -urN linux-orig/include/cluster/cnxman-socket.h linux-patched/include/cluster/cnxman-socket.h
+--- linux-orig/include/cluster/cnxman-socket.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/include/cluster/cnxman-socket.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,226 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* CMAN socket interface header,
++ may be included by user or kernel code */
++
++#ifndef __CNXMAN_SOCKET_H
++#define __CNXMAN_SOCKET_H
++
++/* Just made these up but the address family must be less than 32 (NPROTO) */
++#define AF_CLUSTER 31
++#define PF_CLUSTER AF_CLUSTER
++
++/* Protocol(socket) types */
++#define CLPROTO_MASTER 2
++#define CLPROTO_CLIENT 3
++
++/* Setsockopt -- maybe should be ioctls?? */
++#define CLU_SET_MULTICAST 100
++#define CLU_JOIN_CLUSTER 101
++#define CLU_LEAVE_CLUSTER 102
++#define CLU_SET_RCVONLY 103
++#define CLU_SET_UNICAST 104
++#define KCL_SET_MULTICAST 105
++#define KCL_SET_RCVONLY 106
++#define KCL_SET_UNICAST 107
++#define KCL_SET_NODENAME 108
++#define CLU_SET_NODENAME 109
++
++/* ioctls -- should register these properly */
++#define SIOCCLUSTER_NOTIFY _IOW('x', 0x01, int)
++#define SIOCCLUSTER_REMOVENOTIFY _IO( 'x', 0x02)
++#define SIOCCLUSTER_GETMEMBERS _IOR('x', 0x03, struct cl_cluster_nodelist)
++#define SIOCCLUSTER_SETEXPECTED_VOTES _IOW('x', 0x04, int)
++#define SIOCCLUSTER_ISQUORATE _IO( 'x', 0x05)
++#define SIOCCLUSTER_ISLISTENING _IOW('x', 0x06, struct cl_listen_request)
++#define SIOCCLUSTER_GETALLMEMBERS _IOR('x', 0x07, struct cl_cluster_nodelist)
++#define SIOCCLUSTER_SET_VOTES _IOW('x', 0x08, int)
++#define SIOCCLUSTER_GET_VERSION _IOR('x', 0x09, struct cl_version)
++#define SIOCCLUSTER_SET_VERSION _IOW('x', 0x0a, struct cl_version)
++#define SIOCCLUSTER_ISACTIVE _IO( 'x', 0x0b)
++#define SIOCCLUSTER_KILLNODE _IOW('x', 0x0c, int)
++#define SIOCCLUSTER_GET_JOINCOUNT _IO( 'x', 0x0d)
++#define SIOCCLUSTER_SERVICE_REGISTER _IOW('x', 0x0e, char)
++#define SIOCCLUSTER_SERVICE_UNREGISTER _IO('x', 0x0f)
++#define SIOCCLUSTER_SERVICE_JOIN _IO( 'x', 0x10)
++#define SIOCCLUSTER_SERVICE_LEAVE _IO( 'x', 0x20)
++#define SIOCCLUSTER_SERVICE_SETSIGNAL _IOW('x', 0x30, int)
++#define SIOCCLUSTER_SERVICE_STARTDONE _IOW('x', 0x40, unsigned int)
++#define SIOCCLUSTER_SERVICE_GETEVENT _IOR('x', 0x50, struct cl_service_event)
++#define SIOCCLUSTER_SERVICE_GETMEMBERS _IOR('x', 0x60, struct cl_cluster_nodelist)
++#define SIOCCLUSTER_SERVICE_GLOBALID _IOR('x', 0x70, uint32_t)
++#define SIOCCLUSTER_SERVICE_SETLEVEL _IOR('x', 0x80, int)
++#define SIOCCLUSTER_GETNODE _IOWR('x', 0x90, struct cl_cluster_node)
++#define SIOCCLUSTER_BARRIER _IOW('x', 0x0a0, struct cl_barrier_info)
++
++/* Maximum size of a cluster message */
++#define MAX_CLUSTER_MESSAGE 1500
++#define MAX_CLUSTER_MEMBER_NAME_LEN 255
++#define MAX_BARRIER_NAME_LEN 33
++#define MAX_SA_ADDR_LEN 12
++#define MAX_CLUSTER_NAME_LEN 16
++
++/* Well-known cluster port numbers */
++#define CLUSTER_PORT_MEMBERSHIP 1 /* Mustn't block during cluster
++ * transitions! */
++#define CLUSTER_PORT_SERVICES 2
++#define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */
++#define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */
++#define CLUSTER_PORT_SLM 12 /* LVM SLM (simple lock manager) */
++
++/* Port numbers above this will be blocked when the cluster is inquorate or in
++ * transition */
++#define HIGH_PROTECTED_PORT 9
++
++/* Reasons for leaving the cluster */
++#define CLUSTER_LEAVEFLAG_DOWN 0 /* Normal shutdown */
++#define CLUSTER_LEAVEFLAG_KILLED 1
++#define CLUSTER_LEAVEFLAG_PANIC 2
++#define CLUSTER_LEAVEFLAG_REMOVED 3 /* This one can reduce quorum */
++#define CLUSTER_LEAVEFLAG_REJECTED 4 /* Not allowed into the cluster in the
++ * first place */
++#define CLUSTER_LEAVEFLAG_INCONSISTENT 5 /* Our view of the cluster is
++ * in a minority */
++#define CLUSTER_LEAVEFLAG_DEAD 6 /* Discovered to be dead */
++#define CLUSTER_LEAVEFLAG_FORCE 0x10 /* Forced by command-line */
++
++/* OOB messages sent to a local socket */
++#define CLUSTER_OOB_MSG_PORTCLOSED 1
++#define CLUSTER_OOB_MSG_STATECHANGE 2
++#define CLUSTER_OOB_MSG_SERVICEEVENT 3
++
++/* Sendmsg flags, these are above the normal sendmsg flags so they don't
++ * interfere */
++#define MSG_NOACK 0x010000 /* Don't need an ACK for this message */
++#define MSG_QUEUE 0x020000 /* Queue the message for sending later */
++#define MSG_MULTICAST 0x080000 /* Message was sent to all nodes in the cluster
++ */
++#define MSG_ALLINT 0x100000 /* Send out of all interfaces */
++
++typedef enum { NODESTATE_REMOTEMEMBER, NODESTATE_JOINING, NODESTATE_MEMBER,
++ NODESTATE_DEAD } nodestate_t;
++
++
++struct sockaddr_cl {
++ unsigned short scl_family;
++ unsigned char scl_flags;
++ unsigned char scl_port;
++ int scl_nodeid;
++};
++
++/* This is how we pass the multicast socket into kernel space. addr is the
++ * multicast address to use in the address family of the socket (eg for UDP it
++ * might be a broadcast or multicast address such as 255.255.255.255) */
++struct cl_multicast_sock {
++ int fd; /* FD of master socket to do multicast on */
++ int number; /* Socket number, to match up recvonly & bcast
++ * sockets */
++};
++
++/* Cluster configuration info passed when we join the cluster */
++struct cl_join_cluster_info {
++ unsigned char votes;
++ unsigned int expected_votes;
++ unsigned int two_node;
++ unsigned int config_version;
++
++ char cluster_name[17];
++};
++
++
++/* This is the structure, per node, returned from the membership ioctl */
++struct cl_cluster_node {
++ unsigned int size;
++ unsigned int node_id;
++ unsigned int us;
++ unsigned int leave_reason;
++ unsigned int incarnation;
++ nodestate_t state;
++ char name[MAX_CLUSTER_MEMBER_NAME_LEN];
++ unsigned char votes;
++};
++
++/* The struct passed to the membership ioctls */
++struct cl_cluster_nodelist {
++ uint32_t max_members;
++ struct cl_cluster_node *nodes;
++};
++
++/* Structure passed to SIOCCLUSTER_ISLISTENING */
++struct cl_listen_request {
++ unsigned char port;
++ int nodeid;
++};
++
++/* A Cluster PORTCLOSED message - received by a local user as an OOB message */
++struct cl_portclosed_oob {
++ unsigned char cmd; /* CLUSTER_OOB_MSG_PORTCLOSED */
++ unsigned char port;
++};
++
++/* Get all version numbers or set the config version */
++struct cl_version {
++ unsigned int major;
++ unsigned int minor;
++ unsigned int patch;
++ unsigned int config;
++};
++
++/* structure passed to barrier ioctls */
++struct cl_barrier_info {
++ char cmd;
++ char name[MAX_BARRIER_NAME_LEN];
++ unsigned int flags;
++ unsigned long arg;
++};
++
++typedef enum { SERVICE_EVENT_STOP, SERVICE_EVENT_START, SERVICE_EVENT_FINISH,
++ SERVICE_EVENT_LEAVEDONE } service_event_t;
++
++typedef enum { SERVICE_START_FAILED, SERVICE_START_JOIN, SERVICE_START_LEAVE }
++ service_start_t;
++
++struct cl_service_event {
++ service_event_t type;
++ service_start_t start_type;
++ unsigned int event_id;
++ unsigned int last_stop;
++ unsigned int last_start;
++ unsigned int last_finish;
++ unsigned int node_count;
++};
++
++
++/* Commands to the barrier ioctl */
++#define BARRIER_IOCTL_REGISTER 1
++#define BARRIER_IOCTL_CHANGE 2
++#define BARRIER_IOCTL_DELETE 3
++#define BARRIER_IOCTL_WAIT 4
++
++/* Attributes of a barrier - bitmask */
++#define BARRIER_ATTR_AUTODELETE 1
++#define BARRIER_ATTR_MULTISTEP 2
++#define BARRIER_ATTR_MANUAL 4
++#define BARRIER_ATTR_ENABLED 8
++#define BARRIER_ATTR_CALLBACK 16
++
++/* Attribute setting commands */
++#define BARRIER_SETATTR_AUTODELETE 1
++#define BARRIER_SETATTR_MULTISTEP 2
++#define BARRIER_SETATTR_ENABLED 3
++#define BARRIER_SETATTR_NODES 4
++#define BARRIER_SETATTR_CALLBACK 5
++#define BARRIER_SETATTR_TIMEOUT 6
++
++#endif
+diff -urN linux-orig/include/cluster/cnxman.h linux-patched/include/cluster/cnxman.h
+--- linux-orig/include/cluster/cnxman.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/include/cluster/cnxman.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,87 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __CNXMAN_H
++#define __CNXMAN_H
++
++#include "linux/in6.h"
++#include "cluster/cnxman-socket.h"
++
++/* In-kernel API */
++
++/* This is the structure, per node, returned from the membership request */
++struct kcl_cluster_node {
++ unsigned int size;
++ unsigned int node_id;
++ unsigned int us;
++ unsigned int leave_reason;
++ unsigned int incarnation;
++ nodestate_t state;
++ struct list_head list;
++ char name[MAX_CLUSTER_MEMBER_NAME_LEN];
++ unsigned char votes;
++};
++
++struct cluster_node_addr {
++ struct list_head list;
++ unsigned char addr[sizeof(struct sockaddr_in6)];/* A large sockaddr */
++ int addr_len;
++};
++
++
++/* Reasons for a kernel membership callback */
++typedef enum { CLUSTER_RECONFIG, DIED, LEAVING, NEWNODE } kcl_callback_reason;
++
++/* Kernel version of above, the void *sock is a struct socket */
++struct kcl_multicast_sock {
++ void *sock;
++ int number; /* Socket number, to match up recvonly & bcast
++ * sockets */
++};
++
++extern int kcl_sendmsg(struct socket *sock, void *buf, int size,
++ struct sockaddr_cl *caddr, int addr_len,
++ unsigned int flags);
++extern int kcl_register_read_callback(struct socket *sock,
++ int (*routine) (char *, int, char *, int,
++ unsigned int));
++extern int kcl_add_callback(void (*callback) (kcl_callback_reason, long));
++extern int kcl_remove_callback(void (*callback) (kcl_callback_reason, long));
++extern int kcl_get_members(struct list_head *list);
++extern int kcl_get_member_ids(uint32_t * idbuf, int size);
++extern int kcl_get_all_members(struct list_head *list);
++extern int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
++ struct kcl_cluster_node *n);
++extern int kcl_get_node_by_name(unsigned char *name,
++ struct kcl_cluster_node *n);
++extern int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n);
++extern int kcl_is_quorate(void);
++extern int kcl_addref_cluster(void);
++extern int kcl_releaseref_cluster(void);
++extern int kcl_cluster_name(char **cname);
++extern int kcl_get_current_interface(void);
++extern struct list_head *kcl_get_node_addresses(int nodeid);
++
++extern int kcl_barrier_register(char *name, unsigned int flags,
++ unsigned int nodes);
++extern int kcl_barrier_setattr(char *name, unsigned int attr,
++ unsigned long arg);
++extern int kcl_barrier_delete(char *name);
++extern int kcl_barrier_wait(char *name);
++extern int kcl_barrier_cancel(char *name);
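++
++/*
++ * For illustration, a simple synchronous rendezvous of two nodes using the
++ * barrier calls above might look like this (a sketch; the flag value and
++ * the blocking behaviour of kcl_barrier_wait() are assumptions):
++ *
++ *	kcl_barrier_register("example-barrier", 0, 2);
++ *	kcl_barrier_wait("example-barrier");
++ *	kcl_barrier_delete("example-barrier");
++ */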
++
++extern int kcl_register_quorum_device(char *name, int votes);
++extern int kcl_unregister_quorum_device(void);
++extern int kcl_quorum_device_available(int yesno);
++
++#endif
+diff -urN linux-orig/include/cluster/service.h linux-patched/include/cluster/service.h
+--- linux-orig/include/cluster/service.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/include/cluster/service.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,102 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SERVICE_DOT_H__
++#define __SERVICE_DOT_H__
++
++/*
++ * Interface between service manager and services
++ */
++
++/*
++ * Service levels are started in order from lowest, so level 0 is started on
++ * all nodes before level 1 is started.
++ */
++
++#define SERVICE_LEVEL_FENCE (0)
++#define SERVICE_LEVEL_GDLM (1)
++#define SERVICE_LEVEL_GFS (2)
++#define SERVICE_LEVEL_USER (3)
++
++#define MAX_SERVICE_NAME_LEN (33)
++
++/*
++ * The type of start a service receives. The start (and preceding stop) may be
++ * due to a node joining or leaving the SG or due to a node having failed.
++ */
++
++#define SERVICE_NODE_FAILED (1)
++#define SERVICE_NODE_JOIN (2)
++#define SERVICE_NODE_LEAVE (3)
++
++
++struct kcl_service {
++ struct list_head list;
++ uint16_t level;
++ uint32_t local_id;
++ uint32_t global_id;
++ int node_count;
++ char name[MAX_SERVICE_NAME_LEN];
++};
++
++int kcl_get_services(struct list_head *list, int level);
++
++
++/*
++ * These routines which run in CMAN context must return quickly and cannot
++ * block.
++ */
++
++struct kcl_service_ops {
++ int (*stop) (void *servicedata);
++ int (*start) (void *servicedata, uint32_t *nodeids, int count,
++ int event_id, int type);
++ void (*finish) (void *servicedata, int event_id);
++};
++
++/*
++ * Register will cause CMAN to create a Service Group (SG) for the named
++ * instance of the service. A local ID is returned which is used to join,
++ * leave and unregister the service.
++ */
++
++int kcl_register_service(char *name, int namelen, int level,
++ struct kcl_service_ops *ops, int unique,
++ void *servicedata, uint32_t *local_id);
++
++void kcl_unregister_service(uint32_t local_id);
++
++/*
++ * Once a service is joined it will be managed by CMAN and receive start, stop,
++ * and finish calls. After leave is called the service is no longer managed by
++ * CMAN. The first start for a service may arrive before kcl_join_service()
++ * returns.
++ */
++
++int kcl_join_service(uint32_t local_id);
++int kcl_leave_service(uint32_t local_id);
++
++/*
++ * After a service is started, it can ask for its cluster-wide unique ID.
++ */
++
++void kcl_global_service_id(uint32_t local_id, uint32_t * global_id);
++
++/*
++ * Called by a service when it's done with a start(). Cannot be called from
++ * the start function.
++ */
++
++void kcl_start_done(uint32_t local_id, int event_id);
++
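++/*
++ * For illustration, the lifecycle implied by the calls above, as a sketch
++ * ("example", example_ops and data are invented; error handling elided):
++ *
++ *	uint32_t id;
++ *
++ *	kcl_register_service("example", 7, SERVICE_LEVEL_USER,
++ *			     &example_ops, TRUE, data, &id);
++ *	kcl_join_service(id);
++ *	...
++ *	kcl_leave_service(id);
++ *	kcl_unregister_service(id);
++ *
++ * Between join and leave the stop/start/finish ops arrive; the first
++ * start may arrive before kcl_join_service() returns.
++ */
++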
++#endif
--- /dev/null
+# Add DLM to the build system
+diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig
+--- linux-2.6.7/cluster/Kconfig 2004-06-17 15:00:36.000000000 +0800
++++ linux/cluster/Kconfig 2004-06-17 15:00:57.000000000 +0800
+@@ -10,4 +10,22 @@ config CLUSTER
+ needed by all the other components. It provides membership services
+ for those other subsystems.
+
++config CLUSTER_DLM
++ tristate "Distributed Lock Manager"
++ depends on CLUSTER
++ ---help---
++ A fully distributed lock manager, providing cluster-wide locking services
++ and protected lock namespaces for kernel and userland applications.
++
++config CLUSTER_DLM_PROCLOCKS
++ boolean "/proc/locks support for DLM"
++ depends on CLUSTER_DLM
++ depends on PROC_FS
++ ---help---
++ If this option is enabled, a file will appear at /proc/cluster/dlm_locks.
++ Write the name of a lockspace known to the DLM into this "file", then
++ read back a list of all the resources and locks in that lockspace that
++ are known to the local node. Note that because the DLM is distributed,
++ this may not be the full lock picture.
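++
++ For example, assuming a lockspace named "myls" exists:
++
++ echo myls > /proc/cluster/dlm_locks
++ cat /proc/cluster/dlm_locks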
++
+ endmenu
+diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile
+--- linux-2.6.7/cluster/Makefile 2004-06-17 15:00:36.000000000 +0800
++++ linux/cluster/Makefile 2004-06-17 15:00:57.000000000 +0800
+@@ -1,3 +1,4 @@
+ obj-y := nocluster.o
+
+ obj-$(CONFIG_CLUSTER) += cman/
++obj-$(CONFIG_CLUSTER_DLM) += dlm/
+diff -urN -p linux-2.6.7/cluster/dlm/Makefile linux/cluster/dlm/Makefile
+--- linux-2.6.7/cluster/dlm/Makefile 1970-01-01 07:30:00.000000000 +0730
++++ linux/cluster/dlm/Makefile 2004-06-17 15:00:57.000000000 +0800
+@@ -0,0 +1,23 @@
++dlm-objs := ast.o \
++ config.o \
++ device.o \
++ dir.o \
++ lkb.o \
++ locking.o \
++ lockqueue.o \
++ lockspace.o \
++ lowcomms.o \
++ main.o \
++ memory.o \
++ midcomms.o \
++ nodes.o \
++ proc.o \
++ queries.o \
++ rebuild.o \
++ reccomms.o \
++ recover.o \
++ recoverd.o \
++ rsb.o \
++ util.o
++
++obj-$(CONFIG_CLUSTER_DLM) += dlm.o
+diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
+--- linux-orig/cluster/dlm/ast.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/ast.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,581 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * This delivers ASTs and checks for dead remote requests and deadlocks.
++ */
++
++#include <linux/timer.h>
++
++#include "dlm_internal.h"
++#include "rsb.h"
++#include "lockqueue.h"
++#include "dir.h"
++#include "locking.h"
++#include "lkb.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "ast.h"
++#include "nodes.h"
++#include "config.h"
++
++/* Wake up flags for astd */
++#define GDLMD_WAKE_ASTS 1
++#define GDLMD_WAKE_TIMER 2
++
++static struct list_head _deadlockqueue;
++static struct semaphore _deadlockqueue_lock;
++static struct list_head _lockqueue;
++static struct semaphore _lockqueue_lock;
++static struct timer_list _lockqueue_timer;
++static struct list_head _ast_queue;
++static struct semaphore _ast_queue_lock;
++static wait_queue_head_t _astd_waitchan;
++static atomic_t _astd_running;
++static long _astd_pid;
++static unsigned long _astd_wakeflags;
++static struct completion _astd_done;
++
++void add_to_lockqueue(gd_lkb_t *lkb)
++{
++ /* Time stamp the entry so we know if it's been waiting too long */
++ lkb->lkb_lockqueue_time = jiffies;
++
++ down(&_lockqueue_lock);
++ list_add(&lkb->lkb_lockqueue, &_lockqueue);
++ up(&_lockqueue_lock);
++}
++
++void remove_from_lockqueue(gd_lkb_t *lkb)
++{
++ down(&_lockqueue_lock);
++ list_del(&lkb->lkb_lockqueue);
++ up(&_lockqueue_lock);
++}
++
++void add_to_deadlockqueue(gd_lkb_t *lkb)
++{
++ if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
++ return;
++ lkb->lkb_duetime = jiffies;
++ down(&_deadlockqueue_lock);
++ list_add(&lkb->lkb_deadlockq, &_deadlockqueue);
++ up(&_deadlockqueue_lock);
++}
++
++void remove_from_deadlockqueue(gd_lkb_t *lkb)
++{
++ if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
++ return;
++
++ down(&_deadlockqueue_lock);
++ list_del(&lkb->lkb_deadlockq);
++ up(&_deadlockqueue_lock);
++
++ /* Invalidate the due time */
++ memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime));
++}
++
++void remove_from_astqueue(gd_lkb_t *lkb)
++{
++ down(&_ast_queue_lock);
++ if (lkb->lkb_asts_to_deliver)
++ list_del(&lkb->lkb_astqueue);
++ lkb->lkb_asts_to_deliver = 0;
++ up(&_ast_queue_lock);
++}
++
++/*
++ * Actually deliver an AST to a user. The caller MUST hold the ast queue lock
++ * and we unlock it for the duration of the user call, otherwise things can
++ * deadlock.
++ */
++
++static void deliver_ast(gd_lkb_t *lkb, gd_ast_type_t astt)
++{
++ void (*cast) (long param) = lkb->lkb_astaddr;
++ void (*bast) (long param, int mode) = lkb->lkb_bastaddr;
++
++ up(&_ast_queue_lock);
++
++ if (cast && (astt == GDLM_QUEUE_COMPAST))
++ cast(lkb->lkb_astparam);
++
++ else if (bast && (astt == GDLM_QUEUE_BLKAST)
++ && (lkb->lkb_status == GDLM_LKSTS_GRANTED))
++ bast(lkb->lkb_astparam, (int) lkb->lkb_bastmode);
++
++ /*
++ * Remove LKB if requested. It is up to the caller to remove the LKB
++ * from any resource queue it may be on.
++ *
++ * NOTE: we check lkb_asts_to_deliver here in case an ast for us was
++ * queued during the AST delivery itself (eg a user called dlm_unlock
++ * in the AST routine!)
++ */
++
++ if (lkb->lkb_flags & GDLM_LKFLG_DELAST && astt == GDLM_QUEUE_COMPAST &&
++ lkb->lkb_asts_to_deliver == 0) {
++ gd_res_t *rsb = lkb->lkb_resource;
++ struct rw_semaphore *in_recovery = &rsb->res_ls->ls_in_recovery;
++
++ down_read(in_recovery);
++ release_lkb(rsb->res_ls, lkb);
++ release_rsb(rsb);
++ up_read(in_recovery);
++ }
++
++ /* This queue can get very big so we schedule here to give the rest of
++ * the cluster a chance to do some work. */
++ schedule();
++
++ down(&_ast_queue_lock);
++}
++
++/*
++ * Queue an AST for delivery; this only deals with
++ * kernel ASTs, the usermode API piggybacks on top of this.
++ *
++ * This can be called in either the user or DLM context.
++ * ASTs are queued EVEN IF we are already running in gdlm_astd
++ * context as we don't know what other locks are held (eg we could
++ * be called from a lock operation that was itself called from
++ * another AST!)
++ * If the AST is to be queued remotely then a message is sent to
++ * the target system via midcomms.
++ */
++
++void queue_ast(gd_lkb_t *lkb, gd_ast_type_t astt, uint8_t rqmode)
++{
++ struct gd_remlockrequest req;
++
++ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
++ /*
++ * Send a message to have an ast queued remotely. Note: we do
++ * not send remote completion asts, they are handled as part of
++ * remote lock granting.
++ */
++
++ if (astt == GDLM_QUEUE_BLKAST) {
++ req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST;
++ req.rr_header.rh_length = sizeof(req);
++ req.rr_header.rh_flags = 0;
++ req.rr_header.rh_lkid = lkb->lkb_id;
++ req.rr_header.rh_lockspace =
++ lkb->lkb_resource->res_ls->ls_global_id;
++ req.rr_status = lkb->lkb_retstatus;
++ req.rr_remlkid = lkb->lkb_remid;
++ req.rr_rqmode = rqmode;
++
++ midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
++ lkb->lkb_resource->res_ls->ls_allocation);
++
++ } else if (lkb->lkb_retstatus == -EDEADLOCK) {
++ /*
++ * We only queue remote Completion ASTs here for error
++ * completions that happen out of band.
++ * DEADLOCK is one such.
++ */
++
++ req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST;
++ req.rr_header.rh_length = sizeof(req);
++ req.rr_header.rh_flags = 0;
++ req.rr_header.rh_lkid = lkb->lkb_id;
++ req.rr_header.rh_lockspace =
++ lkb->lkb_resource->res_ls->ls_global_id;
++ req.rr_status = lkb->lkb_retstatus;
++ req.rr_remlkid = lkb->lkb_remid;
++ req.rr_rqmode = rqmode;
++
++ midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
++ lkb->lkb_resource->res_ls->ls_allocation);
++ }
++ } else {
++ /*
++ * Prepare info which will be returned in ast/bast.
++ */
++
++ if (astt == GDLM_QUEUE_BLKAST) {
++ lkb->lkb_bastmode = rqmode;
++ } else {
++ lkb->lkb_lksb->sb_status = lkb->lkb_retstatus;
++
++ if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
++ lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED;
++ else
++ lkb->lkb_lksb->sb_flags = 0;
++ }
++
++ /*
++ * Queue ast/bast or deliver directly. astd can deliver ASTs
++ * during deadlock detection or lock timeouts.
++ */
++
++ down(&_ast_queue_lock);
++
++ if (!lkb->lkb_asts_to_deliver)
++ list_add_tail(&lkb->lkb_astqueue, &_ast_queue);
++ lkb->lkb_asts_to_deliver |= astt;
++
++ up(&_ast_queue_lock);
++
++ /* It is the responsibility of the caller to call wake_astd()
++ * once it has finished any other locking operations that
++ * queue ASTs, so that they are all delivered together */
++ }
++}
++
++/*
++ * Process any LKBs on the AST queue. They were queued by queue_ast().
++ */
++
++static void process_asts(void)
++{
++ gd_lkb_t *lkb;
++ uint32_t to_deliver;
++
++ down(&_ast_queue_lock);
++
++ while (!list_empty(&_ast_queue)) {
++ lkb = list_entry(_ast_queue.next, gd_lkb_t, lkb_astqueue);
++ /* Entries can be removed or re-queued as soon as
++ * _ast_queue_lock is released, so always pop the head. */
++
++ to_deliver = lkb->lkb_asts_to_deliver;
++ lkb->lkb_asts_to_deliver = 0;
++ list_del(&lkb->lkb_astqueue);
++
++ if ((to_deliver & GDLM_QUEUE_COMPAST))
++ deliver_ast(lkb, GDLM_QUEUE_COMPAST);
++
++ if ((to_deliver & GDLM_QUEUE_BLKAST))
++ deliver_ast(lkb, GDLM_QUEUE_BLKAST);
++ }
++ up(&_ast_queue_lock);
++}
++
++void lockqueue_lkb_mark(gd_ls_t *ls)
++{
++ gd_lkb_t *lkb, *safe;
++ int count = 0;
++
++ log_all(ls, "mark waiting requests");
++
++ down(&_lockqueue_lock);
++
++ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
++
++ if (lkb->lkb_resource->res_ls != ls)
++ continue;
++
++ /*
++ * These lkb's are new and the master is being looked up. Mark
++ * the lkb request to be resent. Even if the destination node
++ * for the request is still living and has our request, it will
++ * purge all resdir requests in purge_requestqueue. If there's
++ * a reply to the LOOKUP request in our requestqueue (the reply
++ * arrived after ls_stop), it is invalid and will be discarded
++ * in purge_requestqueue, too.
++ */
++
++ if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
++ GDLM_ASSERT(lkb->lkb_nodeid == -1,
++ log_error(ls, "nodeid=%d\n",
++ lkb->lkb_nodeid););
++
++ lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
++ count++;
++ continue;
++ }
++
++ /*
++ * These lkb's have an outstanding request to a bygone node.
++ * The request will be redirected to the new master node in
++ * resend_cluster_requests(). Don't mark the request for
++ * resending if there's a reply for it saved in the
++ * requestqueue.
++ */
++
++ if (in_nodes_gone(ls, lkb->lkb_nodeid) &&
++ !reply_in_requestqueue(ls, lkb->lkb_id)) {
++
++ lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
++
++ /*
++ * Don't rebuild this lkb on a new rsb in
++ * rebuild_rsbs_send().
++ */
++
++ if (lkb->lkb_lockqueue_state ==
++ GDLM_LQSTATE_WAIT_CONDGRANT) {
++ GDLM_ASSERT(lkb->lkb_status ==
++ GDLM_LKSTS_WAITING, );
++ lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
++ }
++
++ /*
++ * This flag indicates to the new master that his lkb
++ * is in the midst of a convert request and should be
++ * placed on the granted queue rather than the convert
++ * queue. We will resend this convert request to the
++ * new master.
++ */
++
++ else if (lkb->lkb_lockqueue_state ==
++ GDLM_LQSTATE_WAIT_CONVERT) {
++ GDLM_ASSERT(lkb->lkb_status ==
++ GDLM_LKSTS_CONVERT, );
++ lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT;
++ }
++
++ count++;
++ }
++ }
++ up(&_lockqueue_lock);
++
++ log_all(ls, "marked %d requests", count);
++}
++
++int resend_cluster_requests(gd_ls_t *ls)
++{
++ gd_lkb_t *lkb, *safe;
++ int error = 0, state, count = 0;
++
++ log_all(ls, "resend marked requests");
++
++ down(&_lockqueue_lock);
++
++ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
++
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
++ log_debug(ls, "resend_cluster_requests: aborted");
++ error = -EINTR;
++ break;
++ }
++
++ if (lkb->lkb_resource->res_ls != ls)
++ continue;
++
++ log_debug(ls, "resend_cluster_requests id=%x nodeid=%d "
++ "lqstate=%u flags=%x", lkb->lkb_id, lkb->lkb_nodeid,
++ lkb->lkb_lockqueue_state, lkb->lkb_flags);
++
++ /*
++ * Resend/process the lockqueue lkb's (in-progress requests)
++ * that were flagged at the start of recovery in
++ * lockqueue_lkb_mark().
++ */
++
++ if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) {
++ lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND;
++ lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD;
++ lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
++
++ if (lkb->lkb_nodeid == -1) {
++ /*
++ * Send lookup to new resdir node.
++ */
++ lkb->lkb_lockqueue_time = jiffies;
++ send_cluster_request(lkb,
++ lkb->lkb_lockqueue_state);
++ }
++
++ else if (lkb->lkb_nodeid != 0) {
++ /*
++ * There's a new RSB master (that's not us.)
++ */
++ lkb->lkb_lockqueue_time = jiffies;
++ send_cluster_request(lkb,
++ lkb->lkb_lockqueue_state);
++ }
++
++ else {
++ /*
++ * We are the new RSB master for this lkb
++ * request.
++ */
++ state = lkb->lkb_lockqueue_state;
++ lkb->lkb_lockqueue_state = 0;
++ /* list_del equals remove_from_lockqueue() */
++ list_del(&lkb->lkb_lockqueue);
++ process_remastered_lkb(lkb, state);
++ }
++
++ count++;
++ }
++ }
++ up(&_lockqueue_lock);
++
++ log_all(ls, "resent %d requests", count);
++ return error;
++}
++
++/*
++ * Process any LKBs on the lock queue. This just
++ * checks whether entries have been on the queue
++ * too long and fails those requests with -ETIMEDOUT.
++ */
++
++static void process_lockqueue(void)
++{
++ gd_lkb_t *lkb, *safe;
++ gd_ls_t *ls;
++ int count = 0;
++
++ down(&_lockqueue_lock);
++
++ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
++ ls = lkb->lkb_resource->res_ls;
++
++ if (test_bit(LSFL_NOTIMERS, &ls->ls_flags))
++ continue;
++
++ /* Don't time out locks that are in transition */
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
++ continue;
++
++ if (check_timeout(lkb->lkb_lockqueue_time,
++ dlm_config.lock_timeout)) {
++ count++;
++ list_del(&lkb->lkb_lockqueue);
++ up(&_lockqueue_lock);
++ cancel_lockop(lkb, -ETIMEDOUT);
++ down(&_lockqueue_lock);
++ }
++ }
++ up(&_lockqueue_lock);
++
++ if (count)
++ wake_astd();
++
++ if (atomic_read(&_astd_running))
++ mod_timer(&_lockqueue_timer,
++ jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
++}
++
++/* Look for deadlocks */
++static void process_deadlockqueue(void)
++{
++ gd_lkb_t *lkb, *safe;
++
++ down(&_deadlockqueue_lock);
++
++ list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) {
++ gd_lkb_t *kill_lkb;
++
++ /* Only look at "due" locks */
++ if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime))
++ break;
++
++ /* Don't look at locks that are in transition */
++ if (!test_bit(LSFL_LS_RUN,
++ &lkb->lkb_resource->res_ls->ls_flags))
++ continue;
++
++ up(&_deadlockqueue_lock);
++
++ /* Lock has hit due time, check for conversion deadlock */
++ kill_lkb = conversion_deadlock_check(lkb);
++ if (kill_lkb)
++ cancel_conversion(kill_lkb, -EDEADLOCK);
++
++ down(&_deadlockqueue_lock);
++ }
++ up(&_deadlockqueue_lock);
++}
++
++static __inline__ int no_asts(void)
++{
++ int ret;
++
++ down(&_ast_queue_lock);
++ ret = list_empty(&_ast_queue);
++ up(&_ast_queue_lock);
++ return ret;
++}
++
++static void lockqueue_timer_fn(unsigned long arg)
++{
++ set_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags);
++ wake_up(&_astd_waitchan);
++}
++
++/*
++ * DLM daemon which delivers asts.
++ */
++
++static int dlm_astd(void *data)
++{
++ daemonize("dlm_astd");
++
++ INIT_LIST_HEAD(&_lockqueue);
++ init_MUTEX(&_lockqueue_lock);
++ INIT_LIST_HEAD(&_deadlockqueue);
++ init_MUTEX(&_deadlockqueue_lock);
++ INIT_LIST_HEAD(&_ast_queue);
++ init_MUTEX(&_ast_queue_lock);
++ init_waitqueue_head(&_astd_waitchan);
++ complete(&_astd_done);
++
++ /*
++ * Set a timer to check the lockqueue for dead locks (and deadlocks).
++ */
++
++ init_timer(&_lockqueue_timer);
++ _lockqueue_timer.function = lockqueue_timer_fn;
++ _lockqueue_timer.data = 0;
++ mod_timer(&_lockqueue_timer,
++ jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
++
++ while (atomic_read(&_astd_running)) {
++ wchan_cond_sleep_intr(_astd_waitchan, no_asts());
++
++ if (test_and_clear_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags))
++ process_asts();
++
++ if (test_and_clear_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags)) {
++ process_lockqueue();
++ if (dlm_config.deadlocktime)
++ process_deadlockqueue();
++ }
++ }
++
++ if (timer_pending(&_lockqueue_timer))
++ del_timer(&_lockqueue_timer);
++
++ complete(&_astd_done);
++
++ return 0;
++}
++
++void wake_astd(void)
++{
++ set_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags);
++ wake_up(&_astd_waitchan);
++}
++
++int astd_start(void)
++{
++ init_completion(&_astd_done);
++ atomic_set(&_astd_running, 1);
++ _astd_pid = kernel_thread(dlm_astd, NULL, 0);
++ wait_for_completion(&_astd_done);
++ return 0;
++}
++
++void astd_stop(void)
++{
++ atomic_set(&_astd_running, 0);
++ wake_astd();
++ wait_for_completion(&_astd_done);
++}
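+# Note (illustrative, not part of the patch): astd_start() and
+# astd_stop() bracket the AST daemon's lifetime, presumably from the
+# module init/exit code elsewhere in this patch series. A sketch of
+# the expected pairing, with queue_ast()/wake_astd() in between:
+#
+#    astd_start();       /* blocks until dlm_astd has initialised */
+#    ...
+#    queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
+#    wake_astd();        /* kick delivery once queueing is done */
+#    ...
+#    astd_stop();        /* wakes the daemon and waits for it to exit */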
+diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h
+--- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/ast.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __AST_DOT_H__
++#define __AST_DOT_H__
++
++void lockqueue_lkb_mark(gd_ls_t * ls);
++int resend_cluster_requests(gd_ls_t * ls);
++void add_to_lockqueue(gd_lkb_t * lkb);
++void remove_from_lockqueue(gd_lkb_t * lkb);
++void add_to_deadlockqueue(gd_lkb_t * lkb);
++void remove_from_deadlockqueue(gd_lkb_t * lkb);
++void remove_from_astqueue(gd_lkb_t * lkb);
++void queue_ast(gd_lkb_t * lkb, gd_ast_type_t astt, uint8_t rqmode);
++void wake_astd(void);
++int astd_start(void);
++void astd_stop(void);
++
++#endif /* __AST_DOT_H__ */
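+# Example (a minimal sketch, not part of the patch): the machinery in
+# ast.c is driven by in-kernel dlm_lock() callers. The argument order
+# below follows the dlm_lock() call made from device.c later in this
+# patch; the callback signatures follow the function-pointer casts in
+# deliver_ast(); the names and the DLM_LOCK_EX mode constant are
+# illustrative.
+#
+#    static struct dlm_lksb my_lksb;
+#
+#    static void my_ast(long param)
+#    {
+#        /* completion AST: my_lksb.sb_status holds the result */
+#    }
+#
+#    static void my_bast(long param, int mode)
+#    {
+#        /* another node wants a lock incompatible with "mode" */
+#    }
+#
+#    error = dlm_lock(lockspace, DLM_LOCK_EX, &my_lksb, 0,
+#                     "myres", 5, 0 /* no parent */,
+#                     my_ast, (long) &my_lksb,
+#                     my_bast, NULL /* no range */);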
+diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c
+--- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/config.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,135 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/module.h>
++#include <linux/proc_fs.h>
++#include <asm/uaccess.h>
++
++#include "dlm_internal.h"
++#include "lowcomms.h"
++#include "config.h"
++
++/* Config file defaults */
++#define DEFAULT_TCP_PORT 21064
++#define DEFAULT_LOCK_TIMEOUT 30
++#define DEFAULT_BUFFER_SIZE 4096
++#define DEFAULT_RESHASHTBL 256
++#define DEFAULT_LOCKIDTBL 1024
++#define DEFAULT_MAX_CONNECTIONS 128
++#define DEFAULT_DEADLOCKTIME 10
++
++struct config_info dlm_config = {
++ .tcp_port = DEFAULT_TCP_PORT,
++ .lock_timeout = DEFAULT_LOCK_TIMEOUT,
++ .buffer_size = DEFAULT_BUFFER_SIZE,
++ .reshashtbl = DEFAULT_RESHASHTBL,
++ .lockidtbl = DEFAULT_LOCKIDTBL,
++ .max_connections = DEFAULT_MAX_CONNECTIONS,
++ .deadlocktime = DEFAULT_DEADLOCKTIME,
++};
++
++
++static struct config_proc_info {
++ char *name;
++ int *value;
++} config_proc[] = {
++ {
++ .name = "tcp_port",
++ .value = &dlm_config.tcp_port,
++ },
++ {
++ .name = "lock_timeout",
++ .value = &dlm_config.lock_timeout,
++ },
++ {
++ .name = "buffer_size",
++ .value = &dlm_config.buffer_size,
++ },
++ {
++ .name = "reshashtbl",
++ .value = &dlm_config.reshashtbl,
++ },
++ {
++ .name = "lockidtbl",
++ .value = &dlm_config.lockidtbl,
++ },
++ {
++ .name = "max_connections",
++ .value = &dlm_config.max_connections,
++ },
++ {
++ .name = "deadlocktime",
++ .value = &dlm_config.deadlocktime,
++ },
++};
++static struct proc_dir_entry *dlm_dir;
++
++static int dlm_config_read_proc(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct config_proc_info *cinfo = data;
++ return snprintf(page, count, "%d\n", *cinfo->value);
++}
++
++static int dlm_config_write_proc(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ struct config_proc_info *cinfo = data;
++ char value_buf[16];
++ int value;
++ char *end;
++
++ /* "buffer" is a userspace pointer: copy it into a
++ * NUL-terminated kernel buffer before parsing */
++ if (count > sizeof(value_buf) - 1)
++ count = sizeof(value_buf) - 1;
++ if (copy_from_user(value_buf, buffer, count))
++ return -EFAULT;
++ value_buf[count] = '\0';
++
++ value = simple_strtoul(value_buf, &end, 10);
++ if (end != value_buf)
++ *cinfo->value = value;
++ return count;
++}
++
++int dlm_config_init(void)
++{
++ int i;
++ struct proc_dir_entry *pde;
++
++ dlm_dir = proc_mkdir("cluster/config/dlm", 0);
++ if (!dlm_dir)
++ return -1;
++
++ dlm_dir->owner = THIS_MODULE;
++
++ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
++ pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir);
++ if (pde) {
++ pde->data = &config_proc[i];
++ pde->write_proc = dlm_config_write_proc;
++ pde->read_proc = dlm_config_read_proc;
++ }
++ }
++ return 0;
++}
++
++void dlm_config_exit(void)
++{
++ int i;
++
++ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++)
++ remove_proc_entry(config_proc[i].name, dlm_dir);
++ remove_proc_entry("cluster/config/dlm", NULL);
++}
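+# Note: dlm_config_init() above exports every field of dlm_config
+# read/write through procfs, so each parameter appears as
+# /proc/cluster/config/dlm/<name> (tcp_port, lock_timeout, buffer_size,
+# reshashtbl, lockidtbl, max_connections, deadlocktime) and can be
+# tuned at runtime. The parent cluster/config directory is assumed to
+# exist already, created by the cluster manager.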
+diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h
+--- linux-orig/cluster/dlm/config.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/config.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,31 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __CONFIG_DOT_H__
++#define __CONFIG_DOT_H__
++
++struct config_info {
++ int tcp_port;
++ int lock_timeout;
++ int buffer_size;
++ int reshashtbl;
++ int lockidtbl;
++ int max_connections;
++ int deadlocktime;
++};
++
++extern struct config_info dlm_config;
++extern int dlm_config_init(void);
++extern void dlm_config_exit(void);
++
++#endif /* __CONFIG_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c
+--- linux-orig/cluster/dlm/device.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/device.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,1031 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * device.c
++ *
++ * This is the userland interface to the DLM.
++ *
++ * The locking is done via a misc char device (find the
++ * registered minor number in /proc/misc).
++ *
++ * User code should not use this interface directly but
++ * call the library routines in libdlm.a instead.
++ *
++ */
++
++#include <linux/miscdevice.h>
++#include <linux/init.h>
++#include <linux/wait.h>
++#include <linux/module.h>
++#include <linux/file.h>
++#include <linux/fs.h>
++#include <linux/poll.h>
++#include <linux/signal.h>
++#include <linux/spinlock.h>
++#include <asm/ioctls.h>
++
++#include "dlm_internal.h"
++#include "device.h"
++
++extern gd_lkb_t *dlm_get_lkb(gd_ls_t *, int);
++static struct file_operations _dlm_fops;
++static const char *name_prefix="dlm";
++static struct list_head user_ls_list;
++
++/* Flags in li_flags */
++#define LI_FLAG_COMPLETE 1
++#define LI_FLAG_FIRSTLOCK 2
++
++struct lock_info {
++ uint8_t li_cmd;
++ struct dlm_lksb li_lksb;
++ wait_queue_head_t li_waitq;
++ unsigned long li_flags;
++ void __user *li_astparam;
++ void __user *li_astaddr;
++ void __user *li_bastaddr;
++ struct file_info *li_file;
++ struct dlm_lksb __user *li_user_lksb;
++ struct semaphore li_firstlock;
++ struct dlm_queryinfo *li_queryinfo;
++ struct dlm_queryinfo __user *li_user_queryinfo;
++};
++
++/* A queued AST no less */
++struct ast_info {
++ struct dlm_lock_result result;
++ struct dlm_queryinfo *queryinfo;
++ struct dlm_queryinfo __user *user_queryinfo;
++ struct list_head list;
++};
++
++/* One of these per userland lockspace */
++struct user_ls {
++ void *ls_lockspace;
++ atomic_t ls_refcnt;
++ long ls_flags; /* bit 1 means LS has been deleted */
++
++ /* Passed into misc_register() */
++ struct miscdevice ls_miscinfo;
++ struct list_head ls_list;
++};
++
++/* misc_device info for the control device */
++static struct miscdevice ctl_device;
++
++/*
++ * Stuff we hang off the file struct.
++ * The first two are to cope with unlocking all the
++ * locks held by a process when it dies.
++ */
++struct file_info {
++ struct list_head fi_lkb_list; /* List of active lkbs */
++ spinlock_t fi_lkb_lock;
++ struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
++ spinlock_t fi_ast_lock;
++ wait_queue_head_t fi_wait;
++ struct user_ls *fi_ls;
++ atomic_t fi_refcnt; /* Number of users */
++ unsigned long fi_flags; /* Bit 1 means the device is open */
++};
++
++
++/* get and put ops for file_info.
++ Actually I don't really like "get" and "put", but everyone
++ else seems to use them and I can't think of anything
++ nicer at the moment */
++static void get_file_info(struct file_info *f)
++{
++ atomic_inc(&f->fi_refcnt);
++}
++
++static void put_file_info(struct file_info *f)
++{
++ if (atomic_dec_and_test(&f->fi_refcnt))
++ kfree(f);
++}
++
++/* Find a lockspace struct given the device minor number */
++static struct user_ls *find_lockspace(int minor)
++{
++ struct user_ls *lsinfo;
++
++ list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
++
++ if (lsinfo->ls_miscinfo.minor == minor)
++ return lsinfo;
++ }
++ return NULL;
++}
++
++static void add_lockspace_to_list(struct user_ls *lsinfo)
++{
++ list_add(&lsinfo->ls_list, &user_ls_list);
++}
++
++/* Register a lockspace with the DLM and create a misc
++ device for userland to access it */
++static int register_lockspace(char *name, struct user_ls **ls)
++{
++ struct user_ls *newls;
++ int status;
++ int namelen;
++
++ namelen = strlen(name)+strlen(name_prefix)+2;
++
++ newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
++ if (!newls)
++ return -ENOMEM;
++ memset(newls, 0, sizeof(struct user_ls));
++
++ newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
++ if (!newls->ls_miscinfo.name) {
++ kfree(newls);
++ return -ENOMEM;
++ }
++ snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name);
++
++ status = dlm_new_lockspace((char *)newls->ls_miscinfo.name+strlen(name_prefix)+1,
++ strlen(newls->ls_miscinfo.name) - strlen(name_prefix) - 1,
++ &newls->ls_lockspace, 0);
++
++ if (status != 0) {
++ kfree(newls->ls_miscinfo.name);
++ kfree(newls);
++ return status;
++ }
++
++ newls->ls_miscinfo.fops = &_dlm_fops;
++ newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
++
++ status = misc_register(&newls->ls_miscinfo);
++ if (status) {
++ log_print("failed to register misc device for %s", name);
++ dlm_release_lockspace(newls->ls_lockspace, 0);
++ kfree(newls->ls_miscinfo.name);
++ kfree(newls);
++ return status;
++ }
++
++
++ add_lockspace_to_list(newls);
++ *ls = newls;
++ return 0;
++}
++
++static int unregister_lockspace(struct user_ls *lsinfo, int force)
++{
++ int status;
++
++ status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
++ if (status)
++ return status;
++
++ status = misc_deregister(&lsinfo->ls_miscinfo);
++ if (status)
++ return status;
++
++ list_del(&lsinfo->ls_list);
++ kfree(lsinfo->ls_miscinfo.name);
++ kfree(lsinfo);
++
++ return 0;
++}
++
++/* Add it to userland's AST queue */
++static void add_to_astqueue(struct lock_info *li, void *astaddr)
++{
++ struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
++ if (!ast)
++ return;
++
++ ast->result.astparam = li->li_astparam;
++ ast->result.astaddr = astaddr;
++ ast->result.user_lksb = li->li_user_lksb;
++ ast->result.cmd = li->li_cmd;
++ memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
++
++ /* These two will both be NULL for anything other than queries */
++ ast->queryinfo = li->li_queryinfo;
++ ast->user_queryinfo = li->li_user_queryinfo;
++
++ spin_lock(&li->li_file->fi_ast_lock);
++ list_add_tail(&ast->list, &li->li_file->fi_ast_list);
++ spin_unlock(&li->li_file->fi_ast_lock);
++ wake_up_interruptible(&li->li_file->fi_wait);
++}
++
++static void bast_routine(void *param, int mode)
++{
++ struct lock_info *li = param;
++
++ if (param) {
++ add_to_astqueue(li, li->li_bastaddr);
++ }
++}
++
++/*
++ * This is the kernel's AST routine.
++ * All lock, unlock & query operations complete here.
++ * The only synchronous ops are those done during device close.
++ */
++static void ast_routine(void *param)
++{
++ struct lock_info *li = param;
++
++ /* Param may be NULL if a persistent lock is unlocked by someone else */
++ if (!param)
++ return;
++
++ /* If it's an async request then post data to the user's AST queue. */
++ if (li->li_astaddr) {
++
++ /* Only queue AST if the device is still open */
++ if (test_bit(1, &li->li_file->fi_flags))
++ add_to_astqueue(li, li->li_astaddr);
++
++ /* If it's a new lock operation that failed, then
++ * remove it from the owner queue and free the
++ * lock_info. The DLM will not free the LKB until this
++ * AST has completed.
++ */
++ if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
++ li->li_lksb.sb_status != 0) {
++ gd_lkb_t *lkb;
++
++ /* Wait till dlm_lock() has finished */
++ down(&li->li_firstlock);
++ lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
++ if (lkb) {
++ spin_lock(&li->li_file->fi_lkb_lock);
++ list_del(&lkb->lkb_ownerqueue);
++ spin_unlock(&li->li_file->fi_lkb_lock);
++ }
++ up(&li->li_firstlock);
++ put_file_info(li->li_file);
++ kfree(li);
++ return;
++ }
++ /* Free unlocks & queries */
++ if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
++ li->li_cmd == DLM_USER_QUERY) {
++ put_file_info(li->li_file);
++ kfree(li);
++ }
++ }
++ else {
++ /* Synchronous request, just wake up the caller */
++ set_bit(LI_FLAG_COMPLETE, &li->li_flags);
++ wake_up_interruptible(&li->li_waitq);
++ }
++}
++
++/*
++ * Wait for the lock op to complete and return the status.
++ */
++static int wait_for_ast(struct lock_info *li)
++{
++ /* Wait for the AST routine to complete */
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
++ schedule();
++
++ set_task_state(current, TASK_RUNNING);
++
++ return li->li_lksb.sb_status;
++}
++
++
++/* Open on control device */
++static int dlm_ctl_open(struct inode *inode, struct file *file)
++{
++ return 0;
++}
++
++/* Close on control device */
++static int dlm_ctl_close(struct inode *inode, struct file *file)
++{
++ return 0;
++}
++
++/* Open on lockspace device */
++static int dlm_open(struct inode *inode, struct file *file)
++{
++ struct file_info *f;
++ struct user_ls *lsinfo;
++
++ lsinfo = find_lockspace(iminor(inode));
++ if (!lsinfo)
++ return -ENOENT;
++
++ f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
++ if (!f)
++ return -ENOMEM;
++
++ atomic_inc(&lsinfo->ls_refcnt);
++ INIT_LIST_HEAD(&f->fi_lkb_list);
++ INIT_LIST_HEAD(&f->fi_ast_list);
++ spin_lock_init(&f->fi_ast_lock);
++ spin_lock_init(&f->fi_lkb_lock);
++ init_waitqueue_head(&f->fi_wait);
++ f->fi_ls = lsinfo;
++ atomic_set(&f->fi_refcnt, 1);
++ set_bit(1, &f->fi_flags);
++
++ file->private_data = f;
++
++ return 0;
++}
++
++/* Check the user's version matches ours */
++static int check_version(struct dlm_lock_params *params)
++{
++ if (params->version[0] != DLM_DEVICE_VERSION_MAJOR ||
++ (params->version[0] == DLM_DEVICE_VERSION_MAJOR &&
++ params->version[1] > DLM_DEVICE_VERSION_MINOR)) {
++
++ log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)",
++ params->version[0],
++ params->version[1],
++ params->version[2],
++ DLM_DEVICE_VERSION_MAJOR,
++ DLM_DEVICE_VERSION_MINOR,
++ DLM_DEVICE_VERSION_PATCH);
++ return -EINVAL;
++ }
++ return 0;
++}
++
++/* Close on lockspace device */
++static int dlm_close(struct inode *inode, struct file *file)
++{
++ struct file_info *f = file->private_data;
++ struct lock_info li;
++ sigset_t tmpsig;
++ sigset_t allsigs;
++ gd_lkb_t *lkb, *safe;
++ struct user_ls *lsinfo;
++ DECLARE_WAITQUEUE(wq, current);
++
++ lsinfo = find_lockspace(iminor(inode));
++ if (!lsinfo)
++ return -ENOENT;
++
++ /* Mark this closed so that ASTs will not be delivered any more */
++ clear_bit(1, &f->fi_flags);
++
++ /* Block signals while we are doing this */
++ sigfillset(&allsigs);
++ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
++
++ /* We use our own lock_info struct here, so that any
++ * outstanding "real" ASTs will be delivered with the
++ * corresponding "real" params, thus freeing the lock_info
++ * that belongs to the lock. This catches the corner case where
++ * a lock is BUSY when we try to unlock it here.
++ */
++ memset(&li, 0, sizeof(li));
++ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
++ init_waitqueue_head(&li.li_waitq);
++ add_wait_queue(&li.li_waitq, &wq);
++
++ /*
++ * Free any outstanding locks. They are on the
++ * list in LIFO order, so there should be no problem
++ * unlocking parents before children.
++ * Although we don't remove the lkbs from the list here
++ * (what would be the point?), foreach_safe is needed
++ * because the lkbs are freed during dlm_unlock operations.
++ */
++ list_for_each_entry_safe(lkb, safe, &f->fi_lkb_list, lkb_ownerqueue) {
++ int status;
++ int lock_status;
++ int flags = 0;
++ struct lock_info *old_li;
++
++ /* Make a copy of this pointer. If all goes well we will
++ * free it later. if not it will be left to the AST routine
++ * to tidy up
++ */
++ old_li = (struct lock_info *)lkb->lkb_astparam;
++
++ /* Don't unlock persistent locks */
++ if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) {
++ list_del(&lkb->lkb_ownerqueue);
++
++ /* But tidy our references in it */
++ kfree(old_li);
++ lkb->lkb_astparam = (long)NULL;
++ put_file_info(f);
++ continue;
++ }
++
++ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
++
++ /* If it's not granted then cancel the request.
++ * If the lock was WAITING then it will be dropped,
++ * if it was converting then it will be reverted to GRANTED,
++ * then we will unlock it.
++ */
++ lock_status = lkb->lkb_status;
++
++ if (lock_status != GDLM_LKSTS_GRANTED)
++ flags = DLM_LKF_CANCEL;
++
++ status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
++
++ /* Must wait for it to complete as the next lock could be its
++ * parent */
++ if (status == 0)
++ wait_for_ast(&li);
++
++ /* If it was waiting for a conversion, it will
++ now be granted so we can unlock it properly */
++ if (lock_status == GDLM_LKSTS_CONVERT) {
++
++ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
++ status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, 0, &li.li_lksb, &li);
++
++ if (status == 0)
++ wait_for_ast(&li);
++ }
++ /* Unlock succeeded, free the lock_info struct. */
++ if (status == 0) {
++ kfree(old_li);
++ put_file_info(f);
++ }
++ }
++
++ remove_wait_queue(&li.li_waitq, &wq);
++
++ /* If this is the last reference, and the lockspace has been
++ deleted, then free the struct */
++ if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) {
++ kfree(lsinfo);
++ }
++
++ /* Restore signals */
++ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
++ recalc_sigpending();
++
++ return 0;
++}
++
++/*
++ * ioctl on a lockspace device: check how many outstanding
++ * ASTs there are waiting to be read for this lockspace.
++ */
++static int dlm_ioctl(struct inode *inode, struct file *file,
++ uint command, ulong u)
++{
++ struct file_info *fi = file->private_data;
++ int status = -EINVAL;
++ int count;
++ struct list_head *tmp_list;
++
++ switch (command) {
++
++ /* Are there any ASTs for us to read?
++ * Warning, this returns the number of messages (ASTs)
++ * in the queue, NOT the number of bytes to read
++ */
++ case FIONREAD:
++ count = 0;
++ spin_lock(&fi->fi_ast_lock);
++ list_for_each(tmp_list, &fi->fi_ast_list)
++ count++;
++ spin_unlock(&fi->fi_ast_lock);
++ status = put_user(count, (int *)u);
++ break;
++
++ default:
++ return -ENOTTY;
++ }
++
++ return status;
++}
++
++/*
++ * ioctls to create/remove lockspaces.
++ */
++static int dlm_ctl_ioctl(struct inode *inode, struct file *file,
++ uint command, ulong u)
++{
++ int status = -EINVAL;
++ char ls_name[MAX_LS_NAME_LEN];
++ struct user_ls *lsinfo;
++ int force = 0;
++
++ switch (command) {
++ case DLM_CREATE_LOCKSPACE:
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++
++ if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0)
++ return -EFAULT;
++ status = register_lockspace(ls_name, &lsinfo);
++
++ /* If it succeeded then return the minor number */
++ if (status == 0)
++ status = lsinfo->ls_miscinfo.minor;
++ break;
++
++ case DLM_FORCE_RELEASE_LOCKSPACE:
++ force = 2;
++
++ case DLM_RELEASE_LOCKSPACE:
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++
++ lsinfo = find_lockspace(u);
++ if (!lsinfo)
++ return -EINVAL;
++ status = unregister_lockspace(lsinfo, force);
++ break;
++
++ default:
++ return -ENOTTY;
++ }
++
++ return status;
++}
++
++/* Deal with the messy stuff of copying a web of structs
++ from kernel space to userspace */
++static int copy_query_result(struct ast_info *ast)
++{
++ int status = -EFAULT;
++ struct dlm_queryinfo qi;
++
++ /* Get the pointers to userspace structs */
++ if (copy_from_user(&qi, ast->user_queryinfo,
++ sizeof(struct dlm_queryinfo)))
++ goto copy_out;
++
++ /* TODO: does this deref a user pointer? */
++ if (put_user(ast->queryinfo->gqi_lockcount,
++ &ast->user_queryinfo->gqi_lockcount))
++ goto copy_out;
++
++ if (qi.gqi_resinfo) {
++ if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo,
++ sizeof(struct dlm_resinfo)))
++ goto copy_out;
++ }
++
++ if (qi.gqi_lockinfo) {
++ if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo,
++ sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount))
++ goto copy_out;
++ }
++
++ status = 0;
++
++ if (ast->queryinfo->gqi_lockinfo)
++ kfree(ast->queryinfo->gqi_lockinfo);
++
++ if (ast->queryinfo->gqi_resinfo)
++ kfree(ast->queryinfo->gqi_resinfo);
++
++ kfree(ast->queryinfo);
++
++ copy_out:
++ return status;
++}
++
++/* Read call, might block if no ASTs are waiting.
++ * It will only ever return one message at a time, regardless
++ * of how many are pending.
++ */
++static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
++{
++ struct file_info *fi = file->private_data;
++ struct ast_info *ast;
++ int ret;
++ DECLARE_WAITQUEUE(wait, current);
++
++ if (count < sizeof(struct dlm_lock_result))
++ return -EINVAL;
++
++ spin_lock(&fi->fi_ast_lock);
++ if (list_empty(&fi->fi_ast_list)) {
++
++ /* No waiting ASTs: return EOF if the lockspace
++ * has been deleted (don't leak the spinlock) */
++ if (test_bit(1, &fi->fi_ls->ls_flags)) {
++ spin_unlock(&fi->fi_ast_lock);
++ return 0;
++ }
++ if (file->f_flags & O_NONBLOCK) {
++ spin_unlock(&fi->fi_ast_lock);
++ return -EAGAIN;
++ }
++
++ add_wait_queue(&fi->fi_wait, &wait);
++
++ repeat:
++ set_current_state(TASK_INTERRUPTIBLE);
++ if (list_empty(&fi->fi_ast_list) &&
++ !signal_pending(current)) {
++
++ spin_unlock(&fi->fi_ast_lock);
++ schedule();
++ spin_lock(&fi->fi_ast_lock);
++ goto repeat;
++ }
++
++ current->state = TASK_RUNNING;
++ remove_wait_queue(&fi->fi_wait, &wait);
++
++ if (signal_pending(current)) {
++ spin_unlock(&fi->fi_ast_lock);
++ return -ERESTARTSYS;
++ }
++ }
++
++ ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
++ list_del(&ast->list);
++ spin_unlock(&fi->fi_ast_lock);
++
++ ret = sizeof(struct dlm_lock_result);
++ if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
++ ret = -EFAULT;
++
++ /* If it was a query then copy the result block back here */
++ if (ast->queryinfo) {
++ int status = copy_query_result(ast);
++ if (status)
++ ret = status;
++ }
++
++ kfree(ast);
++ return ret;
++}
++
++static unsigned int dlm_poll(struct file *file, poll_table *wait)
++{
++ struct file_info *fi = file->private_data;
++
++ poll_wait(file, &fi->fi_wait, wait);
++
++ spin_lock(&fi->fi_ast_lock);
++ if (!list_empty(&fi->fi_ast_list)) {
++ spin_unlock(&fi->fi_ast_lock);
++ return POLLIN | POLLRDNORM;
++ }
++
++ spin_unlock(&fi->fi_ast_lock);
++ return 0;
++}
++
++static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams)
++{
++ struct lock_info *li;
++ int status;
++
++ li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
++ if (!li)
++ return -ENOMEM;
++
++ get_file_info(fi);
++ li->li_user_lksb = kparams->lksb;
++ li->li_astparam = kparams->astparam;
++ li->li_bastaddr = kparams->bastaddr;
++ li->li_astaddr = kparams->astaddr;
++ li->li_file = fi;
++ li->li_flags = 0;
++ li->li_cmd = kparams->cmd;
++ clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
++
++ if (copy_from_user(&li->li_lksb, kparams->lksb,
++ sizeof(struct dlm_lksb))) {
++ put_file_info(fi);
++ kfree(li);
++ return -EFAULT;
++ }
++ li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr;
++
++ /* Allocate query structs */
++ status = -ENOMEM;
++ li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL);
++ if (!li->li_queryinfo)
++ goto out1;
++
++ /* Mainly to get the gqi_lockinfo buffer size (gqi_locksize) */
++ if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr,
++ sizeof(struct dlm_queryinfo))) {
++ status = -EFAULT;
++ goto out1;
++ }
++
++ /* Overwrite userspace pointers we just copied with kernel space ones */
++ if (li->li_queryinfo->gqi_resinfo) {
++ li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL);
++ if (!li->li_queryinfo->gqi_resinfo)
++ goto out1;
++ }
++ if (li->li_queryinfo->gqi_lockinfo) {
++ li->li_queryinfo->gqi_lockinfo =
++ kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize,
++ GFP_KERNEL);
++ if (!li->li_queryinfo->gqi_lockinfo)
++ goto out2;
++ }
++
++ li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo;
++
++ return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb,
++ kparams->flags, /* query */
++ li->li_queryinfo,
++ ast_routine, li);
++
++ out2:
++ if (li->li_queryinfo->gqi_resinfo)
++ kfree(li->li_queryinfo->gqi_resinfo);
++
++ out1:
++ if (li->li_queryinfo)
++ kfree(li->li_queryinfo);
++ put_file_info(fi);
++ kfree(li);
++ return status;
++}
++
++static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams,
++ const char *buffer)
++{
++ struct lock_info *li;
++ int status;
++ char name[DLM_RESNAME_MAXLEN];
++
++ /*
++ * Validate things that we need to have correct.
++ */
++ if (kparams->namelen > DLM_RESNAME_MAXLEN)
++ return -EINVAL;
++
++ if (!kparams->astaddr)
++ return -EINVAL;
++
++ if (!kparams->lksb)
++ return -EINVAL;
++
++ /* Get the lock name */
++ if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name),
++ kparams->namelen)) {
++ return -EFAULT;
++ }
++
++ /* For conversions, the lock will already have a lock_info
++ block squirrelled away in astparam */
++ if (kparams->flags & DLM_LKF_CONVERT) {
++ gd_lkb_t *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
++ if (!lkb) {
++ return -EINVAL;
++ }
++ li = (struct lock_info *)lkb->lkb_astparam;
++
++ /* Only override these if they are provided */
++ if (li->li_user_lksb)
++ li->li_user_lksb = kparams->lksb;
++ if (li->li_astparam)
++ li->li_astparam = kparams->astparam;
++ if (li->li_bastaddr)
++ li->li_bastaddr = kparams->bastaddr;
++ if (li->li_astaddr)
++ li->li_astaddr = kparams->astaddr;
++ li->li_flags = 0;
++ }
++ else {
++ li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
++ if (!li)
++ return -ENOMEM;
++
++ li->li_user_lksb = kparams->lksb;
++ li->li_astparam = kparams->astparam;
++ li->li_bastaddr = kparams->bastaddr;
++ li->li_astaddr = kparams->astaddr;
++ li->li_file = fi;
++ li->li_flags = 0;
++ li->li_cmd = kparams->cmd;
++ li->li_queryinfo = NULL;
++
++ /* semaphore to allow us to complete our work before
++ the AST routine runs. In fact we only need (and use) this
++ when the initial lock fails */
++ init_MUTEX_LOCKED(&li->li_firstlock);
++ set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
++
++ get_file_info(fi);
++ }
++
++ /* Copy the user's LKSB into kernel space,
++ needed for conversions & value block operations */
++ if (kparams->lksb && copy_from_user(&li->li_lksb, kparams->lksb,
++ sizeof(struct dlm_lksb))) {
++ /* don't leak a newly allocated li (and its fi ref) */
++ if (!(kparams->flags & DLM_LKF_CONVERT)) {
++ put_file_info(fi);
++ kfree(li);
++ }
++ return -EFAULT;
++ }
++
++ /* Lock it ... */
++ status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb,
++ kparams->flags, name, kparams->namelen,
++ kparams->parent,
++ ast_routine,
++ li,
++ li->li_bastaddr ? bast_routine : NULL,
++ kparams->range.ra_end ? &kparams->range : NULL);
++
++ /* If it succeeded (this far) with a new lock then keep track of
++ it on the file's lkb list */
++ if (!status && !(kparams->flags & DLM_LKF_CONVERT)) {
++ gd_lkb_t *lkb;
++ lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
++
++ if (lkb) {
++ spin_lock(&fi->fi_lkb_lock);
++ list_add(&lkb->lkb_ownerqueue,
++ &fi->fi_lkb_list);
++ spin_unlock(&fi->fi_lkb_lock);
++ }
++ else {
++ log_print("failed to get lkb for new lock");
++ }
++ up(&li->li_firstlock);
++ }
++
++ return status;
++}
++
++static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams)
++{
++ struct lock_info *li;
++ gd_lkb_t *lkb;
++ int status;
++
++ lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
++ if (!lkb) {
++ return -EINVAL;
++ }
++
++ li = (struct lock_info *)lkb->lkb_astparam;
++
++ li->li_user_lksb = kparams->lksb;
++ li->li_astparam = kparams->astparam;
++ li->li_cmd = kparams->cmd;
++
++ /* Have to do it here because the lkb may not exist after
++ * dlm_unlock() */
++ spin_lock(&fi->fi_lkb_lock);
++ list_del(&lkb->lkb_ownerqueue);
++ spin_unlock(&fi->fi_lkb_lock);
++
++ /* Use existing lksb & astparams */
++ status = dlm_unlock(fi->fi_ls->ls_lockspace,
++ kparams->lkid,
++ kparams->flags, NULL, NULL);
++
++ return status;
++}
++
++/* Write call, submit a locking request */
++static ssize_t dlm_write(struct file *file, const char __user *buffer,
++ size_t count, loff_t *ppos)
++{
++ struct file_info *fi = file->private_data;
++ struct dlm_lock_params kparams;
++ sigset_t tmpsig;
++ sigset_t allsigs;
++ int status;
++
++ if (count < sizeof(kparams))
++ return -EINVAL;
++
++ /* Has the lockspace been deleted */
++ if (test_bit(1, &fi->fi_ls->ls_flags))
++ return -ENOENT;
++
++ /* Get the command info */
++ if (copy_from_user(&kparams, buffer, sizeof(kparams)))
++ return -EFAULT;
++
++ if (check_version(&kparams))
++ return -EINVAL;
++
++ /* Block signals while we are doing this */
++ sigfillset(&allsigs);
++ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
++
++ switch (kparams.cmd)
++ {
++ case DLM_USER_LOCK:
++ status = do_user_lock(fi, &kparams, buffer);
++ break;
++
++ case DLM_USER_UNLOCK:
++ status = do_user_unlock(fi, &kparams);
++ break;
++
++ case DLM_USER_QUERY:
++ status = do_user_query(fi, &kparams);
++ break;
++
++ default:
++ status = -EINVAL;
++ break;
++ }
++ /* Restore signals */
++ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
++ recalc_sigpending();
++
++ if (status == 0)
++ return count;
++ else
++ return status;
++}
++
++void dlm_device_free_devices(void)
++{
++ struct user_ls *tmp;
++ struct user_ls *lsinfo;
++
++ list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) {
++ misc_deregister(&lsinfo->ls_miscinfo);
++
++ /* Tidy up, but don't delete the lsinfo struct until
++ all the users have closed their devices */
++ list_del(&lsinfo->ls_list);
++ kfree(lsinfo->ls_miscinfo.name);
++ set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
++ }
++}
++
++static struct file_operations _dlm_fops = {
++ .open = dlm_open,
++ .release = dlm_close,
++ .ioctl = dlm_ioctl,
++ .read = dlm_read,
++ .write = dlm_write,
++ .poll = dlm_poll,
++ .owner = THIS_MODULE,
++};
++
++static struct file_operations _dlm_ctl_fops = {
++ .open = dlm_ctl_open,
++ .release = dlm_ctl_close,
++ .ioctl = dlm_ctl_ioctl,
++ .owner = THIS_MODULE,
++};
++
++/*
++ * Create control device
++ */
++int dlm_device_init(void)
++{
++ int r;
++
++ INIT_LIST_HEAD(&user_ls_list);
++
++ ctl_device.name = "dlm-control";
++ ctl_device.fops = &_dlm_ctl_fops;
++ ctl_device.minor = MISC_DYNAMIC_MINOR;
++
++ r = misc_register(&ctl_device);
++ if (r) {
++ log_print("misc_register failed for DLM control device");
++ return r;
++ }
++
++ return 0;
++}
++
++void dlm_device_exit(void)
++{
++ misc_deregister(&ctl_device);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
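+# Example (a minimal sketch, not part of the patch): the userland half
+# of the device protocol above, as libdlm would drive it. The request
+# and result structures, version macros and command codes come from the
+# userland copy of the dlm_device.h header this file relies on; ls_fd
+# is a descriptor for a per-lockspace misc device whose minor number
+# was returned by the DLM_CREATE_LOCKSPACE ioctl on dlm-control.
+# Error handling is omitted.
+#
+#    struct dlm_lock_params p;
+#    struct dlm_lock_result r;
+#
+#    memset(&p, 0, sizeof(p));
+#    p.version[0] = DLM_DEVICE_VERSION_MAJOR;
+#    p.version[1] = DLM_DEVICE_VERSION_MINOR;
+#    p.version[2] = DLM_DEVICE_VERSION_PATCH;
+#    p.cmd = DLM_USER_LOCK;
+#    /* fill in mode, flags, name, namelen, lksb, astaddr ... */
+#
+#    write(ls_fd, &p, sizeof(p));    /* submit the lock request */
+#    read(ls_fd, &r, sizeof(r));     /* blocks until an AST is queued */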
+diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h
+--- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/device.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DEVICE_DOT_H__
++#define __DEVICE_DOT_H__
++
++extern void dlm_device_free_devices(void);
++
++#endif /* __DEVICE_DOT_H__ */
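+# Worked example (illustrative) for the directory-node selection scheme
+# described at the top of dir.c below: with num_nodes = 5, the nearest
+# power of two above is 8, so ls_nodes_mask = 7. For a resource name
+# hashing to 0x12345678, (0x12345678 >> 16) & 7 = 0x1234 & 7 = 4, and
+# 4 % 5 = 4, so the fifth nodeid in the sorted node list is the
+# directory node for that resource.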
+diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c
+--- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/dir.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,430 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++#include "nodes.h"
++#include "lockspace.h"
++#include "lowcomms.h"
++#include "reccomms.h"
++#include "rsb.h"
++#include "config.h"
++#include "memory.h"
++#include "recover.h"
++#include "util.h"
++
++/*
++ * We use the upper 16 bits of the hash value to select the directory node.
++ * Low bits are used for distribution of rsb's among hash buckets on each node.
++ *
++ * From the hash value, we are interested in arriving at a final value between
++ * zero and the number of nodes minus one (num_nodes - 1).
++ *
++ * To accomplish this scaling, we take the nearest power of two larger than
++ * num_nodes and subtract one to create a bit mask. The mask is applied to the
++ * hash, reducing the range to nearer the final range.
++ *
++ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
++ * num_nodes to the previously masked hash value.
++ *
++ * This value in the desired range is used as an offset into the sorted list of
++ * nodeid's to give the particular nodeid of the directory node.
++ */
++
++uint32_t name_to_directory_nodeid(gd_ls_t *ls, char *name, int length)
++{
++ struct list_head *tmp;
++ gd_csb_t *csb = NULL;
++ uint32_t hash, node, n = 0, nodeid;
++
++ if (ls->ls_num_nodes == 1) {
++ nodeid = our_nodeid();
++ goto out;
++ }
++
++ hash = gdlm_hash(name, length);
++ node = (hash >> 16) & ls->ls_nodes_mask;
++ node %= ls->ls_num_nodes;
++
++ list_for_each(tmp, &ls->ls_nodes) {
++ if (n++ != node)
++ continue;
++ csb = list_entry(tmp, gd_csb_t, csb_list);
++ break;
++ }
++
++ GDLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u mask=%x\n",
++ ls->ls_num_nodes, n, node, ls->ls_nodes_mask););
++ nodeid = csb->csb_node->gn_nodeid;
++
++ out:
++ return nodeid;
++}
++
++uint32_t get_directory_nodeid(gd_res_t *rsb)
++{
++ return name_to_directory_nodeid(rsb->res_ls, rsb->res_name,
++ rsb->res_length);
++}
++
++static inline uint32_t rd_hash(gd_ls_t *ls, char *name, int len)
++{
++ uint32_t val;
++
++ val = gdlm_hash(name, len);
++ val &= RESDIRHASH_MASK;
++
++ return val;
++}
++
++static void add_resdata_to_hash(gd_ls_t *ls, gd_resdata_t *rd)
++{
++ gd_resdir_bucket_t *bucket;
++ uint32_t hashval;
++
++ hashval = rd_hash(ls, rd->rd_name, rd->rd_length);
++ bucket = &ls->ls_resdir_hash[hashval];
++
++ list_add_tail(&rd->rd_list, &bucket->rb_reslist);
++}
++
++static gd_resdata_t *search_rdbucket(gd_ls_t *ls, char *name, int namelen,
++ uint32_t bucket)
++{
++ struct list_head *head;
++ gd_resdata_t *rd;
++
++ head = &ls->ls_resdir_hash[bucket].rb_reslist;
++ list_for_each_entry(rd, head, rd_list) {
++ if (rd->rd_length == namelen &&
++ !memcmp(name, rd->rd_name, namelen))
++ goto out;
++ }
++ rd = NULL;
++ out:
++ return rd;
++}
++
++void remove_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
++ uint8_t sequence)
++{
++ gd_resdata_t *rd;
++ uint32_t bucket;
++
++ bucket = rd_hash(ls, name, namelen);
++
++ write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
++
++ rd = search_rdbucket(ls, name, namelen, bucket);
++
++ if (!rd) {
++ log_debug(ls, "remove_resdata not found nodeid=%u", nodeid);
++ goto out;
++ }
++
++ if (rd->rd_master_nodeid != nodeid) {
++ log_debug(ls, "remove_resdata wrong nodeid=%u", nodeid);
++ goto out;
++ }
++
++ if (rd->rd_sequence == sequence) {
++ list_del(&rd->rd_list);
++ free_resdata(rd);
++ } else {
++ /*
++ log_debug(ls, "remove_resdata mismatch nodeid=%u rd=%u in=%u",
++ nodeid, rd->rd_sequence, sequence);
++ */
++ }
++
++ out:
++ write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
++}
++
++void resdir_clear(gd_ls_t *ls)
++{
++ struct list_head *head;
++ gd_resdata_t *rd;
++ int i;
++
++ for (i = 0; i < RESDIRHASH_SIZE; i++) {
++ head = &ls->ls_resdir_hash[i].rb_reslist;
++ while (!list_empty(head)) {
++ rd = list_entry(head->next, gd_resdata_t, rd_list);
++ list_del(&rd->rd_list);
++ free_resdata(rd);
++ }
++ }
++}
++
++static void gdlm_resmov_in(gd_resmov_t *rm, char *buf)
++{
++ gd_resmov_t tmp;
++
++ memcpy(&tmp, buf, sizeof(gd_resmov_t));
++
++ rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid);
++ rm->rm_length = be16_to_cpu(tmp.rm_length);
++}
++
++int resdir_rebuild_local(gd_ls_t *ls)
++{
++ gd_csb_t *csb;
++ gd_resdata_t *rd;
++ gd_rcom_t *rc;
++ gd_resmov_t mov, last_mov;
++ char *b, *last_name;
++ int error = -ENOMEM, count = 0;
++
++ log_all(ls, "rebuild resource directory");
++
++ resdir_clear(ls);
++
++ rc = allocate_rcom_buffer(ls);
++ if (!rc)
++ goto out;
++
++ last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
++ if (!last_name)
++ goto free_rc;
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ last_mov.rm_length = 0;
++ for (;;) {
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto free_last;
++
++ memcpy(rc->rc_buf, last_name, last_mov.rm_length);
++ rc->rc_datalen = last_mov.rm_length;
++
++ error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
++ RECCOMM_RECOVERNAMES, rc, 1);
++ if (error)
++ goto free_last;
++
++ schedule();
++
++ /*
++ * pick each res out of buffer
++ */
++
++ b = rc->rc_buf;
++
++ for (;;) {
++ gdlm_resmov_in(&mov, b);
++ b += sizeof(gd_resmov_t);
++
++ /* Length of 0 with a non-zero nodeid marks the
++ * end of the list */
++ if (!mov.rm_length && mov.rm_nodeid)
++ goto done;
++
++ /* This is just the end of the block */
++ if (!mov.rm_length)
++ break;
++
++ error = -ENOMEM;
++ rd = allocate_resdata(ls, mov.rm_length);
++ if (!rd)
++ goto free_last;
++
++ rd->rd_master_nodeid = mov.rm_nodeid;
++ rd->rd_length = mov.rm_length;
++ rd->rd_sequence = 1;
++
++ memcpy(rd->rd_name, b, mov.rm_length);
++ b += mov.rm_length;
++
++ add_resdata_to_hash(ls, rd);
++ count++;
++
++ last_mov = mov;
++ memset(last_name, 0, DLM_RESNAME_MAXLEN);
++ memcpy(last_name, rd->rd_name, rd->rd_length);
++ }
++ }
++ done:
++ ;
++ }
++
++ set_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
++ error = 0;
++
++ log_all(ls, "rebuilt %d resources", count);
++
++ free_last:
++ kfree(last_name);
++
++ free_rc:
++ free_rcom_buffer(rc);
++
++ out:
++ return error;
++}
++
++/*
++ * The reply end of resdir_rebuild_local/RECOVERNAMES. Collect and send as
++ * many resource names as can fit in the buffer.
++ */
++
++int resdir_rebuild_send(gd_ls_t *ls, char *inbuf, int inlen, char *outbuf,
++ int outlen, uint32_t nodeid)
++{
++ struct list_head *list;
++ gd_res_t *start_rsb = NULL, *rsb;
++ int offset = 0, start_namelen, error;
++ char *start_name;
++ gd_resmov_t tmp;
++ uint32_t dir_nodeid;
++
++ /*
++ * Find the rsb where we left off (or start again)
++ */
++
++ start_namelen = inlen;
++ start_name = inbuf;
++
++ if (start_namelen > 1) {
++ error = find_or_create_rsb(ls, NULL, start_name,
++ start_namelen, 0, &start_rsb);
++ GDLM_ASSERT(!error && start_rsb, printk("error %d\n", error););
++ release_rsb(start_rsb);
++ }
++
++ /*
++ * Send rsb names for rsb's we're master of and whose directory node
++ * matches the requesting node.
++ */
++
++ down_read(&ls->ls_rec_rsblist);
++ if (start_rsb)
++ list = start_rsb->res_rootlist.next;
++ else
++ list = ls->ls_rootres.next;
++
++ for (offset = 0; list != &ls->ls_rootres; list = list->next) {
++ rsb = list_entry(list, gd_res_t, res_rootlist);
++ if (rsb->res_nodeid)
++ continue;
++
++ dir_nodeid = get_directory_nodeid(rsb);
++ if (dir_nodeid != nodeid)
++ continue;
++
++ if (offset + sizeof(gd_resmov_t)*2 + rsb->res_length > outlen) {
++ /* Write end-of-block record */
++ memset(&tmp, 0, sizeof(gd_resmov_t));
++ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
++ offset += sizeof(gd_resmov_t);
++ goto out;
++ }
++
++ memset(&tmp, 0, sizeof(gd_resmov_t));
++ tmp.rm_nodeid = cpu_to_be32(our_nodeid());
++ tmp.rm_length = cpu_to_be16(rsb->res_length);
++
++ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
++ offset += sizeof(gd_resmov_t);
++
++ memcpy(outbuf + offset, rsb->res_name, rsb->res_length);
++ offset += rsb->res_length;
++ }
++
++ /*
++ * If we've reached the end of the list (and there's room) write a
++ * terminating record.
++ */
++
++ if ((list == &ls->ls_rootres) &&
++ (offset + sizeof(gd_resmov_t) <= outlen)) {
++
++ memset(&tmp, 0, sizeof(gd_resmov_t));
++ /* This only needs to be non-zero */
++ tmp.rm_nodeid = cpu_to_be32(1);
++ /* and this must be zero */
++ tmp.rm_length = 0;
++ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
++ offset += sizeof(gd_resmov_t);
++ }
++
++ out:
++ up_read(&ls->ls_rec_rsblist);
++ return offset;
++}
++
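++/*
++ * Look up the directory entry for a resource name, creating it if it
++ * doesn't exist. The bucket is searched under the read lock first; on a
++ * miss a new entry is allocated and the bucket re-searched under the
++ * write lock, so a duplicate added by a racing thread is detected and
++ * our allocation freed in favour of the existing entry.
++ */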
++int get_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
++ gd_resdata_t **rdp, int recovery)
++{
++ gd_resdata_t *rd;
++ gd_resdata_t *tmp;
++ uint32_t bucket;
++
++ bucket = rd_hash(ls, name, namelen);
++
++ read_lock(&ls->ls_resdir_hash[bucket].rb_lock);
++ rd = search_rdbucket(ls, name, namelen, bucket);
++ read_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
++
++ if (rd)
++ goto out;
++
++ rd = allocate_resdata(ls, namelen);
++ if (!rd)
++ return -ENOMEM;
++
++ rd->rd_master_nodeid = nodeid;
++ rd->rd_length = namelen;
++ memcpy(rd->rd_name, name, namelen);
++
++ write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
++ tmp = search_rdbucket(ls, name, namelen, bucket);
++ if (!tmp)
++ list_add_tail(&rd->rd_list,
++ &ls->ls_resdir_hash[bucket].rb_reslist);
++ write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
++
++ if (tmp) {
++ free_resdata(rd);
++ rd = tmp;
++ }
++
++ out:
++ *rdp = rd;
++
++ if (!recovery) {
++ if (++rd->rd_sequence == 0)
++ rd->rd_sequence++;
++ } else
++ rd->rd_sequence = 1;
++
++ return 0;
++}
++
++/*
++ * The node with lowest id queries all nodes to determine when all are done.
++ * All other nodes query the low nodeid for this.
++ */
++
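++/*
++ * e.g. in a lockspace with nodes {1,2,5}: node 1 waits for 2 and 5 to
++ * report RESDIR_VALID and then sets LSFL_ALL_RESDIR_VALID; nodes 2 and 5
++ * wait for node 1 to report RESDIR_ALL_VALID.
++ */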
++int resdir_rebuild_wait(gd_ls_t *ls)
++{
++ int error;
++
++ if (ls->ls_low_nodeid == our_nodeid()) {
++ error = gdlm_wait_status_all(ls, RESDIR_VALID);
++ if (!error)
++ set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
++ } else
++ error = gdlm_wait_status_low(ls, RESDIR_ALL_VALID);
++
++ return error;
++}
+diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h
+--- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/dir.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,30 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DIR_DOT_H__
++#define __DIR_DOT_H__
++
++uint32_t name_to_directory_nodeid(gd_ls_t * ls, char *name, int length);
++uint32_t get_directory_nodeid(gd_res_t * rsb);
++void remove_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
++ uint8_t sequence);
++int resdir_rebuild_local(gd_ls_t * ls);
++int resdir_rebuild_send(gd_ls_t * ls, char *inbuf, int inlen, char *outbuf,
++ int outlen, uint32_t nodeid);
++int get_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
++ gd_resdata_t ** rdp, int recovery);
++int resdir_rebuild_wait(gd_ls_t * ls);
++void resdir_clear(gd_ls_t * ls);
++void resdir_dump(gd_ls_t * ls);
++
++#endif /* __DIR_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h
+--- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/dlm_internal.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,634 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DLM_INTERNAL_DOT_H__
++#define __DLM_INTERNAL_DOT_H__
++
++/*
++ * This is the main header file to be included in each DLM source file.
++ */
++
++#define DLM_RELEASE_NAME "<CVS>"
++
++#include <linux/slab.h>
++#include <linux/sched.h>
++#include <asm/semaphore.h>
++#include <linux/types.h>
++#include <linux/spinlock.h>
++#include <linux/vmalloc.h>
++#include <asm/uaccess.h>
++#include <linux/list.h>
++#include <linux/errno.h>
++#include <linux/random.h>
++
++#include <cluster/dlm.h>
++#include <cluster/dlm_device.h>
++#include <cluster/service.h>
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#if (BITS_PER_LONG == 64)
++#define PRIu64 "lu"
++#define PRId64 "ld"
++#define PRIo64 "lo"
++#define PRIx64 "lx"
++#define PRIX64 "lX"
++#define SCNu64 "lu"
++#define SCNd64 "ld"
++#define SCNo64 "lo"
++#define SCNx64 "lx"
++#define SCNX64 "lX"
++#else
++#define PRIu64 "Lu"
++#define PRId64 "Ld"
++#define PRIo64 "Lo"
++#define PRIx64 "Lx"
++#define PRIX64 "LX"
++#define SCNu64 "Lu"
++#define SCNd64 "Ld"
++#define SCNo64 "Lo"
++#define SCNx64 "Lx"
++#define SCNX64 "LX"
++#endif
++
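++/*
++ * Sleep interruptibly on `chan' if sleep_cond evaluates true. The task
++ * state is set to TASK_INTERRUPTIBLE before the condition is tested, so
++ * a wake_up() racing with the test is not lost. Note this schedules at
++ * most once, so callers typically re-test their condition in a loop.
++ */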
++#define wchan_cond_sleep_intr(chan, sleep_cond) \
++do \
++{ \
++ DECLARE_WAITQUEUE(__wait_chan, current); \
++ current->state = TASK_INTERRUPTIBLE; \
++ add_wait_queue(&chan, &__wait_chan); \
++ if ((sleep_cond)) \
++ schedule(); \
++ remove_wait_queue(&chan, &__wait_chan); \
++ current->state = TASK_RUNNING; \
++} \
++while (0)
++
++static inline int check_timeout(unsigned long stamp, unsigned int seconds)
++{
++ return time_after(jiffies, stamp + seconds * HZ);
++}
++
++
++#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args)
++
++#define log_all(ls, fmt, args...) \
++ do { \
++ printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
++ dlm_debug_log(ls, fmt, ##args); \
++ } while (0)
++
++#define log_error log_all
++
++
++#define DLM_DEBUG
++#if defined(DLM_DEBUG)
++#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args)
++#else
++#define log_debug(ls, fmt, args...)
++#endif
++
++#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL)
++#undef log_debug
++#define log_debug log_all
++#endif
++
++
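++/*
++ * Assert macro: `do' is a statement block executed before BUG() when the
++ * assertion fails, typically used to print diagnostic state, e.g.
++ *
++ *   GDLM_ASSERT(!error, printk("error %d\n", error););
++ *
++ * Pass an empty second argument when there is nothing extra to print.
++ */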
++#define GDLM_ASSERT(x, do) \
++{ \
++ if (!(x)) \
++ { \
++ dlm_debug_dump(); \
++ printk("\nDLM: Assertion failed on line %d of file %s\n" \
++ "DLM: assertion: \"%s\"\n" \
++ "DLM: time = %lu\n", \
++ __LINE__, __FILE__, #x, jiffies); \
++ {do} \
++ printk("\n"); \
++ BUG(); \
++ panic("DLM: Record message above and reboot.\n"); \
++ } \
++}
++
++
++struct gd_ls;
++struct gd_lkb;
++struct gd_res;
++struct gd_csb;
++struct gd_node;
++struct gd_resmov;
++struct gd_resdata;
++struct gd_recover;
++struct gd_recinfo;
++struct gd_resdir_bucket;
++struct gd_remlockreply;
++struct gd_remlockrequest;
++struct gd_rcom;
++
++typedef struct gd_ls gd_ls_t;
++typedef struct gd_lkb gd_lkb_t;
++typedef struct gd_res gd_res_t;
++typedef struct gd_csb gd_csb_t;
++typedef struct gd_node gd_node_t;
++typedef struct gd_resmov gd_resmov_t;
++typedef struct gd_resdata gd_resdata_t;
++typedef struct gd_recover gd_recover_t;
++typedef struct gd_resdir_bucket gd_resdir_bucket_t;
++typedef struct gd_rcom gd_rcom_t;
++
++/*
++ * Resource Data - an entry for a resource in the resdir hash table
++ */
++
++struct gd_resdata {
++ struct list_head rd_list;
++ uint32_t rd_master_nodeid;
++ uint16_t rd_length;
++ uint8_t rd_sequence;
++ char rd_name[1]; /* <rd_length> bytes */
++};
++
++/*
++ * Resource Directory Bucket - a hash bucket of resdata entries in the resdir
++ * hash table
++ */
++
++struct gd_resdir_bucket {
++ struct list_head rb_reslist;
++ rwlock_t rb_lock;
++};
++
++/*
++ * A resource description as moved between nodes
++ */
++
++struct gd_resmov {
++ uint32_t rm_nodeid;
++ uint16_t rm_length;
++ uint16_t rm_pad;
++};
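++/*
++ * rm_nodeid and rm_length are sent big-endian on the wire (see the
++ * cpu_to_be32/cpu_to_be16 calls in resdir_rebuild_send) and unpacked
++ * again by gdlm_resmov_in().
++ */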
++
++/*
++ * An entry in the lock ID table. Locks for this bucket are kept on list.
++ * Counter is used to assign an id to locks as they are added to this bucket.
++ */
++
++struct gd_lockidtbl_entry {
++ struct list_head list;
++ uint16_t counter;
++};
++
++/* Elements in the range array */
++
++#define GR_RANGE_START 0
++#define GR_RANGE_END 1
++#define RQ_RANGE_START 2
++#define RQ_RANGE_END 3
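++/*
++ * lkb_range is a 4-element uint64_t array indexed by the constants
++ * above: the GR_* slots hold the currently granted range, the RQ_* slots
++ * the requested range (see lkb_set_range() and grant_lock()).
++ */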
++
++/*
++ * Lockspace structure. The context for GDLM locks.
++ */
++
++#define RESHASHTBL_SIZE (256)
++
++#define RESDIRHASH_SHIFT (9)
++#define RESDIRHASH_SIZE (1 << RESDIRHASH_SHIFT)
++#define RESDIRHASH_MASK (RESDIRHASH_SIZE - 1)
++
++#define LSFL_WORK (0)
++#define LSFL_LS_RUN (1)
++#define LSFL_LS_STOP (2)
++#define LSFL_LS_START (3)
++#define LSFL_LS_FINISH (4)
++#define LSFL_RECCOMM_WAIT (5)
++#define LSFL_RECCOMM_READY (6)
++#define LSFL_NOTIMERS (7)
++#define LSFL_FINISH_RECOVERY (8)
++#define LSFL_RESDIR_VALID (9)
++#define LSFL_ALL_RESDIR_VALID (10)
++#define LSFL_NODES_VALID (11)
++#define LSFL_ALL_NODES_VALID (12)
++#define LSFL_REQUEST_WARN (13)
++
++#define LSST_NONE (0)
++#define LSST_INIT (1)
++#define LSST_INIT_DONE (2)
++#define LSST_CLEAR (3)
++#define LSST_WAIT_START (4)
++#define LSST_RECONFIG_DONE (5)
++
++struct gd_ls {
++ struct list_head ls_list; /* list of lockspaces */
++ uint32_t ls_local_id; /* local unique lockspace ID */
++ uint32_t ls_global_id; /* global unique lockspace ID */
++ int ls_allocation; /* Memory allocation policy */
++ unsigned long ls_flags; /* LSFL_ */
++
++ struct list_head ls_rootres; /* List of root resources */
++
++ int ls_hashsize;
++ int ls_hashmask;
++ struct list_head *ls_reshashtbl; /* Hash table for resources */
++ rwlock_t ls_reshash_lock; /* Lock for hash table */
++
++ struct gd_lockidtbl_entry *ls_lockidtbl;
++ uint32_t ls_lockidtbl_size; /* Size of lock id table */
++ rwlock_t ls_lockidtbl_lock;
++
++ struct list_head ls_nodes; /* current nodes in RC */
++ uint32_t ls_num_nodes; /* number of nodes in RC */
++ uint32_t ls_nodes_mask;
++ uint32_t ls_low_nodeid;
++
++ int ls_state; /* state changes for recovery */
++ struct list_head ls_recover; /* gr_recover_t structs */
++ int ls_last_stop; /* event ids from sm */
++ int ls_last_start;
++ int ls_last_finish;
++ spinlock_t ls_recover_lock;
++ struct list_head ls_nodes_gone; /* dead node list for recovery */
++
++ wait_queue_head_t ls_wait_general;
++
++ gd_rcom_t *ls_rcom;
++ uint32_t ls_rcom_msgid;
++ struct semaphore ls_rcom_lock;
++
++ struct list_head ls_recover_list;
++ int ls_recover_list_count;
++ spinlock_t ls_recover_list_lock;
++
++ struct rw_semaphore ls_in_recovery; /* held in write during
++ * recovery, read for normal
++ * locking ops */
++ struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a
++ * parent lock racing with a
++ * new child lock */
++
++ struct rw_semaphore ls_rec_rsblist; /* To prevent incoming recovery
++ * operations happening while
++ * we are purging */
++
++ struct rw_semaphore ls_gap_rsblist; /* To protect rootres list
++ * in grant_after_purge() which
++ * runs outside recovery */
++
++ struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees
++ * we are deserialising
++ */
++
++ struct list_head ls_deadlockq; /* List of locks in conversion ordered
++ * by duetime. for deadlock detection */
++
++ struct list_head ls_requestqueue; /* List of incoming requests
++ * held while we are in
++ * recovery */
++
++ gd_resdir_bucket_t ls_resdir_hash[RESDIRHASH_SIZE];
++
++ int ls_namelen;
++ char ls_name[1]; /* <namelen> bytes */
++};
++
++/*
++ * Cluster node (per node in cluster)
++ */
++
++struct gd_node {
++ struct list_head gn_list; /* global list of cluster nodes */
++ uint32_t gn_nodeid; /* cluster unique nodeid (cman) */
++ uint32_t gn_ipaddr; /* node's first IP address (cman) */
++ int gn_refcount; /* number of csb's referencing */
++};
++
++/*
++ * Cluster System Block (per node in a ls)
++ */
++
++struct gd_csb {
++ struct list_head csb_list; /* per-lockspace list of nodes */
++ gd_node_t *csb_node; /* global node structure */
++ int csb_gone_event; /* event id when node was removed */
++
++ uint32_t csb_names_send_count;
++ uint32_t csb_names_send_msgid;
++ uint32_t csb_names_recv_count;
++ uint32_t csb_names_recv_msgid;
++ uint32_t csb_locks_send_count;
++ uint32_t csb_locks_send_msgid;
++ uint32_t csb_locks_recv_count;
++ uint32_t csb_locks_recv_msgid;
++};
++
++/*
++ * Resource block
++ */
++
++/* status */
++
++#define GDLM_RESSTS_DIRENTRY 1 /* This is a directory entry */
++#define GDLM_RESSTS_LVBINVALID 2 /* The LVB is invalid */
++
++#define RESFL_NEW_MASTER (0)
++#define RESFL_RECOVER_LIST (1)
++
++struct gd_res {
++ struct list_head res_hashchain; /* Chain of resources in this hash
++ * bucket */
++
++ gd_ls_t *res_ls; /* The owning lockspace */
++
++ struct list_head res_rootlist; /* List of root resources in lockspace */
++
++ struct list_head res_subreslist; /* List of all sub-resources
++ * for this root res. */
++ /* This is a list head on the root res and holds the whole tree below
++ * it. */
++ uint8_t res_depth; /* Depth in resource tree */
++ uint16_t res_status;
++ unsigned long res_flags; /* Flags, RESFL_ */
++
++ struct list_head res_grantqueue;
++ struct list_head res_convertqueue;
++ struct list_head res_waitqueue;
++
++ uint32_t res_nodeid; /* nodeid of master node */
++
++ gd_res_t *res_root; /* If a subresource, this is our root */
++ gd_res_t *res_parent; /* Our parent resource (if any) */
++
++ atomic_t res_ref; /* No of lkb's */
++ uint16_t res_remasterid; /* ID used during remaster */
++ struct list_head res_recover_list; /* General list for use during
++ * recovery */
++ int res_recover_msgid;
++ int res_newlkid_expect;
++
++ struct rw_semaphore res_lock;
++
++ char *res_lvbptr; /* Lock value block */
++
++ uint8_t res_resdir_seq; /* Last directory sequence number */
++
++ uint8_t res_length;
++ char res_name[1]; /* <res_length> bytes */
++};
++
++/*
++ * Lock block. To avoid confusion, where flags mirror the
++ * public flags, they should have the same value.
++ */
++
++#define GDLM_LKSTS_NEW (0)
++#define GDLM_LKSTS_WAITING (1)
++#define GDLM_LKSTS_GRANTED (2)
++#define GDLM_LKSTS_CONVERT (3)
++
++#define GDLM_LKFLG_VALBLK (0x00000008)
++#define GDLM_LKFLG_PERSISTENT (0x00000080) /* Don't unlock when process exits */
++#define GDLM_LKFLG_NODLCKWT (0x00000100) /* Don't do deadlock detection */
++#define GDLM_LKFLG_EXPEDITE (0x00000400) /* Move to head of convert queue */
++
++/* Internal flags */
++#define GDLM_LKFLG_RANGE (0x00001000) /* Range field is present (remote protocol only) */
++#define GDLM_LKFLG_MSTCPY (0x00002000)
++#define GDLM_LKFLG_DELETED (0x00004000) /* LKB is being deleted */
++#define GDLM_LKFLG_DELAST (0x00008000) /* Delete after delivering AST */
++#define GDLM_LKFLG_LQRESEND (0x00010000) /* LKB on lockqueue must be resent */
++#define GDLM_LKFLG_DEMOTED (0x00020000)
++#define GDLM_LKFLG_RESENT (0x00040000)
++#define GDLM_LKFLG_NOREBUILD (0x00080000)
++#define GDLM_LKFLG_LQCONVERT (0x00100000)
++
++struct gd_lkb {
++ void *lkb_astaddr;
++ void *lkb_bastaddr;
++ long lkb_astparam;
++
++ uint32_t lkb_flags;
++ uint16_t lkb_status; /* LKSTS_ granted, waiting, converting */
++ int8_t lkb_rqmode; /* Requested lock mode */
++ int8_t lkb_grmode; /* Granted lock mode */
++ uint8_t lkb_bastmode; /* Requested mode returned in bast */
++ uint8_t lkb_highbast; /* Highest mode we have sent a BAST for */
++ uint32_t lkb_retstatus; /* Status to return in lksb */
++
++ uint32_t lkb_id; /* Our lock ID */
++ struct dlm_lksb *lkb_lksb; /* Lock status block of caller */
++ struct list_head lkb_idtbl_list; /* list pointer into the
++ * lockidtbl */
++
++ struct list_head lkb_statequeue; /* List of locks in this state */
++
++ struct list_head lkb_ownerqueue; /* List of locks owned by a
++ * process */
++
++ gd_lkb_t *lkb_parent; /* Pointer to parent if any */
++
++ atomic_t lkb_childcnt; /* Number of children */
++
++ struct list_head lkb_lockqueue; /* For when we are on the lock queue */
++ int lkb_lockqueue_state;
++ int lkb_lockqueue_flags; /* As passed into lock/unlock */
++ unsigned long lkb_lockqueue_time; /* Time we went on the lock
++ * queue */
++
++ gd_res_t *lkb_resource;
++
++ unsigned long lkb_duetime; /* For deadlock detection */
++
++ uint32_t lkb_remid; /* Remote partner */
++ uint32_t lkb_nodeid;
++
++ struct list_head lkb_astqueue; /* For when we are on the AST queue */
++ uint32_t lkb_asts_to_deliver;
++
++ struct gd_remlockrequest *lkb_request;
++
++ struct list_head lkb_deadlockq; /* on ls_deadlockq list */
++
++ char *lkb_lvbptr; /* Points to lksb on a local lock, allocated
++ * LVB (if necessary) on a remote lock */
++ uint64_t *lkb_range; /* Points to an array of 64 bit numbers that
++ * represent the requested and granted ranges
++ * of the lock. NULL implies 0-ffffffffffffffff
++ */
++};
++
++/*
++ * Used to save and manage recovery state for a lockspace.
++ */
++
++struct gd_recover {
++ struct list_head gr_list;
++ uint32_t *gr_nodeids;
++ int gr_node_count;
++ int gr_event_id;
++};
++
++/*
++ * Header part of the mid-level comms system. All packets start with
++ * this header so we can identify them. The comms packet can
++ * contain many of these structs but they are split into individual
++ * work units before being passed to the lockqueue routines.
++ * The structs that this is a header for follow below.
++ */
++
++struct gd_req_header {
++ uint8_t rh_cmd; /* What we are */
++ uint8_t rh_flags; /* maybe just a pad */
++ uint16_t rh_length; /* Length of struct (so we can send several in
++ * one message) */
++ uint32_t rh_lkid; /* Lock ID tag: ie the local (requesting) lock
++ * ID */
++ uint32_t rh_lockspace; /* Lockspace ID */
++};
++
++/*
++ * This is the struct used in a remote lock/unlock/convert request
++ * The mid-level comms API should turn this into native byte order.
++ * Most "normal" lock operations will use these two structs for
++ * communications. Recovery operations use their own structs
++ * but still with the gd_req_header on the front.
++ */
++
++struct gd_remlockrequest {
++ struct gd_req_header rr_header;
++
++ uint32_t rr_remlkid; /* Remote lock ID */
++ uint32_t rr_remparid; /* Parent's remote lock ID or 0 */
++ uint32_t rr_flags; /* Flags from lock/convert request */
++ uint64_t rr_range_start;/* Yes, these are in the right place... */
++ uint64_t rr_range_end;
++ uint32_t rr_status; /* Status to return if this is an AST request */
++ uint8_t rr_rqmode; /* Requested lock mode */
++ uint8_t rr_asts; /* Whether the LKB has ASTs or not */
++ uint8_t rr_resdir_seq; /* Directory sequence number */
++ char rr_lvb[DLM_LVB_LEN]; /* Value block */
++ char rr_name[1]; /* As long as needs be. Only used for directory
++ * lookups. The length of this can be worked
++ * out from the packet length */
++};
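++/* For example, remote_stage2() in locking.c recovers the name length as
++ * rh_length - sizeof(struct gd_remlockrequest) + 1, the +1 accounting
++ * for the rr_name[1] placeholder byte. */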
++
++/*
++ * This is the struct returned by a remote lock/unlock/convert request
++ * The mid-level comms API should turn this into native byte order.
++ */
++
++struct gd_remlockreply {
++ struct gd_req_header rl_header;
++
++ uint32_t rl_lockstate; /* Whether request was queued/granted/waiting */
++ uint32_t rl_nodeid; /* nodeid of lock master */
++ uint32_t rl_status; /* Status to return to caller */
++ uint32_t rl_lkid; /* Remote lkid */
++ uint8_t rl_resdir_seq; /* Returned directory sequence number */
++ char rl_lvb[DLM_LVB_LEN]; /* LVB itself */
++};
++
++/*
++ * Recovery comms message
++ */
++
++struct gd_rcom {
++ struct gd_req_header rc_header; /* 32 byte aligned */
++ uint32_t rc_msgid;
++ uint16_t rc_datalen;
++ uint8_t rc_expanded;
++ uint8_t rc_subcmd; /* secondary command */
++ char rc_buf[1]; /* first byte of data goes here and extends
++ * beyond here for another datalen - 1 bytes.
++ * rh_length is set to sizeof(gd_rcom_t) +
++ * datalen - 1 */
++};
++
++
++/* A remote query: GDLM_REMCMD_QUERY */
++struct gd_remquery {
++ struct gd_req_header rq_header;
++
++ uint32_t rq_mstlkid; /* LockID on master node */
++ uint32_t rq_query; /* query from the user */
++ uint32_t rq_maxlocks; /* max number of locks we can cope with */
++};
++
++/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */
++/* There may be subsequent blocks of lock info in GDLM_REMCMD_QUERYCONT
++ * messages which just have a normal header. The last of these will have
++ * rh_flags set to GDLM_REMFLAG_ENDQUERY.
++ */
++struct gd_remqueryreply {
++ struct gd_req_header rq_header;
++
++ uint32_t rq_numlocks; /* Number of locks in reply */
++ uint32_t rq_startlock; /* Which lock this block starts at (for multiple block replies) */
++ uint32_t rq_status;
++
++ /* Resource information */
++ uint32_t rq_grantcount; /* No. of nodes on grant queue */
++ uint32_t rq_convcount; /* No. of nodes on convert queue */
++ uint32_t rq_waitcount; /* No. of nodes on wait queue */
++ char rq_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable */
++};
++
++/*
++ * Lockqueue wait lock states
++ */
++
++#define GDLM_LQSTATE_WAIT_RSB 1
++#define GDLM_LQSTATE_WAIT_CONVERT 2
++#define GDLM_LQSTATE_WAIT_CONDGRANT 3
++#define GDLM_LQSTATE_WAIT_UNLOCK 4
++
++/* Commands sent across the comms link */
++#define GDLM_REMCMD_LOOKUP 1
++#define GDLM_REMCMD_LOCKREQUEST 2
++#define GDLM_REMCMD_UNLOCKREQUEST 3
++#define GDLM_REMCMD_CONVREQUEST 4
++#define GDLM_REMCMD_LOCKREPLY 5
++#define GDLM_REMCMD_LOCKGRANT 6
++#define GDLM_REMCMD_SENDBAST 7
++#define GDLM_REMCMD_SENDCAST 8
++#define GDLM_REMCMD_REM_RESDATA 9
++#define GDLM_REMCMD_RECOVERMESSAGE 20
++#define GDLM_REMCMD_RECOVERREPLY 21
++#define GDLM_REMCMD_QUERY 30
++#define GDLM_REMCMD_QUERYREPLY 31
++
++/* Set in rh_flags when this is the last block of
++ query information. Note this could also be the first
++ block */
++#define GDLM_REMFLAG_ENDQUERY 1
++
++/*
++ * This is both a parameter to queue_ast and the bitmap of ASTs in
++ * lkb_asts_to_deliver.
++ */
++
++typedef enum { GDLM_QUEUE_COMPAST = 1, GDLM_QUEUE_BLKAST = 2 } gd_ast_type_t;
++
++#ifndef BUG_ON
++#define BUG_ON(x)
++#endif
++
++void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...);
++void dlm_debug_dump(void);
++
++#endif /* __DLM_INTERNAL_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c
+--- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lkb.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,225 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * lkb.c
++ *
++ * Allocate and free locks on the lock ID table.
++ *
++ * This is slightly naff but I don't really like the
++ * VMS lockidtbl stuff as it uses a realloced array
++ * to hold the locks in. I think this is slightly better
++ * in some ways.
++ *
++ * Any better suggestions gratefully received. Patrick
++ *
++ */
++
++#include "dlm_internal.h"
++#include "lockqueue.h"
++#include "lkb.h"
++#include "config.h"
++#include "rsb.h"
++#include "memory.h"
++#include "lockspace.h"
++#include "util.h"
++
++/*
++ * Internal find lock by ID. Must be called with the lockidtbl spinlock held.
++ */
++
++static gd_lkb_t *__find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
++{
++ uint16_t entry = lkid & 0xFFFF;
++ gd_lkb_t *lkb;
++
++ if (entry >= ls->ls_lockidtbl_size)
++ goto out;
++
++ list_for_each_entry(lkb, &ls->ls_lockidtbl[entry].list, lkb_idtbl_list){
++ if (lkb->lkb_id == lkid)
++ return lkb;
++ }
++
++ out:
++ return NULL;
++}
++
++/*
++ * Should be called at lockspace initialisation time.
++ */
++
++int init_lockidtbl(gd_ls_t *ls, int entries)
++{
++ int i;
++
++ /* Make sure it's a power of two */
++ GDLM_ASSERT(!(entries & (entries - 1)),);
++
++ ls->ls_lockidtbl_size = entries;
++ rwlock_init(&ls->ls_lockidtbl_lock);
++
++ ls->ls_lockidtbl = kmalloc(entries * sizeof(struct gd_lockidtbl_entry),
++ GFP_KERNEL);
++ if (!ls->ls_lockidtbl)
++ return -ENOMEM;
++
++ for (i = 0; i < entries; i++) {
++ INIT_LIST_HEAD(&ls->ls_lockidtbl[i].list);
++ ls->ls_lockidtbl[i].counter = 1;
++ }
++
++ return 0;
++}
++
++/*
++ * Free up the space - returns an error if there are still locks hanging around
++ */
++
++int free_lockidtbl(gd_ls_t *ls)
++{
++ int i;
++
++ write_lock(&ls->ls_lockidtbl_lock);
++
++ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
++ if (!list_empty(&ls->ls_lockidtbl[i].list)) {
++ write_unlock(&ls->ls_lockidtbl_lock);
++ return -1;
++ }
++ }
++ kfree(ls->ls_lockidtbl);
++
++ write_unlock(&ls->ls_lockidtbl_lock);
++
++ return 0;
++}
++
++/*
++ * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a
++ * random number between 0 and lockidtbl_size-1. This random number specifies
++ * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially
++ * assigned per-bucket id.
++ *
++ * Because the 16 bit id's per bucket can roll over, a new lkid must be checked
++ * against the lkid of all lkb's in the bucket to avoid duplication.
++ *
++ */
++
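++/*
++ * Worked example (illustrative numbers): with a lockidtbl of 1024
++ * entries, a random bucket of 0x01a3 and a bucket counter of 7 give
++ * lkid = 0x01a3 | (7 << 16) = 0x000701a3. __find_lock_by_id() recovers
++ * the bucket as 0x000701a3 & 0xFFFF = 0x01a3.
++ */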
++gd_lkb_t *create_lkb(gd_ls_t *ls)
++{
++ gd_lkb_t *lkb;
++ uint32_t lkid;
++ uint16_t bucket;
++
++ lkb = allocate_lkb(ls);
++ if (!lkb)
++ goto out;
++
++ write_lock(&ls->ls_lockidtbl_lock);
++ do {
++ get_random_bytes(&bucket, sizeof(bucket));
++ bucket &= (ls->ls_lockidtbl_size - 1);
++ lkid = bucket | (ls->ls_lockidtbl[bucket].counter++ << 16);
++ }
++ while (__find_lock_by_id(ls, lkid));
++
++ lkb->lkb_id = (uint32_t) lkid;
++ list_add(&lkb->lkb_idtbl_list, &ls->ls_lockidtbl[bucket].list);
++ write_unlock(&ls->ls_lockidtbl_lock);
++
++ out:
++ return lkb;
++}
++
++/*
++ * Free LKB and remove it from the lockidtbl.
++ * NB - this always frees the lkb whereas release_rsb doesn't free an
++ * rsb unless its reference count is zero.
++ */
++
++void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb)
++{
++ if (lkb->lkb_status) {
++ log_error(ls, "release lkb with status %u", lkb->lkb_status);
++ print_lkb(lkb);
++ return;
++ }
++
++ if (lkb->lkb_parent)
++ atomic_dec(&lkb->lkb_parent->lkb_childcnt);
++
++ write_lock(&ls->ls_lockidtbl_lock);
++ list_del(&lkb->lkb_idtbl_list);
++ write_unlock(&ls->ls_lockidtbl_lock);
++
++ /* if this is not a master copy then lvbptr points into the user's
++ * lksb, so don't free it */
++ if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
++ free_lvb(lkb->lkb_lvbptr);
++
++ if (lkb->lkb_range)
++ free_range(lkb->lkb_range);
++
++ free_lkb(lkb);
++}
++
++gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
++{
++ gd_lkb_t *lkb;
++
++ read_lock(&ls->ls_lockidtbl_lock);
++ lkb = __find_lock_by_id(ls, lkid);
++ read_unlock(&ls->ls_lockidtbl_lock);
++
++ return lkb;
++}
++
++gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid)
++{
++ gd_ls_t *lspace = find_lockspace_by_local_id(ls);
++ return find_lock_by_id(lspace, lkid);
++}
++
++/*
++ * Initialise the range parts of an LKB.
++ */
++
++int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end)
++{
++ int ret = -ENOMEM;
++
++ /*
++ * if this wasn't already a range lock, make it one
++ */
++ if (!lkb->lkb_range) {
++ lkb->lkb_range = allocate_range(lspace);
++ if (!lkb->lkb_range)
++ goto out;
++
++ /*
++ * This is needed for conversions that contain ranges where the
++ * original lock didn't but it's harmless for new locks too.
++ */
++ lkb->lkb_range[GR_RANGE_START] = 0LL;
++ lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
++ }
++
++ lkb->lkb_range[RQ_RANGE_START] = start;
++ lkb->lkb_range[RQ_RANGE_END] = end;
++
++ ret = 0;
++
++ out:
++ return ret;
++}
+diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h
+--- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lkb.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,27 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LKB_DOT_H__
++#define __LKB_DOT_H__
++
++int free_lockidtbl(gd_ls_t * lspace);
++int init_lockidtbl(gd_ls_t * lspace, int entries);
++
++gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid);
++gd_lkb_t *create_lkb(gd_ls_t *ls);
++void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb);
++gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid);
++int verify_lkb_nodeids(gd_ls_t *ls);
++int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end);
++
++#endif /* __LKB_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c
+--- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/locking.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,1225 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * locking.c
++ *
++ * This is where the main work of the DLM goes on
++ *
++ */
++
++#include "dlm_internal.h"
++#include "lockqueue.h"
++#include "locking.h"
++#include "lockspace.h"
++#include "lkb.h"
++#include "nodes.h"
++#include "dir.h"
++#include "ast.h"
++#include "memory.h"
++#include "rsb.h"
++
++#define MAX(a, b) (((a) > (b)) ? (a) : (b))
++
++/*
++ * Lock compatibility matrix - thanks Steve
++ * UN = Unlocked state. Not really a state, used as a flag
++ * PD = Padding. Used to make the matrix a nice power of two in size
++ * Other states are the same as the VMS DLM.
++ * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
++ */
++
++#define modes_compat(gr, rq) \
++ __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
++
++const int __dlm_compat_matrix[8][8] = {
++ /* UN NL CR CW PR PW EX PD */
++ {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
++ {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
++ {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
++ {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
++ {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
++ {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
++ {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
++ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
++};
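++/*
++ * Reading the table: a granted PR lock (row PR) is compatible with NL,
++ * CR and PR requests but not with CW, PW or EX, so e.g.
++ * __dlm_compat_matrix[DLM_LOCK_PR + 1][DLM_LOCK_PW + 1] == 0 and a PW
++ * request must wait for the PR holder.
++ */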
++
++/*
++ * Compatibility matrix for conversions with QUECVT set.
++ * Granted mode is the row; requested mode is the column.
++ * Usage: matrix[grmode+1][rqmode+1]
++ */
++
++const int __quecvt_compat_matrix[8][8] = {
++ /* UN NL CR CW PR PW EX PD */
++ {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
++ {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
++ {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
++ {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
++ {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
++ {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
++ {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
++ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
++};
++
++/*
++ * This defines the direction of transfer of LVB data.
++ * Granted mode is the row; requested mode is the column.
++ * Usage: matrix[grmode+1][rqmode+1]
++ * 1 = LVB is returned to the caller
++ * 0 = LVB is written to the resource
++ * -1 = nothing happens to the LVB
++ */
++
++const int __lvb_operations[8][8] = {
++ /* UN NL CR CW PR PW EX PD*/
++ { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
++ { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
++ { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
++ { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
++ { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
++ { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
++ { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
++ { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
++};
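++/*
++ * For example: granting EX to a lock that held NL gives
++ * __lvb_operations[DLM_LOCK_NL + 1][DLM_LOCK_EX + 1] == 1, so the
++ * resource's LVB is copied out to the caller; down-converting EX to NL
++ * gives __lvb_operations[DLM_LOCK_EX + 1][DLM_LOCK_NL + 1] == 0, so the
++ * caller's LVB is written to the resource. Note grant_lock() below only
++ * tests the value for non-zero, so -1 currently behaves like 1 there.
++ */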
++
++static void grant_lock(gd_lkb_t * lkb, int send_remote);
++static void send_blocking_asts(gd_res_t * rsb, gd_lkb_t * lkb);
++static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb);
++static int convert_lock(gd_ls_t * ls, int mode, struct dlm_lksb *lksb,
++ int flags, void *ast, void *astarg, void *bast,
++ struct dlm_range *range);
++static int dlm_lock_stage1(gd_ls_t * lspace, gd_lkb_t * lkb, int flags,
++ char *name, int namelen);
++
++
++static inline int first_in_list(gd_lkb_t *lkb, struct list_head *head)
++{
++ gd_lkb_t *first = list_entry(head->next, gd_lkb_t, lkb_statequeue);
++
++ if (lkb->lkb_id == first->lkb_id)
++ return 1;
++
++ return 0;
++}
++
++/*
++ * Return 1 if the locks' ranges overlap
++ * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
++ */
++
++static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
++{
++ if (!lkb1->lkb_range || !lkb2->lkb_range)
++ return 1;
++
++ if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
++ lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
++ return 0;
++
++ return 1;
++}
++
++/*
++ * Resolve conversion deadlock by changing to NL the granted mode of deadlocked
++ * locks on the convert queue. One of the deadlocked locks is allowed to
++ * retain its original granted state (we choose the lkb provided although it
++ * shouldn't matter which.) We do not change the granted mode on locks without
++ * the CONVDEADLK flag. If any of these exist (there shouldn't if the app uses
++ * the flag consistently) the false return value is used.
++ */
++
++static int conversion_deadlock_resolve(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ gd_lkb_t *this;
++ int rv = TRUE;
++
++ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
++ if (this == lkb)
++ continue;
++
++ if (!ranges_overlap(lkb, this))
++ continue;
++
++ if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) {
++
++ if (!(this->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK)){
++ rv = FALSE;
++ continue;
++ }
++ this->lkb_grmode = DLM_LOCK_NL;
++ this->lkb_flags |= GDLM_LKFLG_DEMOTED;
++ }
++ }
++ return rv;
++}
++
++/*
++ * "A conversion deadlock arises with a pair of lock requests in the converting
++ * queue for one resource. The granted mode of each lock blocks the requested
++ * mode of the other lock."
++ */
++
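++/*
++ * The classic case: two locks both granted PR, both queued to convert
++ * to EX. Each one's granted PR blocks the other's requested EX, so
++ * neither conversion can complete without intervention.
++ */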
++static int conversion_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ gd_lkb_t *this;
++
++ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
++ if (this == lkb)
++ continue;
++
++ if (!ranges_overlap(lkb, this))
++ continue;
++
++ if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
++ return TRUE;
++ }
++ return FALSE;
++}
++
++/*
++ * Check if the given lkb conflicts with another lkb on the queue.
++ */
++
++static int queue_conflict(struct list_head *head, gd_lkb_t *lkb)
++{
++ gd_lkb_t *this;
++
++ list_for_each_entry(this, head, lkb_statequeue) {
++ if (this == lkb)
++ continue;
++ if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
++ return TRUE;
++ }
++ return FALSE;
++}
++
++/*
++ * Deadlock can arise when using the QUECVT flag if the requested mode of the
++ * first converting lock is incompatible with the granted mode of another
++ * converting lock further down the queue. To prevent this deadlock, a
++ * requested QUECVT lock is granted immediately if adding it to the end of
++ * the queue would prevent a lock ahead of it from being granted.
++ */
++
++static int queuecvt_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ gd_lkb_t *this;
++
++ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
++ if (this == lkb)
++ break;
++
++ if (ranges_overlap(lkb, this) && !modes_compat(lkb, this))
++ return TRUE;
++ }
++ return FALSE;
++}
++
++/*
++ * Return 1 if the lock can be granted, 0 otherwise.
++ * Also detect and resolve conversion deadlocks.
++ */
++
++static int can_be_granted(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ if (lkb->lkb_rqmode == DLM_LOCK_NL)
++ return TRUE;
++
++ if (lkb->lkb_rqmode == lkb->lkb_grmode)
++ return TRUE;
++
++ if (queue_conflict(&rsb->res_grantqueue, lkb))
++ return FALSE;
++
++ if (!queue_conflict(&rsb->res_convertqueue, lkb)) {
++ if (!(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
++ return TRUE;
++
++ if (list_empty(&rsb->res_convertqueue) ||
++ first_in_list(lkb, &rsb->res_convertqueue) ||
++ queuecvt_deadlock_detect(rsb, lkb))
++ return TRUE;
++ else
++ return FALSE;
++ }
++
++ /* there *is* a conflict between this lkb and a converting lock so
++ we return false unless conversion deadlock resolution is permitted
++ (only conversion requests will have the CONVDEADLK flag set) */
++
++ if (!(lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK))
++ return FALSE;
++
++ if (!conversion_deadlock_detect(rsb, lkb))
++ return FALSE;
++
++ if (conversion_deadlock_resolve(rsb, lkb))
++ return TRUE;
++
++ return FALSE;
++}
++
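++/*
++ * Example of a minimal caller (illustrative only - "astcb" and the
++ * argument values are hypothetical):
++ *
++ *   static struct dlm_lksb lksb;
++ *   error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0, "myres", 5, 0,
++ *                    astcb, &lksb, NULL, NULL);
++ *
++ * On success the request completes asynchronously: astcb() runs with
++ * the final status in lksb.sb_status and the lock id in lksb.sb_lkid.
++ */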
++int dlm_lock(void *lockspace,
++ uint32_t mode,
++ struct dlm_lksb *lksb,
++ uint32_t flags,
++ void *name,
++ unsigned int namelen,
++ uint32_t parent,
++ void (*ast) (void *astarg),
++ void *astarg,
++ void (*bast) (void *astarg, int mode),
++ struct dlm_range *range)
++{
++ gd_ls_t *lspace;
++ gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
++ int ret = -EINVAL;
++
++ lspace = find_lockspace_by_local_id(lockspace);
++ if (!lspace)
++ goto out;
++
++ if (mode < 0 || mode > DLM_LOCK_EX)
++ goto out;
++
++ if (namelen > DLM_RESNAME_MAXLEN)
++ goto out;
++
++ if (flags & DLM_LKF_CANCEL)
++ goto out;
++
++ if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
++ goto out;
++
++ if (flags & DLM_LKF_EXPEDITE && !(flags & DLM_LKF_CONVERT))
++ goto out;
++
++ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
++ goto out;
++
++ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
++ goto out;
++
++ if (!ast || !lksb)
++ goto out;
++
++	if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
++		goto out;
++
++ /*
++ * Take conversion path.
++ */
++
++ if (flags & DLM_LKF_CONVERT) {
++ ret = convert_lock(lspace, mode, lksb, flags, ast, astarg,
++ bast, range);
++ goto out;
++ }
++
++ /*
++ * Take new lock path.
++ */
++
++ if (parent) {
++ down_read(&lspace->ls_unlock_sem);
++
++ parent_lkb = find_lock_by_id(lspace, parent);
++
++ if (!parent_lkb ||
++ parent_lkb->lkb_flags & GDLM_LKFLG_DELETED ||
++ parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY ||
++ parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) {
++ up_read(&lspace->ls_unlock_sem);
++ goto out;
++ }
++
++ atomic_inc(&parent_lkb->lkb_childcnt);
++ up_read(&lspace->ls_unlock_sem);
++ }
++
++ down_read(&lspace->ls_in_recovery);
++
++ ret = -ENOMEM;
++
++ lkb = create_lkb(lspace);
++ if (!lkb)
++ goto fail_dec;
++ lkb->lkb_astaddr = ast;
++ lkb->lkb_astparam = (long) astarg;
++ lkb->lkb_bastaddr = bast;
++ lkb->lkb_rqmode = mode;
++ lkb->lkb_grmode = DLM_LOCK_IV;
++ lkb->lkb_lksb = lksb;
++ lkb->lkb_parent = parent_lkb;
++ lkb->lkb_lockqueue_flags = flags;
++ lkb->lkb_lvbptr = lksb->sb_lvbptr;
++
++ /* Copy the range if appropriate */
++ if (range) {
++ if (range->ra_start > range->ra_end) {
++ ret = -EINVAL;
++ goto fail_free;
++ }
++
++ if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end))
++ goto fail_free;
++ }
++
++ /* Convert relevant flags to internal numbers */
++ if (flags & DLM_LKF_VALBLK)
++ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
++ if (flags & DLM_LKF_PERSISTENT)
++ lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT;
++ if (flags & DLM_LKF_NODLCKWT)
++ lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
++
++ lksb->sb_lkid = lkb->lkb_id;
++
++ ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen);
++ if (ret)
++ goto fail_free;
++
++ up_read(&lspace->ls_in_recovery);
++
++ wake_astd();
++
++ return 0;
++
++ fail_free:
++ release_lkb(lspace, lkb);
++ goto fail_unlock;
++
++ fail_dec:
++ if (parent_lkb)
++ atomic_dec(&parent_lkb->lkb_childcnt);
++
++ fail_unlock:
++ up_read(&lspace->ls_in_recovery);
++
++ out:
++ return ret;
++}
++
++int dlm_lock_stage1(gd_ls_t *ls, gd_lkb_t *lkb, int flags, char *name,
++ int namelen)
++{
++	gd_res_t *rsb = NULL, *parent_rsb = NULL;
++ gd_lkb_t *parent_lkb = lkb->lkb_parent;
++ gd_resdata_t *rd;
++ uint32_t nodeid;
++ int error;
++
++ if (parent_lkb)
++ parent_rsb = parent_lkb->lkb_resource;
++
++ error = find_or_create_rsb(ls, parent_rsb, name, namelen, 1, &rsb);
++ if (error)
++ goto out;
++
++ lkb->lkb_resource = rsb;
++ lkb->lkb_nodeid = rsb->res_nodeid;
++
++ /*
++ * Next stage, do we need to find the master or can
++ * we get on with the real locking work ?
++ */
++
++ if (rsb->res_nodeid == -1) {
++ if (get_directory_nodeid(rsb) != our_nodeid()) {
++ error = remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
++ goto out;
++ }
++
++ error = get_resdata(ls, our_nodeid(), rsb->res_name,
++ rsb->res_length, &rd, 0);
++ if (error)
++ goto out;
++
++ nodeid = rd->rd_master_nodeid;
++ if (nodeid == our_nodeid())
++ nodeid = 0;
++ rsb->res_nodeid = nodeid;
++ lkb->lkb_nodeid = nodeid;
++ rsb->res_resdir_seq = rd->rd_sequence;
++ }
++
++ error = dlm_lock_stage2(ls, lkb, rsb, flags);
++
++ out:
++	if (error && rsb)
++		release_rsb(rsb);
++
++ return error;
++}
++
++/*
++ * Locking routine called after we have an RSB, either a copy of a remote one
++ * or a local one, or perhaps a shiny new one all of our very own
++ */
++
++int dlm_lock_stage2(gd_ls_t *ls, gd_lkb_t *lkb, gd_res_t *rsb, int flags)
++{
++ int error = 0;
++
++ if (rsb->res_nodeid) {
++ res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
++ error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT);
++ } else {
++ dlm_lock_stage3(lkb);
++ }
++
++ return error;
++}
++
++/*
++ * Called on an RSB's master node to do stage2 locking for a remote lock
++ * request. Returns a proper lkb with rsb ready for lock processing.
++ * This is analogous to sections of dlm_lock() and dlm_lock_stage1().
++ */
++
++gd_lkb_t *remote_stage2(int remote_nodeid, gd_ls_t *ls,
++ struct gd_remlockrequest *freq)
++{
++ gd_res_t *rsb = NULL, *parent_rsb = NULL;
++ gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
++ int error, namelen;
++
++ if (freq->rr_remparid) {
++ parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
++ if (!parent_lkb)
++ goto fail;
++
++ atomic_inc(&parent_lkb->lkb_childcnt);
++ parent_rsb = parent_lkb->lkb_resource;
++ }
++
++ /*
++ * A new MSTCPY lkb. Initialize lkb fields including the real lkid and
++ * node actually holding the (non-MSTCPY) lkb. AST addresses are just
++ * flags in the master copy.
++ */
++
++ lkb = create_lkb(ls);
++ if (!lkb)
++ goto fail_dec;
++ lkb->lkb_grmode = DLM_LOCK_IV;
++ lkb->lkb_rqmode = freq->rr_rqmode;
++ lkb->lkb_parent = parent_lkb;
++ lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & GDLM_QUEUE_COMPAST);
++ lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & GDLM_QUEUE_BLKAST);
++ lkb->lkb_nodeid = remote_nodeid;
++ lkb->lkb_remid = freq->rr_header.rh_lkid;
++ lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
++ lkb->lkb_lockqueue_flags = freq->rr_flags;
++
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
++ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
++ allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
++ if (!lkb->lkb_lvbptr)
++ goto fail_free;
++ }
++
++ if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
++ error = lkb_set_range(ls, lkb, freq->rr_range_start,
++ freq->rr_range_end);
++ if (error)
++ goto fail_free;
++ }
++
++ /*
++ * Get the RSB which this lock is for. Create a new RSB if this is a
++ * new lock on a new resource. We must be the master of any new rsb.
++ */
++
++ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
++
++ error = find_or_create_rsb(ls, parent_rsb, freq->rr_name, namelen, 1,
++ &rsb);
++ if (error)
++ goto fail_free;
++
++ lkb->lkb_resource = rsb;
++ if (rsb->res_nodeid == -1)
++ rsb->res_nodeid = 0;
++ if (freq->rr_resdir_seq)
++ rsb->res_resdir_seq = freq->rr_resdir_seq;
++
++ return lkb;
++
++
++ fail_free:
++ /* release_lkb handles parent */
++ release_lkb(ls, lkb);
++ parent_lkb = NULL;
++
++ fail_dec:
++ if (parent_lkb)
++ atomic_dec(&parent_lkb->lkb_childcnt);
++ fail:
++ return NULL;
++}
++
++/*
++ * The final bit of lock request processing on the master node. Here the lock
++ * is granted and the completion ast is queued, or the lock is put on the
++ * waitqueue and blocking asts are sent.
++ */
++
++void dlm_lock_stage3(gd_lkb_t *lkb)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++
++ /*
++ * This is a locally mastered lock on a resource that already exists,
++ * see if it can be granted or if it must wait. When this function is
++ * called for a remote lock request (process_cluster_request,
++ * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
++ * requesting node at the end of process_cluster_request, not at the
++ * end of grant_lock.
++ */
++
++ down_write(&rsb->res_lock);
++
++ if (can_be_granted(rsb, lkb)) {
++ grant_lock(lkb, 0);
++ goto out;
++ }
++
++ /*
++ * This request is not a conversion, so the lkb didn't exist other than
++ * for this request and should be freed after EAGAIN is returned in the
++ * ast.
++ */
++
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ lkb->lkb_retstatus = -EAGAIN;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
++ send_blocking_asts_all(rsb, lkb);
++ goto out;
++ }
++
++ /*
++ * The requested lkb must wait. Because the rsb of the requested lkb
++ * is mastered here, send blocking asts for the lkb's blocking the
++ * request.
++ */
++
++ lkb->lkb_retstatus = 0;
++ lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
++
++ send_blocking_asts(rsb, lkb);
++
++ out:
++ up_write(&rsb->res_lock);
++}
++
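++/*
++ * Example (continuing the illustrative dlm_lock() caller above):
++ *
++ *   error = dlm_unlock(ls, lksb.sb_lkid, 0, NULL, NULL);
++ *
++ * The unlock completes asynchronously too - the completion ast fires
++ * with sb_status set to -DLM_EUNLOCK once the lock is gone.
++ */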
++int dlm_unlock(void *lockspace,
++ uint32_t lkid,
++ uint32_t flags,
++ struct dlm_lksb *lksb,
++ void *astarg)
++{
++ gd_ls_t *ls = find_lockspace_by_local_id(lockspace);
++ gd_lkb_t *lkb;
++ gd_res_t *rsb;
++ int ret = -EINVAL;
++
++ if (!ls)
++ goto out;
++
++ lkb = find_lock_by_id(ls, lkid);
++ if (!lkb)
++ goto out;
++
++ /* Can't dequeue a master copy (a remote node's mastered lock) */
++ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
++ goto out;
++
++ /* Already waiting for a remote lock operation */
++ if (lkb->lkb_lockqueue_state) {
++ ret = -EBUSY;
++ goto out;
++ }
++
++ /* Can only cancel WAITING or CONVERTing locks.
++	 * This is just a quick check - it is also checked in dlm_unlock_stage2()
++ * (which may be on the master) under the semaphore.
++ */
++ if ((flags & DLM_LKF_CANCEL) &&
++ (lkb->lkb_status == GDLM_LKSTS_GRANTED))
++ goto out;
++
++ /* "Normal" unlocks must operate on a granted lock */
++ if (!(flags & DLM_LKF_CANCEL) &&
++ (lkb->lkb_status != GDLM_LKSTS_GRANTED))
++ goto out;
++
++ down_write(&ls->ls_unlock_sem);
++
++ /* Can't dequeue a lock with sublocks */
++ if (atomic_read(&lkb->lkb_childcnt)) {
++ up_write(&ls->ls_unlock_sem);
++ ret = -ENOTEMPTY;
++ goto out;
++ }
++
++ /* Mark it as deleted so we can't use it as a parent in dlm_lock() */
++ if (!(flags & DLM_LKF_CANCEL))
++ lkb->lkb_flags |= GDLM_LKFLG_DELETED;
++ up_write(&ls->ls_unlock_sem);
++
++ /* Save any new params */
++ if (lksb)
++ lkb->lkb_lksb = lksb;
++ if (astarg)
++ lkb->lkb_astparam = (long) astarg;
++
++ lkb->lkb_lockqueue_flags = flags;
++
++ rsb = lkb->lkb_resource;
++
++ down_read(&ls->ls_in_recovery);
++
++ if (rsb->res_nodeid)
++ ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
++ else
++ ret = dlm_unlock_stage2(lkb, flags);
++
++ up_read(&ls->ls_in_recovery);
++
++ wake_astd();
++
++ out:
++ return ret;
++}
++
++int dlm_unlock_stage2(gd_lkb_t *lkb, uint32_t flags)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++ int old_status;
++ int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;
++
++ down_write(&rsb->res_lock);
++
++ /* Can only cancel WAITING or CONVERTing locks */
++ if ((flags & DLM_LKF_CANCEL) &&
++ (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
++ lkb->lkb_retstatus = -EINVAL;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ goto out;
++ }
++
++ old_status = lkb_dequeue(lkb);
++
++ /*
++	 * If it was granted, grant any converting or waiting locks.
++ */
++
++ if (old_status == GDLM_LKSTS_GRANTED)
++ grant_pending_locks(rsb);
++
++ /*
++ * Cancelling a conversion
++ */
++
++ if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
++ /* VMS semantics say we should send blocking ASTs again here */
++ send_blocking_asts(rsb, lkb);
++
++ /* Remove from deadlock detection */
++ if (lkb->lkb_duetime)
++ remove_from_deadlockqueue(lkb);
++
++ /* Stick it back on the granted queue */
++ lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++ lkb->lkb_rqmode = lkb->lkb_grmode;
++
++ /* Was it blocking any other locks? */
++ if (first_in_list(lkb, &rsb->res_convertqueue))
++ grant_pending_locks(rsb);
++
++ lkb->lkb_retstatus = -DLM_ECANCEL;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ goto out;
++ }
++
++ /*
++ * The lvb can be saved or cleared on unlock.
++ */
++
++ if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
++ if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
++ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
++ if (flags & DLM_LKF_IVVALBLK)
++ memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
++ }
++
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ lkb->lkb_retstatus =
++ (flags & DLM_LKF_CANCEL) ? -DLM_ECANCEL : -DLM_EUNLOCK;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++
++ /*
++ * Only free the LKB if we are the master copy. Otherwise the AST
++ * delivery routine will free it after delivery. queue_ast for MSTCPY
++ * lkb just sends a message.
++ */
++
++ if (remote) {
++ up_write(&rsb->res_lock);
++ release_lkb(rsb->res_ls, lkb);
++ release_rsb(rsb);
++ goto out2;
++ }
++
++ out:
++ up_write(&rsb->res_lock);
++ out2:
++ wake_astd();
++ return 0;
++}
++
++/*
++ * Lock conversion
++ */
++
++static int convert_lock(gd_ls_t *ls, int mode, struct dlm_lksb *lksb,
++ int flags, void *ast, void *astarg, void *bast,
++ struct dlm_range *range)
++{
++ gd_lkb_t *lkb;
++ gd_res_t *rsb;
++ int ret = -EINVAL;
++
++ lkb = find_lock_by_id(ls, lksb->sb_lkid);
++ if (!lkb) {
++ goto out;
++ }
++
++ if (lkb->lkb_status != GDLM_LKSTS_GRANTED) {
++ ret = -EBUSY;
++ goto out;
++ }
++
++ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
++ goto out;
++ }
++
++ if ((flags & DLM_LKF_QUECVT) &&
++ !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) {
++ goto out;
++ }
++
++	if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) {
++		goto out;
++	}
++
++ /* Set up the ranges as appropriate */
++ if (range) {
++ if (range->ra_start > range->ra_end)
++ goto out;
++
++ if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) {
++ ret = -ENOMEM;
++ goto out;
++ }
++ }
++
++ rsb = lkb->lkb_resource;
++ down_read(&rsb->res_ls->ls_in_recovery);
++
++ lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK;
++ lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
++
++ if (flags & DLM_LKF_NODLCKWT)
++ lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
++ if (ast)
++ lkb->lkb_astaddr = ast;
++ if (astarg)
++ lkb->lkb_astparam = (long) astarg;
++ if (bast)
++ lkb->lkb_bastaddr = bast;
++ lkb->lkb_rqmode = mode;
++ lkb->lkb_lockqueue_flags = flags;
++ lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0;
++ lkb->lkb_lvbptr = lksb->sb_lvbptr;
++
++ if (rsb->res_nodeid) {
++ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
++ ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT);
++ } else {
++ ret = dlm_convert_stage2(lkb, FALSE);
++ }
++
++ up_read(&rsb->res_ls->ls_in_recovery);
++
++ wake_astd();
++
++ out:
++ return ret;
++}
++
++/*
++ * For local conversion requests on locally mastered locks this is called
++ * directly from dlm_lock/convert_lock. This function is also called for
++ * remote conversion requests of MSTCPY locks (from process_cluster_request).
++ */
++
++int dlm_convert_stage2(gd_lkb_t *lkb, int do_ast)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++ int ret = 0;
++
++ down_write(&rsb->res_lock);
++
++ if (can_be_granted(rsb, lkb)) {
++ grant_lock(lkb, 0);
++ grant_pending_locks(rsb);
++ goto out;
++ }
++
++ /*
++ * Remove lkb from granted queue.
++ */
++
++ lkb_dequeue(lkb);
++
++ /*
++	 * The user won't wait, so put it back on the granted queue.
++ */
++
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
++ lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++ ret = lkb->lkb_retstatus = -EAGAIN;
++ if (do_ast)
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
++ send_blocking_asts_all(rsb, lkb);
++ goto out;
++ }
++
++ /*
++ * The lkb's status tells which queue it's on. Put back on convert
++ * queue. (QUECVT requests added at end of the queue, all others in
++ * order.)
++ */
++
++ lkb->lkb_retstatus = 0;
++ lkb_enqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
++
++ /*
++	 * The request can't be granted, so notify the locks blocking it.
++ */
++
++ send_blocking_asts(rsb, lkb);
++
++ if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
++ add_to_deadlockqueue(lkb);
++
++ out:
++ up_write(&rsb->res_lock);
++ return ret;
++}
++
++/*
++ * Remove lkb from any queue it's on, add it to the granted queue, and queue a
++ * completion ast. rsb res_lock must be held in write when this is called.
++ */
++
++static void grant_lock(gd_lkb_t *lkb, int send_remote)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++
++ if (lkb->lkb_duetime)
++ remove_from_deadlockqueue(lkb);
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
++ int b;
++ GDLM_ASSERT(lkb->lkb_lvbptr,);
++
++ if (!rsb->res_lvbptr)
++ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
++
++ b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
++ if (b)
++ memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
++ else
++ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
++ }
++
++ if (lkb->lkb_range) {
++ lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
++ lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
++ }
++
++ lkb->lkb_grmode = lkb->lkb_rqmode;
++ lkb->lkb_rqmode = DLM_LOCK_IV;
++ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++
++ lkb->lkb_highbast = 0;
++ lkb->lkb_retstatus = 0;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++
++	/*
++	 * A remote conversion request has been granted, either immediately
++	 * upon being requested or after waiting a bit. In the former case
++	 * lkb_request is still set and reply_and_grant() sends a single
++	 * message combining the request reply with the grant. In the latter
++	 * case the lock is granted "out of band" - ie by another lock being
++	 * converted or unlocked - so send_remote is 1 and remote_grant()
++	 * sends a separate grant message.
++	 */
++
++ if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
++ if (send_remote)
++ remote_grant(lkb);
++ else if (lkb->lkb_request)
++ reply_and_grant(lkb);
++ }
++
++}
++
++static void send_bast_queue(struct list_head *head, gd_lkb_t *lkb)
++{
++ gd_lkb_t *gr;
++
++ list_for_each_entry(gr, head, lkb_statequeue) {
++ if (gr->lkb_bastaddr &&
++ gr->lkb_highbast < lkb->lkb_rqmode &&
++ ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
++ queue_ast(gr, GDLM_QUEUE_BLKAST, lkb->lkb_rqmode);
++ gr->lkb_highbast = lkb->lkb_rqmode;
++ }
++ }
++}
++
++/*
++ * Notify granted locks if they are blocking a newly forced-to-wait lock.
++ */
++
++static void send_blocking_asts(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ send_bast_queue(&rsb->res_grantqueue, lkb);
++ /* check if the following improves performance */
++ /* send_bast_queue(&rsb->res_convertqueue, lkb); */
++}
++
++static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ send_bast_queue(&rsb->res_grantqueue, lkb);
++ send_bast_queue(&rsb->res_convertqueue, lkb);
++}
++
++/*
++ * Called when a lock has been dequeued. Look for any locks to grant that are
++ * waiting for conversion or waiting to be granted.
++ * The rsb res_lock must be held in write when this function is called.
++ */
++
++int grant_pending_locks(gd_res_t *rsb)
++{
++ gd_lkb_t *lkb;
++ struct list_head *list;
++ struct list_head *temp;
++ int8_t high = DLM_LOCK_IV;
++
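++ /* "high" tracks the strongest requested mode among the locks we
++ * fail to grant below; it decides which blocking ASTs to send. */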
++ list_for_each_safe(list, temp, &rsb->res_convertqueue) {
++ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
++
++ if (can_be_granted(rsb, lkb))
++ grant_lock(lkb, 1);
++ else
++ high = MAX(lkb->lkb_rqmode, high);
++ }
++
++ list_for_each_safe(list, temp, &rsb->res_waitqueue) {
++ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
++
++ if (can_be_granted(rsb, lkb))
++ grant_lock(lkb, 1);
++ else
++ high = MAX(lkb->lkb_rqmode, high);
++ }
++
++ /*
++ * If there are locks left on the wait/convert queue then send blocking
++ * ASTs to granted locks that are blocking
++ *
++ * FIXME: This might generate some spurious blocking ASTs for range
++ * locks.
++ */
++
++ if (high > DLM_LOCK_IV) {
++ list_for_each_safe(list, temp, &rsb->res_grantqueue) {
++ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
++
++ if (lkb->lkb_bastaddr &&
++ (lkb->lkb_highbast < high) &&
++ !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
++
++ queue_ast(lkb, GDLM_QUEUE_BLKAST, high);
++ lkb->lkb_highbast = high;
++ }
++ }
++ }
++
++ return 0;
++}
++
++/*
++ * Called to cancel a locking operation that failed due to some internal
++ * reason.
++ *
++ * Waiting locks will be removed, converting locks will be reverted to their
++ * granted status, unlocks will be left where they are.
++ *
++ * A completion AST will be delivered to the caller.
++ */
++
++int cancel_lockop(gd_lkb_t *lkb, int status)
++{
++ int state = lkb->lkb_lockqueue_state;
++
++ lkb->lkb_lockqueue_state = 0;
++
++ switch (state) {
++ case GDLM_LQSTATE_WAIT_RSB:
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONDGRANT:
++ res_lkb_dequeue(lkb);
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONVERT:
++ res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
++
++ /* Remove from deadlock detection */
++ if (lkb->lkb_duetime) {
++ remove_from_deadlockqueue(lkb);
++ }
++ break;
++
++ case GDLM_LQSTATE_WAIT_UNLOCK:
++ /* We can leave this. I think.... */
++ break;
++ }
++
++ lkb->lkb_retstatus = status;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++
++ return 0;
++}
++
++/*
++ * Check for conversion deadlock. If a deadlock is found, return the
++ * caller's own lkb as the one to kill; otherwise return NULL.
++ */
++
++gd_lkb_t *conversion_deadlock_check(gd_lkb_t *lkb)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++ struct list_head *entry;
++
++ GDLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,);
++
++ /* Work our way up to the head of the queue looking for locks that
++ * conflict with us */
++
++ down_read(&rsb->res_lock);
++
++ entry = lkb->lkb_statequeue.prev;
++ while (entry != &rsb->res_convertqueue) {
++ gd_lkb_t *lkb2 = list_entry(entry, gd_lkb_t, lkb_statequeue);
++
++ if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) {
++ up_read(&rsb->res_lock);
++ return lkb;
++ }
++ entry = entry->prev;
++ }
++ up_read(&rsb->res_lock);
++
++ return NULL;
++}
++
++/*
++ * Conversion operation was cancelled by us (not the user).
++ * ret contains the return code to pass onto the user
++ */
++
++void cancel_conversion(gd_lkb_t *lkb, int ret)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++
++ /* Stick it back on the granted queue */
++ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++ lkb->lkb_rqmode = lkb->lkb_grmode;
++
++ remove_from_deadlockqueue(lkb);
++
++ lkb->lkb_retstatus = ret;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ wake_astd();
++}
++
++/*
++ * As new master of the rsb for this lkb, we need to handle these requests
++ * removed from the lockqueue and originating from local processes:
++ * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT,
++ * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT.
++ */
++
++void process_remastered_lkb(gd_lkb_t *lkb, int state)
++{
++ switch (state) {
++ case GDLM_LQSTATE_WAIT_RSB:
++ dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb,
++ lkb->lkb_lockqueue_flags,
++ lkb->lkb_resource->res_name,
++ lkb->lkb_resource->res_length);
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONDGRANT:
++ res_lkb_dequeue(lkb);
++ dlm_lock_stage3(lkb);
++ break;
++
++ case GDLM_LQSTATE_WAIT_UNLOCK:
++ dlm_unlock_stage2(lkb, lkb->lkb_lockqueue_flags);
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONVERT:
++ dlm_convert_stage2(lkb, TRUE);
++ break;
++
++ default:
++ GDLM_ASSERT(0,);
++ }
++}
+diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h
+--- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/locking.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,33 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOCKING_DOT_H__
++#define __LOCKING_DOT_H__
++
++void process_remastered_lkb(gd_lkb_t * lkb, int state);
++void dlm_lock_stage3(gd_lkb_t * lkb);
++int dlm_convert_stage2(gd_lkb_t * lkb, int do_ast);
++int dlm_unlock_stage2(gd_lkb_t * lkb, uint32_t flags);
++int dlm_lock_stage2(gd_ls_t * lspace, gd_lkb_t * lkb, gd_res_t * rsb,
++ int flags);
++gd_res_t *create_rsb(gd_ls_t * lspace, gd_lkb_t * lkb, char *name, int namelen);
++int free_rsb_if_unused(gd_res_t * rsb);
++gd_lkb_t *remote_stage2(int remote_csid, gd_ls_t * lspace,
++ struct gd_remlockrequest *freq);
++int cancel_lockop(gd_lkb_t * lkb, int status);
++int dlm_remove_lock(gd_lkb_t * lkb, uint32_t flags);
++int grant_pending_locks(gd_res_t * rsb);
++void cancel_conversion(gd_lkb_t * lkb, int ret);
++gd_lkb_t *conversion_deadlock_check(gd_lkb_t * lkb);
++
++#endif /* __LOCKING_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c
+--- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lockqueue.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,954 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * lockqueue.c
++ *
++ * This controls the lock queue, which is where locks
++ * come when they need to wait for a remote operation
++ * to complete.
++ *
++ * This could also be thought of as the "high-level" comms
++ * layer.
++ *
++ */
++
++#include "dlm_internal.h"
++#include "lockqueue.h"
++#include "dir.h"
++#include "locking.h"
++#include "lkb.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "reccomms.h"
++#include "nodes.h"
++#include "lockspace.h"
++#include "ast.h"
++#include "memory.h"
++#include "rsb.h"
++#include "queries.h"
++
++static void add_reply_lvb(gd_lkb_t * lkb, struct gd_remlockreply *reply);
++static void add_request_lvb(gd_lkb_t * lkb, struct gd_remlockrequest *req);
++
++/*
++ * format of an entry on the request queue
++ */
++struct rq_entry {
++ struct list_head rqe_list;
++ uint32_t rqe_nodeid;
++ char rqe_request[1];
++};
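++/*
++ * rqe_request[1] is the pre-C99 "struct hack": each entry is
++ * allocated as sizeof(struct rq_entry) + message length and the raw
++ * request is copied in after the fixed fields.
++ */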
++
++/*
++ * Add a new request (if appropriate) to the request queue and send the remote
++ * request out. - runs in the context of the locking caller
++ *
++ * Recovery of a remote_stage request if the remote end fails while the lkb
++ * is still on the lockqueue:
++ *
++ * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in
++ * lockqueue_lkb_mark() at the start of recovery.
++ *
++ * o Some lkb's will be rebuilt on new master rsb's during recovery.
++ * (depends on the type of request, see below).
++ *
++ * o At the end of recovery, resend_cluster_requests() looks at these
++ * LQRESEND lkb's and either:
++ *
++ * i) resends the request to the new master for the rsb where the
++ * request is processed as usual. The lkb remains on the lockqueue until
++ * the new master replies and we run process_lockqueue_reply().
++ *
++ * ii) if we've become the rsb master, removes the lkb from the
++ * lockqueue and processes the request locally via
++ * process_remastered_lkb().
++ *
++ * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue
++ * and the request should be resent if dest node is failed.
++ *
++ * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's
++ * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag
++ * makes send_lkb_queue() skip it). Resend this request to the new master.
++ *
++ * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will
++ * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue).
++ * Resend this request to the new master.
++ *
++ * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue.
++ * It will be rebuilt on the new master rsb's granted queue. Resend this
++ * request to the new master.
++ */
++
++int remote_stage(gd_lkb_t *lkb, int state)
++{
++ int error;
++
++ lkb->lkb_lockqueue_state = state;
++ add_to_lockqueue(lkb);
++
++ error = send_cluster_request(lkb, state);
++ if (error < 0) {
++ log_print("remote_stage error sending request %d", error);
++
++ /* Leave on lockqueue, it will be resent to correct node during
++ * recovery. */
++
++ /*
++ lkb->lkb_lockqueue_state = 0;
++ remove_from_lockqueue(lkb);
++ return -ENOTCONN;
++ */
++ }
++ return 0;
++}
++
++/*
++ * Requests received while the lockspace is in recovery get added to the
++ * request queue and processed when recovery is complete.
++ */
++
++void add_to_requestqueue(gd_ls_t *ls, int nodeid, char *request, int length)
++{
++ struct rq_entry *entry;
++
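++ /* A request from a node that has since left the cluster is stale;
++ * recovery is expected to clean up anything it refers to, so it is
++ * simply dropped. */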
++ if (in_nodes_gone(ls, nodeid))
++ return;
++
++ entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
++ if (!entry) {
++ // TODO something better
++ printk("dlm: add_to_requestqueue: out of memory\n");
++ return;
++ }
++
++ log_debug(ls, "add_to_requestqueue %d", nodeid);
++ entry->rqe_nodeid = nodeid;
++ memcpy(entry->rqe_request, request, length);
++ list_add_tail(&entry->rqe_list, &ls->ls_requestqueue);
++}
++
++int process_requestqueue(gd_ls_t *ls)
++{
++ int error = 0, count = 0;
++ struct rq_entry *entry, *safe;
++ struct gd_req_header *req;
++
++ log_all(ls, "process held requests");
++
++ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
++ req = (struct gd_req_header *) entry->rqe_request;
++ log_debug(ls, "process_requestqueue %u", entry->rqe_nodeid);
++
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
++ log_debug(ls, "process_requestqueue aborted");
++ error = -EINTR;
++ break;
++ }
++
++ error = process_cluster_request(entry->rqe_nodeid, req, TRUE);
++ if (error == -EINTR) {
++ log_debug(ls, "process_requestqueue interrupted");
++ break;
++ }
++
++ list_del(&entry->rqe_list);
++ kfree(entry);
++ count++;
++ error = 0;
++ }
++
++ log_all(ls, "processed %d requests", count);
++ return error;
++}
++
++void wait_requestqueue(gd_ls_t *ls)
++{
++ while (!list_empty(&ls->ls_requestqueue) &&
++ test_bit(LSFL_LS_RUN, &ls->ls_flags))
++ schedule();
++}
++
++/*
++ * Resdir requests (lookup or remove) and replies from before recovery are
++ * invalid since the resdir was rebuilt. Clear them. Requests from nodes now
++ * gone are also invalid.
++ */
++
++void purge_requestqueue(gd_ls_t *ls)
++{
++ int count = 0;
++ struct rq_entry *entry, *safe;
++ struct gd_req_header *req;
++ struct gd_remlockrequest *freq;
++ gd_lkb_t *lkb;
++
++ log_all(ls, "purge requests");
++
++ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
++ req = (struct gd_req_header *) entry->rqe_request;
++ freq = (struct gd_remlockrequest *) req;
++
++ if (req->rh_cmd == GDLM_REMCMD_REM_RESDATA ||
++ req->rh_cmd == GDLM_REMCMD_LOOKUP ||
++ in_nodes_gone(ls, entry->rqe_nodeid)) {
++
++ list_del(&entry->rqe_list);
++ kfree(entry);
++ count++;
++
++ } else if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY) {
++
++ /*
++ * Replies to resdir lookups are invalid and must be
++ * purged. The lookup requests are marked in
++ * lockqueue_lkb_mark and will be resent in
++ * resend_cluster_requests. The only way to check if
++ * this is a lookup reply is to look at the
++ * lockqueue_state of the lkb.
++ */
++
++ lkb = find_lock_by_id(ls, freq->rr_header.rh_lkid);
++ GDLM_ASSERT(lkb,);
++ if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
++ list_del(&entry->rqe_list);
++ kfree(entry);
++ count++;
++ }
++ }
++ }
++
++ log_all(ls, "purged %d requests", count);
++}
++
++/*
++ * Check if there's a reply for the given lkid in the requestqueue.
++ */
++
++int reply_in_requestqueue(gd_ls_t *ls, int lkid)
++{
++ int rv = FALSE;
++ struct rq_entry *entry, *safe;
++ struct gd_req_header *req;
++ struct gd_remlockrequest *freq;
++
++ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
++ req = (struct gd_req_header *) entry->rqe_request;
++ freq = (struct gd_remlockrequest *) req;
++
++ if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY &&
++ freq->rr_header.rh_lkid == lkid) {
++ rv = TRUE;
++ break;
++ }
++ }
++
++ return rv;
++}
++
++void allocate_and_copy_lvb(gd_ls_t *ls, char **lvbptr, char *src)
++{
++ if (!*lvbptr)
++ *lvbptr = allocate_lvb(ls);
++ if (*lvbptr)
++ memcpy(*lvbptr, src, DLM_LVB_LEN);
++}
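++/*
++ * Note that an allocation failure above is tolerated silently: the
++ * copy is skipped and the caller is left with a NULL lvbptr.
++ */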
++
++/*
++ * Process a lockqueue LKB after its remote processing has completed
++ * and it has been pulled from the lockqueue. Runs in the context of
++ * the DLM recvd thread on the machine that requested the lock.
++ */
++
++static void process_lockqueue_reply(gd_lkb_t *lkb,
++ struct gd_remlockreply *reply)
++{
++ int state = lkb->lkb_lockqueue_state;
++ int oldstate;
++ gd_res_t *rsb = lkb->lkb_resource;
++ gd_ls_t *ls = rsb->res_ls;
++
++ lkb->lkb_lockqueue_state = 0;
++ if (state)
++ remove_from_lockqueue(lkb);
++
++ switch (state) {
++ case GDLM_LQSTATE_WAIT_RSB:
++
++ GDLM_ASSERT(reply->rl_status == 0,);
++
++ if (reply->rl_nodeid == our_nodeid())
++ rsb->res_nodeid = 0;
++ else
++ rsb->res_nodeid = reply->rl_nodeid;
++
++ rsb->res_resdir_seq = reply->rl_resdir_seq;
++ lkb->lkb_nodeid = rsb->res_nodeid;
++
++ dlm_lock_stage2(rsb->res_ls, lkb, rsb,
++ lkb->lkb_lockqueue_flags);
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONVERT:
++ case GDLM_LQSTATE_WAIT_CONDGRANT:
++
++ /*
++ * After a remote lock/conversion/grant request we put the lock
++ * on the right queue and send an AST if appropriate. Any lock
++ * shuffling (e.g. locks newly granted because this one was
++ * converted downwards) will be dealt with in separate messages
++ * (which may arrive in the same network message).
++ */
++
++ if (!lkb->lkb_remid)
++ lkb->lkb_remid = reply->rl_lkid;
++
++ /*
++ * The remote request failed (we assume because of NOQUEUE).
++ * If this is a new request (non-conv) the lkb was created just
++ * for it so the lkb should be freed. If this was a
++ * conversion, the lkb already existed so we should put it back
++ * on the grant queue.
++ */
++
++ if (reply->rl_status != 0) {
++ GDLM_ASSERT(reply->rl_status == -EAGAIN,);
++
++ if (state == GDLM_LQSTATE_WAIT_CONDGRANT) {
++ res_lkb_dequeue(lkb);
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ } else
++ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++
++ lkb->lkb_retstatus = reply->rl_status;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ break;
++ }
++
++ /*
++ * The remote request was successful in granting the request or
++ * queuing it to be granted later. Add the lkb to the
++ * appropriate rsb queue.
++ */
++
++ switch (reply->rl_lockstate) {
++ case GDLM_LKSTS_GRANTED:
++
++ /* Compact version of grant_lock(). */
++
++ down_write(&rsb->res_lock);
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
++ memcpy(lkb->lkb_lvbptr, reply->rl_lvb,
++ DLM_LVB_LEN);
++
++ lkb->lkb_grmode = lkb->lkb_rqmode;
++ lkb->lkb_rqmode = DLM_LOCK_IV;
++ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++
++ if (lkb->lkb_range) {
++ lkb->lkb_range[GR_RANGE_START] =
++ lkb->lkb_range[RQ_RANGE_START];
++ lkb->lkb_range[GR_RANGE_END] =
++ lkb->lkb_range[RQ_RANGE_END];
++ }
++ up_write(&rsb->res_lock);
++
++ lkb->lkb_retstatus = 0;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ break;
++
++ case GDLM_LKSTS_WAITING:
++
++ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
++ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING);
++ else
++ log_error(ls, "wait reply for granted %x %u",
++ lkb->lkb_id, lkb->lkb_nodeid);
++ break;
++
++ case GDLM_LKSTS_CONVERT:
++
++ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
++ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
++ else
++ log_error(ls, "convert reply for granted %x %u",
++ lkb->lkb_id, lkb->lkb_nodeid);
++ break;
++
++ default:
++ log_error(ls, "process_lockqueue_reply state %d",
++ reply->rl_lockstate);
++ }
++
++ break;
++
++ case GDLM_LQSTATE_WAIT_UNLOCK:
++
++ /*
++ * Unlocks should never fail. Update the local lock info; this
++ * always sends a completion AST with the status in the lksb.
++ */
++
++ GDLM_ASSERT(reply->rl_status == 0,);
++ oldstate = res_lkb_dequeue(lkb);
++
++ /* Differentiate between unlocks and conversion cancellations */
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL &&
++ oldstate == GDLM_LKSTS_CONVERT) {
++ res_lkb_enqueue(lkb->lkb_resource, lkb,
++ GDLM_LKSTS_GRANTED);
++ lkb->lkb_retstatus = -DLM_ECANCEL;
++ } else {
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ lkb->lkb_retstatus = -DLM_EUNLOCK;
++ }
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ break;
++
++ default:
++ log_error(ls, "process_lockqueue_reply id %x state %d",
++ lkb->lkb_id, state);
++ }
++}
++
++/*
++ * Tell a remote node to grant a lock. This happens when we are the master
++ * copy for a lock that is actually held on a remote node. The remote end is
++ * also responsible for sending the completion AST.
++ */
++
++void remote_grant(gd_lkb_t *lkb)
++{
++ struct writequeue_entry *e;
++ struct gd_remlockrequest *req;
++
++ // TODO Error handling
++ e = lowcomms_get_buffer(lkb->lkb_nodeid,
++ sizeof(struct gd_remlockrequest),
++ lkb->lkb_resource->res_ls->ls_allocation,
++ (char **) &req);
++ if (!e)
++ return;
++
++ req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT;
++ req->rr_header.rh_length = sizeof(struct gd_remlockrequest);
++ req->rr_header.rh_flags = 0;
++ req->rr_header.rh_lkid = lkb->lkb_id;
++ req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id;
++ req->rr_remlkid = lkb->lkb_remid;
++ req->rr_flags = 0;
++
++ if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) {
++ /* This is a confusing non-standard use of rr_flags which is
++ * usually used to pass lockqueue_flags. */
++ req->rr_flags |= GDLM_LKFLG_DEMOTED;
++ }
++
++ add_request_lvb(lkb, req);
++ midcomms_send_buffer(&req->rr_header, e);
++}
++
++void reply_and_grant(gd_lkb_t *lkb)
++{
++ struct gd_remlockrequest *req = lkb->lkb_request;
++ struct gd_remlockreply *reply;
++ struct writequeue_entry *e;
++
++ // TODO Error handling
++ e = lowcomms_get_buffer(lkb->lkb_nodeid,
++ sizeof(struct gd_remlockreply),
++ lkb->lkb_resource->res_ls->ls_allocation,
++ (char **) &reply);
++ if (!e)
++ return;
++
++ reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
++ reply->rl_header.rh_flags = 0;
++ reply->rl_header.rh_length = sizeof(struct gd_remlockreply);
++ reply->rl_header.rh_lkid = req->rr_header.rh_lkid;
++ reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace;
++
++ reply->rl_status = lkb->lkb_retstatus;
++ reply->rl_lockstate = lkb->lkb_status;
++ reply->rl_lkid = lkb->lkb_id;
++
++ GDLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),);
++
++ lkb->lkb_request = NULL;
++
++ add_reply_lvb(lkb, reply);
++ midcomms_send_buffer(&reply->rl_header, e);
++}
++
++/*
++ * Request removal of a dead entry in the resource directory
++ */
++
++void remote_remove_resdata(gd_ls_t *ls, int nodeid, char *name, int namelen,
++ uint8_t sequence)
++{
++ struct writequeue_entry *e;
++ struct gd_remlockrequest *req;
++
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
++ gd_rcom_t *rc = allocate_rcom_buffer(ls);
++
++ memcpy(rc->rc_buf, name, namelen);
++ rc->rc_datalen = namelen;
++
++ rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0);
++
++ free_rcom_buffer(rc);
++ return;
++ }
++ // TODO Error handling
++ e = lowcomms_get_buffer(nodeid,
++ sizeof(struct gd_remlockrequest) + namelen - 1,
++ ls->ls_allocation, (char **) &req);
++ if (!e)
++ return;
++
++ memset(req, 0, sizeof(struct gd_remlockrequest) + namelen - 1);
++ req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA;
++ req->rr_header.rh_length =
++ sizeof(struct gd_remlockrequest) + namelen - 1;
++ req->rr_header.rh_flags = 0;
++ req->rr_header.rh_lkid = 0;
++ req->rr_header.rh_lockspace = ls->ls_global_id;
++ req->rr_remlkid = 0;
++ req->rr_resdir_seq = sequence;
++ memcpy(req->rr_name, name, namelen);
++
++ midcomms_send_buffer(&req->rr_header, e);
++}
++
++/*
++ * Send remote cluster request to directory or master node before the request
++ * is put on the lock queue. Runs in the context of the locking caller.
++ */
++
++int send_cluster_request(gd_lkb_t *lkb, int state)
++{
++ uint32_t target_nodeid;
++ gd_res_t *rsb = lkb->lkb_resource;
++ gd_ls_t *ls = rsb->res_ls;
++ struct gd_remlockrequest *req;
++ struct writequeue_entry *e;
++
++ /* Need to know the target nodeid before we allocate a send buffer */
++ target_nodeid = lkb->lkb_nodeid;
++ GDLM_ASSERT(target_nodeid != 0,);
++
++ if (state == GDLM_LQSTATE_WAIT_RSB)
++ target_nodeid = get_directory_nodeid(rsb);
++
++ GDLM_ASSERT(target_nodeid,);
++
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
++ /* this may happen when called by resend_cluster_request */
++ log_error(ls, "send_cluster_request to %u state %d recovery",
++ target_nodeid, state);
++ }
++
++ e = lowcomms_get_buffer(target_nodeid,
++ sizeof(struct gd_remlockrequest) +
++ rsb->res_length - 1, ls->ls_allocation,
++ (char **) &req);
++ if (!e)
++ return -ENOBUFS;
++ memset(req, 0, sizeof(struct gd_remlockrequest) + rsb->res_length - 1);
++
++ /* Common stuff, some are just defaults */
++
++ if (lkb->lkb_bastaddr)
++ req->rr_asts = GDLM_QUEUE_BLKAST;
++ if (lkb->lkb_astaddr)
++ req->rr_asts |= GDLM_QUEUE_COMPAST;
++ if (lkb->lkb_parent)
++ req->rr_remparid = lkb->lkb_parent->lkb_remid;
++
++ req->rr_flags = lkb->lkb_lockqueue_flags;
++ req->rr_rqmode = lkb->lkb_rqmode;
++ req->rr_remlkid = lkb->lkb_remid;
++ req->rr_header.rh_length =
++ sizeof(struct gd_remlockrequest) + rsb->res_length - 1;
++ req->rr_header.rh_flags = 0;
++ req->rr_header.rh_lkid = lkb->lkb_id;
++ req->rr_header.rh_lockspace = ls->ls_global_id;
++
++ switch (state) {
++
++ case GDLM_LQSTATE_WAIT_RSB:
++
++ /* The lock must be a root lock */
++ GDLM_ASSERT(!lkb->lkb_parent,);
++
++ req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP;
++ memcpy(req->rr_name, rsb->res_name, rsb->res_length);
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONVERT:
++
++ req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST;
++ if (lkb->lkb_range) {
++ req->rr_flags |= GDLM_LKFLG_RANGE;
++ req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
++ req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
++ }
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONDGRANT:
++
++ req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST;
++ req->rr_resdir_seq = rsb->res_resdir_seq;
++ memcpy(req->rr_name, rsb->res_name, rsb->res_length);
++ if (lkb->lkb_range) {
++ req->rr_flags |= GDLM_LKFLG_RANGE;
++ req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
++ req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
++ }
++ break;
++
++ case GDLM_LQSTATE_WAIT_UNLOCK:
++
++ req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST;
++ break;
++
++ default:
++ GDLM_ASSERT(!"Unknown cluster request",);
++ }
++
++ add_request_lvb(lkb, req);
++ midcomms_send_buffer(&req->rr_header, e);
++
++ return 0;
++}
++
++/*
++ * We got a request from another cluster node, process it and return an info
++ * structure with the lock state/LVB etc as required. Executes in the DLM's
++ * recvd thread.
++ */
++
++int process_cluster_request(int nodeid, struct gd_req_header *req, int recovery)
++{
++ gd_ls_t *lspace;
++ gd_lkb_t *lkb = NULL;
++ gd_res_t *rsb;
++ int send_reply = 0, status = 0, namelen;
++ struct gd_remlockrequest *freq = (struct gd_remlockrequest *) req;
++ struct gd_remlockreply reply;
++
++ lspace = find_lockspace_by_global_id(req->rh_lockspace);
++
++ if (!lspace) {
++ log_print("process_cluster_request invalid lockspace %x "
++ "from %d req %u", req->rh_lockspace, nodeid,
++ req->rh_cmd);
++ status = -EINVAL;
++ goto out;
++ }
++
++ /* wait for recoverd to drain requestqueue */
++ if (!recovery)
++ wait_requestqueue(lspace);
++
++ /*
++ * If we're in recovery then queue the request for later. Otherwise,
++ * we still need to get the "in_recovery" lock to make sure the
++ * recovery itself doesn't start until we are done.
++ */
++ retry:
++ if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) {
++ if (test_bit(LSFL_REQUEST_WARN, &lspace->ls_flags))
++ log_error(lspace, "process_cluster_request warning %u",
++ nodeid);
++ add_to_requestqueue(lspace, nodeid, (char *) req,
++ req->rh_length);
++ log_debug(lspace, "process_cluster_request abort");
++ status = -EINTR;
++ goto out;
++ }
++ if (!down_read_trylock(&lspace->ls_in_recovery)) {
++ schedule();
++ goto retry;
++ }
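++ /* Retrying rather than sleeping on the rwsem lets the LSFL_LS_RUN
++ * test above run again: if recovery begins in the meantime the
++ * request gets queued for later instead of blocking here. */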
++
++ /*
++ * Process the request.
++ */
++
++ switch (req->rh_cmd) {
++
++ case GDLM_REMCMD_LOOKUP:
++ {
++ gd_resdata_t *rd;
++ int status;
++ uint32_t dir_nodeid;
++
++ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
++
++ dir_nodeid = name_to_directory_nodeid(lspace,
++ freq->rr_name,
++ namelen);
++ if (dir_nodeid != our_nodeid())
++ log_debug(lspace, "ignoring directory lookup");
++
++ status = get_resdata(lspace, nodeid, freq->rr_name,
++ namelen, &rd, 0);
++
++ reply.rl_lockstate = 0;
++ if (status) {
++ /* rd is not valid on failure; don't touch it */
++ reply.rl_status = -ENOMEM;
++ reply.rl_nodeid = 0;
++ reply.rl_resdir_seq = 0;
++ } else {
++ reply.rl_status = 0;
++ reply.rl_nodeid = rd->rd_master_nodeid;
++ reply.rl_resdir_seq = rd->rd_sequence;
++ }
++ }
++ send_reply = 1;
++ break;
++
++ case GDLM_REMCMD_REM_RESDATA:
++
++ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
++ remove_resdata(lspace, nodeid, freq->rr_name, namelen,
++ freq->rr_resdir_seq);
++ break;
++
++ case GDLM_REMCMD_LOCKREQUEST:
++
++ lkb = remote_stage2(nodeid, lspace, freq);
++ if (lkb) {
++ lkb->lkb_request = freq;
++ dlm_lock_stage3(lkb);
++
++ /*
++ * If the request was granted in lock_stage3, then a
++ * reply message was already sent in combination with
++ * the grant message and lkb_request is NULL.
++ */
++
++ if (lkb->lkb_request) {
++ lkb->lkb_request = NULL;
++ send_reply = 1;
++ reply.rl_status = lkb->lkb_retstatus;
++ reply.rl_lockstate = lkb->lkb_status;
++ reply.rl_lkid = lkb->lkb_id;
++
++ /*
++ * If the request could not be granted and the
++ * user won't wait, then free up the LKB
++ */
++
++ if (lkb->lkb_flags & GDLM_LKFLG_DELAST) {
++ rsb = lkb->lkb_resource;
++ release_lkb(lspace, lkb);
++ release_rsb(rsb);
++ lkb = NULL;
++ }
++ }
++ } else {
++ reply.rl_status = -ENOMEM;
++ send_reply = 1;
++ }
++ break;
++
++ case GDLM_REMCMD_CONVREQUEST:
++
++ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
++ log_error(lspace, "convrequest: invalid status %d",
++ lkb->lkb_status);
++
++ lkb->lkb_rqmode = freq->rr_rqmode;
++ lkb->lkb_lockqueue_flags = freq->rr_flags;
++ lkb->lkb_request = freq;
++ lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK
++ || freq->rr_flags & DLM_LKF_VALBLK) {
++ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
++ allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr,
++ freq->rr_lvb);
++ }
++
++ if (freq->rr_flags & GDLM_LKFLG_RANGE) {
++ if (lkb_set_range(lspace, lkb, freq->rr_range_start,
++ freq->rr_range_end)) {
++ reply.rl_status = -ENOMEM;
++ send_reply = 1;
++ /* break, not goto out: ls_in_recovery must be released */
++ break;
++ }
++ }
++
++ dlm_convert_stage2(lkb, FALSE);
++
++ /*
++ * If the conv request was granted in stage2, then a reply
++ * message was already sent in combination with the grant
++ * message.
++ */
++
++ if (lkb->lkb_request) {
++ lkb->lkb_request = NULL;
++ send_reply = 1;
++ reply.rl_status = lkb->lkb_retstatus;
++ reply.rl_lockstate = lkb->lkb_status;
++ reply.rl_lkid = lkb->lkb_id;
++ }
++ break;
++
++ case GDLM_REMCMD_LOCKREPLY:
++
++ lkb = find_lock_by_id(lspace, freq->rr_header.rh_lkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ process_lockqueue_reply(lkb, (struct gd_remlockreply *) req);
++ break;
++
++ case GDLM_REMCMD_LOCKGRANT:
++
++ /*
++ * Remote lock has been granted asynchronously. Do a compact
++ * version of what grant_lock() does.
++ */
++
++ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ rsb = lkb->lkb_resource;
++
++ if (lkb->lkb_lockqueue_state)
++ log_error(rsb->res_ls, "granting lock on lockqueue "
++ "id=%x from=%u lqstate=%d flags=%x",
++ lkb->lkb_id, nodeid, lkb->lkb_lockqueue_state,
++ lkb->lkb_flags);
++
++ down_write(&rsb->res_lock);
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
++ memcpy(lkb->lkb_lvbptr, freq->rr_lvb, DLM_LVB_LEN);
++
++ lkb->lkb_grmode = lkb->lkb_rqmode;
++ lkb->lkb_rqmode = DLM_LOCK_IV;
++
++ if (lkb->lkb_range) {
++ lkb->lkb_range[GR_RANGE_START] =
++ lkb->lkb_range[RQ_RANGE_START];
++ lkb->lkb_range[GR_RANGE_END] =
++ lkb->lkb_range[RQ_RANGE_END];
++ }
++
++ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++ up_write(&rsb->res_lock);
++
++ if (freq->rr_flags & GDLM_LKFLG_DEMOTED)
++ lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
++
++ lkb->lkb_retstatus = 0;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ break;
++
++ case GDLM_REMCMD_SENDBAST:
++
++ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ if (lkb->lkb_status == GDLM_LKSTS_GRANTED)
++ queue_ast(lkb, GDLM_QUEUE_BLKAST, freq->rr_rqmode);
++ break;
++
++ case GDLM_REMCMD_SENDCAST:
++
++ /* This is only used for some error completion ASTs */
++
++ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ /* Return the lock to granted status */
++ res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
++
++ lkb->lkb_retstatus = freq->rr_status;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ break;
++
++ case GDLM_REMCMD_UNLOCKREQUEST:
++
++ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ reply.rl_status = dlm_unlock_stage2(lkb, freq->rr_flags);
++ send_reply = 1;
++ break;
++
++ case GDLM_REMCMD_QUERY:
++ remote_query(nodeid, lspace, req);
++ break;
++
++ case GDLM_REMCMD_QUERYREPLY:
++ remote_query_reply(nodeid, lspace, req);
++ break;
++
++ default:
++ log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd);
++ }
++
++ up_read(&lspace->ls_in_recovery);
++
++ out:
++ if (send_reply) {
++ reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
++ reply.rl_header.rh_flags = 0;
++ reply.rl_header.rh_length = sizeof(reply);
++ reply.rl_header.rh_lkid = freq->rr_header.rh_lkid;
++ reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace;
++
++ status = midcomms_send_message(nodeid, &reply.rl_header,
++ GFP_KERNEL);
++ }
++
++ wake_astd();
++
++ return status;
++}
++
++static void add_reply_lvb(gd_lkb_t *lkb, struct gd_remlockreply *reply)
++{
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
++ memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
++}
++
++static void add_request_lvb(gd_lkb_t *lkb, struct gd_remlockrequest *req)
++{
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
++ memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
++}
+diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h
+--- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lockqueue.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOCKQUEUE_DOT_H__
++#define __LOCKQUEUE_DOT_H__
++
++void remote_grant(gd_lkb_t * lkb);
++void reply_and_grant(gd_lkb_t * lkb);
++int remote_stage(gd_lkb_t * lkb, int state);
++int process_cluster_request(int csid, struct gd_req_header *req, int recovery);
++int send_cluster_request(gd_lkb_t * lkb, int state);
++void purge_requestqueue(gd_ls_t * ls);
++int process_requestqueue(gd_ls_t * ls);
++int reply_in_requestqueue(gd_ls_t * ls, int lkid);
++void remote_remove_resdata(gd_ls_t * ls, int nodeid, char *name, int namelen,
++ uint8_t sequence);
++void allocate_and_copy_lvb(gd_ls_t * ls, char **lvbptr, char *src);
++
++#endif /* __LOCKQUEUE_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c
+--- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lockspace.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,706 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/module.h>
++
++#include "dlm_internal.h"
++#include "recoverd.h"
++#include "ast.h"
++#include "lkb.h"
++#include "nodes.h"
++#include "dir.h"
++#include "lowcomms.h"
++#include "config.h"
++#include "memory.h"
++#include "lockspace.h"
++#include "device.h"
++
++#define GDST_NONE (0)
++#define GDST_RUNNING (1)
++
++static int gdlmstate;
++static int gdlmcount;
++static struct semaphore gdlmstate_lock;
++struct list_head lslist;
++spinlock_t lslist_lock;
++struct kcl_service_ops ls_ops;
++
++static int new_lockspace(char *name, int namelen, void **lockspace, int flags);
++
++
++void dlm_lockspace_init(void)
++{
++ gdlmstate = GDST_NONE;
++ gdlmcount = 0;
++ init_MUTEX(&gdlmstate_lock);
++ INIT_LIST_HEAD(&lslist);
++ spin_lock_init(&lslist_lock);
++}
++
++gd_ls_t *find_lockspace_by_global_id(uint32_t id)
++{
++ gd_ls_t *ls;
++
++ spin_lock(&lslist_lock);
++
++ list_for_each_entry(ls, &lslist, ls_list) {
++ if (ls->ls_global_id == id)
++ goto out;
++ }
++ ls = NULL;
++ out:
++ spin_unlock(&lslist_lock);
++ return ls;
++}
++
++/* TODO: make this more efficient */
++gd_ls_t *find_lockspace_by_local_id(void *id)
++{
++ gd_ls_t *ls;
++
++ spin_lock(&lslist_lock);
++
++ list_for_each_entry(ls, &lslist, ls_list) {
++ if (ls->ls_local_id == (uint32_t)(long)id)
++ goto out;
++ }
++ ls = NULL;
++ out:
++ spin_unlock(&lslist_lock);
++ return ls;
++}
++
++gd_ls_t *find_lockspace_by_name(char *name, int namelen)
++{
++ gd_ls_t *ls;
++
++ spin_lock(&lslist_lock);
++
++ list_for_each_entry(ls, &lslist, ls_list) {
++ if (ls->ls_namelen == namelen &&
++ memcmp(ls->ls_name, name, namelen) == 0)
++ goto out;
++ }
++ ls = NULL;
++ out:
++ spin_unlock(&lslist_lock);
++ return ls;
++}
++
++/*
++ * Called from dlm_init. These are the general threads which are not
++ * lockspace-specific and work for all gdlm lockspaces.
++ */
++
++static int threads_start(void)
++{
++ int error;
++
++ /* Thread which interacts with cman for all ls's */
++ error = recoverd_start();
++ if (error) {
++ log_print("cannot start recovery thread %d", error);
++ goto fail;
++ }
++
++ /* Thread which delivers ASTs for all ls's */
++ error = astd_start();
++ if (error) {
++ log_print("cannot start ast thread %d", error);
++ goto recoverd_fail;
++ }
++
++ /* Thread for sending/receiving messages for all ls's */
++ error = lowcomms_start();
++ if (error) {
++ log_print("cannot start lowcomms %d", error);
++ goto astd_fail;
++ }
++
++ return 0;
++
++ astd_fail:
++ astd_stop();
++
++ recoverd_fail:
++ recoverd_stop();
++
++ fail:
++ return error;
++}
++
++static void threads_stop(void)
++{
++ lowcomms_stop();
++ astd_stop();
++ recoverd_stop();
++}
++
++static int init_internal(void)
++{
++ int error = 0;
++
++ if (gdlmstate == GDST_RUNNING)
++ gdlmcount++;
++ else {
++ error = threads_start();
++ if (error)
++ goto out;
++
++ gdlmstate = GDST_RUNNING;
++ gdlmcount = 1;
++ }
++
++ out:
++ return error;
++}
++
++
++/*
++ * Called after gdlm module is loaded and before any lockspaces are created.
++ * Starts and initializes global threads and structures. These global entities
++ * are shared by and independent of all lockspaces.
++ *
++ * There should be a gdlm-specific user command that calls this
++ * function. If no one has run that command by the time something
++ * creates a new lockspace, this is called first.
++ *
++ * This also starts the default lockspace.
++ */
++
++int dlm_init(void)
++{
++ int error;
++
++ down(&gdlmstate_lock);
++ error = init_internal();
++ up(&gdlmstate_lock);
++
++ return error;
++}
++
++int dlm_release(void)
++{
++ int error = 0;
++
++ down(&gdlmstate_lock);
++
++ if (gdlmstate == GDST_NONE)
++ goto out;
++
++ if (gdlmcount)
++ gdlmcount--;
++
++ if (gdlmcount)
++ goto out;
++
++ spin_lock(&lslist_lock);
++ if (!list_empty(&lslist)) {
++ spin_unlock(&lslist_lock);
++ log_print("cannot stop threads, lockspaces still exist");
++ goto out;
++ }
++ spin_unlock(&lslist_lock);
++
++ threads_stop();
++ gdlmstate = GDST_NONE;
++
++ out:
++ up(&gdlmstate_lock);
++
++ return error;
++}
++
++gd_ls_t *allocate_ls(int namelen)
++{
++ gd_ls_t *ls;
++
++ /* FIXME: use appropriate malloc type */
++
++ ls = kmalloc(sizeof(gd_ls_t) + namelen, GFP_KERNEL);
++ if (ls)
++ memset(ls, 0, sizeof(gd_ls_t) + namelen);
++
++ return ls;
++}
++
++void free_ls(gd_ls_t *ls)
++{
++ kfree(ls);
++}
++
++static int new_lockspace(char *name, int namelen, void **lockspace, int flags)
++{
++ gd_ls_t *ls;
++ int i, error = -ENOMEM;
++ uint32_t local_id = 0;
++
++ if (!try_module_get(THIS_MODULE))
++ return -EINVAL;
++
++ if (namelen > MAX_SERVICE_NAME_LEN) {
++ module_put(THIS_MODULE);
++ return -EINVAL;
++ }
++
++ if ((ls = find_lockspace_by_name(name, namelen))) {
++ *lockspace = (void *)ls->ls_local_id;
++ module_put(THIS_MODULE);
++ return -EEXIST;
++ }
++
++ /*
++ * Initialize ls fields
++ */
++
++ ls = allocate_ls(namelen);
++ if (!ls)
++ goto out;
++
++ memcpy(ls->ls_name, name, namelen);
++ ls->ls_namelen = namelen;
++
++ ls->ls_allocation = GFP_KERNEL;
++ memset(&ls->ls_flags, 0, sizeof(unsigned long));
++ INIT_LIST_HEAD(&ls->ls_rootres);
++ ls->ls_hashsize = dlm_config.reshashtbl;
++ ls->ls_hashmask = ls->ls_hashsize - 1;
++
++ ls->ls_reshashtbl =
++ kmalloc(sizeof(struct list_head) * ls->ls_hashsize, GFP_KERNEL);
++ if (!ls->ls_reshashtbl)
++ goto out_lsfree;
++
++ for (i = 0; i < ls->ls_hashsize; i++)
++ INIT_LIST_HEAD(&ls->ls_reshashtbl[i]);
++
++ rwlock_init(&ls->ls_reshash_lock);
++
++ if (init_lockidtbl(ls, dlm_config.lockidtbl) == -1)
++ goto out_htfree;
++
++ INIT_LIST_HEAD(&ls->ls_nodes);
++ ls->ls_num_nodes = 0;
++ INIT_LIST_HEAD(&ls->ls_nodes_gone);
++ INIT_LIST_HEAD(&ls->ls_recover);
++ spin_lock_init(&ls->ls_recover_lock);
++ INIT_LIST_HEAD(&ls->ls_recover_list);
++ ls->ls_recover_list_count = 0;
++ spin_lock_init(&ls->ls_recover_list_lock);
++ init_waitqueue_head(&ls->ls_wait_general);
++ INIT_LIST_HEAD(&ls->ls_requestqueue);
++ INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list);
++ ls->ls_last_stop = 0;
++ ls->ls_last_start = 0;
++ ls->ls_last_finish = 0;
++ ls->ls_rcom_msgid = 0;
++ init_MUTEX(&ls->ls_rcom_lock);
++ init_rwsem(&ls->ls_in_recovery);
++ init_rwsem(&ls->ls_unlock_sem);
++ init_rwsem(&ls->ls_rec_rsblist);
++ init_rwsem(&ls->ls_gap_rsblist);
++ down_write(&ls->ls_in_recovery);
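++ /* Hold the in_recovery lock from birth: request processing stays
++ * blocked until the first recovery cycle, triggered by CMAN's
++ * initial start event, completes and releases it. */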
++
++ for (i = 0; i < RESDIRHASH_SIZE; i++) {
++ INIT_LIST_HEAD(&ls->ls_resdir_hash[i].rb_reslist);
++ rwlock_init(&ls->ls_resdir_hash[i].rb_lock);
++ }
++
++ if (flags & DLM_LSF_NOTIMERS)
++ set_bit(LSFL_NOTIMERS, &ls->ls_flags);
++
++ /*
++ * Connect this lockspace with the cluster manager
++ */
++
++ error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM,
++ &ls_ops, TRUE, (void *) ls, &local_id);
++ if (error)
++ goto out_idtblfree;
++
++ ls->ls_state = LSST_INIT;
++ ls->ls_local_id = local_id;
++
++ spin_lock(&lslist_lock);
++ list_add(&ls->ls_list, &lslist);
++ spin_unlock(&lslist_lock);
++
++ error = kcl_join_service(local_id);
++ if (error) {
++ log_error(ls, "service manager join error %d", error);
++ goto out_reg;
++ }
++
++ /* The ls isn't actually running until it receives a start() from CMAN.
++ * Neither does it have a global ls id until started. */
++
++ /* Return the local ID as the lockspace handle. It is cast to a
++ void* so we can replace it with pretty much anything at a future
++ date without breaking clients. Returning the address of the
++ lockspace itself would be a bad idea: it could get forcibly
++ removed, leaving the client with a dangling pointer. */
++ *lockspace = (void *)local_id;
++
++ return 0;
++
++ out_reg:
++ kcl_unregister_service(ls->ls_local_id);
++
++ out_idtblfree:
++ free_lockidtbl(ls);
++
++ out_htfree:
++ kfree(ls->ls_reshashtbl);
++
++ out_lsfree:
++ free_ls(ls);
++
++ out:
++ /* drop the module reference taken at the top on all error paths */
++ module_put(THIS_MODULE);
++ return error;
++}
++
++/*
++ * Called by a system like GFS which wants independent lock spaces.
++ */
++
++int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags)
++{
++ int error = -ENOSYS;
++
++ down(&gdlmstate_lock);
++
++ error = init_internal();
++ if (error)
++ goto out;
++
++ error = new_lockspace(name, namelen, lockspace, flags);
++
++ out:
++ up(&gdlmstate_lock);
++
++ return error;
++}
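++/*
++ * Minimal usage sketch (hypothetical caller, error handling trimmed):
++ *
++ * void *ls;
++ * int error = dlm_new_lockspace("example", 7, &ls, 0);
++ * if (!error)
++ * ... pass ls as the opaque handle to lock/unlock calls ...
++ * dlm_release_lockspace(ls, 0);
++ *
++ * Note that -EEXIST also returns a usable handle for the existing
++ * lockspace.
++ */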
++
++/* Return 1 if the lockspace still has active remote locks,
++ * 2 if the lockspace still has active local locks.
++ */
++static int lockspace_busy(gd_ls_t *ls)
++{
++ int i;
++ int lkb_found = 0;
++ gd_lkb_t *lkb;
++
++ /* NOTE: We check the lockidtbl here rather than the resource table.
++ * This is because there may be LKBs queued for AST delivery that have
++ * been unlinked from their RSBs and are pending deletion once the AST
++ * has been delivered.
++ read_lock(&ls->ls_lockidtbl_lock);
++ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
++ if (!list_empty(&ls->ls_lockidtbl[i].list)) {
++ lkb_found = 1;
++ list_for_each_entry(lkb, &ls->ls_lockidtbl[i].list, lkb_idtbl_list) {
++ if (!lkb->lkb_nodeid) {
++ read_unlock(&ls->ls_lockidtbl_lock);
++ return 2;
++ }
++ }
++ }
++ }
++ read_unlock(&ls->ls_lockidtbl_lock);
++ return lkb_found;
++}
++
++/* Actually release the lockspace */
++static int release_lockspace(gd_ls_t *ls, int force)
++{
++ gd_lkb_t *lkb;
++ gd_res_t *rsb;
++ gd_recover_t *gr;
++ gd_csb_t *csb;
++ struct list_head *head;
++ int i;
++ int busy = lockspace_busy(ls);
++
++ /* Don't destroy a busy lockspace */
++ if (busy > force)
++ return -EBUSY;
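++ /* (busy is 0 for no LKBs, 1 for remote LKBs, 2 for local LKBs, so
++ * force=1 tolerates remote LKBs only and force>=2 tolerates both.) */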
++
++ if (force < 3) {
++ kcl_leave_service(ls->ls_local_id);
++ kcl_unregister_service(ls->ls_local_id);
++ }
++
++ spin_lock(&lslist_lock);
++ list_del(&ls->ls_list);
++ spin_unlock(&lslist_lock);
++
++ /*
++ * Free resdata structs.
++ */
++
++ resdir_clear(ls);
++
++ /*
++ * Free all lkb's on lockidtbl[] lists.
++ */
++
++ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
++ head = &ls->ls_lockidtbl[i].list;
++ while (!list_empty(head)) {
++ lkb = list_entry(head->next, gd_lkb_t, lkb_idtbl_list);
++ list_del(&lkb->lkb_idtbl_list);
++
++ if (lkb->lkb_lockqueue_state)
++ remove_from_lockqueue(lkb);
++
++ if (lkb->lkb_asts_to_deliver)
++ list_del(&lkb->lkb_astqueue);
++
++ if (lkb->lkb_lvbptr
++ && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
++ free_lvb(lkb->lkb_lvbptr);
++
++ free_lkb(lkb);
++ }
++ }
++
++ /*
++ * Free lkidtbl[] itself
++ */
++
++ kfree(ls->ls_lockidtbl);
++
++ /*
++ * Free all rsb's on reshashtbl[] lists
++ */
++
++ for (i = 0; i < ls->ls_hashsize; i++) {
++ head = &ls->ls_reshashtbl[i];
++ while (!list_empty(head)) {
++ rsb = list_entry(head->next, gd_res_t, res_hashchain);
++ list_del(&rsb->res_hashchain);
++
++ if (rsb->res_lvbptr)
++ free_lvb(rsb->res_lvbptr);
++
++ free_rsb(rsb);
++ }
++ }
++
++ /*
++ * Free reshashtbl[] itself
++ */
++
++ kfree(ls->ls_reshashtbl);
++
++ /*
++ * Free structures on any other lists
++ */
++
++ head = &ls->ls_recover;
++ while (!list_empty(head)) {
++ gr = list_entry(head->next, gd_recover_t, gr_list);
++ list_del(&gr->gr_list);
++ free_dlm_recover(gr);
++ }
++
++ head = &ls->ls_nodes;
++ while (!list_empty(head)) {
++ csb = list_entry(head->next, gd_csb_t, csb_list);
++ list_del(&csb->csb_list);
++ release_csb(csb);
++ }
++
++ head = &ls->ls_nodes_gone;
++ while (!list_empty(head)) {
++ csb = list_entry(head->next, gd_csb_t, csb_list);
++ list_del(&csb->csb_list);
++ release_csb(csb);
++ }
++
++ free_ls(ls);
++
++ dlm_release();
++
++ module_put(THIS_MODULE);
++ return 0;
++}
++
++
++/*
++ * Called when a system has released all its locks and is not going to use the
++ * lockspace any longer. We blindly free everything we're managing for this
++ * lockspace. Remaining nodes will go through the recovery process as if we'd
++ * died. The lockspace must continue to function as usual, participating in
++ * recoveries, until kcl_leave_service returns.
++ *
++ * Force has 4 possible values:
++ * 0 - don't destroy the lockspace if it has any LKBs
++ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
++ * 2 - destroy lockspace regardless of LKBs
++ * 3 - destroy lockspace as part of a forced shutdown
++ */
++
++int dlm_release_lockspace(void *lockspace, int force)
++{
++ gd_ls_t *ls;
++
++ ls = find_lockspace_by_local_id(lockspace);
++ if (!ls)
++ return -EINVAL;
++
++ return release_lockspace(ls, force);
++}
++
++
++/* Called when the cluster is being shut down dirtily */
++void dlm_emergency_shutdown(void)
++{
++ gd_ls_t *ls;
++ gd_ls_t *tmp;
++
++ /* Shut lowcomms down to prevent any socket activity */
++ lowcomms_stop_accept();
++
++ /* Delete the devices that belong to the userland
++ lockspaces to be deleted. */
++ dlm_device_free_devices();
++
++ /* Now try to clean the lockspaces */
++ spin_lock(&lslist_lock);
++
++ list_for_each_entry_safe(ls, tmp, &lslist, ls_list) {
++ spin_unlock(&lslist_lock);
++ release_lockspace(ls, 3);
++ spin_lock(&lslist_lock);
++ }
++
++ spin_unlock(&lslist_lock);
++}
++
++gd_recover_t *allocate_dlm_recover(void)
++{
++ gd_recover_t *gr;
++
++ gr = (gd_recover_t *) kmalloc(sizeof(gd_recover_t), GFP_KERNEL);
++ if (gr)
++ memset(gr, 0, sizeof(gd_recover_t));
++
++ return gr;
++}
++
++void free_dlm_recover(gd_recover_t * gr)
++{
++ kfree(gr);
++}
++
++/*
++ * Called by CMAN on a specific ls. "stop" means set flag which while set
++ * causes all new requests to ls to be queued and not submitted until flag is
++ * cleared. stop on a ls also needs to cancel any prior starts on the ls.
++ * The recoverd thread carries out any work called for by this event.
++ */
++
++static int dlm_ls_stop(void *servicedata)
++{
++ gd_ls_t *ls = (gd_ls_t *) servicedata;
++ int new;
++
++ spin_lock(&ls->ls_recover_lock);
++ ls->ls_last_stop = ls->ls_last_start;
++ set_bit(LSFL_LS_STOP, &ls->ls_flags);
++ new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags);
++ spin_unlock(&ls->ls_recover_lock);
++
++ /*
++ * This in_recovery lock does two things:
++ *
++ * 1) Keeps this function from returning until all threads are out
++ * of locking routines and locking is truly stopped.
++ * 2) Keeps any new requests from being processed until it's unlocked
++ * when recovery is complete.
++ */
++
++ if (new)
++ down_write(&ls->ls_in_recovery);
++
++ clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
++ clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
++ clear_bit(LSFL_NODES_VALID, &ls->ls_flags);
++ clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
++
++ recoverd_kick(ls);
++
++ return 0;
++}
++
++/*
++ * Called by CMAN on a specific ls. "start" means enable the lockspace to do
++ * request processing which first requires that the recovery procedure be
++ * stepped through with all nodes sharing the lockspace (nodeids). The first
++ * start on the ls after it's created is a special case and requires some extra
++ * work like figuring out our own local nodeid. We can't do all this in the
++ * calling CMAN context, so we must pass this work off to the recoverd thread
++ * which was created in gdlm_init(). The recoverd thread carries out any work
++ * called for by this event.
++ */
++
++static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count,
++ int event_id, int type)
++{
++ gd_ls_t *ls = (gd_ls_t *) servicedata;
++ gd_recover_t *gr;
++ int error = -ENOMEM;
++
++ gr = allocate_dlm_recover();
++ if (!gr)
++ goto out;
++
++ gr->gr_nodeids = nodeids;
++ gr->gr_node_count = count;
++ gr->gr_event_id = event_id;
++
++ spin_lock(&ls->ls_recover_lock);
++ ls->ls_last_start = event_id;
++ list_add_tail(&gr->gr_list, &ls->ls_recover);
++ set_bit(LSFL_LS_START, &ls->ls_flags);
++ spin_unlock(&ls->ls_recover_lock);
++
++ recoverd_kick(ls);
++ error = 0;
++
++ out:
++ return error;
++}
++
++/*
++ * Called by CMAN on a specific ls. "finish" means that all nodes which
++ * received a "start" have completed the start and called kcl_start_done.
++ * The recoverd thread carries out any work called for by this event.
++ */
++
++static void dlm_ls_finish(void *servicedata, int event_id)
++{
++ gd_ls_t *ls = (gd_ls_t *) servicedata;
++
++ spin_lock(&ls->ls_recover_lock);
++ ls->ls_last_finish = event_id;
++ set_bit(LSFL_LS_FINISH, &ls->ls_flags);
++ spin_unlock(&ls->ls_recover_lock);
++
++ recoverd_kick(ls);
++}
++
++struct kcl_service_ops ls_ops = {
++ .stop = dlm_ls_stop,
++ .start = dlm_ls_start,
++ .finish = dlm_ls_finish
++};
+diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h
+--- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lockspace.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOCKSPACE_DOT_H__
++#define __LOCKSPACE_DOT_H__
++
++void dlm_lockspace_init(void);
++int dlm_init(void);
++int dlm_release(void);
++int dlm_new_lockspace(char *name, int namelen, void **ls, int flags);
++int dlm_release_lockspace(void *ls, int force);
++gd_ls_t *find_lockspace_by_global_id(uint32_t id);
++gd_ls_t *find_lockspace_by_local_id(void *id);
++gd_ls_t *find_lockspace_by_name(char *name, int namelen);
++void free_dlm_recover(gd_recover_t *gr);
++int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out);
++void dlm_emergency_shutdown(void);
++
++#endif /* __LOCKSPACE_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c
+--- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lowcomms.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,1354 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * lowcomms.c
++ *
++ * This is the "low-level" comms layer.
++ *
++ * It is responsible for sending/receiving messages
++ * from other nodes in the cluster.
++ *
++ * Cluster nodes are referred to by their nodeids. nodeids are
++ * simply 32 bit numbers to the locking module - if they need to
++ * be expanded for the cluster infrastructure then that is its
++ * responsibility. It is this layer's responsibility to resolve
++ * these into IP addresses or whatever else it needs for
++ * inter-node communication.
++ *
++ * The comms level is two kernel threads that deal mainly with
++ * the receiving of messages from other nodes and passing them
++ * up to the mid-level comms layer (which understands the
++ * message format) for execution by the locking core, and
++ * a send thread which does all the setting up of connections
++ * to remote nodes and the sending of data. Threads are not allowed
++ * to send their own data because it may cause them to wait in times
++ * of high load. Also, this way, the sending thread can collect together
++ * messages bound for one node and send them in one block.
++ *
++ * I don't see any problem with the recv thread executing the locking
++ * code on behalf of remote processes as the locking code is
++ * short, efficient and never waits.
++ *
++ */
++
++
++#include <asm/ioctls.h>
++#include <net/sock.h>
++#include <net/tcp.h>
++#include <linux/pagemap.h>
++#include <cluster/cnxman.h>
++
++#include "dlm_internal.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "config.h"
++
++struct cbuf {
++ unsigned base;
++ unsigned len;
++ unsigned mask;
++};
++
++#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
++#define CBUF_ADD(cb, n) do { (cb)->len += (n); } while(0)
++#define CBUF_EMPTY(cb) ((cb)->len == 0)
++#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
++#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \
++ (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
++#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
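++/*
++ * A tiny circular-buffer helper. The size handed to CBUF_INIT must be
++ * a power of two for the mask-based wraparound in CBUF_EAT/CBUF_DATA
++ * to work; CBUF_DATA yields the offset of the next free byte.
++ */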
++
++struct connection {
++ struct socket *sock; /* NULL if not connected */
++ uint32_t nodeid; /* So we know who we are in the list */
++ struct rw_semaphore sock_sem; /* Stop connect races */
++ struct list_head read_list; /* On this list when ready for reading */
++ struct list_head write_list; /* On this list when ready for writing */
++ struct list_head state_list; /* On this list when ready to connect */
++ unsigned long flags; /* bit 1,2 = We are on the read/write lists */
++#define CF_READ_PENDING 1
++#define CF_WRITE_PENDING 2
++#define CF_CONNECT_PENDING 3
++#define CF_IS_OTHERSOCK 4
++ struct list_head writequeue; /* List of outgoing writequeue_entries */
++ struct list_head listenlist; /* List of allocated listening sockets */
++ spinlock_t writequeue_lock;
++ int (*rx_action) (struct connection *); /* What to do when active */
++ struct page *rx_page;
++ struct cbuf cb;
++ int retries;
++#define MAX_CONNECT_RETRIES 3
++ struct connection *othersock;
++};
++#define sock2con(x) ((struct connection *)(x)->sk_user_data)
++#define nodeid2con(x) (&connections[(x)])
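++/*
++ * sk_user_data carries the connection back-pointer so the socket
++ * callbacks below can find their struct connection; nodeid2con relies
++ * on connections[] being indexed directly by nodeid.
++ */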
++
++/* An entry waiting to be sent */
++struct writequeue_entry {
++ struct list_head list;
++ struct page *page;
++ int offset;
++ int len;
++ int end;
++ int users;
++ struct connection *con;
++};
++
++/* "Template" structure for IPv4 and IPv6 used to fill
++ * in the missing bits when converting between cman (which knows
++ * nothing about sockaddr structs) and real life where we actually
++ * have to connect to these addresses. Also one of these structs
++ * will hold the cached "us" address.
++ *
++ * It's an in6 sockaddr just so there's enough space for anything
++ * we're likely to see here.
++ */
++static struct sockaddr_in6 local_addr;
++
++/* Manage daemons */
++static struct semaphore thread_lock;
++static struct completion thread_completion;
++static atomic_t send_run;
++static atomic_t recv_run;
++
++/* An array of connections, indexed by NODEID */
++static struct connection *connections;
++static int conn_array_size;
++static atomic_t writequeue_length;
++static atomic_t accepting;
++
++static wait_queue_t lowcomms_send_waitq_head;
++static wait_queue_head_t lowcomms_send_waitq;
++
++static wait_queue_t lowcomms_recv_waitq_head;
++static wait_queue_head_t lowcomms_recv_waitq;
++
++/* List of sockets that have reads pending */
++static struct list_head read_sockets;
++static spinlock_t read_sockets_lock;
++
++/* List of sockets which have writes pending */
++static struct list_head write_sockets;
++static spinlock_t write_sockets_lock;
++
++/* List of sockets which have connects pending */
++static struct list_head state_sockets;
++static spinlock_t state_sockets_lock;
++
++/* List of allocated listen sockets */
++static struct list_head listen_sockets;
++
++static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr);
++static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len);
++
++
++/* Data available on socket or listen socket received a connect */
++static void lowcomms_data_ready(struct sock *sk, int count_unused)
++{
++ struct connection *con = sock2con(sk);
++
++ if (test_and_set_bit(CF_READ_PENDING, &con->flags))
++ return;
++
++ spin_lock_bh(&read_sockets_lock);
++ list_add_tail(&con->read_list, &read_sockets);
++ spin_unlock_bh(&read_sockets_lock);
++
++ wake_up_interruptible(&lowcomms_recv_waitq);
++}
++
++static void lowcomms_write_space(struct sock *sk)
++{
++ struct connection *con = sock2con(sk);
++
++ if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
++ return;
++
++ spin_lock_bh(&write_sockets_lock);
++ list_add_tail(&con->write_list, &write_sockets);
++ spin_unlock_bh(&write_sockets_lock);
++
++ wake_up_interruptible(&lowcomms_send_waitq);
++}
++
++static inline void lowcomms_connect_sock(struct connection *con)
++{
++ if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
++ return;
++ if (!atomic_read(&accepting))
++ return;
++
++ spin_lock_bh(&state_sockets_lock);
++ list_add_tail(&con->state_list, &state_sockets);
++ spin_unlock_bh(&state_sockets_lock);
++
++ wake_up_interruptible(&lowcomms_send_waitq);
++}
++
++static void lowcomms_state_change(struct sock *sk)
++{
++/* struct connection *con = sock2con(sk); */
++
++ switch (sk->sk_state) {
++ case TCP_ESTABLISHED:
++ lowcomms_write_space(sk);
++ break;
++
++ case TCP_FIN_WAIT1:
++ case TCP_FIN_WAIT2:
++ case TCP_TIME_WAIT:
++ case TCP_CLOSE:
++ case TCP_CLOSE_WAIT:
++ case TCP_LAST_ACK:
++ case TCP_CLOSING:
++ /* FIXME: I think this causes more trouble than it solves.
++		   lowcomms will reconnect anyway when there is something to
++ send. This just attempts reconnection if a node goes down!
++ */
++ /* lowcomms_connect_sock(con); */
++ break;
++
++ default:
++ printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
++ break;
++ }
++}
++
++/* Make a socket active */
++static int add_sock(struct socket *sock, struct connection *con)
++{
++ con->sock = sock;
++
++ /* Install a data_ready callback */
++ con->sock->sk->sk_data_ready = lowcomms_data_ready;
++ con->sock->sk->sk_write_space = lowcomms_write_space;
++ con->sock->sk->sk_state_change = lowcomms_state_change;
++
++ return 0;
++}
++
++/* Add the port number to an IPv6 or IPv4 sockaddr and return the
++   address length */
++static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port,
++ int *addr_len)
++{
++ saddr->sin6_family = local_addr.sin6_family;
++ if (local_addr.sin6_family == AF_INET) {
++ struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
++ in4_addr->sin_port = cpu_to_be16(port);
++ *addr_len = sizeof(struct sockaddr_in);
++ }
++ else {
++ saddr->sin6_port = cpu_to_be16(port);
++ *addr_len = sizeof(struct sockaddr_in6);
++ }
++}
++
++/* Close a remote connection and tidy up */
++static void close_connection(struct connection *con)
++{
++ if (test_bit(CF_IS_OTHERSOCK, &con->flags))
++ return;
++
++ down_write(&con->sock_sem);
++
++ if (con->sock) {
++ sock_release(con->sock);
++ con->sock = NULL;
++ if (con->othersock) {
++ down_write(&con->othersock->sock_sem);
++ sock_release(con->othersock->sock);
++ con->othersock->sock = NULL;
++ up_write(&con->othersock->sock_sem);
++ kfree(con->othersock);
++ con->othersock = NULL;
++ }
++ }
++ if (con->rx_page) {
++ __free_page(con->rx_page);
++ con->rx_page = NULL;
++ }
++ up_write(&con->sock_sem);
++}
++
++/* Data received from remote end */
++static int receive_from_sock(struct connection *con)
++{
++ int ret = 0;
++ struct msghdr msg;
++ struct iovec iov[2];
++ mm_segment_t fs;
++ unsigned len;
++ int r;
++ int call_again_soon = 0;
++
++ down_read(&con->sock_sem);
++
++ if (con->sock == NULL)
++ goto out;
++ if (con->rx_page == NULL) {
++ /*
++ * This doesn't need to be atomic, but I think it should
++ * improve performance if it is.
++ */
++ con->rx_page = alloc_page(GFP_ATOMIC);
++ if (con->rx_page == NULL)
++ goto out_resched;
++ CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
++ }
++	/*
++	 * To avoid doing too many short reads, we will reschedule for
++	 * another time if there are less than 32 bytes left in the buffer.
++	 */
++ if (!CBUF_MAY_ADD(&con->cb, 32))
++ goto out_resched;
++
++ msg.msg_control = NULL;
++ msg.msg_controllen = 0;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = iov;
++ msg.msg_name = NULL;
++ msg.msg_namelen = 0;
++ msg.msg_flags = 0;
++
++	/*
++	 * iov[0] is the free space starting at the current end point
++	 * (cb.base + cb.len); if the free region doesn't wrap it runs
++	 * up to cb.base, otherwise it runs to the end of the buffer.
++	 */
++ iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
++ iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
++ iov[1].iov_len = 0;
++
++	/*
++	 * When the free region wraps, iov[1] covers the start of the
++	 * buffer up to the start of the currently used section (cb.base).
++	 */
++ if (CBUF_DATA(&con->cb) >= con->cb.base) {
++ iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
++ iov[1].iov_len = con->cb.base;
++ iov[1].iov_base = page_address(con->rx_page);
++ msg.msg_iovlen = 2;
++ }
++ len = iov[0].iov_len + iov[1].iov_len;
++
++ fs = get_fs();
++ set_fs(get_ds());
++ r = ret = sock_recvmsg(con->sock, &msg, len,
++ MSG_DONTWAIT | MSG_NOSIGNAL);
++ set_fs(fs);
++
++ if (ret <= 0)
++ goto out_close;
++ if (ret == len)
++ call_again_soon = 1;
++ CBUF_ADD(&con->cb, ret);
++ ret = midcomms_process_incoming_buffer(con->nodeid,
++ page_address(con->rx_page),
++ con->cb.base, con->cb.len,
++ PAGE_CACHE_SIZE);
++ if (ret == -EBADMSG) {
++ printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
++ "iov_len=%u, iov_base[0]=%p, read=%d\n",
++ page_address(con->rx_page), con->cb.base, con->cb.len,
++ len, iov[0].iov_base, r);
++ }
++ if (ret < 0)
++ goto out_close;
++ CBUF_EAT(&con->cb, ret);
++
++ if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
++ __free_page(con->rx_page);
++ con->rx_page = NULL;
++ }
++ out:
++ if (call_again_soon)
++ goto out_resched;
++ up_read(&con->sock_sem);
++ ret = 0;
++ goto out_ret;
++
++ out_resched:
++ lowcomms_data_ready(con->sock->sk, 0);
++ up_read(&con->sock_sem);
++ ret = 0;
++ goto out_ret;
++
++ out_close:
++ up_read(&con->sock_sem);
++ if (ret != -EAGAIN && !test_bit(CF_IS_OTHERSOCK, &con->flags)) {
++ close_connection(con);
++ lowcomms_connect_sock(con);
++ }
++
++ out_ret:
++ return ret;
++}
++
++/* Listening socket is busy, accept a connection */
++static int accept_from_sock(struct connection *con)
++{
++ int result;
++ struct sockaddr_in6 peeraddr;
++ struct socket *newsock;
++ int len;
++ int nodeid;
++ struct connection *newcon;
++
++ memset(&peeraddr, 0, sizeof(peeraddr));
++ newsock = sock_alloc();
++ if (!newsock)
++ return -ENOMEM;
++
++ down_read(&con->sock_sem);
++
++ result = -ENOTCONN;
++ if (con->sock == NULL)
++ goto accept_err;
++
++ newsock->type = con->sock->type;
++ newsock->ops = con->sock->ops;
++
++ result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
++ if (result < 0)
++ goto accept_err;
++
++ /* Get the connected socket's peer */
++ if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
++ &len, 2)) {
++ result = -ECONNABORTED;
++ goto accept_err;
++ }
++
++ /* Get the new node's NODEID */
++ nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len);
++ if (nodeid == 0) {
++ printk("dlm: connect from non cluster node\n");
++ sock_release(newsock);
++ up_read(&con->sock_sem);
++ return -1;
++ }
++
++ log_print("got connection from %d", nodeid);
++
++ /* Check to see if we already have a connection to this node. This
++ * could happen if the two nodes initiate a connection at roughly
++ * the same time and the connections cross on the wire.
++ * TEMPORARY FIX:
++ * In this case we store the incoming one in "othersock"
++ */
++ newcon = nodeid2con(nodeid);
++ down_write(&newcon->sock_sem);
++ if (newcon->sock) {
++ struct connection *othercon;
++
++ othercon = kmalloc(sizeof(struct connection), GFP_KERNEL);
++ if (!othercon) {
++ printk("dlm: failed to allocate incoming socket\n");
++ sock_release(newsock);
++ up_write(&newcon->sock_sem);
++ up_read(&con->sock_sem);
++ goto accept_out;
++ }
++ memset(othercon, 0, sizeof(*othercon));
++ newcon->othersock = othercon;
++ othercon->nodeid = nodeid;
++ othercon->sock = newsock;
++ othercon->rx_action = receive_from_sock;
++ add_sock(newsock, othercon);
++ init_rwsem(&othercon->sock_sem);
++ set_bit(CF_IS_OTHERSOCK, &othercon->flags);
++ newsock->sk->sk_user_data = othercon;
++
++ up_write(&newcon->sock_sem);
++ lowcomms_data_ready(newsock->sk, 0);
++ up_read(&con->sock_sem);
++ goto accept_out;
++ }
++
++ newsock->sk->sk_user_data = newcon;
++ newcon->rx_action = receive_from_sock;
++ add_sock(newsock, newcon);
++ up_write(&newcon->sock_sem);
++
++	/*
++	 * Add it to the active queue in case we got data
++	 * between processing the accept and adding the socket
++	 * to the read_sockets list
++	 */
++ lowcomms_data_ready(newsock->sk, 0);
++
++ up_read(&con->sock_sem);
++
++ accept_out:
++ return 0;
++
++ accept_err:
++ up_read(&con->sock_sem);
++ sock_release(newsock);
++
++ printk("dlm: error accepting connection from node: %d\n", result);
++ return result;
++}
++
++/* Connect a new socket to its peer */
++static int connect_to_sock(struct connection *con)
++{
++ int result = -EHOSTUNREACH;
++ struct sockaddr_in6 saddr;
++ int addr_len;
++ struct socket *sock;
++
++ if (con->nodeid == 0) {
++ log_print("attempt to connect sock 0 foiled");
++ return 0;
++ }
++
++ down_write(&con->sock_sem);
++ if (con->retries++ > MAX_CONNECT_RETRIES)
++ goto out;
++
++	/* FIXME: not sure this should happen, let alone like this. */
++ if (con->sock) {
++ sock_release(con->sock);
++ con->sock = NULL;
++ }
++
++ /* Create a socket to communicate with */
++ result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
++ if (result < 0)
++ goto out_err;
++
++ if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0)
++ goto out_err;
++
++ sock->sk->sk_user_data = con;
++ con->rx_action = receive_from_sock;
++
++ make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
++
++ add_sock(sock, con);
++ result =
++ sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len,
++ O_NONBLOCK);
++ if (result == -EINPROGRESS)
++ result = 0;
++ if (result != 0)
++ goto out_err;
++
++ out:
++ up_write(&con->sock_sem);
++ /*
++ * Returning an error here means we've given up trying to connect to
++	 * a remote node, otherwise we return 0 and reschedule the connection
++ * attempt
++ */
++ return result;
++
++ out_err:
++ if (con->sock) {
++ sock_release(con->sock);
++ con->sock = NULL;
++ }
++ /*
++ * Some errors are fatal and this list might need adjusting. For other
++ * errors we try again until the max number of retries is reached.
++ */
++	if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
++	    result != -ENETDOWN && result != -EINVAL &&
++	    result != -EPROTONOSUPPORT) {
++ lowcomms_connect_sock(con);
++ result = 0;
++ }
++ goto out;
++}
++
++static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len)
++{
++ struct socket *sock = NULL;
++ mm_segment_t fs;
++ int result = 0;
++ int one = 1;
++ struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr;
++
++ /* Create a socket to communicate with */
++ result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
++ if (result < 0) {
++ printk("dlm: Can't create listening comms socket\n");
++ goto create_out;
++ }
++
++ fs = get_fs();
++ set_fs(get_ds());
++ result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
++ set_fs(fs);
++ if (result < 0) {
++ printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
++ }
++ sock->sk->sk_user_data = con;
++ con->rx_action = accept_from_sock;
++ con->sock = sock;
++
++ /* Bind to our port */
++ make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
++ result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
++ if (result < 0) {
++ printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
++ sock_release(sock);
++ sock = NULL;
++ goto create_out;
++ }
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
++ set_fs(fs);
++ if (result < 0) {
++ printk("dlm: Set keepalive failed: %d\n", result);
++ }
++
++ result = sock->ops->listen(sock, 5);
++ if (result < 0) {
++ printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
++ sock_release(sock);
++ sock = NULL;
++ goto create_out;
++ }
++
++ create_out:
++ return sock;
++}
++
++
++/* Listen on all interfaces */
++static int listen_for_all(void)
++{
++ int result = 0;
++ int nodeid;
++ struct socket *sock = NULL;
++ struct list_head *addr_list;
++ struct connection *con = nodeid2con(0);
++ struct cluster_node_addr *node_addr;
++	char listen_addr[sizeof(struct sockaddr_in6)];
++
++	/* This will also fill in the static local_addr template */
++ nodeid = lowcomms_our_nodeid();
++
++ addr_list = kcl_get_node_addresses(nodeid);
++ if (!addr_list) {
++ printk("dlm: cannot initialise comms layer\n");
++ result = -ENOTCONN;
++ goto create_out;
++ }
++
++ list_for_each_entry(node_addr, addr_list, list) {
++
++ if (!con) {
++ con = kmalloc(sizeof(struct connection), GFP_KERNEL);
++ if (!con) {
++				printk("dlm: failed to allocate listen socket\n");
++				result = -ENOMEM;
++				goto create_out;
++ }
++ memset(con, 0, sizeof(*con));
++ init_rwsem(&con->sock_sem);
++ spin_lock_init(&con->writequeue_lock);
++ INIT_LIST_HEAD(&con->writequeue);
++ set_bit(CF_IS_OTHERSOCK, &con->flags);
++ }
++
++		memcpy(listen_addr, node_addr->addr, node_addr->addr_len);
++		sock = create_listen_sock(con, listen_addr,
++					  node_addr->addr_len);
++		if (sock) {
++			add_sock(sock, con);
++
++			/* Keep a list of dynamically allocated listening
++			   sockets so we can free them at shutdown */
++			if (test_bit(CF_IS_OTHERSOCK, &con->flags))
++				list_add_tail(&con->listenlist,
++					      &listen_sockets);
++		}
++		else {
++			/* Only the dynamically allocated connections may
++			   be freed, and con must not be touched after
++			   the kfree() */
++			if (test_bit(CF_IS_OTHERSOCK, &con->flags))
++				kfree(con);
++		}
++		con = NULL;
++	}
++
++ create_out:
++ return result;
++}
++
++
++
++static struct writequeue_entry *new_writequeue_entry(struct connection *con,
++ int allocation)
++{
++ struct writequeue_entry *entry;
++
++ entry = kmalloc(sizeof(struct writequeue_entry), allocation);
++ if (!entry)
++ return NULL;
++
++ entry->page = alloc_page(allocation);
++ if (!entry->page) {
++ kfree(entry);
++ return NULL;
++ }
++
++ entry->offset = 0;
++ entry->len = 0;
++ entry->end = 0;
++ entry->users = 0;
++ entry->con = con;
++
++ return entry;
++}
++
++struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
++ int allocation, char **ppc)
++{
++ struct connection *con = nodeid2con(nodeid);
++ struct writequeue_entry *e;
++ int offset = 0;
++ int users = 0;
++
++ if (!atomic_read(&accepting))
++ return NULL;
++
++ spin_lock(&con->writequeue_lock);
++ e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
++ if (((struct list_head *) e == &con->writequeue) ||
++ (PAGE_CACHE_SIZE - e->end < len)) {
++ e = NULL;
++ } else {
++ offset = e->end;
++ e->end += len;
++ users = e->users++;
++ }
++ spin_unlock(&con->writequeue_lock);
++
++ if (e) {
++ got_one:
++ if (users == 0)
++ kmap(e->page);
++ *ppc = page_address(e->page) + offset;
++ return e;
++ }
++
++ e = new_writequeue_entry(con, allocation);
++ if (e) {
++ spin_lock(&con->writequeue_lock);
++ offset = e->end;
++ e->end += len;
++ users = e->users++;
++ list_add_tail(&e->list, &con->writequeue);
++ spin_unlock(&con->writequeue_lock);
++ atomic_inc(&writequeue_length);
++ goto got_one;
++ }
++ return NULL;
++}
++
++void lowcomms_commit_buffer(struct writequeue_entry *e)
++{
++ struct connection *con = e->con;
++ int users;
++
++ if (!atomic_read(&accepting))
++ return;
++
++ spin_lock(&con->writequeue_lock);
++ users = --e->users;
++ if (users)
++ goto out;
++ e->len = e->end - e->offset;
++ kunmap(e->page);
++ spin_unlock(&con->writequeue_lock);
++
++ if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
++ spin_lock_bh(&write_sockets_lock);
++ list_add_tail(&con->write_list, &write_sockets);
++ spin_unlock_bh(&write_sockets_lock);
++
++ wake_up_interruptible(&lowcomms_send_waitq);
++ }
++ return;
++
++ out:
++ spin_unlock(&con->writequeue_lock);
++ return;
++}
++
++static void free_entry(struct writequeue_entry *e)
++{
++ __free_page(e->page);
++ kfree(e);
++ atomic_dec(&writequeue_length);
++}
++
++/* Send a message */
++static int send_to_sock(struct connection *con)
++{
++ int ret = 0;
++ ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
++ const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
++ struct writequeue_entry *e;
++ int len, offset;
++
++ down_read(&con->sock_sem);
++ if (con->sock == NULL)
++ goto out_connect;
++
++ sendpage = con->sock->ops->sendpage;
++
++ spin_lock(&con->writequeue_lock);
++ for (;;) {
++ e = list_entry(con->writequeue.next, struct writequeue_entry,
++ list);
++ if ((struct list_head *) e == &con->writequeue)
++ break;
++
++ len = e->len;
++ offset = e->offset;
++ BUG_ON(len == 0 && e->users == 0);
++ spin_unlock(&con->writequeue_lock);
++
++ ret = 0;
++ if (len) {
++ ret = sendpage(con->sock, e->page, offset, len,
++ msg_flags);
++ if (ret == -EAGAIN || ret == 0)
++ goto out;
++ if (ret <= 0)
++ goto send_error;
++ }
++
++ spin_lock(&con->writequeue_lock);
++ e->offset += ret;
++ e->len -= ret;
++
++ if (e->len == 0 && e->users == 0) {
++ list_del(&e->list);
++ free_entry(e);
++ continue;
++ }
++ }
++ spin_unlock(&con->writequeue_lock);
++ out:
++ up_read(&con->sock_sem);
++ return ret;
++
++ send_error:
++ up_read(&con->sock_sem);
++ close_connection(con);
++ lowcomms_connect_sock(con);
++ return ret;
++
++ out_connect:
++ up_read(&con->sock_sem);
++ lowcomms_connect_sock(con);
++ return 0;
++}
++
++/* Called from recoverd when it knows that a node has
++ left the cluster */
++int lowcomms_close(int nodeid)
++{
++ struct connection *con;
++
++ if (!connections)
++ goto out;
++
++ con = nodeid2con(nodeid);
++ if (con->sock) {
++ close_connection(con);
++ return 0;
++ }
++
++ out:
++ return -1;
++}
++
++/* API send message call, may queue the request */
++/* N.B. This is the old interface - use the new one for new calls */
++int lowcomms_send_message(int nodeid, char *buf, int len, int allocation)
++{
++ struct writequeue_entry *e;
++ char *b;
++
++ GDLM_ASSERT(nodeid < dlm_config.max_connections,
++ printk("nodeid=%u\n", nodeid););
++
++ e = lowcomms_get_buffer(nodeid, len, allocation, &b);
++ if (e) {
++ memcpy(b, buf, len);
++ lowcomms_commit_buffer(e);
++ return 0;
++ }
++ return -ENOBUFS;
++}
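++
++/*
++ * Illustrative sketch of the newer two-phase interface, mirroring
++ * lowcomms_send_message() above. This is not part of the original
++ * patch: example_send is a hypothetical caller and the block is
++ * compiled out.
++ */
++#if 0
++static int example_send(int nodeid, char *msg, int len)
++{
++	struct writequeue_entry *e;
++	char *p;
++
++	/* Reserve len bytes in the node's write queue page */
++	e = lowcomms_get_buffer(nodeid, len, GFP_KERNEL, &p);
++	if (!e)
++		return -ENOBUFS;
++
++	memcpy(p, msg, len);		/* build the message in place */
++	lowcomms_commit_buffer(e);	/* queue it; dlm_sendd sends it */
++	return 0;
++}
++#endif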
++
++/* Look for activity on active sockets */
++static void process_sockets(void)
++{
++ struct list_head *list;
++ struct list_head *temp;
++
++ spin_lock_bh(&read_sockets_lock);
++ list_for_each_safe(list, temp, &read_sockets) {
++ struct connection *con =
++ list_entry(list, struct connection, read_list);
++ list_del(&con->read_list);
++ clear_bit(CF_READ_PENDING, &con->flags);
++
++ spin_unlock_bh(&read_sockets_lock);
++
++ con->rx_action(con);
++
++ /* Don't starve out everyone else */
++ schedule();
++ spin_lock_bh(&read_sockets_lock);
++ }
++ spin_unlock_bh(&read_sockets_lock);
++}
++
++/* Try to send any messages that are pending
++ */
++static void process_output_queue(void)
++{
++	struct list_head *list;
++	struct list_head *temp;
++
++ spin_lock_bh(&write_sockets_lock);
++ list_for_each_safe(list, temp, &write_sockets) {
++ struct connection *con =
++ list_entry(list, struct connection, write_list);
++ list_del(&con->write_list);
++ clear_bit(CF_WRITE_PENDING, &con->flags);
++
++ spin_unlock_bh(&write_sockets_lock);
++
++		/* send_to_sock() handles errors itself by closing and
++		   rescheduling the connection, so ignore the result */
++		send_to_sock(con);
++ spin_lock_bh(&write_sockets_lock);
++ }
++ spin_unlock_bh(&write_sockets_lock);
++}
++
++static void process_state_queue(void)
++{
++	struct list_head *list;
++	struct list_head *temp;
++
++ spin_lock_bh(&state_sockets_lock);
++ list_for_each_safe(list, temp, &state_sockets) {
++ struct connection *con =
++ list_entry(list, struct connection, state_list);
++ list_del(&con->state_list);
++ clear_bit(CF_CONNECT_PENDING, &con->flags);
++ spin_unlock_bh(&state_sockets_lock);
++
++		/* connect_to_sock() retries or gives up internally,
++		   so ignore the result */
++		connect_to_sock(con);
++ spin_lock_bh(&state_sockets_lock);
++ }
++ spin_unlock_bh(&state_sockets_lock);
++}
++
++/* Discard all entries on the write queues */
++static void clean_writequeues(void)
++{
++ struct list_head *list;
++ struct list_head *temp;
++ int nodeid;
++
++ for (nodeid = 1; nodeid < dlm_config.max_connections; nodeid++) {
++ struct connection *con = nodeid2con(nodeid);
++
++ spin_lock(&con->writequeue_lock);
++ list_for_each_safe(list, temp, &con->writequeue) {
++ struct writequeue_entry *e =
++ list_entry(list, struct writequeue_entry, list);
++ list_del(&e->list);
++ free_entry(e);
++ }
++ spin_unlock(&con->writequeue_lock);
++ }
++}
++
++static int read_list_empty(void)
++{
++ int status;
++
++ spin_lock_bh(&read_sockets_lock);
++ status = list_empty(&read_sockets);
++ spin_unlock_bh(&read_sockets_lock);
++
++ return status;
++}
++
++/* DLM Transport comms receive daemon */
++static int dlm_recvd(void *data)
++{
++ daemonize("dlm_recvd");
++ atomic_set(&recv_run, 1);
++
++ init_waitqueue_head(&lowcomms_recv_waitq);
++ init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
++ add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
++
++ complete(&thread_completion);
++
++ while (atomic_read(&recv_run)) {
++
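++		/* Lost-wakeup-safe wait: mark ourselves INTERRUPTIBLE
++		   before testing the list, so a wake-up between the
++		   test and schedule() just leaves us runnable */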
++ set_task_state(current, TASK_INTERRUPTIBLE);
++
++ if (read_list_empty())
++ schedule();
++
++ set_task_state(current, TASK_RUNNING);
++
++ process_sockets();
++ }
++
++ down(&thread_lock);
++ up(&thread_lock);
++
++ complete(&thread_completion);
++
++ return 0;
++}
++
++static int write_and_state_lists_empty(void)
++{
++ int status;
++
++ spin_lock_bh(&write_sockets_lock);
++ status = list_empty(&write_sockets);
++ spin_unlock_bh(&write_sockets_lock);
++
++ spin_lock_bh(&state_sockets_lock);
++ if (list_empty(&state_sockets) == 0)
++ status = 0;
++ spin_unlock_bh(&state_sockets_lock);
++
++ return status;
++}
++
++/* DLM Transport send daemon */
++static int dlm_sendd(void *data)
++{
++ daemonize("dlm_sendd");
++ atomic_set(&send_run, 1);
++
++ init_waitqueue_head(&lowcomms_send_waitq);
++ init_waitqueue_entry(&lowcomms_send_waitq_head, current);
++ add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
++
++ complete(&thread_completion);
++
++ while (atomic_read(&send_run)) {
++
++ set_task_state(current, TASK_INTERRUPTIBLE);
++
++ if (write_and_state_lists_empty())
++ schedule();
++
++ set_task_state(current, TASK_RUNNING);
++
++ process_state_queue();
++ process_output_queue();
++ }
++
++ down(&thread_lock);
++ up(&thread_lock);
++
++ complete(&thread_completion);
++
++ return 0;
++}
++
++static void daemons_stop(void)
++{
++ if (atomic_read(&recv_run)) {
++ down(&thread_lock);
++ atomic_set(&recv_run, 0);
++ wake_up_interruptible(&lowcomms_recv_waitq);
++ up(&thread_lock);
++ wait_for_completion(&thread_completion);
++ }
++
++ if (atomic_read(&send_run)) {
++ down(&thread_lock);
++ atomic_set(&send_run, 0);
++ wake_up_interruptible(&lowcomms_send_waitq);
++ up(&thread_lock);
++ wait_for_completion(&thread_completion);
++ }
++}
++
++static int daemons_start(void)
++{
++ int error;
++
++ error = kernel_thread(dlm_recvd, NULL, 0);
++ if (error < 0) {
++ log_print("can't start recvd thread: %d", error);
++ goto out;
++ }
++ wait_for_completion(&thread_completion);
++
++ error = kernel_thread(dlm_sendd, NULL, 0);
++ if (error < 0) {
++ log_print("can't start sendd thread: %d", error);
++ daemons_stop();
++ goto out;
++ }
++ wait_for_completion(&thread_completion);
++
++ error = 0;
++ out:
++ return error;
++}
++
++/*
++ * Return the largest buffer size we can cope with.
++ */
++int lowcomms_max_buffer_size(void)
++{
++ return PAGE_CACHE_SIZE;
++}
++
++void lowcomms_stop(void)
++{
++ int i;
++ struct connection *temp;
++ struct connection *lcon;
++
++ atomic_set(&accepting, 0);
++
++	/* Set the pending flags (CF_READ/WRITE/CONNECT_PENDING,
++	   i.e. bits 1-3) on every connection to prevent any
++	   further socket activity.
++	*/
++	for (i = 0; i < conn_array_size; i++) {
++		connections[i].flags = 0xE;
++	}
++ }
++ daemons_stop();
++ clean_writequeues();
++
++ for (i = 0; i < conn_array_size; i++) {
++ close_connection(nodeid2con(i));
++ }
++
++ kfree(connections);
++ connections = NULL;
++
++ /* Free up any dynamically allocated listening sockets */
++ list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
++ sock_release(lcon->sock);
++ kfree(lcon);
++ }
++
++ kcl_releaseref_cluster();
++}
++
++/* This is quite likely to sleep... */
++int lowcomms_start(void)
++{
++ int error = 0;
++ int i;
++
++ INIT_LIST_HEAD(&read_sockets);
++ INIT_LIST_HEAD(&write_sockets);
++ INIT_LIST_HEAD(&state_sockets);
++ INIT_LIST_HEAD(&listen_sockets);
++
++ spin_lock_init(&read_sockets_lock);
++ spin_lock_init(&write_sockets_lock);
++ spin_lock_init(&state_sockets_lock);
++
++ init_completion(&thread_completion);
++ init_MUTEX(&thread_lock);
++ atomic_set(&send_run, 0);
++ atomic_set(&recv_run, 0);
++
++ error = -ENOTCONN;
++ if (kcl_addref_cluster())
++ goto out;
++
++ /*
++ * Temporarily initialise the waitq head so that lowcomms_send_message
++ * doesn't crash if it gets called before the thread is fully
++ * initialised
++ */
++ init_waitqueue_head(&lowcomms_send_waitq);
++
++ error = -ENOMEM;
++
++ connections = kmalloc(sizeof(struct connection) *
++ dlm_config.max_connections, GFP_KERNEL);
++ if (!connections)
++ goto out;
++
++ memset(connections, 0,
++ sizeof(struct connection) * dlm_config.max_connections);
++ for (i = 0; i < dlm_config.max_connections; i++) {
++ connections[i].nodeid = i;
++ init_rwsem(&connections[i].sock_sem);
++ INIT_LIST_HEAD(&connections[i].writequeue);
++ spin_lock_init(&connections[i].writequeue_lock);
++ }
++ conn_array_size = dlm_config.max_connections;
++
++ /* Start listening */
++ error = listen_for_all();
++ if (error)
++ goto fail_free_conn;
++
++ error = daemons_start();
++ if (error)
++ goto fail_free_conn;
++
++ atomic_set(&accepting, 1);
++
++ return 0;
++
++ fail_free_conn:
++ kfree(connections);
++
++ out:
++ return error;
++}
++
++/* Don't accept any more outgoing work */
++void lowcomms_stop_accept(void)
++{
++ atomic_set(&accepting, 0);
++}
++
++/* Cluster Manager interface functions for looking up
++ nodeids and IP addresses by each other
++*/
++
++/* Return the IP address of a node given its NODEID */
++static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr)
++{
++ struct list_head *addrs;
++ struct cluster_node_addr *node_addr;
++ struct cluster_node_addr *current_addr = NULL;
++ struct sockaddr_in6 *saddr;
++ int interface;
++ int i;
++
++ addrs = kcl_get_node_addresses(nodeid);
++ if (!addrs)
++ return -1;
++
++ interface = kcl_get_current_interface();
++
++ /* Look for address number <interface> */
++ i=0; /* i/f numbers start at 1 */
++ list_for_each_entry(node_addr, addrs, list) {
++ if (interface == ++i) {
++ current_addr = node_addr;
++ break;
++ }
++ }
++
++ /* If that failed then just use the first one */
++ if (!current_addr)
++ current_addr = (struct cluster_node_addr *)addrs->next;
++
++ saddr = (struct sockaddr_in6 *)current_addr->addr;
++
++ /* Extract the IP address */
++ if (saddr->sin6_family == AF_INET) {
++ struct sockaddr_in *in4 = (struct sockaddr_in *)saddr;
++ struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr;
++ ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
++ }
++ else {
++ struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr;
++ memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr));
++ }
++
++ return 0;
++}
++
++/* Return the NODEID for a node given its sockaddr */
++static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len)
++{
++ struct kcl_cluster_node node;
++ struct sockaddr_in6 ipv6_addr;
++ struct sockaddr_in ipv4_addr;
++
++ if (addr->sa_family == AF_INET) {
++ struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
++ memcpy(&ipv4_addr, &local_addr, addr_len);
++ memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr));
++
++ addr = (struct sockaddr *)&ipv4_addr;
++ }
++ else {
++ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
++ memcpy(&ipv6_addr, &local_addr, addr_len);
++ memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr));
++
++ addr = (struct sockaddr *)&ipv6_addr;
++ }
++
++ if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0)
++ return node.node_id;
++ else
++ return 0;
++}
++
++int lowcomms_our_nodeid(void)
++{
++ struct kcl_cluster_node node;
++ struct list_head *addrs;
++ struct cluster_node_addr *first_addr;
++ static int our_nodeid = 0;
++
++ if (our_nodeid)
++ return our_nodeid;
++
++ if (kcl_get_node_by_nodeid(0, &node) == -1)
++ return 0;
++
++ our_nodeid = node.node_id;
++
++ /* Fill in the "template" structure */
++ addrs = kcl_get_node_addresses(our_nodeid);
++ if (!addrs)
++ return 0;
++
++ first_addr = (struct cluster_node_addr *) addrs->next;
++ memcpy(&local_addr, &first_addr->addr, first_addr->addr_len);
++
++ return node.node_id;
++}
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h
+--- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lowcomms.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,34 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOWCOMMS_DOT_H__
++#define __LOWCOMMS_DOT_H__
++
++/* The old interface */
++int lowcomms_send_message(int csid, char *buf, int len, int allocation);
++
++/* The new interface */
++struct writequeue_entry;
++extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
++ int allocation, char **ppc);
++extern void lowcomms_commit_buffer(struct writequeue_entry *e);
++
++int lowcomms_start(void);
++void lowcomms_stop(void);
++void lowcomms_stop_accept(void);
++int lowcomms_close(int nodeid);
++int lowcomms_max_buffer_size(void);
++
++int lowcomms_our_nodeid(void);
++
++#endif /* __LOWCOMMS_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c
+--- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/main.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,98 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#define EXPORT_SYMTAB
++
++#include <linux/init.h>
++#include <linux/proc_fs.h>
++#include <linux/ctype.h>
++#include <linux/seq_file.h>
++#include <linux/module.h>
++#include <net/sock.h>
++
++#include <cluster/cnxman.h>
++
++#include "dlm_internal.h"
++#include "lockspace.h"
++#include "recoverd.h"
++#include "ast.h"
++#include "lkb.h"
++#include "nodes.h"
++#include "locking.h"
++#include "config.h"
++#include "memory.h"
++#include "recover.h"
++#include "lowcomms.h"
++
++int dlm_device_init(void);
++void dlm_device_exit(void);
++void dlm_proc_init(void);
++void dlm_proc_exit(void);
++
++
++/* Cluster manager callbacks, we want to know if a node dies
++ N.B. this is independent of lockspace-specific event callbacks from SM */
++
++static void cman_callback(kcl_callback_reason reason, long arg)
++{
++ if (reason == DIED) {
++ lowcomms_close((int) arg);
++ }
++
++	/* This is unconditional, so do what we can to tidy up */
++ if (reason == LEAVING) {
++ dlm_emergency_shutdown();
++ }
++}
++
++int __init init_dlm(void)
++{
++ dlm_proc_init();
++ dlm_lockspace_init();
++ dlm_recoverd_init();
++ dlm_nodes_init();
++ dlm_device_init();
++ dlm_memory_init();
++ dlm_config_init();
++
++ kcl_add_callback(cman_callback);
++
++ printk("DLM %s (built %s %s) installed\n",
++ DLM_RELEASE_NAME, __DATE__, __TIME__);
++
++ return 0;
++}
++
++void __exit exit_dlm(void)
++{
++ kcl_remove_callback(cman_callback);
++
++ dlm_device_exit();
++ dlm_memory_exit();
++ dlm_config_exit();
++ dlm_proc_exit();
++}
++
++MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME);
++MODULE_AUTHOR("Red Hat, Inc.");
++MODULE_LICENSE("GPL");
++
++module_init(init_dlm);
++module_exit(exit_dlm);
++
++EXPORT_SYMBOL(dlm_init);
++EXPORT_SYMBOL(dlm_release);
++EXPORT_SYMBOL(dlm_new_lockspace);
++EXPORT_SYMBOL(dlm_release_lockspace);
++EXPORT_SYMBOL(dlm_lock);
++EXPORT_SYMBOL(dlm_unlock);
+diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c
+--- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/memory.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,238 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* memory.c
++ *
++ * memory allocation routines
++ *
++ */
++
++#include "dlm_internal.h"
++#include "memory.h"
++#include "config.h"
++
++/* as the man says...Shouldn't this be in a header file somewhere? */
++#define BYTES_PER_WORD sizeof(void *)
++
++static kmem_cache_t *rsb_cache_small;
++static kmem_cache_t *rsb_cache_large;
++static kmem_cache_t *lkb_cache;
++static kmem_cache_t *lvb_cache;
++static kmem_cache_t *resdir_cache_large;
++static kmem_cache_t *resdir_cache_small;
++
++/* The thresholds above which we allocate large RSBs/resdatas rather than small
++ * ones. This must make the resultant structure end on a word boundary */
++#define LARGE_RSB_NAME 28
++#define LARGE_RES_NAME 28
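++
++/*
++ * Worked example (illustrative): on a 64-bit machine BYTES_PER_WORD
++ * is 8, so the small RSB object size below is
++ * (sizeof(gd_res_t) + 28 + 7) & ~7, i.e. rounded up to the next
++ * multiple of 8 so that the structure ends on a word boundary.
++ */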
++
++int dlm_memory_init(void)
++{
++ int ret = -ENOMEM;
++
++
++ kmem_cache_create("dlm_rsb(small)",
++ (sizeof(gd_res_t) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
++ __alignof__(gd_res_t), 0, NULL, NULL);
++ if (!rsb_cache_small)
++ goto out;
++
++ rsb_cache_large =
++ kmem_cache_create("dlm_rsb(large)",
++ sizeof(gd_res_t) + DLM_RESNAME_MAXLEN,
++ __alignof__(gd_res_t), 0, NULL, NULL);
++ if (!rsb_cache_large)
++ goto out_free_rsbs;
++
++ lkb_cache = kmem_cache_create("dlm_lkb", sizeof(gd_lkb_t),
++ __alignof__(gd_lkb_t), 0, NULL, NULL);
++ if (!lkb_cache)
++ goto out_free_rsbl;
++
++ resdir_cache_large =
++ kmem_cache_create("dlm_resdir(l)",
++ sizeof(gd_resdata_t) + DLM_RESNAME_MAXLEN,
++ __alignof__(gd_resdata_t), 0, NULL, NULL);
++ if (!resdir_cache_large)
++ goto out_free_lkb;
++
++ resdir_cache_small =
++ kmem_cache_create("dlm_resdir(s)",
++ (sizeof(gd_resdata_t) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
++ __alignof__(gd_resdata_t), 0, NULL, NULL);
++ if (!resdir_cache_small)
++ goto out_free_resl;
++
++ /* LVB cache also holds ranges, so should be 64bit aligned */
++ lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN,
++ __alignof__(uint64_t), 0, NULL, NULL);
++	if (!lvb_cache)
++ goto out_free_ress;
++
++ ret = 0;
++ goto out;
++
++ out_free_ress:
++ kmem_cache_destroy(resdir_cache_small);
++
++ out_free_resl:
++ kmem_cache_destroy(resdir_cache_large);
++
++ out_free_lkb:
++ kmem_cache_destroy(lkb_cache);
++
++ out_free_rsbl:
++ kmem_cache_destroy(rsb_cache_large);
++
++ out_free_rsbs:
++ kmem_cache_destroy(rsb_cache_small);
++
++ out:
++ return ret;
++}
++
++void dlm_memory_exit(void)
++{
++ kmem_cache_destroy(rsb_cache_large);
++ kmem_cache_destroy(rsb_cache_small);
++ kmem_cache_destroy(lkb_cache);
++ kmem_cache_destroy(resdir_cache_small);
++ kmem_cache_destroy(resdir_cache_large);
++ kmem_cache_destroy(lvb_cache);
++}
++
++gd_res_t *allocate_rsb(gd_ls_t *ls, int namelen)
++{
++ gd_res_t *r;
++
++ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
++
++ if (namelen >= LARGE_RSB_NAME)
++ r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation);
++ else
++ r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation);
++
++ if (r)
++ memset(r, 0, sizeof(gd_res_t) + namelen);
++
++ return r;
++}
++
++void free_rsb(gd_res_t *r)
++{
++ int length = r->res_length;
++
++#ifdef POISON
++ memset(r, 0x55, sizeof(gd_res_t) + r->res_length);
++#endif
++
++ if (length >= LARGE_RSB_NAME)
++ kmem_cache_free(rsb_cache_large, r);
++ else
++ kmem_cache_free(rsb_cache_small, r);
++}
++
++gd_lkb_t *allocate_lkb(gd_ls_t *ls)
++{
++ gd_lkb_t *l;
++
++ l = kmem_cache_alloc(lkb_cache, ls->ls_allocation);
++ if (l)
++ memset(l, 0, sizeof(gd_lkb_t));
++
++ return l;
++}
++
++void free_lkb(gd_lkb_t *l)
++{
++#ifdef POISON
++ memset(l, 0xAA, sizeof(gd_lkb_t));
++#endif
++ kmem_cache_free(lkb_cache, l);
++}
++
++gd_resdata_t *allocate_resdata(gd_ls_t *ls, int namelen)
++{
++ gd_resdata_t *rd;
++
++ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
++
++ if (namelen >= LARGE_RES_NAME)
++ rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation);
++ else
++ rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation);
++
++ if (rd)
++ memset(rd, 0, sizeof(gd_resdata_t));
++
++ return rd;
++}
++
++void free_resdata(gd_resdata_t *rd)
++{
++ if (rd->rd_length >= LARGE_RES_NAME)
++ kmem_cache_free(resdir_cache_large, rd);
++ else
++ kmem_cache_free(resdir_cache_small, rd);
++}
++
++char *allocate_lvb(gd_ls_t *ls)
++{
++ char *l;
++
++ l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
++ if (l)
++ memset(l, 0, DLM_LVB_LEN);
++
++ return l;
++}
++
++void free_lvb(char *l)
++{
++ kmem_cache_free(lvb_cache, l);
++}
++
++/* Ranges are allocated from the LVB cache as they are the same size (4x64
++ * bits) */
++uint64_t *allocate_range(gd_ls_t * ls)
++{
++ uint64_t *l;
++
++ l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
++ if (l)
++ memset(l, 0, DLM_LVB_LEN);
++
++ return l;
++}
++
++void free_range(uint64_t *l)
++{
++ kmem_cache_free(lvb_cache, l);
++}
++
++gd_rcom_t *allocate_rcom_buffer(gd_ls_t *ls)
++{
++ gd_rcom_t *rc;
++
++ rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation);
++ if (rc)
++ memset(rc, 0, dlm_config.buffer_size);
++
++ return rc;
++}
++
++void free_rcom_buffer(gd_rcom_t *rc)
++{
++ kfree(rc);
++}
+diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h
+--- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/memory.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,32 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __MEMORY_DOT_H__
++#define __MEMORY_DOT_H__
++
++int dlm_memory_init(void);
++void dlm_memory_exit(void);
++gd_res_t *allocate_rsb(gd_ls_t * ls, int namelen);
++void free_rsb(gd_res_t * r);
++gd_lkb_t *allocate_lkb(gd_ls_t * ls);
++void free_lkb(gd_lkb_t * l);
++gd_resdata_t *allocate_resdata(gd_ls_t * ls, int namelen);
++void free_resdata(gd_resdata_t * rd);
++char *allocate_lvb(gd_ls_t * ls);
++void free_lvb(char *l);
++gd_rcom_t *allocate_rcom_buffer(gd_ls_t * ls);
++void free_rcom_buffer(gd_rcom_t * rc);
++uint64_t *allocate_range(gd_ls_t * ls);
++void free_range(uint64_t * l);
++
++#endif /* __MEMORY_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c
+--- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/midcomms.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,351 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * midcomms.c
++ *
++ * This is the appallingly named "mid-level" comms layer.
++ *
++ * Its purpose is to take buffers from the "real" comms layer,
++ * split them up into individual messages and pass them to the
++ * interested part of the locking mechanism.
++ *
++ * It also takes messages from the locking layer, formats them
++ * into packets and sends them to the comms layer.
++ *
++ * It knows the format of the mid-level messages and the nodeids,
++ * but it does not know how to resolve a nodeid into an IP address
++ * or any of the comms channel details.
++ *
++ */
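++
++/*
++ * Wire-format notes (summarising the conversions below; the full
++ * structure definitions live elsewhere in the DLM headers):
++ * multi-byte header fields such as rh_length (16 bit) and
++ * rh_lockspace (32 bit) travel little-endian, while rh_lkid is
++ * opaque to the remote end and is never byteswapped.
++ */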
++
++#include "dlm_internal.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "lockqueue.h"
++#include "nodes.h"
++#include "reccomms.h"
++#include "config.h"
++
++/* Byteorder routines */
++
++static void host_to_network(void *msg)
++{
++ struct gd_req_header *head = msg;
++ struct gd_remlockrequest *req = msg;
++ struct gd_remlockreply *reply = msg;
++ struct gd_remquery *query = msg;
++ struct gd_remqueryreply *queryrep = msg;
++ gd_rcom_t *rc = msg;
++
++ /* Force into network byte order */
++
++ /*
++ * Do the common header first
++ */
++
++ head->rh_length = cpu_to_le16(head->rh_length);
++ head->rh_lockspace = cpu_to_le32(head->rh_lockspace);
++ /* Leave the lkid alone as it is transparent at the remote end */
++
++ /*
++ * Do the fields in the remlockrequest or remlockreply structs
++ */
++
++ switch (req->rr_header.rh_cmd) {
++
++ case GDLM_REMCMD_LOCKREQUEST:
++ case GDLM_REMCMD_CONVREQUEST:
++ req->rr_range_start = cpu_to_le64(req->rr_range_start);
++ req->rr_range_end = cpu_to_le64(req->rr_range_end);
++ /* Deliberate fall through */
++ case GDLM_REMCMD_UNLOCKREQUEST:
++ case GDLM_REMCMD_LOOKUP:
++ case GDLM_REMCMD_LOCKGRANT:
++ case GDLM_REMCMD_SENDBAST:
++ case GDLM_REMCMD_SENDCAST:
++ case GDLM_REMCMD_REM_RESDATA:
++ req->rr_flags = cpu_to_le32(req->rr_flags);
++ req->rr_status = cpu_to_le32(req->rr_status);
++ break;
++
++ case GDLM_REMCMD_LOCKREPLY:
++ reply->rl_lockstate = cpu_to_le32(reply->rl_lockstate);
++ reply->rl_nodeid = cpu_to_le32(reply->rl_nodeid);
++ reply->rl_status = cpu_to_le32(reply->rl_status);
++ break;
++
++ case GDLM_REMCMD_RECOVERMESSAGE:
++ case GDLM_REMCMD_RECOVERREPLY:
++ rc->rc_msgid = cpu_to_le32(rc->rc_msgid);
++ rc->rc_datalen = cpu_to_le16(rc->rc_datalen);
++ break;
++
++ case GDLM_REMCMD_QUERY:
++ query->rq_mstlkid = cpu_to_le32(query->rq_mstlkid);
++ query->rq_query = cpu_to_le32(query->rq_query);
++ query->rq_maxlocks = cpu_to_le32(query->rq_maxlocks);
++ break;
++
++ case GDLM_REMCMD_QUERYREPLY:
++ queryrep->rq_numlocks = cpu_to_le32(queryrep->rq_numlocks);
++ queryrep->rq_status = cpu_to_le32(queryrep->rq_status);
++ queryrep->rq_grantcount = cpu_to_le32(queryrep->rq_grantcount);
++ queryrep->rq_waitcount = cpu_to_le32(queryrep->rq_waitcount);
++ queryrep->rq_convcount = cpu_to_le32(queryrep->rq_convcount);
++ break;
++
++ default:
++ printk("dlm: warning, unknown REMCMD type %u\n",
++ req->rr_header.rh_cmd);
++ }
++}
++
++static void network_to_host(void *msg)
++{
++ struct gd_req_header *head = msg;
++ struct gd_remlockrequest *req = msg;
++ struct gd_remlockreply *reply = msg;
++ struct gd_remquery *query = msg;
++ struct gd_remqueryreply *queryrep = msg;
++ gd_rcom_t *rc = msg;
++
++ /* Force into host byte order */
++
++ /*
++ * Do the common header first
++ */
++
++ head->rh_length = le16_to_cpu(head->rh_length);
++ head->rh_lockspace = le32_to_cpu(head->rh_lockspace);
++ /* Leave the lkid alone as it is transparent at the remote end */
++
++ /*
++ * Do the fields in the remlockrequest or remlockreply structs
++ */
++
++ switch (req->rr_header.rh_cmd) {
++
++ case GDLM_REMCMD_LOCKREQUEST:
++ case GDLM_REMCMD_CONVREQUEST:
++ req->rr_range_start = le64_to_cpu(req->rr_range_start);
++ req->rr_range_end = le64_to_cpu(req->rr_range_end);
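++		/* Deliberate fall through */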
++ case GDLM_REMCMD_LOOKUP:
++ case GDLM_REMCMD_UNLOCKREQUEST:
++ case GDLM_REMCMD_LOCKGRANT:
++ case GDLM_REMCMD_SENDBAST:
++ case GDLM_REMCMD_SENDCAST:
++ case GDLM_REMCMD_REM_RESDATA:
++ /* Actually, not much to do here as the remote lock IDs are
++ * transparent too */
++ req->rr_flags = le32_to_cpu(req->rr_flags);
++ req->rr_status = le32_to_cpu(req->rr_status);
++ break;
++
++ case GDLM_REMCMD_LOCKREPLY:
++ reply->rl_lockstate = le32_to_cpu(reply->rl_lockstate);
++ reply->rl_nodeid = le32_to_cpu(reply->rl_nodeid);
++ reply->rl_status = le32_to_cpu(reply->rl_status);
++ break;
++
++ case GDLM_REMCMD_RECOVERMESSAGE:
++ case GDLM_REMCMD_RECOVERREPLY:
++ rc->rc_msgid = le32_to_cpu(rc->rc_msgid);
++ rc->rc_datalen = le16_to_cpu(rc->rc_datalen);
++ break;
++
++
++ case GDLM_REMCMD_QUERY:
++ query->rq_mstlkid = le32_to_cpu(query->rq_mstlkid);
++ query->rq_query = le32_to_cpu(query->rq_query);
++ query->rq_maxlocks = le32_to_cpu(query->rq_maxlocks);
++ break;
++
++ case GDLM_REMCMD_QUERYREPLY:
++ queryrep->rq_numlocks = le32_to_cpu(queryrep->rq_numlocks);
++ queryrep->rq_status = le32_to_cpu(queryrep->rq_status);
++ queryrep->rq_grantcount = le32_to_cpu(queryrep->rq_grantcount);
++ queryrep->rq_waitcount = le32_to_cpu(queryrep->rq_waitcount);
++ queryrep->rq_convcount = le32_to_cpu(queryrep->rq_convcount);
++ break;
++
++ default:
++ printk("dlm: warning, unknown REMCMD type %u\n",
++ req->rr_header.rh_cmd);
++ }
++}
++
++static void copy_from_cb(void *dst, const void *base, unsigned offset,
++ unsigned len, unsigned limit)
++{
++ unsigned copy = len;
++
++ if ((copy + offset) > limit)
++ copy = limit - offset;
++ memcpy(dst, base + offset, copy);
++ len -= copy;
++ if (len)
++ memcpy(dst + copy, base, len);
++}
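++
++/*
++ * Worked example (illustrative): with base = "ABCDEFGH", limit = 8,
++ * offset = 6 and len = 4, copy_from_cb() copies "GH" from the end
++ * of the ring and then wraps to copy "AB", yielding "GHAB" in dst.
++ */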
++
++static void khexdump(const unsigned char *c, int len)
++{
++ while (len > 16) {
++ printk(KERN_INFO
++ "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n",
++ c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8],
++ c[9], c[10], c[11], c[12], c[13], c[14], c[15]);
++ len -= 16;
++ }
++ while (len > 4) {
++ printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2],
++ c[3]);
++ len -= 4;
++ }
++ while (len > 0) {
++ printk(KERN_INFO "%02x\n", c[0]);
++ len--;
++ }
++}
++
++/*
++ * Called from the low-level comms layer to process a buffer of
++ * commands.
++ *
++ * Only complete messages are processed here, any "spare" bytes from
++ * the end of a buffer are saved and tacked onto the front of the next
++ * message that comes in. I doubt this will happen very often but we
++ * need to be able to cope with it and I don't want the task to be waiting
++ * for packets to come in when there is useful work to be done.
++ *
++ */
++int midcomms_process_incoming_buffer(int nodeid, const void *base,
++ unsigned offset, unsigned len,
++ unsigned limit)
++{
++ unsigned char __tmp[sizeof(struct gd_req_header) + 64];
++ struct gd_req_header *msg = (struct gd_req_header *) __tmp;
++ int ret = 0;
++ int err = 0;
++ unsigned msglen;
++ __u32 id, space;
++
++ while (len > sizeof(struct gd_req_header)) {
++ /* Get message header and check it over */
++ copy_from_cb(msg, base, offset, sizeof(struct gd_req_header),
++ limit);
++ msglen = le16_to_cpu(msg->rh_length);
++ id = msg->rh_lkid;
++ space = msg->rh_lockspace;
++
++ /* Check message size */
++ err = -EINVAL;
++ if (msglen < sizeof(struct gd_req_header))
++ break;
++ err = -E2BIG;
++ if (msglen > dlm_config.buffer_size) {
++ printk("dlm: message size too big %d\n", msglen);
++ break;
++ }
++ err = 0;
++
++ /* Not enough in buffer yet? wait for some more */
++ if (msglen > len)
++ break;
++
++ /* Make sure our temp buffer is large enough */
++ if (msglen > sizeof(__tmp) &&
++ msg == (struct gd_req_header *) __tmp) {
++ msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
++ if (msg == NULL)
++ return ret;
++ }
++
++ copy_from_cb(msg, base, offset, msglen, limit);
++ BUG_ON(id != msg->rh_lkid);
++ BUG_ON(space != msg->rh_lockspace);
++ ret += msglen;
++ offset += msglen;
++ offset &= (limit - 1);
++ len -= msglen;
++ network_to_host(msg);
++
++ if ((msg->rh_cmd > 32) ||
++ (msg->rh_cmd == 0) ||
++ (msg->rh_length < sizeof(struct gd_req_header)) ||
++ (msg->rh_length > dlm_config.buffer_size)) {
++
++ printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, "
++ "lkid=%u, lockspace=%u\n",
++ msg->rh_cmd, msg->rh_flags, msg->rh_length,
++ msg->rh_lkid, msg->rh_lockspace);
++
++ printk("dlm: midcomms: base=%p, offset=%u, len=%u, "
++ "ret=%u, limit=%08x newbuf=%d\n",
++ base, offset, len, ret, limit,
++ ((struct gd_req_header *) __tmp == msg));
++
++ khexdump((const unsigned char *) msg, msg->rh_length);
++
++ return -EBADMSG;
++ }
++
++ switch (msg->rh_cmd) {
++ case GDLM_REMCMD_RECOVERMESSAGE:
++ case GDLM_REMCMD_RECOVERREPLY:
++ process_recovery_comm(nodeid, msg);
++ break;
++ default:
++ process_cluster_request(nodeid, msg, FALSE);
++ }
++ }
++
++ if (msg != (struct gd_req_header *) __tmp)
++ kfree(msg);
++
++ return err ? err : ret;
++}
++
++/*
++ * Send a lowcomms buffer
++ */
++
++void midcomms_send_buffer(struct gd_req_header *msg, struct writequeue_entry *e)
++{
++ host_to_network(msg);
++ lowcomms_commit_buffer(e);
++}
++
++/*
++ * Make the message into network byte order and send it
++ */
++
++int midcomms_send_message(uint32_t nodeid, struct gd_req_header *msg,
++ int allocation)
++{
++ int len = msg->rh_length;
++
++ host_to_network(msg);
++
++ /*
++ * Loopback. In fact, the locking code pretty much prevents this from
++ * being needed but it can happen when the directory node is also the
++ * local node.
++ */
++
++ if (nodeid == our_nodeid())
++ return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0,
++ len, len);
++
++ return lowcomms_send_message(nodeid, (char *) msg, len, allocation);
++}
+diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h
+--- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/midcomms.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,24 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __MIDCOMMS_DOT_H__
++#define __MIDCOMMS_DOT_H__
++
++int midcomms_send_message(uint32_t csid, struct gd_req_header *msg,
++ int allocation);
++int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset,
++ unsigned len, unsigned limit);
++void midcomms_send_buffer(struct gd_req_header *msg,
++ struct writequeue_entry *e);
++
++#endif /* __MIDCOMMS_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c
+--- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/nodes.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,325 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <net/sock.h>
++#include <cluster/cnxman.h>
++
++#include "dlm_internal.h"
++#include "lowcomms.h"
++#include "nodes.h"
++#include "recover.h"
++#include "reccomms.h"
++#include "util.h"
++
++static struct list_head cluster_nodes;
++static spinlock_t node_lock;
++static uint32_t local_nodeid;
++static struct semaphore local_init_lock;
++
++
++void dlm_nodes_init(void)
++{
++ INIT_LIST_HEAD(&cluster_nodes);
++ spin_lock_init(&node_lock);
++ local_nodeid = 0;
++ init_MUTEX(&local_init_lock);
++}
++
++static gd_node_t *search_node(uint32_t nodeid)
++{
++ gd_node_t *node;
++
++ list_for_each_entry(node, &cluster_nodes, gn_list) {
++ if (node->gn_nodeid == nodeid)
++ goto out;
++ }
++ node = NULL;
++ out:
++ return node;
++}
++
++static void put_node(gd_node_t *node)
++{
++ spin_lock(&node_lock);
++ node->gn_refcount--;
++ if (node->gn_refcount == 0) {
++ list_del(&node->gn_list);
++ spin_unlock(&node_lock);
++ kfree(node);
++ return;
++ }
++ spin_unlock(&node_lock);
++}
++
++static int get_node(uint32_t nodeid, gd_node_t **ndp)
++{
++ gd_node_t *node, *node2;
++ int error = -ENOMEM;
++
++ spin_lock(&node_lock);
++ node = search_node(nodeid);
++ if (node)
++ node->gn_refcount++;
++ spin_unlock(&node_lock);
++
++ if (node)
++ goto out;
++
++ node = (gd_node_t *) kmalloc(sizeof(gd_node_t), GFP_KERNEL);
++ if (!node)
++ goto fail;
++
++ memset(node, 0, sizeof(gd_node_t));
++ node->gn_nodeid = nodeid;
++
++ spin_lock(&node_lock);
++ node2 = search_node(nodeid);
++ if (node2) {
++ node2->gn_refcount++;
++ spin_unlock(&node_lock);
++ kfree(node);
++ node = node2;
++ goto out;
++ }
++
++ node->gn_refcount = 1;
++ list_add_tail(&node->gn_list, &cluster_nodes);
++ spin_unlock(&node_lock);
++
++ out:
++ *ndp = node;
++ return 0;
++
++ fail:
++ return error;
++}
++
++int init_new_csb(uint32_t nodeid, gd_csb_t **ret_csb)
++{
++ gd_csb_t *csb;
++ gd_node_t *node;
++ int error = -ENOMEM;
++
++ csb = (gd_csb_t *) kmalloc(sizeof(gd_csb_t), GFP_KERNEL);
++ if (!csb)
++ goto fail;
++
++ memset(csb, 0, sizeof(gd_csb_t));
++
++ error = get_node(nodeid, &node);
++ if (error)
++ goto fail_free;
++
++ csb->csb_node = node;
++
++ down(&local_init_lock);
++
++	if (!local_nodeid && nodeid == our_nodeid())
++		local_nodeid = node->gn_nodeid;
++ up(&local_init_lock);
++
++ *ret_csb = csb;
++ return 0;
++
++ fail_free:
++ kfree(csb);
++ fail:
++ return error;
++}
++
++void release_csb(gd_csb_t *csb)
++{
++ put_node(csb->csb_node);
++ kfree(csb);
++}
++
++uint32_t our_nodeid(void)
++{
++ return lowcomms_our_nodeid();
++}
++
++int nodes_reconfig_wait(gd_ls_t *ls)
++{
++ int error;
++
++ if (ls->ls_low_nodeid == our_nodeid()) {
++ error = gdlm_wait_status_all(ls, NODES_VALID);
++ if (!error)
++ set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
++
++ /* Experimental: this delay should allow any final messages
++ * from the previous node to be received before beginning
++ * recovery. */
++
++ if (ls->ls_num_nodes == 1) {
++			set_current_state(TASK_UNINTERRUPTIBLE);
++			schedule_timeout(2 * HZ);
++ }
++
++ } else
++ error = gdlm_wait_status_low(ls, NODES_ALL_VALID);
++
++ return error;
++}
++
++static void add_ordered_node(gd_ls_t *ls, gd_csb_t *new)
++{
++ gd_csb_t *csb = NULL;
++ struct list_head *tmp;
++ struct list_head *newlist = &new->csb_list;
++ struct list_head *head = &ls->ls_nodes;
++
++ list_for_each(tmp, head) {
++ csb = list_entry(tmp, gd_csb_t, csb_list);
++
++ if (new->csb_node->gn_nodeid < csb->csb_node->gn_nodeid)
++ break;
++ }
++
++ if (!csb)
++ list_add_tail(newlist, head);
++	else {
++		/* Insert before the first node with a higher nodeid */
++		list_add_tail(newlist, tmp);
++	}
++}
++
++int ls_nodes_reconfig(gd_ls_t *ls, gd_recover_t *gr, int *neg_out)
++{
++ gd_csb_t *csb, *safe;
++ int error, i, found, pos = 0, neg = 0;
++ uint32_t low = (uint32_t) (-1);
++
++ /*
++ * Remove (and save) departed nodes from lockspace's nodes list
++ */
++
++ list_for_each_entry_safe(csb, safe, &ls->ls_nodes, csb_list) {
++ found = FALSE;
++ for (i = 0; i < gr->gr_node_count; i++) {
++ if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
++ found = TRUE;
++ break;
++ }
++ }
++
++ if (!found) {
++ neg++;
++ csb->csb_gone_event = gr->gr_event_id;
++ list_del(&csb->csb_list);
++ list_add_tail(&csb->csb_list, &ls->ls_nodes_gone);
++ ls->ls_num_nodes--;
++ log_all(ls, "remove node %u", csb->csb_node->gn_nodeid);
++ }
++ }
++
++ /*
++ * Add new nodes to lockspace's nodes list
++ */
++
++ for (i = 0; i < gr->gr_node_count; i++) {
++ found = FALSE;
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
++ found = TRUE;
++ break;
++ }
++ }
++
++ if (!found) {
++ pos++;
++
++ error = init_new_csb(gr->gr_nodeids[i], &csb);
++ GDLM_ASSERT(!error,);
++
++ add_ordered_node(ls, csb);
++ ls->ls_num_nodes++;
++ log_all(ls, "add node %u", csb->csb_node->gn_nodeid);
++ }
++ }
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ if (csb->csb_node->gn_nodeid < low)
++ low = csb->csb_node->gn_nodeid;
++ }
++
++ rcom_log_clear(ls);
++ ls->ls_low_nodeid = low;
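++	/* e.g. with 5 nodes gdlm_next_power2() returns 8, so the mask is
++	 * 0x7; presumably used when mapping resource-name hashes to nodes */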
++ ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
++ set_bit(LSFL_NODES_VALID, &ls->ls_flags);
++ *neg_out = neg;
++
++ error = nodes_reconfig_wait(ls);
++
++ log_all(ls, "total nodes %d", ls->ls_num_nodes);
++
++ return error;
++}
++
++int ls_nodes_init(gd_ls_t *ls, gd_recover_t *gr)
++{
++ gd_csb_t *csb;
++ int i, error;
++ uint32_t low = (uint32_t) (-1);
++
++ log_all(ls, "add nodes");
++
++ for (i = 0; i < gr->gr_node_count; i++) {
++ error = init_new_csb(gr->gr_nodeids[i], &csb);
++ if (error)
++ goto fail;
++
++ add_ordered_node(ls, csb);
++ ls->ls_num_nodes++;
++
++ if (csb->csb_node->gn_nodeid < low)
++ low = csb->csb_node->gn_nodeid;
++ }
++
++ ls->ls_low_nodeid = low;
++ ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
++ set_bit(LSFL_NODES_VALID, &ls->ls_flags);
++
++ error = nodes_reconfig_wait(ls);
++
++ log_all(ls, "total nodes %d", ls->ls_num_nodes);
++
++ return error;
++
++ fail:
++ while (!list_empty(&ls->ls_nodes)) {
++ csb = list_entry(ls->ls_nodes.next, gd_csb_t, csb_list);
++ list_del(&csb->csb_list);
++ release_csb(csb);
++ }
++ ls->ls_num_nodes = 0;
++
++ return error;
++}
++
++int in_nodes_gone(gd_ls_t *ls, uint32_t nodeid)
++{
++ gd_csb_t *csb;
++
++ list_for_each_entry(csb, &ls->ls_nodes_gone, csb_list) {
++ if (csb->csb_node->gn_nodeid == nodeid)
++ return TRUE;
++ }
++ return FALSE;
++}
+diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h
+--- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/nodes.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,25 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __NODES_DOT_H__
++#define __NODES_DOT_H__
++
++void dlm_nodes_init(void);
++int init_new_csb(uint32_t nodeid, gd_csb_t ** ret_csb);
++void release_csb(gd_csb_t * csb);
++uint32_t our_nodeid(void);
++int ls_nodes_reconfig(gd_ls_t * ls, gd_recover_t * gr, int *neg);
++int ls_nodes_init(gd_ls_t * ls, gd_recover_t * gr);
++int in_nodes_gone(gd_ls_t * ls, uint32_t nodeid);
++
++#endif /* __NODES_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c
+--- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/proc.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,469 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/init.h>
++#include <linux/proc_fs.h>
++#include <linux/ctype.h>
++#include <linux/seq_file.h>
++#include <linux/module.h>
++
++#include "dlm_internal.h"
++#include "lockspace.h"
++
++#if defined(DLM_DEBUG)
++#define DLM_DEBUG_SIZE (1024)
++#define MAX_DEBUG_MSG_LEN (64)
++#else
++#define DLM_DEBUG_SIZE (0)
++#define MAX_DEBUG_MSG_LEN (0)
++#endif
++
++static char * debug_buf;
++static unsigned int debug_size;
++static unsigned int debug_point;
++static int debug_wrap;
++static spinlock_t debug_lock;
++static struct proc_dir_entry * debug_proc_entry = NULL;
++static struct proc_dir_entry * rcom_proc_entry = NULL;
++static char proc_ls_name[255] = "";
++
++#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
++static struct proc_dir_entry * locks_proc_entry = NULL;
++static struct seq_operations locks_info_op;
++
++
++static int locks_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &locks_info_op);
++}
++
++/* Write simply sets the lockspace to use */
++static ssize_t locks_write(struct file *file, const char *buf,
++ size_t count, loff_t * ppos)
++{
++	if (count == 0 || count >= sizeof(proc_ls_name))
++		return 0;
++
++	if (copy_from_user(proc_ls_name, buf, count))
++		return -EFAULT;
++	proc_ls_name[count] = '\0';
++
++	/* Remove any trailing LF so that lazy users
++	   can just echo "lsname" > /proc/cluster/dlm_locks */
++	if (proc_ls_name[count - 1] == '\n')
++		proc_ls_name[count - 1] = '\0';
++
++	return count;
++}
++
++static struct file_operations locks_fops = {
++ open:locks_open,
++ write:locks_write,
++ read:seq_read,
++ llseek:seq_lseek,
++ release:seq_release,
++};
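++
++/*
++ * Usage sketch (with CONFIG_CLUSTER_DLM_PROCLOCKS; "myls" is a
++ * hypothetical lockspace name):
++ *
++ *	echo "myls" > /proc/cluster/dlm_locks	(select the lockspace)
++ *	cat /proc/cluster/dlm_locks		(dump its resources and locks)
++ */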
++
++struct ls_dumpinfo {
++ int entry;
++ struct list_head *next;
++ gd_ls_t *ls;
++ gd_res_t *rsb;
++};
++
++static int print_resource(gd_res_t * res, struct seq_file *s);
++
++static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di)
++{
++ read_lock(&di->ls->ls_reshash_lock);
++ if (!di->next) {
++ /* Find the next non-empty hash bucket */
++		while (di->entry < di->ls->ls_hashsize &&
++		       list_empty(&di->ls->ls_reshashtbl[di->entry])) {
++ di->entry++;
++ }
++ if (di->entry >= di->ls->ls_hashsize) {
++ read_unlock(&di->ls->ls_reshash_lock);
++ return NULL; /* End of hash list */
++ }
++
++ di->next = di->ls->ls_reshashtbl[di->entry].next;
++ } else { /* Find the next entry in the list */
++
++ di->next = di->next->next;
++ if (di->next->next == di->ls->ls_reshashtbl[di->entry].next) {
++ /* End of list - move to next bucket */
++ di->next = NULL;
++ di->entry++;
++ read_unlock(&di->ls->ls_reshash_lock);
++
++ return next_rsb(di); /* do the top half of this conditional */
++ }
++ }
++ di->rsb = list_entry(di->next, gd_res_t, res_hashchain);
++ read_unlock(&di->ls->ls_reshash_lock);
++
++ return di;
++}
++
++static void *s_start(struct seq_file *m, loff_t * pos)
++{
++ struct ls_dumpinfo *di;
++ gd_ls_t *ls;
++ int i;
++
++ ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
++ if (!ls)
++ return NULL;
++
++ di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
++ if (!di)
++ return NULL;
++
++ if (*pos == 0)
++ seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
++
++ di->entry = 0;
++ di->next = NULL;
++ di->ls = ls;
++
++	for (i = 0; i < *pos; i++)
++		if (next_rsb(di) == NULL) {
++			/* Ran off the end: don't leak the iterator */
++			kfree(di);
++			return NULL;
++		}
++
++ return next_rsb(di);
++}
++
++static void *s_next(struct seq_file *m, void *p, loff_t * pos)
++{
++ struct ls_dumpinfo *di = p;
++
++ *pos += 1;
++
++ return next_rsb(di);
++}
++
++static int s_show(struct seq_file *m, void *p)
++{
++ struct ls_dumpinfo *di = p;
++ return print_resource(di->rsb, m);
++}
++
++static void s_stop(struct seq_file *m, void *p)
++{
++ kfree(p);
++}
++
++static struct seq_operations locks_info_op = {
++ start:s_start,
++ next:s_next,
++ stop:s_stop,
++ show:s_show
++};
++
++static char *print_lockmode(int mode)
++{
++ switch (mode) {
++ case DLM_LOCK_IV:
++ return "--";
++ case DLM_LOCK_NL:
++ return "NL";
++ case DLM_LOCK_CR:
++ return "CR";
++ case DLM_LOCK_CW:
++ return "CW";
++ case DLM_LOCK_PR:
++ return "PR";
++ case DLM_LOCK_PW:
++ return "PW";
++ case DLM_LOCK_EX:
++ return "EX";
++ default:
++ return "??";
++ }
++}
++
++static void print_lock(struct seq_file *s, gd_lkb_t * lkb, gd_res_t * res)
++{
++
++ seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
++
++ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
++ || lkb->lkb_status == GDLM_LKSTS_WAITING)
++ seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
++
++ if (lkb->lkb_range) {
++ /* This warns on Alpha. Tough. Only I see it */
++ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
++ || lkb->lkb_status == GDLM_LKSTS_GRANTED)
++ seq_printf(s, " %" PRIx64 "-%" PRIx64,
++ lkb->lkb_range[GR_RANGE_START],
++ lkb->lkb_range[GR_RANGE_END]);
++ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
++ || lkb->lkb_status == GDLM_LKSTS_WAITING)
++ seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
++ lkb->lkb_range[RQ_RANGE_START],
++ lkb->lkb_range[RQ_RANGE_END]);
++ }
++
++ if (lkb->lkb_nodeid) {
++ if (lkb->lkb_nodeid != res->res_nodeid)
++ seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
++ lkb->lkb_remid);
++ else
++ seq_printf(s, " Master: %08x", lkb->lkb_remid);
++ }
++
++ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
++ seq_printf(s, " LQ: %d", lkb->lkb_lockqueue_state);
++
++ seq_printf(s, "\n");
++}
++
++static int print_resource(gd_res_t *res, struct seq_file *s)
++{
++ int i;
++ struct list_head *locklist;
++
++ seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res,
++ res->res_parent, res->res_length);
++ for (i = 0; i < res->res_length; i++) {
++ if (isprint(res->res_name[i]))
++ seq_printf(s, "%c", res->res_name[i]);
++ else
++ seq_printf(s, "%c", '.');
++ }
++ if (res->res_nodeid)
++ seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
++ res->res_nodeid);
++ else
++ seq_printf(s, "\" \nMaster Copy\n");
++
++ /* Print the LVB: */
++ if (res->res_lvbptr) {
++ seq_printf(s, "LVB: ");
++ for (i = 0; i < DLM_LVB_LEN; i++) {
++ if (i == DLM_LVB_LEN / 2)
++ seq_printf(s, "\n ");
++ seq_printf(s, "%02x ",
++ (unsigned char) res->res_lvbptr[i]);
++ }
++ seq_printf(s, "\n");
++ }
++
++ /* Print the locks attached to this resource */
++ seq_printf(s, "Granted Queue\n");
++ list_for_each(locklist, &res->res_grantqueue) {
++ gd_lkb_t *this_lkb =
++ list_entry(locklist, gd_lkb_t, lkb_statequeue);
++ print_lock(s, this_lkb, res);
++ }
++
++ seq_printf(s, "Conversion Queue\n");
++ list_for_each(locklist, &res->res_convertqueue) {
++ gd_lkb_t *this_lkb =
++ list_entry(locklist, gd_lkb_t, lkb_statequeue);
++ print_lock(s, this_lkb, res);
++ }
++
++ seq_printf(s, "Waiting Queue\n");
++ list_for_each(locklist, &res->res_waitqueue) {
++ gd_lkb_t *this_lkb =
++ list_entry(locklist, gd_lkb_t, lkb_statequeue);
++ print_lock(s, this_lkb, res);
++ }
++ return 0;
++}
++#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */
++
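++/*
++ * Append one message to the circular debug buffer. When debug_point
++ * reaches the end of the buffer it wraps to the start and debug_wrap is
++ * set, so readers (dlm_debug_dump, dlm_debug_info) print from debug_point
++ * to the end first, then from the start, recovering chronological order.
++ */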
++void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...)
++{
++ va_list va;
++ int i, n, size, len;
++ char buf[MAX_DEBUG_MSG_LEN+1];
++
++ spin_lock(&debug_lock);
++
++ if (!debug_buf)
++ goto out;
++
++ size = MAX_DEBUG_MSG_LEN;
++ memset(buf, 0, size+1);
++
++ n = snprintf(buf, size, "%s ", ls->ls_name);
++ size -= n;
++
++ va_start(va, fmt);
++ vsnprintf(buf+n, size, fmt, va);
++ va_end(va);
++
++ len = strlen(buf);
++ if (len > MAX_DEBUG_MSG_LEN-1)
++ len = MAX_DEBUG_MSG_LEN-1;
++ buf[len] = '\n';
++ buf[len+1] = '\0';
++
++	for (i = 0; i < len + 1; i++) {	/* len + 1 covers the trailing '\n' */
++ debug_buf[debug_point++] = buf[i];
++
++ if (debug_point == debug_size) {
++ debug_point = 0;
++ debug_wrap = 1;
++ }
++ }
++ out:
++ spin_unlock(&debug_lock);
++}
++
++void dlm_debug_dump(void)
++{
++ int i;
++
++ spin_lock(&debug_lock);
++ if (debug_wrap) {
++ for (i = debug_point; i < debug_size; i++)
++ printk("%c", debug_buf[i]);
++ }
++ for (i = 0; i < debug_point; i++)
++ printk("%c", debug_buf[i]);
++ spin_unlock(&debug_lock);
++}
++
++void dlm_debug_setup(int size)
++{
++ char *b = NULL;
++
++ if (size > PAGE_SIZE)
++ size = PAGE_SIZE;
++ if (size)
++ b = kmalloc(size, GFP_KERNEL);
++
++ spin_lock(&debug_lock);
++	if (debug_buf)
++		kfree(debug_buf);
++	/* Don't leave debug_buf dangling if the new allocation failed */
++	debug_buf = NULL;
++	debug_size = 0;
++	debug_point = 0;
++	debug_wrap = 0;
++	if (!size || !b)
++		goto out;
++ debug_size = size;
++ debug_point = 0;
++ debug_wrap = 0;
++ debug_buf = b;
++ memset(debug_buf, 0, debug_size);
++ out:
++ spin_unlock(&debug_lock);
++}
++
++static void dlm_debug_init(void)
++{
++ debug_buf = NULL;
++ debug_size = 0;
++ debug_point = 0;
++ debug_wrap = 0;
++ spin_lock_init(&debug_lock);
++
++ dlm_debug_setup(DLM_DEBUG_SIZE);
++}
++
++#ifdef CONFIG_PROC_FS
++int dlm_debug_info(char *b, char **start, off_t offset, int length)
++{
++ int i, n = 0;
++
++ spin_lock(&debug_lock);
++
++ if (debug_wrap) {
++ for (i = debug_point; i < debug_size; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++ }
++ for (i = 0; i < debug_point; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++
++ spin_unlock(&debug_lock);
++
++ return n;
++}
++
++int dlm_rcom_info(char *b, char **start, off_t offset, int length)
++{
++ gd_ls_t *ls;
++ gd_csb_t *csb;
++ int n = 0;
++
++ ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
++ if (!ls)
++ return 0;
++
++ n += sprintf(b + n, "nodeid names_send_count names_send_msgid "
++ "names_recv_count names_recv_msgid "
++ "locks_send_count locks_send_msgid "
++ "locks_recv_count locks_recv_msgid\n");
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ n += sprintf(b + n, "%u %u %u %u %u %u %u %u %u\n",
++ csb->csb_node->gn_nodeid,
++ csb->csb_names_send_count,
++ csb->csb_names_send_msgid,
++ csb->csb_names_recv_count,
++ csb->csb_names_recv_msgid,
++ csb->csb_locks_send_count,
++ csb->csb_locks_send_msgid,
++ csb->csb_locks_recv_count,
++ csb->csb_locks_recv_msgid);
++ }
++ return n;
++}
++#endif
++
++void dlm_proc_init(void)
++{
++#ifdef CONFIG_PROC_FS
++ debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO,
++ NULL);
++ if (!debug_proc_entry)
++ return;
++
++ debug_proc_entry->get_info = &dlm_debug_info;
++
++ rcom_proc_entry = create_proc_entry("cluster/dlm_rcom", S_IRUGO, NULL);
++ if (!rcom_proc_entry)
++ return;
++
++ rcom_proc_entry->get_info = &dlm_rcom_info;
++#endif
++ dlm_debug_init();
++
++#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
++ locks_proc_entry = create_proc_read_entry("cluster/dlm_locks",
++ S_IFREG | 0400,
++ NULL, NULL, NULL);
++ if (!locks_proc_entry)
++ return;
++ locks_proc_entry->proc_fops = &locks_fops;
++#endif
++}
++
++void dlm_proc_exit(void)
++{
++#ifdef CONFIG_PROC_FS
++ if (debug_proc_entry) {
++ remove_proc_entry("cluster/dlm_debug", NULL);
++ dlm_debug_setup(0);
++ }
++
++ if (rcom_proc_entry)
++ remove_proc_entry("cluster/dlm_rcom", NULL);
++#endif
++
++#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
++ if (locks_proc_entry)
++ remove_proc_entry("cluster/dlm_locks", NULL);
++#endif
++}
+diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c
+--- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/queries.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,697 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * queries.c
++ *
++ * This file provides the kernel query interface to the DLM.
++ *
++ */
++
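++/*
++ * Usage sketch (hypothetical caller: my_ast, my_arg and MAXLOCKS are
++ * made-up names). The lksb passed in must belong to an existing lock;
++ * its sb_lkid is used to locate the resource being queried.
++ *
++ *	struct dlm_queryinfo qinfo;
++ *	qinfo.gqi_resinfo  = &resinfo;		// resource counts/LVB out
++ *	qinfo.gqi_lockinfo = lockinfo;		// array of MAXLOCKS records
++ *	qinfo.gqi_locksize = MAXLOCKS;
++ *	dlm_query(ls, &lksb, DLM_QUERY_QUEUE_GRANT | DLM_QUERY_LOCKS_ALL,
++ *		  &qinfo, my_ast, my_arg);
++ *
++ * Completion is asynchronous: my_ast() fires once qinfo is filled in,
++ * and qinfo.gqi_lockcount then holds the number of locks returned.
++ */
++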
++#define EXPORT_SYMTAB
++#include <linux/module.h>
++
++#include "dlm_internal.h"
++#include "lockqueue.h"
++#include "locking.h"
++#include "lkb.h"
++#include "nodes.h"
++#include "dir.h"
++#include "ast.h"
++#include "memory.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "rsb.h"
++
++static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo);
++static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo);
++
++/*
++ * API entry point.
++ */
++int dlm_query(void *lockspace,
++ struct dlm_lksb *lksb,
++ int query,
++ struct dlm_queryinfo *qinfo,
++ void (ast_routine(void *)),
++ void *astarg)
++{
++ int status = -EINVAL;
++ gd_lkb_t *target_lkb;
++ gd_lkb_t *query_lkb = NULL; /* Our temporary LKB */
++ gd_ls_t *ls = (gd_ls_t *) find_lockspace_by_local_id(lockspace);
++
++
++ if (!qinfo)
++ goto out;
++ if (!ls)
++ goto out;
++ if (!ast_routine)
++ goto out;
++ if (!lksb)
++ goto out;
++
++ if (!qinfo->gqi_lockinfo)
++ qinfo->gqi_locksize = 0;
++
++ /* Find the lkid */
++ target_lkb = find_lock_by_id(ls, lksb->sb_lkid);
++ if (!target_lkb)
++ goto out;
++
++	/* If the user wants a list of locks that are blocking or not
++	   blocking this lock, then it must be waiting for something */
++ if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING ||
++ (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) &&
++ target_lkb->lkb_status == GDLM_LKSTS_GRANTED)
++		goto out;
++
++ /* We now allocate an LKB for our own use (so we can hang
++ * things like the AST routine and the lksb from it) */
++ lksb->sb_status = -EBUSY;
++ query_lkb = create_lkb(ls);
++ if (!query_lkb) {
++ status = -ENOMEM;
++ goto out;
++ }
++ query_lkb->lkb_astaddr = ast_routine;
++ query_lkb->lkb_astparam = (long)astarg;
++ query_lkb->lkb_resource = target_lkb->lkb_resource;
++ query_lkb->lkb_lksb = lksb;
++
++ /* Don't free the resource while we are querying it. This ref
++ * will be dropped when the LKB is freed */
++ hold_rsb(query_lkb->lkb_resource);
++
++ /* Fill in the stuff that's always local */
++ if (qinfo->gqi_resinfo) {
++ if (target_lkb->lkb_resource->res_nodeid)
++ qinfo->gqi_resinfo->rsi_masternode =
++ target_lkb->lkb_resource->res_nodeid;
++ else
++ qinfo->gqi_resinfo->rsi_masternode = our_nodeid();
++ qinfo->gqi_resinfo->rsi_length =
++ target_lkb->lkb_resource->res_length;
++ memcpy(qinfo->gqi_resinfo->rsi_name,
++ target_lkb->lkb_resource->res_name,
++ qinfo->gqi_resinfo->rsi_length);
++ }
++
++ /* If the master is local (or the user doesn't want the overhead of a
++ * remote call) - fill in the details here */
++ if (target_lkb->lkb_resource->res_nodeid == 0 ||
++ (query & DLM_QUERY_LOCAL)) {
++
++ status = 0;
++ /* Resource info */
++ if (qinfo->gqi_resinfo) {
++ query_resource(target_lkb->lkb_resource,
++ qinfo->gqi_resinfo);
++ }
++
++ /* Lock lists */
++ if (qinfo->gqi_lockinfo) {
++ status = query_locks(query, target_lkb, qinfo);
++ }
++
++ query_lkb->lkb_retstatus = status;
++ query_lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ queue_ast(query_lkb, GDLM_QUEUE_COMPAST, 0);
++ wake_astd();
++
++ /* An AST will be delivered so we must return success here */
++ status = 0;
++ goto out;
++ }
++
++ /* Remote master */
++ if (target_lkb->lkb_resource->res_nodeid != 0)
++ {
++ struct gd_remquery *remquery;
++ struct writequeue_entry *e;
++
++ /* Clear this cos the receiving end adds to it with
++ each incoming packet */
++ qinfo->gqi_lockcount = 0;
++
++ /* Squirrel a pointer to the query info struct
++ somewhere illegal */
++ query_lkb->lkb_request = (struct gd_remlockrequest *) qinfo;
++
++ e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid,
++ sizeof(struct gd_remquery),
++ ls->ls_allocation,
++ (char **) &remquery);
++ if (!e) {
++ status = -ENOBUFS;
++ goto out;
++ }
++
++ /* Build remote packet */
++ memset(remquery, 0, sizeof(struct gd_remquery));
++
++		remquery->rq_query = query;
++		remquery->rq_mstlkid = target_lkb->lkb_remid;
++		/* gqi_locksize was zeroed earlier when gqi_lockinfo is NULL */
++		remquery->rq_maxlocks = qinfo->gqi_locksize;
++
++ remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY;
++ remquery->rq_header.rh_flags = 0;
++ remquery->rq_header.rh_length = sizeof(struct gd_remquery);
++ remquery->rq_header.rh_lkid = query_lkb->lkb_id;
++ remquery->rq_header.rh_lockspace = ls->ls_global_id;
++
++ midcomms_send_buffer(&remquery->rq_header, e);
++ status = 0;
++ }
++
++ out:
++
++ return status;
++}
++
++static inline int valid_range(struct dlm_range *r)
++{
++ if (r->ra_start != 0ULL ||
++ r->ra_end != 0xFFFFFFFFFFFFFFFFULL)
++ return 1;
++ else
++ return 0;
++}
++
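++/* On-the-wire integers are little-endian; these helpers pack/unpack
++ * them at buf + *offp and advance the offset. */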
++static void put_int(int x, char *buf, int *offp)
++{
++ x = cpu_to_le32(x);
++ memcpy(buf + *offp, &x, sizeof(int));
++ *offp += sizeof(int);
++}
++
++static void put_int64(uint64_t x, char *buf, int *offp)
++{
++ x = cpu_to_le64(x);
++ memcpy(buf + *offp, &x, sizeof(uint64_t));
++ *offp += sizeof(uint64_t);
++}
++
++static int get_int(char *buf, int *offp)
++{
++ int value;
++ memcpy(&value, buf + *offp, sizeof(int));
++ *offp += sizeof(int);
++ return le32_to_cpu(value);
++}
++
++static uint64_t get_int64(char *buf, int *offp)
++{
++ uint64_t value;
++
++ memcpy(&value, buf + *offp, sizeof(uint64_t));
++ *offp += sizeof(uint64_t);
++ return le64_to_cpu(value);
++}
++
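++/* Fixed-size part of one serialised lock: four 32-bit fields (lkid,
++ * mstlkid, parent, node) plus four single bytes (state, grmode, rqmode
++ * and the range-present flag). Ranges, when sent, add four uint64_t. */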
++#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4)
++
++/* Called from recvd to get lock info for a remote node */
++int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
++{
++ struct gd_remquery *query = (struct gd_remquery *) msg;
++ struct gd_remqueryreply *reply;
++ struct dlm_resinfo resinfo;
++ struct dlm_queryinfo qinfo;
++ struct writequeue_entry *e;
++ char *buf;
++ gd_lkb_t *lkb;
++ int status = 0;
++ int bufidx;
++ int finished = 0;
++ int cur_lock = 0;
++ int start_lock = 0;
++
++ lkb = find_lock_by_id(ls, query->rq_mstlkid);
++ if (!lkb) {
++ status = -EINVAL;
++ goto send_error;
++ }
++
++ qinfo.gqi_resinfo = &resinfo;
++ qinfo.gqi_locksize = query->rq_maxlocks;
++
++ /* Get the resource bits */
++ query_resource(lkb->lkb_resource, &resinfo);
++
++ /* Now get the locks if wanted */
++ if (query->rq_maxlocks) {
++ qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks,
++ GFP_KERNEL);
++ if (!qinfo.gqi_lockinfo) {
++ status = -ENOMEM;
++ goto send_error;
++ }
++
++ status = query_locks(query->rq_query, lkb, &qinfo);
++ if (status && status != -E2BIG) {
++ kfree(qinfo.gqi_lockinfo);
++ goto send_error;
++ }
++ }
++ else {
++ qinfo.gqi_lockinfo = NULL;
++ qinfo.gqi_lockcount = 0;
++ }
++
++ /* Send as many blocks as needed for all the locks */
++ do {
++ int i;
++ int msg_len = sizeof(struct gd_remqueryreply);
++ int last_msg_len = msg_len; /* keeps compiler quiet */
++ int last_lock;
++
++ /* First work out how many locks we can fit into a block */
++ for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) {
++
++ last_msg_len = msg_len;
++
++ msg_len += LOCK_LEN;
++ if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) ||
++ valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) {
++
++ msg_len += sizeof(uint64_t) * 4;
++ }
++ }
++
++ /* There must be a neater way of doing this... */
++ if (msg_len > PAGE_SIZE) {
++ last_lock = i-1;
++ msg_len = last_msg_len;
++ }
++ else {
++ last_lock = i;
++ }
++
++ e = lowcomms_get_buffer(nodeid,
++ msg_len,
++ ls->ls_allocation,
++ (char **) &reply);
++ if (!e) {
++ kfree(qinfo.gqi_lockinfo);
++ status = -ENOBUFS;
++ goto out;
++ }
++
++ reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
++ reply->rq_header.rh_length = msg_len;
++ reply->rq_header.rh_lkid = msg->rh_lkid;
++ reply->rq_header.rh_lockspace = msg->rh_lockspace;
++
++ reply->rq_status = status;
++ reply->rq_startlock = cur_lock;
++ reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount;
++ reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount;
++ reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount;
++ memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN);
++
++ buf = (char *)reply;
++ bufidx = sizeof(struct gd_remqueryreply);
++
++ for (; cur_lock < last_lock; cur_lock++) {
++
++ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state;
++ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode;
++ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode;
++ put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx);
++ put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx);
++ put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx);
++ put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx);
++
++ if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) ||
++ valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) {
++
++ buf[bufidx++] = 1;
++ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx);
++ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx);
++ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx);
++ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx);
++ }
++ else {
++ buf[bufidx++] = 0;
++ }
++ }
++
++ if (cur_lock == qinfo.gqi_lockcount) {
++ reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
++ finished = 1;
++ }
++ else {
++ reply->rq_header.rh_flags = 0;
++ }
++
++ reply->rq_numlocks = cur_lock - start_lock;
++ start_lock = cur_lock;
++
++ midcomms_send_buffer(&reply->rq_header, e);
++ } while (!finished);
++
++ kfree(qinfo.gqi_lockinfo);
++ out:
++ return status;
++
++ send_error:
++ e = lowcomms_get_buffer(nodeid,
++ sizeof(struct gd_remqueryreply),
++ ls->ls_allocation,
++ (char **) &reply);
++ if (!e) {
++ status = -ENOBUFS;
++ goto out;
++ }
++ reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
++ reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; /* Don't support multiple blocks yet */
++ reply->rq_header.rh_length = sizeof(struct gd_remqueryreply);
++ reply->rq_header.rh_lkid = msg->rh_lkid;
++ reply->rq_header.rh_lockspace = msg->rh_lockspace;
++ reply->rq_status = status;
++ reply->rq_numlocks = 0;
++ reply->rq_startlock = 0;
++ reply->rq_grantcount = 0;
++ reply->rq_convcount = 0;
++ reply->rq_waitcount = 0;
++
++ midcomms_send_buffer(&reply->rq_header, e);
++
++ return status;
++}
++
++/* Reply to a remote query */
++int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
++{
++ gd_lkb_t *query_lkb;
++ struct dlm_queryinfo *qinfo;
++ struct gd_remqueryreply *reply;
++ char *buf;
++ int i;
++ int bufidx;
++
++ query_lkb = find_lock_by_id(ls, msg->rh_lkid);
++ if (!query_lkb)
++ return -EINVAL;
++
++ qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request;
++ reply = (struct gd_remqueryreply *) msg;
++
++ /* Copy the easy bits first */
++ qinfo->gqi_lockcount += reply->rq_numlocks;
++ if (qinfo->gqi_resinfo) {
++ qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount;
++ qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount;
++ qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount;
++ memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk,
++ DLM_LVB_LEN);
++ }
++
++ /* Now unpack the locks */
++ bufidx = sizeof(struct gd_remqueryreply);
++ buf = (char *) msg;
++
++ GDLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize,
++		    printk("start = %d, num = %d, max = %d\n",
++ reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize););
++
++ for (i = reply->rq_startlock;
++ i < reply->rq_startlock + reply->rq_numlocks; i++) {
++ qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++];
++ qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++];
++ qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++];
++ qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx);
++ if (buf[bufidx++]) {
++ qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx);
++ }
++ else {
++ qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL;
++ qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
++ qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL;
++ qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
++ }
++ }
++
++ /* If this was the last block then now tell the user */
++ if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) {
++ query_lkb->lkb_retstatus = reply->rq_status;
++ query_lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ queue_ast(query_lkb, GDLM_QUEUE_COMPAST, 0);
++ wake_astd();
++ }
++
++ return 0;
++}
++
++/* Aggregate resource information */
++static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo)
++{
++ struct list_head *tmp;
++
++
++ if (rsb->res_lvbptr)
++ memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN);
++
++ resinfo->rsi_grantcount = 0;
++ list_for_each(tmp, &rsb->res_grantqueue) {
++ resinfo->rsi_grantcount++;
++ }
++
++ resinfo->rsi_waitcount = 0;
++ list_for_each(tmp, &rsb->res_waitqueue) {
++ resinfo->rsi_waitcount++;
++ }
++
++ resinfo->rsi_convcount = 0;
++ list_for_each(tmp, &rsb->res_convertqueue) {
++ resinfo->rsi_convcount++;
++ }
++
++ return 0;
++}
++
++static int add_lock(gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
++{
++ int entry;
++
++ /* Don't fill it in if the buffer is full */
++ if (qinfo->gqi_lockcount == qinfo->gqi_locksize)
++ return -E2BIG;
++
++ /* gqi_lockcount contains the number of locks we have returned */
++ entry = qinfo->gqi_lockcount++;
++
++ /* Fun with master copies */
++ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
++ qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid;
++ qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id;
++ }
++ else {
++ qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id;
++ qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid;
++ }
++
++ /* Also make sure we always have a valid nodeid in there, the
++ calling end may not know which node "0" is */
++ if (lkb->lkb_nodeid)
++ qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid;
++ else
++ qinfo->gqi_lockinfo[entry].lki_node = our_nodeid();
++
++ if (lkb->lkb_parent)
++ qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id;
++ else
++ qinfo->gqi_lockinfo[entry].lki_parent = 0;
++
++ qinfo->gqi_lockinfo[entry].lki_state = lkb->lkb_status;
++ qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode;
++ qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode;
++
++ if (lkb->lkb_range) {
++ qinfo->gqi_lockinfo[entry].lki_grrange.ra_start =
++ lkb->lkb_range[GR_RANGE_START];
++ qinfo->gqi_lockinfo[entry].lki_grrange.ra_end =
++ lkb->lkb_range[GR_RANGE_END];
++ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start =
++ lkb->lkb_range[RQ_RANGE_START];
++ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end =
++ lkb->lkb_range[RQ_RANGE_END];
++ } else {
++		qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL;
++		qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xffffffffffffffffULL;
++		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL;
++		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xffffffffffffffffULL;
++ }
++ return 0;
++}
++
++static int query_lkb_queue(struct list_head *queue, int query,
++ struct dlm_queryinfo *qinfo)
++{
++ struct list_head *tmp;
++ int status = 0;
++ int mode = query & DLM_QUERY_MODE_MASK;
++
++ list_for_each(tmp, queue) {
++ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
++ int lkmode;
++
++ if (query & DLM_QUERY_RQMODE)
++ lkmode = lkb->lkb_rqmode;
++ else
++ lkmode = lkb->lkb_grmode;
++
++ /* Add the LKB info to the list if it matches the criteria in
++ * the query bitmap */
++ switch (query & DLM_QUERY_MASK) {
++ case DLM_QUERY_LOCKS_ALL:
++ status = add_lock(lkb, qinfo);
++ break;
++
++ case DLM_QUERY_LOCKS_HIGHER:
++ if (lkmode > mode)
++ status = add_lock(lkb, qinfo);
++ break;
++
++ case DLM_QUERY_LOCKS_EQUAL:
++ if (lkmode == mode)
++ status = add_lock(lkb, qinfo);
++ break;
++
++ case DLM_QUERY_LOCKS_LOWER:
++ if (lkmode < mode)
++ status = add_lock(lkb, qinfo);
++ break;
++ }
++ }
++ return status;
++}
++
++/*
++ * Return 1 if the locks' ranges overlap
++ * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
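++ * Example: granted [0,100] vs requested [50,150] overlap; granted [0,50]
++ * vs requested [50,150] do not, since touching endpoints count as disjoint.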
++ */
++static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
++{
++ if (!lkb1->lkb_range || !lkb2->lkb_range)
++ return 1;
++
++ if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] ||
++ lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END])
++ return 0;
++
++ return 1;
++}
++
++extern const int __dlm_compat_matrix[8][8];
++
++static int get_blocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
++{
++ struct list_head *tmp;
++ int status = 0;
++
++ list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
++ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
++
++ if (ranges_overlap(lkb, qlkb) &&
++ !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])
++ status = add_lock(lkb, qinfo);
++ }
++
++ return status;
++}
++
++static int get_nonblocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
++{
++ struct list_head *tmp;
++ int status = 0;
++
++ list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
++ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
++
++ if (!(ranges_overlap(lkb, qlkb) &&
++ !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]))
++ status = add_lock(lkb, qinfo);
++ }
++
++ return status;
++}
++
++/* Gather a list of appropriate locks */
++static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
++{
++ int status = 0;
++
++
++	/* Mask in the actual granted/requested mode of the lock if LOCK_THIS
++ * was requested as the mode
++ */
++ if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) {
++ query &= ~DLM_QUERY_MODE_MASK;
++ if (query & DLM_QUERY_RQMODE)
++ query |= lkb->lkb_rqmode;
++ else
++ query |= lkb->lkb_grmode;
++ }
++
++ qinfo->gqi_lockcount = 0;
++
++ /* BLOCKING/NOTBLOCK only look at the granted queue */
++ if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING)
++ return get_blocking_locks(lkb, qinfo);
++
++ if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK)
++ return get_nonblocking_locks(lkb, qinfo);
++
++ /* Do the lock queues that were requested */
++ if (query & DLM_QUERY_QUEUE_GRANT) {
++ status = query_lkb_queue(&lkb->lkb_resource->res_grantqueue,
++ query, qinfo);
++ }
++
++ if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) {
++ status = query_lkb_queue(&lkb->lkb_resource->res_convertqueue,
++ query, qinfo);
++ }
++
++ if (!status && (query & DLM_QUERY_QUEUE_WAIT)) {
++ status = query_lkb_queue(&lkb->lkb_resource->res_waitqueue,
++ query, qinfo);
++ }
++
++
++ return status;
++}
++
++EXPORT_SYMBOL(dlm_query);
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h
+--- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/queries.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __QUERIES_DOT_H__
++#define __QUERIES_DOT_H__
++
++extern int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
++extern int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
++
++#endif /* __QUERIES_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c
+--- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/rebuild.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,1246 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * Rebuild RSB's on new masters. Functions for transferring locks and
++ * subresources to new RSB masters during recovery.
++ */
++
++#include "dlm_internal.h"
++#include "reccomms.h"
++#include "lkb.h"
++#include "rsb.h"
++#include "nodes.h"
++#include "config.h"
++#include "memory.h"
++#include "recover.h"
++
++
++/* Types of entity serialised in remastering messages */
++#define REMASTER_ROOTRSB 1
++#define REMASTER_RSB 2
++#define REMASTER_LKB 3
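++
++/* A remastering message is a byte stream of [1-byte type][serialised
++ * entity] records: a root rsb, its lkb's, then each subrsb followed by
++ * that subrsb's lkb's (see pack_rsb_tree() below). */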
++
++struct rcom_fill {
++ char * outbuf; /* Beginning of data */
++ int offset; /* Current offset into outbuf */
++ int maxlen; /* Max value of offset */
++ int remasterid;
++ int count;
++ gd_res_t * rsb;
++ gd_res_t * subrsb;
++ gd_lkb_t * lkb;
++ struct list_head * lkbqueue;
++ char more;
++};
++typedef struct rcom_fill rcom_fill_t;
++
++
++struct rebuild_node {
++ struct list_head list;
++ int nodeid;
++ gd_res_t * rootrsb;
++};
++typedef struct rebuild_node rebuild_node_t;
++
++
++/*
++ * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new
++ * master. The rsb will be "done" with recovery when the new master has
++ * replied with all the new remote lockid's for this rsb's lkb's.
++ */
++
++void expect_new_lkids(gd_res_t *rsb)
++{
++ rsb->res_newlkid_expect = 0;
++ recover_list_add(rsb);
++}
++
++/*
++ * This function is called on root rsb or subrsb when another lkb is being sent
++ * to the new master for which we expect to receive a corresponding remote lkid
++ */
++
++void need_new_lkid(gd_res_t *rsb)
++{
++ gd_res_t *root = rsb;
++
++ if (rsb->res_parent)
++ root = rsb->res_root;
++
++ if (!root->res_newlkid_expect)
++ recover_list_add(root);
++ else
++ GDLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),);
++
++ root->res_newlkid_expect++;
++}
++
++/*
++ * This function is called for each lkb for which a new remote lkid is
++ * received. Decrement the expected number of remote lkids expected for the
++ * root rsb.
++ */
++
++void have_new_lkid(gd_lkb_t *lkb)
++{
++ gd_res_t *root = lkb->lkb_resource;
++
++ if (root->res_parent)
++ root = root->res_root;
++
++ down_write(&root->res_lock);
++
++ GDLM_ASSERT(root->res_newlkid_expect,
++ printk("newlkid_expect=%d\n", root->res_newlkid_expect););
++
++ root->res_newlkid_expect--;
++
++ if (!root->res_newlkid_expect) {
++ clear_bit(RESFL_NEW_MASTER, &root->res_flags);
++ recover_list_del(root);
++ }
++ up_write(&root->res_lock);
++}
++
++/*
++ * Return the rebuild struct for a node - will create an entry on the rootrsb
++ * list if necessary.
++ *
++ * Currently no locking is needed here as it all happens in the gdlm_recvd
++ * thread
++ */
++
++static rebuild_node_t *find_rebuild_root(gd_ls_t *ls, int nodeid)
++{
++ rebuild_node_t *node = NULL;
++
++ list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) {
++ if (node->nodeid == nodeid)
++ return node;
++ }
++
++ /* Not found, add one */
++ node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL);
++ if (!node)
++ return NULL;
++
++ node->nodeid = nodeid;
++ node->rootrsb = NULL;
++ list_add(&node->list, &ls->ls_rebuild_rootrsb_list);
++
++ return node;
++}
++
++/*
++ * Tidy up after a rebuild run. Called when all recovery has finished
++ */
++
++void rebuild_freemem(gd_ls_t *ls)
++{
++ rebuild_node_t *node = NULL, *s;
++
++ list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) {
++ list_del(&node->list);
++ kfree(node);
++ }
++}
++
++static void put_int(int x, char *buf, int *offp)
++{
++ x = cpu_to_le32(x);
++ memcpy(buf + *offp, &x, sizeof(int));
++ *offp += sizeof(int);
++}
++
++static void put_int64(uint64_t x, char *buf, int *offp)
++{
++ x = cpu_to_le64(x);
++ memcpy(buf + *offp, &x, sizeof(uint64_t));
++ *offp += sizeof(uint64_t);
++}
++
++static void put_bytes(char *x, int len, char *buf, int *offp)
++{
++ put_int(len, buf, offp);
++ memcpy(buf + *offp, x, len);
++ *offp += len;
++}
++
++static void put_char(char x, char *buf, int *offp)
++{
++ buf[*offp] = x;
++ *offp += 1;
++}
++
++static int get_int(char *buf, int *offp)
++{
++ int value;
++ memcpy(&value, buf + *offp, sizeof(int));
++ *offp += sizeof(int);
++ return le32_to_cpu(value);
++}
++
++static uint64_t get_int64(char *buf, int *offp)
++{
++ uint64_t value;
++
++ memcpy(&value, buf + *offp, sizeof(uint64_t));
++ *offp += sizeof(uint64_t);
++ return le64_to_cpu(value);
++}
++
++static char get_char(char *buf, int *offp)
++{
++ char x = buf[*offp];
++
++ *offp += 1;
++ return x;
++}
++
++static void get_bytes(char *bytes, int *len, char *buf, int *offp)
++{
++ *len = get_int(buf, offp);
++ memcpy(bytes, buf + *offp, *len);
++ *offp += *len;
++}
++
++static int lkb_length(gd_lkb_t *lkb)
++{
++ int len = 0;
++
++ len += sizeof(int); /* lkb_id */
++	len += sizeof(int);	/* lkb_resource->res_remasterid */
++ len += sizeof(int); /* lkb_flags */
++ len += sizeof(int); /* lkb_status */
++ len += sizeof(char); /* lkb_rqmode */
++ len += sizeof(char); /* lkb_grmode */
++ len += sizeof(int); /* lkb_childcnt */
++ len += sizeof(int); /* lkb_parent->lkb_id */
++ len += sizeof(int); /* lkb_bastaddr */
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
++ len += sizeof(int); /* number of lvb bytes */
++ len += DLM_LVB_LEN;
++ }
++
++ if (lkb->lkb_range) {
++ len += sizeof(uint64_t);
++ len += sizeof(uint64_t);
++ if (lkb->lkb_status == GDLM_LKSTS_CONVERT) {
++ len += sizeof(uint64_t);
++ len += sizeof(uint64_t);
++ }
++ }
++
++ return len;
++}
++
++/*
++ * It's up to the caller to be sure there's enough space in the buffer.
++ */
++
++static void serialise_lkb(gd_lkb_t *lkb, char *buf, int *offp)
++{
++ int flags;
++
++ /* Need to tell the remote end if we have a range */
++ flags = lkb->lkb_flags;
++ if (lkb->lkb_range)
++ flags |= GDLM_LKFLG_RANGE;
++
++ /*
++ * See lkb_length()
++ * Total: 30 (no lvb) or 66 (with lvb) bytes
++ */
++
++ put_int(lkb->lkb_id, buf, offp);
++ put_int(lkb->lkb_resource->res_remasterid, buf, offp);
++ put_int(flags, buf, offp);
++ put_int(lkb->lkb_status, buf, offp);
++ put_char(lkb->lkb_rqmode, buf, offp);
++ put_char(lkb->lkb_grmode, buf, offp);
++ put_int(atomic_read(&lkb->lkb_childcnt), buf, offp);
++
++ if (lkb->lkb_parent)
++ put_int(lkb->lkb_parent->lkb_id, buf, offp);
++ else
++ put_int(0, buf, offp);
++
++ if (lkb->lkb_bastaddr)
++ put_int(1, buf, offp);
++ else
++ put_int(0, buf, offp);
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
++ GDLM_ASSERT(lkb->lkb_lvbptr,);
++ put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp);
++ }
++
++ /* Only send the range we actually need */
++ if (lkb->lkb_range) {
++ switch (lkb->lkb_status) {
++ case GDLM_LKSTS_CONVERT:
++ put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
++ put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
++ put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
++ put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
++ break;
++ case GDLM_LKSTS_WAITING:
++ put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
++ put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
++ break;
++ case GDLM_LKSTS_GRANTED:
++ put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
++ put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
++ break;
++ default:
++ GDLM_ASSERT(0,);
++ }
++ }
++}
++
++static int rsb_length(gd_res_t *rsb)
++{
++ int len = 0;
++
++ len += sizeof(int); /* number of res_name bytes */
++ len += rsb->res_length; /* res_name */
++ len += sizeof(int); /* res_remasterid */
++ len += sizeof(int); /* res_parent->res_remasterid */
++
++ return len;
++}
++
++static inline gd_res_t *next_subrsb(gd_res_t *subrsb)
++{
++ struct list_head *tmp;
++ gd_res_t *r;
++
++ tmp = subrsb->res_subreslist.next;
++ r = list_entry(tmp, gd_res_t, res_subreslist);
++
++ return r;
++}
++
++static inline int last_in_list(gd_res_t *r, struct list_head *head)
++{
++ gd_res_t *last = list_entry(head->prev, gd_res_t, res_subreslist);
++
++ if (last == r)
++ return 1;
++ return 0;
++}
++
++/*
++ * Used to decide if an rsb should be rebuilt on a new master. An rsb only
++ * needs to be rebuilt if we have lkb's queued on it. NOREBUILD lkb's on the
++ * wait queue are not rebuilt.
++ */
++
++static int lkbs_to_remaster(gd_res_t *r)
++{
++ gd_lkb_t *lkb;
++ gd_res_t *sub;
++
++ if (!list_empty(&r->res_grantqueue) ||
++ !list_empty(&r->res_convertqueue))
++ return TRUE;
++
++ list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
++ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
++ continue;
++ return TRUE;
++ }
++
++ list_for_each_entry(sub, &r->res_subreslist, res_subreslist) {
++ if (!list_empty(&sub->res_grantqueue) ||
++ !list_empty(&sub->res_convertqueue))
++ return TRUE;
++
++ list_for_each_entry(lkb, &sub->res_waitqueue, lkb_statequeue) {
++ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
++ continue;
++ return TRUE;
++ }
++ }
++
++ return FALSE;
++}
++
++static void serialise_rsb(gd_res_t *rsb, char *buf, int *offp)
++{
++ /*
++ * See rsb_length()
++ * Total: 36 bytes (4 + 24 + 4 + 4)
++ */
++
++ put_bytes(rsb->res_name, rsb->res_length, buf, offp);
++ put_int(rsb->res_remasterid, buf, offp);
++
++ if (rsb->res_parent)
++ put_int(rsb->res_parent->res_remasterid, buf, offp);
++ else
++ put_int(0, buf, offp);
++
++ GDLM_ASSERT(!rsb->res_lvbptr,);
++}
++
++/*
++ * Flatten an LKB into a buffer for sending to the new RSB master. As a
++ * side-effect the nodeid of the lock is set to the nodeid of the new RSB
++ * master.
++ */
++
++static int pack_one_lkb(gd_res_t *r, gd_lkb_t *lkb, rcom_fill_t *fill)
++{
++ if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen)
++ goto nospace;
++
++ lkb->lkb_nodeid = r->res_nodeid;
++
++ put_char(REMASTER_LKB, fill->outbuf, &fill->offset);
++ serialise_lkb(lkb, fill->outbuf, &fill->offset);
++
++ fill->count++;
++ need_new_lkid(r);
++ return 0;
++
++ nospace:
++ return -ENOSPC;
++}
++
++/*
++ * Pack all LKB's from a given queue, except for those with the NOREBUILD flag.
++ */
++
++static int pack_lkb_queue(gd_res_t *r, struct list_head *queue,
++ rcom_fill_t *fill)
++{
++ gd_lkb_t *lkb;
++ int error;
++
++ list_for_each_entry(lkb, queue, lkb_statequeue) {
++ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
++ continue;
++
++ error = pack_one_lkb(r, lkb, fill);
++ if (error)
++ goto nospace;
++ }
++
++ return 0;
++
++ nospace:
++ fill->lkb = lkb;
++ fill->lkbqueue = queue;
++
++ return error;
++}
++
++static int pack_lkb_queues(gd_res_t *r, rcom_fill_t *fill)
++{
++ int error;
++
++ error = pack_lkb_queue(r, &r->res_grantqueue, fill);
++ if (error)
++ goto nospace;
++
++ error = pack_lkb_queue(r, &r->res_convertqueue, fill);
++ if (error)
++ goto nospace;
++
++ error = pack_lkb_queue(r, &r->res_waitqueue, fill);
++
++ nospace:
++ return error;
++}
++
++/*
++ * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb
++ * queue and full lkb queues.
++ */
++
++static int pack_lkb_remaining(gd_res_t *r, rcom_fill_t *fill)
++{
++ struct list_head *tmp, *start, *end;
++ gd_lkb_t *lkb;
++ int error;
++
++ /*
++ * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue.
++ */
++
++ error = pack_one_lkb(r, fill->lkb, fill);
++ if (error)
++ goto out;
++
++ start = fill->lkb->lkb_statequeue.next;
++ end = fill->lkbqueue;
++
++ for (tmp = start; tmp != end; tmp = tmp->next) {
++ lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
++
++ error = pack_one_lkb(r, lkb, fill);
++ if (error) {
++ fill->lkb = lkb;
++ goto out;
++ }
++ }
++
++ /*
++ * Pack all lkb's on r's queues following fill->lkbqueue.
++ */
++
++ if (fill->lkbqueue == &r->res_waitqueue)
++ goto out;
++ if (fill->lkbqueue == &r->res_convertqueue)
++ goto skip;
++
++ GDLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,);
++
++ error = pack_lkb_queue(r, &r->res_convertqueue, fill);
++ if (error)
++ goto out;
++ skip:
++ error = pack_lkb_queue(r, &r->res_waitqueue, fill);
++
++ out:
++ return error;
++}
++
++static int pack_one_subrsb(gd_res_t *rsb, gd_res_t *subrsb, rcom_fill_t *fill)
++{
++ int error;
++
++ down_write(&subrsb->res_lock);
++
++ if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen)
++ goto nospace;
++
++ subrsb->res_nodeid = rsb->res_nodeid;
++ subrsb->res_remasterid = ++fill->remasterid;
++
++ put_char(REMASTER_RSB, fill->outbuf, &fill->offset);
++ serialise_rsb(subrsb, fill->outbuf, &fill->offset);
++
++ error = pack_lkb_queues(subrsb, fill);
++ if (error)
++ goto nospace;
++
++ up_write(&subrsb->res_lock);
++
++ return 0;
++
++ nospace:
++ up_write(&subrsb->res_lock);
++ fill->subrsb = subrsb;
++
++ return -ENOSPC;
++}
++
++static int pack_subrsbs(gd_res_t *rsb, gd_res_t *in_subrsb, rcom_fill_t *fill)
++{
++ gd_res_t *subrsb;
++ int error = 0;
++
++ /*
++ * When an initial subrsb is given, we know it needs to be packed.
++ * When no initial subrsb is given, begin with the first (if any exist).
++ */
++
++ if (!in_subrsb) {
++ if (list_empty(&rsb->res_subreslist))
++ goto out;
++
++ subrsb = list_entry(rsb->res_subreslist.next, gd_res_t,
++ res_subreslist);
++ } else
++ subrsb = in_subrsb;
++
++ for (;;) {
++ error = pack_one_subrsb(rsb, subrsb, fill);
++ if (error)
++ goto out;
++
++ if (last_in_list(subrsb, &rsb->res_subreslist))
++ break;
++
++ subrsb = next_subrsb(subrsb);
++ }
++
++ out:
++ return error;
++}
++
++/*
++ * Finish packing whatever is left in an rsb tree. If space runs out while
++ * finishing, save subrsb/lkb and this will be called again for the same rsb.
++ *
++ * !subrsb && lkb, we left off part way through root rsb's lkbs.
++ * subrsb && !lkb, we left off just before starting a new subrsb.
++ * subrsb && lkb, we left off part way through a subrsb's lkbs.
++ * !subrsb && !lkb, we shouldn't be in this function, but starting
++ * a new rsb in pack_rsb_tree().
++ */
++
++static int pack_rsb_tree_remaining(gd_ls_t *ls, gd_res_t *rsb,
++ rcom_fill_t *fill)
++{
++ gd_res_t *subrsb = NULL;
++ int error = 0;
++
++ if (!fill->subrsb && fill->lkb) {
++ error = pack_lkb_remaining(rsb, fill);
++ if (error)
++ goto out;
++
++ error = pack_subrsbs(rsb, NULL, fill);
++ if (error)
++ goto out;
++ }
++
++ else if (fill->subrsb && !fill->lkb) {
++ error = pack_subrsbs(rsb, fill->subrsb, fill);
++ if (error)
++ goto out;
++ }
++
++ else if (fill->subrsb && fill->lkb) {
++ error = pack_lkb_remaining(fill->subrsb, fill);
++ if (error)
++ goto out;
++
++ if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist))
++ goto out;
++
++ subrsb = next_subrsb(fill->subrsb);
++
++ error = pack_subrsbs(rsb, subrsb, fill);
++ if (error)
++ goto out;
++ }
++
++ fill->subrsb = NULL;
++ fill->lkb = NULL;
++
++ out:
++ return error;
++}
++
++/*
++ * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a
++ * buffer. When the buffer runs out of space, save the place to restart (the
++ * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit).
++ */
++
++static int pack_rsb_tree(gd_ls_t *ls, gd_res_t *rsb, rcom_fill_t *fill)
++{
++ int error = -ENOSPC;
++
++ fill->remasterid = 0;
++
++ /*
++ * Pack the root rsb itself. A 1 byte type precedes the serialised
++ * rsb. Then pack the lkb's for the root rsb.
++ */
++
++ down_write(&rsb->res_lock);
++
++ if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen)
++ goto out;
++
++ rsb->res_remasterid = ++fill->remasterid;
++ put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset);
++ serialise_rsb(rsb, fill->outbuf, &fill->offset);
++
++ error = pack_lkb_queues(rsb, fill);
++ if (error)
++ goto out;
++
++ up_write(&rsb->res_lock);
++
++ /*
++ * Pack subrsb/lkb's under the root rsb.
++ */
++
++ error = pack_subrsbs(rsb, NULL, fill);
++
++ return error;
++
++ out:
++ up_write(&rsb->res_lock);
++ return error;
++}
++
++/*
++ * Given an RSB, return the next RSB that should be sent to a new master.
++ */
++
++static gd_res_t *next_remastered_rsb(gd_ls_t *ls, gd_res_t *rsb)
++{
++ struct list_head *tmp, *start, *end;
++ gd_res_t *r;
++
++ if (!rsb)
++ start = ls->ls_rootres.next;
++ else
++ start = rsb->res_rootlist.next;
++
++ end = &ls->ls_rootres;
++
++ for (tmp = start; tmp != end; tmp = tmp->next) {
++ r = list_entry(tmp, gd_res_t, res_rootlist);
++
++ if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) {
++ if (r->res_nodeid && lkbs_to_remaster(r)) {
++ expect_new_lkids(r);
++ return r;
++ } else
++ clear_bit(RESFL_NEW_MASTER, &r->res_flags);
++ }
++ }
++
++ return NULL;
++}
++
++/*
++ * Given an rcom buffer, fill it with RSB's that need to be sent to a single
++ * new master node. In the case where all the data to send to one node
++ * requires multiple messages, this function needs to resume filling each
++ * successive buffer from the point where it left off when the previous buffer
++ * filled up.
++ */
++
++static void fill_rcom_buffer(gd_ls_t *ls, rcom_fill_t *fill, uint32_t *nodeid)
++{
++ gd_res_t *rsb, *prev_rsb = fill->rsb;
++ int error;
++
++ fill->offset = 0;
++
++ if (!prev_rsb) {
++
++ /*
++ * The first time this function is called.
++ */
++
++ rsb = next_remastered_rsb(ls, NULL);
++ if (!rsb)
++ goto no_more;
++
++ } else if (fill->subrsb || fill->lkb) {
++
++ /*
++ * Continue packing an rsb tree that was partially packed last
++ * time (fill->subrsb/lkb indicates where packing of last block
++ * left off)
++ */
++
++ rsb = prev_rsb;
++ *nodeid = rsb->res_nodeid;
++
++ error = pack_rsb_tree_remaining(ls, rsb, fill);
++ if (error == -ENOSPC)
++ goto more;
++
++ rsb = next_remastered_rsb(ls, prev_rsb);
++ if (!rsb)
++ goto no_more;
++
++ if (rsb->res_nodeid != prev_rsb->res_nodeid)
++ goto more;
++ } else {
++ rsb = prev_rsb;
++ }
++
++ /*
++ * Pack rsb trees into the buffer until we run out of space, run out of
++ * new rsb's or hit a new nodeid.
++ */
++
++ *nodeid = rsb->res_nodeid;
++
++ for (;;) {
++ error = pack_rsb_tree(ls, rsb, fill);
++ if (error == -ENOSPC)
++ goto more;
++
++ prev_rsb = rsb;
++
++ rsb = next_remastered_rsb(ls, prev_rsb);
++ if (!rsb)
++ goto no_more;
++
++ if (rsb->res_nodeid != prev_rsb->res_nodeid)
++ goto more;
++ }
++
++ more:
++ fill->more = 1;
++ fill->rsb = rsb;
++ return;
++
++ no_more:
++ fill->more = 0;
++}
++
++/*
++ * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters.
++ */
++
++int rebuild_rsbs_send(gd_ls_t *ls)
++{
++ gd_rcom_t *rc;
++ rcom_fill_t fill;
++ uint32_t nodeid;
++ int error;
++
++ GDLM_ASSERT(recover_list_empty(ls),);
++
++ log_all(ls, "rebuild locks");
++
++ error = -ENOMEM;
++ rc = allocate_rcom_buffer(ls);
++ if (!rc)
++ goto ret;
++
++ error = 0;
++ memset(&fill, 0, sizeof(rcom_fill_t));
++ fill.outbuf = rc->rc_buf;
++ fill.maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
++
++ do {
++ fill_rcom_buffer(ls, &fill, &nodeid);
++ if (!fill.offset)
++ break;
++
++ rc->rc_datalen = fill.offset;
++ error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0);
++ if (error)
++ goto out;
++
++ schedule();
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto out;
++	} while (fill.more);
++
++ error = gdlm_wait_function(ls, &recover_list_empty);
++
++ log_all(ls, "rebuilt %d locks", fill.count);
++
++ out:
++ rebuild_freemem(ls);
++ free_rcom_buffer(rc);
++
++ ret:
++ return error;
++}
++
++static gd_res_t *find_by_remasterid(gd_ls_t *ls, int remasterid,
++ gd_res_t *rootrsb)
++{
++ gd_res_t *rsb;
++
++ GDLM_ASSERT(rootrsb,);
++
++ if (rootrsb->res_remasterid == remasterid) {
++ rsb = rootrsb;
++ goto out;
++ }
++
++ list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
++ if (rsb->res_remasterid == remasterid)
++ goto out;
++ }
++ rsb = NULL;
++
++ out:
++ return rsb;
++}
++
++/*
++ * Search a queue for the given remote lock id (remlkid).
++ */
++
++static gd_lkb_t *search_remlkid(struct list_head *statequeue, int nodeid,
++ int remid)
++{
++ gd_lkb_t *lkb;
++
++ list_for_each_entry(lkb, statequeue, lkb_statequeue) {
++ if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) {
++ return lkb;
++ }
++ }
++
++ return NULL;
++}
++
++/*
++ * Given a remote lock ID (and a parent resource), return the local LKB for it.
++ * Hopefully we don't need to do this too often on deep lock trees.  This is
++ * VERY suboptimal for anything but the smallest lock trees.  It searches the
++ * lock tree for an LKB with the remote id "remid" and the node "nodeid" and
++ * returns the LKB address.  OPTIMISATION: we should keep a list of these while
++ * we are building up the remastered LKBs.
++ */
++
++static gd_lkb_t *find_by_remlkid(gd_res_t *rootrsb, int nodeid, int remid)
++{
++ gd_lkb_t *lkb;
++ gd_res_t *rsb;
++
++ lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++
++ lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++
++ lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++
++ list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
++ lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++
++ lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++
++ lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++ }
++ lkb = NULL;
++
++ out:
++ return lkb;
++}
++
++/*
++ * Unpack an LKB from a remaster operation
++ */
++
++static int deserialise_lkb(gd_ls_t *ls, int rem_nodeid, gd_res_t *rootrsb,
++ char *buf, int *ptr, char *outbuf, int *outoffp)
++{
++ gd_lkb_t *lkb;
++ gd_res_t *rsb;
++ int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp;
++
++ remote_lkid = get_int(buf, ptr);
++
++ rsb_rmid = get_int(buf, ptr);
++ rsb = find_by_remasterid(ls, rsb_rmid, rootrsb);
++ GDLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid););
++
++ /*
++ * We could have received this lkb already from a previous recovery
++ * that was interrupted. If so, just return the lkid to the remote
++ * node.
++ */
++ lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid);
++ if (lkb)
++ goto put_lkid;
++
++ lkb = create_lkb(rsb->res_ls);
++ if (!lkb)
++ goto out;
++
++ lkb->lkb_remid = remote_lkid;
++ lkb->lkb_flags = get_int(buf, ptr);
++ status = get_int(buf, ptr);
++ lkb->lkb_rqmode = get_char(buf, ptr);
++ lkb->lkb_grmode = get_char(buf, ptr);
++ atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr));
++
++ parentid = get_int(buf, ptr);
++ lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr);
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
++ lkb->lkb_lvbptr = allocate_lvb(ls);
++ if (!lkb->lkb_lvbptr)
++ goto out;
++ get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr);
++ }
++
++ if (lkb->lkb_flags & GDLM_LKFLG_RANGE) {
++ uint64_t start, end;
++
++ /* Don't need to keep the range flag, for comms use only */
++ lkb->lkb_flags &= ~GDLM_LKFLG_RANGE;
++ start = get_int64(buf, ptr);
++ end = get_int64(buf, ptr);
++
++ lkb->lkb_range = allocate_range(rsb->res_ls);
++ if (!lkb->lkb_range)
++ goto out;
++
++		switch (status) {
++		case GDLM_LKSTS_CONVERT:
++			/* A converting lock carries two ranges: the
++			   requested range first, then the granted range. */
++			lkb->lkb_range[RQ_RANGE_START] = start;
++			lkb->lkb_range[RQ_RANGE_END] = end;
++			start = get_int64(buf, ptr);
++			end = get_int64(buf, ptr);
++			lkb->lkb_range[GR_RANGE_START] = start;
++			lkb->lkb_range[GR_RANGE_END] = end;
++			break;
++
++		case GDLM_LKSTS_WAITING:
++			lkb->lkb_range[RQ_RANGE_START] = start;
++			lkb->lkb_range[RQ_RANGE_END] = end;
++			break;
++
++ case GDLM_LKSTS_GRANTED:
++ lkb->lkb_range[GR_RANGE_START] = start;
++ lkb->lkb_range[GR_RANGE_END] = end;
++ break;
++ default:
++ GDLM_ASSERT(0,);
++ }
++ }
++
++ /* Resolve local lock LKB address from parent ID */
++ if (parentid)
++ lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid,
++ parentid);
++
++ atomic_inc(&rsb->res_ref);
++ lkb->lkb_resource = rsb;
++
++ lkb->lkb_flags |= GDLM_LKFLG_MSTCPY;
++ lkb->lkb_nodeid = rem_nodeid;
++
++ /*
++ * Put the lkb on an RSB queue. An lkb that's in the midst of a
++ * conversion request (on the requesting node's lockqueue and has
++ * LQCONVERT set) should be put on the granted queue. The convert
++ * request will be resent by the requesting node.
++ */
++
++ if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) {
++ lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
++ GDLM_ASSERT(status == GDLM_LKSTS_CONVERT,
++ printk("status=%d\n", status););
++ lkb->lkb_rqmode = DLM_LOCK_IV;
++ status = GDLM_LKSTS_GRANTED;
++ }
++
++ lkb_enqueue(rsb, lkb, status);
++
++ /*
++ * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL).
++ */
++
++ if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK)
++ && lkb->lkb_grmode > DLM_LOCK_NL) {
++ if (!rsb->res_lvbptr)
++ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
++ if (!rsb->res_lvbptr)
++ goto out;
++ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
++ }
++
++ /*
++ * Clear flags that may have been sent over that are only relevant in
++ * the context of the sender.
++ */
++
++ lkb->lkb_flags &= ~(GDLM_LKFLG_DELAST | GDLM_LKFLG_DELETED |
++ GDLM_LKFLG_LQRESEND | GDLM_LKFLG_NOREBUILD |
++ GDLM_LKFLG_DEMOTED);
++
++ put_lkid:
++ /* Return the new LKID to the caller's buffer */
++ put_int(lkb->lkb_id, outbuf, outoffp);
++ put_int(lkb->lkb_remid, outbuf, outoffp);
++ error = 0;
++
++ out:
++ return error;
++}
++
++static gd_res_t *deserialise_rsb(gd_ls_t *ls, int nodeid, gd_res_t *rootrsb,
++ char *buf, int *ptr)
++{
++ int length;
++ int remasterid;
++ int parent_remasterid;
++ char name[DLM_RESNAME_MAXLEN];
++ int error;
++ gd_res_t *parent = NULL;
++ gd_res_t *rsb;
++
++ get_bytes(name, &length, buf, ptr);
++ remasterid = get_int(buf, ptr);
++ parent_remasterid = get_int(buf, ptr);
++
++ if (parent_remasterid)
++ parent = find_by_remasterid(ls, parent_remasterid, rootrsb);
++
++ /*
++ * The rsb reference from this find_or_create_rsb() will keep the rsb
++ * around while we add new lkb's to it from deserialise_lkb. Each of
++ * the lkb's will add an rsb reference. The reference added here is
++ * removed by release_rsb() after all lkb's are added.
++ */
++
++ error = find_or_create_rsb(ls, parent, name, length, 1, &rsb);
++ GDLM_ASSERT(!error,);
++
++	/* find_or_create_rsb() may have just created the RSB, in which case
++	   it has no master assigned yet and we become the master. */
++ if (rsb->res_nodeid == -1)
++ rsb->res_nodeid = our_nodeid();
++
++ rsb->res_remasterid = remasterid;
++
++ return rsb;
++}
++
++/*
++ * Processing at the receiving end of a NEWLOCKS message from a node in
++ * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote
++ * node whose locks we are now mastering. For a reply we need to send back the
++ * new lockids of the remastered locks so that remote ops can find them.
++ */
++
++int rebuild_rsbs_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
++{
++ gd_rcom_t *rc;
++ gd_res_t *rsb = NULL;
++ rebuild_node_t *rnode;
++ char *outbuf;
++ int outptr, ptr = 0, error = -ENOMEM;
++
++ rnode = find_rebuild_root(ls, nodeid);
++ if (!rnode)
++ goto out;
++
++ /*
++ * Allocate a buffer for the reply message which is a list of remote
++ * lock IDs and their (new) local lock ids. It will always be big
++ * enough to fit <n> ID pairs if it already fit <n> LKBs.
++ */
++
++ rc = allocate_rcom_buffer(ls);
++ if (!rc)
++ goto out;
++ outbuf = rc->rc_buf;
++ outptr = 0;
++
++ /*
++ * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're
++ * created. Each deserialise_rsb adds an rsb reference that must be
++ * removed with release_rsb once all new lkb's for an rsb have been
++ * added.
++ */
++
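++	/*
++	 * A sketch of the record stream, inferred from pack_rsb_tree() and
++	 * the deserialise routines below (type tags are 1 byte, ints 4):
++	 *
++	 *   [REMASTER_ROOTRSB] [name] [remasterid] [parent remasterid]
++	 *   [REMASTER_LKB] [remote lkid] [rsb remasterid] [flags] [status] ...
++	 *   [REMASTER_RSB] ... any subrsbs followed by their own lkb's ...
++	 */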
++ while (ptr < len) {
++ int type;
++
++ type = get_char(buf, &ptr);
++
++ switch (type) {
++ case REMASTER_ROOTRSB:
++ if (rsb)
++ release_rsb(rsb);
++ rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
++ &ptr);
++ rnode->rootrsb = rsb;
++ break;
++
++ case REMASTER_RSB:
++ if (rsb)
++ release_rsb(rsb);
++ rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
++ &ptr);
++ break;
++
++ case REMASTER_LKB:
++ deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr,
++ outbuf, &outptr);
++ break;
++
++ default:
++ GDLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d "
++ "len=%d\n", type, nodeid, ptr,
++ len););
++ }
++ }
++
++ if (rsb)
++ release_rsb(rsb);
++
++ /*
++ * Reply with the new lock IDs.
++ */
++
++ rc->rc_datalen = outptr;
++ error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0);
++
++ free_rcom_buffer(rc);
++
++ out:
++ return error;
++}
++
++/*
++ * Processing for a NEWLOCKIDS message. Called when we get the reply from the
++ * new master telling us what the new remote lock IDs are for the remastered
++ * locks.
++ */
++
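++/*
++ * A sketch of the reply payload, as written by the put_int() pairs at the
++ * end of deserialise_lkb() on the new master: a flat list of 8-byte pairs,
++ *
++ *	[new lkid on the master (4 bytes)] [our original lkid (4 bytes)] ...
++ *
++ * so each pair maps one of our locks to its new remote id.
++ */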
++int rebuild_rsbs_lkids_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
++{
++ int offset = 0;
++
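++	/* rcom_send_message() pads an empty payload out to one byte, so a
++	   len of 1 means there is no data */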
++ if (len == 1)
++ len = 0;
++
++ while (offset < len) {
++ int remote_id;
++ int local_id;
++ gd_lkb_t *lkb;
++
++ if (offset + 8 > len) {
++ log_error(ls, "rebuild_rsbs_lkids_recv: bad data "
++ "length nodeid=%d offset=%d len=%d",
++ nodeid, offset, len);
++ break;
++ }
++
++ remote_id = get_int(buf, &offset);
++ local_id = get_int(buf, &offset);
++
++ lkb = find_lock_by_id(ls, local_id);
++ if (lkb) {
++ lkb->lkb_remid = remote_id;
++ have_new_lkid(lkb);
++ } else {
++ log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid "
++ "nodeid=%d id=%x remid=%x offset=%d len=%d",
++ nodeid, local_id, remote_id, offset, len);
++ }
++ }
++
++ if (recover_list_empty(ls))
++ wake_up(&ls->ls_wait_general);
++
++ return 0;
++}
+diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h
+--- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/rebuild.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __REBUILD_DOT_H__
++#define __REBUILD_DOT_H__
++
++int rebuild_rsbs_send(gd_ls_t * ls);
++int rebuild_rsbs_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
++int rebuild_rsbs_lkids_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
++int rebuild_freemem(gd_ls_t * ls);
++
++#endif /* __REBUILD_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c
+--- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/reccomms.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,502 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "reccomms.h"
++#include "nodes.h"
++#include "lockspace.h"
++#include "recover.h"
++#include "dir.h"
++#include "config.h"
++#include "rebuild.h"
++#include "memory.h"
++
++/* Running on the basis that only a single recovery communication will be done
++ * at a time per lockspace */
++
++static void rcom_process_message(gd_ls_t * ls, uint32_t nodeid, gd_rcom_t * rc);
++
++/*
++ * Track per-node progress/stats during recovery to help debugging.
++ */
++
++void rcom_log(gd_ls_t *ls, int nodeid, gd_rcom_t *rc, int send)
++{
++ gd_csb_t *csb;
++ int found = 0;
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ if (csb->csb_node->gn_nodeid == nodeid) {
++ found = TRUE;
++ break;
++ }
++ }
++
++ if (!found)
++ return;
++
++ if (rc->rc_subcmd == RECCOMM_RECOVERNAMES) {
++ if (send) {
++ csb->csb_names_send_count++;
++ csb->csb_names_send_msgid = rc->rc_msgid;
++ } else {
++ csb->csb_names_recv_count++;
++ csb->csb_names_recv_msgid = rc->rc_msgid;
++ }
++ } else if (rc->rc_subcmd == RECCOMM_NEWLOCKS) {
++ if (send) {
++ csb->csb_locks_send_count++;
++ csb->csb_locks_send_msgid = rc->rc_msgid;
++ } else {
++ csb->csb_locks_recv_count++;
++ csb->csb_locks_recv_msgid = rc->rc_msgid;
++ }
++ }
++}
++
++void rcom_log_clear(gd_ls_t *ls)
++{
++ gd_csb_t *csb;
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ csb->csb_names_send_count = 0;
++ csb->csb_names_send_msgid = 0;
++ csb->csb_names_recv_count = 0;
++ csb->csb_names_recv_msgid = 0;
++ csb->csb_locks_send_count = 0;
++ csb->csb_locks_send_msgid = 0;
++ csb->csb_locks_recv_count = 0;
++ csb->csb_locks_recv_msgid = 0;
++ }
++}
++
++static int rcom_response(gd_ls_t *ls)
++{
++ return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
++}
++
++/**
++ * rcom_send_message - send or request recovery data
++ * @ls: the lockspace
++ * @nodeid: node to which the message is sent
++ * @type: type of recovery message
++ * @rc: the rc buffer to send
++ * @need_reply: wait for reply if this is set
++ *
++ * Using this interface
++ * i) Allocate an rc buffer:
++ * rc = allocate_rcom_buffer(ls);
++ * ii) Copy data to send beginning at rc->rc_buf:
++ * memcpy(rc->rc_buf, mybuf, mylen);
++ * iii) Set rc->rc_datalen to the number of bytes copied in (ii):
++ * rc->rc_datalen = mylen;
++ * iv) Submit the rc to this function:
++ * rcom_send_message(ls, nodeid, type, rc, need_reply);
++ *
++ * The max value of "mylen" is dlm_config.buffer_size - sizeof(gd_rcom_t). If
++ * more data must be passed in one send, use rcom_expand_buffer() which
++ * incrementally increases the size of the rc buffer by dlm_config.buffer_size
++ * bytes.
++ *
++ * Any data returned for the message (when need_reply is set) will be saved in
++ * rc->rc_buf when this function returns and rc->rc_datalen will be set to the
++ * number of bytes copied into rc->rc_buf.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
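++/*
++ * A minimal synchronous example (a sketch; "mybuf" and "mylen" are
++ * illustrative placeholders, not part of this interface):
++ *
++ *	rc = allocate_rcom_buffer(ls);
++ *	memcpy(rc->rc_buf, mybuf, mylen);
++ *	rc->rc_datalen = mylen;
++ *	error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
++ *	(on success the reply is now in rc->rc_buf, rc->rc_datalen bytes)
++ *	free_rcom_buffer(rc);
++ */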
++int rcom_send_message(gd_ls_t *ls, uint32_t nodeid, int type, gd_rcom_t *rc,
++ int need_reply)
++{
++ int error = 0;
++
++ if (!rc->rc_datalen)
++ rc->rc_datalen = 1;
++
++ /*
++ * Fill in the header.
++ */
++
++ rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE;
++ rc->rc_header.rh_lockspace = ls->ls_global_id;
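++	/* the "- 1" is because gd_rcom_t itself appears to include one byte
++	   of rc_buf, so that byte must not be counted twice */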
++ rc->rc_header.rh_length = sizeof(gd_rcom_t) + rc->rc_datalen - 1;
++ rc->rc_subcmd = type;
++ rc->rc_msgid = ++ls->ls_rcom_msgid;
++
++ rcom_log(ls, nodeid, rc, 1);
++
++ /*
++ * When a reply is received, the reply data goes back into this buffer.
++ * Synchronous rcom requests (need_reply=1) are serialised because of
++ * the single ls_rcom.
++ */
++
++ if (need_reply) {
++ down(&ls->ls_rcom_lock);
++ ls->ls_rcom = rc;
++ }
++
++ /*
++ * After sending the message we'll wait at the end of this function to
++ * get a reply. The READY flag will be set when the reply has been
++ * received and requested data has been copied into
++ * ls->ls_rcom->rc_buf;
++ */
++
++ GDLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),);
++
++ /*
++ * The WAIT bit indicates that we're waiting for and willing to accept a
++ * reply. Any replies are ignored unless this bit is set.
++ */
++
++ set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
++
++ /*
++ * Process the message locally.
++ */
++
++ if (nodeid == our_nodeid()) {
++ rcom_process_message(ls, nodeid, rc);
++ goto out;
++ }
++
++ /*
++ * Send the message.
++ */
++
++ log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid);
++
++ error = midcomms_send_message(nodeid, (struct gd_req_header *) rc,
++ GFP_KERNEL);
++ GDLM_ASSERT(error >= 0, printk("error = %d\n", error););
++ error = 0;
++
++ /*
++ * Wait for a reply. Once a reply is processed from midcomms, the
++ * READY bit will be set and we'll be awoken (gdlm_wait_function will
++ * return 0).
++ */
++
++ if (need_reply) {
++ error = gdlm_wait_function(ls, &rcom_response);
++ if (error)
++ log_debug(ls, "rcom wait error %d", error);
++ }
++
++ out:
++ clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
++ clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
++
++ if (need_reply)
++ up(&ls->ls_rcom_lock);
++
++ return error;
++}
++
++/*
++ * Runs in same context as midcomms.
++ */
++
++static void rcom_process_message(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *rc)
++{
++ gd_rcom_t rc_stack;
++ gd_rcom_t *reply = NULL;
++ gd_resdata_t *rd;
++ int status, datalen, maxlen;
++ uint32_t be_nodeid;
++
++ if (!ls)
++ return;
++
++ rcom_log(ls, nodeid, rc, 0);
++
++ if (gdlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) {
++ log_error(ls, "ignoring recovery message %x from %u",
++ rc->rc_subcmd, nodeid);
++ return;
++ }
++
++ switch (rc->rc_subcmd) {
++
++ case RECCOMM_STATUS:
++
++ memset(&rc_stack, 0, sizeof(gd_rcom_t));
++ reply = &rc_stack;
++
++ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
++ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
++ reply->rc_subcmd = rc->rc_subcmd;
++ reply->rc_msgid = rc->rc_msgid;
++ reply->rc_buf[0] = 0;
++
++ if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags))
++ reply->rc_buf[0] |= RESDIR_VALID;
++
++ if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags))
++ reply->rc_buf[0] |= RESDIR_ALL_VALID;
++
++ if (test_bit(LSFL_NODES_VALID, &ls->ls_flags))
++ reply->rc_buf[0] |= NODES_VALID;
++
++ if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags))
++ reply->rc_buf[0] |= NODES_ALL_VALID;
++
++ reply->rc_datalen = 1;
++ reply->rc_header.rh_length =
++ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
++
++ log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid);
++ break;
++
++ case RECCOMM_RECOVERNAMES:
++
++ reply = allocate_rcom_buffer(ls);
++ GDLM_ASSERT(reply,);
++ maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
++
++ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
++ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
++ reply->rc_subcmd = rc->rc_subcmd;
++ reply->rc_msgid = rc->rc_msgid;
++
++ /*
++ * The other node wants a bunch of resource names. The name of
++ * the resource to begin with is in rc->rc_buf.
++ */
++
++ datalen = resdir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen,
++ reply->rc_buf, maxlen, nodeid);
++
++ reply->rc_datalen = datalen;
++ reply->rc_header.rh_length =
++ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
++
++ log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid,
++ reply->rc_msgid);
++ break;
++
++ case RECCOMM_GETMASTER:
++
++ reply = allocate_rcom_buffer(ls);
++ GDLM_ASSERT(reply,);
++
++ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
++ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
++ reply->rc_subcmd = rc->rc_subcmd;
++ reply->rc_msgid = rc->rc_msgid;
++
++ /*
++ * The other node wants to know the master of a named resource.
++ */
++
++ status = get_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen,
++ &rd, 1);
++ if (status != 0) {
++ free_rcom_buffer(reply);
++ reply = NULL;
++ return;
++ }
++ be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
++ memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t));
++ reply->rc_datalen = sizeof(uint32_t);
++ reply->rc_header.rh_length =
++ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
++ break;
++
++ case RECCOMM_BULKLOOKUP:
++
++ reply = allocate_rcom_buffer(ls);
++ GDLM_ASSERT(reply,);
++
++ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
++ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
++ reply->rc_subcmd = rc->rc_subcmd;
++ reply->rc_msgid = rc->rc_msgid;
++
++ /*
++ * This is a bulk version of the above and just returns a
++ * buffer full of node ids to match the resources
++ */
++
++ datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf,
++ rc->rc_datalen, reply->rc_buf);
++ if (datalen < 0) {
++ free_rcom_buffer(reply);
++ reply = NULL;
++ return;
++ }
++
++ reply->rc_datalen = datalen;
++ reply->rc_header.rh_length =
++ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
++ break;
++
++ /*
++ * These RECCOMM messages don't need replies.
++ */
++
++ case RECCOMM_NEWLOCKS:
++ rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
++ break;
++
++ case RECCOMM_NEWLOCKIDS:
++ rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
++ break;
++
++ case RECCOMM_REMRESDATA:
++ remove_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen, 1);
++ break;
++
++ default:
++ GDLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd););
++ }
++
++ if (reply) {
++ if (nodeid == our_nodeid()) {
++ GDLM_ASSERT(rc == ls->ls_rcom,);
++ memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
++ rc->rc_datalen = reply->rc_datalen;
++ } else {
++ midcomms_send_message(nodeid,
++ (struct gd_req_header *) reply,
++ GFP_KERNEL);
++ }
++
++ if (reply != &rc_stack)
++ free_rcom_buffer(reply);
++ }
++}
++
++static void process_reply_sync(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
++{
++ gd_rcom_t *rc = ls->ls_rcom;
++
++ if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) {
++ log_error(ls, "unexpected rcom reply nodeid=%u", nodeid);
++ return;
++ }
++
++ if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) {
++ log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u",
++ reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid);
++ return;
++ }
++
++ memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
++ rc->rc_datalen = reply->rc_datalen;
++
++ /*
++ * Tell the thread waiting in rcom_send_message() that it can go ahead.
++ */
++
++ set_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
++ wake_up(&ls->ls_wait_general);
++}
++
++static void process_reply_async(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
++{
++ restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen,
++ reply->rc_msgid);
++}
++
++/*
++ * Runs in same context as midcomms.
++ */
++
++static void rcom_process_reply(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
++{
++ if (gdlm_recovery_stopped(ls)) {
++ log_error(ls, "ignoring recovery reply %x from %u",
++ reply->rc_subcmd, nodeid);
++ return;
++ }
++
++ switch (reply->rc_subcmd) {
++ case RECCOMM_GETMASTER:
++ process_reply_async(ls, nodeid, reply);
++ break;
++ case RECCOMM_STATUS:
++ case RECCOMM_NEWLOCKS:
++ case RECCOMM_NEWLOCKIDS:
++ case RECCOMM_RECOVERNAMES:
++ process_reply_sync(ls, nodeid, reply);
++ break;
++ default:
++ log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u",
++ reply->rc_subcmd, nodeid);
++ }
++}
++
++static int send_ls_not_ready(uint32_t nodeid, struct gd_req_header *header)
++{
++ struct writequeue_entry *wq;
++ gd_rcom_t *rc = (gd_rcom_t *) header;
++ gd_rcom_t *reply;
++
++ wq = lowcomms_get_buffer(nodeid, sizeof(gd_rcom_t), GFP_KERNEL,
++ (char **)&reply);
++ if (!wq)
++ return -ENOMEM;
++
++ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
++ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
++ reply->rc_subcmd = rc->rc_subcmd;
++ reply->rc_msgid = rc->rc_msgid;
++ reply->rc_buf[0] = 0;
++
++ reply->rc_datalen = 1;
++ reply->rc_header.rh_length = sizeof(gd_rcom_t) + reply->rc_datalen - 1;
++
++ midcomms_send_buffer((struct gd_req_header *)reply, wq);
++ return 0;
++}
++
++/*
++ * Runs in same context as midcomms. Both recovery requests and recovery
++ * replies come through this function.
++ */
++
++void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header)
++{
++ gd_ls_t *ls = find_lockspace_by_global_id(header->rh_lockspace);
++ gd_rcom_t *rc = (gd_rcom_t *) header;
++
++	/* If the lockspace doesn't exist then still send a status message
++	   back; it's possible that it just doesn't have its global_id
++	   yet. */
++ if (!ls) {
++ send_ls_not_ready(nodeid, header);
++ return;
++ }
++
++ switch (header->rh_cmd) {
++ case GDLM_REMCMD_RECOVERMESSAGE:
++ down_read(&ls->ls_rec_rsblist);
++ rcom_process_message(ls, nodeid, rc);
++ up_read(&ls->ls_rec_rsblist);
++ break;
++
++ case GDLM_REMCMD_RECOVERREPLY:
++ rcom_process_reply(ls, nodeid, rc);
++ break;
++
++ default:
++ GDLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd););
++ }
++}
++
+diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h
+--- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/reccomms.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,37 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RECCOMMS_DOT_H__
++#define __RECCOMMS_DOT_H__
++
++/* Bit flags */
++
++#define RESDIR_VALID (1)
++#define RESDIR_ALL_VALID (2)
++#define NODES_VALID (4)
++#define NODES_ALL_VALID (8)
++
++#define RECCOMM_STATUS (1)
++#define RECCOMM_RECOVERNAMES (2)
++#define RECCOMM_GETMASTER (3)
++#define RECCOMM_BULKLOOKUP (4)
++#define RECCOMM_NEWLOCKS (5)
++#define RECCOMM_NEWLOCKIDS (6)
++#define RECCOMM_REMRESDATA (7)
++
++int rcom_send_message(gd_ls_t * ls, uint32_t nodeid, int type, gd_rcom_t * rc,
++ int need_reply);
++void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header);
++void rcom_log_clear(gd_ls_t *ls);
++
++#endif
+diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c
+--- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/recover.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,632 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++#include "reccomms.h"
++#include "dir.h"
++#include "locking.h"
++#include "rsb.h"
++#include "lockspace.h"
++#include "lkb.h"
++#include "nodes.h"
++#include "config.h"
++#include "ast.h"
++#include "memory.h"
++
++/*
++ * Called in recovery routines to check whether the recovery process has been
++ * interrupted/stopped by another transition. A recovery in-process will abort
++ * if the lockspace is "stopped" so that a new recovery process can start from
++ * the beginning when the lockspace is "started" again.
++ */
++
++int gdlm_recovery_stopped(gd_ls_t *ls)
++{
++ return test_bit(LSFL_LS_STOP, &ls->ls_flags);
++}
++
++static void gdlm_wait_timer_fn(unsigned long data)
++{
++ gd_ls_t *ls = (gd_ls_t *) data;
++
++ wake_up(&ls->ls_wait_general);
++}
++
++/*
++ * Wait until given function returns non-zero or lockspace is stopped (LS_STOP
++ * set due to failure of a node in ls_nodes).  When another function thinks it
++ * could have completed the waited-on task, it should wake up ls_wait_general
++ * to get an immediate response rather than waiting for the timer to detect the
++ * result. A timer wakes us up periodically while waiting to see if we should
++ * abort due to a node failure.
++ */
++
++int gdlm_wait_function(gd_ls_t *ls, int (*testfn) (gd_ls_t * ls))
++{
++ struct timer_list timer;
++ int error = 0;
++
++ init_timer(&timer);
++ timer.function = gdlm_wait_timer_fn;
++ timer.data = (long) ls;
++
++ for (;;) {
++ mod_timer(&timer, jiffies + (5 * HZ));
++
++ wchan_cond_sleep_intr(ls->ls_wait_general,
++ !testfn(ls) &&
++ !test_bit(LSFL_LS_STOP, &ls->ls_flags));
++
++ if (timer_pending(&timer))
++ del_timer(&timer);
++
++ if (testfn(ls))
++ break;
++
++ if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) {
++ error = -1;
++ break;
++ }
++ }
++
++ return error;
++}
++
++int gdlm_wait_status_all(gd_ls_t *ls, unsigned int wait_status)
++{
++ gd_rcom_t rc_stack, *rc;
++ gd_csb_t *csb;
++ int status;
++ int error = 0;
++
++ memset(&rc_stack, 0, sizeof(gd_rcom_t));
++ rc = &rc_stack;
++ rc->rc_datalen = 0;
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ for (;;) {
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto out;
++
++ error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
++ RECCOMM_STATUS, rc, 1);
++ if (error)
++ goto out;
++
++ status = rc->rc_buf[0];
++ if (status & wait_status)
++ break;
++ else {
++ set_current_state(TASK_INTERRUPTIBLE);
++ schedule_timeout(HZ >> 1);
++ }
++ }
++ }
++
++ out:
++ return error;
++}
++
++int gdlm_wait_status_low(gd_ls_t *ls, unsigned int wait_status)
++{
++ gd_rcom_t rc_stack, *rc;
++ uint32_t nodeid = ls->ls_low_nodeid;
++ int status;
++ int error = 0;
++
++ memset(&rc_stack, 0, sizeof(gd_rcom_t));
++ rc = &rc_stack;
++ rc->rc_datalen = 0;
++
++ for (;;) {
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto out;
++
++ error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
++ if (error)
++ break;
++
++ status = rc->rc_buf[0];
++ if (status & wait_status)
++ break;
++ else {
++ set_current_state(TASK_INTERRUPTIBLE);
++ schedule_timeout(HZ >> 1);
++ }
++ }
++
++ out:
++ return error;
++}
++
++static int purge_queue(gd_ls_t *ls, struct list_head *queue)
++{
++ gd_lkb_t *lkb, *safe;
++ gd_res_t *rsb;
++ int count = 0;
++
++ list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
++ if (!lkb->lkb_nodeid)
++ continue;
++
++ GDLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,);
++
++ if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
++ list_del(&lkb->lkb_statequeue);
++
++			rsb = lkb->lkb_resource;
++
++			/* A waiting conversion is also on the deadlock
++			   queue and must be removed before its status
++			   is cleared. */
++			if (lkb->lkb_status == GDLM_LKSTS_CONVERT)
++				remove_from_deadlockqueue(lkb);
++
++			lkb->lkb_status = 0;
++
++ release_lkb(ls, lkb);
++ release_rsb(rsb);
++ count++;
++ }
++ }
++
++ return count;
++}
++
++/*
++ * Go through local restbl and for each rsb we're master of, clear out any
++ * lkb's held by departed nodes.
++ */
++
++int restbl_lkb_purge(gd_ls_t *ls)
++{
++ struct list_head *tmp2, *safe2;
++ int count = 0;
++ gd_res_t *rootrsb, *safe, *rsb;
++
++ log_all(ls, "purge locks of departed nodes");
++
++ list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) {
++
++ rootrsb->res_resdir_seq = 1;
++
++ if (rootrsb->res_nodeid)
++ continue;
++
++ hold_rsb(rootrsb);
++ down_write(&rootrsb->res_lock);
++
++ /* This traverses the subreslist in reverse order so we purge
++ * the children before their parents. */
++
++ for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev;
++ tmp2 != &rootrsb->res_subreslist;
++ tmp2 = safe2, safe2 = safe2->prev) {
++ rsb = list_entry(tmp2, gd_res_t, res_subreslist);
++
++ hold_rsb(rsb);
++ purge_queue(ls, &rsb->res_grantqueue);
++ purge_queue(ls, &rsb->res_convertqueue);
++ purge_queue(ls, &rsb->res_waitqueue);
++ release_rsb(rsb);
++ }
++ count += purge_queue(ls, &rootrsb->res_grantqueue);
++ count += purge_queue(ls, &rootrsb->res_convertqueue);
++ count += purge_queue(ls, &rootrsb->res_waitqueue);
++
++ up_write(&rootrsb->res_lock);
++ release_rsb(rootrsb);
++ }
++
++ log_all(ls, "purged %d locks", count);
++
++ return 0;
++}
++
++/*
++ * Grant any locks that have become grantable after a purge
++ */
++
++int restbl_grant_after_purge(gd_ls_t *ls)
++{
++ gd_res_t *root, *rsb, *safe;
++ int error = 0;
++
++ down_write(&ls->ls_gap_rsblist);
++
++ list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) {
++ /* only the rsb master grants locks */
++ if (root->res_nodeid)
++ continue;
++
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
++ log_debug(ls, "restbl_grant_after_purge aborted");
++ error = -EINTR;
++ up_write(&ls->ls_gap_rsblist);
++ goto out;
++ }
++
++ down_write(&root->res_lock);
++ grant_pending_locks(root);
++ up_write(&root->res_lock);
++
++ list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){
++ down_write(&rsb->res_lock);
++ grant_pending_locks(rsb);
++ up_write(&rsb->res_lock);
++ }
++ }
++ up_write(&ls->ls_gap_rsblist);
++ wake_astd();
++ out:
++ return error;
++}
++
++/*
++ * Set the lock master for all LKBs in a lock queue
++ */
++
++static void set_lock_master(struct list_head *queue, int nodeid)
++{
++ gd_lkb_t *lkb;
++
++ list_for_each_entry(lkb, queue, lkb_statequeue) {
++		/* Don't muck around with pre-existing sublocks */
++ if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY))
++ lkb->lkb_nodeid = nodeid;
++ }
++}
++
++static void set_master_lkbs(gd_res_t *rsb)
++{
++ set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid);
++ set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid);
++ set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid);
++}
++
++/*
++ * This rsb struct is now the master so it is responsible for keeping the
++ * latest lvb.  Find if any current lkb's have an up to date copy of the lvb to
++ * be used as the rsb copy. An equivalent step occurs as new lkb's arrive for
++ * this rsb in deserialise_lkb.
++ */
++
++static void set_rsb_lvb(gd_res_t *rsb)
++{
++ gd_lkb_t *lkb;
++
++ list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) {
++
++ if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
++ (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
++ (lkb->lkb_grmode > DLM_LOCK_NL))
++ {
++ if (!rsb->res_lvbptr)
++ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
++
++ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
++ return;
++ }
++ }
++
++ list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) {
++
++ if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
++ (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
++ (lkb->lkb_grmode > DLM_LOCK_NL))
++ {
++ if (!rsb->res_lvbptr)
++ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
++
++ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
++ return;
++ }
++ }
++}
++
++/*
++ * Propagate the new master nodeid to locks, subrsbs, sublocks.
++ * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider.
++ */
++
++static void set_new_master(gd_res_t *rsb)
++{
++ gd_res_t *subrsb;
++
++ down_write(&rsb->res_lock);
++
++ if (rsb->res_nodeid == our_nodeid()) {
++ rsb->res_nodeid = 0;
++ set_rsb_lvb(rsb);
++ }
++
++ set_master_lkbs(rsb);
++
++ list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) {
++ subrsb->res_nodeid = rsb->res_nodeid;
++ set_master_lkbs(subrsb);
++ }
++
++ up_write(&rsb->res_lock);
++
++ set_bit(RESFL_NEW_MASTER, &rsb->res_flags);
++}
++
++/*
++ * The recover_list contains all the rsb's for which we've requested the new
++ * master nodeid. As replies are returned from the resource directories the
++ * rsb's are removed from the list. When the list is empty we're done.
++ *
++ * The recover_list is later similarly used for all rsb's for which we've sent
++ * new lkb's and need to receive new corresponding lkid's.
++ */
++
++int recover_list_empty(gd_ls_t *ls)
++{
++ int empty;
++
++ spin_lock(&ls->ls_recover_list_lock);
++ empty = list_empty(&ls->ls_recover_list);
++ spin_unlock(&ls->ls_recover_list_lock);
++
++ return empty;
++}
++
++int recover_list_count(gd_ls_t *ls)
++{
++ int count;
++
++ spin_lock(&ls->ls_recover_list_lock);
++ count = ls->ls_recover_list_count;
++ spin_unlock(&ls->ls_recover_list_lock);
++
++ return count;
++}
++
++void recover_list_add(gd_res_t *rsb)
++{
++ gd_ls_t *ls = rsb->res_ls;
++
++ spin_lock(&ls->ls_recover_list_lock);
++ if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) {
++ list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list);
++ ls->ls_recover_list_count++;
++ hold_rsb(rsb);
++ }
++ spin_unlock(&ls->ls_recover_list_lock);
++}
++
++void recover_list_del(gd_res_t *rsb)
++{
++ gd_ls_t *ls = rsb->res_ls;
++
++ spin_lock(&ls->ls_recover_list_lock);
++ clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags);
++ list_del(&rsb->res_recover_list);
++ ls->ls_recover_list_count--;
++ spin_unlock(&ls->ls_recover_list_lock);
++
++ release_rsb(rsb);
++}
++
++static gd_res_t *recover_list_find(gd_ls_t *ls, int msgid)
++{
++ gd_res_t *rsb = NULL;
++
++ spin_lock(&ls->ls_recover_list_lock);
++
++ list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) {
++ if (rsb->res_recover_msgid == msgid)
++ goto rec_found;
++ }
++ rsb = NULL;
++
++ rec_found:
++ spin_unlock(&ls->ls_recover_list_lock);
++ return rsb;
++}
++
++#if 0
++static void recover_list_clear(gd_ls_t *ls)
++{
++ gd_res_t *rsb;
++
++ spin_lock(&ls->ls_recover_list_lock);
++
++ while (!list_empty(&ls->ls_recover_list)) {
++ rsb = list_entry(ls->ls_recover_list.next, gd_res_t,
++ res_recover_list);
++ list_del(&rsb->res_recover_list);
++ ls->ls_recover_list_count--;
++ }
++ spin_unlock(&ls->ls_recover_list_lock);
++
++}
++#endif
++
++#if 0
++void recover_list_dump(gd_ls_t *ls)
++{
++ struct list_head *tmp;
++ gd_res_t *rsb;
++
++ spin_lock(&ls->ls_recover_list_lock);
++
++ printk("recover_list_count=%d\n", ls->ls_recover_list_count);
++
++ list_for_each(tmp, &ls->ls_recover_list) {
++ rsb = list_entry(tmp, gd_res_t, res_recover_list);
++ gdlm_res_dbprint(rsb);
++ }
++ spin_unlock(&ls->ls_recover_list_lock);
++}
++#endif
++
++static int rsb_master_lookup(gd_res_t *rsb, gd_rcom_t *rc)
++{
++ gd_ls_t *ls = rsb->res_ls;
++ gd_resdata_t *rd;
++ uint32_t dir_nodeid;
++ int error;
++
++ dir_nodeid = get_directory_nodeid(rsb);
++
++ if (dir_nodeid == our_nodeid()) {
++ error = get_resdata(ls, dir_nodeid, rsb->res_name,
++ rsb->res_length, &rd, 1);
++ if (error)
++ goto fail;
++
++ rsb->res_nodeid = rd->rd_master_nodeid;
++ set_new_master(rsb);
++ } else {
++		/* As we are the only thread doing recovery this
++		   should be safe.  If not, then we need to use a different
++		   ID somehow.  We must set it in the RSB before
++		   rcom_send_message() completes because we may get a reply
++		   quite quickly.
++		 */
++ rsb->res_recover_msgid = ls->ls_rcom_msgid + 1;
++
++ recover_list_add(rsb);
++
++ memcpy(rc->rc_buf, rsb->res_name, rsb->res_length);
++ rc->rc_datalen = rsb->res_length;
++
++ error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER,
++ rc, 0);
++ if (error)
++ goto fail;
++ }
++
++ fail:
++ return error;
++}
++
++/*
++ * Go through local root resources and for each rsb which has a master which
++ * has departed, get the new master nodeid from the resdir. The resdir will
++ * assign mastery to the first node to look up the new master. That means
++ * we'll discover in this lookup if we're the new master of any rsb's.
++ *
++ * We fire off all the resdir requests individually and asynchronously to the
++ * correct resdir node. The replies are processed in rsb_master_recv().
++ */
++
++int restbl_rsb_update(gd_ls_t *ls)
++{
++ gd_res_t *rsb, *safe;
++ gd_rcom_t *rc;
++ int error = -ENOMEM;
++ int count = 0;
++
++ log_all(ls, "update remastered resources");
++
++ rc = allocate_rcom_buffer(ls);
++ if (!rc)
++ goto out;
++
++ list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) {
++ if (!rsb->res_nodeid)
++ continue;
++
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto out_free;
++
++ if (in_nodes_gone(ls, rsb->res_nodeid)) {
++ error = rsb_master_lookup(rsb, rc);
++ if (error)
++ goto out_free;
++ count++;
++ }
++ }
++
++ error = gdlm_wait_function(ls, &recover_list_empty);
++
++ log_all(ls, "updated %d resources", count);
++
++ out_free:
++ free_rcom_buffer(rc);
++
++ out:
++ return error;
++}
++
++int restbl_rsb_update_recv(gd_ls_t *ls, uint32_t nodeid, char *buf, int length,
++ int msgid)
++{
++ gd_res_t *rsb;
++ uint32_t be_nodeid;
++
++ rsb = recover_list_find(ls, msgid);
++ if (!rsb) {
++ log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid);
++ goto out;
++ }
++
++ memcpy(&be_nodeid, buf, sizeof(uint32_t));
++ rsb->res_nodeid = be32_to_cpu(be_nodeid);
++ set_new_master(rsb);
++ recover_list_del(rsb);
++
++ if (recover_list_empty(ls))
++ wake_up(&ls->ls_wait_general);
++
++ out:
++ return 0;
++}
++
++/*
++ * This function is no longer used.
++ */
++
++int bulk_master_lookup(gd_ls_t *ls, int nodeid, char *inbuf, int inlen,
++ char *outbuf)
++{
++ char *inbufptr, *outbufptr;
++
++ /*
++ * The other node wants nodeids matching the resource names in inbuf.
++ * The resource names are packed into inbuf as
++ * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is
++ * lenX bytes. Matching nodeids are packed into outbuf in order
++ * [nodeid1][nodeid2]...
++ */
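++	/*
++	 * For example, looking up "a" and "bc" sends the five input bytes
++	 * 0x01 'a' 0x02 'b' 'c' and returns eight output bytes: two
++	 * big-endian 32-bit nodeids.
++	 */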
++
++ inbufptr = inbuf;
++ outbufptr = outbuf;
++
++ while (inbufptr < inbuf + inlen) {
++ gd_resdata_t *rd;
++ uint32_t be_nodeid;
++ int status;
++
++ status = get_resdata(ls, nodeid, inbufptr + 1, *inbufptr,
++ &rd, 1);
++ if (status != 0)
++ goto fail;
++
++ inbufptr += *inbufptr + 1;
++
++ be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
++ memcpy(outbufptr, &be_nodeid, sizeof(uint32_t));
++ outbufptr += sizeof(uint32_t);
++
++ /* add assertion that outbufptr - outbuf is not > than ... */
++ }
++
++ return (outbufptr - outbuf);
++
++ fail:
++ return -1;
++}
+diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h
+--- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/recover.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,34 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RECOVER_DOT_H__
++#define __RECOVER_DOT_H__
++
++int gdlm_wait_function(gd_ls_t * ls, int (*testfn) (gd_ls_t * ls));
++int gdlm_wait_status_all(gd_ls_t * ls, unsigned int wait_status);
++int gdlm_wait_status_low(gd_ls_t * ls, unsigned int wait_status);
++int gdlm_recovery_stopped(gd_ls_t * ls);
++int recover_list_empty(gd_ls_t * ls);
++int recover_list_count(gd_ls_t * ls);
++void recover_list_add(gd_res_t * rsb);
++void recover_list_del(gd_res_t * rsb);
++void recover_list_dump(gd_ls_t * ls);
++int restbl_lkb_purge(gd_ls_t * ls);
++int restbl_grant_after_purge(gd_ls_t * ls);
++int restbl_rsb_update(gd_ls_t * ls);
++int restbl_rsb_update_recv(gd_ls_t * ls, uint32_t nodeid, char *buf, int len,
++ int msgid);
++int bulk_master_lookup(gd_ls_t * ls, int nodeid, char *inbuf, int inlen,
++ char *outbuf);
++
++#endif /* __RECOVER_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c
+--- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/recoverd.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,692 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++#include "nodes.h"
++#include "dir.h"
++#include "ast.h"
++#include "recover.h"
++#include "lockspace.h"
++#include "lowcomms.h"
++#include "lockqueue.h"
++#include "lkb.h"
++#include "rebuild.h"
++
++/*
++ * next_move actions
++ */
++
++#define DO_STOP (1)
++#define DO_START (2)
++#define DO_FINISH (3)
++#define DO_FINISH_STOP (4)
++#define DO_FINISH_START (5)
++
++/*
++ * recoverd_flags for thread
++ */
++
++#define THREAD_STOP (0)
++
++/*
++ * local thread variables
++ */
++
++static unsigned long recoverd_flags;
++static struct completion recoverd_run;
++static wait_queue_head_t recoverd_wait;
++static struct task_struct *recoverd_task;
++
++/*
++ * Queue of lockspaces (gr_recover_t structs) which need to be
++ * started/recovered
++ */
++
++static struct list_head recoverd_start_queue;
++static atomic_t recoverd_start_count;
++
++extern struct list_head lslist;
++extern spinlock_t lslist_lock;
++
++void dlm_recoverd_init(void)
++{
++ INIT_LIST_HEAD(&recoverd_start_queue);
++ atomic_set(&recoverd_start_count, 0);
++
++ init_completion(&recoverd_run);
++ init_waitqueue_head(&recoverd_wait);
++ memset(&recoverd_flags, 0, sizeof(unsigned long));
++}
++
++static int enable_locking(gd_ls_t *ls, int event_id)
++{
++ int error = 0;
++
++ spin_lock(&ls->ls_recover_lock);
++ if (ls->ls_last_stop < event_id) {
++ set_bit(LSFL_LS_RUN, &ls->ls_flags);
++ up_write(&ls->ls_in_recovery);
++ } else {
++ error = -EINTR;
++ log_debug(ls, "enable_locking: abort %d", event_id);
++ }
++ spin_unlock(&ls->ls_recover_lock);
++ return error;
++}
++
++static int ls_first_start(gd_ls_t *ls, gd_recover_t *gr)
++{
++ int error;
++
++ log_all(ls, "recover event %u (first)", gr->gr_event_id);
++
++ kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id);
++
++ error = ls_nodes_init(ls, gr);
++ if (error) {
++ log_error(ls, "nodes_init failed %d", error);
++ goto out;
++ }
++
++ error = resdir_rebuild_local(ls);
++ if (error) {
++ log_error(ls, "resdir_rebuild_local failed %d", error);
++ goto out;
++ }
++
++ error = resdir_rebuild_wait(ls);
++ if (error) {
++ log_error(ls, "resdir_rebuild_wait failed %d", error);
++ goto out;
++ }
++
++ log_all(ls, "recover event %u done", gr->gr_event_id);
++ kcl_start_done(ls->ls_local_id, gr->gr_event_id);
++
++ out:
++ return error;
++}
++
++/*
++ * We are given here a new group of nodes which are in the lockspace. We first
++ * figure out the differences in ls membership from when we were last running.
++ * If nodes from before are gone, then there will be some lock recovery to do.
++ * If there are only nodes which have joined, then there's no lock recovery.
++ *
++ * note: cman requires an rc to finish starting on an revent (where nodes die)
++ * before it allows an sevent (where nodes join) to be processed. This means
++ * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA
++ * joined.
++ */
++
++static int ls_reconfig(gd_ls_t *ls, gd_recover_t *gr)
++{
++ int error, neg = 0;
++
++ log_all(ls, "recover event %u", gr->gr_event_id);
++
++ /*
++ * Add or remove nodes from the lockspace's ls_nodes list.
++ */
++
++ error = ls_nodes_reconfig(ls, gr, &neg);
++ if (error) {
++ log_error(ls, "nodes_reconfig failed %d", error);
++ goto fail;
++ }
++
++ /*
++ * Rebuild our own share of the resdir by collecting from all other
++ * nodes rsb name/master pairs for which the name hashes to us.
++ */
++
++ error = resdir_rebuild_local(ls);
++ if (error) {
++ log_error(ls, "resdir_rebuild_local failed %d", error);
++ goto fail;
++ }
++
++ /*
++ * Purge resdir-related requests that are being held in requestqueue.
++ * All resdir requests from before recovery started are invalid now due
++ * to the resdir rebuild and will be resent by the requesting nodes.
++ */
++
++ purge_requestqueue(ls);
++ set_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
++
++ /*
++ * Wait for all nodes to complete resdir rebuild.
++ */
++
++ error = resdir_rebuild_wait(ls);
++ if (error) {
++ log_error(ls, "resdir_rebuild_wait failed %d", error);
++ goto fail;
++ }
++
++ /*
++ * Mark our own lkb's waiting in the lockqueue for remote replies from
++ * nodes that are now departed. These will be resent to the new
++ * masters in resend_cluster_requests. Also mark resdir lookup
++ * requests for resending.
++ */
++
++ lockqueue_lkb_mark(ls);
++
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto fail;
++
++ if (neg) {
++ /*
++ * Clear lkb's for departed nodes. This can't fail since it
++ * doesn't involve communicating with other nodes.
++ */
++
++ down_write(&ls->ls_rec_rsblist);
++ restbl_lkb_purge(ls);
++ up_write(&ls->ls_rec_rsblist);
++
++ down_read(&ls->ls_rec_rsblist);
++
++ /*
++ * Get new master id's for rsb's of departed nodes. This fails
++ * if we can't communicate with other nodes.
++ */
++
++ error = restbl_rsb_update(ls);
++ if (error) {
++ log_error(ls, "restbl_rsb_update failed %d", error);
++ goto fail_up;
++ }
++
++ /*
++ * Send our lkb info to new masters. This fails if we can't
++ * communicate with a node.
++ */
++
++ error = rebuild_rsbs_send(ls);
++ if (error) {
++ log_error(ls, "rebuild_rsbs_send failed %d", error);
++ goto fail_up;
++ }
++ up_read(&ls->ls_rec_rsblist);
++ }
++
++ clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
++
++ log_all(ls, "recover event %u done", gr->gr_event_id);
++ kcl_start_done(ls->ls_local_id, gr->gr_event_id);
++ return 0;
++
++ fail_up:
++ up_read(&ls->ls_rec_rsblist);
++ fail:
++ log_all(ls, "recover event %d error %d", gr->gr_event_id, error);
++ return error;
++}
++
++static void clear_finished_nodes(gd_ls_t *ls, int finish_event)
++{
++ gd_csb_t *csb, *safe;
++
++ list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, csb_list) {
++ if (csb->csb_gone_event <= finish_event) {
++ list_del(&csb->csb_list);
++ release_csb(csb);
++ }
++ }
++}
++
++/*
++ * Between calls to this routine for a ls, there can be multiple stop/start
++ * events from cman where every start but the latest is cancelled by stops.
++ * There can only be a single finish from cman because every finish requires us
++ * to call start_done. A single finish event could be followed by multiple
++ * stop/start events. This routine takes any combination of events from cman
++ * and boils them down to one course of action.
++ */
++
++int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out)
++{
++ LIST_HEAD(events);
++ unsigned int cmd = 0, stop, start, finish;
++ unsigned int last_stop, last_start, last_finish;
++ gd_recover_t *gr = NULL, *start_gr = NULL;
++
++ /*
++ * Grab the current state of cman/sm events.
++ */
++
++ spin_lock(&ls->ls_recover_lock);
++
++ stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0;
++ start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0;
++ finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0;
++
++ last_stop = ls->ls_last_stop;
++ last_start = ls->ls_last_start;
++ last_finish = ls->ls_last_finish;
++
++ while (!list_empty(&ls->ls_recover)) {
++ gr = list_entry(ls->ls_recover.next, gd_recover_t, gr_list);
++ list_del(&gr->gr_list);
++ list_add_tail(&gr->gr_list, &events);
++ }
++ spin_unlock(&ls->ls_recover_lock);
++
++ log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish,
++ last_stop, last_start, last_finish);
++
++ /*
++ * Toss start events which have since been cancelled.
++ */
++
++ while (!list_empty(&events)) {
++ GDLM_ASSERT(start,);
++ gr = list_entry(events.next, gd_recover_t, gr_list);
++ list_del(&gr->gr_list);
++
++ if (gr->gr_event_id <= last_stop) {
++ log_debug(ls, "move skip event %u", gr->gr_event_id);
++ kfree(gr->gr_nodeids);
++ free_dlm_recover(gr);
++ gr = NULL;
++ } else {
++ log_debug(ls, "move use event %u", gr->gr_event_id);
++ GDLM_ASSERT(!start_gr,);
++ start_gr = gr;
++ }
++ }
++
++ /*
++ * Eight possible combinations of events.
++ */
++
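++	/*
++	 * Summarised as (stop, start, finish) -> action, matching the
++	 * numbered cases below; "invalid" combinations trip an assertion:
++	 *
++	 *   0,0,0 -> none          1,0,0 -> STOP
++	 *   0,0,1 -> FINISH        1,0,1 -> FINISH_STOP
++	 *   0,1,0 -> START         1,1,0 -> START or STOP
++	 *   0,1,1 -> invalid       1,1,1 -> FINISH_START or FINISH_STOP
++	 */
++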
++ /* 0 */
++ if (!stop && !start && !finish) {
++ GDLM_ASSERT(!start_gr,);
++ cmd = 0;
++ goto out;
++ }
++
++ /* 1 */
++ if (!stop && !start && finish) {
++ GDLM_ASSERT(!start_gr,);
++ GDLM_ASSERT(last_start > last_stop,);
++ GDLM_ASSERT(last_finish == last_start,);
++ cmd = DO_FINISH;
++ *finish_out = last_finish;
++ goto out;
++ }
++
++ /* 2 */
++ if (!stop && start && !finish) {
++ GDLM_ASSERT(start_gr,);
++ GDLM_ASSERT(last_start > last_stop,);
++ cmd = DO_START;
++ *gr_out = start_gr;
++ goto out;
++ }
++
++ /* 3 */
++ if (!stop && start && finish) {
++ GDLM_ASSERT(0, printk("finish and start with no stop\n"););
++ }
++
++ /* 4 */
++ if (stop && !start && !finish) {
++ GDLM_ASSERT(!start_gr,);
++ GDLM_ASSERT(last_start == last_stop,);
++ cmd = DO_STOP;
++ goto out;
++ }
++
++ /* 5 */
++ if (stop && !start && finish) {
++ GDLM_ASSERT(!start_gr,);
++ GDLM_ASSERT(last_finish == last_start,);
++ GDLM_ASSERT(last_stop == last_start,);
++ cmd = DO_FINISH_STOP;
++ *finish_out = last_finish;
++ goto out;
++ }
++
++ /* 6 */
++ if (stop && start && !finish) {
++ if (start_gr) {
++ GDLM_ASSERT(last_start > last_stop,);
++ cmd = DO_START;
++ *gr_out = start_gr;
++ } else {
++ GDLM_ASSERT(last_stop == last_start,);
++ cmd = DO_STOP;
++ }
++ goto out;
++ }
++
++ /* 7 */
++ if (stop && start && finish) {
++ if (start_gr) {
++ GDLM_ASSERT(last_start > last_stop,);
++ GDLM_ASSERT(last_start > last_finish,);
++ cmd = DO_FINISH_START;
++ *finish_out = last_finish;
++ *gr_out = start_gr;
++ } else {
++ GDLM_ASSERT(last_start == last_stop,);
++ GDLM_ASSERT(last_start > last_finish,);
++ cmd = DO_FINISH_STOP;
++ *finish_out = last_finish;
++ }
++ goto out;
++ }
++
++ out:
++ return cmd;
++}
++
++/*
++ * This function decides what to do given every combination of current
++ * lockspace state and next lockspace state.
++ */
++
++static void do_ls_recovery(gd_ls_t *ls)
++{
++ gd_recover_t *gr = NULL;
++ int error, cur_state, next_state = 0, do_now, finish_event = 0;
++
++ do_now = next_move(ls, &gr, &finish_event);
++ if (!do_now)
++ goto out;
++
++ cur_state = ls->ls_state;
++ next_state = 0;
++
++ GDLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags),
++ log_error(ls, "curstate=%d donow=%d", cur_state, do_now););
++
++ /*
++ * LSST_CLEAR - we're not in any recovery state. We can get a stop or
++ * a stop and start which together equate to a START.
++ */
++
++ if (cur_state == LSST_CLEAR) {
++ switch (do_now) {
++ case DO_STOP:
++ next_state = LSST_WAIT_START;
++ break;
++
++ case DO_START:
++ error = ls_reconfig(ls, gr);
++ if (error)
++ next_state = LSST_WAIT_START;
++ else
++ next_state = LSST_RECONFIG_DONE;
++ break;
++
++ case DO_FINISH: /* invalid */
++ case DO_FINISH_STOP: /* invalid */
++ case DO_FINISH_START: /* invalid */
++ default:
++ GDLM_ASSERT(0,);
++ }
++ goto out;
++ }
++
++ /*
++ * LSST_WAIT_START - we're not running because of getting a stop or
++ * failing a start. We wait in this state for another stop/start or
++ * just the next start to begin another reconfig attempt.
++ */
++
++ if (cur_state == LSST_WAIT_START) {
++ switch (do_now) {
++ case DO_STOP:
++ break;
++
++ case DO_START:
++ error = ls_reconfig(ls, gr);
++ if (error)
++ next_state = LSST_WAIT_START;
++ else
++ next_state = LSST_RECONFIG_DONE;
++ break;
++
++ case DO_FINISH: /* invalid */
++ case DO_FINISH_STOP: /* invalid */
++ case DO_FINISH_START: /* invalid */
++ default:
++ GDLM_ASSERT(0,);
++ }
++ goto out;
++ }
++
++ /*
++ * LSST_RECONFIG_DONE - we entered this state after successfully
++ * completing ls_reconfig and calling kcl_start_done. We expect to get
++ * a finish if everything goes ok. A finish could be followed by stop
++ * or stop/start before we get here to check it. Or a finish may never
++ * happen, only stop or stop/start.
++ */
++
++ if (cur_state == LSST_RECONFIG_DONE) {
++ switch (do_now) {
++ case DO_FINISH:
++ clear_finished_nodes(ls, finish_event);
++ next_state = LSST_CLEAR;
++
++ error = enable_locking(ls, finish_event);
++ if (error)
++ break;
++
++ error = process_requestqueue(ls);
++ if (error)
++ break;
++
++ error = resend_cluster_requests(ls);
++ if (error)
++ break;
++
++ restbl_grant_after_purge(ls);
++
++ log_all(ls, "recover event %u finished", finish_event);
++ break;
++
++ case DO_STOP:
++ next_state = LSST_WAIT_START;
++ break;
++
++ case DO_FINISH_STOP:
++ clear_finished_nodes(ls, finish_event);
++ next_state = LSST_WAIT_START;
++ break;
++
++ case DO_FINISH_START:
++ clear_finished_nodes(ls, finish_event);
++ /* fall into DO_START */
++
++ case DO_START:
++ error = ls_reconfig(ls, gr);
++ if (error)
++ next_state = LSST_WAIT_START;
++ else
++ next_state = LSST_RECONFIG_DONE;
++ break;
++
++ default:
++ GDLM_ASSERT(0,);
++ }
++ goto out;
++ }
++
++ /*
++ * LSST_INIT - state after ls is created and before it has been
++ * started. A start operation will cause the ls to be started for the
++ * first time. A failed start will cause it to just wait in INIT for
++ * another stop/start.
++ */
++
++ if (cur_state == LSST_INIT) {
++ switch (do_now) {
++ case DO_START:
++ error = ls_first_start(ls, gr);
++ if (!error)
++ next_state = LSST_INIT_DONE;
++ break;
++
++ case DO_STOP:
++ break;
++
++ case DO_FINISH: /* invalid */
++ case DO_FINISH_STOP: /* invalid */
++ case DO_FINISH_START: /* invalid */
++ default:
++ GDLM_ASSERT(0,);
++ }
++ goto out;
++ }
++
++ /*
++ * LSST_INIT_DONE - after the first start operation is completed
++ * successfully and kcl_start_done() called. If there are no errors, a
++ * finish will arrive next and we'll move to LSST_CLEAR.
++ */
++
++ if (cur_state == LSST_INIT_DONE) {
++ switch (do_now) {
++ case DO_STOP:
++ case DO_FINISH_STOP:
++ next_state = LSST_WAIT_START;
++ break;
++
++ case DO_START:
++ case DO_FINISH_START:
++ error = ls_reconfig(ls, gr);
++ if (error)
++ next_state = LSST_WAIT_START;
++ else
++ next_state = LSST_RECONFIG_DONE;
++ break;
++
++ case DO_FINISH:
++ next_state = LSST_CLEAR;
++ enable_locking(ls, finish_event);
++ log_all(ls, "recover event %u finished", finish_event);
++ break;
++
++ default:
++ GDLM_ASSERT(0,);
++ }
++ goto out;
++ }
++
++ out:
++ if (next_state)
++ ls->ls_state = next_state;
++
++ if (gr) {
++ kfree(gr->gr_nodeids);
++ free_dlm_recover(gr);
++ }
++}
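++
++/*
++ * Summary of the transitions above (a restatement of the code, not new
++ * behavior):
++ *
++ *	LSST_INIT          start ok -> LSST_INIT_DONE; otherwise stay in INIT
++ *	LSST_INIT_DONE     finish -> LSST_CLEAR; stop -> LSST_WAIT_START;
++ *	                   start -> LSST_RECONFIG_DONE or LSST_WAIT_START
++ *	LSST_CLEAR         stop -> LSST_WAIT_START;
++ *	                   start -> LSST_RECONFIG_DONE or LSST_WAIT_START
++ *	LSST_WAIT_START    start -> LSST_RECONFIG_DONE or LSST_WAIT_START
++ *	LSST_RECONFIG_DONE finish -> LSST_CLEAR; stop -> LSST_WAIT_START;
++ *	                   start -> LSST_RECONFIG_DONE or LSST_WAIT_START
++ *
++ * where "start -> A or B" means A on a successful ls_reconfig and B on a
++ * failed one.
++ */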
++
++static __inline__ gd_ls_t *get_work(int clear)
++{
++ gd_ls_t *ls;
++
++ spin_lock(&lslist_lock);
++
++ list_for_each_entry(ls, &lslist, ls_list) {
++ if (clear) {
++ if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
++ goto got_work;
++
++ } else {
++ if (test_bit(LSFL_WORK, &ls->ls_flags))
++ goto got_work;
++ }
++ }
++ ls = NULL;
++
++ got_work:
++ spin_unlock(&lslist_lock);
++
++ return ls;
++}
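++
++/*
++ * Note on the "clear" argument: the recovery thread sleeps on
++ * !get_work(0), which only peeks at LSFL_WORK, then claims the work with
++ * get_work(1), which clears the bit so a lockspace is handed out once
++ * per kick.
++ */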
++
++/*
++ * Thread which does recovery for all lockspaces.
++ */
++
++static int dlm_recoverd(void *arg)
++{
++ gd_ls_t *ls;
++
++ daemonize("dlm_recoverd");
++ recoverd_task = current;
++ complete(&recoverd_run);
++
++ while (!test_bit(THREAD_STOP, &recoverd_flags)) {
++ wchan_cond_sleep_intr(recoverd_wait, !get_work(0));
++ if ((ls = get_work(1)))
++ do_ls_recovery(ls);
++ }
++
++ complete(&recoverd_run);
++ return 0;
++}
++
++/*
++ * Mark a specific lockspace as needing work and wake up the thread to do it.
++ */
++
++void recoverd_kick(gd_ls_t *ls)
++{
++ set_bit(LSFL_WORK, &ls->ls_flags);
++ wake_up(&recoverd_wait);
++}
++
++/*
++ * Start the recoverd thread when gdlm is started (before any lockspaces).
++ */
++
++int recoverd_start(void)
++{
++ int error;
++
++ clear_bit(THREAD_STOP, &recoverd_flags);
++ error = kernel_thread(dlm_recoverd, NULL, 0);
++ if (error < 0)
++ goto out;
++
++ error = 0;
++ wait_for_completion(&recoverd_run);
++
++ out:
++ return error;
++}
++
++/*
++ * Stop the recoverd thread when gdlm is shut down (all lockspaces are gone).
++ */
++
++int recoverd_stop(void)
++{
++ set_bit(THREAD_STOP, &recoverd_flags);
++ wake_up(&recoverd_wait);
++ wait_for_completion(&recoverd_run);
++
++ return 0;
++}
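+#
+# A minimal sketch of how these entry points are meant to be driven,
+# assuming the module init/exit and event-callback wiring implied by the
+# comments above (illustrative, not part of the patch):
+#
+#	recoverd_start();	/* once, when the dlm module loads */
+#	...
+#	recoverd_kick(ls);	/* from the cman stop/start/finish callbacks */
+#	...
+#	recoverd_stop();	/* once, after all lockspaces are gone */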
+diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h
+--- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/recoverd.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RECOVERD_DOT_H__
++#define __RECOVERD_DOT_H__
++
++void dlm_recoverd_init(void);
++void recoverd_kick(gd_ls_t * ls);
++int recoverd_start(void);
++int recoverd_stop(void);
++
++#endif /* __RECOVERD_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c
+--- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/rsb.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,307 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++#include "locking.h"
++#include "memory.h"
++#include "lockqueue.h"
++#include "nodes.h"
++#include "dir.h"
++#include "util.h"
++
++static gd_res_t *search_hashchain(struct list_head *head, gd_res_t *parent,
++ char *name, int namelen)
++{
++ gd_res_t *r;
++
++ list_for_each_entry(r, head, res_hashchain) {
++ if ((parent == r->res_parent) && (namelen == r->res_length) &&
++ (memcmp(name, r->res_name, namelen) == 0)) {
++ atomic_inc(&r->res_ref);
++ return r;
++ }
++ }
++
++ return NULL;
++}
++
++/*
++ * A way to hold onto an rsb we already have a reference to, for whatever
++ * reason, to make sure it doesn't go away. Opposite of release_rsb().
++ */
++
++void hold_rsb(gd_res_t *r)
++{
++ atomic_inc(&r->res_ref);
++}
++
++/*
++ * release_rsb() - Decrement reference count on rsb struct. Free the rsb
++ * struct when there are zero references. Every lkb for the rsb adds a
++ * reference. When the count is zero there can be no more lkbs for the rsb,
++ * on the queues or anywhere else.
++ */
++
++void release_rsb(gd_res_t *r)
++{
++ gd_ls_t *ls = r->res_ls;
++ int removed = FALSE;
++
++ write_lock(&ls->ls_reshash_lock);
++ atomic_dec(&r->res_ref);
++
++ if (!atomic_read(&r->res_ref)) {
++ GDLM_ASSERT(list_empty(&r->res_grantqueue),);
++ GDLM_ASSERT(list_empty(&r->res_waitqueue),);
++ GDLM_ASSERT(list_empty(&r->res_convertqueue),);
++ removed = TRUE;
++ list_del(&r->res_hashchain);
++ }
++ write_unlock(&ls->ls_reshash_lock);
++
++ if (removed) {
++ down_read(&ls->ls_gap_rsblist);
++ if (r->res_parent)
++ list_del(&r->res_subreslist);
++ else
++ list_del(&r->res_rootlist);
++ up_read(&ls->ls_gap_rsblist);
++
++ /*
++ * Remove resdir entry if this was a locally mastered root rsb.
++ */
++ if (!r->res_parent && !r->res_nodeid) {
++ if (get_directory_nodeid(r) != our_nodeid())
++ remote_remove_resdata(r->res_ls,
++ get_directory_nodeid(r),
++ r->res_name,
++ r->res_length,
++ r->res_resdir_seq);
++ else
++ remove_resdata(r->res_ls, our_nodeid(),
++ r->res_name, r->res_length,
++ r->res_resdir_seq);
++ }
++
++ if (r->res_lvbptr)
++ free_lvb(r->res_lvbptr);
++
++ free_rsb(r);
++ }
++}
++
++/*
++ * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist.
++ * If the rsb exists, its ref count is incremented by this function. If it
++ * doesn't exist, it's created with a ref count of one.
++ */
++
++int find_or_create_rsb(gd_ls_t *ls, gd_res_t *parent, char *name, int namelen,
++ int create, gd_res_t **rp)
++{
++ uint32_t hash;
++ gd_res_t *r, *tmp;
++ int error = -ENOMEM;
++
++ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
++
++ hash = gdlm_hash(name, namelen);
++ hash &= ls->ls_hashmask;
++
++ read_lock(&ls->ls_reshash_lock);
++ r = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
++ read_unlock(&ls->ls_reshash_lock);
++
++ if (r)
++ goto out_set;
++ if (!create) {
++ *rp = NULL;
++ goto out;
++ }
++
++ r = allocate_rsb(ls, namelen);
++ if (!r)
++ goto fail;
++
++ INIT_LIST_HEAD(&r->res_subreslist);
++ INIT_LIST_HEAD(&r->res_grantqueue);
++ INIT_LIST_HEAD(&r->res_convertqueue);
++ INIT_LIST_HEAD(&r->res_waitqueue);
++
++ memcpy(r->res_name, name, namelen);
++ r->res_length = namelen;
++ r->res_ls = ls;
++ init_rwsem(&r->res_lock);
++ atomic_set(&r->res_ref, 1);
++
++ if (parent) {
++ r->res_parent = parent;
++ r->res_depth = parent->res_depth + 1;
++ r->res_root = parent->res_root;
++ r->res_nodeid = parent->res_nodeid;
++ } else {
++ r->res_parent = NULL;
++ r->res_depth = 1;
++ r->res_root = r;
++ r->res_nodeid = -1;
++ }
++
++ write_lock(&ls->ls_reshash_lock);
++ tmp = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
++ if (tmp) {
++ write_unlock(&ls->ls_reshash_lock);
++ free_rsb(r);
++ r = tmp;
++ } else {
++ list_add(&r->res_hashchain, &ls->ls_reshashtbl[hash]);
++ write_unlock(&ls->ls_reshash_lock);
++
++ down_read(&ls->ls_gap_rsblist);
++ if (parent)
++ list_add_tail(&r->res_subreslist,
++ &r->res_root->res_subreslist);
++ else
++ list_add(&r->res_rootlist, &ls->ls_rootres);
++ up_read(&ls->ls_gap_rsblist);
++ }
++
++ out_set:
++ *rp = r;
++
++ out:
++ error = 0;
++
++ fail:
++ return error;
++}
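++
++/*
++ * Typical lookup/use/release pattern (a sketch); release_rsb() drops the
++ * reference that find_or_create_rsb() took:
++ *
++ *	gd_res_t *r;
++ *	error = find_or_create_rsb(ls, NULL, name, namelen, TRUE, &r);
++ *	if (!error && r) {
++ *		... use r ...
++ *		release_rsb(r);
++ *	}
++ */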
++
++/*
++ * Add an LKB to a resource's grant/convert/wait queue, in mode order.
++ */
++
++void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode)
++{
++ gd_lkb_t *lkb = NULL;
++
++ list_for_each_entry(lkb, head, lkb_statequeue) {
++ if (lkb->lkb_rqmode < mode)
++ break;
++ }
++
++ if (!lkb) {
++ /* No entries in the queue, we are alone */
++ list_add_tail(new, head);
++ } else {
++ __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
++ }
++}
++
++/*
++ * The rsb res_lock must be held in write when this function is called.
++ */
++
++void lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
++{
++
++ GDLM_ASSERT(!lkb->lkb_status, printk("status=%u\n", lkb->lkb_status););
++
++ lkb->lkb_status = type;
++
++ switch (type) {
++ case GDLM_LKSTS_WAITING:
++ list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
++ break;
++
++ case GDLM_LKSTS_GRANTED:
++ lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
++ lkb->lkb_grmode);
++ break;
++
++ case GDLM_LKSTS_CONVERT:
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE)
++ list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
++
++ else
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT)
++ list_add_tail(&lkb->lkb_statequeue,
++ &r->res_convertqueue);
++ else
++ lkb_add_ordered(&lkb->lkb_statequeue,
++ &r->res_convertqueue, lkb->lkb_rqmode);
++ break;
++
++ default:
++ GDLM_ASSERT(0,);
++ }
++}
++
++void res_lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
++{
++ down_write(&r->res_lock);
++ lkb_enqueue(r, lkb, type);
++ up_write(&r->res_lock);
++}
++
++/*
++ * The rsb res_lock must be held in write when this function is called.
++ */
++
++int lkb_dequeue(gd_lkb_t *lkb)
++{
++ int status = lkb->lkb_status;
++
++ if (!status)
++ goto out;
++
++ lkb->lkb_status = 0;
++ list_del(&lkb->lkb_statequeue);
++
++ out:
++ return status;
++}
++
++int res_lkb_dequeue(gd_lkb_t *lkb)
++{
++ int status;
++
++ down_write(&lkb->lkb_resource->res_lock);
++ status = lkb_dequeue(lkb);
++ up_write(&lkb->lkb_resource->res_lock);
++
++ return status;
++}
++
++/*
++ * The rsb res_lock must be held in write when this function is called.
++ */
++
++int lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
++{
++ int status;
++
++ status = lkb_dequeue(lkb);
++ lkb_enqueue(r, lkb, type);
++
++ return status;
++}
++
++int res_lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
++{
++ int status;
++
++ down_write(&r->res_lock);
++ status = lkb_swqueue(r, lkb, type);
++ up_write(&r->res_lock);
++
++ return status;
++}
+diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h
+--- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/rsb.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,30 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RSB_DOT_H__
++#define __RSB_DOT_H__
++
++void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode);
++void _release_rsb(gd_res_t * r);
++void release_rsb(gd_res_t * r);
++void hold_rsb(gd_res_t * r);
++int find_or_create_rsb(gd_ls_t * ls, gd_res_t * parent, char *name, int namelen,
++ int create, gd_res_t ** rp);
++void lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
++void res_lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
++int lkb_dequeue(gd_lkb_t * lkb);
++int res_lkb_dequeue(gd_lkb_t * lkb);
++int lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
++int res_lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
++
++#endif /* __RSB_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c
+--- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/util.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,130 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++
++static const uint32_t crc_32_tab[] = {
++ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
++ 0xe963a535, 0x9e6495a3,
++ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
++ 0xe7b82d07, 0x90bf1d91,
++ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb,
++ 0xf4d4b551, 0x83d385c7,
++ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
++ 0xfa0f3d63, 0x8d080df5,
++ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447,
++ 0xd20d85fd, 0xa50ab56b,
++ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75,
++ 0xdcd60dcf, 0xabd13d59,
++ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
++ 0xcfba9599, 0xb8bda50f,
++ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11,
++ 0xc1611dab, 0xb6662d3d,
++ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
++ 0x9fbfe4a5, 0xe8b8d433,
++ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
++ 0x91646c97, 0xe6635c01,
++ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b,
++ 0x8208f4c1, 0xf50fc457,
++ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49,
++ 0x8cd37cf3, 0xfbd44c65,
++ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
++ 0xa4d1c46d, 0xd3d6f4fb,
++ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
++ 0xaa0a4c5f, 0xdd0d7cc9,
++ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3,
++ 0xb966d409, 0xce61e49f,
++ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
++ 0xb7bd5c3b, 0xc0ba6cad,
++ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af,
++ 0x04db2615, 0x73dc1683,
++ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d,
++ 0x0a00ae27, 0x7d079eb1,
++ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
++ 0x196c3671, 0x6e6b06e7,
++ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
++ 0x17b7be43, 0x60b08ed5,
++ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767,
++ 0x3fb506dd, 0x48b2364b,
++ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
++ 0x316e8eef, 0x4669be79,
++ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
++ 0x220216b9, 0x5505262f,
++ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
++ 0x2cd99e8b, 0x5bdeae1d,
++ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
++ 0x72076785, 0x05005713,
++ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d,
++ 0x7cdcefb7, 0x0bdbdf21,
++ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b,
++ 0x6fb077e1, 0x18b74777,
++ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
++ 0x616bffd3, 0x166ccf45,
++ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
++ 0x4969474d, 0x3e6e77db,
++ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
++ 0x47b2cf7f, 0x30b5ffe9,
++ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
++ 0x54de5729, 0x23d967bf,
++ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1,
++ 0x5a05df1b, 0x2d02ef8d
++};
++
++/**
++ * gdlm_hash - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ *
++ * Copied from GFS.
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * The hash function is a 32-bit CRC of the data. The algorithm uses
++ * the crc_32_tab table above.
++ *
++ * This may not be the fastest hash function, but it does a fair bit better
++ * at providing uniform results than the others I've looked at. That's
++ * really important for efficient directories.
++ *
++ * Returns: the hash
++ */
++
++uint32_t gdlm_hash(const char *data, int len)
++{
++ uint32_t hash = 0xFFFFFFFF;
++
++ for (; len--; data++)
++ hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
++
++ hash = ~hash;
++
++ return hash;
++}
++
++uint32_t gdlm_next_power2(uint32_t val)
++{
++ uint32_t x;
++
++ for (x = 1; x < val; x <<= 1) ;
++
++ return x;
++}
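++
++/*
++ * These two are used together to size and index the resource hash table
++ * (see find_or_create_rsb in rsb.c). Sizing the table to a power of two
++ * lets a mask replace a modulo; the first line below is a sketch of the
++ * lockspace setup done elsewhere:
++ *
++ *	ls->ls_hashmask = gdlm_next_power2(n) - 1;
++ *	bucket = gdlm_hash(name, namelen) & ls->ls_hashmask;
++ */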
++
++void print_lkb(gd_lkb_t *lkb)
++{
++ printk("dlm: lkb id=%x remid=%x flags=%x status=%x rq=%d gr=%d "
++ "nodeid=%u lqstate=%x lqflags=%x\n",
++ lkb->lkb_id, lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_status,
++ lkb->lkb_rqmode, lkb->lkb_grmode, lkb->lkb_nodeid,
++ lkb->lkb_lockqueue_state, lkb->lkb_lockqueue_flags);
++}
+diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h
+--- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/util.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __UTIL_DOT_H__
++#define __UTIL_DOT_H__
++
++uint32_t gdlm_hash(const char *data, int len);
++uint32_t gdlm_next_power2(uint32_t val);
++
++void print_lkb(gd_lkb_t *lkb);
++
++#endif
+diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h
+--- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/include/cluster/dlm.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,404 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DLM_DOT_H__
++#define __DLM_DOT_H__
++
++/*
++ * Interface to DLM - routines and structures to use DLM lockspaces.
++ */
++
++/*
++ * Lock Modes
++ */
++
++#define DLM_LOCK_IV (-1) /* invalid */
++#define DLM_LOCK_NL (0) /* null */
++#define DLM_LOCK_CR (1) /* concurrent read */
++#define DLM_LOCK_CW (2) /* concurrent write */
++#define DLM_LOCK_PR (3) /* protected read */
++#define DLM_LOCK_PW (4) /* protected write */
++#define DLM_LOCK_EX (5) /* exclusive */
++
++/*
++ * Maximum size in bytes of a dlm_lock name
++ */
++
++#define DLM_RESNAME_MAXLEN (64)
++
++/*
++ * Size in bytes of Lock Value Block
++ */
++
++#define DLM_LVB_LEN (32)
++
++/*
++ * Flags to dlm_new_lockspace
++ *
++ * DLM_LSF_NOTIMERS
++ *
++ * Do not subject locks in this lockspace to time-outs.
++ *
++ */
++
++#define DLM_LSF_NOTIMERS (1)
++
++/*
++ * Flags to dlm_lock
++ *
++ * DLM_LKF_NOQUEUE
++ *
++ * Do not queue the lock request on the wait queue if it cannot be granted
++ * immediately. If the lock cannot be granted because of this flag, DLM will
++ * either return -EAGAIN from the dlm_lock call or will return 0 from
++ * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
++ *
++ * DLM_LKF_CONVERT
++ *
++ * Indicates a lock conversion request. For conversions the name and namelen
++ * are ignored and the lock ID in the LKSB is used to identify the lock.
++ *
++ * DLM_LKF_VALBLK
++ *
++ * Requests DLM to return the current contents of the lock value block in the
++ * lock status block. When this flag is set in a lock conversion from PW or EX
++ * modes, DLM assigns the value specified in the lock status block to the lock
++ * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
++ * containing application-specific information.
++ *
++ * DLM_LKF_QUECVT
++ *
++ * Force a conversion lock request to the back of the convert queue. All other
++ * conversion requests ahead of it must be granted before it can be granted.
++ * This enforces a FIFO ordering on the convert queue. When this flag is set,
++ * indefinite postponement is averted. This flag is allowed only when
++ * converting a lock to a more restrictive mode.
++ *
++ * DLM_LKF_CANCEL
++ *
++ * Used to cancel a pending conversion (with dlm_unlock). The lock is
++ * returned to its previously granted mode.
++ *
++ * DLM_LKF_IVVALBLK
++ *
++ * Invalidate/clear the lock value block.
++ *
++ * DLM_LKF_CONVDEADLK
++ *
++ * The granted mode of a lock being converted (from a non-NL mode) can be
++ * changed to NL in the process of acquiring the requested mode to avoid
++ * conversion deadlock.
++ *
++ * DLM_LKF_PERSISTENT
++ *
++ * Only relevant to locks originating in userspace. Signals to the ioctl.c code
++ * that this lock should not be unlocked when the process exits.
++ *
++ * DLM_LKF_NODLCKWT
++ *
++ * This lock is not to be checked for conversion deadlocks.
++ *
++ * DLM_LKF_NODLCKBLK
++ *
++ * not yet implemented
++ *
++ * DLM_LKF_EXPEDITE
++ *
++ * If this lock conversion cannot be granted immediately it is to go to the
++ * head of the conversion queue regardless of its requested lock mode.
++ *
++ * DLM_LKF_NOQUEUEBAST
++ *
++ * Send blocking ASTs before returning -EAGAIN to the caller. This flag is
++ * only used along with the NOQUEUE flag; blocking ASTs are not otherwise
++ * sent for failed NOQUEUE requests.
++ *
++ */
++
++#define DLM_LKF_NOQUEUE (0x00000001)
++#define DLM_LKF_CANCEL (0x00000002)
++#define DLM_LKF_CONVERT (0x00000004)
++#define DLM_LKF_VALBLK (0x00000008)
++#define DLM_LKF_QUECVT (0x00000010)
++#define DLM_LKF_IVVALBLK (0x00000020)
++#define DLM_LKF_CONVDEADLK (0x00000040)
++#define DLM_LKF_PERSISTENT (0x00000080)
++#define DLM_LKF_NODLCKWT (0x00000100)
++#define DLM_LKF_NODLCKBLK (0x00000200)
++#define DLM_LKF_EXPEDITE (0x00000400)
++#define DLM_LKF_NOQUEUEBAST (0x00000800)
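++
++/*
++ * For example (illustrative only), a conversion request that must not
++ * block and that wants the LVB returned would pass:
++ *
++ *	flags = DLM_LKF_CONVERT | DLM_LKF_NOQUEUE | DLM_LKF_VALBLK;
++ */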
++
++/*
++ * Some return codes that are not in errno.h
++ */
++
++#define DLM_ECANCEL (0x10001)
++#define DLM_EUNLOCK (0x10002)
++
++typedef void dlm_lockspace_t;
++
++/*
++ * Lock range structure
++ */
++
++struct dlm_range {
++ uint64_t ra_start;
++ uint64_t ra_end;
++};
++
++/*
++ * Lock status block
++ *
++ * Use this structure to specify the contents of the lock value block. For a
++ * conversion request, this structure is used to specify the lock ID of the
++ * lock. DLM writes the status of the lock request and the lock ID assigned
++ * to the request in the lock status block.
++ *
++ * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
++ * It is available when dlm_lock returns.
++ *
++ * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
++ * shown for the DLM_LKF_VALBLK flag.
++ *
++ * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
++ * it was first demoted to NL to avoid conversion deadlock.
++ *
++ * sb_status: the returned status of the lock request set prior to AST
++ * execution. Possible return values:
++ *
++ * 0 if lock request was successful
++ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
++ * -ENOMEM if there is no memory to process request
++ * -EINVAL if there are invalid parameters
++ * -DLM_EUNLOCK if unlock request was successful
++ * -DLM_ECANCEL ?
++ */
++
++#define DLM_SBF_DEMOTED (0x01)
++
++struct dlm_lksb {
++ int sb_status;
++ uint32_t sb_lkid;
++ char sb_flags;
++ char * sb_lvbptr;
++};
++
++/*
++ * These defines are the bits that make up the
++ * query code.
++ */
++
++/* Bits 0, 1, 2: the lock mode or DLM_LOCK_THIS; see DLM_LOCK_NL etc. in
++ * dlm.h. Ignored for DLM_QUERY_LOCKS_ALL */
++#define DLM_LOCK_THIS 0x0007
++#define DLM_QUERY_MODE_MASK 0x0007
++
++/* Bits 3, 4, 5 bitmap of queue(s) to query */
++#define DLM_QUERY_QUEUE_WAIT 0x0008
++#define DLM_QUERY_QUEUE_CONVERT 0x0010
++#define DLM_QUERY_QUEUE_GRANT 0x0020
++#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */
++#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */
++
++/* Bit 6, Return only the information that can be established without a network
++ * round-trip. The caller must be aware of the implications of this. Useful for
++ * just getting the master node id or resource name. */
++#define DLM_QUERY_LOCAL 0x0040
++
++/* Bits 8 up, query type */
++#define DLM_QUERY_LOCKS_HIGHER 0x0100
++#define DLM_QUERY_LOCKS_LOWER 0x0200
++#define DLM_QUERY_LOCKS_EQUAL 0x0300
++#define DLM_QUERY_LOCKS_BLOCKING 0x0400
++#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500
++#define DLM_QUERY_LOCKS_ALL 0x0600
++#define DLM_QUERY_MASK 0x0F00
++
++/* GRMODE is the default for mode comparisons,
++ RQMODE might also be handy */
++#define DLM_QUERY_GRMODE 0x0000
++#define DLM_QUERY_RQMODE 0x1000
++
++/* Structures passed into and out of the query */
++
++struct dlm_lockinfo {
++ int lki_lkid; /* Lock ID on originating node */
++ int lki_mstlkid; /* Lock ID on master node */
++ int lki_parent;
++ int lki_node; /* Originating node (not master) */
++ uint8_t lki_state; /* Queue the lock is on */
++ uint8_t lki_grmode; /* Granted mode */
++ uint8_t lki_rqmode; /* Requested mode */
++ struct dlm_range lki_grrange; /* Granted range, if applicable */
++ struct dlm_range lki_rqrange; /* Requested range, if applicable */
++};
++
++struct dlm_resinfo {
++ int rsi_length;
++ int rsi_grantcount; /* No. of nodes on grant queue */
++ int rsi_convcount; /* No. of nodes on convert queue */
++ int rsi_waitcount; /* No. of nodes on wait queue */
++ int rsi_masternode; /* Master for this resource */
++ char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */
++ char rsi_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable */
++};
++
++struct dlm_queryinfo {
++ struct dlm_resinfo *gqi_resinfo;
++ struct dlm_lockinfo *gqi_lockinfo; /* This points to an array
++ * of structs */
++ int gqi_locksize; /* input */
++ int gqi_lockcount; /* output */
++};
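++
++/*
++ * For example (illustrative only), to ask for every lock on every queue,
++ * regardless of mode, compared by granted mode:
++ *
++ *	query = DLM_QUERY_QUEUE_ALL | DLM_QUERY_LOCKS_ALL | DLM_QUERY_GRMODE;
++ */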
++
++#ifdef __KERNEL__
++/*
++ * dlm_init
++ *
++ * Starts and initializes DLM threads and structures. Creation of the first
++ * lockspace will call this if it has not been called already.
++ *
++ * Returns: 0 if successful, -EXXX on error
++ */
++
++int dlm_init(void);
++
++/*
++ * dlm_release
++ *
++ * Stops DLM threads.
++ *
++ * Returns: 0 if successful, -EXXX on error
++ */
++
++int dlm_release(void);
++
++/*
++ * dlm_new_lockspace
++ *
++ * Starts a lockspace with the given name. If the named lockspace exists in
++ * the cluster, the calling node joins it.
++ */
++
++int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
++ int flags);
++
++/*
++ * dlm_release_lockspace
++ *
++ * Stop a lockspace.
++ */
++
++int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
++
++/*
++ * dlm_lock
++ *
++ * Make an asynchronous request to acquire or convert a lock on a named
++ * resource.
++ *
++ * lockspace: context for the request
++ * mode: the requested mode of the lock (DLM_LOCK_)
++ * lksb: lock status block for input and async return values
++ * flags: input flags (DLM_LKF_)
++ * name: name of the resource to lock, can be binary
++ * namelen: the length in bytes of the resource name (at most DLM_RESNAME_MAXLEN)
++ * parent: the lock ID of a parent lock or 0 if none
++ * lockast: function DLM executes when it completes processing the request
++ * astarg: argument passed to lockast and bast functions
++ * bast: function DLM executes when this lock later blocks another request
++ *
++ * Returns:
++ * 0 if request is successfully queued for processing
++ * -EINVAL if any input parameters are invalid
++ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
++ * -ENOMEM if there is no memory to process request
++ * -ENOTCONN if there is a communication error
++ *
++ * If the call to dlm_lock returns an error then the operation has failed and
++ * the AST routine will not be called. If dlm_lock returns 0 it is still
++ * possible that the lock operation will fail. The AST routine will be called
++ * when the locking is complete and the status is returned in the lksb.
++ *
++ * If AST routines or an AST argument are passed to a conversion operation,
++ * they overwrite the values that were passed to the previous dlm_lock
++ * call.
++ *
++ * AST routines should not block (at least not for long), but may make
++ * any locking calls they please.
++ */
++
++int dlm_lock(dlm_lockspace_t *lockspace,
++ uint32_t mode,
++ struct dlm_lksb *lksb,
++ uint32_t flags,
++ void *name,
++ unsigned int namelen,
++ uint32_t parent,
++ void (*lockast) (void *astarg),
++ void *astarg,
++ void (*bast) (void *astarg, int mode),
++ struct dlm_range *range);
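++
++/*
++ * A minimal synchronous wrapper over the interface above (a sketch; a
++ * real caller would typically embed the completion in its own per-lock
++ * structure rather than keep it on the stack):
++ *
++ *	static void sync_ast(void *astarg)
++ *	{
++ *		complete((struct completion *) astarg);
++ *	}
++ *
++ *	int lock_sync(dlm_lockspace_t *ls, uint32_t mode,
++ *		      struct dlm_lksb *lksb, uint32_t flags,
++ *		      void *name, unsigned int namelen)
++ *	{
++ *		DECLARE_COMPLETION(done);
++ *		int error;
++ *
++ *		error = dlm_lock(ls, mode, lksb, flags, name, namelen,
++ *				 0, sync_ast, &done, NULL, NULL);
++ *		if (!error) {
++ *			wait_for_completion(&done);
++ *			error = lksb->sb_status;
++ *		}
++ *		return error;
++ *	}
++ */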
++
++/*
++ * dlm_unlock
++ *
++ * Asynchronously release a lock on a resource. The AST routine is called
++ * when the resource is successfully unlocked.
++ *
++ * lockspace: context for the request
++ * lkid: the lock ID as returned in the lksb
++ * flags: input flags (DLM_LKF_)
++ * lksb: if NULL the lksb parameter passed to last lock request is used
++ * astarg: if NULL, astarg in last lock request is used
++ *
++ * Returns:
++ * 0 if request is successfully queued for processing
++ * -EINVAL if any input parameters are invalid
++ * -ENOTEMPTY if the lock still has sublocks
++ * -EBUSY if the lock is waiting for a remote lock operation
++ * -ENOTCONN if there is a communication error
++ */
++
++extern int dlm_unlock(dlm_lockspace_t *lockspace,
++ uint32_t lkid,
++ uint32_t flags,
++ struct dlm_lksb *lksb,
++ void *astarg);
++
++/* Query interface
++ *
++ * Query the other holders of a resource, given a known lock ID
++ *
++ * lockspace: context for the request
++ * lksb: LKSB, sb_lkid contains the lock ID of a valid lock
++ * on the resource. sb_status will contain the status
++ * of the request on completion.
++ * query: query bitmap see DLM_QUERY_* above
++ * qinfo: pointer to dlm_queryinfo structure
++ * ast_routine: AST routine to call on completion
++ * astarg: argument to AST routine. It is "traditional"
++ * to put the qinfo pointer into lksb->sb_lvbptr
++ * and pass the lksb in here.
++ */
++extern int dlm_query(dlm_lockspace_t *lockspace,
++ struct dlm_lksb *lksb,
++ int query,
++ struct dlm_queryinfo *qinfo,
++ void (ast_routine(void *)),
++ void *astarg);
++
++#endif /* __KERNEL__ */
++
++#endif /* __DLM_DOT_H__ */
+diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h
+--- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/include/cluster/dlm_device.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,63 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* This is the device interface for dlm; most users will use a library
++ * interface.
++ */
++
++/* Version of the device interface */
++#define DLM_DEVICE_VERSION_MAJOR 2
++#define DLM_DEVICE_VERSION_MINOR 0
++#define DLM_DEVICE_VERSION_PATCH 0
++
++/* struct passed to the lock write */
++struct dlm_lock_params {
++ uint32_t version[3];
++ uint8_t cmd;
++ uint8_t mode;
++ uint16_t flags;
++ uint32_t lkid;
++ uint32_t parent;
++ struct dlm_range range;
++ uint8_t namelen;
++ void *astparam;
++ void *astaddr;
++ void *bastaddr;
++ struct dlm_lksb *lksb;
++ char name[1];
++};
++
++
++/* struct read from the "device" fd; it consists mainly of userspace
++ pointers for the library to use */
++struct dlm_lock_result {
++ uint8_t cmd;
++ void *astparam;
++ void (*astaddr)(void *astparam);
++ struct dlm_lksb *user_lksb;
++ struct dlm_lksb lksb; /* But this has real data in it */
++ uint8_t bast_mode; /* Not yet used */
++};
++
++/* commands passed to the device */
++#define DLM_USER_LOCK 1
++#define DLM_USER_UNLOCK 2
++#define DLM_USER_QUERY 3
++
++/* Arbitrary length restriction */
++#define MAX_LS_NAME_LEN 64
++
++/* ioctls on the device */
++#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *)
++#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *)
++#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)
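+#
+# A userspace library would drive this interface roughly as follows (a
+# sketch; the device node name and the exact read/write framing beyond
+# the structs above are assumptions):
+#
+#	fd = open("/dev/dlm", O_RDWR);
+#	ioctl(fd, DLM_CREATE_LOCKSPACE, "myls");
+#	write(fd, &params, sizeof(params) + params.namelen); /* DLM_USER_LOCK */
+#	read(fd, &result, sizeof(result)); /* completed request + lksb data */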
--- /dev/null
+# Make the VFS call down into the FS on flock calls.
+diff -urN -p linux-2.6.7/fs/locks.c linux/fs/locks.c
+--- linux-2.6.7/fs/locks.c 2004-06-16 12:00:44.567463632 -0500
++++ linux/fs/locks.c 2004-06-16 12:01:58.844205936 -0500
+@@ -1294,6 +1294,27 @@ out_unlock:
+ return error;
+ }
+
++/*
++ * Wrapper function around the file_operations lock routine when called for
++ * flock(). The lock routine is called for both fcntl() and flock(), so
++ * the flock parameters must be translated to an equivalent fcntl()-like
++ * lock.
++ *
++ * Don't use locks_alloc_lock() (or flock_make_lock()) here, as
++ * this is just a temporary lock structure. We especially don't
++ * want to fail because we couldn't allocate a lock structure if
++ * this is an unlock operation.
++ */
++int flock_fs_file(struct file *filp, int type, int wait)
++{
++ struct file_lock fl = { .fl_flags = FL_FLOCK,
++ .fl_type = type };
++
++ return filp->f_op->lock(filp,
++ (wait) ? F_SETLKW : F_SETLK,
++ &fl);
++}
++
+ /**
+ * sys_flock: - flock() system call.
+ * @fd: the file descriptor to lock.
+@@ -1342,6 +1363,50 @@ asmlinkage long sys_flock(unsigned int f
+ if (error)
+ goto out_free;
+
++ /*
++ * Execute any filesystem-specific flock routines. The filesystem may
++ * maintain supplemental locks. This code allows the supplemental locks
++ * to be kept in sync with the vfs flock lock. If flock() is called on
++ * a lock already held for the given filp, the current flock lock is
++ * dropped before obtaining the requested lock. This unlock operation
++ * must be completed for any filesystem-specific locks and the vfs
++ * flock lock before proceeding with obtaining the requested lock. When
++ * the filesystem routine drops a lock for such a request, it must
++ * return -EDEADLK, allowing the vfs lock to be dropped, and the
++ * filesystem code is then re-executed to obtain the lock.
++ *
++ * A non-blocking request that returns EWOULDBLOCK also causes any vfs
++ * flock lock to be released, but then returns the error to the caller.
++ */
++ if (filp->f_op && filp->f_op->lock) {
++ repeat:
++ error = flock_fs_file(filp, lock->fl_type, can_sleep);
++ if (error < 0) {
++ /*
++ * We may have dropped a lock. We need to
++ * finish unlocking before returning or
++ * continuing with lock acquisition.
++ */
++ if (error != -ENOLCK)
++ flock_lock_file(filp, &(struct file_lock){.fl_type = F_UNLCK});
++
++ /*
++ * We already held the lock in some mode, and
++ * had to drop filesystem-specific locks before
++ * proceeding. We come back through this
++ * routine to unlock the vfs flock lock. Now go
++ * back and try again. Using EAGAIN as the
++ * error here would be better, but the one valid
++ * error value defined for flock(), EWOULDBLOCK,
++ * is defined as EAGAIN.
++ */
++ if (error == -EDEADLK)
++ goto repeat;
++
++ goto out_free;
++ }
++ }
++
+ for (;;) {
+ error = flock_lock_file(filp, lock);
+ if ((error != -EAGAIN) || !can_sleep)
+@@ -1354,6 +1419,13 @@ asmlinkage long sys_flock(unsigned int f
+ break;
+ }
+
++ /*
++ * If we failed to get the vfs flock, we need to clean up any
++ * filesystem-specific lock state that we previously obtained.
++ */
++ if (error && filp->f_op && filp->f_op->lock)
++ flock_fs_file(filp, F_UNLCK, 1);
++
+ out_free:
+ if (list_empty(&lock->fl_link)) {
+ locks_free_lock(lock);
+@@ -1714,6 +1786,8 @@ void locks_remove_flock(struct file *fil
+ if (fl->fl_file == filp) {
+ if (IS_FLOCK(fl)) {
+ locks_delete_lock(before);
++ if (filp->f_op && filp->f_op->lock)
++ flock_fs_file(filp, F_UNLCK, 1);
+ continue;
+ }
+ if (IS_LEASE(fl)) {
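+#
+# Under the protocol above, a filesystem ->lock() routine handling the
+# flock path behaves roughly as follows (a sketch with hypothetical
+# myfs_* helpers, not part of the patch):
+#
+#	int myfs_lock(struct file *file, int cmd, struct file_lock *fl)
+#	{
+#		if (fl->fl_type == F_UNLCK)
+#			return myfs_drop_flock(file);
+#
+#		/* Already holding the flock in another mode?  Drop it and
+#		   return -EDEADLK so the VFS unlocks its lock and retries. */
+#		if (myfs_flock_held(file)) {
+#			myfs_drop_flock(file);
+#			return -EDEADLK;
+#		}
+#
+#		/* -EAGAIN from a non-blocking request makes sys_flock drop
+#		   the vfs flock lock and return EWOULDBLOCK. */
+#		return myfs_get_flock(file, fl->fl_type, cmd == F_SETLKW);
+#	}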
+# Add lock harness to the build system.
+diff -urN -p linux-2.6.7/fs/Kconfig linux/fs/Kconfig
+--- linux-2.6.7/fs/Kconfig 2004-06-16 12:00:44.558465722 -0500
++++ linux/fs/Kconfig 2004-06-16 12:02:02.401379449 -0500
+@@ -1669,6 +1669,14 @@ config AFS_FS
+ config RXRPC
+ tristate
+
++config LOCK_HARNESS
++ tristate "GFS Lock Harness"
++ help
++ The module that connects GFS to the modules that provide
++ locking for GFS.
++
++ If you want to use GFS (a cluster filesystem), say Y here.
++
+ endmenu
+
+ menu "Partition Types"
+diff -urN -p linux-2.6.7/fs/Makefile linux/fs/Makefile
+--- linux-2.6.7/fs/Makefile 2004-06-16 12:00:44.558465722 -0500
++++ linux/fs/Makefile 2004-06-16 12:02:02.402379216 -0500
+@@ -91,3 +91,4 @@ obj-$(CONFIG_JFS_FS) += jfs/
+ obj-$(CONFIG_XFS_FS) += xfs/
+ obj-$(CONFIG_AFS_FS) += afs/
+ obj-$(CONFIG_BEFS_FS) += befs/
++obj-$(CONFIG_LOCK_HARNESS) += gfs_locking/
+diff -urN -p linux-2.6.7/fs/gfs_locking/Makefile linux/fs/gfs_locking/Makefile
+--- linux-2.6.7/fs/gfs_locking/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs_locking/Makefile 2004-06-16 12:02:02.402379216 -0500
+@@ -0,0 +1,14 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_LOCK_HARNESS) += lock_harness/
++
+diff -urN -p linux-2.6.7/fs/gfs_locking/lock_harness/Makefile linux/fs/gfs_locking/lock_harness/Makefile
+--- linux-2.6.7/fs/gfs_locking/lock_harness/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs_locking/lock_harness/Makefile 2004-06-16 12:02:02.402379216 -0500
+@@ -0,0 +1,16 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_LOCK_HARNESS) += lock_harness.o
++
++lock_harness-y := main.o
++
+# Add GFS to the build system.
+diff -urN -p linux-2.6.7/fs/Kconfig linux/fs/Kconfig
+--- linux-2.6.7/fs/Kconfig 2004-06-25 13:57:24.435829621 -0500
++++ linux/fs/Kconfig 2004-06-25 13:59:16.786347614 -0500
+@@ -316,13 +316,13 @@ config JFS_STATISTICS
+ to be made available to the user in the /proc/fs/jfs/ directory.
+
+ config FS_POSIX_ACL
+-# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs)
++# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/GFS)
+ #
+ # NOTE: you can implement Posix ACLs without these helpers (XFS does).
+ # Never use this symbol for ifdefs.
+ #
+ bool
+- depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL
++ depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL || GFS_FS
+ default y
+
+ config XFS_FS
+@@ -1677,6 +1677,20 @@ config LOCK_HARNESS
+
+ If you want to use GFS (a cluster filesystem) say Y here.
+
++config GFS_FS
++ tristate "GFS file system support"
++ depends on LOCK_HARNESS
++ help
++ A cluster filesystem.
++
++ Allows a cluster of computers to simultaneously use a block device
++ that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads
++ and writes to the block device like a local filesystem, but also uses
++ a lock module to allow the computers to coordinate their I/O so that
++ filesystem consistency is maintained. One of the nifty features of
++ GFS is perfect consistency -- changes made to the filesystem on one
++ machine show up immediately on all other machines in the cluster.
++
+ endmenu
+
+ menu "Partition Types"
+diff -urN -p linux-2.6.7/fs/Makefile linux/fs/Makefile
+--- linux-2.6.7/fs/Makefile 2004-06-25 13:57:24.436829391 -0500
++++ linux/fs/Makefile 2004-06-25 13:57:24.447826863 -0500
+@@ -92,3 +92,4 @@ obj-$(CONFIG_XFS_FS) += xfs/
+ obj-$(CONFIG_AFS_FS) += afs/
+ obj-$(CONFIG_BEFS_FS) += befs/
+ obj-$(CONFIG_LOCK_HARNESS) += gfs_locking/
++obj-$(CONFIG_GFS_FS) += gfs/
+diff -urN -p linux-2.6.7/fs/gfs/Makefile linux/fs/gfs/Makefile
+--- linux-2.6.7/fs/gfs/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs/Makefile 2004-06-25 13:57:24.448826633 -0500
+@@ -0,0 +1,51 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_GFS_FS) += gfs.o
++
++gfs-y := acl.o \
++ bits.o \
++ bmap.o \
++ daemon.o \
++ dio.o \
++ dir.o \
++ eattr.o \
++ file.o \
++ flock.o \
++ glock.o \
++ glops.o \
++ inode.o \
++ ioctl.o \
++ locking.o \
++ log.o \
++ lops.o \
++ lvb.o \
++ main.o \
++ mount.o \
++ ondisk.o \
++ ops_address.o \
++ ops_dentry.o \
++ ops_export.o \
++ ops_file.o \
++ ops_fstype.o \
++ ops_inode.o \
++ ops_super.o \
++ ops_vm.o \
++ page.o \
++ quota.o \
++ recovery.o \
++ rgrp.o \
++ super.o \
++ trans.o \
++ unlinked.o \
++ util.o
++
+# Add lock_nolock to the build system.
+diff -urN -p linux-2.6.7/fs/Kconfig linux/fs/Kconfig
+--- linux-2.6.7/fs/Kconfig 2004-06-16 12:02:09.563715325 -0500
++++ linux/fs/Kconfig 2004-06-16 12:02:09.574712769 -0500
+@@ -1691,6 +1691,12 @@ config GFS_FS
+ GFS is perfect consistency -- changes made to the filesystem on one
+ machine show up immediately on all other machines in the cluster.
+
++config LOCK_NOLOCK
++ tristate "Lock Nolock"
++ depends on LOCK_HARNESS
++ help
++ A "fake" lock module that allows GFS to run as a local filesystem.
++
+ endmenu
+
+ menu "Partition Types"
+diff -urN -p linux-2.6.7/fs/gfs_locking/Makefile linux/fs/gfs_locking/Makefile
+--- linux-2.6.7/fs/gfs_locking/Makefile 2004-06-16 12:02:05.985546690 -0500
++++ linux/fs/gfs_locking/Makefile 2004-06-16 12:02:09.574712769 -0500
+@@ -11,4 +11,5 @@
+ ###############################################################################
+
+ obj-$(CONFIG_LOCK_HARNESS) += lock_harness/
++obj-$(CONFIG_LOCK_NOLOCK) += lock_nolock/
+
+diff -urN -p linux-2.6.7/fs/gfs_locking/lock_nolock/Makefile linux/fs/gfs_locking/lock_nolock/Makefile
+--- linux-2.6.7/fs/gfs_locking/lock_nolock/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs_locking/lock_nolock/Makefile 2004-06-16 12:02:09.575712537 -0500
+@@ -0,0 +1,16 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_LOCK_NOLOCK) += lock_nolock.o
++
++lock_nolock-y := main.o
++
+# Add lock_dlm to the build system.
+diff -urN -p linux-2.6.7/fs/Kconfig linux/fs/Kconfig
+--- linux-2.6.7/fs/Kconfig 2004-06-16 12:02:13.145883030 -0500
++++ linux/fs/Kconfig 2004-06-16 12:02:13.157880243 -0500
+@@ -1697,6 +1697,12 @@ config LOCK_NOLOCK
+ help
+ A "fake" lock module that allows GFS to run as a local filesystem.
+
++config LOCK_DLM
++ tristate "Lock DLM"
++ depends on LOCK_HARNESS
++ help
++ A lock module that allows GFS to use a Distributed Lock Manager.
++
+ endmenu
+
+ menu "Partition Types"
+diff -urN -p linux-2.6.7/fs/gfs_locking/Makefile linux/fs/gfs_locking/Makefile
+--- linux-2.6.7/fs/gfs_locking/Makefile 2004-06-16 12:02:13.146882798 -0500
++++ linux/fs/gfs_locking/Makefile 2004-06-16 12:02:13.157880243 -0500
+@@ -12,4 +12,5 @@
+
+ obj-$(CONFIG_LOCK_HARNESS) += lock_harness/
+ obj-$(CONFIG_LOCK_NOLOCK) += lock_nolock/
++obj-$(CONFIG_LOCK_DLM) += lock_dlm/
+
+diff -urN -p linux-2.6.7/fs/gfs_locking/lock_dlm/Makefile linux/fs/gfs_locking/lock_dlm/Makefile
+--- linux-2.6.7/fs/gfs_locking/lock_dlm/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs_locking/lock_dlm/Makefile 2004-06-16 12:02:13.157880243 -0500
+@@ -0,0 +1,16 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_LOCK_DLM) += lock_dlm.o
++
++lock_dlm-y := main.o group.o lock.o mount.o thread.o plock.o
++
+# Add lock_gulm to the build system.
+diff -urN -p linux-2.6.7/fs/Kconfig linux/fs/Kconfig
+--- linux-2.6.7/fs/Kconfig 2004-06-16 12:02:16.816030294 -0500
++++ linux/fs/Kconfig 2004-06-16 12:02:16.827027739 -0500
+@@ -1703,6 +1703,12 @@ config LOCK_DLM
+ help
+ A lock module that allows GFS to use a Distributed Lock Manager.
+
++config LOCK_GULM
++ tristate "Lock GULM"
++ depends on LOCK_HARNESS
++ help
++ A lock module that allows GFS to use a Failover Lock Manager.
++
+ endmenu
+
+ menu "Partition Types"
+diff -urN -p linux-2.6.7/fs/gfs_locking/Makefile linux/fs/gfs_locking/Makefile
+--- linux-2.6.7/fs/gfs_locking/Makefile 2004-06-16 12:02:16.817030062 -0500
++++ linux/fs/gfs_locking/Makefile 2004-06-16 12:02:16.828027507 -0500
+@@ -13,4 +13,5 @@
+ obj-$(CONFIG_LOCK_HARNESS) += lock_harness/
+ obj-$(CONFIG_LOCK_NOLOCK) += lock_nolock/
+ obj-$(CONFIG_LOCK_DLM) += lock_dlm/
++obj-$(CONFIG_LOCK_GULM) += lock_gulm/
+
+diff -urN -p linux-2.6.7/fs/gfs_locking/lock_gulm/Makefile linux/fs/gfs_locking/lock_gulm/Makefile
+--- linux-2.6.7/fs/gfs_locking/lock_gulm/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs_locking/lock_gulm/Makefile 2004-06-16 12:02:16.828027507 -0500
+@@ -0,0 +1,33 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_LOCK_GULM) += lock_gulm.o
++
++lock_gulm-y := gulm_core.o \
++ gulm_fs.o \
++ gulm_jid.o \
++ gulm_lt.o \
++ gulm_procinfo.o \
++ handler.o \
++ lg_core.o \
++ lg_lock.o \
++ lg_main.o \
++ linux_gulm_main.o \
++ load_info.o \
++ util.o \
++ utils_crc.o \
++ utils_tostr.o \
++ utils_verb_flags.o \
++ xdr_base.o \
++ xdr_io.o \
++ xdr_socket.o
++
+diff -urN linux-orig/fs/gfs/acl.c linux-patched/fs/gfs/acl.c
+--- linux-orig/fs/gfs/acl.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/acl.c 2004-06-20 22:48:17.946947249 -0500
+@@ -0,0 +1,397 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/xattr_acl.h>
++
++#include "gfs.h"
++#include "acl.h"
++#include "dio.h"
++#include "eattr.h"
++#include "glock.h"
++#include "trans.h"
++#include "inode.h"
++
++/*
++ * Check to make sure that the acl is actually valid
++ */
++int
++gfs_validate_acl(struct gfs_inode *ip, const char *value, int size, int access)
++{
++ int err = 0;
++ struct posix_acl *acl = NULL;
++ struct gfs_sbd *sdp = ip->i_sbd;
++
++ if ((current->fsuid != ip->i_di.di_uid) && !capable(CAP_FOWNER))
++ return -EPERM;
++ if (ip->i_di.di_type == GFS_FILE_LNK)
++ return -EOPNOTSUPP;
++ if (!access && ip->i_di.di_type != GFS_FILE_DIR)
++ return -EACCES;
++ if (!sdp->sd_args.ar_posixacls)
++ return -EOPNOTSUPP;
++
++ if (value) {
++ acl = posix_acl_from_xattr(value, size);
++ if (IS_ERR(acl))
++ return PTR_ERR(acl);
++ else if (acl) {
++ err = posix_acl_valid(acl);
++ posix_acl_release(acl);
++ }
++ }
++ return err;
++}
++
++void
++gfs_acl_set_mode(struct gfs_inode *ip, struct posix_acl *acl)
++{
++ struct inode *inode;
++ mode_t mode;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ mode = inode->i_mode;
++ posix_acl_equiv_mode(acl, &mode);
++ inode->i_mode = mode;
++ iput(inode);
++ gfs_inode_attr_out(ip);
++}
++
++
++/**
++ * gfs_replace_acl - replace the value of the ea with the value of the acl
++ *
++ * NOTE: The new value must be the same size as the old one.
++ */
++int
++gfs_replace_acl(struct inode *inode, struct posix_acl *acl, int access,
++ struct gfs_ea_location location)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_easet_io req;
++ int size;
++ void *data;
++ int error;
++
++ size = posix_acl_to_xattr(acl, NULL, 0);
++ GFS_ASSERT(size == GFS_EA_DATA_LEN(location.ea),
++ printk("new acl size = %d, ea size = %u\n", size,
++ GFS_EA_DATA_LEN(location.ea)););
++
++ data = gmalloc(size);
++
++ posix_acl_to_xattr(acl, data, size);
++
++ req.es_data = data;
++ req.es_name = (access) ? GFS_POSIX_ACL_ACCESS : GFS_POSIX_ACL_DEFAULT;
++ req.es_data_len = size;
++ req.es_name_len = (access) ? GFS_POSIX_ACL_ACCESS_LEN : GFS_POSIX_ACL_DEFAULT_LEN;
++ req.es_cmd = GFS_EACMD_REPLACE;
++ req.es_type = GFS_EATYPE_SYS;
++
++ error = replace_ea(ip->i_sbd, ip, location.ea, &req);
++ if (!error)
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++
++ kfree(data);
++
++ return error;
++}
++
++/**
++ * gfs_findacl - returns the requested posix acl
++ *
++ * this function does not log the inode. It assumes that a lock is already
++ * held on it.
++ */
++int
++gfs_findacl(struct gfs_inode *ip, int access, struct posix_acl **acl_ptr,
++ struct gfs_ea_location *location)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct posix_acl *acl;
++ uint32_t avail_size;
++ void *data;
++ int error;
++
++ avail_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs_meta_header);
++ *acl_ptr = NULL;
++
++ if (!ip->i_di.di_eattr)
++ return 0;
++
++ error = find_eattr(ip,
++ (access) ? GFS_POSIX_ACL_ACCESS : GFS_POSIX_ACL_DEFAULT,
++ (access) ? GFS_POSIX_ACL_ACCESS_LEN : GFS_POSIX_ACL_DEFAULT_LEN,
++ GFS_EATYPE_SYS, location);
++ if (error <= 0)
++ return error;
++
++ data = gmalloc(GFS_EA_DATA_LEN(location->ea));
++
++ error = 0;
++ if (GFS_EA_IS_UNSTUFFED(location->ea))
++ error = read_unstuffed(data, ip, sdp, location->ea, avail_size,
++ gfs_ea_memcpy);
++ else
++ gfs_ea_memcpy(data, GFS_EA_DATA(location->ea),
++ GFS_EA_DATA_LEN(location->ea));
++ if (error)
++ goto out;
++
++ acl = posix_acl_from_xattr(data, GFS_EA_DATA_LEN(location->ea));
++ if (IS_ERR(acl))
++ error = PTR_ERR(acl);
++ else
++ *acl_ptr = acl;
++
++ out:
++ kfree(data);
++ if (error)
++ brelse(location->bh);
++
++ return error;
++}
++
++int
++gfs_getacl(struct inode *inode, int access, struct posix_acl **acl_ptr)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_eaget_io req;
++ struct posix_acl *acl;
++ int size;
++ void *data;
++ int error = 0;
++
++ *acl_ptr = NULL;
++
++ if (!sdp->sd_args.ar_posixacls)
++ return 0;
++
++ req.eg_name = (access) ? GFS_POSIX_ACL_ACCESS : GFS_POSIX_ACL_DEFAULT;
++ req.eg_name_len = (access) ? GFS_POSIX_ACL_ACCESS_LEN : GFS_POSIX_ACL_DEFAULT_LEN;
++ req.eg_type = GFS_EATYPE_SYS;
++ req.eg_len = NULL;
++ req.eg_data = NULL;
++ req.eg_data_len = 0;
++
++ error = gfs_ea_read_permission(&req, ip);
++ if (error)
++ return error;
++
++ if (!ip->i_di.di_eattr)
++ return error;
++
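++	/* A note on the (apparent) get_ea() contract, inferred from this
++	   caller: the first call passes a NULL eg_data buffer so it only
++	   reports the attribute's size, letting us allocate an exactly
++	   sized buffer for the second, copying call. */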
++ size = get_ea(sdp, ip, &req, gfs_ea_memcpy);
++ if (size < 0) {
++ if (size != -ENODATA)
++ error = size;
++ return error;
++ }
++
++ data = gmalloc(size);
++
++ req.eg_data = data;
++ req.eg_data_len = size;
++
++ size = get_ea(sdp, ip, &req, gfs_ea_memcpy);
++ if (size < 0) {
++ error = size;
++ goto out_free;
++ }
++
++ acl = posix_acl_from_xattr(data, size);
++ if (IS_ERR(acl))
++ error = PTR_ERR(acl);
++ else
++ *acl_ptr = acl;
++
++ out_free:
++ kfree(data);
++
++ return error;
++}
++
++int
++gfs_setup_new_acl(struct gfs_inode *dip,
++ unsigned int type, unsigned int *mode,
++ struct posix_acl **acl_ptr)
++{
++ struct gfs_ea_location location;
++ struct posix_acl *acl = NULL;
++ mode_t access_mode = *mode;
++ int error;
++
++ if (type == GFS_FILE_LNK)
++ return 0;
++
++ error = gfs_findacl(dip, FALSE, &acl, &location);
++ if (error)
++ return error;
++ if (!acl) {
++ (*mode) &= ~current->fs->umask;
++ return 0;
++ }
++ brelse(location.bh);
++
++ if (type == GFS_FILE_DIR) {
++ *acl_ptr = acl;
++ return 0;
++ }
++
++ error = posix_acl_create_masq(acl, &access_mode);
++ *mode = access_mode;
++ if (error > 0) {
++ *acl_ptr = acl;
++ return 0;
++ }
++
++ posix_acl_release(acl);
++
++ return error;
++}
++
++/**
++ * gfs_create_default_acl - initializes the default acl
++ *
++ * NOTE: gfs_init_access_acl must be called first
++ */
++int
++gfs_create_default_acl(struct gfs_inode *dip, struct gfs_inode *ip, void *data,
++ int size)
++{
++ struct gfs_easet_io req;
++ struct gfs_ea_location avail;
++ int error;
++
++ memset(&avail, 0, sizeof(struct gfs_ea_location));
++
++ req.es_data = data;
++ req.es_name = GFS_POSIX_ACL_DEFAULT;
++ req.es_data_len = size;
++ req.es_name_len = GFS_POSIX_ACL_DEFAULT_LEN;
++ req.es_cmd = GFS_EACMD_CREATE;
++ req.es_type = GFS_EATYPE_SYS;
++
++ error = find_sys_space(dip, ip, size, &avail);
++ if (error)
++ return error;
++
++ avail.ea = prep_ea(avail.ea);
++
++ error = write_ea(ip->i_sbd, dip, ip, avail.ea, &req);
++ if (!error)
++ gfs_trans_add_bh(ip->i_gl, avail.bh); /* Huh!?! */
++
++ brelse(avail.bh);
++
++ return error;
++}
++
++/**
++ * gfs_init_access_acl - initializes the access acl
++ *
++ * NOTE: This must be the first extended attribute that is created for
++ * this inode.
++ */
++int
++gfs_init_access_acl(struct gfs_inode *dip, struct gfs_inode *ip, void *data,
++ int size)
++{
++ struct gfs_easet_io req;
++
++ req.es_data = data;
++ req.es_name = GFS_POSIX_ACL_ACCESS;
++ req.es_data_len = size;
++ req.es_name_len = GFS_POSIX_ACL_ACCESS_LEN;
++ req.es_cmd = GFS_EACMD_CREATE;
++ req.es_type = GFS_EATYPE_SYS;
++
++ return init_new_inode_eattr(dip, ip, &req);
++}
++
++int
++gfs_init_acl(struct gfs_inode *dip, struct gfs_inode *ip, unsigned int type,
++ struct posix_acl *acl)
++{
++ struct buffer_head *dibh;
++ void *data;
++ int size;
++ int error;
++
++ size = posix_acl_to_xattr(acl, NULL, 0);
++
++ data = gmalloc(size);
++
++ posix_acl_to_xattr(acl, data, size);
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto out;
++
++ error = gfs_init_access_acl(dip, ip, data, size);
++ if (error)
++ goto out_relse;
++
++ if (type == GFS_FILE_DIR) {
++ error = gfs_create_default_acl(dip, ip, data, size);
++ if (error)
++ goto out_relse;
++ }
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++
++ out_relse:
++ brelse(dibh);
++
++ out:
++ kfree(data);
++ posix_acl_release(acl);
++
++ return error;
++}
++
++int
++gfs_acl_setattr(struct inode *inode)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ struct posix_acl *acl;
++ struct gfs_ea_location location;
++ int error;
++
++ if (S_ISLNK(inode->i_mode))
++ return 0;
++
++ memset(&location, 0, sizeof(struct gfs_ea_location));
++
++ error = gfs_findacl(ip, TRUE, &acl, &location); /* Check error here? */
++ if (!location.ea)
++ return error;
++
++ error = posix_acl_chmod_masq(acl, inode->i_mode);
++ if (!error)
++ error = gfs_replace_acl(inode, acl, TRUE, location);
++
++ posix_acl_release(acl);
++ brelse(location.bh);
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/acl.h linux-patched/fs/gfs/acl.h
+--- linux-orig/fs/gfs/acl.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/acl.h 2004-06-20 22:48:17.946947249 -0500
+@@ -0,0 +1,28 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __ACL_DOT_H__
++#define __ACL_DOT_H__
++
++int gfs_setup_new_acl(struct gfs_inode *dip,
++ unsigned int type, unsigned int *mode,
++ struct posix_acl **acl_ptr);
++int gfs_getacl(struct inode *inode, int access, struct posix_acl **acl_ptr);
++int gfs_init_acl(struct gfs_inode *dip, struct gfs_inode *ip, unsigned int type,
++ struct posix_acl *acl);
++int gfs_acl_setattr(struct inode *inode);
++int gfs_validate_acl(struct gfs_inode *ip, const char *value, int size,
++ int access);
++void gfs_acl_set_mode(struct gfs_inode *ip, struct posix_acl *acl);
++
++#endif /* __ACL_DOT_H__ */
+diff -urN linux-orig/fs/gfs/bits.c linux-patched/fs/gfs/bits.c
+--- linux-orig/fs/gfs/bits.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/bits.c 2004-06-20 22:48:17.946947249 -0500
+@@ -0,0 +1,183 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * These routines are used by the resource group routines (rgrp.c)
++ * to keep track of block allocation. Each block is represented by two
++ * bits. One bit indicates whether or not the block is used. (1=used,
++ * 0=free) The other bit indicates whether or not the block contains a
++ * dinode or not. (1=dinode, 0=data block) So, each byte represents
++ * GFS_NBBY (i.e. 4) blocks.
++ */
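++
++/*
++ * Worked example (assuming GFS_BIT_SIZE == 2 and GFS_BIT_MASK == 0x3,
++ * which the GFS_NBBY == 4 packing above implies, and taking bit 0 of
++ * each pair as the "used" bit, as gfs_bitfit below does): the byte
++ * 0x4D == 01 00 11 01 in two-bit groups describes, lowest pair first,
++ * block 0 = 1 (used data), block 1 = 3 (used dinode), block 2 = 0
++ * (free), and block 3 = 1 (used data).
++ */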
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "bits.h"
++
++static const char valid_change[16] = {
++ /* current */
++
++ /* n */ 0, 1, 1, 1,
++ /* e */ 1, 0, 0, 0,
++ /* w */ 1, 0, 0, 1,
++ 0, 0, 1, 0
++};
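++
++/*
++ * The table is indexed as valid_change[new_state * 4 + cur_state] (see
++ * gfs_setbit below): each row is a proposed new state, each column the
++ * current state, and a 1 marks a legal transition.  The "n", "e", "w"
++ * row markers appear to spell "new" down the row axis.
++ */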
++
++/**
++ * gfs_setbit - Set a bit in the bitmaps
++ * @buffer: the buffer that holds the bitmaps
++ * @buflen: the length (in bytes) of the buffer
++ * @block: the block to set
++ * @new_state: the new state of the block
++ *
++ */
++
++void
++gfs_setbit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ uint32_t block, unsigned char new_state)
++{
++ unsigned char *byte, *end, cur_state;
++ unsigned int bit;
++
++ byte = buffer + (block / GFS_NBBY);
++ bit = (block % GFS_NBBY) * GFS_BIT_SIZE;
++ end = buffer + buflen;
++
++ GFS_ASSERT_RGRPD(byte < end, rgd,);
++
++ cur_state = (*byte >> bit) & GFS_BIT_MASK;
++ GFS_ASSERT_RGRPD(valid_change[new_state * 4 + cur_state], rgd,
++ printk("cur_state = %u, new_state = %u\n",
++ cur_state, new_state););
++
++ *byte ^= cur_state << bit;
++ *byte |= new_state << bit;
++}
++
++/**
++ * gfs_testbit - test a bit in the bitmaps
++ * @buffer: the buffer that holds the bitmaps
++ * @buflen: the length (in bytes) of the buffer
++ * @block: the block to read
++ *
++ */
++
++unsigned char
++gfs_testbit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen, uint32_t block)
++{
++ unsigned char *byte, *end, cur_state;
++ unsigned int bit;
++
++ byte = buffer + (block / GFS_NBBY);
++ bit = (block % GFS_NBBY) * GFS_BIT_SIZE;
++ end = buffer + buflen;
++
++ GFS_ASSERT_RGRPD(byte < end, rgd,);
++
++ cur_state = (*byte >> bit) & GFS_BIT_MASK;
++
++ return cur_state;
++}
++
++/**
++ * gfs_bitfit - Find a free block in the bitmaps
++ * @buffer: the buffer that holds the bitmaps
++ * @buflen: the length (in bytes) of the buffer
++ * @goal: the block to try to allocate
++ * @old_state: the state of the block we're looking for
++ *
++ * Return: the block number that was allocated
++ */
++
++uint32_t
++gfs_bitfit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ uint32_t goal, unsigned char old_state)
++{
++ unsigned char *byte, *end, alloc;
++ uint32_t blk = goal;
++ unsigned int bit;
++
++ byte = buffer + (goal / GFS_NBBY);
++ bit = (goal % GFS_NBBY) * GFS_BIT_SIZE;
++ end = buffer + buflen;
++ alloc = (old_state & 1) ? 0 : 0x55;
++
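++	/* 0x55 selects the low ("used") bit of all four block states packed
++	   in a byte.  If every used bit is the opposite of the state we are
++	   searching for, nothing in this byte can match, so skip it whole;
++	   (8 - bit) >> 1 is the number of blocks remaining in the byte. */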
++ while (byte < end) {
++ if ((*byte & 0x55) == alloc) {
++ blk += (8 - bit) >> 1;
++
++ bit = 0;
++ byte++;
++
++ continue;
++ }
++
++ if (((*byte >> bit) & GFS_BIT_MASK) == old_state)
++ return blk;
++
++ bit += GFS_BIT_SIZE;
++ if (bit >= 8) {
++ bit = 0;
++ byte++;
++ }
++
++ blk++;
++ }
++
++ return BFITNOENT;
++}
++
++/**
++ * gfs_bitcount - count the number of bits in a certain state
++ * @buffer: the buffer that holds the bitmaps
++ * @buflen: the length (in bytes) of the buffer
++ * @state: the state of the block we're looking for
++ *
++ * Returns: The number of bits
++ */
++
++uint32_t
++gfs_bitcount(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ unsigned char state)
++{
++ unsigned char *byte = buffer;
++ unsigned char *end = buffer + buflen;
++ unsigned char state1 = state << 2;
++ unsigned char state2 = state << 4;
++ unsigned char state3 = state << 6;
++ uint32_t count = 0;
++
++ for (; byte < end; byte++) {
++ if (((*byte) & 0x03) == state)
++ count++;
++ if (((*byte) & 0x0C) == state1)
++ count++;
++ if (((*byte) & 0x30) == state2)
++ count++;
++ if (((*byte) & 0xC0) == state3)
++ count++;
++ }
++
++ return count;
++}
+diff -urN linux-orig/fs/gfs/bits.h linux-patched/fs/gfs/bits.h
+--- linux-orig/fs/gfs/bits.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/bits.h 2004-06-20 22:48:17.946947249 -0500
+@@ -0,0 +1,32 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __BITS_DOT_H__
++#define __BITS_DOT_H__
++
++#define BFITNOENT (0xFFFFFFFF)
++
++void gfs_setbit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ uint32_t block, unsigned char new_state);
++unsigned char gfs_testbit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ uint32_t block);
++uint32_t gfs_bitfit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ uint32_t goal, unsigned char old_state);
++uint32_t gfs_bitcount(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ unsigned char state);
++
++#endif /* __BITS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/bmap.c linux-patched/fs/gfs/bmap.c
+--- linux-orig/fs/gfs/bmap.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/bmap.c 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,1404 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "dio.h"
++#include "glock.h"
++#include "inode.h"
++#include "ioctl.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++
++struct metapath {
++ unsigned int mp_list[GFS_MAX_META_HEIGHT];
++};
++
++typedef int (*block_call_t) (struct gfs_inode *ip, struct buffer_head *dibh,
++ struct buffer_head *bh, uint64_t *top,
++ uint64_t *bottom, unsigned int height,
++ void *data);
++
++struct strip_mine {
++ int sm_first;
++ unsigned int sm_height;
++};
++
++/**
++ * gfs_unstuffer_sync - unstuff a dinode synchronously
++ * @ip: the inode
++ * @dibh: the dinode buffer
++ * @block: the block number that was allocated
++ * @private: not used
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_unstuffer_sync(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ int error;
++
++ error = gfs_get_data_buffer(ip, block, TRUE, &bh);
++ if (error)
++ return error;
++
++ gfs_buffer_copy_tail(bh, 0, dibh, sizeof(struct gfs_dinode));
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY | DIO_START | DIO_WAIT);
++
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * gfs_unstuffer_async - unstuff a dinode asynchronously
++ * @ip: the inode
++ * @dibh: the dinode buffer
++ * @block: the block number that was allocated
++ * @private: not used
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_unstuffer_async(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ int error;
++
++ error = gfs_get_data_buffer(ip, block, TRUE, &bh);
++ if (error)
++ return error;
++
++ gfs_buffer_copy_tail(bh, 0, dibh, sizeof(struct gfs_dinode));
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY);
++
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * gfs_unstuff_dinode - Unstuff a dinode when the data has grown too big
++ * @ip: The GFS inode to unstuff
++ * @unstuffer: the routine that handles unstuffing a non-zero length file
++ * @private: private data for the unstuffer
++ *
++ * This routine unstuffs a dinode and returns it to a "normal" state such
++ * that the height can be grown in the traditional way.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_unstuff_dinode(struct gfs_inode *ip, gfs_unstuffer_t unstuffer,
++ void *private)
++{
++ struct buffer_head *bh, *dibh;
++ uint64_t block = 0;
++ int journaled = gfs_is_jdata(ip);
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(ip), ip,);
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ if (ip->i_di.di_size) {
++ /* Get a free block, fill it with the stuffed data,
++ and write it out to disk */
++
++ if (journaled) {
++ error = gfs_metaalloc(ip, &block);
++ if (error)
++ goto fail;
++
++ error = gfs_get_data_buffer(ip, block, TRUE, &bh);
++ if (error)
++ goto fail;
++
++ gfs_buffer_copy_tail(bh, sizeof(struct gfs_meta_header),
++ dibh, sizeof(struct gfs_dinode));
++
++ brelse(bh);
++ } else {
++ gfs_blkalloc(ip, &block);
++
++ error = unstuffer(ip, dibh, block, private);
++ if (error)
++ goto fail;
++ }
++ }
++
++ /* Set up the pointer to the new block */
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ gfs_buffer_clear_tail(dibh, sizeof(struct gfs_dinode));
++
++ if (ip->i_di.di_size) {
++ *(uint64_t *)(dibh->b_data + sizeof(struct gfs_dinode)) = cpu_to_gfs64(block);
++ ip->i_di.di_blocks++;
++ }
++
++ ip->i_di.di_height = 1;
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++
++ fail:
++ brelse(dibh);
++
++ return error;
++}
++
++/**
++ * calc_tree_height - Calculate the height of a metadata tree
++ * @ip: The GFS inode
++ * @size: The proposed size of the file
++ *
++ * Work out how tall a metadata tree needs to be in order to accommodate a
++ * file of a particular size. If size is less than the current size of
++ * the inode, then the current size of the inode is used instead of the
++ * supplied one.
++ *
++ * Returns: the height the tree should be
++ */
++
++static unsigned int
++calc_tree_height(struct gfs_inode *ip, uint64_t size)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t *arr;
++ unsigned int max, height;
++
++ if (ip->i_di.di_size > size)
++ size = ip->i_di.di_size;
++
++ if (gfs_is_jdata(ip)) {
++ arr = sdp->sd_jheightsize;
++ max = sdp->sd_max_jheight;
++ } else {
++ arr = sdp->sd_heightsize;
++ max = sdp->sd_max_height;
++ }
++
++ for (height = 0; height < max; height++)
++ if (arr[height] >= size)
++ break;
++
++ return height;
++}
++
++/**
++ * build_height - Build a metadata tree of the requested height
++ * @ip: The GFS inode
++ * @height: The height to build to
++ *
++ * This routine grows the metadata tree, if necessary, until it is at
++ * least @height levels tall.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++build_height(struct gfs_inode *ip, int height)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh, *dibh;
++ uint64_t block, *bp;
++ unsigned int x;
++ int new_block;
++ int error;
++
++ while (ip->i_di.di_height < height) {
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ new_block = FALSE;
++ bp = (uint64_t *)(dibh->b_data + sizeof(struct gfs_dinode));
++ for (x = 0; x < sdp->sd_diptrs; x++, bp++)
++ if (*bp) {
++ new_block = TRUE;
++ break;
++ }
++
++ if (new_block) {
++ /* Get a new block, fill it with the old direct pointers,
++ and write it out */
++
++ error = gfs_metaalloc(ip, &block);
++ if (error)
++ goto fail;
++
++ error = gfs_dread(sdp, block, ip->i_gl,
++ DIO_NEW | DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto fail;
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_IN,
++ GFS_FORMAT_IN);
++ memset(bh->b_data + sizeof(struct gfs_meta_header),
++ 0,
++ sizeof(struct gfs_indirect) -
++ sizeof(struct gfs_meta_header));
++ gfs_buffer_copy_tail(bh, sizeof(struct gfs_indirect),
++ dibh, sizeof(struct gfs_dinode));
++
++ brelse(bh);
++ }
++
++ /* Set up the new direct pointer and write it out to disk */
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ gfs_buffer_clear_tail(dibh, sizeof(struct gfs_dinode));
++
++ if (new_block) {
++ *(uint64_t *)(dibh->b_data + sizeof(struct gfs_dinode)) = cpu_to_gfs64(block);
++ ip->i_di.di_blocks++;
++ }
++
++ ip->i_di.di_height++;
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++ }
++
++ return 0;
++
++ fail:
++ brelse(dibh);
++
++ return error;
++}
++
++/**
++ * find_metapath - Find path through the metadata tree
++ * @ip: The inode pointer
++ * @mp: The metapath to return the result in
++ * @block: The file (logical) block to look up
++ *
++ * This routine returns a struct metapath structure that defines a path through
++ * the metadata of inode "ip" to get to block "block".
++ *
++ * Example:
++ * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
++ * filesystem with a blocksize of 4096.
++ *
++ * find_metapath() would return a struct metapath structure set to:
++ * mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
++ *
++ * That means that in order to get to the block containing the byte at
++ * offset 101342453, we would load the indirect block pointed to by pointer
++ * 0 in the dinode. We would then load the indirect block pointed to by
++ * pointer 48 in that indirect block. We would then load the data block
++ * pointed to by pointer 165 in that indirect block.
++ *
++ * ----------------------------------------
++ * | Dinode | |
++ * | | 4|
++ * | |0 1 2 3 4 5 9|
++ * | | 6|
++ * ----------------------------------------
++ * |
++ * |
++ * V
++ * ----------------------------------------
++ * | Indirect Block |
++ * | 5|
++ * | 4 4 4 4 4 5 5 1|
++ * |0 5 6 7 8 9 0 1 2|
++ * ----------------------------------------
++ * |
++ * |
++ * V
++ * ----------------------------------------
++ * | Indirect Block |
++ * | 1 1 1 1 1 5|
++ * | 6 6 6 6 6 1|
++ * |0 3 4 5 6 7 2|
++ * ----------------------------------------
++ * |
++ * |
++ * V
++ * ----------------------------------------
++ * | Data block containing offset |
++ * | 101342453 |
++ * | |
++ * | |
++ * ----------------------------------------
++ *
++ */
++
++static struct metapath *
++find_metapath(struct gfs_inode *ip, uint64_t block)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct metapath *mp;
++ uint64_t b = block;
++ unsigned int i;
++
++ mp = gmalloc(sizeof(struct metapath));
++ memset(mp, 0, sizeof(struct metapath));
++
++ for (i = ip->i_di.di_height; i--;)
++ mp->mp_list[i] = do_div(b, sdp->sd_inptrs);
++
++ return mp;
++}
++
++/**
++ * metapointer - Return pointer to start of metadata in a buffer
++ * @bh: The buffer
++ * @height: The metadata height (0 = dinode)
++ * @mp: The metapath
++ *
++ * Return a pointer to the block number of the next height of the metadata
++ * tree given a buffer containing the pointer to the current height of the
++ * metadata tree.
++ */
++
++static __inline__ uint64_t *
++metapointer(struct buffer_head *bh, unsigned int height, struct metapath *mp)
++{
++ unsigned int head_size = (height > 0) ?
++ sizeof(struct gfs_indirect) : sizeof(struct gfs_dinode);
++
++ return ((uint64_t *)(bh->b_data + head_size)) + mp->mp_list[height];
++}
++
++/**
++ * get_metablock - Get the next metadata block in metadata tree
++ * @ip: The GFS inode
++ * @bh: Buffer containing the pointers to metadata blocks
++ * @height: The height of the tree (0 = dinode)
++ * @mp: The metapath
++ * @create: Non-zero if we may create a new metadata block
++ * @new: Used to indicate if we did create a new metadata block
++ * @block: the returned disk block number
++ *
++ * Given a metatree complete to a particular height, this checks whether the
++ * next height of the tree exists. If not, it is created. The block number
++ * of the next height of the metadata tree is returned.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_metablock(struct gfs_inode *ip,
++ struct buffer_head *bh, unsigned int height, struct metapath *mp,
++ int create, int *new, uint64_t *block)
++{
++ uint64_t *ptr = metapointer(bh, height, mp);
++ int error;
++
++ if (*ptr) {
++ *block = gfs64_to_cpu(*ptr);
++ return 0;
++ }
++
++ *block = 0;
++
++ if (!create)
++ return 0;
++
++ error = gfs_metaalloc(ip, block);
++ if (error)
++ return error;
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++
++ *ptr = cpu_to_gfs64(*block);
++ ip->i_di.di_blocks++;
++
++ *new = 1;
++
++ return 0;
++}
++
++/**
++ * get_datablock - Get datablock number from metadata block
++ * @ip: The GFS inode
++ * @bh: The buffer containing pointers to datablocks
++ * @mp: The metapath
++ * @create: Non-zero if we may create a new data block
++ * @new: Used to indicate if we created a new data block
++ * @block: the returned disk block number
++ *
++ * Given a fully built metadata tree, checks to see if a particular data
++ * block exists. It is created if it does not exist and the block number
++ * on disk is returned.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_datablock(struct gfs_inode *ip,
++ struct buffer_head *bh, struct metapath *mp,
++ int create, int *new, uint64_t *block)
++{
++ uint64_t *ptr = metapointer(bh, ip->i_di.di_height - 1, mp);
++
++ if (*ptr) {
++ *block = gfs64_to_cpu(*ptr);
++ return 0;
++ }
++
++ *block = 0;
++
++ if (!create)
++ return 0;
++
++ if (gfs_is_jdata(ip)) {
++ int error;
++ error = gfs_metaalloc(ip, block);
++ if (error)
++ return error;
++ } else
++ gfs_blkalloc(ip, block);
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++
++ *ptr = cpu_to_gfs64(*block);
++ ip->i_di.di_blocks++;
++
++ *new = 1;
++
++ return 0;
++}
++
++/**
++ * gfs_block_map - Map a block from an inode to a disk block
++ * @ip: The GFS inode
++ * @lblock: The logical block number
++ * @new: Value/Result argument (1 = may create/did create new blocks)
++ * @dblock: the disk block number of the start of an extent
++ * @extlen: the size of the extent
++ *
++ * Find the block number on the current device which corresponds to an
++ * inode's block. If the block had to be created, "new" will be set.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_block_map(struct gfs_inode *ip,
++ uint64_t lblock, int *new,
++ uint64_t *dblock, uint32_t *extlen)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ struct metapath *mp;
++ int create = *new;
++ unsigned int bsize;
++ unsigned int height;
++ unsigned int end_of_metadata;
++ unsigned int x;
++ int error;
++
++ *new = 0;
++ *dblock = 0;
++ if (extlen)
++ *extlen = 0;
++
++ if (gfs_is_stuffed(ip)) {
++ if (!lblock) {
++ *dblock = ip->i_num.no_addr;
++ if (extlen)
++ *extlen = 1;
++ }
++ return 0;
++ }
++
++ bsize = (gfs_is_jdata(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
++
++ height = calc_tree_height(ip, (lblock + 1) * bsize);
++ if (ip->i_di.di_height < height) {
++ if (!create)
++ return 0;
++
++ error = build_height(ip, height);
++ if (error)
++ return error;
++ }
++
++ mp = find_metapath(ip, lblock);
++ end_of_metadata = ip->i_di.di_height - 1;
++
++ error = gfs_get_inode_buffer(ip, &bh);
++ if (error)
++ goto out;
++
++ for (x = 0; x < end_of_metadata; x++) {
++ error = get_metablock(ip, bh, x, mp, create, new, dblock);
++ brelse(bh);
++ if (error || !*dblock)
++ goto out;
++
++ error = gfs_get_meta_buffer(ip, x + 1, *dblock, *new, &bh);
++ if (error)
++ goto out;
++ }
++
++ error = get_datablock(ip, bh, mp, create, new, dblock);
++ if (error) {
++ brelse(bh);
++ goto out;
++ }
++
++ if (extlen && *dblock) {
++ *extlen = 1;
++
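++			/* Extent coalescing: walk the remaining pointers in
++			   this leaf block and count how many map to physically
++			   consecutive disk blocks, so the caller can treat
++			   them as one extent. */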
++ if (!*new) {
++ uint64_t tmp_dblock;
++ int tmp_new;
++ unsigned int nptrs;
++
++ nptrs = (end_of_metadata) ? sdp->sd_inptrs : sdp->sd_diptrs;
++
++ while (++mp->mp_list[end_of_metadata] < nptrs) {
++ get_datablock(ip, bh, mp,
++ FALSE, &tmp_new,
++ &tmp_dblock);
++
++ if (*dblock + *extlen != tmp_dblock)
++ break;
++
++ (*extlen)++;
++ }
++ }
++ }
++
++ brelse(bh);
++
++ if (*new) {
++ error = gfs_get_inode_buffer(ip, &bh);
++ if (!error) {
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_dinode_out(&ip->i_di, bh->b_data);
++ brelse(bh);
++ }
++ }
++
++ out:
++ kfree(mp);
++
++ return error;
++}
++
++/**
++ * do_grow - Make a file look bigger than it is
++ * @ip: the inode
++ * @size: the size to set the file to
++ *
++ * Called with an exclusive lock on @ip.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++do_grow(struct gfs_inode *ip, uint64_t size)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al;
++ struct buffer_head *dibh;
++ unsigned int h;
++ int journaled = gfs_is_jdata(ip);
++ int error;
++
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail;
++
++ error = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ if (journaled)
++ al->al_requested_meta = sdp->sd_max_height + 1;
++ else {
++ al->al_requested_meta = sdp->sd_max_height;
++ al->al_requested_data = 1;
++ }
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ Full extension of the metadata tree, block allocation,
++ a dinode modification, and a quota change */
++
++ error = gfs_trans_begin(sdp,
++ sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
++ 1 + !!journaled,
++ 1);
++ if (error)
++ goto fail_ipres;
++
++ if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode)) {
++ if (gfs_is_stuffed(ip)) {
++ error = gfs_unstuff_dinode(ip, gfs_unstuffer_sync, NULL);
++ if (error)
++ goto fail_end_trans;
++ }
++
++ h = calc_tree_height(ip, size);
++ if (ip->i_di.di_height < h) {
++ error = build_height(ip, h);
++ if (error)
++ goto fail_end_trans;
++ }
++ }
++
++ ip->i_di.di_size = size;
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++
++ gfs_inplace_release(ip);
++ gfs_quota_unlock_m(ip);
++ gfs_alloc_put(ip);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ gfs_inplace_release(ip);
++
++ fail_gunlock_q:
++ gfs_quota_unlock_m(ip);
++
++ fail:
++ gfs_alloc_put(ip);
++
++ return error;
++}
++
++/**
++ * recursive_scan - recursively scan through the end of a file
++ * @ip: the inode
++ * @dibh: the dinode buffer
++ * @mp: the path through the metadata to the point to start
++ * @height: the height the recursion is at
++ * @block: the indirect block to look at
++ * @first: TRUE if this is the first block
++ * @bc: the call to make for each piece of metadata
++ * @data: data opaque to this function to pass to @bc
++ *
++ * When this is first called @height and @block should be zero and
++ * @first should be TRUE.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++recursive_scan(struct gfs_inode *ip, struct buffer_head *dibh,
++ struct metapath *mp, unsigned int height, uint64_t block,
++ int first, block_call_t bc, void *data)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh = NULL;
++ uint64_t *top, *bottom;
++ uint64_t bn;
++ int error;
++
++ if (!height) {
++ error = gfs_get_inode_buffer(ip, &bh);
++ if (error)
++ goto fail;
++ dibh = bh;
++
++ top = (uint64_t *)(bh->b_data + sizeof(struct gfs_dinode)) +
++ mp->mp_list[0];
++ bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs_dinode)) +
++ sdp->sd_diptrs;
++ } else {
++ error = gfs_get_meta_buffer(ip, height, block, FALSE, &bh);
++ if (error)
++ goto fail;
++
++ top = (uint64_t *)(bh->b_data + sizeof(struct gfs_indirect)) +
++ ((first) ? mp->mp_list[height] : 0);
++ bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs_indirect)) +
++ sdp->sd_inptrs;
++ }
++
++ error = bc(ip, dibh, bh, top, bottom, height, data);
++ if (error)
++ goto fail;
++
++ if (height < ip->i_di.di_height - 1)
++ for (; top < bottom; top++, first = FALSE) {
++ if (!*top)
++ continue;
++
++ bn = gfs64_to_cpu(*top);
++
++ error = recursive_scan(ip, dibh, mp,
++ height + 1, bn, first,
++ bc, data);
++ if (error)
++ goto fail;
++ }
++
++ brelse(bh);
++
++ return 0;
++
++ fail:
++ if (bh)
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * do_strip - Look for a particular layer of the file and strip it off
++ * @ip: the inode
++ * @dibh: the dinode buffer
++ * @bh: A buffer of pointers
++ * @top: The first pointer in the buffer
++ * @bottom: One more than the last pointer
++ * @height: the height this buffer is at
++ * @data: a pointer to a struct strip_mine
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++do_strip(struct gfs_inode *ip, struct buffer_head *dibh,
++ struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
++ unsigned int height, void *data)
++{
++ struct strip_mine *sm = (struct strip_mine *)data;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_holder ri_gh;
++ struct gfs_rgrp_list rlist;
++ uint64_t bn, bstart;
++ uint32_t blen;
++ uint64_t *p;
++ unsigned int rg_blocks = 0;
++ int metadata;
++ int x;
++ int error;
++
++ if (!*top)
++ sm->sm_first = FALSE;
++
++ if (height != sm->sm_height)
++ return 0;
++
++ if (sm->sm_first) {
++ top++;
++ sm->sm_first = FALSE;
++ }
++
++ metadata = (height != ip->i_di.di_height - 1) || gfs_is_jdata(ip);
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ return error;
++
++ memset(&rlist, 0, sizeof(struct gfs_rgrp_list));
++ bstart = 0;
++ blen = 0;
++
++ for (p = top; p < bottom; p++) {
++ if (!*p)
++ continue;
++
++ bn = gfs64_to_cpu(*p);
++
++ if (bstart + blen == bn)
++ blen++;
++ else {
++ if (bstart)
++ gfs_rlist_add(sdp, &rlist, bstart);
++
++ bstart = bn;
++ blen = 1;
++ }
++ }
++
++ if (bstart)
++ gfs_rlist_add(sdp, &rlist, bstart);
++ else
++ goto out; /* Nothing to do */
++
++ gfs_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
++
++ error = gfs_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
++ if (error)
++ goto fail;
++
++ for (x = 0; x < rlist.rl_rgrps; x++) {
++ struct gfs_rgrpd *rgd;
++ rgd = gl2rgd(rlist.rl_ghs[x].gh_gl);
++ rg_blocks += rgd->rd_ri.ri_length;
++ }
++
++ /* Trans may require:
++ All the bitmaps that were reserved.
++ One block for the dinode.
++ One block for the indirect block being cleared.
++ One block for a quota change. */
++
++ error = gfs_trans_begin(sdp, rg_blocks + 2, 1);
++ if (error)
++ goto fail_rg_gunlock;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_trans_add_bh(ip->i_gl, bh);
++
++ bstart = 0;
++ blen = 0;
++
++ for (p = top; p < bottom; p++) {
++ if (!*p)
++ continue;
++
++ bn = gfs64_to_cpu(*p);
++
++ if (bstart + blen == bn)
++ blen++;
++ else {
++ if (bstart) {
++ if (metadata)
++ gfs_metafree(ip, bstart, blen);
++ else
++ gfs_blkfree(ip, bstart, blen);
++ }
++
++ bstart = bn;
++ blen = 1;
++ }
++
++ *p = 0;
++ ip->i_di.di_blocks--;
++ }
++
++ if (bstart) {
++ if (metadata)
++ gfs_metafree(ip, bstart, blen);
++ else
++ gfs_blkfree(ip, bstart, blen);
++ }
++
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
++ gfs_rlist_free(&rlist);
++
++ out:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return 0;
++
++ fail_rg_gunlock:
++ gfs_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
++
++ fail:
++ gfs_rlist_free(&rlist);
++
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return error;
++}
++
++/**
++ * gfs_truncator_default - truncate a partial data block
++ * @ip: the inode
++ * @size: the size the file should be
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_truncator_default(struct gfs_inode *ip, uint64_t size)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ uint64_t bn;
++ int not_new = 0;
++ int error;
++
++ error = gfs_block_map(ip, size >> sdp->sd_sb.sb_bsize_shift, &not_new,
++ &bn, NULL);
++ if (error)
++ return error;
++ if (!bn)
++ return 0;
++
++ error = gfs_get_data_buffer(ip, bn, FALSE, &bh);
++ if (error)
++ return error;
++
++ gfs_buffer_clear_tail(bh, size & (sdp->sd_sb.sb_bsize - 1));
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY);
++
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * truncator_journaled - truncate a partial data block
++ * @ip: the inode
++ * @size: the size the file should be
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++truncator_journaled(struct gfs_inode *ip, uint64_t size)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ uint64_t lbn, dbn;
++ uint32_t off;
++ int not_new = 0;
++ int error;
++
++ lbn = size;
++ off = do_div(lbn, sdp->sd_jbsize);
++
++ error = gfs_block_map(ip, lbn, &not_new, &dbn, NULL);
++ if (error)
++ return error;
++ if (!dbn)
++ return 0;
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error)
++ return error;
++
++ error = gfs_get_data_buffer(ip, dbn, FALSE, &bh);
++ if (!error) {
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_buffer_clear_tail(bh,
++ sizeof(struct gfs_meta_header) +
++ off);
++ brelse(bh);
++ }
++
++ gfs_trans_end(sdp);
++
++ return error;
++}
++
++/**
++ * gfs_shrink - make a file smaller
++ * @ip: the inode
++ * @size: the size to make the file
++ * @truncator: function to truncate the last partial block
++ *
++ * Called with an exclusive lock on @ip.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_shrink(struct gfs_inode *ip, uint64_t size, gfs_truncator_t truncator)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_holder ri_gh;
++ struct gfs_rgrpd *rgd;
++ struct buffer_head *dibh;
++ uint64_t block;
++ unsigned int height;
++ int journaled = gfs_is_jdata(ip);
++ int error;
++
++ if (!size)
++ block = 0;
++ else if (journaled) {
++ block = size - 1;
++ do_div(block, sdp->sd_jbsize);
++ }
++ else
++ block = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
++
++ /* Get rid of all the data/metadata blocks */
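++	/* The tree is stripped one level per pass, deepest level first:
++	   height counts down from di_height, so data blocks are freed
++	   before the indirect blocks that pointed to them. */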
++
++ height = ip->i_di.di_height;
++ if (height) {
++ struct metapath *mp = find_metapath(ip, block);
++ gfs_alloc_get(ip);
++
++ error = gfs_quota_hold_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error) {
++ gfs_alloc_put(ip);
++ kfree(mp);
++ return error;
++ }
++
++ while (height--) {
++ struct strip_mine sm;
++
++ sm.sm_first = (size) ? TRUE : FALSE;
++ sm.sm_height = height;
++
++ error = recursive_scan(ip, NULL, mp, 0, 0, TRUE,
++ do_strip, &sm);
++ if (error) {
++ gfs_quota_unhold_m(ip);
++ gfs_alloc_put(ip);
++ kfree(mp);
++ return error;
++ }
++ }
++
++ gfs_quota_unhold_m(ip);
++ gfs_alloc_put(ip);
++ kfree(mp);
++ }
++
++ /* If we truncated in the middle of a block, zero out the leftovers. */
++
++ if (gfs_is_stuffed(ip)) {
++ /* Do nothing */
++ } else if (journaled) {
++ if (do_mod(size, sdp->sd_jbsize)) {
++ error = truncator_journaled(ip, size);
++ if (error)
++ return error;
++ }
++ } else if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1)) {
++ error = truncator(ip, size);
++ if (error)
++ return error;
++ }
++
++ /* Set the new size (and possibly the height) */
++
++ if (!size) {
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ return error;
++ }
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error)
++ goto out;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto out_end_trans;
++
++ if (!size) {
++ ip->i_di.di_height = 0;
++
++ rgd = gfs_blk2rgrpd(sdp, ip->i_num.no_addr);
++ GFS_ASSERT_INODE(rgd, ip,);
++
++ ip->i_di.di_goal_rgrp = rgd->rd_ri.ri_addr;
++ ip->i_di.di_goal_dblk =
++ ip->i_di.di_goal_mblk =
++ ip->i_num.no_addr - rgd->rd_ri.ri_data1;
++ }
++
++ ip->i_di.di_size = size;
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ if (!ip->i_di.di_height &&
++ size < sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode))
++ gfs_buffer_clear_tail(dibh, sizeof(struct gfs_dinode) + size);
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ out_end_trans:
++ gfs_trans_end(sdp);
++
++ out:
++ if (!size)
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return error;
++}
++
++/**
++ * do_same - truncate to same size (update time stamps)
++ * @ip:
++ *
++ * Returns: errno
++ */
++
++static int
++do_same(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *dibh;
++ int error;
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error)
++ return error;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto out;
++
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ out:
++ gfs_trans_end(sdp);
++
++ return error;
++}
++
++/**
++ * gfs_truncatei - make a file a given size
++ * @ip: the inode
++ * @size: the size to make the file
++ * @truncator: function to truncate the last partial block
++ *
++ * The file size can grow, shrink, or stay the same size.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_truncatei(struct gfs_inode *ip, uint64_t size,
++ gfs_truncator_t truncator)
++{
++ GFS_ASSERT_INODE(ip->i_di.di_type == GFS_FILE_REG, ip,);
++
++ if (size == ip->i_di.di_size)
++ return do_same(ip);
++ else if (size > ip->i_di.di_size)
++ return do_grow(ip, size);
++ else
++ return gfs_shrink(ip, size, truncator);
++}
++
++/**
++ * gfs_write_calc_reserv - calculate the number of blocks needed to write to a file
++ * @ip: the file
++ * @len: the number of bytes to be written to the file
++ * @data_blocks: returns the number of data blocks required
++ * @ind_blocks: returns the number of indirect blocks required
++ *
++ */
++
++void
++gfs_write_calc_reserv(struct gfs_inode *ip, unsigned int len,
++ unsigned int *data_blocks, unsigned int *ind_blocks)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ unsigned int tmp;
++
++ if (gfs_is_jdata(ip)) {
++ *data_blocks = DIV_RU(len, sdp->sd_jbsize) + 2;
++ *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
++ } else {
++ *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
++ *ind_blocks = 3 * (sdp->sd_max_height - 1);
++ }
++
++ for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
++ tmp = DIV_RU(tmp, sdp->sd_inptrs);
++ *ind_blocks += tmp;
++ }
++}
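++
++/*
++ * Worked example (all values assumed, not read from a real superblock):
++ * with a 4096-byte block size (sb_bsize_shift == 12) and, say, sd_diptrs
++ * == 483, a 1 MiB non-journaled write yields data_blocks = (1048576 >>
++ * 12) + 3 = 259.  Since 259 <= sd_diptrs the loop below adds nothing,
++ * and ind_blocks stays at 3 * (sd_max_height - 1).
++ */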
++
++/**
++ * gfs_write_alloc_required - figure out if a write is going to require an allocation
++ * @ip: the file being written to
++ * @offset: the offset to write to
++ * @len: the number of bytes being written
++ * @alloc_required: the int is set to TRUE if an alloc is required, FALSE otherwise
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_write_alloc_required(struct gfs_inode *ip,
++ uint64_t offset, unsigned int len,
++ int *alloc_required)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t lblock, lblock_stop, dblock;
++ uint32_t extlen;
++ int not_new = FALSE;
++ int error = 0;
++
++ *alloc_required = FALSE;
++
++ if (!len)
++ return 0;
++
++ if (gfs_is_stuffed(ip)) {
++ if (offset + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode))
++ *alloc_required = TRUE;
++ return 0;
++ }
++
++ if (gfs_is_jdata(ip)) {
++ unsigned int bsize = sdp->sd_jbsize;
++ lblock = offset;
++ do_div(lblock, bsize);
++ lblock_stop = offset + len + bsize - 1;
++ do_div(lblock_stop, bsize);
++ } else {
++ unsigned int shift = sdp->sd_sb.sb_bsize_shift;
++ lblock = offset >> shift;
++ lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
++ }
++
++ for (; lblock < lblock_stop; lblock += extlen) {
++ error = gfs_block_map(ip, lblock, &not_new, &dblock, &extlen);
++ if (error)
++ return error;
++
++ if (!dblock) {
++ *alloc_required = TRUE;
++ return 0;
++ }
++ }
++
++ return 0;
++}
++
++/**
++ * do_gfm - Copy out the dinode/indirect blocks of a file
++ * @ip: the file
++ * @dibh: the dinode buffer
++ * @bh: the indirect buffer we're looking at
++ * @top: the first pointer in the block
++ * @bottom: one more than the last pointer in the block
++ * @height: the height the block is at
++ * @data: a pointer to a struct gfs_user_buffer structure
++ *
++ * If this is a journaled file, copy out the data too.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++do_gfm(struct gfs_inode *ip, struct buffer_head *dibh,
++ struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
++ unsigned int height, void *data)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_user_buffer *ub = (struct gfs_user_buffer *)data;
++ struct buffer_head *data_bh;
++ uint64_t *bp, bn;
++ int error;
++
++ error = gfs_add_bh_to_ub(ub, bh);
++ if (error)
++ return error;
++
++ if (ip->i_di.di_type != GFS_FILE_DIR ||
++ height + 1 != ip->i_di.di_height)
++ return 0;
++
++ for (bp = top; bp < bottom; bp++)
++ if (*bp) {
++ bn = gfs64_to_cpu(*bp);
++
++ error = gfs_dread(sdp, bn, ip->i_gl,
++ DIO_START | DIO_WAIT, &data_bh);
++ if (error)
++ return error;
++
++ error = gfs_add_bh_to_ub(ub, data_bh);
++
++ brelse(data_bh);
++
++ if (error)
++ return error;
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_get_file_meta - return all the metadata for a file
++ * @ip: the file
++ * @ub: the structure representing the metadata
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_get_file_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub)
++{
++ struct buffer_head *dibh;
++ struct metapath *mp;
++ int error;
++
++ if (gfs_is_stuffed(ip)) {
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (!error) {
++ error = gfs_add_bh_to_ub(ub, dibh);
++ brelse(dibh);
++ }
++ } else {
++ mp = find_metapath(ip, 0);
++ error = recursive_scan(ip, NULL, mp, 0, 0, TRUE, do_gfm, ub);
++ kfree(mp);
++ }
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/bmap.h linux-patched/fs/gfs/bmap.h
+--- linux-orig/fs/gfs/bmap.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/bmap.h 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,48 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __BMAP_DOT_H__
++#define __BMAP_DOT_H__
++
++typedef int (*gfs_unstuffer_t) (struct gfs_inode * ip,
++ struct buffer_head * dibh, uint64_t block,
++ void *private);
++
++int gfs_unstuffer_sync(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private);
++int gfs_unstuffer_async(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private);
++
++int gfs_unstuff_dinode(struct gfs_inode *ip, gfs_unstuffer_t unstuffer,
++ void *private);
++
++int gfs_block_map(struct gfs_inode *ip,
++ uint64_t lblock, int *new,
++ uint64_t *dblock, uint32_t *extlen);
++
++typedef int (*gfs_truncator_t) (struct gfs_inode * ip, uint64_t size);
++
++int gfs_truncator_default(struct gfs_inode *ip, uint64_t size);
++
++int gfs_shrink(struct gfs_inode *ip, uint64_t size, gfs_truncator_t truncator);
++int gfs_truncatei(struct gfs_inode *ip, uint64_t size,
++ gfs_truncator_t truncator);
++
++void gfs_write_calc_reserv(struct gfs_inode *ip, unsigned int len,
++ unsigned int *data_blocks, unsigned int *ind_blocks);
++int gfs_write_alloc_required(struct gfs_inode *ip, uint64_t offset,
++ unsigned int len, int *alloc_required);
++
++int gfs_get_file_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub);
++
++#endif /* __BMAP_DOT_H__ */
+diff -urN linux-orig/fs/gfs/daemon.c linux-patched/fs/gfs/daemon.c
+--- linux-orig/fs/gfs/daemon.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/daemon.c 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,259 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "daemon.h"
++#include "glock.h"
++#include "log.h"
++#include "quota.h"
++#include "recovery.h"
++#include "super.h"
++#include "unlinked.h"
++
++/**
++ * gfs_scand - Periodic scan of the glock cache
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_scand(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++
++ daemonize("gfs_scand");
++ sdp->sd_scand_process = current;
++ set_bit(SDF_SCAND_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ gfs_scand_internal(sdp);
++
++ if (!test_bit(SDF_SCAND_RUN, &sdp->sd_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(sdp->sd_tune.gt_scand_secs * HZ);
++ }
++
++ down(&sdp->sd_thread_lock);
++ up(&sdp->sd_thread_lock);
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
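++
++/*
++ * All of the daemons below share gfs_scand's shape: daemonize(), announce
++ * startup through sd_thread_completion, loop doing work until the
++ * per-daemon SDF_*_RUN flag is cleared, then complete
++ * sd_thread_completion again on the way out.  Most also take and drop
++ * sd_thread_lock first, serializing exit against the stopping thread.
++ */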
++
++/**
++ * gfs_glockd - Reclaim unused glock structures
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_glockd(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++
++ daemonize("gfs_glockd");
++ set_bit(SDF_GLOCKD_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ while (atomic_read(&sdp->sd_reclaim_count))
++ gfs_reclaim_glock(sdp);
++
++ if (!test_bit(SDF_GLOCKD_RUN, &sdp->sd_flags))
++ break;
++
++ {
++ DECLARE_WAITQUEUE(__wait_chan, current);
++ current->state = TASK_INTERRUPTIBLE;
++ add_wait_queue(&sdp->sd_reclaim_wchan, &__wait_chan);
++ if (!atomic_read(&sdp->sd_reclaim_count)
++ && test_bit(SDF_GLOCKD_RUN, &sdp->sd_flags))
++ schedule();
++ remove_wait_queue(&sdp->sd_reclaim_wchan, &__wait_chan);
++ current->state = TASK_RUNNING;
++ }
++ }
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
++
++/**
++ * gfs_recoverd - Recovery of dead machines' journals
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_recoverd(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++
++ daemonize("gfs_recoverd");
++ sdp->sd_recoverd_process = current;
++ set_bit(SDF_RECOVERD_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ gfs_check_journals(sdp);
++
++ if (!test_bit(SDF_RECOVERD_RUN, &sdp->sd_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(sdp->sd_tune.gt_recoverd_secs * HZ);
++ }
++
++ down(&sdp->sd_thread_lock);
++ up(&sdp->sd_thread_lock);
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
++
++/**
++ * gfs_logd - Writing of cached log changes into the log file
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_logd(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++ struct gfs_holder ji_gh;
++
++ daemonize("gfs_logd");
++ sdp->sd_logd_process = current;
++ set_bit(SDF_LOGD_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ gfs_ail_empty(sdp);
++
++ if (time_after_eq(jiffies,
++ sdp->sd_jindex_refresh_time +
++ sdp->sd_tune.gt_jindex_refresh_secs * HZ)) {
++ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags) &&
++ !gfs_jindex_hold(sdp, &ji_gh))
++ gfs_glock_dq_uninit(&ji_gh);
++ sdp->sd_jindex_refresh_time = jiffies;
++ }
++
++ if (!test_bit(SDF_LOGD_RUN, &sdp->sd_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(sdp->sd_tune.gt_logd_secs * HZ);
++ }
++
++ down(&sdp->sd_thread_lock);
++ up(&sdp->sd_thread_lock);
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
++
++/**
++ * gfs_quotad - Writing of cached quota changes into the quota file
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_quotad(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++ int error;
++
++ daemonize("gfs_quotad");
++ sdp->sd_quotad_process = current;
++ set_bit(SDF_QUOTAD_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ if (time_after_eq(jiffies,
++ sdp->sd_quota_sync_time +
++ sdp->sd_tune.gt_quota_quantum * HZ)) {
++ error = gfs_quota_sync(sdp);
++ if (error && error != -EROFS)
++ printk("GFS: fsid=%s: quotad: error = %d\n",
++ sdp->sd_fsname, error);
++ sdp->sd_quota_sync_time = jiffies;
++ }
++
++ gfs_quota_scan(sdp);
++
++ if (!test_bit(SDF_QUOTAD_RUN, &sdp->sd_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(sdp->sd_tune.gt_quotad_secs * HZ);
++ }
++
++ down(&sdp->sd_thread_lock);
++ up(&sdp->sd_thread_lock);
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
++
++/**
++ * gfs_inoded - Deallocation of unlinked inodes
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_inoded(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++
++ daemonize("gfs_inoded");
++ sdp->sd_inoded_process = current;
++ set_bit(SDF_INODED_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ gfs_unlinked_dealloc(sdp);
++
++ if (!test_bit(SDF_INODED_RUN, &sdp->sd_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(sdp->sd_tune.gt_inoded_secs * HZ);
++ }
++
++ down(&sdp->sd_thread_lock);
++ up(&sdp->sd_thread_lock);
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
+diff -urN linux-orig/fs/gfs/daemon.h linux-patched/fs/gfs/daemon.h
+--- linux-orig/fs/gfs/daemon.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/daemon.h 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,24 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DAEMON_DOT_H__
++#define __DAEMON_DOT_H__
++
++int gfs_scand(void *data);
++int gfs_glockd(void *data);
++int gfs_recoverd(void *data);
++int gfs_logd(void *data);
++int gfs_quotad(void *data);
++int gfs_inoded(void *data);
++
++#endif /* __DAEMON_DOT_H__ */
+diff -urN linux-orig/fs/gfs/dio.c linux-patched/fs/gfs/dio.c
+--- linux-orig/fs/gfs/dio.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/dio.c 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,1302 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/mm.h>
++#include <linux/pagemap.h>
++#include <linux/writeback.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "log.h"
++#include "lops.h"
++#include "rgrp.h"
++#include "trans.h"
++
++#define buffer_busy(bh) ((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
++
++/**
++ * aspace_get_block - dummy get_block() for aspace mappings
++ * @inode: the aspace inode
++ * @lblock: the logical block number
++ * @bh_result: the buffer head to map
++ * @create: non-zero to allocate the block if it doesn't exist
++ *
++ * Aspace buffers are always mapped when they are created (see getbuf()
++ * below), so block_write_full_page() should never need to call this;
++ * it asserts if it ever runs.
++ */
++
++static int
++aspace_get_block(struct inode *inode, sector_t lblock,
++ struct buffer_head *bh_result, int create)
++{
++ struct gfs_sbd *sdp = vfs2sdp(inode->i_sb);
++	GFS_ASSERT_SBD(FALSE, sdp,);	/* should never be called */
++	return -EOPNOTSUPP;	/* keep the compiler happy */
++}
++
++/**
++ * gfs_aspace_writepage - write an aspace page
++ * @page: the page
++ * @wbc:
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_aspace_writepage(struct page *page, struct writeback_control *wbc)
++{
++ return block_write_full_page(page, aspace_get_block, wbc);
++}
++
++/**
++ * stuck_releasepage - We're stuck in gfs_releasepage(). Print stuff out.
++ * @bh: the buffer we're stuck on
++ *
++ */
++
++static void
++stuck_releasepage(struct buffer_head *bh)
++{
++ struct gfs_sbd *sdp = vfs2sdp(bh->b_page->mapping->host->i_sb);
++ struct gfs_bufdata *bd = bh2bd(bh);
++
++ printk("GFS: fsid=%s: stuck in gfs_releasepage()...\n", sdp->sd_fsname);
++ printk("GFS: fsid=%s: blkno = %"PRIu64", bh->b_count = %d\n",
++ sdp->sd_fsname,
++ (uint64_t)bh->b_blocknr,
++ atomic_read(&bh->b_count));
++ printk("GFS: fsid=%s: bh2bd(bh) = %s\n",
++ sdp->sd_fsname,
++ (bd) ? "!NULL" : "NULL");
++
++ if (bd) {
++ struct gfs_glock *gl = bd->bd_gl;
++
++ printk("GFS: fsid=%s: gl = (%u, %"PRIu64")\n",
++ sdp->sd_fsname,
++ gl->gl_name.ln_type,
++ gl->gl_name.ln_number);
++
++ printk("GFS: fsid=%s: bd_new_le.le_trans = %s\n",
++ sdp->sd_fsname,
++ (bd->bd_new_le.le_trans) ? "!NULL" : "NULL");
++ printk("GFS: fsid=%s: bd_incore_le.le_trans = %s\n",
++ sdp->sd_fsname,
++ (bd->bd_incore_le.le_trans) ? "!NULL" : "NULL");
++ printk("GFS: fsid=%s: bd_frozen = %s\n",
++ sdp->sd_fsname,
++ (bd->bd_frozen) ? "!NULL" : "NULL");
++ printk("GFS: fsid=%s: bd_pinned = %u\n",
++ sdp->sd_fsname, bd->bd_pinned);
++ printk("GFS: fsid=%s: bd_ail_tr_list = %s\n",
++ sdp->sd_fsname,
++ (list_empty(&bd->bd_ail_tr_list)) ? "Empty" : "!Empty");
++
++ if (gl->gl_ops == &gfs_inode_glops) {
++ struct gfs_inode *ip = gl2ip(gl);
++
++ if (ip) {
++ unsigned int x;
++
++ printk("GFS: fsid=%s: ip = %"PRIu64"/%"PRIu64"\n",
++ sdp->sd_fsname,
++ ip->i_num.no_formal_ino,
++ ip->i_num.no_addr);
++ printk("GFS: fsid=%s: ip->i_count = %d, ip->i_vnode = %s\n",
++ sdp->sd_fsname,
++ atomic_read(&ip->i_count),
++ (ip->i_vnode) ? "!NULL" : "NULL");
++ for (x = 0; x < GFS_MAX_META_HEIGHT; x++)
++ printk("GFS: fsid=%s: ip->i_cache[%u] = %s\n",
++ sdp->sd_fsname, x,
++ (ip->i_cache[x]) ? "!NULL" : "NULL");
++ }
++ }
++ }
++}
++
++/**
++ * gfs_aspace_releasepage - free the metadata associated with a page
++ * @page: the page that's being released
++ * @gfp_mask: the gfp mask of the allocation that is triggering the release
++ *
++ * Call try_to_free_buffers() if the buffers in this page can be
++ * released.
++ *
++ * Returns: 0 if the buffers could not be released, non-zero if they were
++ */
++
++static int
++gfs_aspace_releasepage(struct page *page, int gfp_mask)
++{
++ struct inode *aspace = page->mapping->host;
++ struct gfs_sbd *sdp = vfs2sdp(aspace->i_sb);
++ struct buffer_head *bh, *head;
++ struct gfs_bufdata *bd;
++ unsigned long t;
++
++ if (!page_has_buffers(page))
++ goto out;
++
++ head = bh = page_buffers(page);
++ do {
++ t = jiffies;
++
++ while (atomic_read(&bh->b_count)) {
++ if (atomic_read(&aspace->i_writecount)) {
++ if (time_after_eq(jiffies,
++ t +
++ sdp->sd_tune.gt_stall_secs * HZ)) {
++ stuck_releasepage(bh);
++ t = jiffies;
++ }
++
++ yield();
++ continue;
++ }
++
++ return 0;
++ }
++
++ bd = bh2bd(bh);
++ if (bd) {
++ GFS_ASSERT_SBD(bd->bd_bh == bh, sdp,);
++ GFS_ASSERT_SBD(!bd->bd_new_le.le_trans, sdp,);
++ GFS_ASSERT_SBD(!bd->bd_incore_le.le_trans, sdp,);
++ GFS_ASSERT_SBD(!bd->bd_frozen, sdp,);
++ GFS_ASSERT_SBD(!bd->bd_pinned, sdp,);
++ GFS_ASSERT_SBD(list_empty(&bd->bd_ail_tr_list), sdp,);
++ kmem_cache_free(gfs_bufdata_cachep, bd);
++ atomic_dec(&sdp->sd_bufdata_count);
++ bh2bd(bh) = NULL;
++ }
++
++ bh = bh->b_this_page;
++ }
++ while (bh != head);
++
++ out:
++ return try_to_free_buffers(page);
++}
++
++static struct address_space_operations aspace_aops = {
++ .writepage = gfs_aspace_writepage,
++ .releasepage = gfs_aspace_releasepage,
++};
++
++/**
++ * gfs_aspace_get - Get and initialize a struct inode structure
++ * @sdp: the filesystem the aspace is in
++ *
++ * Right now a struct inode is just a struct inode. Maybe Linux
++ * will supply a more lightweight address space construct (that works)
++ * in the future.
++ *
++ * Make sure pages/buffers in this aspace aren't in high memory.
++ *
++ * Returns: the aspace
++ */
++
++struct inode *
++gfs_aspace_get(struct gfs_sbd *sdp)
++{
++ struct inode *aspace;
++
++ aspace = new_inode(sdp->sd_vfs);
++ if (aspace) {
++ mapping_set_gfp_mask(aspace->i_mapping, GFP_KERNEL);
++ aspace->i_mapping->a_ops = &aspace_aops;
++ aspace->i_size = ~0ULL;
++ vn2ip(aspace) = NULL;
++ insert_inode_hash(aspace);
++ }
++
++ return aspace;
++}
++
++/**
++ * gfs_aspace_put - get rid of an aspace
++ * @aspace:
++ *
++ */
++
++void
++gfs_aspace_put(struct inode *aspace)
++{
++ remove_inode_hash(aspace);
++ iput(aspace);
++}
++
++/**
++ * gfs_ail_start_trans - Start I/O on a part of the AIL
++ * @sdp: the filesystem
++ * @tr: the part of the AIL
++ *
++ */
++
++void
++gfs_ail_start_trans(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *head, *tmp, *prev;
++ struct gfs_bufdata *bd;
++ struct buffer_head *bh;
++ int retry;
++
++ do {
++ retry = FALSE;
++
++ spin_lock(&sdp->sd_ail_lock);
++
++ for (head = &tr->tr_ail_bufs, tmp = head->prev, prev = tmp->prev;
++ tmp != head;
++ tmp = prev, prev = tmp->prev) {
++ bd = list_entry(tmp, struct gfs_bufdata, bd_ail_tr_list);
++ bh = bd->bd_bh;
++
++ if (gfs_trylock_buffer(bh))
++ continue;
++
++ if (bd->bd_pinned) {
++ gfs_unlock_buffer(bh);
++ continue;
++ }
++
++ if (!buffer_busy(bh)) {
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++
++ list_del_init(&bd->bd_ail_tr_list);
++ list_del(&bd->bd_ail_gl_list);
++
++ gfs_unlock_buffer(bh);
++ brelse(bh);
++ continue;
++ }
++
++ if (buffer_dirty(bh)) {
++ list_move(&bd->bd_ail_tr_list, head);
++
++ spin_unlock(&sdp->sd_ail_lock);
++ wait_on_buffer(bh);
++ ll_rw_block(WRITE, 1, &bh);
++ spin_lock(&sdp->sd_ail_lock);
++
++ gfs_unlock_buffer(bh);
++ retry = TRUE;
++ break;
++ }
++
++ gfs_unlock_buffer(bh);
++ }
++
++ spin_unlock(&sdp->sd_ail_lock);
++ } while (retry);
++}
++
++/**
++ * gfs_ail_empty_trans - Check whether or not a trans in the AIL has been synced
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ * Returns: TRUE if all of @tr's AIL buffers have hit the disk, FALSE otherwise
++ */
++
++int
++gfs_ail_empty_trans(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *head, *tmp, *prev;
++ struct gfs_bufdata *bd;
++ struct buffer_head *bh;
++ int ret;
++
++ spin_lock(&sdp->sd_ail_lock);
++
++ for (head = &tr->tr_ail_bufs, tmp = head->prev, prev = tmp->prev;
++ tmp != head;
++ tmp = prev, prev = tmp->prev) {
++ bd = list_entry(tmp, struct gfs_bufdata, bd_ail_tr_list);
++ bh = bd->bd_bh;
++
++ if (gfs_trylock_buffer(bh))
++ continue;
++
++ if (bd->bd_pinned || buffer_busy(bh)) {
++ gfs_unlock_buffer(bh);
++ continue;
++ }
++
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++
++ list_del_init(&bd->bd_ail_tr_list);
++ list_del(&bd->bd_ail_gl_list);
++
++ gfs_unlock_buffer(bh);
++ brelse(bh);
++ }
++
++ ret = list_empty(head);
++
++ spin_unlock(&sdp->sd_ail_lock);
++
++ return ret;
++}
++
++/**
++ * ail_empty_gl - remove all buffers for a given lock from the AIL
++ * @gl: the glock
++ *
++ * None of the buffers should be dirty, locked, or pinned.
++ */
++
++static void
++ail_empty_gl(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_bufdata *bd;
++ struct buffer_head *bh;
++
++ spin_lock(&sdp->sd_ail_lock);
++
++ while (!list_empty(&gl->gl_ail_bufs)) {
++ bd = list_entry(gl->gl_ail_bufs.next,
++ struct gfs_bufdata, bd_ail_gl_list);
++ bh = bd->bd_bh;
++
++ GFS_ASSERT_GLOCK(!bd->bd_pinned && !buffer_busy(bh), gl,
++ printk("%u %.8lX\n", bd->bd_pinned, bh->b_state););
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++
++ list_del_init(&bd->bd_ail_tr_list);
++ list_del(&bd->bd_ail_gl_list);
++
++ brelse(bh);
++ }
++
++ spin_unlock(&sdp->sd_ail_lock);
++}
++
++/**
++ * gfs_inval_buf - Invalidate all buffers associated with a glock
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_inval_buf(struct gfs_glock *gl)
++{
++ struct inode *aspace = gl->gl_aspace;
++ struct address_space *mapping = gl->gl_aspace->i_mapping;
++
++ ail_empty_gl(gl);
++
++ atomic_inc(&aspace->i_writecount);
++ truncate_inode_pages(mapping, 0);
++ atomic_dec(&aspace->i_writecount);
++
++ GFS_ASSERT_GLOCK(!mapping->nrpages, gl,);
++}
++
++/**
++ * gfs_sync_buf - Sync all buffers associated with a glock
++ * @gl: The glock
++ * @flags: DIO_START | DIO_WAIT, optionally DIO_INVISIBLE and/or DIO_CHECK
++ *
++ */
++
++void
++gfs_sync_buf(struct gfs_glock *gl, int flags)
++{
++ struct address_space *mapping = gl->gl_aspace->i_mapping;
++ int error = 0;
++
++ if (flags & DIO_START)
++ error = filemap_fdatawrite(mapping);
++ if (!error && (flags & DIO_WAIT))
++ error = filemap_fdatawait(mapping);
++ if (!error && (flags & (DIO_INVISIBLE | DIO_CHECK)) == DIO_CHECK)
++ ail_empty_gl(gl);
++
++ if (error)
++ gfs_io_error(gl->gl_sbd);
++}
++
++/**
++ * getbuf - Get a buffer with a given address space
++ * @sdp: the filesystem
++ * @aspace: the address space
++ * @blkno: the block number
++ * @create: TRUE if the buffer should be created
++ *
++ * Returns: the buffer
++ */
++
++static struct buffer_head *
++getbuf(struct gfs_sbd *sdp, struct inode *aspace, uint64_t blkno, int create)
++{
++ struct page *page;
++ struct buffer_head *bh;
++ unsigned int shift;
++ unsigned long index;
++ unsigned int bufnum;
++
++ shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
++ index = blkno >> shift;
++ bufnum = blkno - (index << shift);
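++	/* e.g. with 4096-byte pages and 1024-byte blocks: shift == 2, so
++	   block 13 lives in page 3 (13 >> 2) as buffer 1 (13 - 12) */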
++
++ if (create) {
++ RETRY_MALLOC(page = grab_cache_page(aspace->i_mapping, index), page);
++ } else {
++ page = find_lock_page(aspace->i_mapping, index);
++ if (!page)
++ return NULL;
++ }
++
++ if (!page_has_buffers(page))
++ create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
++
++ for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
++ /* Do nothing */;
++ get_bh(bh);
++
++ if (!buffer_mapped(bh))
++ map_bh(bh, sdp->sd_vfs, blkno);
++ else
++ GFS_ASSERT_SBD(bh->b_bdev == sdp->sd_vfs->s_bdev &&
++ bh->b_blocknr == blkno,
++ sdp,);
++
++ unlock_page(page);
++ page_cache_release(page);
++
++ return bh;
++}
++
++/**
++ * gfs_dgetblk - Get a block
++ * @sdp: The GFS superblock
++ * @blkno: The block number
++ * @gl: The glock associated with this block
++ *
++ * Returns: The buffer
++ */
++
++struct buffer_head *
++gfs_dgetblk(struct gfs_sbd *sdp, uint64_t blkno, struct gfs_glock *gl)
++{
++ struct buffer_head *bh;
++
++ if (gl)
++ bh = getbuf(sdp, gl->gl_aspace, blkno, CREATE);
++ else
++ bh = sb_getblk(sdp->sd_vfs, blkno);
++
++ return bh;
++}
++
++/**
++ * gfs_dread - Read a block from disk
++ * @sdp: The GFS superblock
++ * @blkno: The block number
++ * @gl: The glock covering the block
++ * @flags: flags to gfs_dreread()
++ * @bhp: the place where the buffer is returned
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_dread(struct gfs_sbd *sdp, uint64_t blkno, struct gfs_glock *gl, int flags,
++ struct buffer_head **bhp)
++{
++ int error;
++
++ *bhp = gfs_dgetblk(sdp, blkno, gl);
++ error = gfs_dreread(sdp, *bhp, flags);
++ if (error)
++ brelse(*bhp);
++
++ return error;
++}
++
++/**
++ * gfs_prep_new_buffer - Mark a new buffer we just gfs_dgetblk()ed uptodate
++ * @bh: the buffer
++ *
++ */
++
++void
++gfs_prep_new_buffer(struct buffer_head *bh)
++{
++ wait_on_buffer(bh);
++ clear_buffer_dirty(bh);
++ set_buffer_uptodate(bh);
++}
++
++/**
++ * gfs_dreread - Reread a block from disk
++ * @sdp: the filesystem
++ * @bh: The block to read
++ * @flags: Flags that control the read
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_dreread(struct gfs_sbd *sdp, struct buffer_head *bh, int flags)
++{
++ int error = 0;
++
++ if (flags & DIO_NEW) {
++ if (gfs_mhc_fish(sdp, bh))
++ return 0;
++ clear_buffer_uptodate(bh);
++ }
++
++ if (flags & DIO_FORCE)
++ clear_buffer_uptodate(bh);
++
++ if ((flags & DIO_START) && !buffer_uptodate(bh))
++ ll_rw_block(READ, 1, &bh);
++
++ if (flags & DIO_WAIT) {
++ wait_on_buffer(bh);
++
++ if (!buffer_uptodate(bh)) {
++ gfs_io_error_bh(sdp, bh);
++ error = -EIO;
++ }
++ }
++
++ return error;
++}
++
++/**
++ * gfs_dwrite - Write a buffer
++ * @sdp: the filesystem
++ * @bh: The buffer to write
++ * @flags: The type of write operation to do
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_dwrite(struct gfs_sbd *sdp, struct buffer_head *bh, int flags)
++{
++ int error = 0;
++
++ GFS_ASSERT_SBD(buffer_uptodate(bh), sdp,);
++ GFS_ASSERT_SBD(!test_bit(SDF_ROFS, &sdp->sd_flags), sdp,);
++
++ if (flags & DIO_CLEAN) {
++ lock_buffer(bh);
++ clear_buffer_dirty(bh);
++ unlock_buffer(bh);
++ }
++
++ if (flags & DIO_DIRTY)
++ mark_buffer_dirty(bh);
++
++ if ((flags & DIO_START) && buffer_dirty(bh)) {
++ wait_on_buffer(bh);
++ ll_rw_block(WRITE, 1, &bh);
++ }
++
++ if (flags & DIO_WAIT) {
++ wait_on_buffer(bh);
++
++ if (!buffer_uptodate(bh) || buffer_dirty(bh)) {
++ gfs_io_error_bh(sdp, bh);
++ error = -EIO;
++ }
++ }
++
++ return error;
++}
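++
++/*
++ * A rough summary of how the DIO_* flags compose (illustrative, not
++ * exhaustive -- see the individual routines above for details):
++ *
++ *	DIO_NEW		the block is newly allocated; try the meta-header
++ *			cache before touching the disk
++ *	DIO_FORCE	discard any cached contents and reread from disk
++ *	DIO_START	start the I/O
++ *	DIO_WAIT	wait for the I/O to complete
++ *
++ * so a plain synchronous metadata read looks like:
++ *
++ *	error = gfs_dread(sdp, blkno, gl, DIO_START | DIO_WAIT, &bh);
++ */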
++
++/**
++ * gfs_attach_bufdata - attach a struct gfs_bufdata structure to a buffer
++ * @bh: The buffer to be attached to
++ * @gl: the glock the buffer belongs to
++ *
++ */
++
++void
++gfs_attach_bufdata(struct buffer_head *bh, struct gfs_glock *gl)
++{
++ struct gfs_bufdata *bd;
++
++ lock_page(bh->b_page);
++
++ if (bh2bd(bh)) {
++ unlock_page(bh->b_page);
++ return;
++ }
++
++ RETRY_MALLOC(bd = kmem_cache_alloc(gfs_bufdata_cachep, GFP_KERNEL), bd);
++ atomic_inc(&gl->gl_sbd->sd_bufdata_count);
++
++ memset(bd, 0, sizeof(struct gfs_bufdata));
++
++ bd->bd_bh = bh;
++ bd->bd_gl = gl;
++
++ INIT_LE(&bd->bd_new_le, &gfs_buf_lops);
++ INIT_LE(&bd->bd_incore_le, &gfs_buf_lops);
++
++ init_MUTEX(&bd->bd_lock);
++
++ INIT_LIST_HEAD(&bd->bd_ail_tr_list);
++
++ bh2bd(bh) = bd;
++
++ unlock_page(bh->b_page);
++}
++
++/**
++ * gfs_is_pinned - Figure out if a buffer is pinned or not
++ * @sdp: the filesystem the buffer belongs to
++ * @bh: The buffer to check
++ *
++ * Returns: TRUE if the buffer is pinned, FALSE otherwise
++ */
++
++int
++gfs_is_pinned(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ struct gfs_bufdata *bd = bh2bd(bh);
++ int ret = FALSE;
++
++ if (bd) {
++ gfs_lock_buffer(bh);
++ if (bd->bd_pinned)
++ ret = TRUE;
++ gfs_unlock_buffer(bh);
++ }
++
++ return ret;
++}
++
++/**
++ * gfs_dpin - Pin a metadata buffer in memory
++ * @sdp: the filesystem the buffer belongs to
++ * @bh: The buffer to be pinned
++ *
++ */
++
++void
++gfs_dpin(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ struct gfs_bufdata *bd;
++ char *data;
++
++ GFS_ASSERT_SBD(buffer_uptodate(bh), sdp,);
++ GFS_ASSERT_SBD(!test_bit(SDF_ROFS, &sdp->sd_flags), sdp,);
++
++ bd = bh2bd(bh);
++ GFS_ASSERT_SBD(bd, sdp,);
++
++ gfs_lock_buffer(bh);
++
++ GFS_ASSERT_GLOCK(!bd->bd_frozen, bd->bd_gl,);
++
++ if (!bd->bd_pinned++) {
++ wait_on_buffer(bh);
++
++ /* If this buffer is in the AIL and it has already been written,
++ remove it from the AIL. */
++
++ spin_lock(&sdp->sd_ail_lock);
++ if (!list_empty(&bd->bd_ail_tr_list) && !buffer_busy(bh)) {
++ list_del_init(&bd->bd_ail_tr_list);
++ list_del(&bd->bd_ail_gl_list);
++ brelse(bh);
++ }
++ spin_unlock(&sdp->sd_ail_lock);
++
++ clear_buffer_dirty(bh);
++ wait_on_buffer(bh);
++
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++ } else {
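++		/* The buffer is already pinned by an earlier, still-active
++		   transaction; freeze a copy of its current contents so
++		   that transaction's log write sees stable data. */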
++ gfs_unlock_buffer(bh);
++
++ data = gmalloc(sdp->sd_sb.sb_bsize);
++
++ gfs_lock_buffer(bh);
++ if (bd->bd_pinned > 1) {
++ memcpy(data, bh->b_data, sdp->sd_sb.sb_bsize);
++ bd->bd_frozen = data;
++ } else
++ kfree(data);
++ }
++
++ gfs_unlock_buffer(bh);
++
++ get_bh(bh);
++}
++
++/**
++ * gfs_dunpin - Unpin a buffer
++ * @sdp: the filesystem the buffer belongs to
++ * @bh: The buffer to unpin
++ * @tr: The transaction in the AIL that contains this buffer
++ *
++ */
++
++void
++gfs_dunpin(struct gfs_sbd *sdp, struct buffer_head *bh, struct gfs_trans *tr)
++{
++ struct gfs_bufdata *bd;
++
++ GFS_ASSERT_SBD(buffer_uptodate(bh), sdp,);
++
++ bd = bh2bd(bh);
++ GFS_ASSERT_SBD(bd, sdp,);
++
++ gfs_lock_buffer(bh);
++
++ GFS_ASSERT_GLOCK(bd->bd_pinned, bd->bd_gl,);
++
++ if (bd->bd_pinned == 1)
++ mark_buffer_dirty(bh);
++
++ bd->bd_pinned--;
++
++ gfs_unlock_buffer(bh);
++
++ /* Add the buffer to the AIL
++ and get rid of an old reference if there is one */
++
++ if (tr) {
++ spin_lock(&sdp->sd_ail_lock);
++
++ if (list_empty(&bd->bd_ail_tr_list))
++ list_add(&bd->bd_ail_gl_list, &bd->bd_gl->gl_ail_bufs);
++ else {
++ list_del_init(&bd->bd_ail_tr_list);
++ brelse(bh);
++ }
++ list_add(&bd->bd_ail_tr_list, &tr->tr_ail_bufs);
++
++ spin_unlock(&sdp->sd_ail_lock);
++ } else
++ brelse(bh);
++}
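++
++/*
++ * Pinning keeps a metadata buffer from being written back to its
++ * in-place location while its log copy is outstanding.  A minimal
++ * sketch of the lifecycle (illustrative; the real callers live in the
++ * transaction and log-operations code):
++ *
++ *	gfs_dpin(sdp, bh);	   log flush begins: hold the buffer
++ *	... copy the buffer's contents into the journal ...
++ *	gfs_dunpin(sdp, bh, tr);   log write done: dirty the buffer and
++ *				   move it onto @tr's AIL for writeback
++ */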
++
++/**
++ * logbh_end_io - called at the end of a logbh write
++ * @bh: the buffer
++ * @uptodate: whether or not the write succeeded
++ *
++ * This is an I/O completion callback that may run in interrupt context,
++ * so don't do ENTER() and EXIT() tracing here.
++ *
++ */
++
++static void
++logbh_end_io(struct buffer_head *bh, int uptodate)
++{
++ if (uptodate)
++ set_buffer_uptodate(bh);
++ else
++ clear_buffer_uptodate(bh);
++ unlock_buffer(bh);
++}
++
++/**
++ * gfs_logbh_init - Initialize a fake buffer head
++ * @sdp: the filesystem
++ * @bh: the buffer to initialize
++ * @blkno: the block address of the buffer
++ * @data: the data to be written
++ *
++ */
++
++void
++gfs_logbh_init(struct gfs_sbd *sdp, struct buffer_head *bh,
++ uint64_t blkno, char *data)
++{
++ memset(bh, 0, sizeof(struct buffer_head));
++ bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
++ atomic_set(&bh->b_count, 1);
++ set_bh_page(bh, virt_to_page(data), ((unsigned long)data) & (PAGE_SIZE - 1));
++ bh->b_blocknr = blkno;
++ bh->b_size = sdp->sd_sb.sb_bsize;
++ bh->b_bdev = sdp->sd_vfs->s_bdev;
++ init_buffer(bh, logbh_end_io, NULL);
++ INIT_LIST_HEAD(&bh->b_assoc_buffers);
++}
++
++/**
++ * gfs_logbh_uninit - Clean up a fake buffer head
++ * @sdp: the filesystem
++ * @bh: the buffer to clean
++ *
++ */
++
++void
++gfs_logbh_uninit(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ GFS_ASSERT_SBD(!buffer_busy(bh) &&
++ atomic_read(&bh->b_count) == 1,
++ sdp,);
++}
++
++/**
++ * gfs_logbh_start - Start writing a fake buffer head
++ * @sdp: the filesystem
++ * @bh: the buffer to write
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_logbh_start(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ submit_bh(WRITE, bh);
++ return 0;
++}
++
++/**
++ * gfs_logbh_wait - Wait for the write of a fake buffer head to complete
++ * @sdp: the filesystem
++ * @bh: the buffer to wait on
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_logbh_wait(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ int error = 0;
++
++ wait_on_buffer(bh);
++
++ if (!buffer_uptodate(bh) || buffer_dirty(bh)) {
++ gfs_io_error_bh(sdp, bh);
++ error = -EIO;
++ }
++
++ return error;
++}
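++
++/*
++ * A minimal usage sketch for the fake buffer heads above (illustrative
++ * only):
++ *
++ *	struct buffer_head bh;
++ *
++ *	gfs_logbh_init(sdp, &bh, blkno, data);
++ *	gfs_logbh_start(sdp, &bh);
++ *	error = gfs_logbh_wait(sdp, &bh);
++ *	gfs_logbh_uninit(sdp, &bh);
++ *
++ * The buffer head lives on the caller's stack and @data supplies the
++ * block contents, so log writes don't consume page-cache pages.
++ */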
++
++/**
++ * gfs_replay_buf - write a log buffer to its in-place location
++ * @gl: the journal's glock
++ * @bh: the buffer
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_replay_buf(struct gfs_glock *gl, struct buffer_head *bh)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_bufdata *bd;
++
++ bd = bh2bd(bh);
++ if (!bd) {
++ gfs_attach_bufdata(bh, gl);
++ bd = bh2bd(bh);
++ }
++
++ mark_buffer_dirty(bh);
++
++ if (list_empty(&bd->bd_ail_tr_list)) {
++ get_bh(bh);
++ list_add(&bd->bd_ail_tr_list, &sdp->sd_recovery_bufs);
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_replay_check - Check up on journal replay
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_replay_check(struct gfs_sbd *sdp)
++{
++ struct buffer_head *bh;
++ struct gfs_bufdata *bd;
++
++ while (!list_empty(&sdp->sd_recovery_bufs)) {
++ bd = list_entry(sdp->sd_recovery_bufs.prev,
++ struct gfs_bufdata, bd_ail_tr_list);
++ bh = bd->bd_bh;
++
++ if (buffer_busy(bh)) {
++ list_move(&bd->bd_ail_tr_list,
++ &sdp->sd_recovery_bufs);
++ break;
++ } else {
++ list_del_init(&bd->bd_ail_tr_list);
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++ brelse(bh);
++ }
++ }
++}
++
++/**
++ * gfs_replay_wait - Wait for all replayed buffers to hit the disk
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_replay_wait(struct gfs_sbd *sdp)
++{
++ struct list_head *head, *tmp, *prev;
++ struct buffer_head *bh;
++ struct gfs_bufdata *bd;
++
++ for (head = &sdp->sd_recovery_bufs, tmp = head->prev, prev = tmp->prev;
++ tmp != head;
++ tmp = prev, prev = tmp->prev) {
++ bd = list_entry(tmp, struct gfs_bufdata, bd_ail_tr_list);
++ bh = bd->bd_bh;
++
++ if (!buffer_busy(bh)) {
++ list_del_init(&bd->bd_ail_tr_list);
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++ brelse(bh);
++ continue;
++ }
++
++ if (buffer_dirty(bh)) {
++ wait_on_buffer(bh);
++ ll_rw_block(WRITE, 1, &bh);
++ }
++ }
++
++ while (!list_empty(head)) {
++ bd = list_entry(head->prev, struct gfs_bufdata, bd_ail_tr_list);
++ bh = bd->bd_bh;
++
++ wait_on_buffer(bh);
++
++ GFS_ASSERT_SBD(!buffer_busy(bh), sdp,);
++
++ list_del_init(&bd->bd_ail_tr_list);
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++ brelse(bh);
++ }
++}
++
++/**
++ * gfs_wipe_buffers - make sure freed blocks' buffers are neither dirty nor pinned
++ * @ip: the inode that owns the buffers
++ * @rgd: the resource group the blocks are being freed from
++ * @bstart: the first buffer in the run
++ * @blen: the number of buffers in the run
++ *
++ */
++
++void
++gfs_wipe_buffers(struct gfs_inode *ip, struct gfs_rgrpd *rgd,
++ uint64_t bstart, uint32_t blen)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct inode *aspace = ip->i_gl->gl_aspace;
++ struct buffer_head *bh;
++ struct gfs_bufdata *bd;
++ int busy;
++ int add = FALSE;
++
++ while (blen) {
++ bh = getbuf(sdp, aspace, bstart, NO_CREATE);
++ if (bh) {
++
++ bd = bh2bd(bh);
++
++ if (buffer_uptodate(bh)) {
++ if (bd) {
++ gfs_lock_buffer(bh);
++ gfs_mhc_add(rgd, &bh, 1);
++ busy = bd->bd_pinned || buffer_busy(bh);
++ gfs_unlock_buffer(bh);
++
++ if (busy)
++ add = TRUE;
++ else {
++ spin_lock(&sdp->sd_ail_lock);
++ if (!list_empty(&bd->bd_ail_tr_list)) {
++ list_del_init(&bd->bd_ail_tr_list);
++ list_del(&bd->bd_ail_gl_list);
++ brelse(bh);
++ }
++ spin_unlock(&sdp->sd_ail_lock);
++ }
++ } else {
++ GFS_ASSERT_INODE(!buffer_dirty(bh), ip,);
++ wait_on_buffer(bh);
++ GFS_ASSERT_INODE(!buffer_busy(bh), ip,);
++ gfs_mhc_add(rgd, &bh, 1);
++ }
++ } else {
++ GFS_ASSERT_INODE(!bd || !bd->bd_pinned, ip,);
++ GFS_ASSERT_INODE(!buffer_dirty(bh), ip,);
++ wait_on_buffer(bh);
++ GFS_ASSERT_INODE(!buffer_busy(bh), ip,);
++ }
++
++ brelse(bh);
++ }
++
++ bstart++;
++ blen--;
++ }
++
++ if (add)
++ gfs_depend_add(rgd, ip->i_num.no_formal_ino);
++}
++
++/**
++ * gfs_sync_meta - sync all the buffers in a filesystem
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_sync_meta(struct gfs_sbd *sdp)
++{
++ gfs_log_flush(sdp);
++ for (;;) {
++ gfs_ail_start(sdp, DIO_ALL);
++ if (gfs_ail_empty(sdp))
++ break;
++
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ / 10);
++ }
++}
++
++/**
++ * gfs_flush_meta_cache - get rid of any references on buffers for this inode
++ * @ip: The GFS inode
++ *
++ */
++
++void
++gfs_flush_meta_cache(struct gfs_inode *ip)
++{
++ struct buffer_head **bh_slot;
++ unsigned int x;
++
++ spin_lock(&ip->i_lock);
++
++ for (x = 0; x < GFS_MAX_META_HEIGHT; x++) {
++ bh_slot = &ip->i_cache[x];
++ if (*bh_slot) {
++ brelse(*bh_slot);
++ *bh_slot = NULL;
++ }
++ }
++
++ spin_unlock(&ip->i_lock);
++}
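++
++/* ip->i_cache[] caches at most one buffer head per height of the
++   metadata tree; gfs_get_meta_buffer() below keeps it current and
++   gfs_flush_meta_cache() above drops the references. */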
++
++/**
++ * gfs_get_meta_buffer - Get a metadata buffer
++ * @ip: The GFS inode
++ * @height: The height in the metadata tree (0 is the dinode itself)
++ * @num: The block number (device relative) of the buffer
++ * @new: Non-zero if we may create a new buffer
++ * @bhp: the buffer is returned here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_get_meta_buffer(struct gfs_inode *ip, int height, uint64_t num, int new,
++ struct buffer_head **bhp)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh, **bh_slot = &ip->i_cache[height];
++ int flags = ((new) ? DIO_NEW : 0) | DIO_START | DIO_WAIT;
++ int error;
++
++ spin_lock(&ip->i_lock);
++ bh = *bh_slot;
++ if (bh) {
++ if (bh->b_blocknr == num)
++ get_bh(bh);
++ else
++ bh = NULL;
++ }
++ spin_unlock(&ip->i_lock);
++
++ if (bh) {
++ error = gfs_dreread(sdp, bh, flags);
++ if (error) {
++ brelse(bh);
++ return error;
++ }
++ } else {
++ error = gfs_dread(sdp, num, ip->i_gl, flags, &bh);
++ if (error)
++ return error;
++
++ spin_lock(&ip->i_lock);
++ if (*bh_slot != bh) {
++ if (*bh_slot)
++ brelse(*bh_slot);
++ *bh_slot = bh;
++ get_bh(bh);
++ }
++ spin_unlock(&ip->i_lock);
++ }
++
++ if (new) {
++ GFS_ASSERT_INODE(height, ip,);
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_IN, GFS_FORMAT_IN);
++ gfs_buffer_clear_tail(bh, sizeof(struct gfs_meta_header));
++ } else
++ gfs_metatype_check(sdp, bh,
++ (height) ? GFS_METATYPE_IN : GFS_METATYPE_DI);
++
++ *bhp = bh;
++
++ return 0;
++}
++
++/**
++ * gfs_get_data_buffer - Get a data buffer
++ * @ip: The GFS inode
++ * @block: The block number (device relative) of the data block
++ * @new: Non-zero if this is a new allocation
++ * @bhp: the buffer is returned here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_get_data_buffer(struct gfs_inode *ip, uint64_t block, int new,
++ struct buffer_head **bhp)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ int error = 0;
++
++ if (block == ip->i_num.no_addr) {
++ GFS_ASSERT_INODE(!new, ip,);
++
++ error = gfs_dread(sdp, block, ip->i_gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_DI);
++ } else if (gfs_is_jdata(ip)) {
++ if (new) {
++ error = gfs_dread(sdp, block, ip->i_gl,
++ DIO_NEW | DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_JD, GFS_FORMAT_JD);
++ gfs_buffer_clear_tail(bh, sizeof(struct gfs_meta_header));
++ } else {
++ error = gfs_dread(sdp, block, ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_JD);
++ }
++ } else {
++ if (new) {
++ bh = gfs_dgetblk(sdp, block, ip->i_gl);
++ gfs_prep_new_buffer(bh);
++ } else {
++ error = gfs_dread(sdp, block, ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++ }
++ }
++
++ *bhp = bh;
++
++ return 0;
++}
++
++/**
++ * gfs_start_ra - start readahead on an extent of a file
++ * @gl: the glock the blocks belong to
++ * @dblock: the starting disk block
++ * @extlen: the number of blocks in the extent
++ *
++ */
++
++void
++gfs_start_ra(struct gfs_glock *gl, uint64_t dblock, uint32_t extlen)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct inode *aspace = gl->gl_aspace;
++ struct buffer_head *first_bh, *bh;
++ uint32_t max_ra = sdp->sd_tune.gt_max_readahead >> sdp->sd_sb.sb_bsize_shift;
++ int error;
++
++ GFS_ASSERT_GLOCK(extlen, gl,);
++ if (!max_ra)
++ return;
++ if (extlen > max_ra)
++ extlen = max_ra;
++
++ first_bh = getbuf(sdp, aspace, dblock, CREATE);
++
++ if (buffer_uptodate(first_bh))
++ goto out;
++ if (!buffer_locked(first_bh)) {
++ error = gfs_dreread(sdp, first_bh, DIO_START);
++ if (error)
++ goto out;
++ }
++
++ dblock++;
++ extlen--;
++
++ while (extlen) {
++ bh = getbuf(sdp, aspace, dblock, CREATE);
++
++ if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
++ error = gfs_dreread(sdp, bh, DIO_START);
++ brelse(bh);
++ if (error)
++ goto out;
++ } else
++ brelse(bh);
++
++ dblock++;
++ extlen--;
++
++ if (buffer_uptodate(first_bh))
++ break;
++ }
++
++ out:
++ brelse(first_bh);
++}
+diff -urN linux-orig/fs/gfs/dio.h linux-patched/fs/gfs/dio.h
+--- linux-orig/fs/gfs/dio.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/dio.h 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,195 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DIO_DOT_H__
++#define __DIO_DOT_H__
++
++void gfs_ail_start_trans(struct gfs_sbd *sdp, struct gfs_trans *tr);
++int gfs_ail_empty_trans(struct gfs_sbd *sdp, struct gfs_trans *tr);
++
++/* Asynchronous I/O Routines */
++
++struct buffer_head *gfs_dgetblk(struct gfs_sbd *sdp, uint64_t blkno,
++ struct gfs_glock *gl);
++int gfs_dread(struct gfs_sbd *sdp, uint64_t blkno, struct gfs_glock *gl,
++ int flags, struct buffer_head **bhp);
++
++void gfs_prep_new_buffer(struct buffer_head *bh);
++int gfs_dreread(struct gfs_sbd *sdp, struct buffer_head *bh, int flags);
++int gfs_dwrite(struct gfs_sbd *sdp, struct buffer_head *bh, int flags);
++
++void gfs_attach_bufdata(struct buffer_head *bh, struct gfs_glock *gl);
++int gfs_is_pinned(struct gfs_sbd *sdp, struct buffer_head *bh);
++void gfs_dpin(struct gfs_sbd *sdp, struct buffer_head *bh);
++void gfs_dunpin(struct gfs_sbd *sdp, struct buffer_head *bh,
++ struct gfs_trans *tr);
++
++static __inline__
++void gfs_lock_buffer(struct buffer_head *bh)
++{
++ struct gfs_bufdata *bd = bh2bd(bh);
++ down(&bd->bd_lock);
++}
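++/* Note: down_trylock() semantics -- returns zero when the lock was
++   acquired and non-zero when it was not, the opposite sense of most
++   *_trylock() helpers.  Callers treat non-zero as "skip this buffer". */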
++static __inline__
++int gfs_trylock_buffer(struct buffer_head *bh)
++{
++ struct gfs_bufdata *bd = bh2bd(bh);
++ return down_trylock(&bd->bd_lock);
++}
++static __inline__
++void gfs_unlock_buffer(struct buffer_head *bh)
++{
++ struct gfs_bufdata *bd = bh2bd(bh);
++ up(&bd->bd_lock);
++}
++
++void gfs_logbh_init(struct gfs_sbd *sdp, struct buffer_head *bh, uint64_t blkno,
++ char *data);
++void gfs_logbh_uninit(struct gfs_sbd *sdp, struct buffer_head *bh);
++int gfs_logbh_start(struct gfs_sbd *sdp, struct buffer_head *bh);
++int gfs_logbh_wait(struct gfs_sbd *sdp, struct buffer_head *bh);
++
++int gfs_replay_buf(struct gfs_glock *gl, struct buffer_head *bh);
++void gfs_replay_check(struct gfs_sbd *sdp);
++void gfs_replay_wait(struct gfs_sbd *sdp);
++
++void gfs_wipe_buffers(struct gfs_inode *ip, struct gfs_rgrpd *rgd,
++ uint64_t bstart, uint32_t blen);
++
++void gfs_sync_meta(struct gfs_sbd *sdp);
++
++/* Buffer Caching routines */
++
++int gfs_get_meta_buffer(struct gfs_inode *ip, int height, uint64_t num, int new,
++ struct buffer_head **bhp);
++int gfs_get_data_buffer(struct gfs_inode *ip, uint64_t block, int new,
++ struct buffer_head **bhp);
++void gfs_start_ra(struct gfs_glock *gl, uint64_t dblock, uint32_t extlen);
++
++static __inline__ int
++gfs_get_inode_buffer(struct gfs_inode *ip, struct buffer_head **bhp)
++{
++ return gfs_get_meta_buffer(ip, 0, ip->i_num.no_addr, FALSE, bhp);
++}
++
++struct inode *gfs_aspace_get(struct gfs_sbd *sdp);
++void gfs_aspace_put(struct inode *aspace);
++
++void gfs_inval_buf(struct gfs_glock *gl);
++void gfs_sync_buf(struct gfs_glock *gl, int flags);
++
++void gfs_flush_meta_cache(struct gfs_inode *ip);
++
++/* Buffer Content Functions */
++
++/**
++ * gfs_buffer_clear - Zeros out a buffer
++ * @bh: The buffer to zero
++ *
++ */
++
++static __inline__ void
++gfs_buffer_clear(struct buffer_head *bh)
++{
++ memset(bh->b_data, 0, bh->b_size);
++}
++
++/**
++ * gfs_buffer_clear_tail - Clear buffer beyond the dinode
++ * @bh: The buffer containing the on-disk inode
++ * @head: the size of the head of the buffer
++ *
++ * Clears the part of the block that lies beyond the header, i.e. the
++ * data part of a stuffed dinode, or the top level of metadata of a
++ * non-stuffed dinode.
++ */
++
++static __inline__ void
++gfs_buffer_clear_tail(struct buffer_head *bh, int head)
++{
++ memset(bh->b_data + head, 0, bh->b_size - head);
++}
++
++/**
++ * gfs_buffer_clear_ends - Zero out any bits of a buffer which are not being written
++ * @bh: The buffer
++ * @offset: Offset in buffer where write starts
++ * @amount: Amount of data being written
++ * @journaled: TRUE if this is a journaled buffer
++ *
++ */
++
++static __inline__ void
++gfs_buffer_clear_ends(struct buffer_head *bh, int offset, int amount,
++ int journaled)
++{
++ int z_off1 = (journaled) ? sizeof(struct gfs_meta_header) : 0;
++ int z_len1 = offset - z_off1;
++ int z_off2 = offset + amount;
++ int z_len2 = (bh)->b_size - z_off2;
++
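++	/* e.g. a 512-byte write at offset 1024 into a 4096-byte
++	   non-journaled block zeroes bytes [0, 1024) and [1536, 4096) */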
++ if (z_len1)
++ memset(bh->b_data + z_off1, 0, z_len1);
++
++ if (z_len2)
++ memset(bh->b_data + z_off2, 0, z_len2);
++}
++
++/**
++ * gfs_buffer_copy_tail - copies the tail of one buffer to another
++ * @to_bh: the buffer to copy to
++ * @to_head: the size of the head of to_bh
++ * @from_bh: the buffer to copy from
++ * @from_head: the size of the head of from_bh
++ *
++ * from_head is guaranteed to be bigger than to_head
++ */
++
++static __inline__ void
++gfs_buffer_copy_tail(struct buffer_head *to_bh, int to_head,
++ struct buffer_head *from_bh, int from_head)
++{
++ memcpy(to_bh->b_data + to_head,
++ from_bh->b_data + from_head,
++ from_bh->b_size - from_head);
++ memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
++ 0,
++ from_head - to_head);
++}
++
++/**
++ * gfs_buffer_print - print a buffer to the debug console
++ * @bh: the buffer
++ * @string: what to print before the contents of the buffer
++ *
++ */
++
++static __inline__ void
++gfs_buffer_print(struct buffer_head *bh, char *string)
++{
++ unsigned int x, size = (bh)->b_size;
++ unsigned char *c = (bh)->b_data;
++
++ printk("%s\n", string);
++
++ for (x = 0; x < size; x++) {
++ printk("%.2X ", c[x]);
++ if (x % 16 == 15)
++ printk("\n");
++ }
++ if (x % 16 != 0)
++ printk("\n");
++}
++
++#endif /* __DIO_DOT_H__ */
+diff -urN linux-orig/fs/gfs/dir.c linux-patched/fs/gfs/dir.c
+--- linux-orig/fs/gfs/dir.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/dir.c 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,2273 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++* Implements Extendible Hashing as described in:
++* "Extendible Hashing" by Fagin, et al in
++* __ACM Trans. on Database Systems__, Sept 1979.
++*
++*
++* Here's the layout of dirents which is essentially the same as that of ext2
++* within a single block. The field de_name_len is the number of bytes
++* actually required for the name (no null terminator). The field de_rec_len
++* is the number of bytes allocated to the dirent. The offset of the next
++* dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
++* deleted, the preceding dirent inherits its allocated space, ie
++* prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
++* by adding de_rec_len to the current dirent, this essentially causes the
++* deleted dirent to get jumped over when iterating through all the dirents.
++* When deleting the first dirent in a block, there is no previous dirent so
++* the field de_ino is set to zero to designate it as deleted. When allocating
++* a dirent, gfs_dirent_alloc iterates through the dirents in a block. If the
++* first dirent has (de_ino == 0) and de_rec_len is large enough, this first
++* dirent is allocated. Otherwise it must go through all the 'used' dirents
++* searching for one in which the amount of total space minus the amount of
++* used space will provide enough space for the new dirent.
++* There are two types of blocks in which dirents reside. In a stuffed dinode,
++* the dirents begin at offset sizeof(struct gfs_dinode) from the beginning of the block.
++* In leaves, they begin at offset sizeof (struct gfs_leaf) from the beginning of the
++* leaf block. The dirents reside in leaves when
++*
++* dip->i_di.di_regime == GFS_DIR_EXHASH.
++*
++* The dirents are in the stuffed dinode when dip->i_di.di_regime == GFS_DIR_LINEAR.
++* When the dirents are in leaves, the actual contents of the directory file are
++* used as an array of 64-bit block pointers pointing to the leaf blocks. The
++* dirents are NOT in the directory file itself. There can be more than one block
++* pointer in the array that points to the same leaf. In fact, when a directory is
++* first converted from linear to exhash, all of the pointers point to the same
++* leaf. When a leaf is completely full, the size of the hash table can be doubled
++* unless it is already at the maximum size which is hard coded into
++* GFS_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list but
++* never before the maximum hash table size has been reached.
++*/
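++
++/*
++* A worked example (illustrative numbers only): with di_depth == 2, the
++* hash table holds 1 << 2 == 4 leaf pointers, and a name whose hash is
++* 0xC0000000 selects slot
++*
++*   index = hash >> (32 - di_depth) = 0xC0000000 >> 30 = 3.
++*
++* Within a block, dirents are walked by record length:
++*
++*   struct gfs_dirent *dent = first dirent in the block;
++*   char *end = bh->b_data + bh->b_size;
++*
++*   while ((char *)dent + gfs16_to_cpu(dent->de_rec_len) < end)
++*           dent = (struct gfs_dirent *)((char *)dent +
++*                                        gfs16_to_cpu(dent->de_rec_len));
++*
++* which is exactly how a deleted entry, whose space was merged into its
++* predecessor's de_rec_len, gets skipped.
++*/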
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "dir.h"
++#include "file.h"
++#include "glock.h"
++#include "inode.h"
++#include "ioctl.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++
++#define IS_LEAF (1)
++#define IS_DINODE (2)
++
++#if 1
++#define gfs_dir_hash2offset(h) (((uint64_t)(h)) >> 1)
++#define gfs_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p)) << 1))
++#else
++#define gfs_dir_hash2offset(h) (((uint64_t)(h)))
++#define gfs_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p))))
++#endif
++
++typedef int (*leaf_call_t) (struct gfs_inode *dip,
++ uint32_t index, uint32_t len, uint64_t leaf_no,
++ void *data);
++
++/**
++ * gfs_filecmp - Compare two filenames
++ * @file1: The first filename
++ * @file2: The second filename
++ * @len_of_file2: The length of the second filename
++ *
++ * This routine compares two filenames and returns TRUE if they are equal.
++ *
++ * Returns: TRUE (!=0) if the files are the same, otherwise FALSE (0).
++ */
++
++int
++gfs_filecmp(struct qstr *file1, char *file2, int len_of_file2)
++{
++ if (file1->len != len_of_file2)
++ return FALSE;
++ if (memcmp(file1->name, file2, file1->len))
++ return FALSE;
++ return TRUE;
++}
++
++/**
++ * dirent_first - Return the first dirent
++ * @dip: the directory
++ * @bh: The buffer
++ * @dent: Pointer to list of dirents
++ *
++ * Return the first dirent, whether bh points to a leaf or a stuffed dinode
++ *
++ * Returns: IS_LEAF or IS_DINODE
++ */
++
++static int
++dirent_first(struct gfs_inode *dip, struct buffer_head *bh,
++ struct gfs_dirent **dent)
++{
++ struct gfs_meta_header *h = (struct gfs_meta_header *)bh->b_data;
++
++ if (gfs32_to_cpu(h->mh_type) == GFS_METATYPE_LF) {
++ gfs_meta_check(dip->i_sbd, bh);
++ *dent = (struct gfs_dirent *)(bh->b_data + sizeof(struct gfs_leaf));
++ return IS_LEAF;
++ } else {
++ gfs_metatype_check(dip->i_sbd, bh, GFS_METATYPE_DI);
++ *dent = (struct gfs_dirent *)(bh->b_data + sizeof(struct gfs_dinode));
++ return IS_DINODE;
++ }
++}
++
++/**
++ * dirent_next - Next dirent
++ * @dip: the directory
++ * @bh: The buffer
++ * @dent: Pointer to list of dirents
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++dirent_next(struct gfs_inode *dip, struct buffer_head *bh,
++ struct gfs_dirent **dent)
++{
++ struct gfs_dirent *tmp, *cur;
++ char *bh_end;
++ uint32_t cur_rec_len;
++
++ cur = *dent;
++ bh_end = bh->b_data + bh->b_size;
++
++ cur_rec_len = gfs16_to_cpu(cur->de_rec_len);
++
++ if ((char *)cur + cur_rec_len >= bh_end) {
++ GFS_ASSERT_INODE((char *)cur + cur_rec_len == bh_end, dip,);
++ return -ENOENT;
++ }
++
++ tmp = (struct gfs_dirent *)((char *)cur + cur_rec_len);
++
++ GFS_ASSERT_INODE((char *)tmp + gfs16_to_cpu(tmp->de_rec_len) <= bh_end,
++ dip,);
++ /* Only the first dent could ever have de_ino == 0 */
++ GFS_ASSERT_INODE(tmp->de_inum.no_formal_ino, dip,);
++
++ *dent = tmp;
++
++ return 0;
++}
++
++/**
++ * dirent_del - Delete a dirent
++ * @dip: The GFS inode
++ * @bh: The buffer
++ * @prev: The previous dirent
++ * @cur: The current dirent
++ *
++ */
++
++static void
++dirent_del(struct gfs_inode *dip, struct buffer_head *bh,
++ struct gfs_dirent *prev, struct gfs_dirent *cur)
++{
++ uint32_t cur_rec_len, prev_rec_len;
++
++ GFS_ASSERT_INODE(cur->de_inum.no_formal_ino, dip,);
++
++ gfs_trans_add_bh(dip->i_gl, bh);
++
++ /* If there is no prev entry, this is the first entry in the block.
++ The de_rec_len is already as big as it needs to be. Just zero
++ out the inode number and return. */
++
++ if (!prev) {
++		cur->de_inum.no_formal_ino = 0;	/* No endianness worries */
++ return;
++ }
++
++ /* Combine this dentry with the previous one. */
++
++ prev_rec_len = gfs16_to_cpu(prev->de_rec_len);
++ cur_rec_len = gfs16_to_cpu(cur->de_rec_len);
++
++ GFS_ASSERT_INODE((char *)prev + prev_rec_len == (char *)cur, dip,);
++ GFS_ASSERT_INODE((char *)cur + cur_rec_len <=
++ bh->b_data + bh->b_size, dip,);
++
++ prev_rec_len += cur_rec_len;
++ prev->de_rec_len = cpu_to_gfs16(prev_rec_len);
++}
++
++/**
++ * gfs_dirent_alloc - Allocate a directory entry
++ * @dip: The GFS inode
++ * @bh: The buffer
++ * @name_len: The length of the name
++ * @dent_out: Pointer to list of dirents
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_dirent_alloc(struct gfs_inode *dip, struct buffer_head *bh, int name_len,
++ struct gfs_dirent **dent_out)
++{
++ struct gfs_dirent *dent, *new;
++ unsigned int rec_len = GFS_DIRENT_SIZE(name_len);
++ unsigned int entries = 0, offset = 0, x = 0;
++ int type;
++
++ type = dirent_first(dip, bh, &dent);
++
++ if (type == IS_LEAF) {
++ struct gfs_leaf *leaf = (struct gfs_leaf *)bh->b_data;
++ entries = gfs16_to_cpu(leaf->lf_entries);
++ offset = sizeof(struct gfs_leaf);
++ } else {
++ struct gfs_dinode *dinode = (struct gfs_dinode *)bh->b_data;
++ entries = gfs32_to_cpu(dinode->di_entries);
++ offset = sizeof(struct gfs_dinode);
++ }
++
++ if (!entries) {
++ gfs_trans_add_bh(dip->i_gl, bh);
++
++ dent->de_rec_len = bh->b_size - offset;
++ dent->de_rec_len = cpu_to_gfs16(dent->de_rec_len);
++ dent->de_name_len = cpu_to_gfs16(name_len);
++
++ *dent_out = dent;
++ return 0;
++ }
++
++ do {
++ uint32_t cur_rec_len, cur_name_len;
++
++ cur_rec_len = gfs16_to_cpu(dent->de_rec_len);
++ cur_name_len = gfs16_to_cpu(dent->de_name_len);
++
++ if ((!dent->de_inum.no_formal_ino && cur_rec_len >= rec_len) ||
++ (cur_rec_len >= GFS_DIRENT_SIZE(cur_name_len) + rec_len)) {
++ gfs_trans_add_bh(dip->i_gl, bh);
++
++ if (dent->de_inum.no_formal_ino) {
++ new = (struct gfs_dirent *)((char *)dent +
++ GFS_DIRENT_SIZE(cur_name_len));
++ memset(new, 0, sizeof(struct gfs_dirent));
++
++ new->de_rec_len = cpu_to_gfs16(cur_rec_len -
++ GFS_DIRENT_SIZE(cur_name_len));
++ new->de_name_len = cpu_to_gfs16(name_len);
++
++ dent->de_rec_len = cur_rec_len - gfs16_to_cpu(new->de_rec_len);
++ dent->de_rec_len = cpu_to_gfs16(dent->de_rec_len);
++
++ *dent_out = new;
++ return 0;
++ }
++
++ dent->de_name_len = cpu_to_gfs16(name_len);
++
++ *dent_out = dent;
++ return 0;
++ }
++
++ GFS_ASSERT_INODE(x < entries, dip,);
++
++ if (dent->de_inum.no_formal_ino)
++ x++;
++ }
++ while (dirent_next(dip, bh, &dent) == 0);
++
++ return -ENOSPC;
++}
++
++/**
++ * dirent_fits - See if we can fit an entry in this buffer
++ * @dip: The GFS inode
++ * @bh: The buffer
++ * @name_len: The length of the name
++ *
++ * Returns: TRUE if it can fit, FALSE otherwise
++ */
++
++static int
++dirent_fits(struct gfs_inode *dip, struct buffer_head *bh, int name_len)
++{
++ struct gfs_dirent *dent;
++ unsigned int rec_len = GFS_DIRENT_SIZE(name_len);
++ unsigned int entries = 0, x = 0;
++ int type;
++
++ type = dirent_first(dip, bh, &dent);
++
++ if (type == IS_LEAF) {
++ struct gfs_leaf *leaf = (struct gfs_leaf *)bh->b_data;
++ entries = gfs16_to_cpu(leaf->lf_entries);
++ } else {
++ struct gfs_dinode *dinode = (struct gfs_dinode *)bh->b_data;
++ entries = gfs32_to_cpu(dinode->di_entries);
++ }
++
++ if (!entries)
++ return TRUE;
++
++ do {
++ uint32_t cur_rec_len, cur_name_len;
++
++ cur_rec_len = gfs16_to_cpu(dent->de_rec_len);
++ cur_name_len = gfs16_to_cpu(dent->de_name_len);
++
++ if ((!dent->de_inum.no_formal_ino && cur_rec_len >= rec_len) ||
++ (cur_rec_len >= GFS_DIRENT_SIZE(cur_name_len) + rec_len))
++ return TRUE;
++
++ GFS_ASSERT_INODE(x < entries, dip,);
++
++ if (dent->de_inum.no_formal_ino)
++ x++;
++ }
++ while (dirent_next(dip, bh, &dent) == 0);
++
++ return FALSE;
++}
++
++/**
++ * leaf_search - look for a name in a leaf (or stuffed dinode) block
++ * @dip: the directory
++ * @bh: the buffer holding the block to search
++ * @filename: the name to look for
++ * @dent_out: the dirent, if found
++ * @dent_prev: if non-NULL, the dirent preceding @dent_out is returned here
++ *
++ * Returns: 0 if found, -ENOENT if not
++ */
++
++static int
++leaf_search(struct gfs_inode *dip,
++ struct buffer_head *bh, struct qstr *filename,
++ struct gfs_dirent **dent_out, struct gfs_dirent **dent_prev)
++{
++ uint32_t hash;
++ struct gfs_dirent *dent, *prev = NULL;
++ unsigned int entries = 0, x = 0;
++ int type;
++
++ type = dirent_first(dip, bh, &dent);
++
++ if (type == IS_LEAF) {
++ struct gfs_leaf *leaf = (struct gfs_leaf *)bh->b_data;
++ entries = gfs16_to_cpu(leaf->lf_entries);
++ } else if (type == IS_DINODE) {
++ struct gfs_dinode *dinode = (struct gfs_dinode *)bh->b_data;
++ entries = gfs32_to_cpu(dinode->di_entries);
++ }
++
++ hash = gfs_dir_hash(filename->name, filename->len);
++
++ do {
++ if (!dent->de_inum.no_formal_ino) {
++ prev = dent;
++ continue;
++ }
++
++ if (gfs32_to_cpu(dent->de_hash) == hash &&
++ gfs_filecmp(filename, (char *)(dent + 1),
++ gfs16_to_cpu(dent->de_name_len))) {
++ *dent_out = dent;
++ if (dent_prev)
++ *dent_prev = prev;
++
++ return 0;
++ }
++
++ GFS_ASSERT_INODE(x < entries, dip,);
++ x++;
++ prev = dent;
++ }
++ while (dirent_next(dip, bh, &dent) == 0);
++
++ return -ENOENT;
++}
++
++/**
++ * get_leaf - Read in a directory leaf block
++ * @dip: the directory
++ * @leaf_no: the block number of the leaf
++ * @bhp: the leaf's buffer is returned here
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++get_leaf(struct gfs_inode *dip, uint64_t leaf_no, struct buffer_head **bhp)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ int error;
++
++ error = gfs_dread(sdp, leaf_no, dip->i_gl, DIO_START | DIO_WAIT, bhp);
++ if (!error)
++ gfs_metatype_check(sdp, *bhp, GFS_METATYPE_LF);
++
++ return error;
++}
++
++/**
++ * get_leaf_nr - Get a leaf number associated with the index
++ * @dip: The GFS inode
++ * @index: the slot in the hash table
++ * @leaf_out: the leaf block number is returned here
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++get_leaf_nr(struct gfs_inode *dip, uint32_t index, uint64_t *leaf_out)
++{
++ uint64_t leaf_no;
++ int error;
++
++ error = gfs_internal_read(dip, (char *)&leaf_no,
++ index * sizeof(uint64_t),
++ sizeof(uint64_t));
++ if (error != sizeof(uint64_t))
++ return (error < 0) ? error : -EIO;
++
++ *leaf_out = gfs64_to_cpu(leaf_no);
++
++ return 0;
++}
++
++/**
++ * get_first_leaf - Get first leaf
++ * @dip: The GFS inode
++ * @index: the slot in the hash table
++ * @bh_out: the first leaf's buffer is returned here
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++get_first_leaf(struct gfs_inode *dip, uint32_t index,
++ struct buffer_head **bh_out)
++{
++ uint64_t leaf_no;
++ int error;
++
++ error = get_leaf_nr(dip, index, &leaf_no);
++ if (!error)
++ error = get_leaf(dip, leaf_no, bh_out);
++
++ return error;
++}
++
++/**
++ * get_next_leaf - Get next leaf
++ * @dip: The GFS inode
++ * @bh_in: the buffer holding the current leaf
++ * @bh_out: the next chained leaf's buffer is returned here
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++get_next_leaf(struct gfs_inode *dip, struct buffer_head *bh_in,
++ struct buffer_head **bh_out)
++{
++ struct gfs_leaf *leaf;
++ int error;
++
++ leaf = (struct gfs_leaf *)bh_in->b_data;
++
++ if (!leaf->lf_next)
++ error = -ENOENT;
++ else
++ error = get_leaf(dip, gfs64_to_cpu(leaf->lf_next), bh_out);
++
++ return error;
++}
++
++/**
++ * linked_leaf_search - Linked leaf search
++ * @dip: The GFS inode
++ * @filename: The filename to search for
++ * @dent_out: the dirent, if found
++ * @dent_prev: the preceding dirent, if requested
++ * @bh_out: the buffer holding the dirent is returned here
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++linked_leaf_search(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_dirent **dent_out, struct gfs_dirent **dent_prev,
++ struct buffer_head **bh_out)
++{
++ struct buffer_head *bh = NULL, *bh_next;
++ uint32_t hsize, index;
++ uint32_t hash;
++ int error;
++
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size, dip,);
++
++ /* Figure out the address of the leaf node. */
++
++ hash = gfs_dir_hash(filename->name, filename->len);
++ index = hash >> (32 - dip->i_di.di_depth);
++
++ error = get_first_leaf(dip, index, &bh_next);
++ if (error)
++ return error;
++
++ /* Find the entry */
++
++ do {
++ if (bh)
++ brelse(bh);
++
++ bh = bh_next;
++
++ error = leaf_search(dip, bh, filename, dent_out, dent_prev);
++ switch (error) {
++ case 0:
++ *bh_out = bh;
++ return 0;
++
++ case -ENOENT:
++ break;
++
++ default:
++ brelse(bh);
++ return error;
++ }
++
++ error = get_next_leaf(dip, bh, &bh_next);
++ }
++ while (!error);
++
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * dir_make_exhash - Convert a stuffed directory into an ExHash directory
++ * @dip: The GFS inode
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++dir_make_exhash(struct gfs_inode *dip)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_dirent *dent;
++ struct buffer_head *bh, *dibh;
++ struct gfs_leaf *leaf;
++ int y;
++ uint32_t x;
++ uint64_t *lp, bn;
++ int error;
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ /* Allocate a new block for the first leaf node */
++
++ error = gfs_metaalloc(dip, &bn);
++ if (error)
++ goto fail;
++
++ /* Turn over a new leaf */
++
++ error = gfs_dread(sdp, bn, dip->i_gl, DIO_NEW | DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto fail;
++
++ gfs_trans_add_bh(dip->i_gl, bh);
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_LF, GFS_FORMAT_LF);
++ gfs_buffer_clear_tail(bh, sizeof(struct gfs_meta_header));
++
++ /* Fill in the leaf structure */
++
++ leaf = (struct gfs_leaf *)bh->b_data;
++
++ GFS_ASSERT_INODE(dip->i_di.di_entries < (1 << 16), dip,);
++
++ leaf->lf_dirent_format = cpu_to_gfs32(GFS_FORMAT_DE);
++ leaf->lf_entries = cpu_to_gfs16(dip->i_di.di_entries);
++
++ /* Copy dirents */
++
++ gfs_buffer_copy_tail(bh, sizeof(struct gfs_leaf), dibh,
++ sizeof(struct gfs_dinode));
++
++ /* Find last entry */
++
++ x = 0;
++ dirent_first(dip, bh, &dent);
++
++ do {
++ if (!dent->de_inum.no_formal_ino)
++ continue;
++ if (++x == dip->i_di.di_entries)
++ break;
++ }
++ while (dirent_next(dip, bh, &dent) == 0);
++
++ /* Adjust the last dirent's record length
++ (Remember that dent still points to the last entry.) */
++
++ dent->de_rec_len = gfs16_to_cpu(dent->de_rec_len) +
++ sizeof(struct gfs_dinode) -
++ sizeof(struct gfs_leaf);
++ dent->de_rec_len = cpu_to_gfs16(dent->de_rec_len);
++
++ brelse(bh);
++
++ /* We're done with the new leaf block, now setup the new
++ hash table. */
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++ gfs_buffer_clear_tail(dibh, sizeof (struct gfs_dinode));
++
++ lp = (uint64_t *)(dibh->b_data + sizeof(struct gfs_dinode));
++
++ for (x = sdp->sd_hash_ptrs; x--; lp++)
++ *lp = cpu_to_gfs64(bn);
++
++ dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
++ dip->i_di.di_blocks++;
++ dip->i_di.di_flags |= GFS_DIF_EXHASH;
++ dip->i_di.di_payload_format = 0;
++
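++	/* set di_depth to log2(the number of pointers in the hash table) */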
++ for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
++ dip->i_di.di_depth = y;
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ return 0;
++
++ fail:
++ brelse(dibh);
++ return error;
++}
++
++/**
++ * dir_split_leaf - Split a leaf block into two
++ * @dip: The GFS inode
++ * @index: a hash table slot that points to the leaf being split
++ * @leaf_no: the block number of the leaf to split
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_split_leaf(struct gfs_inode *dip, uint32_t index, uint64_t leaf_no)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *nbh, *obh, *dibh;
++ struct gfs_leaf *nleaf, *oleaf;
++ struct gfs_dirent *dent, *prev = NULL, *next = NULL, *new;
++ uint32_t start, len, half_len, divider;
++ uint64_t bn, *lp;
++ uint32_t name_len;
++ int x, moved = FALSE;
++ int error;
++
++ /* Allocate the new leaf block */
++
++ error = gfs_metaalloc(dip, &bn);
++ if (error)
++ return error;
++
++ /* Get the new leaf block */
++
++ error = gfs_dread(sdp, bn, dip->i_gl,
++ DIO_NEW | DIO_START | DIO_WAIT, &nbh);
++ if (error)
++ return error;
++
++ gfs_trans_add_bh(dip->i_gl, nbh);
++ gfs_metatype_set(sdp, nbh, GFS_METATYPE_LF, GFS_FORMAT_LF);
++ gfs_buffer_clear_tail(nbh, sizeof (struct gfs_meta_header));
++
++ nleaf = (struct gfs_leaf *)nbh->b_data;
++
++ nleaf->lf_dirent_format = cpu_to_gfs32(GFS_FORMAT_DE);
++
++ /* Get the old leaf block */
++
++ error = get_leaf(dip, leaf_no, &obh);
++ if (error)
++ goto fail;
++
++ gfs_trans_add_bh(dip->i_gl, obh);
++
++ oleaf = (struct gfs_leaf *)obh->b_data;
++
++ /* Compute the start and len of leaf pointers in the hash table. */
++
++ len = 1 << (dip->i_di.di_depth - gfs16_to_cpu(oleaf->lf_depth));
++ GFS_ASSERT_INODE(len != 1, dip,);
++ half_len = len >> 1;
++
++ start = (index & ~(len - 1));
++
++ /* Change the pointers.
++ Don't bother distinguishing stuffed from non-stuffed.
++ This code is complicated enough already. */
++
++ lp = gmalloc(half_len * sizeof(uint64_t));
++
++ error = gfs_internal_read(dip, (char *)lp, start * sizeof(uint64_t),
++ half_len * sizeof(uint64_t));
++ if (error != half_len * sizeof(uint64_t)) {
++ if (error >= 0)
++ error = -EIO;
++ goto fail_lpfree;
++ }
++
++ /* Change the pointers */
++
++ for (x = 0; x < half_len; x++)
++ lp[x] = cpu_to_gfs64(bn);
++
++ error = gfs_internal_write(dip, (char *)lp, start * sizeof(uint64_t),
++ half_len * sizeof(uint64_t));
++ if (error != half_len * sizeof(uint64_t)) {
++ if (error >= 0)
++ error = -EIO;
++ goto fail_lpfree;
++ }
++
++ kfree(lp);
++
++ /* Compute the divider */
++
++ divider = (start + half_len) << (32 - dip->i_di.di_depth);
++
++ /* Copy the entries */
++
++ dirent_first(dip, obh, &dent);
++
++ do {
++ next = dent;
++ if (dirent_next(dip, obh, &next))
++ next = NULL;
++
++ if (dent->de_inum.no_formal_ino &&
++ gfs32_to_cpu(dent->de_hash) < divider) {
++ name_len = gfs16_to_cpu(dent->de_name_len);
++
++ error = gfs_dirent_alloc(dip, nbh, name_len, &new);
++ GFS_ASSERT_INODE(!error, dip,);
++
++ new->de_inum = dent->de_inum; /* No endianness worries */
++ new->de_hash = dent->de_hash; /* No endianness worries */
++ new->de_type = dent->de_type; /* No endianness worries */
++ memcpy((char *)(new + 1), (char *)(dent + 1),
++ name_len);
++
++ nleaf->lf_entries = gfs16_to_cpu(nleaf->lf_entries) + 1;
++ nleaf->lf_entries = cpu_to_gfs16(nleaf->lf_entries);
++
++ dirent_del(dip, obh, prev, dent);
++
++ GFS_ASSERT_INODE(gfs16_to_cpu(oleaf->lf_entries), dip,);
++ oleaf->lf_entries = gfs16_to_cpu(oleaf->lf_entries) - 1;
++ oleaf->lf_entries = cpu_to_gfs16(oleaf->lf_entries);
++
++ if (!prev)
++ prev = dent;
++
++ moved = TRUE;
++ } else
++ prev = dent;
++
++ dent = next;
++ }
++ while (dent);
++
++ /* If none of the entries got moved into the new leaf,
++ artificially fill in the first entry. */
++
++ if (!moved) {
++ error = gfs_dirent_alloc(dip, nbh, 0, &new);
++ GFS_ASSERT_INODE(!error, dip,);
++ new->de_inum.no_formal_ino = 0;
++ }
++
++ oleaf->lf_depth = gfs16_to_cpu(oleaf->lf_depth) + 1;
++ oleaf->lf_depth = cpu_to_gfs16(oleaf->lf_depth);
++ nleaf->lf_depth = oleaf->lf_depth;
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ GFS_ASSERT_INODE(!error, dip,); /* Pinned in gfs_internal_write() */
++
++ dip->i_di.di_blocks++;
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ brelse(obh);
++ brelse(nbh);
++
++ return 0;
++
++ fail_lpfree:
++ kfree(lp);
++
++ brelse(obh);
++
++ fail:
++ brelse(nbh);
++ return error;
++}
++
++/**
++ * dir_double_exhash - Double size of ExHash table
++ * @dip: The GFS dinode
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_double_exhash(struct gfs_inode *dip)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *dibh;
++ uint32_t hsize;
++ uint64_t *buf;
++ uint64_t *from, *to;
++ uint64_t block;
++ int x;
++ int error = 0;
++
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size, dip,);
++
++ /* Allocate both the "from" and "to" buffers in one big chunk */
++
++ buf = gmalloc(3 * sdp->sd_hash_bsize);
++
++ for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
++ error = gfs_internal_read(dip, (char *)buf,
++ block * sdp->sd_hash_bsize,
++ sdp->sd_hash_bsize);
++ if (error != sdp->sd_hash_bsize) {
++ if (error >= 0)
++ error = -EIO;
++ goto fail;
++ }
++
++ from = buf;
++ to = (uint64_t *)((char *)buf + sdp->sd_hash_bsize);
++
++ for (x = sdp->sd_hash_ptrs; x--; from++) {
++ *to++ = *from; /* No endianness worries */
++ *to++ = *from;
++ }
++
++ error = gfs_internal_write(dip, (char *)buf + sdp->sd_hash_bsize,
++ block * sdp->sd_sb.sb_bsize,
++ sdp->sd_sb.sb_bsize);
++ if (error != sdp->sd_sb.sb_bsize) {
++ if (error >= 0)
++ error = -EIO;
++ goto fail;
++ }
++ }
++
++ kfree(buf);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ GFS_ASSERT_INODE(!error, dip,); /* Pinned in gfs_internal_write() */
++
++ dip->i_di.di_depth++;
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++
++ fail:
++ kfree(buf);
++
++ return error;
++}
++
++/**
++ * compare_dents - compare directory entries by hash value
++ * @a: first dent
++ * @b: second dent
++ *
++ * When comparing the hash entries of @a to @b:
++ * gt: returns 1
++ * lt: returns -1
++ * eq: returns 0
++ */
++
++static int
++compare_dents(void *a, void *b)
++{
++ struct gfs_dirent *dent_a, *dent_b;
++ uint32_t hash_a, hash_b;
++ int ret = 0;
++
++ dent_a = *(struct gfs_dirent **)a;
++ hash_a = dent_a->de_hash;
++ hash_a = gfs32_to_cpu(hash_a);
++
++ dent_b = *(struct gfs_dirent **)b;
++ hash_b = dent_b->de_hash;
++ hash_b = gfs32_to_cpu(hash_b);
++
++ if (hash_a > hash_b)
++ ret = 1;
++ else if (hash_a < hash_b)
++ ret = -1;
++ else {
++ unsigned int len_a = gfs16_to_cpu(dent_a->de_name_len);
++ unsigned int len_b = gfs16_to_cpu(dent_b->de_name_len);
++
++ if (len_a > len_b)
++ ret = 1;
++ else if (len_a < len_b)
++ ret = -1;
++ else
++ ret = memcmp((char *)(dent_a + 1),
++ (char *)(dent_b + 1),
++ len_a);
++ }
++
++ return ret;
++}
++
++/**
++ * do_filldir_main - read out directory entries
++ * @dip: The GFS inode
++ * @offset: The offset in the file to read from
++ * @opaque: opaque data to pass to filldir
++ * @filldir: The function to pass entries to
++ * @darr: an array of struct gfs_dirent pointers to read
++ * @entries: the number of entries in darr
++ * @copied: pointer to int that's non-zero if an entry has been copied out
++ *
++ * Jump through some hoops to make sure that if there are hash collisions,
++ * they are read out at the beginning of a buffer. We want to minimize
++ * the possibility that they will fall into different readdir buffers or
++ * that someone will want to seek to that location.
++ *
++ * Returns: 0 on success, -EXXX on failure, >0 on exception from filldir
++ */
++
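++ /* Illustrative note (restating the comment above with an example): two
++ entries whose hashes collide map to the same readdir offset and form a
++ "run"; if a run would begin after something has already been copied
++ into the buffer, the loop below bails out (returns 1) so the entire
++ run is emitted at the start of the next readdir buffer instead. */
++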
++static int
++do_filldir_main(struct gfs_inode *dip, uint64_t *offset,
++ void *opaque, gfs_filldir_t filldir,
++ struct gfs_dirent **darr, uint32_t entries, int *copied)
++{
++ struct gfs_dirent *dent, *dent_next;
++ struct gfs_inum inum;
++ uint64_t off, off_next;
++ unsigned int x, y;
++ int run = FALSE;
++ int error = 0;
++
++ gfs_sort(darr, entries, sizeof(struct gfs_dirent *), compare_dents);
++
++ dent_next = darr[0];
++ off_next = gfs32_to_cpu(dent_next->de_hash);
++ off_next = gfs_dir_hash2offset(off_next);
++
++ for (x = 0, y = 1; x < entries; x++, y++) {
++ dent = dent_next;
++ off = off_next;
++
++ if (y < entries) {
++ dent_next = darr[y];
++ off_next = gfs32_to_cpu(dent_next->de_hash);
++ off_next = gfs_dir_hash2offset(off_next);
++
++ if (off < *offset)
++ continue;
++ *offset = off;
++
++ if (off_next == off) {
++ if (*copied && !run)
++ return 1;
++ run = TRUE;
++ } else
++ run = FALSE;
++ } else {
++ if (off < *offset)
++ continue;
++ *offset = off;
++ }
++
++ gfs_inum_in(&inum, (char *)&dent->de_inum);
++
++ error = filldir(opaque, (char *)(dent + 1),
++ gfs16_to_cpu(dent->de_name_len),
++ off, &inum,
++ gfs16_to_cpu(dent->de_type));
++ if (error)
++ return 1;
++
++ *copied = TRUE;
++ }
++
++ /* Increment *offset by one, so the next time we come into a do_filldir
++ function, we get the next entry instead of the last one in the current leaf */
++
++ (*offset)++;
++
++ return 0;
++}
++
++/**
++ * do_filldir_single - Read directory entries out of a single block
++ * @dip: The GFS inode
++ * @offset: The offset in the file to read from
++ * @opaque: opaque data to pass to filldir
++ * @filldir: The function to pass entries to
++ * @bh: the block
++ * @entries: the number of entries in the block
++ * @copied: pointer to int that's non-zero if an entry has been copied out
++ *
++ * Returns: 0 on success, -EXXX on failure, >0 on exception from filldir
++ */
++
++static int
++do_filldir_single(struct gfs_inode *dip, uint64_t *offset,
++ void *opaque, gfs_filldir_t filldir,
++ struct buffer_head *bh, uint32_t entries, int *copied)
++{
++ struct gfs_dirent **darr;
++ struct gfs_dirent *de;
++ unsigned int e = 0;
++ int error = 0;
++
++ if (!entries)
++ return 0;
++
++ darr = gmalloc(entries * sizeof(struct gfs_dirent *));
++
++ dirent_first(dip, bh, &de);
++ do {
++ if (!de->de_inum.no_formal_ino)
++ continue;
++ darr[e++] = de;
++ }
++ while (dirent_next(dip, bh, &de) == 0);
++
++ GFS_ASSERT_INODE(e == entries, dip,);
++
++ error = do_filldir_main(dip, offset, opaque, filldir, darr,
++ entries, copied);
++
++ kfree(darr);
++
++ return error;
++}
++
++/**
++ * do_filldir_multi - Read directory entries out of a linked leaf list
++ * @dip: The GFS inode
++ * @offset: The offset in the file to read from
++ * @opaque: opaque data to pass to filldir
++ * @filldir: The function to pass entries to
++ * @bh: the first leaf in the list
++ * @copied: pointer to int that's non-zero if an entry has been copied out
++ *
++ * Returns: 0 on success, -EXXX on failure, >0 on exception from filldir
++ */
++
++static int
++do_filldir_multi(struct gfs_inode *dip, uint64_t *offset,
++ void *opaque, gfs_filldir_t filldir,
++ struct buffer_head *bh, int *copied)
++{
++ struct buffer_head **larr = NULL;
++ struct gfs_dirent **darr;
++ struct gfs_leaf *leaf;
++ struct buffer_head *tmp_bh;
++ struct gfs_dirent *de;
++ unsigned int entries, e = 0;
++ unsigned int leaves = 0, l = 0;
++ unsigned int x;
++ uint64_t ln;
++ int error = 0;
++
++ /* Count leaves and entries */
++
++ leaf = (struct gfs_leaf *)bh->b_data;
++ entries = gfs16_to_cpu(leaf->lf_entries);
++ ln = leaf->lf_next;
++
++ while (ln) {
++ ln = gfs64_to_cpu(ln);
++
++ error = get_leaf(dip, ln, &tmp_bh);
++ if (error)
++ return error;
++
++ leaf = (struct gfs_leaf *)tmp_bh->b_data;
++ if (leaf->lf_entries) {
++ entries += gfs16_to_cpu(leaf->lf_entries);
++ leaves++;
++ }
++ ln = leaf->lf_next;
++
++ brelse(tmp_bh);
++ }
++
++ /* Bail out if there's nothing to do */
++
++ if (!entries)
++ return 0;
++
++ /* Alloc arrays */
++
++ if (leaves)
++ larr = gmalloc(leaves * sizeof(struct buffer_head *));
++
++ darr = gmalloc(entries * sizeof(struct gfs_dirent *));
++
++ /* Fill in arrays */
++
++ leaf = (struct gfs_leaf *)bh->b_data;
++ if (leaf->lf_entries) {
++ dirent_first(dip, bh, &de);
++ do {
++ if (!de->de_inum.no_formal_ino)
++ continue;
++ darr[e++] = de;
++ }
++ while (dirent_next(dip, bh, &de) == 0);
++ }
++ ln = leaf->lf_next;
++
++ while (ln) {
++ ln = gfs64_to_cpu(ln);
++
++ error = get_leaf(dip, ln, &tmp_bh);
++ if (error)
++ goto out;
++
++ leaf = (struct gfs_leaf *)tmp_bh->b_data;
++ if (leaf->lf_entries) {
++ dirent_first(dip, tmp_bh, &de);
++ do {
++ if (!de->de_inum.no_formal_ino)
++ continue;
++ darr[e++] = de;
++ }
++ while (dirent_next(dip, tmp_bh, &de) == 0);
++
++ larr[l++] = tmp_bh;
++
++ ln = leaf->lf_next;
++ } else {
++ ln = leaf->lf_next;
++ brelse(tmp_bh);
++ }
++ }
++
++ GFS_ASSERT_INODE(l == leaves, dip,);
++ GFS_ASSERT_INODE(e == entries, dip,);
++
++ /* Do work */
++
++ error = do_filldir_main(dip, offset, opaque, filldir, darr,
++ entries, copied);
++
++ /* Clean up */
++
++ out:
++ kfree(darr);
++
++ for (x = 0; x < l; x++)
++ brelse(larr[x]);
++
++ if (leaves)
++ kfree(larr);
++
++ return error;
++}
++
++/**
++ * dir_e_search - search an exhash directory for an entry
++ * @dip: The GFS inode
++ * @filename: the filename to look up
++ * @inum: if found, filled in with the entry's inode number
++ * @type: if found, filled in with the entry's type
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_e_search(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int *type)
++{
++ struct buffer_head *bh;
++ struct gfs_dirent *dent;
++ int error;
++
++ error = linked_leaf_search(dip, filename, &dent, NULL, &bh);
++ if (error)
++ return error;
++
++ if (inum)
++ gfs_inum_in(inum, (char *)&dent->de_inum);
++ if (type)
++ *type = gfs16_to_cpu(dent->de_type);
++
++ brelse(bh);
++
++ return 0;
++}
++
++/**
++ * dir_e_add - add an entry to an exhash directory
++ * @dip: The GFS inode
++ * @filename: the new name
++ * @inum: the inode number of the entry
++ * @type: the type of the entry
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_e_add(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int type)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *bh, *nbh, *dibh;
++ struct gfs_leaf *leaf, *nleaf;
++ struct gfs_dirent *dent;
++ uint32_t hsize, index;
++ uint32_t hash;
++ uint64_t leaf_no, bn;
++ int error;
++
++ restart:
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size, dip,);
++
++ /* Figure out the address of the leaf node. */
++
++ hash = gfs_dir_hash(filename->name, filename->len);
++ index = hash >> (32 - dip->i_di.di_depth);
++
++ error = get_leaf_nr(dip, index, &leaf_no);
++ if (error)
++ return error;
++
++ /* Add entry to the leaf */
++
++ for (;;) {
++ error = get_leaf(dip, leaf_no, &bh);
++ if (error)
++ return error;
++
++ leaf = (struct gfs_leaf *)bh->b_data;
++
++ if (gfs_dirent_alloc(dip, bh, filename->len, &dent)) {
++
++ if (gfs16_to_cpu(leaf->lf_depth) < dip->i_di.di_depth) {
++ /* Can we split the leaf? */
++
++ brelse(bh);
++
++ error = dir_split_leaf(dip, index, leaf_no);
++ if (error)
++ return error;
++
++ goto restart;
++
++ } else if (dip->i_di.di_depth < GFS_DIR_MAX_DEPTH) {
++ /* Can we double the hash table? */
++
++ brelse(bh);
++
++ error = dir_double_exhash(dip);
++ if (error)
++ return error;
++
++ goto restart;
++
++ } else if (leaf->lf_next) {
++ /* Can we try the next leaf in the list? */
++ leaf_no = gfs64_to_cpu(leaf->lf_next);
++ brelse(bh);
++ continue;
++
++ } else {
++ /* Create a new leaf and add it to the list. */
++
++ error = gfs_metaalloc(dip, &bn);
++ if (error) {
++ brelse(bh);
++ return error;
++ }
++
++ error = gfs_dread(sdp, bn, dip->i_gl,
++ DIO_NEW | DIO_START | DIO_WAIT,
++ &nbh);
++ if (error) {
++ brelse(bh);
++ return error;
++ }
++
++ gfs_trans_add_bh(dip->i_gl, nbh);
++ gfs_metatype_set(sdp, nbh, GFS_METATYPE_LF,
++ GFS_FORMAT_LF);
++ gfs_buffer_clear_tail(nbh,
++ sizeof(struct gfs_meta_header));
++
++ gfs_trans_add_bh(dip->i_gl, bh);
++ leaf->lf_next = cpu_to_gfs64(bn);
++
++ nleaf = (struct gfs_leaf *)nbh->b_data;
++ nleaf->lf_depth = leaf->lf_depth;
++ nleaf->lf_dirent_format = cpu_to_gfs32(GFS_FORMAT_DE);
++
++ if (gfs_dirent_alloc(dip, nbh, filename->len, &dent))
++ GFS_ASSERT_INODE(FALSE, dip,);
++
++ dip->i_di.di_blocks++;
++
++ brelse(bh);
++
++ bh = nbh;
++ leaf = nleaf;
++ }
++ }
++
++ /* If the gfs_dirent_alloc() succeeded, it pinned the "bh". */
++
++ gfs_inum_out(inum, (char *)&dent->de_inum);
++ dent->de_hash = cpu_to_gfs32(hash);
++ dent->de_type = cpu_to_gfs16(type);
++ memcpy((char *)(dent + 1), filename->name, filename->len);
++
++ leaf->lf_entries = gfs16_to_cpu(leaf->lf_entries) + 1;
++ leaf->lf_entries = cpu_to_gfs16(leaf->lf_entries);
++
++ brelse(bh);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ dip->i_di.di_entries++;
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++ }
++
++ return -ENOENT;
++}
++
++/**
++ * dir_e_del - delete an entry from an exhash directory
++ * @dip: The GFS inode
++ * @filename: the name of the entry to delete
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_e_del(struct gfs_inode *dip, struct qstr *filename)
++{
++ struct buffer_head *bh, *dibh;
++ struct gfs_dirent *dent, *prev;
++ struct gfs_leaf *leaf;
++ unsigned int entries;
++ int error;
++
++ error = linked_leaf_search(dip, filename, &dent, &prev, &bh);
++ GFS_ASSERT_INODE(error != -ENOENT, dip,);
++ if (error)
++ return error;
++
++ dirent_del(dip, bh, prev, dent); /* Pins bh */
++
++ leaf = (struct gfs_leaf *)bh->b_data;
++ entries = gfs16_to_cpu(leaf->lf_entries);
++ GFS_ASSERT_INODE(entries, dip,);
++ entries--;
++ leaf->lf_entries = cpu_to_gfs16(entries);
++
++ brelse(bh);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_entries, dip,);
++ dip->i_di.di_entries--;
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * dir_e_read - Reads the entries from a directory into a filldir buffer
++ * @dip: dinode pointer
++ * @offset: the hash of the last entry read shifted to the right once
++ * @opaque: buffer for the filldir function to fill
++ * @filldir: points to the filldir function to use
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
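++/* Offset/hash mapping, for illustration (per the @offset description
++ above): an entry with de_hash 0x80000000 is reported back at readdir
++ offset 0x40000000, and gfs_dir_offset2hash() below recovers the hash
++ range to resume from. */
++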
++static int
++dir_e_read(struct gfs_inode *dip, uint64_t *offset, void *opaque,
++ gfs_filldir_t filldir)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *bh;
++ struct gfs_leaf leaf;
++ uint32_t hsize, len;
++ uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
++ uint32_t hash, index;
++ uint64_t *lp;
++ int copied = FALSE;
++ int error = 0;
++
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size, dip,);
++
++ hash = gfs_dir_offset2hash(*offset);
++ index = hash >> (32 - dip->i_di.di_depth);
++
++ lp = gmalloc(sdp->sd_hash_bsize);
++
++ while (index < hsize) {
++ lp_offset = index & (sdp->sd_hash_ptrs - 1);
++ ht_offset = index - lp_offset;
++
++ if (ht_offset_cur != ht_offset) {
++ error = gfs_internal_read(dip, (char *)lp,
++ ht_offset * sizeof(uint64_t),
++ sdp->sd_hash_bsize);
++ if (error != sdp->sd_hash_bsize) {
++ if (error >= 0)
++ error = -EIO;
++ goto out;
++ }
++ ht_offset_cur = ht_offset;
++ }
++
++ error = get_leaf(dip, gfs64_to_cpu(lp[lp_offset]), &bh);
++ if (error)
++ goto out;
++
++ gfs_leaf_in(&leaf, bh->b_data);
++
++ if (leaf.lf_next)
++ error = do_filldir_multi(dip, offset,
++ opaque, filldir,
++ bh, &copied);
++ else
++ error = do_filldir_single(dip, offset,
++ opaque, filldir,
++ bh, leaf.lf_entries,
++ &copied);
++
++ brelse(bh);
++
++ if (error) {
++ if (error > 0)
++ error = 0;
++ goto out;
++ }
++
++ len = 1 << (dip->i_di.di_depth - leaf.lf_depth);
++ index = (index & ~(len - 1)) + len;
++ }
++
++ out:
++ kfree(lp);
++
++ return error;
++}
++
++/**
++ * dir_e_mvino - change the inode number of an exhash directory entry
++ * @dip: The GFS inode
++ * @filename: the name of the entry to change
++ * @inum: the new inode number
++ * @new_type: the new entry type
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_e_mvino(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int new_type)
++{
++ struct buffer_head *bh, *dibh;
++ struct gfs_dirent *dent;
++ int error;
++
++ error = linked_leaf_search(dip, filename, &dent, NULL, &bh);
++ GFS_ASSERT_INODE(error != -ENOENT, dip,);
++ if (error)
++ return error;
++
++ gfs_trans_add_bh(dip->i_gl, bh);
++
++ gfs_inum_out(inum, (char *)&dent->de_inum);
++ dent->de_type = cpu_to_gfs16(new_type);
++
++ brelse(bh);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * dir_l_search - search a stuffed (linear) directory for an entry
++ * @dip: The GFS inode
++ * @filename: the filename to look up
++ * @inum: if found, filled in with the entry's inode number
++ * @type: if found, filled in with the entry's type
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_l_search(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int *type)
++{
++ struct buffer_head *dibh;
++ struct gfs_dirent *dent;
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(dip), dip,);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ error = leaf_search(dip, dibh, filename, &dent, NULL);
++ if (!error) {
++ if (inum)
++ gfs_inum_in(inum, (char *)&dent->de_inum);
++ if (type)
++ *type = gfs16_to_cpu(dent->de_type);
++ }
++
++ brelse(dibh);
++
++ return error;
++}
++
++/**
++ * dir_l_add - add an entry to a stuffed (linear) directory
++ * @dip: The GFS inode
++ * @filename: the new name
++ * @inum: the inode number of the entry
++ * @type: the type of the entry
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_l_add(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int type)
++{
++ struct buffer_head *dibh;
++ struct gfs_dirent *dent;
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(dip), dip,);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ if (gfs_dirent_alloc(dip, dibh, filename->len, &dent)) {
++ brelse(dibh);
++
++ error = dir_make_exhash(dip);
++ if (!error)
++ error = dir_e_add(dip, filename, inum, type);
++
++ return error;
++ }
++
++ /* gfs_dirent_alloc() pins */
++
++ gfs_inum_out(inum, (char *)&dent->de_inum);
++ dent->de_hash = gfs_dir_hash(filename->name, filename->len);
++ dent->de_hash = cpu_to_gfs32(dent->de_hash);
++ dent->de_type = cpu_to_gfs16(type);
++ memcpy((char *)(dent + 1), filename->name, filename->len);
++
++ dip->i_di.di_entries++;
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * dir_l_del - delete an entry from a stuffed (linear) directory
++ * @dip: The GFS inode
++ * @filename: the name of the entry to delete
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_l_del(struct gfs_inode *dip, struct qstr *filename)
++{
++ struct buffer_head *dibh;
++ struct gfs_dirent *dent, *prev;
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(dip), dip,);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ error = leaf_search(dip, dibh, filename, &dent, &prev);
++ GFS_ASSERT_INODE(!error, dip,);
++
++ dirent_del(dip, dibh, prev, dent);
++
++ /* dirent_del() pins */
++
++ GFS_ASSERT_INODE(dip->i_di.di_entries, dip,);
++ dip->i_di.di_entries--;
++
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * dir_l_read - read the entries from a stuffed (linear) directory
++ * @dip: dinode pointer
++ * @offset: the hash of the last entry read shifted to the right once
++ * @opaque: buffer for the filldir function to fill
++ * @filldir: points to the filldir function to use
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_l_read(struct gfs_inode *dip, uint64_t *offset, void *opaque,
++ gfs_filldir_t filldir)
++{
++ struct buffer_head *dibh;
++ int copied = FALSE;
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(dip), dip,);
++
++ if (!dip->i_di.di_entries)
++ return 0;
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ error = do_filldir_single(dip, offset,
++ opaque, filldir,
++ dibh, dip->i_di.di_entries,
++ &copied);
++ if (error > 0)
++ error = 0;
++
++ brelse(dibh);
++
++ return error;
++}
++
++/**
++ * dir_l_mvino - change the inode number of a stuffed directory entry
++ * @dip: The GFS inode
++ * @filename: the name of the entry to change
++ * @inum: the new inode number
++ * @new_type: the new entry type
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_l_mvino(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int new_type)
++{
++ struct buffer_head *dibh;
++ struct gfs_dirent *dent;
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(dip), dip,);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ error = leaf_search(dip, dibh, filename, &dent, NULL);
++ GFS_ASSERT_INODE(!error, dip,);
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++
++ gfs_inum_out(inum, (char *)&dent->de_inum);
++ dent->de_type = cpu_to_gfs16(new_type);
++
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * gfs_dir_search - Search a directory
++ * @dip: The GFS inode
++ * @filename: the filename to look up
++ * @inum: if found, filled in with the entry's inode number
++ * @type: if found, filled in with the entry's type
++ *
++ * This routine searches a directory for a file or another directory.
++ * Assumes a glock is held on dip.
++ *
++ * Returns: 0 if found (@inum and @type are filled in), -EXXX on failure
++ */
++
++int
++gfs_dir_search(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int *type)
++{
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH)
++ error = dir_e_search(dip, filename, inum, type);
++ else
++ error = dir_l_search(dip, filename, inum, type);
++
++ return error;
++}
++
++/**
++ * gfs_dir_add - Add new filename into directory
++ * @dip: The GFS inode
++ * @filename: The new name
++ * @inum: The inode number of the entry
++ * @type: The type of the entry
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++int
++gfs_dir_add(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int type)
++{
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH)
++ error = dir_e_add(dip, filename, inum, type);
++ else
++ error = dir_l_add(dip, filename, inum, type);
++
++ return error;
++}
++
++/**
++ * gfs_dir_del - Delete a directory entry
++ * @dip: The GFS inode
++ * @filename: The filename
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++int
++gfs_dir_del(struct gfs_inode *dip, struct qstr *filename)
++{
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH)
++ error = dir_e_del(dip, filename);
++ else
++ error = dir_l_del(dip, filename);
++
++ return error;
++}
++
++/**
++ * gfs_dir_read - Read entries from a directory
++ * @dip: The GFS inode
++ * @offset: the hash of the last entry read shifted to the right once
++ * @opaque: buffer for the filldir function to fill
++ * @filldir: points to the filldir function to use
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_dir_read(struct gfs_inode *dip, uint64_t * offset, void *opaque,
++ gfs_filldir_t filldir)
++{
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH)
++ error = dir_e_read(dip, offset, opaque, filldir);
++ else
++ error = dir_l_read(dip, offset, opaque, filldir);
++
++ return error;
++}
++
++/**
++ * gfs_dir_mvino - Change inode number of directory entry
++ * @dip: The GFS inode
++ * @filename: the name of the entry to change
++ * @inum: the new inode number
++ * @new_type: the new entry type
++ *
++ * This routine changes the inode number of a directory entry. It's used
++ * by rename to change ".." when a directory is moved.
++ * Assumes a glock is held on dip.
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++int
++gfs_dir_mvino(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int new_type)
++{
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH)
++ error = dir_e_mvino(dip, filename, inum, new_type);
++ else
++ error = dir_l_mvino(dip, filename, inum, new_type);
++
++ return error;
++}
++
++/**
++ * foreach_leaf - call a function for each leaf in a directory
++ * @dip: the directory
++ * @lc: the function to call for each leaf
++ * @data: private data to pass to it
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
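++/* Illustrative walk (hypothetical values): a leaf whose lf_depth is two
++ less than di_depth is pointed to by len == 4 consecutive hash-table
++ slots, so after lc() is called once for it, index = (index & ~3) + 4
++ skips the remaining pointers to the same leaf. */
++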
++static int
++foreach_leaf(struct gfs_inode *dip, leaf_call_t lc, void *data)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *bh;
++ struct gfs_leaf leaf;
++ uint32_t hsize, len;
++ uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
++ uint32_t index = 0;
++ uint64_t *lp;
++ uint64_t leaf_no;
++ int error = 0;
++
++ GFS_ASSERT_INODE(dip->i_di.di_flags & GFS_DIF_EXHASH, dip,);
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size, dip,);
++
++ lp = gmalloc(sdp->sd_hash_bsize);
++
++ while (index < hsize) {
++ lp_offset = index & (sdp->sd_hash_ptrs - 1);
++ ht_offset = index - lp_offset;
++
++ if (ht_offset_cur != ht_offset) {
++ error = gfs_internal_read(dip, (char *)lp,
++ ht_offset * sizeof(uint64_t),
++ sdp->sd_hash_bsize);
++ if (error != sdp->sd_hash_bsize) {
++ if (error >= 0)
++ error = -EIO;
++ goto out;
++ }
++ ht_offset_cur = ht_offset;
++ }
++
++ leaf_no = gfs64_to_cpu(lp[lp_offset]);
++ if (leaf_no) {
++ error = get_leaf(dip, leaf_no, &bh);
++ if (error)
++ goto out;
++ gfs_leaf_in(&leaf, bh->b_data);
++ brelse(bh);
++
++ len = 1 << (dip->i_di.di_depth - leaf.lf_depth);
++
++ error = lc(dip, index, len, leaf_no, data);
++ if (error)
++ goto out;
++
++ index = (index & ~(len - 1)) + len;
++ } else
++ index++;
++ }
++
++ GFS_ASSERT_INODE(index == hsize, dip,);
++
++ out:
++ kfree(lp);
++
++ return error;
++}
++
++/**
++ * leaf_free - Deallocate a directory leaf
++ * @dip: the directory
++ * @index: the hash table offset in the directory
++ * @len: the number of pointers to this leaf
++ * @leaf_no: the leaf number
++ * @data: not used
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++leaf_free(struct gfs_inode *dip,
++ uint32_t index, uint32_t len,
++ uint64_t leaf_no, void *data)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_holder ri_gh;
++ struct gfs_leaf tmp_leaf;
++ struct gfs_rgrp_list rlist;
++ struct buffer_head *bh, *dibh;
++ uint64_t blk;
++ unsigned int rg_blocks = 0;
++ char *ht;
++ unsigned int x, size = len * sizeof(uint64_t);
++ int error;
++
++ memset(&rlist, 0, sizeof(struct gfs_rgrp_list));
++
++ ht = gmalloc(size);
++ memset(ht, 0, size);
++
++ gfs_alloc_get(dip);
++
++ error = gfs_quota_hold_m(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail;
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ goto fail_qs;
++
++ /* Count the number of leaves */
++
++ for (blk = leaf_no; blk; blk = tmp_leaf.lf_next) {
++ error = get_leaf(dip, blk, &bh);
++ if (error)
++ goto fail_rlist;
++ gfs_leaf_in(&tmp_leaf, (bh)->b_data);
++ brelse(bh);
++
++ gfs_rlist_add(sdp, &rlist, blk);
++ }
++
++ gfs_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
++
++ error = gfs_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
++ if (error)
++ goto fail_rlist;
++
++ for (x = 0; x < rlist.rl_rgrps; x++) {
++ struct gfs_rgrpd *rgd;
++ rgd = gl2rgd(rlist.rl_ghs[x].gh_gl);
++ rg_blocks += rgd->rd_ri.ri_length;
++ }
++
++ /* Trans may require:
++ All the bitmaps that were reserved.
++ One block for the dinode.
++ All the hash blocks that will be changed.
++ One block for a quota change. */
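++ /* Worked example (hypothetical numbers): a leaf chain referenced by
++ len == 64 hash slots gives size == 512 bytes of zeroed pointers to
++ write back; if the resource groups holding the leaves span
++ rg_blocks == 3 bitmap blocks and 512 bytes fit in one journaled
++ block, the reservation is 3 + 1 + (1 + 1) == 6 metadata blocks,
++ plus the one quota-change block passed as the last argument. */
++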
++
++ error = gfs_trans_begin(sdp,
++ rg_blocks + 1 + (DIV_RU(size, sdp->sd_jbsize) + 1),
++ 1);
++ if (error)
++ goto fail_rg_gunlock;
++
++ for (blk = leaf_no; blk; blk = tmp_leaf.lf_next) {
++ error = get_leaf(dip, blk, &bh);
++ if (error)
++ goto fail_end_trans;
++ gfs_leaf_in(&tmp_leaf, bh->b_data);
++ brelse(bh);
++
++ gfs_metafree(dip, blk, 1);
++
++ dip->i_di.di_blocks--;
++ }
++
++ error = gfs_internal_write(dip, ht, index * sizeof(uint64_t), size);
++ if (error != size) {
++ if (error >= 0)
++ error = -EIO;
++ goto fail_end_trans;
++ }
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
++ gfs_rlist_free(&rlist);
++ gfs_glock_dq_uninit(&ri_gh);
++ gfs_quota_unhold_m(dip);
++ gfs_alloc_put(dip);
++ kfree(ht);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_rg_gunlock:
++ gfs_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
++
++ fail_rlist:
++ gfs_rlist_free(&rlist);
++ gfs_glock_dq_uninit(&ri_gh);
++
++ fail_qs:
++ gfs_quota_unhold_m(dip);
++
++ fail:
++ gfs_alloc_put(dip);
++ kfree(ht);
++
++ return error;
++}
++
++/**
++ * gfs_dir_exhash_free - free all the leaf blocks in a directory
++ * @dip: the directory
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_dir_exhash_free(struct gfs_inode *dip)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *bh;
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ error = foreach_leaf(dip, leaf_free, NULL);
++ if (error)
++ return error;
++
++ /* Make this a regular file in case we crash.
++ (We don't want to free these blocks a second time.) */
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error)
++ return error;
++
++ error = gfs_get_inode_buffer(dip, &bh);
++ if (error)
++ goto fail;
++
++ gfs_trans_add_bh(dip->i_gl, bh);
++ ((struct gfs_dinode *)bh->b_data)->di_type = cpu_to_gfs16(GFS_FILE_REG);
++
++ brelse(bh);
++
++ gfs_trans_end(sdp);
++
++ return 0;
++
++ fail:
++ gfs_trans_end(sdp);
++ return error;
++}
++
++/**
++ * gfs_diradd_alloc_required - figure out if an entry addition is going to require an allocation
++ * @dip: the directory being added to
++ * @filename: the filename that's going to be added
++ * @alloc_required: the int is set to TRUE if an alloc is required, FALSE otherwise
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_diradd_alloc_required(struct gfs_inode *dip, struct qstr *filename,
++ int *alloc_required)
++{
++ struct buffer_head *bh = NULL, *bh_next;
++ uint32_t hsize, hash, index;
++ int error = 0;
++
++ *alloc_required = FALSE;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH) {
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size,
++ dip,);
++
++ hash = gfs_dir_hash(filename->name, filename->len);
++ index = hash >> (32 - dip->i_di.di_depth);
++
++ error = get_first_leaf(dip, index, &bh_next);
++ if (error)
++ return error;
++
++ do {
++ if (bh)
++ brelse(bh);
++
++ bh = bh_next;
++
++ if (dirent_fits(dip, bh, filename->len))
++ break;
++
++ error = get_next_leaf(dip, bh, &bh_next);
++ if (error == -ENOENT) {
++ *alloc_required = TRUE;
++ error = 0;
++ break;
++ }
++ }
++ while (!error);
++
++ brelse(bh);
++ } else {
++ error = gfs_get_inode_buffer(dip, &bh);
++ if (error)
++ return error;
++
++ if (!dirent_fits(dip, bh, filename->len))
++ *alloc_required = TRUE;
++
++ brelse(bh);
++ }
++
++ return error;
++}
++
++/**
++ * do_gdm - copy out one leaf (or list of leaves)
++ * @dip: the directory
++ * @index: the hash table offset in the directory
++ * @len: the number of pointers to this leaf
++ * @leaf_no: the leaf number
++ * @data: a pointer to a struct gfs_user_buffer structure
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++do_gdm(struct gfs_inode *dip, uint32_t index, uint32_t len, uint64_t leaf_no,
++ void *data)
++{
++ struct gfs_user_buffer *ub = (struct gfs_user_buffer *)data;
++ struct gfs_leaf leaf;
++ struct buffer_head *bh;
++ uint64_t blk;
++ int error = 0;
++
++ for (blk = leaf_no; blk; blk = leaf.lf_next) {
++ error = get_leaf(dip, blk, &bh);
++ if (error)
++ break;
++
++ gfs_leaf_in(&leaf, bh->b_data);
++
++ error = gfs_add_bh_to_ub(ub, bh);
++
++ brelse(bh);
++
++ if (error)
++ break;
++ }
++
++ return error;
++}
++
++/**
++ * gfs_get_dir_meta - return all the leaf blocks of a directory
++ * @dip: the directory
++ * @ub: the structure representing the meta
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_get_dir_meta(struct gfs_inode *dip, struct gfs_user_buffer *ub)
++{
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++ return foreach_leaf(dip, do_gdm, ub);
++}
+diff -urN linux-orig/fs/gfs/dir.h linux-patched/fs/gfs/dir.h
+--- linux-orig/fs/gfs/dir.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/dir.h 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,55 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DIR_DOT_H__
++#define __DIR_DOT_H__
++
++/**
++ * gfs_filldir_t - Report a directory entry to the caller of gfs_dir_read()
++ * @opaque: opaque data used by the function
++ * @name: the name of the directory entry
++ * @length: the length of the name
++ * @offset: the entry's offset in the directory
++ * @inum: the inode number the entry points to
++ * @type: the type of inode the entry points to
++ *
++ * Returns: 0 on success, 1 if buffer full
++ */
++
++typedef int (*gfs_filldir_t) (void *opaque,
++ const char *name, unsigned int length,
++ uint64_t offset,
++ struct gfs_inum *inum, unsigned int type);
++
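++/* A minimal filldir callback might look like the sketch below; "my_ctx"
++ is a hypothetical caller-side structure, not part of GFS:
++
++ static int my_filldir(void *opaque, const char *name, unsigned int length,
++ uint64_t offset, struct gfs_inum *inum, unsigned int type)
++ {
++ struct my_ctx *ctx = opaque;
++ return (ctx->count++ < ctx->max) ? 0 : 1;
++ }
++*/
++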
++int gfs_filecmp(struct qstr *file1, char *file2, int len_of_file2);
++int gfs_dirent_alloc(struct gfs_inode *dip, struct buffer_head *bh,
++ int name_len, struct gfs_dirent **dent_out);
++
++int gfs_dir_search(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int *type);
++int gfs_dir_add(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int type);
++int gfs_dir_del(struct gfs_inode *dip, struct qstr *filename);
++int gfs_dir_read(struct gfs_inode *dip, uint64_t * offset, void *opaque,
++ gfs_filldir_t filldir);
++int gfs_dir_mvino(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *new_inum, unsigned int new_type);
++
++int gfs_dir_exhash_free(struct gfs_inode *dip);
++
++int gfs_diradd_alloc_required(struct gfs_inode *dip, struct qstr *filename,
++ int *alloc_required);
++
++int gfs_get_dir_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub);
++
++#endif /* __DIR_DOT_H__ */
+diff -urN linux-orig/fs/gfs/eattr.c linux-patched/fs/gfs/eattr.c
+--- linux-orig/fs/gfs/eattr.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/eattr.c 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,2340 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <asm/uaccess.h>
++#include <linux/xattr_acl.h>
++
++#include "gfs.h"
++#include "acl.h"
++#include "dio.h"
++#include "eattr.h"
++#include "glock.h"
++#include "inode.h"
++#include "ioctl.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++
++#define GFS_EA_REC_LEN(x) gfs32_to_cpu((x)->ea_rec_len)
++#define GFS_EA_NAME(x) ((char *)(x) + sizeof(struct gfs_ea_header))
++#define GFS_EA_DATA_PTRS(x) ((uint64_t *)((char *)(x) + sizeof(struct gfs_ea_header) + (((x)->ea_name_len + 7) & ~7)))
++
++#define GFS_EA_NEXT(x) (struct gfs_ea_header *)((char *)(x) + GFS_EA_REC_LEN(x))
++#define GFS_EA_FREESPACE(x) (struct gfs_ea_header *)((char *)(x) + GFS_EA_SIZE(x))
++
++#define GFS_EAREQ_IS_STUFFED(x, y) (((sizeof(struct gfs_ea_header) + (x)->es_data_len + (x)->es_name_len + 7) & ~7) <= y)
++
++#define GFS_EADATA_NUM_PTRS(x, y) (((x) + (y) - 1) / (y))
++
++#define GFS_EA_SIZE(x) ((sizeof(struct gfs_ea_header) + (x)->ea_name_len + (GFS_EA_IS_UNSTUFFED(x)? (8 * (x)->ea_num_ptrs) : GFS_EA_DATA_LEN(x)) + 7) & ~ 7)
++
++#define GFS_EACMD_VALID(x) ((x) <= GFS_EACMD_REMOVE)
++
++#define GFS_EA_IS_LAST(x) ((x)->ea_flags & GFS_EAFLAG_LAST)
++
++#define GFS_EA_STRLEN(x) ((x)->ea_name_len + 1 + (((x)->ea_type == GFS_EATYPE_USR)? 5 : 7))
++
++#define GFS_FIRST_EA(x) ((struct gfs_ea_header *) ((x)->b_data + sizeof(struct gfs_meta_header)))
++
++#define EA_ALLOC 1
++#define EA_DEALLOC 2
++
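++/* On-disk eattr record layout assumed by the macros above (illustrative):
++ a struct gfs_ea_header, then the name padded to a multiple of 8 bytes,
++ then either the stuffed data or (if GFS_EA_IS_UNSTUFFED) one uint64_t
++ block pointer per data block. GFS_EA_SIZE() is that record rounded up
++ to 8 bytes; ea_rec_len may be larger, and the slack starting at
++ GFS_EA_FREESPACE() is available for reuse. */
++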
++static struct buffer_head *alloc_eattr_blk(struct gfs_sbd *sdp,
++ struct gfs_inode *alloc_ip,
++ struct gfs_inode *ip,
++ uint64_t * block);
++
++/**
++ * can_replace - returns true if ea is large enough to hold the data in
++ * the request
++ */
++
++static __inline__ int
++can_replace(struct gfs_ea_header *ea, struct gfs_easet_io *req,
++ uint32_t avail_size)
++{
++ int data_space = GFS_EA_REC_LEN(ea) - sizeof (struct gfs_ea_header) -
++ ea->ea_name_len;
++
++ if (GFS_EAREQ_IS_STUFFED(req, avail_size) && !GFS_EA_IS_UNSTUFFED(ea))
++ return (req->es_data_len <= data_space);
++ else
++ return (GFS_EADATA_NUM_PTRS(req->es_data_len, avail_size) <=
++ ea->ea_num_ptrs);
++}
++
++/**
++ * get_req_size - returns the actual number of bytes the request will take up
++ * (not counting any unstuffed data blocks)
++ */
++
++static __inline__ uint32_t
++get_req_size(struct gfs_easet_io *req, uint32_t avail_size)
++{
++ uint32_t size = ((sizeof (struct gfs_ea_header) + req->es_data_len +
++ req->es_name_len + 7) & ~7);
++
++ if (size <= avail_size)
++ return size;
++
++ return ((sizeof (struct gfs_ea_header) + req->es_name_len + 7) & ~7) +
++ (8 * GFS_EADATA_NUM_PTRS(req->es_data_len, avail_size));
++}
++
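++/* Illustrative sizing: a stuffed request costs the header plus name plus
++ data, rounded up to 8 bytes; an unstuffed one costs the header plus name
++ rounded up to 8 bytes, plus 8 bytes for each of the
++ GFS_EADATA_NUM_PTRS(es_data_len, avail_size) data-block pointers. */
++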
++/**
++ * gfs_ea_write_permission - decides if the user has permission to write to
++ * the ea
++ * @req: the write request
++ * @ip: inode of file with the ea
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_ea_write_permission(struct gfs_easet_io *req, struct gfs_inode *ip)
++{
++ struct inode *inode = gfs_iget(ip, NO_CREATE);
++ int error = 0;
++
++ GFS_ASSERT_INODE(inode, ip,);
++
++ if (req->es_type == GFS_EATYPE_USR) {
++ if (!S_ISREG(inode->i_mode) &&
++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++ error = -EPERM;
++ else {
++ error = permission(inode, MAY_WRITE, NULL);
++ if (error == -EACCES)
++ error = -EPERM;
++ }
++ } else if (req->es_type == GFS_EATYPE_SYS) {
++ if (IS_ACCESS_ACL(req->es_name, req->es_name_len))
++ error = gfs_validate_acl(ip, req->es_data,
++ req->es_data_len, 1);
++ else if (IS_DEFAULT_ACL(req->es_name, req->es_name_len))
++ error = gfs_validate_acl(ip, req->es_data,
++ req->es_data_len, 0);
++ else {
++ if (!capable(CAP_SYS_ADMIN))
++ error = -EPERM;
++ }
++ } else
++ error = -EOPNOTSUPP;
++
++ iput(inode);
++
++ return error;
++}
++
++/**
++ * gfs_ea_read_permission - decides if the user has permission to read from
++ * the ea
++ * @req: the read request
++ * @ip: inode of file with the ea
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_ea_read_permission(struct gfs_eaget_io *req, struct gfs_inode *ip)
++{
++ struct inode *inode = gfs_iget(ip, NO_CREATE);
++ int error = 0;
++
++ GFS_ASSERT_INODE(inode, ip,);
++
++ if (req->eg_type == GFS_EATYPE_USR){
++ error = permission(inode, MAY_READ, NULL);
++ if (error == -EACCES)
++ error = -EPERM;
++ }
++ else if (req->eg_type == GFS_EATYPE_SYS) {
++ if (IS_ACCESS_ACL(req->eg_name, req->eg_name_len) ||
++ IS_DEFAULT_ACL(req->eg_name, req->eg_name_len))
++ error = 0;
++ else{
++ if (!capable(CAP_SYS_ADMIN))
++ error = -EPERM;
++ }
++ } else
++ error = -EOPNOTSUPP;
++
++ iput(inode);
++
++ return error;
++}
++
++/**
++ * gfs_ea_memcpy - gfs memcpy wrapper with a return value
++ *
++ */
++
++int
++gfs_ea_memcpy(void *dest, void *src, unsigned long size)
++{
++ memcpy(dest, src, size);
++ return 0;
++}
++
++/**
++ * gfs_ea_copy_to_user - copy_to_user wrapper
++ */
++
++int
++gfs_ea_copy_to_user(void *dest, void *src, unsigned long size)
++{
++ int error;
++ error = (copy_to_user(dest, src, size)) ? -EFAULT : 0;
++ return error;
++}
++
++/**
++ * find_direct_eattr - scan a single eattr block for a matching eattr
++ *
++ * Returns: 1 if find_eattr should stop checking (if the eattr was found,
++ * @location will be set),
++ * 0 if find_eattr should keep on checking,
++ * -EXXX on error
++ */
++int
++find_direct_eattr(struct gfs_inode *ip, uint64_t blkno, char *name,
++ int name_len, int type, struct gfs_ea_location *location)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_ea_header *curr, *prev = NULL;
++
++ err = gfs_dread(sdp, blkno, ip->i_gl, DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ curr = GFS_FIRST_EA(bh);
++ if (curr->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(curr))
++ goto out_drelse;
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ if (type != curr->ea_type && ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ if (type == GFS_EATYPE_SYS)
++ err = 1;
++ goto out_drelse;
++ }
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++
++ if (type == curr->ea_type && name_len == curr->ea_name_len &&
++ !memcmp(name, GFS_EA_NAME(curr), name_len)) {
++ location->bh = bh;
++ location->ea = curr;
++ location->prev = prev;
++ err = 1;
++ goto out;
++ }
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++
++ out_drelse:
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++/**
++ * find_eattr - find a matching eattr
++ *
++ * Returns: 1 if ea found, 0 if no ea found, -EXXX on error
++ */
++int
++find_eattr(struct gfs_inode *ip, char *name, int name_len, int type,
++ struct gfs_ea_location *location)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t *eablk, *end;
++
++ memset(location, 0, sizeof (struct gfs_ea_location));
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ err = gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto fail;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_IN);
++ eablk = (uint64_t *) ((bh)->b_data + sizeof (struct gfs_indirect));
++ end = eablk + ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++ while (eablk < end && *eablk) {
++ err = find_direct_eattr(ip, gfs64_to_cpu(*eablk), name,
++ name_len, type, location);
++ if (err || location->ea)
++ break;
++ eablk++;
++ }
++ brelse(bh);
++ if (err < 0)
++ goto fail;
++ } else {
++ err = find_direct_eattr(ip, ip->i_di.di_eattr, name, name_len,
++ type, location);
++ if (err < 0)
++ goto fail;
++ }
++
++ return (location->ea != NULL);
++
++ fail:
++ return err;
++}
++
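++/**
++ * make_space - compact an eattr block so @size bytes of record space are free
++ * @ip: the inode the eattrs belong to
++ * @bh: the eattr block to compact
++ * @size: the record size that needs to fit
++ * @blkno: the block number of @bh
++ * @avail: set to the reclaimed space, if enough could be recovered
++ */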
++static void
++make_space(struct gfs_inode *ip, struct buffer_head *bh, uint32_t size,
++ uint64_t blkno, struct gfs_ea_location *avail)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint32_t free_size, avail_size;
++ struct gfs_ea_header *ea, *new_ea;
++ void *buf;
++
++ free_size = 0;
++ avail_size = sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ ea = GFS_FIRST_EA(bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++ if (ea->ea_type == GFS_EATYPE_UNUSED) {
++ free_size = GFS_EA_REC_LEN(ea);
++ ea = GFS_EA_NEXT(ea);
++ }
++ while (free_size < size) {
++ free_size += (GFS_EA_REC_LEN(ea) - GFS_EA_SIZE(ea));
++ if (GFS_EA_IS_LAST(ea))
++ break;
++ ea = GFS_EA_NEXT(ea);
++ }
++ if (free_size < size)
++ goto out;
++ buf = gmalloc(avail_size);
++
++ free_size = avail_size;
++ ea = GFS_FIRST_EA(bh);
++ if (ea->ea_type == GFS_EATYPE_UNUSED)
++ ea = GFS_EA_NEXT(ea);
++ new_ea = (struct gfs_ea_header *) buf;
++ new_ea->ea_flags = 0;
++ new_ea->ea_rec_len = cpu_to_gfs32(size);
++ new_ea->ea_num_ptrs = 0;
++ new_ea->ea_type = GFS_EATYPE_UNUSED;
++ free_size -= size;
++ new_ea = GFS_EA_NEXT(new_ea);
++ while (1) {
++ memcpy(new_ea, ea, GFS_EA_SIZE(ea));
++ if (GFS_EA_IS_LAST(ea))
++ break;
++ new_ea->ea_rec_len = cpu_to_gfs32(GFS_EA_SIZE(ea));
++ free_size -= GFS_EA_SIZE(ea);
++ ea = GFS_EA_NEXT(ea);
++ new_ea = GFS_EA_NEXT(new_ea);
++ }
++ new_ea->ea_rec_len = cpu_to_gfs32(free_size);
++ memcpy(GFS_FIRST_EA(bh), buf, avail_size);
++ kfree(buf);
++ avail->ea = GFS_FIRST_EA(bh);
++ avail->prev = NULL;
++ avail->bh = bh;
++
++ out:
++ return;
++}
++
++static int
++expand_to_indirect(struct gfs_inode *alloc_ip, struct gfs_inode *ip,
++ struct buffer_head **bh)
++{
++ int err;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh1 = NULL, *bh2 = NULL, *indbh = NULL;
++ uint64_t blkno, *blkptr;
++ uint32_t free_size, avail_size;
++ struct gfs_ea_header *prev, *curr, *new_ea = NULL;
++
++ avail_size = sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ free_size = avail_size;
++ ip->i_di.di_flags |= GFS_DIF_EA_INDIRECT;
++ blkno = ip->i_di.di_eattr;
++ err = gfs_metaalloc(alloc_ip, &ip->i_di.di_eattr);
++ if (err)
++ goto out;
++ ip->i_di.di_blocks++;
++ err = gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl, DIO_NEW | DIO_START |
++ DIO_WAIT, &indbh);
++ if (err)
++ goto out;
++ bh1 = *bh;
++ *bh = indbh;
++ gfs_trans_add_bh(ip->i_gl, indbh);
++ gfs_metatype_set(sdp, indbh, GFS_METATYPE_IN, GFS_FORMAT_IN);
++ memset((indbh)->b_data + sizeof (struct gfs_meta_header), 0,
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header));
++ blkptr = (uint64_t *) ((indbh)->b_data + sizeof (struct gfs_indirect));
++ *blkptr++ = cpu_to_gfs64(blkno);
++ prev = NULL;
++ curr = GFS_FIRST_EA(bh1);
++ while (curr->ea_type != GFS_EATYPE_USR) {
++ if (GFS_EA_IS_LAST(curr))
++ goto out_drelse1;
++ free_size -= GFS_EA_REC_LEN(curr);
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ if (!prev || prev->ea_type == GFS_EATYPE_UNUSED)
++ goto out_drelse1;
++ gfs_trans_add_bh(ip->i_gl, bh1);
++ prev->ea_rec_len = cpu_to_gfs32(GFS_EA_REC_LEN(prev) + free_size);
++ prev->ea_flags |= GFS_EAFLAG_LAST;
++ bh2 = alloc_eattr_blk(sdp, alloc_ip, ip, &blkno);
++ if (!bh2) {
++ err = -EIO;
++ goto out_drelse1;
++ }
++ free_size = avail_size;
++ new_ea = GFS_FIRST_EA(bh2);
++ while (1) {
++ memcpy(new_ea, curr, GFS_EA_SIZE(curr));
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ new_ea->ea_rec_len = cpu_to_gfs32(GFS_EA_SIZE(curr));
++ free_size -= GFS_EA_SIZE(curr);
++ curr = GFS_EA_NEXT(curr);
++ new_ea = GFS_EA_NEXT(new_ea);
++ }
++ new_ea->ea_rec_len = cpu_to_gfs32(free_size);
++ *blkptr = cpu_to_gfs64(blkno);
++ brelse(bh2);
++
++ out_drelse1:
++ brelse(bh1);
++
++ out:
++ return err;
++}
++
++static void
++find_direct_sys_space(struct gfs_inode *ip, int size, struct buffer_head *bh,
++ struct gfs_ea_location *avail)
++{
++ struct gfs_ea_header *curr, *prev = NULL;
++
++ curr = GFS_FIRST_EA(bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (curr->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_REC_LEN(curr) >= size) {
++ avail->ea = curr;
++ avail->prev = NULL;
++ avail->bh = bh;
++ goto out;
++ }
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ while (curr->ea_type == GFS_EATYPE_SYS) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (GFS_EA_REC_LEN(curr) >= GFS_EA_SIZE(curr) + size) {
++ avail->ea = curr;
++ avail->prev = prev;
++ avail->bh = bh;
++ goto out;
++ }
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ make_space(ip, bh, size, ip->i_di.di_eattr, avail);
++
++ out:
++ return;
++}
++
++/**
++ * find_indirect_space - look for usable space in one block of an indirect eattr chain
++ *
++ * @avail: set to the location of the usable space, if any was found
++ * @blktype: returns the type of block GFS_EATYPE_...
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++static int
++find_indirect_space(struct gfs_inode *ip, uint64_t blkno, int type,
++ int size, struct gfs_ea_location *avail, int *blktype)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_ea_header *curr, *prev = NULL;
++
++ err = gfs_dread(sdp, blkno, ip->i_gl, DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ curr = GFS_FIRST_EA(bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (curr->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(curr)) {
++ avail->ea = curr;
++ avail->prev = NULL;
++ avail->bh = bh;
++ *blktype = GFS_EATYPE_UNUSED;
++ goto out;
++ }
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ if (type != curr->ea_type) {
++ *blktype = curr->ea_type;
++ goto out_drelse;
++ } else
++ *blktype = type;
++ if (prev && GFS_EA_REC_LEN(prev) >= size) {
++ avail->ea = prev;
++ avail->prev = NULL;
++ avail->bh = bh;
++ goto out;
++ }
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (GFS_EA_REC_LEN(curr) >= GFS_EA_SIZE(curr) + size) {
++ avail->ea = curr;
++ avail->prev = prev;
++ avail->bh = bh;
++ goto out;
++ }
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++
++ out_drelse:
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++static int
++find_indirect_sys_space(struct gfs_inode *alloc_ip, struct gfs_inode *ip,
++ int size, struct buffer_head *bh,
++ struct gfs_ea_location *avail)
++{
++ int err = 0;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t *eablk, *end, *first_usr_blk = NULL;
++ int blktype;
++ uint64_t blkno;
++
++ eablk = (uint64_t *) ((bh)->b_data + sizeof (struct gfs_indirect));
++ end = eablk + ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++
++ while (eablk < end && *eablk) {
++ err = find_indirect_space(ip, gfs64_to_cpu(*eablk),
++ GFS_EATYPE_SYS, size, avail, &blktype);
++ if (err)
++ goto out;
++ if (blktype == GFS_EATYPE_USR && !first_usr_blk)
++ first_usr_blk = eablk;
++ if (avail->ea) {
++ if (!first_usr_blk)
++ goto out;
++ gfs_trans_add_bh(ip->i_gl, bh);
++ blkno = *eablk;
++ *eablk = *first_usr_blk;
++ *first_usr_blk = blkno;
++ goto out;
++ }
++ eablk++;
++ }
++ if (eablk >= end) {
++ err = -ENOSPC;
++ goto out;
++ }
++ avail->bh = alloc_eattr_blk(sdp, alloc_ip, ip, &blkno);
++ if (!avail->bh) {
++ err = -EIO;
++ goto out;
++ }
++ avail->ea = GFS_FIRST_EA(avail->bh);
++ avail->prev = NULL;
++ gfs_trans_add_bh(ip->i_gl, bh);
++ if (first_usr_blk) {
++ *eablk = *first_usr_blk;
++ *first_usr_blk = cpu_to_gfs64(blkno);
++ } else
++ *eablk = cpu_to_gfs64(blkno);
++
++ out:
++ return err;
++}
++
++int
++find_sys_space(struct gfs_inode *alloc_ip, struct gfs_inode *ip, int size,
++ struct gfs_ea_location *avail)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++
++ err = gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_IN);
++ err = find_indirect_sys_space(alloc_ip, ip, size, bh, avail);
++ } else {
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ find_direct_sys_space(ip, size, bh, avail);
++ if (!avail->ea) {
++ err = expand_to_indirect(alloc_ip, ip, &bh);
++ if (err)
++ goto out_drelse;
++ err = find_indirect_sys_space(alloc_ip, ip, size, bh, avail);
++ }
++ }
++
++ out_drelse:
++ if (avail->bh != bh)
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++static int
++get_blk_type(struct gfs_inode *ip, uint64_t blkno, int *blktype)
++{
++ int err = 0;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ struct gfs_ea_header *ea;
++
++ err = gfs_dread(sdp, blkno, ip->i_gl, DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ ea = GFS_FIRST_EA(bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++ if (ea->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(ea)) {
++ *blktype = GFS_EATYPE_UNUSED;
++ goto out_drelse;
++ }
++ ea = GFS_EA_NEXT(ea);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++ }
++ *blktype = ea->ea_type;
++
++ out_drelse:
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++static void
++find_direct_usr_space(struct gfs_inode *ip, int size, struct buffer_head *bh,
++ struct gfs_ea_location *avail)
++{
++ struct gfs_ea_header *curr, *prev = NULL;
++
++ curr = GFS_FIRST_EA(bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (curr->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(curr)) {
++ avail->ea = curr;
++ avail->prev = NULL;
++ avail->bh = bh;
++ goto out;
++ }
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ if (curr->ea_type == GFS_EATYPE_USR
++ && GFS_EA_REC_LEN(prev) >= size) {
++ avail->ea = prev;
++ avail->prev = NULL;
++ avail->bh = bh;
++ goto out;
++ }
++ }
++ while (curr->ea_type != GFS_EATYPE_USR) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (GFS_EA_REC_LEN(curr) >= GFS_EA_SIZE(curr) + size) {
++ avail->ea = curr;
++ avail->prev = prev;
++ avail->bh = bh;
++ goto out;
++ }
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++
++ out:
++ return;
++}
++
++static int
++find_indirect_usr_space(struct gfs_inode *ip, int size, struct buffer_head *bh,
++ struct gfs_ea_location *avail)
++{
++ int err = 0;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t *eablk, *end, *last_sys_blk = NULL, *first_usr_blk = NULL;
++ int blktype;
++ uint64_t blkno;
++
++ eablk = (uint64_t *) ((bh)->b_data + sizeof (struct gfs_indirect));
++ end = eablk + ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++
++ while (eablk < end && *eablk) {
++ err = find_indirect_space(ip, gfs64_to_cpu(*eablk),
++ GFS_EATYPE_USR, size, avail, &blktype);
++ if (err)
++ goto out;
++ if (blktype == GFS_EATYPE_SYS)
++ last_sys_blk = eablk;
++ if (blktype == GFS_EATYPE_USR && !first_usr_blk)
++ first_usr_blk = eablk;
++ if (avail->ea) {
++ if (first_usr_blk)
++ goto out;
++ first_usr_blk = eablk + 1;
++ while (first_usr_blk < end && *first_usr_blk) {
++ err = get_blk_type(ip, gfs64_to_cpu(*first_usr_blk),
++ &blktype);
++ if (blktype == GFS_EATYPE_SYS)
++ last_sys_blk = first_usr_blk;
++ if (blktype == GFS_EATYPE_USR)
++ break;
++ first_usr_blk++;
++ }
++ if (last_sys_blk > eablk) {
++ gfs_trans_add_bh(ip->i_gl, bh);
++ blkno = *eablk;
++ *eablk = *last_sys_blk;
++ *last_sys_blk = blkno;
++ }
++ goto out;
++ }
++ eablk++;
++ }
++
++ if (eablk >= end) {
++ err = -ENOSPC;
++ goto out;
++ }
++ avail->bh = alloc_eattr_blk(sdp, ip, ip, &blkno);
++ if (!avail->bh) {
++ err = -EIO;
++ goto out;
++ }
++ avail->ea = GFS_FIRST_EA(avail->bh);
++ avail->prev = NULL;
++ gfs_trans_add_bh(ip->i_gl, bh);
++ *eablk = cpu_to_gfs64(blkno);
++
++ out:
++ return err;
++}
++
++static int
++find_usr_space(struct gfs_inode *ip, int size, struct gfs_ea_location *avail)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++
++ err = gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_IN);
++ err = find_indirect_usr_space(ip, size, bh, avail);
++ } else {
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ find_direct_usr_space(ip, size, bh, avail);
++ if (!avail->ea) {
++ err = expand_to_indirect(ip, ip, &bh);
++ if (err)
++ goto out_drelse;
++ err = find_indirect_usr_space(ip, size, bh, avail);
++ }
++ }
++
++ out_drelse:
++ if (avail->bh != bh)
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++static int
++find_space(struct gfs_inode *ip, int size, int type,
++ struct gfs_ea_location *avail)
++{
++ int err;
++
++ memset(avail, 0, sizeof (struct gfs_ea_location));
++
++ if (type == GFS_EATYPE_SYS)
++ err = find_sys_space(ip, ip, size, avail);
++ else
++ err = find_usr_space(ip, size, avail);
++
++ return err;
++}
++
++static int
++can_replace_in_block(struct gfs_inode *ip, int size,
++ struct gfs_ea_location found, struct gfs_ea_header **space)
++{
++ struct gfs_ea_header *curr, *prev = NULL;
++
++ *space = NULL;
++ curr = GFS_FIRST_EA(found.bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (curr->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_REC_LEN(curr) >= size) {
++ *space = curr;
++ goto out;
++ }
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (curr == found.ea) {
++ /*
++ * See if there will be enough space after the old version of the eattr
++ * is deleted.
++ */
++ if (prev) {
++ if (prev->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_REC_LEN(prev) +
++ GFS_EA_REC_LEN(curr) >= size) {
++ *space = prev;
++ goto out;
++ }
++ } else if (GFS_EA_REC_LEN(prev) +
++ GFS_EA_REC_LEN(curr) >=
++ GFS_EA_SIZE(prev) + size) {
++ *space = prev;
++ goto out;
++ }
++ } else if (GFS_EA_REC_LEN(curr) >= size) {
++ *space = curr;
++ goto out;
++ }
++ } else if (GFS_EA_REC_LEN(curr) >= GFS_EA_SIZE(curr) + size) {
++ *space = curr;
++ goto out;
++ }
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++
++ out:
++ return (*space != NULL);
++}
++
++/**
++ * read_unstuffed - copy unstuffed ea data into the request buffer
++ * @dest: the destination buffer
++ * @copy_fn: the function to use to do the actual copying
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++read_unstuffed(void *dest, struct gfs_inode *ip, struct gfs_sbd *sdp,
++ struct gfs_ea_header *ea, uint32_t avail_size,
++ gfs_ea_copy_fn_t copy_fn)
++{
++ struct buffer_head *bh[66]; /* This is the maximum number of data ptrs possible */
++ int err = 0;
++ int max = GFS_EADATA_NUM_PTRS(GFS_EA_DATA_LEN(ea), avail_size);
++ int i, j, left = GFS_EA_DATA_LEN(ea);
++ char *outptr, *buf;
++ uint64_t *indptr = GFS_EA_DATA_PTRS(ea);
++
++ for (i = 0; i < max; i++) {
++ err =
++ gfs_dread(sdp, gfs64_to_cpu(*indptr), ip->i_gl, DIO_START,
++ &bh[i]);
++ indptr++;
++ if (err) {
++ for (j = 0; j < i; j++)
++ brelse(bh[j]);
++ goto out;
++ }
++ }
++
++ outptr = dest;
++
++ for (i = 0; i < max; i++) {
++ err = gfs_dreread(sdp, bh[i], DIO_WAIT);
++ if (err) {
++ for (j = i; j < max; j++)
++ brelse(bh[j]);
++ goto out;
++ }
++ gfs_metatype_check(sdp, bh[i], GFS_METATYPE_EA);
++ buf = (bh[i])->b_data + sizeof (struct gfs_meta_header);
++ err =
++ copy_fn(outptr, buf,
++ (avail_size > left) ? left : avail_size);
++ if (err) {
++ for (j = i; j < max; j++)
++ brelse(bh[j]);
++ goto out;
++ }
++ left -= avail_size;
++ outptr += avail_size;
++ brelse(bh[i]);
++ }
++
++ out:
++
++ return err;
++}
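++
++/*
++ * A note on the bh[66] bound in read_unstuffed(): GFS_MAX_EA_ACL_BLKS
++ * is 66 (65 unstuffed data blocks plus one for the ea block itself,
++ * per eattr.h), so an ea record never carries more than 65 data
++ * pointers. A rough sketch of the arithmetic, assuming a 4096-byte
++ * block and a 24-byte gfs_meta_header:
++ *
++ * avail_size = 4096 - 24 = 4072 bytes of ea data per block
++ * max unstuffed data = 65 * 4072 = 264680 bytes per ea
++ *
++ * so GFS_EADATA_NUM_PTRS() (which rounds the data length up to whole
++ * avail_size-sized blocks) stays within the array.
++ */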
++
++/**
++ * get_ea - read an extended attribute, or just its size
++ * @sdp: pointer to the superblock
++ * @ip: pointer to the inode
++ * @req: the request information
++ * @copy_fn: the function to use to do the actual copying
++ *
++ * If req->eg_data_len is zero, only the size of the data is returned.
++ *
++ * Returns: the size of the ea's data on success, -EXXX on error
++ */
++int
++get_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_eaget_io *req,
++ gfs_ea_copy_fn_t copy_fn)
++{
++ int err;
++ struct gfs_ea_location location;
++ uint32_t avail_size;
++
++ avail_size = sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++
++ err = find_eattr(ip, req->eg_name, req->eg_name_len, req->eg_type,
++ &location);
++ if (err != 1) {
++ if (err == 0)
++ err = -ENODATA;
++ goto out;
++ }
++
++ if (req->eg_data_len) {
++ if (req->eg_data_len < GFS_EA_DATA_LEN(location.ea))
++ err = -ERANGE;
++ else if (GFS_EA_IS_UNSTUFFED(location.ea))
++ err =
++ read_unstuffed(req->eg_data, ip, sdp, location.ea,
++ avail_size, copy_fn);
++ else
++ err = copy_fn(req->eg_data, GFS_EA_DATA(location.ea),
++ GFS_EA_DATA_LEN(location.ea));
++ if (!err)
++ err = GFS_EA_DATA_LEN(location.ea);
++ } else
++ err = GFS_EA_DATA_LEN(location.ea);
++
++ brelse(location.bh);
++
++ out:
++ return err;
++}
++
++/**
++ * prep_ea - prepare an ea record to receive a new ea
++ * @ea: the ea record to prepare
++ *
++ * If @ea is unused, it is reused in place. Otherwise the free space
++ * at the end of @ea is split off into a new record, and the
++ * GFS_EAFLAG_LAST flag is moved to whichever record now ends the block.
++ *
++ * Returns: the record that the new ea should be written into
++ */
++
++struct gfs_ea_header *
++prep_ea(struct gfs_ea_header *ea)
++{
++ struct gfs_ea_header *new = ea;
++
++ if (ea->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(ea))
++ ea->ea_flags = GFS_EAFLAG_LAST;
++ else
++ ea->ea_flags = 0;
++ } else {
++ new = GFS_EA_FREESPACE(ea);
++ new->ea_rec_len =
++ cpu_to_gfs32(GFS_EA_REC_LEN(ea) - GFS_EA_SIZE(ea));
++ ea->ea_rec_len = cpu_to_gfs32(GFS_EA_SIZE(ea));
++ if (GFS_EA_IS_LAST(ea)) {
++ ea->ea_flags &= ~GFS_EAFLAG_LAST;
++ new->ea_flags = GFS_EAFLAG_LAST;
++ } else
++ new->ea_flags = 0;
++ }
++
++ return new;
++}
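++
++/*
++ * An illustrative sketch of the split prep_ea() performs when @ea is
++ * in use but has free space at its end (sizes are made up):
++ *
++ * before: [ rec_len = 96: 40 bytes used, 56 bytes free ]
++ * after: [ rec_len = 40 ][ new: rec_len = 56, filled by write_ea() ]
++ *
++ * GFS_EA_SIZE() is the used portion, so the new record starts at
++ * GFS_EA_FREESPACE() and inherits GFS_EAFLAG_LAST if @ea had it.
++ */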
++
++/**
++ * replace_ea - replaces the existing data with the request data
++ */
++int
++replace_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_ea_header *ea,
++ struct gfs_easet_io *req)
++{
++ int err = 0;
++ int i;
++ uint32_t copy_size, data_left = req->es_data_len;
++ struct buffer_head *bh;
++ uint64_t *datablk = GFS_EA_DATA_PTRS(ea);
++ const char *dataptr = req->es_data;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++
++ ea->ea_data_len = cpu_to_gfs32(req->es_data_len);
++ if (!GFS_EA_IS_UNSTUFFED(ea))
++ memcpy(GFS_EA_DATA(ea), req->es_data, req->es_data_len);
++ else {
++ for (i = 0; i < ea->ea_num_ptrs && data_left > 0; i++) {
++ err = gfs_dread(sdp, gfs64_to_cpu(*datablk), ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ copy_size =
++ (data_left > avail_size) ? avail_size : data_left;
++ memcpy((bh)->b_data + sizeof (struct gfs_meta_header),
++ dataptr, copy_size);
++ dataptr += copy_size;
++ data_left -= copy_size;
++ datablk++;
++ brelse(bh);
++ }
++ GFS_ASSERT_INODE(data_left == 0, ip,
++ printk
++ ("req->es_data_len = %u, ea->ea_num_ptrs = %d\n",
++ req->es_data_len, ea->ea_num_ptrs);
++ );
++ }
++
++ out:
++ return err;
++}
++
++/**
++ * write_ea - writes the request info to an ea, creating new blocks if
++ * necessary
++ *
++ * @sdp: superblock pointer
++ * @alloc_ip: inode that has the blocks reserved for allocation
++ * @ip: inode that is being modified
++ * @ea: the location of the new ea in a block
++ * @req: the write request
++ *
++ * Note: does not update ea_rec_len or the GFS_EAFLAG_LAST bit of ea_flags
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++write_ea(struct gfs_sbd *sdp, struct gfs_inode *alloc_ip, struct gfs_inode *ip,
++ struct gfs_ea_header *ea, struct gfs_easet_io *req)
++{
++ int err = 0;
++ uint64_t *blkptr;
++ uint64_t temp;
++ const char *dataptr;
++ uint32_t data_left, copy;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ int i;
++ struct buffer_head *bh = NULL;
++
++ ea->ea_data_len = cpu_to_gfs32(req->es_data_len);
++ ea->ea_name_len = req->es_name_len;
++ ea->ea_type = req->es_type;
++ ea->ea_pad = 0;
++
++ memcpy(GFS_EA_NAME(ea), req->es_name, req->es_name_len);
++
++ if (GFS_EAREQ_IS_STUFFED(req, avail_size)) {
++ ea->ea_num_ptrs = 0;
++ memcpy(GFS_EA_DATA(ea), req->es_data, req->es_data_len);
++ } else {
++ blkptr = GFS_EA_DATA_PTRS(ea);
++ dataptr = req->es_data;
++ data_left = req->es_data_len;
++ ea->ea_num_ptrs =
++ GFS_EADATA_NUM_PTRS(req->es_data_len, avail_size);
++
++ for (i = 0; i < ea->ea_num_ptrs; i++) {
++ if ((bh =
++ alloc_eattr_blk(sdp, alloc_ip, ip,
++ &temp)) == NULL) {
++ err = -EIO;
++ goto out;
++ }
++ copy =
++ (data_left > avail_size) ? avail_size : data_left;
++ memcpy((bh)->b_data + sizeof (struct gfs_meta_header),
++ dataptr, copy);
++ *blkptr = cpu_to_gfs64(temp);
++ dataptr += copy;
++ data_left -= copy;
++ blkptr++;
++ brelse(bh);
++ }
++
++ GFS_ASSERT_INODE(!data_left, ip,);
++ }
++
++ out:
++
++ return err;
++}
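++
++/*
++ * A rough example of the stuffed/unstuffed decision in write_ea(),
++ * assuming a 4096-byte block so that avail_size = 4072: an ea is
++ * stuffed when its header, name, and data all fit in the record
++ * itself; otherwise the record holds ea_num_ptrs 64-bit block
++ * pointers and the data is spread over those blocks, avail_size
++ * bytes per block after the gfs_meta_header. E.g. a 10000-byte
++ * value needs three data blocks (two full, one partial), which is
++ * the rounding GFS_EADATA_NUM_PTRS(10000, 4072) performs.
++ */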
++
++/**
++ * erase_ea_data_ptrs - deallocate all the unstuffed data blocks pointed to
++ * by ea records in this block
++ * @sdp: the superblock
++ * @ip: the inode
++ * @dibh: the inode's dinode buffer
++ * @blk: the block to check for data pointers
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++erase_ea_data_ptrs(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct buffer_head *dibh, uint64_t blk)
++{
++ struct gfs_holder rgd_gh;
++ int i, err = 0;
++ uint64_t *datablk;
++ struct buffer_head *eabh;
++ char *buf;
++ struct gfs_ea_header *ea;
++ struct gfs_rgrpd *rgd = NULL;
++
++ err = gfs_dread(sdp, blk, ip->i_gl, DIO_WAIT | DIO_START, &eabh);
++ if (err)
++ goto fail;
++
++ gfs_metatype_check(sdp, eabh, GFS_METATYPE_EA);
++ buf = (eabh)->b_data + sizeof (struct gfs_meta_header);
++ ea = (struct gfs_ea_header *) buf;
++
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++ if (GFS_EA_IS_UNSTUFFED(ea)) {
++ datablk = GFS_EA_DATA_PTRS(ea);
++ rgd = gfs_blk2rgrpd(sdp, gfs64_to_cpu(*datablk));
++ GFS_ASSERT_INODE(rgd, ip,
++ printk("block = %" PRIu64 "\n",
++ gfs64_to_cpu(*datablk)););
++ err =
++ gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
++ &rgd_gh);
++ if (err)
++ goto fail_eabh;
++ /* Trans may require:
++ One block for the RG header. One block for each ea data block.
++ One block for the dinode. One block for the current ea block.
++ One block for a quota change.
++ FIXME */
++ err =
++ gfs_trans_begin(sdp,
++ 3 + ea->ea_num_ptrs, 1);
++ if (err)
++ goto fail_glock_rg;
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ for (i = 0; i < ea->ea_num_ptrs; i++, datablk++) {
++ gfs_metafree(ip, gfs64_to_cpu(*datablk), 1);
++ ip->i_di.di_blocks--;
++ }
++ ea->ea_num_ptrs = 0;
++ gfs_trans_add_bh(ip->i_gl, eabh);
++ gfs_dinode_out(&ip->i_di, (dibh)->b_data);
++ gfs_trans_end(sdp);
++ gfs_glock_dq_uninit(&rgd_gh);
++ }
++ if (GFS_EA_IS_LAST(ea))
++ break;
++ ea = GFS_EA_NEXT(ea);
++ }
++
++ brelse(eabh);
++
++ return err;
++
++ fail_glock_rg:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ fail_eabh:
++ brelse(eabh);
++
++ fail:
++ return err;
++}
++
++/**
++ * gfs_ea_dealloc - deallocate the extended attribute fork
++ * @ip: the inode
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_ea_dealloc(struct gfs_inode *ip)
++{
++ struct gfs_holder ri_gh, rgd_gh;
++ int err = 0;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *dibh, *indbh = NULL;
++ uint64_t *startblk, *eablk, *end, *next;
++ uint64_t temp;
++ int num_blks;
++ struct gfs_rgrpd *rgd = NULL;
++
++ if (!ip->i_di.di_eattr)
++ goto out;
++
++ gfs_alloc_get(ip);
++
++ err = gfs_quota_hold_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (err)
++ goto out_alloc;
++
++ err = gfs_rindex_hold(sdp, &ri_gh);
++ if (err)
++ goto out_unhold_q;
++
++ err = gfs_get_inode_buffer(ip, &dibh);
++ if (err)
++ goto out_rindex_release;
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ err =
++ gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_WAIT | DIO_START, &indbh);
++ if (err)
++ goto out_dibh;
++
++ gfs_metatype_check(sdp, indbh, GFS_METATYPE_IN);
++
++ eablk =
++ (uint64_t *) ((indbh)->b_data +
++ sizeof (struct gfs_indirect));
++ end =
++ eablk +
++ ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++
++ while (eablk < end && *eablk) {
++ err =
++ erase_ea_data_ptrs(sdp, ip, dibh,
++ gfs64_to_cpu(*eablk));
++ if (err)
++ goto out_indbh;
++ eablk++;
++ }
++
++ startblk = eablk - 1;
++ end =
++ (uint64_t *) ((indbh)->b_data +
++ sizeof (struct gfs_indirect));
++
++ while (startblk >= end) {
++ rgd = gfs_blk2rgrpd(sdp, gfs64_to_cpu(*startblk));
++ GFS_ASSERT_INODE(rgd, ip,);
++
++ num_blks = 1;
++ next = eablk = startblk - 1;
++
++ while (eablk >= end) {
++ if (rgd ==
++ gfs_blk2rgrpd(sdp, gfs64_to_cpu(*eablk))) {
++ if (eablk != next) {
++ temp = *eablk;
++ *eablk = *next;
++ *next = temp;
++ }
++ num_blks++;
++ next--;
++ }
++ eablk--;
++ }
++
++ err =
++ gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
++ &rgd_gh);
++ if (err)
++ goto out_rindex_release;
++
++ /* Trans may require:
++ One block for the RG header. One block for each block freed from
++ this resource group. One block for the indirect ea block.
++ One block for the quota change. */
++
++ err =
++ gfs_trans_begin(sdp, 3 + num_blks,
++ 1);
++ if (err)
++ goto out_gunlock_rg;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ while (startblk > next) {
++ gfs_metafree(ip, gfs64_to_cpu(*startblk), 1);
++ ip->i_di.di_blocks--;
++ *startblk = 0;
++ startblk--;
++ }
++
++ gfs_trans_add_bh(ip->i_gl, indbh);
++ gfs_dinode_out(&ip->i_di, (dibh)->b_data);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_uninit(&rgd_gh);
++ }
++
++ brelse(indbh);
++ indbh = NULL;
++ } else {
++ err = erase_ea_data_ptrs(sdp, ip, dibh, ip->i_di.di_eattr);
++ if (err)
++ goto out_rindex_release;
++ }
++
++ rgd = gfs_blk2rgrpd(sdp, ip->i_di.di_eattr);
++ GFS_ASSERT_INODE(rgd, ip,
++ printk("block = %" PRIu64 "\n", ip->i_di.di_eattr);
++ );
++
++ err = gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
++ if (err)
++ goto out_rindex_release;
++
++ err = gfs_trans_begin(sdp, 3, 1);
++ if (err)
++ goto out_gunlock_rg;
++
++ gfs_metafree(ip, ip->i_di.di_eattr, 1);
++
++ ip->i_di.di_blocks--;
++ ip->i_di.di_eattr = 0;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, (dibh)->b_data);
++
++ gfs_trans_end(sdp);
++
++ out_gunlock_rg:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ out_indbh:
++ if (indbh)
++ brelse(indbh);
++
++ out_dibh:
++ brelse(dibh);
++
++ out_rindex_release:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ out_unhold_q:
++ gfs_quota_unhold_m(ip);
++
++ out_alloc:
++ gfs_alloc_put(ip);
++
++ out:
++
++ return err;
++}
++
++/**
++ * remove_ea - remove an ea record from its block
++ * @ip: the inode that owns the ea
++ * @ea: the ea record to remove
++ * @prev: the record before @ea in the block (or NULL)
++ *
++ * Frees any unstuffed data blocks, marks the record unused, and
++ * coalesces it with @prev when possible.
++ */
++
++static void
++remove_ea(struct gfs_inode *ip, struct gfs_ea_header *ea,
++ struct gfs_ea_header *prev)
++{
++ uint64_t *datablk;
++ int i;
++
++ if (GFS_EA_IS_UNSTUFFED(ea)) {
++ datablk = GFS_EA_DATA_PTRS(ea);
++ for (i = 0; i < ea->ea_num_ptrs; i++, datablk++) {
++ gfs_metafree(ip, gfs64_to_cpu(*datablk), 1);
++ ip->i_di.di_blocks--;
++ }
++ }
++
++ ea->ea_type = GFS_EATYPE_UNUSED;
++ ea->ea_num_ptrs = 0;
++
++ if (prev && prev != ea) {
++ prev->ea_rec_len =
++ cpu_to_gfs32(GFS_EA_REC_LEN(prev) + GFS_EA_REC_LEN(ea));
++ if (GFS_EA_IS_LAST(ea))
++ prev->ea_flags |= GFS_EAFLAG_LAST;
++ }
++}
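++
++/*
++ * Sketch of the coalescing done by remove_ea() (sizes are made up):
++ * removing the middle record of
++ *
++ * [ prev: rec_len = 40 ][ ea: rec_len = 64 ][ next ... ]
++ *
++ * marks @ea unused and folds its length into @prev, leaving
++ *
++ * [ prev: rec_len = 104 ][ next ... ]
++ *
++ * and @prev inherits GFS_EAFLAG_LAST if @ea ended the block.
++ */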
++
++int
++init_new_inode_eattr(struct gfs_inode *dip, struct gfs_inode *ip,
++ struct gfs_easet_io *req)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_ea_header *ea;
++
++ err = gfs_metaalloc(dip, &ip->i_di.di_eattr);
++ if (err)
++ goto out;
++
++ err = gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_NEW | DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_EA, GFS_FORMAT_EA);
++
++ ip->i_di.di_blocks++;
++
++ ea = GFS_FIRST_EA(bh);
++ ea->ea_flags = GFS_EAFLAG_LAST;
++ ea->ea_rec_len =
++ cpu_to_gfs32(sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header));
++ ea->ea_num_ptrs = 0;
++ ea->ea_type = GFS_EATYPE_UNUSED;
++ err = write_ea(sdp, dip, ip, ea, req);
++ if (err)
++ goto out_drelse;
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++
++ out_drelse:
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++int
++do_init_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_easet_io *req)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_ea_header *ea;
++
++ bh = alloc_eattr_blk(sdp, ip, ip, &ip->i_di.di_eattr);
++ if (bh) {
++ ea = GFS_FIRST_EA(bh);
++ err = write_ea(sdp, ip, ip, ea, req);
++ brelse(bh);
++ } else
++ err = -EIO;
++
++ return err;
++}
++
++/**
++ * init_eattr - allocate and initialize an inode's first eattr block
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++static int
++init_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_easet_io *req)
++{
++ int err = 0;
++ struct gfs_alloc *al;
++ uint32_t ea_metablks;
++ struct buffer_head *dibh;
++ struct posix_acl *acl = NULL;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++
++ ea_metablks =
++ GFS_EAREQ_IS_STUFFED(req,
++ avail_size) ? 1 : (1 +
++ GFS_EADATA_NUM_PTRS(req->
++ es_data_len,
++ avail_size));
++
++ if (IS_ACCESS_ACL(req->es_name, req->es_name_len)){
++ acl = posix_acl_from_xattr(req->es_data, req->es_data_len);
++ if (IS_ERR(acl)) {
++ err = PTR_ERR(acl);
++ goto out;
++ }
++ }
++
++ al = gfs_alloc_get(ip);
++
++ err = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (err)
++ goto out_alloc;
++
++ al->al_requested_meta = ea_metablks;
++
++ err = gfs_inplace_reserve(ip);
++ if (err)
++ goto out_gunlock_q;
++
++ err = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (err)
++ goto out_ipres;
++
++ err = gfs_get_inode_buffer(ip, &dibh);
++ if (err)
++ goto out_ipres;
++
++ /* Trans may require:
++ A modified dinode, multiple EA metadata blocks, and all blocks for a RG
++ bitmap */
++
++ err =
++ gfs_trans_begin(sdp,
++ 1 + ea_metablks + al->al_rgd->rd_ri.ri_length, 1);
++ if (err)
++ goto out_dibh;
++
++ err = do_init_eattr(sdp, ip, req);
++ if (err)
++ goto out_end_trans;
++
++ if (acl)
++ gfs_acl_set_mode(ip, acl);
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, (dibh)->b_data);
++
++ out_end_trans:
++ gfs_trans_end(sdp);
++
++ out_dibh:
++ brelse(dibh);
++
++ out_ipres:
++ gfs_inplace_release(ip);
++
++ out_gunlock_q:
++ gfs_quota_unlock_m(ip);
++
++ out_alloc:
++ gfs_alloc_put(ip);
++ posix_acl_release(acl);
++
++ out:
++ return err;
++}
++
++/**
++ * alloc_eattr_blk - allocates a new block for extended attributes.
++ * @sdp: A pointer to the superblock
++ * @alloc_ip: A pointer to the inode that has reserved the blocks for
++ * allocation
++ * @ip: A pointer to the inode that's getting extended attributes
++ * @block: the block allocated
++ *
++ * Returns: the buffer head on success, NULL on failure
++ */
++
++static struct buffer_head *
++alloc_eattr_blk(struct gfs_sbd *sdp, struct gfs_inode *alloc_ip,
++ struct gfs_inode *ip, uint64_t * block)
++{
++ int err = 0;
++ struct buffer_head *bh = NULL;
++ struct gfs_ea_header *ea;
++
++ err = gfs_metaalloc(alloc_ip, block);
++ if (err)
++ goto out;
++
++ err =
++ gfs_dread(sdp, *block, ip->i_gl, DIO_NEW | DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_EA, GFS_FORMAT_EA);
++
++ ip->i_di.di_blocks++;
++
++ ea = GFS_FIRST_EA(bh);
++ ea->ea_flags = GFS_EAFLAG_LAST;
++ ea->ea_rec_len =
++ cpu_to_gfs32(sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header));
++ ea->ea_num_ptrs = 0;
++ ea->ea_type = GFS_EATYPE_UNUSED;
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++
++ out:
++
++ return bh;
++}
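++
++/*
++ * A freshly allocated ea block, as initialized above, holds a single
++ * unused record spanning everything after the header (assuming a
++ * 4096-byte block and a 24-byte gfs_meta_header):
++ *
++ * [ gfs_meta_header | rec_len = 4072, GFS_EATYPE_UNUSED, LAST ]
++ *
++ * prep_ea() and write_ea() then carve real eas out of that record.
++ */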
++
++/**
++ * list_direct_ea - copy out the names of the eas in one ea block
++ * @sdp: pointer to the superblock
++ * @ip: pointer to the inode
++ * @bh: the buffer holding the ea block
++ * @req: the request information
++ * @copy_fn: the function to use to do the actual copying
++ * @size: running total of the space used by the names
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++static int
++list_direct_ea(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct buffer_head *bh, struct gfs_eaget_io *req,
++ gfs_ea_copy_fn_t copy_fn, uint32_t * size)
++{
++ int err = 0;
++ struct gfs_ea_header *ea;
++ char buf[264]; /* room for "system." + a 255-byte name + NUL */
++ char *ptr;
++
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++
++ ea = (struct gfs_ea_header *) ((bh)->b_data +
++ sizeof (struct gfs_meta_header));
++ if (ea->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(ea))
++ goto out;
++ else
++ ea = GFS_EA_NEXT(ea);
++ }
++
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++
++ if (req->eg_data_len) {
++ if (*size > req->eg_data_len) {
++ err = -ERANGE;
++ break;
++ }
++ ptr = buf;
++
++ GFS_ASSERT_INODE(GFS_EATYPE_VALID(ea->ea_type), ip,);
++ if (ea->ea_type == GFS_EATYPE_USR) {
++ memcpy(ptr, "user.", 5);
++ ptr += 5;
++ } else {
++ memcpy(ptr, "system.", 7);
++ ptr += 7;
++ }
++ memcpy(ptr, GFS_EA_NAME(ea), ea->ea_name_len);
++ ptr += ea->ea_name_len;
++ *ptr = 0;
++ err =
++ copy_fn(req->eg_data + *size, buf,
++ GFS_EA_STRLEN(ea));
++ if (err)
++ break;
++ }
++
++ *size = *size + GFS_EA_STRLEN(ea);
++
++ if (GFS_EA_IS_LAST(ea))
++ break;
++ ea = GFS_EA_NEXT(ea);
++ }
++
++ out:
++
++ return err;
++}
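++
++/*
++ * The name list built by list_direct_ea() mirrors the listxattr(2)
++ * format: each name is emitted with its namespace prefix and a
++ * terminating NUL, e.g. (illustrative names only)
++ *
++ * "user.foo\0system.posix_acl_access\0user.bar\0"
++ *
++ * GFS_EA_STRLEN(ea) covers the prefix, name, and NUL, and *size
++ * accumulates it even when only the total length is wanted
++ * (req->eg_data_len == 0).
++ */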
++
++/**
++ * list_ea - list the names of all of an inode's extended attributes
++ * @sdp: pointer to the superblock
++ * @ip: pointer to the inode
++ * @req: the request information
++ * @copy_fn: the function to use to do the actual copying
++ *
++ * Returns: the size of the name list on success, -EXXX on error
++ */
++
++static int
++list_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_eaget_io *req,
++ gfs_ea_copy_fn_t copy_fn)
++{
++ int err;
++ struct buffer_head *bh, *eabh;
++ uint64_t *eablk, *end;
++ uint32_t size = 0;
++
++ err =
++ gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl, DIO_START | DIO_WAIT,
++ &bh);
++ if (err)
++ goto out;
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_IN);
++ eablk =
++ (uint64_t *) ((bh)->b_data + sizeof (struct gfs_indirect));
++ end =
++ eablk +
++ ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++
++ while (eablk < end && *eablk) {
++ err =
++ gfs_dread(sdp, gfs64_to_cpu(*eablk), ip->i_gl,
++ DIO_START | DIO_WAIT, &eabh);
++ if (err)
++ goto out_drelse;
++ err = list_direct_ea(sdp, ip, eabh, req, copy_fn, &size);
++ brelse(eabh);
++ if (err)
++ goto out_drelse;
++ eablk++;
++ }
++ } else {
++ err = list_direct_ea(sdp, ip, bh, req, copy_fn, &size);
++ if (err)
++ goto out_drelse;
++ }
++
++ if (!err)
++ err = size;
++
++ out_drelse:
++ brelse(bh);
++
++ out:
++
++ return err;
++}
++
++/**
++ * gfs_get_eattr - read an extended attribute, or a list of ea names
++ * @sdp: pointer to the superblock
++ * @ip: pointer to the inode for the target file
++ * @req: the request information
++ * @copy_fn: the function to use to do the actual copying
++ *
++ * Returns: actual size of data on success, -EXXX on error
++ */
++int
++gfs_get_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_eaget_io *req, gfs_ea_copy_fn_t copy_fn)
++{
++ struct gfs_holder i_gh;
++ int err;
++
++ if (req->eg_name) {
++ err = gfs_ea_read_permission(req, ip);
++ if (err)
++ goto out;
++ }
++
++ /* This is a read, so the glock is acquired in LM_ST_SHARED; LM_FLAG_ANY allows any compatible cached state. */
++
++ err = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (err)
++ goto out;
++
++ if (ip->i_di.di_eattr == 0) {
++ if (!req->eg_name) {
++ if (!req->eg_data_len && req->eg_len) {
++ uint32_t no_data = 0;
++
++ err =
++ copy_fn(req->eg_len, &no_data,
++ sizeof (uint32_t));
++ }
++ } else
++ err = -ENODATA;
++
++ goto out_gunlock;
++ }
++
++ if (req->eg_name)
++ err = get_ea(sdp, ip, req, copy_fn);
++ else
++ err = list_ea(sdp, ip, req, copy_fn);
++
++ out_gunlock:
++ gfs_glock_dq_uninit(&i_gh);
++
++ out:
++
++ return err;
++}
++
++static int
++do_set_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_easet_io *req,
++ struct gfs_ea_location location)
++{
++ int err = 0;
++ int req_size;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ struct gfs_ea_location space;
++
++ req_size = get_req_size(req, avail_size);
++
++ if (location.ea) {
++ struct gfs_ea_header *new_space;
++ if (req->es_cmd == GFS_EACMD_REMOVE) {
++ remove_ea(ip, location.ea, location.prev);
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++ goto out;
++ }
++ if (can_replace(location.ea, req, avail_size)) {
++ err = replace_ea(sdp, ip, location.ea, req);
++ if (!err)
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++ goto out;
++ }
++ /*
++ * This part is kind of confusing. If the inode has direct EAs,
++ * then adding another EA can't run it out of space, so it is safe to
++ * delete the old EA before looking for space. If the inode has
++ * indirect EAs, there may not be enough space left, so first you
++ * check for space and then you delete the EA.
++ */
++ if ((ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) == 0) {
++ remove_ea(ip, location.ea, location.prev);
++ err = find_space(ip, req_size, req->es_type, &space);
++ if (err)
++ goto out;
++ new_space = prep_ea(space.ea);
++ err = write_ea(sdp, ip, ip, new_space, req);
++ if (!err) {
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++ gfs_trans_add_bh(ip->i_gl, space.bh);
++ }
++ brelse(space.bh);
++ goto out;
++ }
++ if (can_replace_in_block(ip, req_size, location, &new_space)) {
++ remove_ea(ip, location.ea, location.prev);
++ new_space = prep_ea(new_space);
++ err = write_ea(sdp, ip, ip, new_space, req);
++ if (!err)
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++ goto out;
++ }
++ err = find_space(ip, req_size, req->es_type, &space);
++ if (err)
++ /* A non-I/O error can be returned here: if there is no space
++ * left, this returns -ENOSPC. So no buffer may have been added
++ * to the transaction yet.
++ */
++ goto out;
++ remove_ea(ip, location.ea, location.prev);
++ new_space = prep_ea(space.ea);
++ err = write_ea(sdp, ip, ip, new_space, req);
++ if (!err) {
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++ gfs_trans_add_bh(ip->i_gl, space.bh);
++ }
++ brelse(space.bh);
++ goto out;
++ }
++ err = find_space(ip, req_size, req->es_type, &space);
++ if (err)
++ /* you can also get -ENOSPC here */
++ goto out;
++ space.ea = prep_ea(space.ea);
++ err = write_ea(sdp, ip, ip, space.ea, req);
++ if (!err)
++ gfs_trans_add_bh(ip->i_gl, space.bh);
++ brelse(space.bh);
++
++ out:
++ return err;
++}
++
++static int
++set_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_easet_io *req,
++ struct gfs_ea_location location)
++{
++ int err;
++ struct gfs_alloc *al;
++ struct gfs_rgrpd *rgd = NULL;
++ struct buffer_head *dibh;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ int unstuffed_ea_blks = 0;
++ struct gfs_holder ri_gh, rgd_gh;
++ struct posix_acl *acl = NULL;
++
++ if (IS_ACCESS_ACL(req->es_name, req->es_name_len) && req->es_data){
++ acl = posix_acl_from_xattr(req->es_data, req->es_data_len);
++ if (IS_ERR(acl)) {
++ err = PTR_ERR(acl);
++ goto out;
++ }
++ }
++
++ err = gfs_get_inode_buffer(ip, &dibh);
++ if (err)
++ goto out_acl;
++ al = gfs_alloc_get(ip);
++
++ err = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (err)
++ goto out_alloc;
++
++ /*
++ * Worst case: switching from direct to indirect EAs can take up
++ * to three new blocks, plus enough unstuffed data blocks to hold
++ * all of the data.
++ */
++ al->al_requested_meta = 3 + GFS_EADATA_NUM_PTRS(req->es_data_len, avail_size);
++
++ err = gfs_inplace_reserve(ip);
++ if (err)
++ goto out_lock_quota;
++
++ err = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (err)
++ goto out_reserve;
++
++ if (location.ea && GFS_EA_IS_UNSTUFFED(location.ea)) {
++ /*
++ * If there is an EA, we might need to delete it.
++ * Since all unstuffed data blocks are added at the same time,
++ * they are all from the same resource group.
++ */
++ err = gfs_rindex_hold(sdp, &ri_gh);
++ if (err)
++ goto out_reserve;
++ rgd =
++ gfs_blk2rgrpd(sdp,
++ gfs64_to_cpu(*GFS_EA_DATA_PTRS(location.ea)));
++ GFS_ASSERT_INODE(rgd, ip,
++ printk("block = %" PRIu64 "\n",
++ gfs64_to_cpu(*GFS_EA_DATA_PTRS
++ (location.ea)));
++ );
++ err =
++ gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
++ if (err)
++ goto out_rindex;
++ unstuffed_ea_blks = location.ea->ea_num_ptrs;
++ }
++
++ /*
++ * The transaction may require: modifying the dinode block,
++ * modifying the indirect ea block, modifying an ea block, all the
++ * allocation blocks, all the blocks for a RG bitmap, the RG header
++ * block, and a RG block for each unstuffed data block being deleted.
++ */
++ err = gfs_trans_begin(sdp, 4 + al->al_requested_meta +
++ al->al_rgd->rd_ri.ri_length + unstuffed_ea_blks,
++ 1);
++ if (err)
++ goto out_lock_rg;
++
++ err = do_set_ea(sdp, ip, req, location);
++
++ if (!err) {
++ if (acl)
++ gfs_acl_set_mode(ip, acl);
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, (dibh)->b_data);
++ }
++
++ gfs_trans_end(sdp);
++
++ out_lock_rg:
++ if (rgd)
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ out_rindex:
++ if (rgd)
++ gfs_glock_dq_uninit(&ri_gh);
++
++ out_reserve:
++ gfs_inplace_release(ip);
++
++ out_lock_quota:
++ gfs_quota_unlock_m(ip);
++
++ out_alloc:
++ gfs_alloc_put(ip);
++ brelse(dibh);
++
++ out_acl:
++ posix_acl_release(acl);
++
++ out:
++ return err;
++}
++
++/**
++ * gfs_set_eattr - sets (or creates or replaces) an extended attribute
++ * @sdp: pointer to the superblock
++ * @ip: pointer to the inode of the target file
++ * @req: request information
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++int
++gfs_set_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_easet_io *req)
++{
++ struct gfs_holder i_gh;
++ int err;
++ uint32_t req_size;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ struct gfs_ea_location location;
++
++ if (!GFS_EACMD_VALID(req->es_cmd)) {
++ err = -EOPNOTSUPP;
++ goto out;
++ }
++
++ if (strlen(req->es_name) == 0) {
++ err = -EINVAL;
++ goto out;
++ }
++
++ err = gfs_ea_write_permission(req, ip);
++ if (err)
++ goto out;
++
++ if ((req_size = get_req_size(req, avail_size)) > avail_size) {
++ /* This can only happen with 512 byte blocks */
++ err = -ERANGE;
++ goto out;
++ }
++ err = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (err)
++ goto out;
++
++ if (ip->i_di.di_eattr == 0) {
++ if (req->es_cmd == GFS_EACMD_REPLACE
++ || req->es_cmd == GFS_EACMD_REMOVE) {
++ err = -ENODATA;
++ goto out_gunlock;
++ }
++ err = init_eattr(sdp, ip, req);
++ goto out_gunlock;
++ }
++
++ err = find_eattr(ip, req->es_name, req->es_name_len, req->es_type,
++ &location);
++ if (err < 0)
++ goto out_gunlock;
++ if (err == 0 && (req->es_cmd == GFS_EACMD_REPLACE ||
++ req->es_cmd == GFS_EACMD_REMOVE)) {
++ err = -ENODATA;
++ goto out_relse;
++ }
++ err = set_ea(sdp, ip, req, location);
++
++ out_relse:
++ if (location.bh)
++ brelse(location.bh);
++
++ out_gunlock:
++ gfs_glock_dq_uninit(&i_gh);
++
++ out:
++ return err;
++}
++
++/**
++ * gfs_set_eattr_ioctl - creates, modifies, or removes an extended attribute.
++ * @sdp: pointer to the superblock
++ * @ip: a pointer to the gfs inode for the file
++ * @arg: a pointer to gfs_set_eattr_io_t struct with the request
++ *
++ * Notes: ioctl wrapper for gfs_set_eattr
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_set_eattr_ioctl(struct gfs_sbd *sdp, struct gfs_inode *ip, void *arg)
++{
++ struct gfs_easet_io req;
++ int err = 0;
++ char *name = NULL;
++ char *data = NULL;
++
++ if (copy_from_user(&req, arg, sizeof (struct gfs_easet_io))) {
++ err = -EFAULT;
++ goto out;
++ }
++
++ name = gmalloc(req.es_name_len);
++
++ if (req.es_data) {
++ data = gmalloc(req.es_data_len);
++
++ if (copy_from_user(data, req.es_data, req.es_data_len)) {
++ err = -EFAULT;
++ goto out_free;
++ }
++ }
++ if (copy_from_user(name, req.es_name, req.es_name_len)) {
++ err = -EFAULT;
++ goto out_free;
++ }
++ req.es_data = data;
++ req.es_name = name;
++ err = gfs_set_eattr(sdp, ip, &req);
++
++ out_free:
++ kfree(name);
++ if (data)
++ kfree(data);
++
++ out:
++ return err;
++}
++
++/**
++ * gfs_get_eattr_ioctl - gets the value for the requested attribute name,
++ * or a list of all the extended attribute names.
++ * @sdp: pointer to the superblock
++ * @ip: a pointer to the inode for the file
++ * @arg: a pointer to the struct gfs_eaget_io struct holding the request
++ *
++ * Notes: ioctl wrapper for the gfs_get_eattr function
++ * Returns: 0 on success, -EXXX on error.
++ */
++
++int
++gfs_get_eattr_ioctl(struct gfs_sbd *sdp, struct gfs_inode *ip, void *arg)
++{
++ struct gfs_eaget_io req;
++ int result = 0;
++ char *name = NULL;
++ uint32_t size;
++
++ if (copy_from_user(&req, arg, sizeof (struct gfs_eaget_io))) {
++ result = -EFAULT;
++ goto out;
++ }
++
++ if (req.eg_name) {
++ name = gmalloc(req.eg_name_len);
++
++ if (copy_from_user(name, req.eg_name, req.eg_name_len)) {
++ result = -EFAULT;
++ goto out_free;
++ }
++ req.eg_name = name;
++ }
++ result = gfs_get_eattr(sdp, ip, &req, gfs_ea_copy_to_user);
++
++ out_free:
++ if (name)
++ kfree(name);
++
++ if (result >= 0) {
++ size = result;
++ result =
++ gfs_ea_copy_to_user(req.eg_len, &size, sizeof(uint32_t));
++ }
++
++ out:
++
++ return result;
++}
++
++/**
++ * gfs_get_direct_eattr_meta - copy one ea block and its data blocks to a user buffer
++ * @ip: the inode
++ * @ub: the structure representing the user buffer to copy to
++ * @blk: the ea block to copy
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_get_direct_eattr_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub,
++ uint64_t blk)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *databh, *bh;
++ struct gfs_ea_header *ea;
++ uint64_t *datablk;
++ unsigned int i;
++ int error;
++
++ error = gfs_dread(sdp, blk, ip->i_gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto out;
++
++ error = gfs_add_bh_to_ub(ub, bh);
++
++ ea = (struct gfs_ea_header *) ((bh)->b_data +
++ sizeof (struct gfs_meta_header));
++ for (;;) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++
++ datablk = GFS_EA_DATA_PTRS(ea);
++
++ for (i = 0; i < ea->ea_num_ptrs; i++) {
++ error =
++ gfs_dread(sdp, gfs64_to_cpu(*datablk), ip->i_gl,
++ DIO_START | DIO_WAIT, &databh);
++ if (error)
++ goto out_relse;
++
++ error = gfs_add_bh_to_ub(ub, databh);
++
++ brelse(databh);
++
++ if (error)
++ goto out_relse;
++
++ datablk++;
++ }
++
++ if (GFS_EA_IS_LAST(ea))
++ break;
++ ea = GFS_EA_NEXT(ea);
++ }
++
++ out_relse:
++ brelse(bh);
++
++ out:
++
++ return error;
++}
++
++/**
++ * gfs_get_eattr_meta - return all the eattr blocks of a file
++ * @ip: the inode
++ * @ub: the structure representing the user buffer to copy to
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_get_eattr_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ int error;
++ uint64_t *eablk, *end;
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ error =
++ gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_WAIT | DIO_START, &bh);
++ if (error)
++ goto out;
++
++ error = gfs_add_bh_to_ub(ub, bh);
++
++ eablk =
++ (uint64_t *) ((bh)->b_data + sizeof (struct gfs_indirect));
++ end =
++ eablk +
++ ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++
++ while (eablk < end && *eablk) {
++ error =
++ gfs_get_direct_eattr_meta(ip, ub,
++ gfs64_to_cpu(*eablk));
++ if (error) {
++ brelse(bh);
++ goto out;
++ }
++ eablk++;
++ }
++ brelse(bh);
++ } else
++ error = gfs_get_direct_eattr_meta(ip, ub, ip->i_di.di_eattr);
++
++ out:
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/eattr.h linux-patched/fs/gfs/eattr.h
+--- linux-orig/fs/gfs/eattr.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/eattr.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,90 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __EATTR_DOT_H__
++#define __EATTR_DOT_H__
++
++#define GFS_EA_MAY_WRITE 1
++#define GFS_EA_MAY_READ 2
++
++#define GFS_EA_DATA_LEN(x) gfs32_to_cpu((x)->ea_data_len)
++#define GFS_EA_IS_UNSTUFFED(x) ((x)->ea_num_ptrs)
++#define GFS_EA_DATA(x) ((char *)(x) + sizeof(struct gfs_ea_header) + (x)->ea_name_len)
++
++struct gfs_ea_location {
++ struct buffer_head *bh;
++ struct gfs_ea_header *ea;
++ struct gfs_ea_header *prev;
++};
++
++#define GFS_POSIX_ACL_ACCESS "posix_acl_access"
++#define GFS_POSIX_ACL_ACCESS_LEN 16
++#define GFS_POSIX_ACL_DEFAULT "posix_acl_default"
++#define GFS_POSIX_ACL_DEFAULT_LEN 17
++
++#define IS_ACCESS_ACL(name, len) \
++ ((len) == GFS_POSIX_ACL_ACCESS_LEN && \
++ !memcmp(GFS_POSIX_ACL_ACCESS, (name), (len)))
++
++#define IS_DEFAULT_ACL(name, len) \
++ ((len) == GFS_POSIX_ACL_DEFAULT_LEN && \
++ !memcmp(GFS_POSIX_ACL_DEFAULT, (name), (len)))
++
++#define GFS_MAX_EA_ACL_BLKS 66 /* 65 for unstuffed data blocks, 1 for the ea
++ itself */
++
++typedef int (*gfs_ea_copy_fn_t) (void *dest, void *src, unsigned long size);
++
++int gfs_ea_memcpy(void *dest, void *src, unsigned long size);
++int gfs_ea_copy_to_user(void *dest, void *src, unsigned long size);
++
++int find_sys_space(struct gfs_inode *alloc_ip, struct gfs_inode *ip, int size,
++ struct gfs_ea_location *avail);
++
++struct gfs_ea_header *prep_ea(struct gfs_ea_header *ea);
++
++int write_ea(struct gfs_sbd *sdp, struct gfs_inode *alloc_ip,
++ struct gfs_inode *ip, struct gfs_ea_header *ea,
++ struct gfs_easet_io *req);
++
++int gfs_get_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_eaget_io *req, gfs_ea_copy_fn_t copy_fn);
++int gfs_set_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_easet_io *req);
++
++int gfs_set_eattr_ioctl(struct gfs_sbd *sdp, struct gfs_inode *ip, void *arg);
++int gfs_get_eattr_ioctl(struct gfs_sbd *sdp, struct gfs_inode *ip, void *arg);
++
++int gfs_ea_dealloc(struct gfs_inode *ip);
++
++int gfs_get_eattr_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub);
++
++int replace_ea(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_ea_header *ea, struct gfs_easet_io *req);
++
++int find_eattr(struct gfs_inode *ip, char *name, int name_len, int type,
++ struct gfs_ea_location *location);
++
++int read_unstuffed(void *dest, struct gfs_inode *ip, struct gfs_sbd *sdp,
++ struct gfs_ea_header *ea, uint32_t avail_size,
++ gfs_ea_copy_fn_t copy_fn);
++
++int get_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_eaget_io *req,
++ gfs_ea_copy_fn_t copy_fn);
++
++int init_new_inode_eattr(struct gfs_inode *dip, struct gfs_inode *ip,
++ struct gfs_easet_io *req);
++
++int gfs_ea_read_permission(struct gfs_eaget_io *req, struct gfs_inode *ip);
++
++#endif /* __EATTR_DOT_H__ */
+diff -urN linux-orig/fs/gfs/file.c linux-patched/fs/gfs/file.c
+--- linux-orig/fs/gfs/file.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/file.c 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,382 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <asm/uaccess.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "dio.h"
++#include "file.h"
++#include "inode.h"
++#include "trans.h"
++
++/**
++ * gfs_copy2mem - Trivial copy function for gfs_readi()
++ * @bh: The buffer to copy from, or NULL meaning zero the buffer
++ * @buf: The buffer to copy/zero
++ * @offset: The offset in the buffer to copy from
++ * @size: The amount of data to copy/zero
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_copy2mem(struct buffer_head *bh, void **buf, unsigned int offset,
++ unsigned int size)
++{
++ char **p = (char **)buf;
++
++ if (bh)
++ memcpy(*p, bh->b_data + offset, size);
++ else
++ memset(*p, 0, size);
++
++ *p += size;
++
++ return 0;
++}
++
++/**
++ * gfs_copy2user - Copy data to user space
++ * @bh: The buffer
++ * @buf: The destination of the data
++ * @offset: The offset into the buffer
++ * @size: The amount of data to copy
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_copy2user(struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size)
++{
++ char **p = (char **)buf;
++ int error;
++
++ if (bh)
++ error = copy_to_user(*p, bh->b_data + offset, size);
++ else
++ error = clear_user(*p, size);
++
++ if (error)
++ error = -EFAULT;
++ else
++ *p += size;
++
++ return error;
++}
++
++/**
++ * gfs_readi - Read a file
++ * @ip: The GFS Inode
++ * @buf: The buffer to place result into
++ * @offset: File offset to begin reading from
++ * @size: Amount of data to transfer
++ * @copy_fn: Function to actually perform the copy
++ *
++ * The @copy_fn only copies a maximum of a single block at once so
++ * we are safe calling it with int arguments. It is done so that
++ * we don't needlessly put 64bit arguments on the stack and it
++ * also makes the code in the @copy_fn nicer too.
++ *
++ * Returns: The amount of data actually copied or the error
++ */
++
++int
++gfs_readi(struct gfs_inode *ip, void *buf,
++ uint64_t offset, unsigned int size,
++ read_copy_fn_t copy_fn)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ uint64_t lblock, dblock;
++ unsigned int o;
++ uint32_t extlen = 0;
++ unsigned int amount;
++ int not_new = 0;
++ int journaled = gfs_is_jdata(ip);
++ int copied = 0;
++ int error = 0;
++
++ if (offset >= ip->i_di.di_size)
++ return 0;
++
++ if ((offset + size) > ip->i_di.di_size)
++ size = ip->i_di.di_size - offset;
++
++ if (!size)
++ return 0;
++
++ if (journaled) {
++ lblock = offset;
++ o = do_div(lblock, sdp->sd_jbsize);
++ } else {
++ lblock = offset >> sdp->sd_sb.sb_bsize_shift;
++ o = offset & (sdp->sd_sb.sb_bsize - 1);
++ }
++
++ if (gfs_is_stuffed(ip))
++ o += sizeof(struct gfs_dinode);
++ else if (journaled)
++ o += sizeof(struct gfs_meta_header);
++
++ while (copied < size) {
++ amount = size - copied;
++ if (amount > sdp->sd_sb.sb_bsize - o)
++ amount = sdp->sd_sb.sb_bsize - o;
++
++ if (!extlen) {
++ error = gfs_block_map(ip, lblock, ¬_new,
++ &dblock, &extlen);
++ if (error)
++ goto fail;
++ }
++
++ if (extlen > 1)
++ gfs_start_ra(ip->i_gl, dblock, extlen);
++
++ if (dblock) {
++ error = gfs_get_data_buffer(ip, dblock, not_new, &bh);
++ if (error)
++ goto fail;
++
++ dblock++;
++ extlen--;
++ } else
++ bh = NULL;
++
++ error = copy_fn(bh, &buf, o, amount);
++ if (bh)
++ brelse(bh);
++ if (error)
++ goto fail;
++
++ copied += amount;
++ lblock++;
++
++ o = (journaled) ? sizeof(struct gfs_meta_header) : 0;
++ }
++
++ return copied;
++
++ fail:
++ return (copied) ? copied : error;
++}
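++
++/*
++ * Offset arithmetic sketch for the non-journaled case above, assuming
++ * a 4096-byte block: offset 10000 maps to lblock = 10000 >> 12 = 2
++ * and o = 10000 & 4095 = 1808, so the first pass copies at most
++ * 4096 - 1808 bytes. Journaled files divide by sd_jbsize instead,
++ * since each of their data blocks loses a gfs_meta_header.
++ */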
++
++/**
++ * gfs_copy_from_mem - Trivial copy function for gfs_writei()
++ * @ip: The file to write to
++ * @bh: The buffer to copy to or clear
++ * @buf: The buffer to copy from
++ * @offset: The offset in the buffer to write to
++ * @size: The amount of data to write
++ * @new: Flag indicating that remaining space in the buffer should be zeroed
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_copy_from_mem(struct gfs_inode *ip, struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size, int new)
++{
++ char **p = (char **)buf;
++ int error = 0;
++
++ if (bh->b_blocknr == ip->i_num.no_addr) {
++ GFS_ASSERT_INODE(!new, ip,);
++ gfs_trans_add_bh(ip->i_gl, bh);
++ memcpy(bh->b_data + offset, *p, size);
++ } else if (gfs_is_jdata(ip)) {
++ gfs_trans_add_bh(ip->i_gl, bh);
++ memcpy(bh->b_data + offset, *p, size);
++ if (new)
++ gfs_buffer_clear_ends(bh, offset, size, TRUE);
++ } else {
++ memcpy(bh->b_data + offset, *p, size);
++ if (new)
++ gfs_buffer_clear_ends(bh, offset, size, FALSE);
++ error = gfs_dwrite(ip->i_sbd, bh, DIO_DIRTY);
++ }
++
++ if (!error)
++ *p += size;
++
++ return error;
++}
++
++/**
++ * gfs_copy_from_user - Copy bytes from user space for gfs_writei()
++ * @ip: The file to write to
++ * @bh: The buffer to copy to or clear
++ * @buf: The buffer to copy from
++ * @offset: The offset in the buffer to write to
++ * @size: The amount of data to write
++ * @new: Flag indicating that remaining space in the buffer should be zeroed
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_copy_from_user(struct gfs_inode *ip, struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size, int new)
++{
++ char **p = (char **)buf;
++ int error = 0;
++
++ if (bh->b_blocknr == ip->i_num.no_addr) {
++ GFS_ASSERT_INODE(!new, ip,);
++ gfs_trans_add_bh(ip->i_gl, bh);
++ if (copy_from_user(bh->b_data + offset, *p, size))
++ error = -EFAULT;
++ } else if (gfs_is_jdata(ip)) {
++ gfs_trans_add_bh(ip->i_gl, bh);
++ if (copy_from_user(bh->b_data + offset, *p, size))
++ error = -EFAULT;
++ if (new) {
++ gfs_buffer_clear_ends(bh, offset, size, TRUE);
++ if (error)
++ memset(bh->b_data + offset, 0, size);
++ }
++ } else {
++ if (copy_from_user(bh->b_data + offset, *p, size))
++ error = -EFAULT;
++ if (error) {
++ if (new)
++ gfs_buffer_clear(bh);
++ gfs_dwrite(ip->i_sbd, bh, DIO_DIRTY);
++ } else {
++ if (new)
++ gfs_buffer_clear_ends(bh, offset, size, FALSE);
++ error = gfs_dwrite(ip->i_sbd, bh, DIO_DIRTY);
++ }
++ }
++
++ if (!error)
++ *p += size;
++
++ return error;
++}
++
++/**
++ * gfs_writei - Write bytes to a file
++ * @ip: The GFS inode
++ * @buf: The buffer containing information to be written
++ * @offset: The file offset to start writing at
++ * @size: The amount of data to write
++ * @copy_fn: Function to do the actual copying
++ *
++ * Returns: The number of bytes correctly written or error code
++ */
++
++int
++gfs_writei(struct gfs_inode *ip, void *buf,
++ uint64_t offset, unsigned int size,
++ write_copy_fn_t copy_fn)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *dibh, *bh;
++ uint64_t lblock, dblock;
++ unsigned int o;
++ uint32_t extlen = 0;
++ unsigned int amount;
++ int new;
++ int journaled = gfs_is_jdata(ip);
++ const uint64_t start = offset;
++ int copied = 0;
++ int error = 0;
++
++ if (!size)
++ return 0;
++
++ if (gfs_is_stuffed(ip) &&
++ ((start + size) > (sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode)))) {
++ error = gfs_unstuff_dinode(ip, gfs_unstuffer_async, NULL);
++ if (error)
++ return error;
++ }
++
++ if (journaled) {
++ lblock = offset;
++ o = do_div(lblock, sdp->sd_jbsize);
++ } else {
++ lblock = offset >> sdp->sd_sb.sb_bsize_shift;
++ o = offset & (sdp->sd_sb.sb_bsize - 1);
++ }
++
++ if (gfs_is_stuffed(ip))
++ o += sizeof(struct gfs_dinode);
++ else if (journaled)
++ o += sizeof(struct gfs_meta_header);
++
++ while (copied < size) {
++ amount = size - copied;
++ if (amount > sdp->sd_sb.sb_bsize - o)
++ amount = sdp->sd_sb.sb_bsize - o;
++
++ if (!extlen) {
++ new = TRUE;
++ error = gfs_block_map(ip, lblock, &new, &dblock, &extlen);
++ if (error)
++ goto fail;
++ GFS_ASSERT_INODE(dblock, ip,);
++ }
++
++ if (journaled && extlen > 1)
++ gfs_start_ra(ip->i_gl, dblock, extlen);
++
++ error = gfs_get_data_buffer(ip, dblock,
++ (amount == sdp->sd_sb.sb_bsize) ? TRUE : new,
++ &bh);
++ if (error)
++ goto fail;
++
++ error = copy_fn(ip, bh, &buf, o, amount, new);
++ brelse(bh);
++ if (error)
++ goto fail;
++
++ copied += amount;
++ lblock++;
++ dblock++;
++ extlen--;
++
++ o = (journaled) ? sizeof(struct gfs_meta_header) : 0;
++ }
++
++ out:
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ if (ip->i_di.di_size < start + copied)
++ ip->i_di.di_size = start + copied;
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return copied;
++
++ fail:
++ if (copied)
++ goto out;
++ return error;
++}
+diff -urN linux-orig/fs/gfs/file.h linux-patched/fs/gfs/file.h
+--- linux-orig/fs/gfs/file.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/file.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,51 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __FILE_DOT_H__
++#define __FILE_DOT_H__
++
++typedef int (*read_copy_fn_t) (struct buffer_head * bh, void **buf,
++ unsigned int offset, unsigned int size);
++typedef int (*write_copy_fn_t) (struct gfs_inode * ip, struct buffer_head * bh,
++ void **buf, unsigned int offset,
++ unsigned int size, int new);
++
++int gfs_copy2mem(struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size);
++int gfs_copy2user(struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size);
++int gfs_readi(struct gfs_inode *ip, void *buf, uint64_t offset,
++ unsigned int size, read_copy_fn_t copy_fn);
++
++int gfs_copy_from_mem(struct gfs_inode *ip, struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size, int new);
++int gfs_copy_from_user(struct gfs_inode *ip, struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size, int new);
++int gfs_writei(struct gfs_inode *ip, void *buf, uint64_t offset,
++ unsigned int size, write_copy_fn_t copy_fn);
++
++static __inline__ int
++gfs_internal_read(struct gfs_inode *ip, char *buf, uint64_t offset,
++ unsigned int size)
++{
++ return gfs_readi(ip, buf, offset, size, gfs_copy2mem);
++}
++
++static __inline__ int
++gfs_internal_write(struct gfs_inode *ip, char *buf, uint64_t offset,
++ unsigned int size)
++{
++ return gfs_writei(ip, buf, offset, size, gfs_copy_from_mem);
++}
++
++#endif /* __FILE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/fixed_div64.h linux-patched/fs/gfs/fixed_div64.h
+--- linux-orig/fs/gfs/fixed_div64.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/fixed_div64.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,142 @@
++/*
++ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of version 2 of the GNU General Public License as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it would be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
++ *
++ * Further, this software is distributed without any warranty that it is
++ * free of the rightful claim of any third person regarding infringement
++ * or the like. Any license provided herein, whether implied or
++ * otherwise, applies only to this software file. Patent licenses, if
++ * any, provided herein do not apply to combinations of this program with
++ * other software, or any other product whatsoever.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with this program; if not, write the Free Software Foundation, Inc., 59
++ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
++ *
++ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
++ * Mountain View, CA 94043, or:
++ *
++ * http://www.sgi.com
++ *
++ * For further information regarding this notice, see:
++ *
++ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
++ *
++ * Additional munging:
++ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++ */
++
++#ifndef __FIXED_DIV64_DOT_H__
++#define __FIXED_DIV64_DOT_H__
++
++#include <asm/div64.h>
++
++#if defined __i386__
++/* For ia32 we need to pull some tricks to get past various versions
++ * of the compiler which do not like us using do_div in the middle
++ * of large functions.
++ */
++static inline __u32 fixed_div64_do_div(void *a, __u32 b, int n)
++{
++ __u32 mod;
++
++ switch (n) {
++ case 4:
++ mod = *(__u32 *)a % b;
++ *(__u32 *)a = *(__u32 *)a / b;
++ return mod;
++ case 8:
++ {
++ unsigned long __upper, __low, __high, __mod;
++ __u64 c = *(__u64 *)a;
++ __upper = __high = c >> 32;
++ __low = c;
++ if (__high) {
++ __upper = __high % (b);
++ __high = __high / (b);
++ }
++ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
++ asm("":"=A" (c):"a" (__low),"d" (__high));
++ *(__u64 *)a = c;
++ return __mod;
++ }
++ }
++
++ /* NOTREACHED */
++ return 0;
++}
++
++/* Side effect free 64 bit mod operation */
++static inline __u32 fixed_div64_do_mod(void *a, __u32 b, int n)
++{
++ switch (n) {
++ case 4:
++ return *(__u32 *)a % b;
++ case 8:
++ {
++ unsigned long __upper, __low, __high, __mod;
++ __u64 c = *(__u64 *)a;
++ __upper = __high = c >> 32;
++ __low = c;
++ if (__high) {
++ __upper = __high % (b);
++ __high = __high / (b);
++ }
++ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
++ asm("":"=A" (c):"a" (__low),"d" (__high));
++ return __mod;
++ }
++ }
++
++ /* NOTREACHED */
++ return 0;
++}
++#else
++static inline __u32 fixed_div64_do_div(void *a, __u32 b, int n)
++{
++ __u32 mod;
++
++ switch (n) {
++ case 4:
++ mod = *(__u32 *)a % b;
++ *(__u32 *)a = *(__u32 *)a / b;
++ return mod;
++ case 8:
++ mod = do_div(*(__u64 *)a, b);
++ return mod;
++ }
++
++ /* NOTREACHED */
++ return 0;
++}
++
++/* Side effect free 64 bit mod operation */
++static inline __u32 fixed_div64_do_mod(void *a, __u32 b, int n)
++{
++ switch (n) {
++ case 4:
++ return *(__u32 *)a % b;
++ case 8:
++ {
++ __u64 c = *(__u64 *)a;
++ return do_div(c, b);
++ }
++ }
++
++ /* NOTREACHED */
++ return 0;
++}
++#endif
++
++#undef do_div
++#define do_div(a, b) fixed_div64_do_div(&(a), (b), sizeof(a))
++#define do_mod(a, b) fixed_div64_do_mod(&(a), (b), sizeof(a))
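++
++/*
++ * Usage sketch (illustrative): both macros take the dividend by name,
++ * and only do_div() modifies it in place:
++ *
++ * uint64_t bytes = 10000;
++ * uint32_t rem = do_div(bytes, 4096); (bytes == 2, rem == 1808)
++ * uint32_t mod = do_mod(offset, bsize); (offset is unchanged)
++ */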
++
++#endif /* __FIXED_DIV64_DOT_H__ */
+diff -urN linux-orig/fs/gfs/flock.c linux-patched/fs/gfs/flock.c
+--- linux-orig/fs/gfs/flock.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/flock.c 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,98 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "flock.h"
++#include "glock.h"
++#include "glops.h"
++
++/**
++ * gfs_flock - Acquire a flock on a file
++ * @fp: the file
++ * @ex: exclusive lock
++ * @wait: wait for lock
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_flock(struct gfs_file *fp, int ex, int wait)
++{
++ struct gfs_holder *fl_gh = &fp->f_fl_gh;
++ struct gfs_inode *ip = fp->f_inode;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_glock *gl;
++ int error = 0;
++
++ down(&fp->f_fl_lock);
++
++ if (fl_gh->gh_gl) {
++ gfs_glock_dq_uninit(fl_gh);
++ error = -EDEADLK;
++ goto out;
++ }
++
++ error = gfs_glock_get(sdp,
++ ip->i_num.no_formal_ino, &gfs_flock_glops,
++ CREATE, &gl);
++ if (error)
++ goto out;
++
++ gfs_holder_init(gl, (ex) ? LM_ST_EXCLUSIVE : LM_ST_SHARED,
++ ((wait) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE,
++ fl_gh);
++ fl_gh->gh_owner = NULL;
++
++ gfs_glock_put(gl);
++
++ error = gfs_glock_nq(fl_gh);
++ if (error) {
++ gfs_holder_uninit(fl_gh);
++ if (error == GLR_TRYFAILED) {
++ GFS_ASSERT_INODE(!wait, ip,);
++ error = -EAGAIN;
++ }
++ }
++
++ out:
++ up(&fp->f_fl_lock);
++
++ return error;
++}
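++
++/*
++ * Sketch of the mapping above (an illustration of existing behavior):
++ * flock(fd, LOCK_EX) takes the glock in LM_ST_EXCLUSIVE,
++ * flock(fd, LOCK_SH) in LM_ST_SHARED, and LOCK_NB clears @wait so a
++ * contended request fails with -EAGAIN via GLR_TRYFAILED. A second
++ * flock on an already-flocked file drops the first and returns
++ * -EDEADLK.
++ */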
++
++/**
++ * gfs_funlock - Release a flock on a file
++ * @fp: the file
++ *
++ * Returns: 0
++ */
++
++int
++gfs_funlock(struct gfs_file *fp)
++{
++ struct gfs_holder *fl_gh = &fp->f_fl_gh;
++
++ down(&fp->f_fl_lock);
++ if (fl_gh->gh_gl)
++ gfs_glock_dq_uninit(fl_gh);
++ up(&fp->f_fl_lock);
++
++ return 0;
++}
+diff -urN linux-orig/fs/gfs/flock.h linux-patched/fs/gfs/flock.h
+--- linux-orig/fs/gfs/flock.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/flock.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __FLOCK_DOT_H__
++#define __FLOCK_DOT_H__
++
++int gfs_flock(struct gfs_file *fp, int ex, int wait);
++int gfs_funlock(struct gfs_file *fp);
++
++#endif /* __FLOCK_DOT_H__ */
+diff -urN linux-orig/fs/gfs/format.h linux-patched/fs/gfs/format.h
+--- linux-orig/fs/gfs/format.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/format.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,30 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __FORMAT_DOT_H__
++#define __FORMAT_DOT_H__
++
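++/*
++ * Zero-terminated tables of older on-disk format version numbers.
++ * These presumably let the mount code recognize filesystems written
++ * by earlier releases when checking the superblock's format fields.
++ */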
++static const uint32_t gfs_old_fs_formats[] = {
++ 1308,
++ 1307,
++ 1306,
++ 1305,
++ 0
++};
++
++static const uint32_t gfs_old_multihost_formats[] = {
++ 1400,
++ 0
++};
++
++#endif /* __FORMAT_DOT_H__ */
+diff -urN linux-orig/fs/gfs/gfs.h linux-patched/fs/gfs/gfs.h
+--- linux-orig/fs/gfs/gfs.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/gfs.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,130 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __GFS_DOT_H__
++#define __GFS_DOT_H__
++
++#define GFS_RELEASE_NAME "<CVS>"
++
++#include <linux/lm_interface.h>
++#include <linux/gfs_ondisk.h>
++#include <linux/gfs_ioctl.h>
++
++#include "fixed_div64.h"
++#include "lvb.h"
++#include "incore.h"
++#include "util.h"
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#define NO_CREATE (0)
++#define CREATE (1)
++
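++/*
++ * printf()/scanf() conversion specifiers for 64-bit quantities, in the
++ * style of C99 <inttypes.h>, which the kernel does not provide.
++ */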
++#if (BITS_PER_LONG == 64)
++#define PRIu64 "lu"
++#define PRId64 "ld"
++#define PRIo64 "lo"
++#define PRIx64 "lx"
++#define PRIX64 "lX"
++#define SCNu64 "lu"
++#define SCNd64 "ld"
++#define SCNo64 "lo"
++#define SCNx64 "lx"
++#define SCNX64 "lX"
++#else
++#define PRIu64 "Lu"
++#define PRId64 "Ld"
++#define PRIo64 "Lo"
++#define PRIx64 "Lx"
++#define PRIX64 "LX"
++#define SCNu64 "Lu"
++#define SCNd64 "Ld"
++#define SCNo64 "Lo"
++#define SCNx64 "Lx"
++#define SCNX64 "LX"
++#endif
++
++/* Divide x by y. Round up if there is a remainder. */
++#define DIV_RU(x, y) (((x) + (y) - 1) / (y))
++
++#define GFS_FAST_NAME_SIZE (8)
++
++#define vfs2sdp(sb) ((struct gfs_sbd *)(sb)->s_fs_info)
++#define vn2ip(inode) ((struct gfs_inode *)(inode)->u.generic_ip)
++#define vf2fp(file) ((struct gfs_file *)(file)->private_data)
++#define bh2bd(bh) ((struct gfs_bufdata *)(bh)->b_private)
++#define current_transaction ((struct gfs_trans *)(current->journal_info))
++
++#define gl2ip(gl) ((struct gfs_inode *)(gl)->gl_object)
++#define gl2rgd(gl) ((struct gfs_rgrpd *)(gl)->gl_object)
++#define gl2gl(gl) ((struct gfs_glock *)(gl)->gl_object)
++
++#define gfs_meta_check(sdp, bh) \
++do \
++{ \
++ uint32_t meta_check_magic = ((struct gfs_meta_header *)(bh)->b_data)->mh_magic; \
++ meta_check_magic = gfs32_to_cpu(meta_check_magic); \
++ GFS_ASSERT_SBD(meta_check_magic == GFS_MAGIC, (sdp), \
++ struct gfs_meta_header meta_check_mh; \
++ printk("Bad metadata at %"PRIu64"\n", (uint64_t)(bh)->b_blocknr); \
++ gfs_meta_header_in(&meta_check_mh, (bh)->b_data); \
++ gfs_meta_header_print(&meta_check_mh);); \
++} \
++while (0)
++
++#define gfs_metatype_check(sdp, bh, type) \
++do \
++{ \
++ uint32_t metatype_check_magic = ((struct gfs_meta_header *)(bh)->b_data)->mh_magic; \
++ uint32_t metatype_check_type = ((struct gfs_meta_header *)(bh)->b_data)->mh_type; \
++ metatype_check_magic = gfs32_to_cpu(metatype_check_magic); \
++ metatype_check_type = gfs32_to_cpu(metatype_check_type); \
++ GFS_ASSERT_SBD(metatype_check_magic == GFS_MAGIC && \
++ metatype_check_type == (type), (sdp), \
++ struct gfs_meta_header metatype_check_mh; \
++ printk("Bad metadata at %"PRIu64", should be %u\n", (uint64_t)(bh)->b_blocknr, (type)); \
++ gfs_meta_header_in(&metatype_check_mh, (bh)->b_data); \
++ gfs_meta_header_print(&metatype_check_mh);); \
++} \
++while (0)
++
++#define gfs_metatype_set(sdp, bh, type, format) \
++do \
++{ \
++ gfs_meta_check((sdp), (bh)); \
++ ((struct gfs_meta_header *)(bh)->b_data)->mh_type = cpu_to_gfs32((type)); \
++ ((struct gfs_meta_header *)(bh)->b_data)->mh_format = cpu_to_gfs32((format)); \
++} \
++while (0)
++
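++/*
++ * Append formatted output to a buffer, or printk() it if there is no
++ * buffer. Relies on local variables (buf, size, count, error) and an
++ * "out:" label in the calling function; see dump_holder() in glock.c.
++ */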
++#define gfs_sprintf(fmt, args...) \
++do { \
++ if (buf) { \
++ if (*count + 256 > size) { \
++ error = -ENOMEM; \
++ goto out; \
++ } \
++ *count += snprintf(buf + *count, 256, fmt, ##args); \
++ } \
++ else \
++ printk(fmt, ##args); \
++} \
++while (0)
++
++#endif /* __GFS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/glock.c linux-patched/fs/gfs/glock.c
+--- linux-orig/fs/gfs/glock.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/glock.c 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,2524 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <asm/uaccess.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "lops.h"
++#include "quota.h"
++#include "recovery.h"
++
++/* Must be kept in sync with the beginning of struct gfs_glock */
++struct glock_plug {
++ struct list_head gl_list;
++ unsigned long gl_flags;
++};
++
++typedef void (*glock_examiner) (struct gfs_glock * gl);
++
++/**
++ * relaxed_state_ok - is a requested lock compatible with the current lock mode?
++ * @actual: the current state of the lock
++ * @requested: the lock state that was requested by the caller
++ * @flags: the modifier flags passed in by the caller
++ *
++ * Returns: TRUE if the locks are compatible, FALSE otherwise
++ */
++
++static __inline__ int
++relaxed_state_ok(unsigned int actual, unsigned requested, int flags)
++{
++ if (actual == requested)
++ return TRUE;
++
++ if (flags & GL_EXACT)
++ return FALSE;
++
++ if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
++ return TRUE;
++
++ if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
++ return TRUE;
++
++ return FALSE;
++}
++
++/**
++ * gl_hash() - Turn glock number into hash bucket number
++ * @name: The lock name
++ *
++ * Returns: The number of the corresponding hash bucket
++ */
++
++static unsigned int
++gl_hash(struct lm_lockname *name)
++{
++ unsigned int h;
++
++ h = gfs_hash(&name->ln_number, sizeof(uint64_t));
++ h = gfs_hash_more(&name->ln_type, sizeof(unsigned int), h);
++ h &= GFS_GL_HASH_MASK;
++
++ return h;
++}
++
++/**
++ * glock_hold() - increment reference count on glock
++ * @gl: The glock to hold
++ *
++ */
++
++static __inline__ void
++glock_hold(struct gfs_glock *gl)
++{
++ atomic_inc(&gl->gl_count);
++}
++
++/**
++ * glock_put() - Decrement reference count on glock
++ * @gl: The glock to put
++ *
++ */
++
++static __inline__ void
++glock_put(struct gfs_glock *gl)
++{
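++ /* When the last reference is about to go away, put the glock on the
++ reclaim list (which takes a reference of its own) so it can be
++ disposed of later. */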
++ if (atomic_read(&gl->gl_count) == 1)
++ gfs_glock_schedule_for_reclaim(gl);
++ atomic_dec(&gl->gl_count);
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_count) >= 0, gl,);
++}
++
++/**
++ * queue_empty - check to see if a glock's queue is empty
++ * @gl: the glock
++ * @head: the head of the queue to check
++ *
++ * Returns: TRUE if the queue is empty
++ */
++
++static __inline__ int
++queue_empty(struct gfs_glock *gl, struct list_head *head)
++{
++ int empty;
++ spin_lock(&gl->gl_spin);
++ empty = list_empty(head);
++ spin_unlock(&gl->gl_spin);
++ return empty;
++}
++
++/**
++ * search_bucket() - Find struct gfs_glock by lock number
++ * @bucket: the bucket to search
++ * @name: The lock name
++ *
++ * Returns: NULL, or the struct gfs_glock with the requested number
++ */
++
++static struct gfs_glock *
++search_bucket(struct gfs_gl_hash_bucket *bucket, struct lm_lockname *name)
++{
++ struct list_head *tmp, *head;
++ struct gfs_glock *gl;
++
++ for (head = &bucket->hb_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gl = list_entry(tmp, struct gfs_glock, gl_list);
++
++ if (test_bit(GLF_PLUG, &gl->gl_flags))
++ continue;
++ if (!lm_name_equal(&gl->gl_name, name))
++ continue;
++
++ glock_hold(gl);
++
++ return gl;
++ }
++
++ return NULL;
++}
++
++/**
++ * gfs_glock_find() - Find glock by lock number
++ * @sdp: The GFS superblock
++ * @name: The lock name
++ *
++ * Figure out what bucket the lock is in, acquire the read lock on
++ * it and call search_bucket().
++ *
++ * Returns: NULL, or the struct gfs_glock with the requested number
++ */
++
++struct gfs_glock *
++gfs_glock_find(struct gfs_sbd *sdp, struct lm_lockname *name)
++{
++ struct gfs_gl_hash_bucket *bucket = &sdp->sd_gl_hash[gl_hash(name)];
++ struct gfs_glock *gl;
++
++ read_lock(&bucket->hb_lock);
++ gl = search_bucket(bucket, name);
++ read_unlock(&bucket->hb_lock);
++
++ return gl;
++}
++
++/**
++ * glock_free() - Perform a few checks and then release struct gfs_glock
++ * @gl: The glock to release
++ *
++ */
++
++static void
++glock_free(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct inode *aspace = gl->gl_aspace;
++
++ GFS_ASSERT_GLOCK(list_empty(&gl->gl_list), gl,);
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_count) == 1, gl,);
++ GFS_ASSERT_GLOCK(list_empty(&gl->gl_holders), gl,);
++ GFS_ASSERT_GLOCK(list_empty(&gl->gl_waiters1), gl,);
++ GFS_ASSERT_GLOCK(list_empty(&gl->gl_waiters2), gl,);
++ GFS_ASSERT_GLOCK(gl->gl_state == LM_ST_UNLOCKED, gl,);
++ GFS_ASSERT_GLOCK(!gl->gl_object, gl,);
++ GFS_ASSERT_GLOCK(!gl->gl_lvb, gl,);
++ GFS_ASSERT_GLOCK(list_empty(&gl->gl_reclaim), gl,);
++
++ sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
++
++ if (aspace)
++ gfs_aspace_put(aspace);
++
++ kmem_cache_free(gfs_glock_cachep, gl);
++
++ atomic_dec(&sdp->sd_glock_count);
++}
++
++/**
++ * gfs_glock_get() - Get a glock, or create one if one doesn't exist
++ * @sdp: The GFS superblock
++ * @number: the lock number
++ * @glops: The glock_operations to use
++ * @create: If FALSE, don't create the glock if it doesn't exist
++ * @glp: the glock is returned here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_glock_get(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ int create, struct gfs_glock **glp)
++{
++ struct lm_lockname name;
++ struct gfs_glock *gl, *tmp;
++ struct gfs_gl_hash_bucket *bucket;
++ int error;
++
++ name.ln_number = number;
++ name.ln_type = glops->go_type;
++ bucket = &sdp->sd_gl_hash[gl_hash(&name)];
++
++ read_lock(&bucket->hb_lock);
++ gl = search_bucket(bucket, &name);
++ read_unlock(&bucket->hb_lock);
++
++ if (gl || !create) {
++ *glp = gl;
++ return 0;
++ }
++
++ gl = kmem_cache_alloc(gfs_glock_cachep, GFP_KERNEL);
++ if (!gl)
++ return -ENOMEM;
++
++ memset(gl, 0, sizeof(struct gfs_glock));
++
++ INIT_LIST_HEAD(&gl->gl_list);
++ gl->gl_name = name;
++ atomic_set(&gl->gl_count, 1);
++
++ spin_lock_init(&gl->gl_spin);
++
++ gl->gl_state = LM_ST_UNLOCKED;
++ INIT_LIST_HEAD(&gl->gl_holders);
++ INIT_LIST_HEAD(&gl->gl_waiters1);
++ INIT_LIST_HEAD(&gl->gl_waiters2);
++
++ gl->gl_ops = glops;
++
++ INIT_LE(&gl->gl_new_le, &gfs_glock_lops);
++ INIT_LE(&gl->gl_incore_le, &gfs_glock_lops);
++
++ gl->gl_bucket = bucket;
++ INIT_LIST_HEAD(&gl->gl_reclaim);
++
++ gl->gl_sbd = sdp;
++
++ INIT_LIST_HEAD(&gl->gl_dirty_buffers);
++ INIT_LIST_HEAD(&gl->gl_ail_bufs);
++
++ if (glops == &gfs_inode_glops ||
++ glops == &gfs_rgrp_glops ||
++ glops == &gfs_meta_glops) {
++ gl->gl_aspace = gfs_aspace_get(sdp);
++ if (!gl->gl_aspace) {
++ error = -ENOMEM;
++ goto fail;
++ }
++ }
++
++ error = sdp->sd_lockstruct.ls_ops->lm_get_lock(sdp->sd_lockstruct.ls_lockspace,
++ &name,
++ &gl->gl_lock);
++ if (error)
++ goto fail_aspace;
++
++ atomic_inc(&sdp->sd_glock_count);
++
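++ /* Another CPU may have created and inserted the same glock while we
++ were allocating; re-search under the write lock and discard our
++ copy if we lost the race. */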
++ write_lock(&bucket->hb_lock);
++ tmp = search_bucket(bucket, &name);
++ if (tmp) {
++ write_unlock(&bucket->hb_lock);
++ glock_free(gl);
++ gl = tmp;
++ } else {
++ list_add_tail(&gl->gl_list, &bucket->hb_list);
++ write_unlock(&bucket->hb_lock);
++ }
++
++ *glp = gl;
++
++ return 0;
++
++ fail_aspace:
++ if (gl->gl_aspace)
++ gfs_aspace_put(gl->gl_aspace);
++
++ fail:
++ kmem_cache_free(gfs_glock_cachep, gl);
++
++ return error;
++}
++
++/**
++ * gfs_glock_hold() - As glock_hold(), but suitable for exporting
++ * @gl: The glock to hold
++ *
++ */
++
++void
++gfs_glock_hold(struct gfs_glock *gl)
++{
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_count) > 0, gl,);
++ glock_hold(gl);
++}
++
++/**
++ * gfs_glock_put() - As glock_put(), but suitable for exporting
++ * @gl: The glock to put
++ *
++ */
++
++void
++gfs_glock_put(struct gfs_glock *gl)
++{
++ glock_put(gl);
++}
++
++/**
++ * gfs_holder_init - initialize a struct gfs_holder in the default way
++ * @gl: the glock
++ * @state: the state we're requesting
++ * @flags: the modifier flags
++ * @gh: the holder structure
++ *
++ */
++
++void
++gfs_holder_init(struct gfs_glock *gl, unsigned int state, int flags,
++ struct gfs_holder *gh)
++{
++ memset(gh, 0, sizeof(struct gfs_holder));
++
++ INIT_LIST_HEAD(&gh->gh_list);
++ gh->gh_gl = gl;
++ gh->gh_owner = current;
++ gh->gh_state = state;
++ gh->gh_flags = flags;
++
++ if (gh->gh_state == LM_ST_EXCLUSIVE)
++ gh->gh_flags |= GL_LOCAL_EXCL;
++
++ init_completion(&gh->gh_wait);
++
++ glock_hold(gl);
++}
++
++/**
++ * gfs_holder_reinit - reinitialize a struct gfs_holder so we can requeue it
++ * @state: the state we're requesting
++ * @flags: the modifier flags
++ * @gh: the holder structure
++ *
++ * Don't mess with the glock.
++ *
++ */
++
++void
++gfs_holder_reinit(unsigned int state, int flags, struct gfs_holder *gh)
++{
++ int alloced;
++
++ GFS_ASSERT_GLOCK(list_empty(&gh->gh_list), gh->gh_gl,);
++
++ gh->gh_state = state;
++ gh->gh_flags = flags;
++
++ if (gh->gh_state == LM_ST_EXCLUSIVE)
++ gh->gh_flags |= GL_LOCAL_EXCL;
++
++ alloced = test_bit(HIF_ALLOCED, &gh->gh_iflags);
++ memset(&gh->gh_iflags, 0, sizeof(unsigned long));
++ if (alloced)
++ set_bit(HIF_ALLOCED, &gh->gh_iflags);
++}
++
++/**
++ * gfs_holder_uninit - uninitialize a holder structure (drop reference on glock)
++ * @gh: the holder structure
++ *
++ */
++
++void
++gfs_holder_uninit(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++
++ GFS_ASSERT_GLOCK(list_empty(&gh->gh_list), gl,);
++ gh->gh_gl = NULL;
++
++ glock_put(gl);
++}
++
++/**
++ * gfs_holder_get - get a struct gfs_holder structure
++ * @gl: the glock
++ * @state: the state we're requesting
++ * @flags: the modifier flags
++ *
++ * Figure out how big an impact this function has. Either:
++ * 1) Replace it with a cache of structures hanging off the struct gfs_sbd
++ * 2) Get rid of it and call gmalloc() directly
++ * 3) Leave it like it is
++ *
++ * Returns: the holder structure
++ */
++
++struct gfs_holder *
++gfs_holder_get(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ struct gfs_holder *gh;
++
++ gh = gmalloc(sizeof(struct gfs_holder));
++ gfs_holder_init(gl, state, flags, gh);
++ set_bit(HIF_ALLOCED, &gh->gh_iflags);
++
++ return gh;
++}
++
++/**
++ * gfs_holder_put - get rid of a struct gfs_holder structure
++ * @gh: the holder structure
++ *
++ */
++
++void
++gfs_holder_put(struct gfs_holder *gh)
++{
++ GFS_ASSERT_GLOCK(test_bit(HIF_ALLOCED, &gh->gh_iflags), gh->gh_gl,);
++ gfs_holder_uninit(gh);
++ kfree(gh);
++}
++
++/**
++ * handle_recurse - put other holder structures (marked recursive) into the holders list
++ * @gh: the holder structure
++ *
++ */
++
++static void
++handle_recurse(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct list_head *tmp, *head, *next;
++ struct gfs_holder *tmp_gh;
++ int found = FALSE;
++
++ GFS_ASSERT_GLOCK(gh->gh_owner, gl,);
++
++ for (head = &gl->gl_waiters2, tmp = head->next, next = tmp->next;
++ tmp != head;
++ tmp = next, next = tmp->next) {
++ tmp_gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (tmp_gh->gh_owner != gh->gh_owner)
++ continue;
++
++ GFS_ASSERT_GLOCK(test_bit(HIF_RECURSE, &tmp_gh->gh_iflags),
++ gl,);
++
++ list_move_tail(&tmp_gh->gh_list, &gl->gl_holders);
++ tmp_gh->gh_error = 0;
++ set_bit(HIF_HOLDER, &tmp_gh->gh_iflags);
++
++ complete(&tmp_gh->gh_wait);
++
++ found = TRUE;
++ }
++
++ GFS_ASSERT_GLOCK(found, gl,);
++}
++
++/**
++ * do_unrecurse - a recursive holder was just dropped off the waiters2 list
++ * @gh: the holder
++ *
++ * If there is only one other recursive holder left, clear its HIF_RECURSE bit.
++ * If there is more than one, leave them alone.
++ *
++ */
++
++static void
++do_unrecurse(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct list_head *tmp, *head;
++ struct gfs_holder *tmp_gh, *last_gh = NULL;
++ int found = FALSE;
++
++ GFS_ASSERT_GLOCK(gh->gh_owner, gl,);
++
++ for (head = &gl->gl_waiters2, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ tmp_gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (tmp_gh->gh_owner != gh->gh_owner)
++ continue;
++
++ GFS_ASSERT_GLOCK(test_bit(HIF_RECURSE, &tmp_gh->gh_iflags),
++ gl,);
++
++ if (found)
++ return;
++
++ found = TRUE;
++ last_gh = tmp_gh;
++ }
++
++ GFS_ASSERT_GLOCK(found, gl,);
++ clear_bit(HIF_RECURSE, &last_gh->gh_iflags);
++}
++
++/**
++ * rq_mutex - process a mutex request in the queue
++ * @gh: the glock holder
++ *
++ * Returns: TRUE if the queue is blocked
++ */
++
++static int
++rq_mutex(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++
++ list_del_init(&gh->gh_list);
++ /* gh->gh_error never examined. */
++ set_bit(GLF_LOCK, &gl->gl_flags);
++ complete(&gh->gh_wait);
++
++ return TRUE;
++}
++
++/**
++ * rq_promote - process a promote request in the queue
++ * @gh: the glock holder
++ * @promote_ok: It's ok to ask the LM to do promotes on a sync lock module
++ *
++ * Returns: TRUE if the queue is blocked
++ */
++
++static int
++rq_promote(struct gfs_holder *gh, int promote_ok)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ int recurse;
++
++ if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
++ if (list_empty(&gl->gl_holders)) {
++ if (promote_ok || GFS_ASYNC_LM(sdp)) {
++ gl->gl_req_gh = gh;
++ set_bit(GLF_LOCK, &gl->gl_flags);
++ spin_unlock(&gl->gl_spin);
++
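++ /* Promotes add cached lock state; if the reclaim list has
++ grown past the tunable limit, reclaim a couple of glocks
++ before issuing another lock request. */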
++ if (atomic_read(&sdp->sd_reclaim_count) >
++ sdp->sd_tune.gt_reclaim_limit &&
++ !(gh->gh_flags & LM_FLAG_PRIORITY)) {
++ gfs_reclaim_glock(sdp);
++ gfs_reclaim_glock(sdp);
++ }
++
++ glops->go_xmote_th(gl, gh->gh_state,
++ gh->gh_flags);
++
++ spin_lock(&gl->gl_spin);
++ } else
++ if (!test_and_set_bit(HIF_WAKEUP, &gh->gh_iflags))
++ complete(&gh->gh_wait);
++ }
++ return TRUE;
++ }
++
++ if (list_empty(&gl->gl_holders)) {
++ set_bit(HIF_FIRST, &gh->gh_iflags);
++ set_bit(GLF_LOCK, &gl->gl_flags);
++ recurse = FALSE;
++ } else {
++ struct gfs_holder *next_gh;
++ if (gh->gh_flags & GL_LOCAL_EXCL)
++ return TRUE;
++ next_gh = list_entry(gl->gl_holders.next, struct gfs_holder, gh_list);
++ if (next_gh->gh_flags & GL_LOCAL_EXCL)
++ return TRUE;
++ recurse = test_bit(HIF_RECURSE, &gh->gh_iflags);
++ }
++
++ list_move_tail(&gh->gh_list, &gl->gl_holders);
++ gh->gh_error = 0;
++ set_bit(HIF_HOLDER, &gh->gh_iflags);
++
++ if (recurse)
++ handle_recurse(gh);
++
++ complete(&gh->gh_wait);
++
++ return FALSE;
++}
++
++/**
++ * rq_demote - process a demote request in the queue
++ * @gh: the glock holder
++ *
++ * Returns: TRUE if the queue is blocked
++ */
++
++static int
++rq_demote(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++
++ if (!list_empty(&gl->gl_holders))
++ return TRUE;
++
++ if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
++ list_del_init(&gh->gh_list);
++ gh->gh_error = 0;
++ spin_unlock(&gl->gl_spin);
++ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
++ gfs_holder_put(gh);
++ else
++ complete(&gh->gh_wait);
++ spin_lock(&gl->gl_spin);
++ } else {
++ gl->gl_req_gh = gh;
++ set_bit(GLF_LOCK, &gl->gl_flags);
++ spin_unlock(&gl->gl_spin);
++
++ if (gh->gh_state == LM_ST_UNLOCKED ||
++ gl->gl_state != LM_ST_EXCLUSIVE)
++ glops->go_drop_th(gl);
++ else
++ glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
++
++ spin_lock(&gl->gl_spin);
++ }
++
++ return FALSE;
++}
++
++/**
++ * run_queue - process holder structures on a glock
++ * @gl: the glock
++ * @promote_ok: It's ok to ask the LM to do promotes on a sync lock module
++ *
++ */
++
++static void
++run_queue(struct gfs_glock *gl, int promote_ok)
++{
++ struct gfs_holder *gh;
++ int blocked;
++
++ for (;;) {
++ if (test_bit(GLF_LOCK, &gl->gl_flags))
++ break;
++
++ if (!list_empty(&gl->gl_waiters1)) {
++ gh = list_entry(gl->gl_waiters1.next,
++ struct gfs_holder, gh_list);
++
++ if (test_bit(HIF_MUTEX, &gh->gh_iflags))
++ blocked = rq_mutex(gh);
++ else
++ GFS_ASSERT_GLOCK(FALSE, gl,);
++
++ } else if (!list_empty(&gl->gl_waiters2)) {
++ gh = list_entry(gl->gl_waiters2.next,
++ struct gfs_holder, gh_list);
++
++ if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
++ blocked = rq_promote(gh, promote_ok);
++ else if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
++ blocked = rq_demote(gh);
++ else
++ GFS_ASSERT_GLOCK(FALSE, gl,);
++
++ } else
++ break;
++
++ if (blocked)
++ break;
++ }
++}
++
++/**
++ * lock_on_glock - acquire a local lock on a glock
++ * @gl: the glock
++ *
++ */
++
++static void
++lock_on_glock(struct gfs_glock *gl)
++{
++ struct gfs_holder gh;
++
++ gfs_holder_init(gl, 0, 0, &gh);
++ set_bit(HIF_MUTEX, &gh.gh_iflags);
++
++ spin_lock(&gl->gl_spin);
++ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
++ list_add_tail(&gh.gh_list, &gl->gl_waiters1);
++ else
++ complete(&gh.gh_wait);
++ spin_unlock(&gl->gl_spin);
++
++ wait_for_completion(&gh.gh_wait);
++ gfs_holder_uninit(&gh);
++}
++
++/**
++ * trylock_on_glock - try to acquire a local lock on a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if the glock is acquired
++ */
++
++static int
++trylock_on_glock(struct gfs_glock *gl)
++{
++ int acquired = TRUE;
++
++ spin_lock(&gl->gl_spin);
++ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
++ acquired = FALSE;
++ spin_unlock(&gl->gl_spin);
++
++ return acquired;
++}
++
++/**
++ * unlock_on_glock - release a local lock on a glock
++ * @gl: the glock
++ *
++ */
++
++static void
++unlock_on_glock(struct gfs_glock *gl)
++{
++ spin_lock(&gl->gl_spin);
++ clear_bit(GLF_LOCK, &gl->gl_flags);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++}
++
++/**
++ * handle_callback - add a demote request to a lock's queue
++ * @gl: the glock
++ * @state: the state the callback asks us to change to
++ *
++ */
++
++static void
++handle_callback(struct gfs_glock *gl, unsigned int state)
++{
++ struct list_head *tmp, *head;
++ struct gfs_holder *gh, *new_gh = NULL;
++
++ GFS_ASSERT_GLOCK(state != LM_ST_EXCLUSIVE, gl,);
++
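++ /* We cannot allocate while holding gl_spin, so the demote holder is
++ allocated with the lock dropped and the scan retried; if an
++ equivalent demote request was queued meanwhile, ours is freed. */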
++ restart:
++ spin_lock(&gl->gl_spin);
++
++ for (head = &gl->gl_waiters2, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
++ gl->gl_req_gh != gh) {
++ if (gh->gh_state != state)
++ gh->gh_state = LM_ST_UNLOCKED;
++ goto out;
++ }
++ }
++
++ if (new_gh) {
++ list_add(&new_gh->gh_list, &gl->gl_waiters2);
++ new_gh = NULL;
++ } else {
++ spin_unlock(&gl->gl_spin);
++
++ new_gh = gfs_holder_get(gl, state, LM_FLAG_TRY);
++ set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
++ set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
++ new_gh->gh_owner = NULL;
++
++ goto restart;
++ }
++
++ out:
++ spin_unlock(&gl->gl_spin);
++
++ if (new_gh)
++ gfs_holder_put(new_gh);
++}
++
++/**
++ * state_change - record that the glock is now in a different state
++ * @gl: the glock
++ * @new_state: the new state
++ *
++ */
++
++static void
++state_change(struct gfs_glock *gl, unsigned int new_state)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ int held1, held2;
++
++ held1 = (gl->gl_state != LM_ST_UNLOCKED);
++ held2 = (new_state != LM_ST_UNLOCKED);
++
++ if (held1 != held2) {
++ if (held2) {
++ atomic_inc(&sdp->sd_glock_held_count);
++ glock_hold(gl);
++ } else {
++ atomic_dec(&sdp->sd_glock_held_count);
++ glock_put(gl);
++ }
++ }
++
++ gl->gl_state = new_state;
++}
++
++/**
++ * xmote_bh - Called after the lock module is done acquiring a lock
++ * @gl: The glock in question
++ * @ret: the int returned from the lock module
++ *
++ */
++
++static void
++xmote_bh(struct gfs_glock *gl, unsigned int ret)
++{
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ struct gfs_holder *gh = gl->gl_req_gh;
++ int prev_state = gl->gl_state;
++ int op_done = TRUE;
++
++ GFS_ASSERT_GLOCK(test_bit(GLF_LOCK, &gl->gl_flags), gl,);
++ GFS_ASSERT_GLOCK(queue_empty(gl, &gl->gl_holders), gl,);
++ GFS_ASSERT_GLOCK(!(ret & LM_OUT_ASYNC), gl,);
++
++ state_change(gl, ret & LM_OUT_ST_MASK);
++
++ if (ret & LM_OUT_NEED_E)
++ handle_callback(gl, LM_ST_UNLOCKED);
++ else if (ret & LM_OUT_NEED_D)
++ handle_callback(gl, LM_ST_DEFERRED);
++ else if (ret & LM_OUT_NEED_S)
++ handle_callback(gl, LM_ST_SHARED);
++
++ if (ret & LM_OUT_LVB_INVALID)
++ set_bit(GLF_LVB_INVALID, &gl->gl_flags);
++
++ if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
++ if (glops->go_inval)
++ glops->go_inval(gl, DIO_METADATA | DIO_DATA);
++ } else if (gl->gl_state == LM_ST_DEFERRED) {
++ /* We might not want to do this here.
++ Look at moving to the inode glops. */
++ if (glops->go_inval)
++ glops->go_inval(gl, DIO_DATA);
++ }
++
++ /* Deal with each possible exit condition */
++
++ if (!gh)
++ gl->gl_stamp = jiffies;
++
++ else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ if (gl->gl_state == gh->gh_state ||
++ gl->gl_state == LM_ST_UNLOCKED)
++ gh->gh_error = 0;
++ else
++ gh->gh_error = GLR_TRYFAILED;
++ spin_unlock(&gl->gl_spin);
++
++ if (ret & LM_OUT_CANCELED)
++ handle_callback(gl, LM_ST_UNLOCKED); /* Lame */
++
++ } else if (ret & LM_OUT_CANCELED) {
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ gh->gh_error = GLR_CANCELED;
++ if (test_bit(HIF_RECURSE, &gh->gh_iflags))
++ do_unrecurse(gh);
++ spin_unlock(&gl->gl_spin);
++
++ } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
++ spin_lock(&gl->gl_spin);
++ list_move_tail(&gh->gh_list, &gl->gl_holders);
++ gh->gh_error = 0;
++ set_bit(HIF_HOLDER, &gh->gh_iflags);
++ spin_unlock(&gl->gl_spin);
++
++ set_bit(HIF_FIRST, &gh->gh_iflags);
++
++ op_done = FALSE;
++
++ } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ gh->gh_error = GLR_TRYFAILED;
++ if (test_bit(HIF_RECURSE, &gh->gh_iflags))
++ do_unrecurse(gh);
++ spin_unlock(&gl->gl_spin);
++
++ } else
++ GFS_ASSERT_GLOCK(FALSE, gl,);
++
++ if (glops->go_xmote_bh)
++ glops->go_xmote_bh(gl);
++
++ if (op_done) {
++ spin_lock(&gl->gl_spin);
++ gl->gl_req_gh = NULL;
++ gl->gl_req_bh = NULL;
++ clear_bit(GLF_LOCK, &gl->gl_flags);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++ }
++
++ glock_put(gl);
++
++ if (gh) {
++ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
++ gfs_holder_put(gh);
++ else
++ complete(&gh->gh_wait);
++ }
++}
++
++/**
++ * gfs_glock_xmote_th - Call into the lock module to acquire a glock
++ * @gl: The glock in question
++ * @state: the requested state
++ * @flags: modifier flags to the lock call
++ *
++ */
++
++void
++gfs_glock_xmote_th(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
++ LM_FLAG_NOEXP | LM_FLAG_ANY |
++ LM_FLAG_PRIORITY);
++ unsigned int lck_ret;
++
++ GFS_ASSERT_GLOCK(test_bit(GLF_LOCK, &gl->gl_flags), gl,);
++ GFS_ASSERT_GLOCK(queue_empty(gl, &gl->gl_holders), gl,);
++ GFS_ASSERT_GLOCK(state != LM_ST_UNLOCKED, gl,);
++ GFS_ASSERT_GLOCK(state != gl->gl_state, gl,);
++
++ if (gl->gl_state == LM_ST_EXCLUSIVE) {
++ if (glops->go_sync)
++ glops->go_sync(gl, DIO_METADATA | DIO_DATA);
++ }
++
++ glock_hold(gl);
++ gl->gl_req_bh = xmote_bh;
++
++ atomic_inc(&sdp->sd_lm_lock_calls);
++
++ lck_ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl->gl_lock,
++ gl->gl_state,
++ state, lck_flags);
++
++ if (lck_ret & LM_OUT_ASYNC)
++ GFS_ASSERT_GLOCK(lck_ret == LM_OUT_ASYNC, gl,);
++ else
++ xmote_bh(gl, lck_ret);
++}
++
++/**
++ * drop_bh - Called after a lock module unlock completes
++ * @gl: the glock
++ * @ret: the return status
++ *
++ * Doesn't wake up the process waiting on the struct gfs_holder (if any)
++ * Doesn't drop the reference on the glock the top half took out
++ *
++ */
++
++static void
++drop_bh(struct gfs_glock *gl, unsigned int ret)
++{
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ struct gfs_holder *gh = gl->gl_req_gh;
++
++ clear_bit(GLF_PREFETCH, &gl->gl_flags);
++
++ GFS_ASSERT_GLOCK(test_bit(GLF_LOCK, &gl->gl_flags), gl,);
++ GFS_ASSERT_GLOCK(queue_empty(gl, &gl->gl_holders), gl,);
++ GFS_ASSERT_GLOCK(!ret, gl,);
++
++ state_change(gl, LM_ST_UNLOCKED);
++
++ if (glops->go_inval)
++ glops->go_inval(gl, DIO_METADATA | DIO_DATA);
++
++ if (gh) {
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ gh->gh_error = 0;
++ spin_unlock(&gl->gl_spin);
++ }
++
++ if (glops->go_drop_bh)
++ glops->go_drop_bh(gl);
++
++ spin_lock(&gl->gl_spin);
++ gl->gl_req_gh = NULL;
++ gl->gl_req_bh = NULL;
++ clear_bit(GLF_LOCK, &gl->gl_flags);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++
++ glock_put(gl);
++
++ if (gh) {
++ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
++ gfs_holder_put(gh);
++ else
++ complete(&gh->gh_wait);
++ }
++}
++
++/**
++ * gfs_glock_drop_th - call into the lock module to unlock a lock
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_glock_drop_th(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ unsigned int ret;
++
++ GFS_ASSERT_GLOCK(test_bit(GLF_LOCK, &gl->gl_flags), gl,);
++ GFS_ASSERT_GLOCK(queue_empty(gl, &gl->gl_holders), gl,);
++ GFS_ASSERT_GLOCK(gl->gl_state != LM_ST_UNLOCKED, gl,);
++
++ if (gl->gl_state == LM_ST_EXCLUSIVE) {
++ if (glops->go_sync)
++ glops->go_sync(gl, DIO_METADATA | DIO_DATA);
++ }
++
++ glock_hold(gl);
++ gl->gl_req_bh = drop_bh;
++
++ atomic_inc(&sdp->sd_lm_unlock_calls);
++
++ ret = sdp->sd_lockstruct.ls_ops->lm_unlock(gl->gl_lock, gl->gl_state);
++
++ if (!ret)
++ drop_bh(gl, ret);
++ else
++ GFS_ASSERT_GLOCK(ret == LM_OUT_ASYNC, gl,);
++}
++
++/**
++ * handle_cancels - cancel requests for locks stuck waiting on an expire flag
++ * @gh: the LM_FLAG_NOEXP holder waiting to acquire the lock
++ *
++ */
++
++static void
++handle_cancels(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++
++ spin_lock(&gl->gl_spin);
++
++ while (gl->gl_req_gh != gh &&
++ !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
++ !test_bit(HIF_WAKEUP, &gh->gh_iflags) &&
++ !list_empty(&gh->gh_list)) {
++ if (gl->gl_req_bh) {
++ spin_unlock(&gl->gl_spin);
++ gl->gl_sbd->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
++ yield();
++ spin_lock(&gl->gl_spin);
++ } else {
++ spin_unlock(&gl->gl_spin);
++ yield();
++ spin_lock(&gl->gl_spin);
++ }
++ }
++
++ spin_unlock(&gl->gl_spin);
++}
++
++/**
++ * glock_wait_internal - wait on a glock acquisition
++ * @gh: the glock holder
++ *
++ * Returns: 0 on success
++ */
++
++static int
++glock_wait_internal(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ int error = 0;
++
++ if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
++ spin_lock(&gl->gl_spin);
++ if (gl->gl_req_gh != gh &&
++ !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
++ !test_bit(HIF_WAKEUP, &gh->gh_iflags) &&
++ !list_empty(&gh->gh_list)) {
++ list_del_init(&gh->gh_list);
++ gh->gh_error = GLR_TRYFAILED;
++ if (test_bit(HIF_RECURSE, &gh->gh_iflags))
++ do_unrecurse(gh);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++ return GLR_TRYFAILED;
++ }
++ spin_unlock(&gl->gl_spin);
++ }
++
++ if (gh->gh_flags & LM_FLAG_NOEXP)
++ handle_cancels(gh);
++
++ for (;;) {
++ wait_for_completion(&gh->gh_wait);
++
++ spin_lock(&gl->gl_spin);
++ if (test_and_clear_bit(HIF_WAKEUP, &gh->gh_iflags)) {
++ run_queue(gl, TRUE);
++ spin_unlock(&gl->gl_spin);
++ } else {
++ spin_unlock(&gl->gl_spin);
++ break;
++ }
++ }
++
++ if (gh->gh_error)
++ return gh->gh_error;
++
++ GFS_ASSERT_GLOCK(test_bit(HIF_HOLDER, &gh->gh_iflags), gl,);
++ GFS_ASSERT_GLOCK(relaxed_state_ok(gl->gl_state, gh->gh_state,
++ gh->gh_flags), gl,);
++
++ if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
++ GFS_ASSERT_GLOCK(test_bit(GLF_LOCK, &gl->gl_flags), gl,);
++
++ if (glops->go_lock) {
++ error = glops->go_lock(gl, gh->gh_flags);
++ if (error) {
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ gh->gh_error = error;
++ if (test_and_clear_bit(HIF_RECURSE, &gh->gh_iflags))
++ do_unrecurse(gh);
++ spin_unlock(&gl->gl_spin);
++ }
++ }
++
++ spin_lock(&gl->gl_spin);
++ gl->gl_req_gh = NULL;
++ gl->gl_req_bh = NULL;
++ clear_bit(GLF_LOCK, &gl->gl_flags);
++ if (test_bit(HIF_RECURSE, &gh->gh_iflags))
++ handle_recurse(gh);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++ }
++
++ return error;
++}
++
++/**
++ * add_to_queue - Add a holder to the wait queue (but look for recursion)
++ * @gh: the holder structure
++ *
++ */
++
++static void
++add_to_queue(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct list_head *tmp, *head;
++ struct gfs_holder *tmp_gh;
++
++ if (gh->gh_owner) {
++ for (head = &gl->gl_holders, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ tmp_gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (tmp_gh->gh_owner == gh->gh_owner) {
++ GFS_ASSERT_GLOCK((gh->gh_flags & LM_FLAG_ANY) ||
++ !(tmp_gh->gh_flags & LM_FLAG_ANY),
++ gl,);
++ GFS_ASSERT_GLOCK((tmp_gh->gh_flags & GL_LOCAL_EXCL) ||
++ !(gh->gh_flags & GL_LOCAL_EXCL),
++ gl,);
++ GFS_ASSERT_GLOCK(relaxed_state_ok(gl->gl_state,
++ gh->gh_state,
++ gh->gh_flags),
++ gl,);
++
++ list_add_tail(&gh->gh_list, &gl->gl_holders);
++ set_bit(HIF_HOLDER, &gh->gh_iflags);
++
++ gh->gh_error = 0;
++ complete(&gh->gh_wait);
++
++ return;
++ }
++ }
++
++ for (head = &gl->gl_waiters2, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ tmp_gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (tmp_gh->gh_owner == gh->gh_owner) {
++ GFS_ASSERT_GLOCK(test_bit(HIF_PROMOTE,
++ &tmp_gh->gh_iflags),
++ gl,);
++ GFS_ASSERT_GLOCK((gh->gh_flags & LM_FLAG_ANY) ||
++ !(tmp_gh->gh_flags & LM_FLAG_ANY),
++ gl,);
++ GFS_ASSERT_GLOCK((tmp_gh->gh_flags & GL_LOCAL_EXCL) ||
++ !(gh->gh_flags & GL_LOCAL_EXCL),
++ gl,);
++ GFS_ASSERT_GLOCK(relaxed_state_ok(tmp_gh->gh_state,
++ gh->gh_state,
++ gh->gh_flags),
++ gl,);
++
++ set_bit(HIF_RECURSE, &gh->gh_iflags);
++ set_bit(HIF_RECURSE, &tmp_gh->gh_iflags);
++
++ list_add_tail(&gh->gh_list, &gl->gl_waiters2);
++
++ return;
++ }
++ }
++ }
++
++ if (gh->gh_flags & LM_FLAG_PRIORITY)
++ list_add(&gh->gh_list, &gl->gl_waiters2);
++ else
++ list_add_tail(&gh->gh_list, &gl->gl_waiters2);
++}
++
++/**
++ * gfs_glock_nq - enqueue a struct gfs_holder onto a glock (acquire a glock)
++ * @gh: the holder structure
++ *
++ * if (gh->gh_flags & GL_ASYNC), this never returns an error
++ *
++ * Returns: 0, GLR_TRYFAILED, or -EXXX on failure
++ */
++
++int
++gfs_glock_nq(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ int error = 0;
++
++ GFS_ASSERT_GLOCK(list_empty(&gh->gh_list), gl,);
++ GFS_ASSERT_GLOCK(gh->gh_state != LM_ST_UNLOCKED, gl,);
++ GFS_ASSERT_GLOCK((gh->gh_flags & (LM_FLAG_ANY | GL_EXACT)) !=
++ (LM_FLAG_ANY | GL_EXACT), gl,);
++ GFS_ASSERT_GLOCK(GFS_ASYNC_LM(sdp) ||
++ !(gh->gh_flags & GL_ASYNC), gl,);
++
++ atomic_inc(&sdp->sd_glock_nq_calls);
++
++ restart:
++ set_bit(HIF_PROMOTE, &gh->gh_iflags);
++
++ spin_lock(&gl->gl_spin);
++ add_to_queue(gh);
++ run_queue(gl, TRUE);
++ spin_unlock(&gl->gl_spin);
++
++ if (!(gh->gh_flags & GL_ASYNC)) {
++ error = glock_wait_internal(gh);
++ if (error == GLR_CANCELED) {
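++ /* The request was canceled (see handle_cancels());
++ back off for a second and queue it again. */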
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ goto restart;
++ }
++ }
++
++ clear_bit(GLF_PREFETCH, &gl->gl_flags);
++
++ return error;
++}
++
++/**
++ * gfs_glock_poll - poll to see if an async request has been completed
++ * @gh: the holder
++ *
++ * Returns: TRUE if the request is ready to be gfs_glock_wait()ed on
++ */
++
++int
++gfs_glock_poll(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ int ready = FALSE;
++
++ GFS_ASSERT_GLOCK(gh->gh_flags & GL_ASYNC, gl,);
++ GFS_ASSERT_GLOCK(!test_bit(HIF_WAKEUP, &gh->gh_iflags), gl,);
++
++ spin_lock(&gl->gl_spin);
++
++ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
++ ready = TRUE;
++ else if (list_empty(&gh->gh_list)) {
++ if (gh->gh_error == GLR_CANCELED) {
++ spin_unlock(&gl->gl_spin);
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ gfs_glock_nq(gh);
++ return FALSE;
++ } else
++ ready = TRUE;
++ }
++
++ spin_unlock(&gl->gl_spin);
++
++ return ready;
++}
++
++/**
++ * gfs_glock_wait - wait for a GL_ASYNC lock acquisition to complete
++ * @gh: the holder structure
++ *
++ * Returns: 0, GLR_TRYFAILED, or -EXXX on failure
++ */
++
++int
++gfs_glock_wait(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ int error;
++
++ GFS_ASSERT_GLOCK(gh->gh_flags & GL_ASYNC, gl,);
++ GFS_ASSERT_GLOCK(!test_bit(HIF_WAKEUP, &gh->gh_iflags), gl,);
++
++ error = glock_wait_internal(gh);
++ if (error == GLR_CANCELED) {
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ gh->gh_flags &= ~GL_ASYNC;
++ error = gfs_glock_nq(gh);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_glock_dq - dequeue a struct gfs_holder from a glock (release a glock)
++ * @gh: the glock holder
++ *
++ */
++
++void
++gfs_glock_dq(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++
++ GFS_ASSERT_GLOCK(!queue_empty(gl, &gh->gh_list), gl,);
++ GFS_ASSERT_GLOCK(test_bit(HIF_HOLDER, &gh->gh_iflags), gl,);
++
++ atomic_inc(&gl->gl_sbd->sd_glock_dq_calls);
++
++ if (gh->gh_flags & GL_SYNC)
++ set_bit(GLF_SYNC, &gl->gl_flags);
++ if (gh->gh_flags & GL_NOCACHE)
++ handle_callback(gl, LM_ST_UNLOCKED);
++
++ lock_on_glock(gl);
++
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ if (list_empty(&gl->gl_holders)) {
++ spin_unlock(&gl->gl_spin);
++
++ if (glops->go_unlock)
++ glops->go_unlock(gl, gh->gh_flags);
++
++ if (test_bit(GLF_SYNC, &gl->gl_flags)) {
++ if (glops->go_sync)
++ glops->go_sync(gl,
++ DIO_METADATA |
++ DIO_DATA |
++ DIO_INVISIBLE);
++ }
++
++ gl->gl_stamp = jiffies;
++
++ spin_lock(&gl->gl_spin);
++ }
++
++ clear_bit(GLF_LOCK, &gl->gl_flags);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++}
++
++/**
++ * gfs_glock_prefetch - Try to prefetch a glock
++ * @gl: the glock
++ * @state: the state to prefetch in
++ * @flags: flags passed to go_xmote_th()
++ *
++ */
++
++void
++gfs_glock_prefetch(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ struct gfs_glock_operations *glops = gl->gl_ops;
++
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_count) > 0, gl,);
++ GFS_ASSERT_GLOCK(state != LM_ST_UNLOCKED, gl,);
++ GFS_ASSERT_GLOCK((flags & (LM_FLAG_ANY | GL_EXACT)) !=
++ (LM_FLAG_ANY | GL_EXACT), gl,);
++
++ spin_lock(&gl->gl_spin);
++
++ if (test_bit(GLF_LOCK, &gl->gl_flags) ||
++ !list_empty(&gl->gl_holders) ||
++ !list_empty(&gl->gl_waiters1) ||
++ !list_empty(&gl->gl_waiters2) ||
++ relaxed_state_ok(gl->gl_state, state, flags)) {
++ spin_unlock(&gl->gl_spin);
++ return;
++ }
++
++ set_bit(GLF_PREFETCH, &gl->gl_flags);
++
++ GFS_ASSERT_GLOCK(!gl->gl_req_gh, gl,);
++ set_bit(GLF_LOCK, &gl->gl_flags);
++ spin_unlock(&gl->gl_spin);
++
++ glops->go_xmote_th(gl, state, flags);
++
++ atomic_inc(&gl->gl_sbd->sd_glock_prefetch_calls);
++}
++
++/**
++ * gfs_glock_force_drop - Force a glock to be uncached
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_glock_force_drop(struct gfs_glock *gl)
++{
++ struct gfs_holder gh;
++
++ gfs_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
++ set_bit(HIF_DEMOTE, &gh.gh_iflags);
++ gh.gh_owner = NULL;
++
++ spin_lock(&gl->gl_spin);
++ list_add(&gh.gh_list, &gl->gl_waiters2);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++
++ wait_for_completion(&gh.gh_wait);
++ gfs_holder_uninit(&gh);
++}
++
++/**
++ * gfs_glock_nq_init - initialize a holder and enqueue it on a glock
++ * @gl: the glock
++ * @state: the state we're requesting
++ * @flags: the modifier flags
++ * @gh: the holder structure
++ *
++ * Returns: 0, GLR_*, or -EXXX
++ */
++
++int
++gfs_glock_nq_init(struct gfs_glock *gl, unsigned int state, int flags,
++ struct gfs_holder *gh)
++{
++ int error;
++
++ gfs_holder_init(gl, state, flags, gh);
++
++ error = gfs_glock_nq(gh);
++ if (error)
++ gfs_holder_uninit(gh);
++
++ return error;
++}
++
++/**
++ * gfs_glock_dq_uninit - dequeue a holder from a glock and uninitialize it
++ * @gh: the holder structure
++ *
++ */
++
++void
++gfs_glock_dq_uninit(struct gfs_holder *gh)
++{
++ gfs_glock_dq(gh);
++ gfs_holder_uninit(gh);
++}
++
++/**
++ * gfs_glock_nq_num - acquire a glock based on lock number
++ * @sdp: the filesystem
++ * @number: the lock number
++ * @glops: the glock operations for the type of glock
++ * @state: the state to acquire the glock in
++ * @flags: modifier flags for the acquisition
++ * @gh: the struct gfs_holder
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_glock_nq_num(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ unsigned int state, int flags, struct gfs_holder *gh)
++{
++ struct gfs_glock *gl;
++ int error;
++
++ error = gfs_glock_get(sdp, number, glops, CREATE, &gl);
++ if (!error) {
++ error = gfs_glock_nq_init(gl, state, flags, gh);
++ glock_put(gl);
++ }
++
++ return error;
++}
++
++/**
++ * glock_compare - Compare two struct gfs_glock structures for sorting
++ * @arg_a: the first structure
++ * @arg_b: the second structure
++ *
++ * Sort order is by lock number, then exclusive before shared and
++ * GL_LOCAL_EXCL holders first, so that nq_m_sync() always acquires a
++ * set of glocks in one canonical, deadlock-free order.
++ *
++ * Returns: -1, 0, or 1 to sort A before, with, or after B
++ */
++
++static int
++glock_compare(void *arg_a, void *arg_b)
++{
++ struct gfs_holder *gh_a = *(struct gfs_holder **)arg_a;
++ struct gfs_holder *gh_b = *(struct gfs_holder **)arg_b;
++ struct lm_lockname *a = &gh_a->gh_gl->gl_name;
++ struct lm_lockname *b = &gh_b->gh_gl->gl_name;
++ int ret = 0;
++
++ if (a->ln_number > b->ln_number)
++ ret = 1;
++ else if (a->ln_number < b->ln_number)
++ ret = -1;
++ else {
++ if (gh_a->gh_state == LM_ST_SHARED &&
++ gh_b->gh_state == LM_ST_EXCLUSIVE)
++ ret = 1;
++ else if (!(gh_a->gh_flags & GL_LOCAL_EXCL) &&
++ (gh_b->gh_flags & GL_LOCAL_EXCL))
++ ret = 1;
++ }
++
++ return ret;
++}
++
++/**
++ * nq_m_sync - synchronously acquire more than one glock in deadlock-free order
++ * @num_gh: the number of structures
++ * @ghs: an array of struct gfs_holder structures
++ *
++ * Returns: 0 on success (all glocks acquired), -EXXX on failure (no glocks acquired)
++ */
++
++static int
++nq_m_sync(unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct gfs_holder *p[num_gh];
++ unsigned int x;
++ int error = 0;
++
++ for (x = 0; x < num_gh; x++)
++ p[x] = &ghs[x];
++
++ gfs_sort(p, num_gh, sizeof(struct gfs_holder *), glock_compare);
++
++ for (x = 0; x < num_gh; x++) {
++ p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
++
++ error = gfs_glock_nq(p[x]);
++ if (error) {
++ while (x--)
++ gfs_glock_dq(p[x]);
++ break;
++ }
++ }
++
++ return error;
++}
++
++/**
++ * gfs_glock_nq_m - acquire multiple glocks
++ * @num_gh: the number of structures
++ * @ghs: an array of struct gfs_holder structures
++ *
++ * Figure out how big an impact this function has. Either:
++ * 1) Replace this code with code that calls gfs_glock_prefetch()
++ * 2) Forget async stuff and just call nq_m_sync()
++ * 3) Leave it like it is
++ *
++ * Returns: 0 on success (all glocks acquired), -EXXX on failure (no glocks acquired)
++ */
++
++int
++gfs_glock_nq_m(unsigned int num_gh, struct gfs_holder *ghs)
++{
++ int e[num_gh];
++ unsigned int x;
++ int borked = FALSE, serious = 0;
++ int error = 0;
++
++ GFS_ASSERT(num_gh,);
++
++ if (num_gh == 1) {
++ ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
++ error = gfs_glock_nq(ghs);
++ return error;
++ }
++
++ if (!GFS_ASYNC_LM(ghs->gh_gl->gl_sbd)) {
++ error = nq_m_sync(num_gh, ghs);
++ return error;
++ }
++
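++ /* Optimistically queue all requests asynchronously with LM_FLAG_TRY;
++ if any of them fail, drop whatever was acquired and fall back to
++ nq_m_sync()'s sorted, blocking pass. */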
++ for (x = 0; x < num_gh; x++) {
++ ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
++ gfs_glock_nq(&ghs[x]);
++ }
++
++ for (x = 0; x < num_gh; x++) {
++ error = e[x] = glock_wait_internal(&ghs[x]);
++ if (error) {
++ borked = TRUE;
++ if (error != GLR_TRYFAILED && error != GLR_CANCELED)
++ serious = error;
++ }
++ }
++
++ if (!borked)
++ return 0;
++
++ for (x = 0; x < num_gh; x++)
++ if (!e[x])
++ gfs_glock_dq(&ghs[x]);
++
++ if (serious)
++ error = serious;
++ else {
++ for (x = 0; x < num_gh; x++)
++ gfs_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
++ &ghs[x]);
++ error = nq_m_sync(num_gh, ghs);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_glock_dq_m - release multiple glocks
++ * @num_gh: the number of structures
++ * @ghs: an array of struct gfs_holder structures
++ *
++ */
++
++void
++gfs_glock_dq_m(unsigned int num_gh, struct gfs_holder *ghs)
++{
++ unsigned int x;
++
++ for (x = 0; x < num_gh; x++)
++ gfs_glock_dq(&ghs[x]);
++}
++
++/**
++ * gfs_glock_prefetch_num - prefetch a glock based on lock number
++ * @sdp: the filesystem
++ * @number: the lock number
++ * @glops: the glock operations for the type of glock
++ * @state: the state to acquire the glock in
++ * @flags: modifier flags for the acquisition
++ *
++ */
++
++void
++gfs_glock_prefetch_num(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ unsigned int state, int flags)
++{
++ struct gfs_glock *gl;
++ int error;
++
++ if (atomic_read(&sdp->sd_reclaim_count) < sdp->sd_tune.gt_reclaim_limit) {
++ error = gfs_glock_get(sdp, number, glops, CREATE, &gl);
++ if (!error) {
++ gfs_glock_prefetch(gl, state, flags);
++ glock_put(gl);
++ }
++ }
++}
++
++/**
++ * gfs_lvb_hold - attach an LVB to a glock
++ * @gl: The glock in question
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_lvb_hold(struct gfs_glock *gl)
++{
++ int error = 0;
++
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_count) > 0, gl,);
++
++ lock_on_glock(gl);
++
++ atomic_inc(&gl->gl_lvb_count);
++ if (atomic_read(&gl->gl_lvb_count) == 1) {
++ glock_hold(gl);
++ GFS_ASSERT_GLOCK(!gl->gl_lvb, gl,);
++ error = gl->gl_sbd->sd_lockstruct.ls_ops->lm_hold_lvb(gl->gl_lock,
++ &gl->gl_lvb);
++ if (error) {
++ glock_put(gl);
++ atomic_dec(&gl->gl_lvb_count);
++ }
++ }
++
++ unlock_on_glock(gl);
++
++ return error;
++}
++
++/**
++ * gfs_lvb_unhold - detach an LVB from a glock
++ * @gl: The glock in question
++ *
++ */
++
++void
++gfs_lvb_unhold(struct gfs_glock *gl)
++{
++ glock_hold(gl);
++
++ lock_on_glock(gl);
++
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_lvb_count), gl,);
++ if (atomic_dec_and_test(&gl->gl_lvb_count)) {
++ GFS_ASSERT_GLOCK(gl->gl_lvb, gl,);
++ gl->gl_sbd->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock,
++ gl->gl_lvb);
++ gl->gl_lvb = NULL;
++ glock_put(gl);
++ }
++
++ unlock_on_glock(gl);
++
++ glock_put(gl);
++}
++
++/**
++ * gfs_lvb_sync - sync a LVB
++ * @gl: The glock in question
++ *
++ */
++
++void
++gfs_lvb_sync(struct gfs_glock *gl)
++{
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_lvb_count), gl,);
++
++ lock_on_glock(gl);
++
++ GFS_ASSERT_GLOCK(gfs_glock_is_held_excl(gl), gl,);
++ gl->gl_sbd->sd_lockstruct.ls_ops->lm_sync_lvb(gl->gl_lock, gl->gl_lvb);
++
++ unlock_on_glock(gl);
++}
++
++/**
++ * gfs_glock_cb - Callback used by locking module
++ * @fsdata: Pointer to the superblock
++ * @type: Type of callback
++ * @data: Type dependent data pointer
++ *
++ * Called by the locking module when it wants to tell us something.
++ * Either we need to drop a lock or another client expired.
++ */
++
++void
++gfs_glock_cb(lm_fsdata_t * fsdata, unsigned int type, void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)fsdata;
++ struct gfs_glock *gl;
++ struct lm_lockname *name = NULL;
++ unsigned int state = 0;
++ struct lm_async_cb *async;
++ unsigned int journal;
++
++ atomic_inc(&sdp->sd_lm_callbacks);
++
++ switch (type) {
++ case LM_CB_NEED_E:
++ name = (struct lm_lockname *)data;
++ state = LM_ST_UNLOCKED;
++ break;
++
++ case LM_CB_NEED_D:
++ name = (struct lm_lockname *)data;
++ state = LM_ST_DEFERRED;
++ break;
++
++ case LM_CB_NEED_S:
++ name = (struct lm_lockname *)data;
++ state = LM_ST_SHARED;
++ break;
++
++ case LM_CB_ASYNC:
++ async = (struct lm_async_cb *)data;
++
++ gl = gfs_glock_find(sdp, &async->lc_name);
++ GFS_ASSERT_SBD(gl, sdp,);
++ GFS_ASSERT_GLOCK(gl->gl_req_bh, gl,);
++ gl->gl_req_bh(gl, async->lc_ret);
++ glock_put(gl);
++
++ break;
++
++ case LM_CB_NEED_RECOVERY:
++ journal = *(unsigned int *)data;
++
++ gfs_add_dirty_j(sdp, journal);
++
++ if (test_bit(SDF_RECOVERD_RUN, &sdp->sd_flags))
++ wake_up_process(sdp->sd_recoverd_process);
++
++ break;
++
++ case LM_CB_DROPLOCKS:
++ gfs_gl_hash_clear(sdp, FALSE);
++ gfs_quota_scan(sdp);
++ break;
++
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,
++ printk("type = %u\n", type););
++ break;
++ }
++
++ if (name) {
++ gl = gfs_glock_find(sdp, name);
++ if (gl) {
++ if (gl->gl_ops->go_callback)
++ gl->gl_ops->go_callback(gl, state);
++ handle_callback(gl, state);
++ spin_lock(&gl->gl_spin);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++ glock_put(gl);
++ }
++ }
++}
++
++/**
++ * gfs_try_toss_inode - try to remove a particular inode from GFS' cache
++ * @sdp: the filesystem
++ * @inum: the inode number
++ *
++ */
++
++void
++gfs_try_toss_inode(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ struct gfs_glock *gl;
++ struct gfs_inode *ip;
++ int error;
++
++ error = gfs_glock_get(sdp,
++ inum->no_formal_ino, &gfs_inode_glops,
++ NO_CREATE, &gl);
++ if (error || !gl)
++ return;
++
++ if (!trylock_on_glock(gl))
++ goto out;
++
++ if (!queue_empty(gl, &gl->gl_holders))
++ goto out_unlock;
++
++ ip = gl2ip(gl);
++ if (!ip)
++ goto out_unlock;
++
++ if (atomic_read(&ip->i_count))
++ goto out_unlock;
++
++ gfs_inode_destroy(ip);
++
++ out_unlock:
++ unlock_on_glock(gl);
++
++ out:
++ glock_put(gl);
++}
++
++/**
++ * gfs_iopen_go_callback - Try to kick the inode/vnode associated with an iopen glock from memory
++ * @io_gl: the iopen glock
++ * @state: the state into which the glock should be put
++ *
++ */
++
++void
++gfs_iopen_go_callback(struct gfs_glock *io_gl, unsigned int state)
++{
++ struct gfs_glock *i_gl;
++ struct gfs_inode *ip;
++
++ if (state != LM_ST_UNLOCKED)
++ return;
++
++ spin_lock(&io_gl->gl_spin);
++ i_gl = gl2gl(io_gl);
++ if (i_gl) {
++ glock_hold(i_gl);
++ spin_unlock(&io_gl->gl_spin);
++ } else {
++ spin_unlock(&io_gl->gl_spin);
++ return;
++ }
++
++ if (trylock_on_glock(i_gl)) {
++ if (queue_empty(i_gl, &i_gl->gl_holders)) {
++ ip = gl2ip(i_gl);
++ if (ip) {
++ gfs_try_toss_vnode(ip);
++ unlock_on_glock(i_gl);
++ gfs_glock_schedule_for_reclaim(i_gl);
++ goto out;
++ }
++ }
++ unlock_on_glock(i_gl);
++ }
++
++ out:
++ glock_put(i_gl);
++}
++
++/**
++ * demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++demote_ok(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ int demote = TRUE;
++
++ if (test_bit(GLF_STICKY, &gl->gl_flags))
++ demote = FALSE;
++ else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
++ demote = time_after_eq(jiffies,
++ gl->gl_stamp +
++ sdp->sd_tune.gt_prefetch_secs * HZ);
++ else if (glops->go_demote_ok)
++ demote = glops->go_demote_ok(gl);
++
++ return demote;
++}
++
++/**
++ * gfs_glock_schedule_for_reclaim - Add a glock to the reclaim list
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_glock_schedule_for_reclaim(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++
++ spin_lock(&sdp->sd_reclaim_lock);
++ if (list_empty(&gl->gl_reclaim)) {
++ glock_hold(gl);
++ list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
++ atomic_inc(&sdp->sd_reclaim_count);
++ }
++ spin_unlock(&sdp->sd_reclaim_lock);
++
++ wake_up(&sdp->sd_reclaim_wchan);
++}
++
++/**
++ * gfs_reclaim_glock - process a glock on the reclaim list
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_reclaim_glock(struct gfs_sbd *sdp)
++{
++ struct gfs_glock *gl;
++ struct gfs_gl_hash_bucket *bucket;
++
++ spin_lock(&sdp->sd_reclaim_lock);
++
++ if (list_empty(&sdp->sd_reclaim_list)) {
++ spin_unlock(&sdp->sd_reclaim_lock);
++ return;
++ }
++
++ gl = list_entry(sdp->sd_reclaim_list.next,
++ struct gfs_glock, gl_reclaim);
++ list_del_init(&gl->gl_reclaim);
++
++ spin_unlock(&sdp->sd_reclaim_lock);
++
++ atomic_dec(&sdp->sd_reclaim_count);
++ atomic_inc(&sdp->sd_reclaimed);
++
++ if (trylock_on_glock(gl)) {
++ if (queue_empty(gl, &gl->gl_holders)) {
++ if (gl->gl_ops == &gfs_inode_glops) {
++ struct gfs_inode *ip = gl2ip(gl);
++ if (ip && !atomic_read(&ip->i_count))
++ gfs_inode_destroy(ip);
++ }
++ if (gl->gl_state != LM_ST_UNLOCKED &&
++ demote_ok(gl))
++ handle_callback(gl, LM_ST_UNLOCKED);
++ }
++ unlock_on_glock(gl);
++ }
++
++ bucket = gl->gl_bucket;
++
++ write_lock(&bucket->hb_lock);
++ if (atomic_read(&gl->gl_count) == 1) {
++ list_del_init(&gl->gl_list);
++ write_unlock(&bucket->hb_lock);
++ glock_free(gl);
++ } else {
++ write_unlock(&bucket->hb_lock);
++ glock_put(gl);
++ }
++}
++
++/**
++ * examine_bucket - Call a function for each glock in a hash bucket
++ * @examiner: the function
++ * @sdp: the filesystem
++ * @bucket: the bucket
++ *
++ * Returns: TRUE if the bucket has entries
++ */
++
++static int
++examine_bucket(glock_examiner examiner,
++ struct gfs_sbd *sdp, struct gfs_gl_hash_bucket *bucket)
++{
++ struct glock_plug plug;
++ struct list_head *tmp;
++ struct gfs_glock *gl;
++ int entries;
++
++ memset(&plug.gl_flags, 0, sizeof(unsigned long));
++ set_bit(GLF_PLUG, &plug.gl_flags);
++
++ write_lock(&bucket->hb_lock);
++ list_add(&plug.gl_list, &bucket->hb_list);
++ write_unlock(&bucket->hb_lock);
++
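++ /* The plug is a dummy entry that marks our position in the bucket,
++ letting us drop the bucket lock while the examiner runs and then
++ resume the walk where we left off. */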
++ for (;;) {
++ write_lock(&bucket->hb_lock);
++
++ for (;;) {
++ tmp = plug.gl_list.next;
++ if (tmp == &bucket->hb_list) {
++ list_del(&plug.gl_list);
++ entries = !list_empty(&bucket->hb_list);
++ write_unlock(&bucket->hb_lock);
++ return entries;
++ }
++ gl = list_entry(tmp, struct gfs_glock, gl_list);
++
++ list_move(&plug.gl_list, &gl->gl_list);
++
++ if (test_bit(GLF_PLUG, &gl->gl_flags))
++ continue;
++
++ glock_hold(gl);
++
++ break;
++ }
++
++ write_unlock(&bucket->hb_lock);
++
++ examiner(gl);
++ }
++}
++
++/**
++ * scan_glock - look at a glock and see if we can do stuff to it
++ * @gl: the glock to look at
++ *
++ */
++
++static void
++scan_glock(struct gfs_glock *gl)
++{
++ if (trylock_on_glock(gl)) {
++ if (queue_empty(gl, &gl->gl_holders)) {
++ if (gl->gl_ops == &gfs_inode_glops) {
++ struct gfs_inode *ip = gl2ip(gl);
++ if (ip && !atomic_read(&ip->i_count)) {
++ unlock_on_glock(gl);
++ gfs_glock_schedule_for_reclaim(gl);
++ goto out;
++ }
++ }
++ if (gl->gl_state != LM_ST_UNLOCKED &&
++ demote_ok(gl)) {
++ unlock_on_glock(gl);
++ gfs_glock_schedule_for_reclaim(gl);
++ goto out;
++ }
++ }
++
++ unlock_on_glock(gl);
++ }
++
++ out:
++ glock_put(gl);
++}
++
++/**
++ * gfs_scand_internal - Look for glocks and inodes to toss from memory
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_scand_internal(struct gfs_sbd *sdp)
++{
++ unsigned int x;
++
++ for (x = 0; x < GFS_GL_HASH_SIZE; x++) {
++ examine_bucket(scan_glock, sdp, &sdp->sd_gl_hash[x]);
++ cond_resched();
++ }
++}
++
++/**
++ * clear_glock - look at a glock and see if we can do stuff to it
++ * @gl: the glock to look at
++ *
++ */
++
++static void
++clear_glock(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_gl_hash_bucket *bucket = gl->gl_bucket;
++
++ spin_lock(&sdp->sd_reclaim_lock);
++ if (!list_empty(&gl->gl_reclaim)) {
++ list_del_init(&gl->gl_reclaim);
++ atomic_dec(&sdp->sd_reclaim_count);
++ glock_put(gl);
++ }
++ spin_unlock(&sdp->sd_reclaim_lock);
++
++ if (trylock_on_glock(gl)) {
++ if (queue_empty(gl, &gl->gl_holders)) {
++ if (gl->gl_ops == &gfs_inode_glops) {
++ struct gfs_inode *ip = gl2ip(gl);
++ if (ip && !atomic_read(&ip->i_count))
++ gfs_inode_destroy(ip);
++ }
++ if (gl->gl_state != LM_ST_UNLOCKED)
++ handle_callback(gl, LM_ST_UNLOCKED);
++ }
++
++ unlock_on_glock(gl);
++ }
++
++ write_lock(&bucket->hb_lock);
++ if (atomic_read(&gl->gl_count) == 1) {
++ list_del_init(&gl->gl_list);
++ write_unlock(&bucket->hb_lock);
++ glock_free(gl);
++ } else {
++ write_unlock(&bucket->hb_lock);
++ glock_put(gl);
++ }
++}
++
++/**
++ * gfs_gl_hash_clear - Empty out the glock hash table
++ * @sdp: the filesystem
++ * @wait: wait until it's all gone
++ *
++ */
++
++void
++gfs_gl_hash_clear(struct gfs_sbd *sdp, int wait)
++{
++ unsigned long t;
++ unsigned int x;
++ int cont;
++
++ t = jiffies;
++
++ for (;;) {
++ cont = FALSE;
++
++ for (x = 0; x < GFS_GL_HASH_SIZE; x++)
++ if (examine_bucket(clear_glock, sdp, &sdp->sd_gl_hash[x]))
++ cont = TRUE;
++
++ if (!wait || !cont)
++ break;
++
++ if (time_after_eq(jiffies, t + sdp->sd_tune.gt_stall_secs * HZ)) {
++ printk("GFS: fsid=%s: Unmount seems to be stalled. Dumping lock state...\n",
++ sdp->sd_fsname);
++ gfs_dump_lockstate(sdp, NULL);
++ t = jiffies;
++ }
++
++ invalidate_inodes(sdp->sd_vfs);
++ yield();
++ }
++}
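++
++/*
++ * Usage sketch (an assumption): unmount would call
++ * gfs_gl_hash_clear(sdp, TRUE), which loops here until every bucket
++ * empties; the printk above fires if that stalls past gt_stall_secs.
++ */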
++
++/*
++ * Diagnostic routines to help debug distributed deadlock
++ */
++
++/**
++ * dump_holder - print information about a glock holder
++ * @str: a string naming the type of holder
++ * @gh: the glock holder
++ * @buf: the buffer
++ * @size: the size of the buffer
++ * @count: where we are in the buffer
++ *
++ * Returns: 0 on success, -ENOBUFS when we run out of space
++ */
++
++static int
++dump_holder(char *str, struct gfs_holder *gh,
++ char *buf, unsigned int size, unsigned int *count)
++{
++ unsigned int x;
++ int error = 0;
++
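++ /* Editor's inference: gfs_sprintf() is presumably a macro that appends
++ to @buf, bumps *count, and jumps to the "out" label with error set to
++ -ENOBUFS once @size would be exceeded; that is why "out" and "error"
++ look otherwise unused in this function. */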
++ gfs_sprintf(" %s\n", str);
++ gfs_sprintf(" owner = %ld\n",
++ (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
++ gfs_sprintf(" gh_state = %u\n", gh->gh_state);
++ gfs_sprintf(" gh_flags =");
++ for (x = 0; x < 32; x++)
++ if (gh->gh_flags & (1 << x))
++ gfs_sprintf(" %u", x);
++ gfs_sprintf(" \n");
++ gfs_sprintf(" error = %d\n", gh->gh_error);
++ gfs_sprintf(" gh_iflags =");
++ for (x = 0; x < 32; x++)
++ if (test_bit(x, &gh->gh_iflags))
++ gfs_sprintf(" %u", x);
++ gfs_sprintf(" \n");
++
++ out:
++ return error;
++}
++
++/**
++ * dump_inode - print information about an inode
++ * @ip: the inode
++ * @buf: the buffer
++ * @size: the size of the buffer
++ * @count: where we are in the buffer
++ *
++ * Returns: 0 on success, -ENOBUFS when we run out of space
++ */
++
++static int
++dump_inode(struct gfs_inode *ip,
++ char *buf, unsigned int size, unsigned int *count)
++{
++ unsigned int x;
++ int error = 0;
++
++ gfs_sprintf(" Inode:\n");
++ gfs_sprintf(" num = %" PRIu64 "/%" PRIu64 "\n",
++ ip->i_num.no_formal_ino, ip->i_num.no_addr);
++ gfs_sprintf(" type = %u\n", ip->i_di.di_type);
++ gfs_sprintf(" i_count = %d\n", atomic_read(&ip->i_count));
++ gfs_sprintf(" i_flags =");
++ for (x = 0; x < 32; x++)
++ if (test_bit(x, &ip->i_flags))
++ gfs_sprintf(" %u", x);
++ gfs_sprintf(" \n");
++ gfs_sprintf(" vnode = %s\n", (ip->i_vnode) ? "yes" : "no");
++
++ out:
++ return error;
++}
++
++/**
++ * dump_glock - print information about a glock
++ * @gl: the glock
++ * @buf: the buffer
++ * @size: the size of the buffer
++ * @count: where we are in the buffer
++ *
++ * Returns: 0 on success, -ENOBUFS when we run out of space
++ */
++
++static int
++dump_glock(struct gfs_glock *gl,
++ char *buf, unsigned int size, unsigned int *count)
++{
++ struct list_head *head, *tmp;
++ struct gfs_holder *gh;
++ unsigned int x;
++ int error = 0;
++
++ spin_lock(&gl->gl_spin);
++
++ gfs_sprintf("Glock (%u, %" PRIu64 ")\n",
++ gl->gl_name.ln_type,
++ gl->gl_name.ln_number);
++ gfs_sprintf(" gl_flags =");
++ for (x = 0; x < 32; x++)
++ if (test_bit(x, &gl->gl_flags))
++ gfs_sprintf(" %u", x);
++ gfs_sprintf(" \n");
++ gfs_sprintf(" gl_count = %d\n", atomic_read(&gl->gl_count));
++ gfs_sprintf(" gl_state = %u\n", gl->gl_state);
++ gfs_sprintf(" lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
++ gfs_sprintf(" object = %s\n", (gl->gl_object) ? "yes" : "no");
++ if (gl->gl_aspace)
++ gfs_sprintf(" aspace = %lu\n",
++ gl->gl_aspace->i_mapping->nrpages);
++ else
++ gfs_sprintf(" aspace = no\n");
++ gfs_sprintf(" reclaim = %s\n",
++ (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
++ if (gl->gl_req_gh) {
++ error = dump_holder("Request", gl->gl_req_gh, buf, size, count);
++ if (error)
++ goto out;
++ }
++ for (head = &gl->gl_holders, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gh = list_entry(tmp, struct gfs_holder, gh_list);
++ error = dump_holder("Holder", gh, buf, size, count);
++ if (error)
++ goto out;
++ }
++ for (head = &gl->gl_waiters1, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gh = list_entry(tmp, struct gfs_holder, gh_list);
++ error = dump_holder("Waiter1", gh, buf, size, count);
++ if (error)
++ goto out;
++ }
++ for (head = &gl->gl_waiters2, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gh = list_entry(tmp, struct gfs_holder, gh_list);
++ error = dump_holder("Waiter2", gh, buf, size, count);
++ if (error)
++ goto out;
++ }
++ if (gl->gl_ops == &gfs_inode_glops && gl2ip(gl)) {
++ if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
++ list_empty(&gl->gl_holders)) {
++ error = dump_inode(gl2ip(gl), buf, size, count);
++ if (error)
++ goto out;
++ } else
++ gfs_sprintf(" Inode: busy\n");
++ }
++
++ out:
++ spin_unlock(&gl->gl_spin);
++
++ return error;
++}
++
++/**
++ * gfs_dump_lockstate - print out the current lockstate
++ * @sdp: the filesystem
++ * @ub: the buffer to copy the information into
++ *
++ * If @ub is NULL, dump the lockstate to the console.
++ *
++ * Returns: 0 on success, -errno on failure
++ */
++
++int
++gfs_dump_lockstate(struct gfs_sbd *sdp, struct gfs_user_buffer *ub)
++{
++ struct gfs_gl_hash_bucket *bucket;
++ struct list_head *tmp, *head;
++ struct gfs_glock *gl;
++ char *buf = NULL;
++ unsigned int size = sdp->sd_tune.gt_lockdump_size;
++ unsigned int x, count;
++ int error = 0;
++
++ if (ub) {
++ buf = kmalloc(size, GFP_KERNEL);
++ if (!buf)
++ return -ENOMEM;
++ }
++
++ for (x = 0; x < GFS_GL_HASH_SIZE; x++) {
++ bucket = &sdp->sd_gl_hash[x];
++ count = 0;
++
++ read_lock(&bucket->hb_lock);
++
++ for (head = &bucket->hb_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gl = list_entry(tmp, struct gfs_glock, gl_list);
++
++ if (test_bit(GLF_PLUG, &gl->gl_flags))
++ continue;
++
++ error = dump_glock(gl, buf, size, &count);
++ if (error)
++ break;
++ }
++
++ read_unlock(&bucket->hb_lock);
++
++ if (error)
++ break;
++
++ if (ub) {
++ if (ub->ub_count + count > ub->ub_size) {
++ error = -ENOMEM;
++ break;
++ }
++ if (copy_to_user(ub->ub_data + ub->ub_count, buf, count)) {
++ error = -EFAULT;
++ break;
++ }
++ ub->ub_count += count;
++ }
++ }
++
++ if (ub)
++ kfree(buf);
++
++ return error;
++}
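++
++/*
++ * Caller sketch (an editor's illustration; the ioctl plumbing is an
++ * assumption): a userland lock dump would describe its buffer with a
++ * struct gfs_user_buffer, which this function fills via copy_to_user():
++ *
++ *  struct gfs_user_buffer ub;
++ *  ub.ub_data = user_ptr;
++ *  ub.ub_size = user_size;
++ *  ub.ub_count = 0;
++ *  error = gfs_dump_lockstate(sdp, &ub);
++ */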
+diff -urN linux-orig/fs/gfs/glock.h linux-patched/fs/gfs/glock.h
+--- linux-orig/fs/gfs/glock.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/glock.h 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,134 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __GFS_GLOCK_DOT_H__
++#define __GFS_GLOCK_DOT_H__
++
++/*
++#define LM_FLAG_TRY (0x00000001)
++#define LM_FLAG_TRY_1CB (0x00000002)
++#define LM_FLAG_NOEXP (0x00000004)
++#define LM_FLAG_ANY (0x00000008)
++#define LM_FLAG_PRIORITY (0x00000010)
++*/
++#define GL_LOCAL_EXCL (0x00000020)
++#define GL_ASYNC (0x00000040)
++#define GL_EXACT (0x00000080)
++#define GL_SKIP (0x00000100)
++#define GL_ATIME (0x00000200)
++#define GL_NOCACHE (0x00000400)
++#define GL_SYNC (0x00000800)
++
++#define GLR_TRYFAILED (13)
++#define GLR_CANCELED (14)
++
++static __inline__ int
++gfs_glock_is_locked_by_me(struct gfs_glock *gl)
++{
++ struct list_head *tmp, *head;
++ struct gfs_holder *gh;
++ int locked = FALSE;
++
++ spin_lock(&gl->gl_spin);
++ for (head = &gl->gl_holders, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (gh->gh_owner == current) {
++ locked = TRUE;
++ break;
++ }
++ }
++ spin_unlock(&gl->gl_spin);
++
++ return locked;
++}
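++
++/*
++ * Usage sketch for gfs_glock_is_locked_by_me() (an assumption): paths
++ * that require the caller to already hold the glock can assert it, e.g.
++ *  GFS_ASSERT_GLOCK(gfs_glock_is_locked_by_me(gl), gl,);
++ */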
++static __inline__ int
++gfs_glock_is_held_excl(struct gfs_glock *gl)
++{
++ return (gl->gl_state == LM_ST_EXCLUSIVE);
++}
++static __inline__ int
++gfs_glock_is_held_dfrd(struct gfs_glock *gl)
++{
++ return (gl->gl_state == LM_ST_DEFERRED);
++}
++static __inline__ int
++gfs_glock_is_held_shrd(struct gfs_glock *gl)
++{
++ return (gl->gl_state == LM_ST_SHARED);
++}
++
++#define GFS_ASYNC_LM(sdp) ((sdp)->sd_lockstruct.ls_flags & LM_LSFLAG_ASYNC)
++
++struct gfs_glock *gfs_glock_find(struct gfs_sbd *sdp,
++ struct lm_lockname *name);
++int gfs_glock_get(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ int create, struct gfs_glock **glp);
++void gfs_glock_hold(struct gfs_glock *gl);
++void gfs_glock_put(struct gfs_glock *gl);
++
++void gfs_holder_init(struct gfs_glock *gl, unsigned int state, int flags,
++ struct gfs_holder *gh);
++void gfs_holder_reinit(unsigned int state, int flags, struct gfs_holder *gh);
++void gfs_holder_uninit(struct gfs_holder *gh);
++struct gfs_holder *gfs_holder_get(struct gfs_glock *gl, unsigned int state,
++ int flags);
++void gfs_holder_put(struct gfs_holder *gh);
++
++void gfs_glock_xmote_th(struct gfs_glock *gl, unsigned int state, int flags);
++void gfs_glock_drop_th(struct gfs_glock *gl);
++
++int gfs_glock_nq(struct gfs_holder *gh);
++int gfs_glock_poll(struct gfs_holder *gh);
++int gfs_glock_wait(struct gfs_holder *gh);
++void gfs_glock_dq(struct gfs_holder *gh);
++
++void gfs_glock_prefetch(struct gfs_glock *gl, unsigned int state, int flags);
++void gfs_glock_force_drop(struct gfs_glock *gl);
++
++int gfs_glock_nq_init(struct gfs_glock *gl, unsigned int state, int flags,
++ struct gfs_holder *gh);
++void gfs_glock_dq_uninit(struct gfs_holder *gh);
++int gfs_glock_nq_num(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ unsigned int state, int flags, struct gfs_holder *gh);
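++
++/*
++ * Typical acquire/release pattern (editor's sketch; a shared hold on an
++ * inode glock is assumed):
++ *
++ *  struct gfs_holder gh;
++ *  int error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
++ *  if (error)
++ *   return error;
++ *  (access the object protected by the glock)
++ *  gfs_glock_dq_uninit(&gh);
++ */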
++
++int gfs_glock_nq_m(unsigned int num_gh, struct gfs_holder *ghs);
++void gfs_glock_dq_m(unsigned int num_gh, struct gfs_holder *ghs);
++
++void gfs_glock_prefetch_num(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ unsigned int state, int flags);
++
++/* Lock Value Block functions */
++
++int gfs_lvb_hold(struct gfs_glock *gl);
++void gfs_lvb_unhold(struct gfs_glock *gl);
++void gfs_lvb_sync(struct gfs_glock *gl);
++
++void gfs_glock_cb(lm_fsdata_t * fsdata, unsigned int type, void *data);
++
++void gfs_try_toss_inode(struct gfs_sbd *sdp, struct gfs_inum *inum);
++void gfs_iopen_go_callback(struct gfs_glock *gl, unsigned int state);
++
++void gfs_glock_schedule_for_reclaim(struct gfs_glock *gl);
++void gfs_reclaim_glock(struct gfs_sbd *sdp);
++
++void gfs_scand_internal(struct gfs_sbd *sdp);
++void gfs_gl_hash_clear(struct gfs_sbd *sdp, int wait);
++
++int gfs_dump_lockstate(struct gfs_sbd *sdp, struct gfs_user_buffer *ub);
++
++#endif /* __GFS_GLOCK_DOT_H__ */
+diff -urN linux-orig/fs/gfs/glops.c linux-patched/fs/gfs/glops.c
+--- linux-orig/fs/gfs/glops.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/glops.c 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,526 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "log.h"
++#include "page.h"
++#include "recovery.h"
++#include "rgrp.h"
++
++/**
++ * meta_go_sync - sync out the metadata for this glock
++ * @gl: the glock
++ * @flags: DIO_*
++ *
++ */
++
++static void
++meta_go_sync(struct gfs_glock *gl, int flags)
++{
++ if (!(flags & DIO_METADATA))
++ return;
++
++ if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
++ gfs_log_flush_glock(gl);
++ gfs_sync_buf(gl, flags | DIO_START | DIO_WAIT | DIO_CHECK);
++ }
++
++ clear_bit(GLF_DIRTY, &gl->gl_flags);
++ clear_bit(GLF_SYNC, &gl->gl_flags);
++}
++
++/**
++ * meta_go_inval - invalidate the metadata for this glock
++ * @gl: the glock
++ * @flags:
++ *
++ */
++
++static void
++meta_go_inval(struct gfs_glock *gl, int flags)
++{
++ if (!(flags & DIO_METADATA))
++ return;
++
++ gfs_inval_buf(gl);
++ gl->gl_vn++;
++}
++
++/**
++ * meta_go_demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++meta_go_demote_ok(struct gfs_glock *gl)
++{
++ return (gl->gl_aspace->i_mapping->nrpages) ? FALSE : TRUE;
++}
++
++/**
++ * inode_go_xmote_th - promote/demote a glock
++ * @gl: the glock
++ * @state: the requested state
++ * @flags: the flags passed into gfs_glock()
++ *
++ */
++
++static void
++inode_go_xmote_th(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ if (gl->gl_state != LM_ST_UNLOCKED)
++ gfs_inval_pte(gl);
++ gfs_glock_xmote_th(gl, state, flags);
++}
++
++/**
++ * inode_go_xmote_bh - promote/demote a glock
++ * @gl: the glock
++ *
++ * This will be really broken when (no_formal_ino != no_addr)
++ *
++ */
++
++static void
++inode_go_xmote_bh(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_holder *gh = gl->gl_req_gh;
++ struct buffer_head *bh;
++ int error;
++
++ if (gl->gl_state != LM_ST_UNLOCKED &&
++ (!gh || !(gh->gh_flags & GL_SKIP))) {
++ error = gfs_dread(sdp, gl->gl_name.ln_number, gl, DIO_START, &bh);
++ if (!error)
++ brelse(bh);
++ }
++}
++
++/**
++ * inode_go_drop_th - unlock a glock
++ * @gl: the glock
++ *
++ */
++
++static void
++inode_go_drop_th(struct gfs_glock *gl)
++{
++ gfs_inval_pte(gl);
++ gfs_glock_drop_th(gl);
++}
++
++/**
++ * inode_go_sync - Sync the dirty data for an inode glock
++ * @gl: the glock
++ * @flags:
++ *
++ */
++
++static void
++inode_go_sync(struct gfs_glock *gl, int flags)
++{
++ int meta = (flags & DIO_METADATA);
++ int data = (flags & DIO_DATA);
++
++ if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
++ if (meta && data) {
++ gfs_sync_page(gl, flags | DIO_START);
++ gfs_log_flush_glock(gl);
++ gfs_sync_buf(gl, flags | DIO_START | DIO_WAIT | DIO_CHECK);
++ gfs_sync_page(gl, flags | DIO_WAIT | DIO_CHECK);
++ } else if (meta) {
++ gfs_log_flush_glock(gl);
++ gfs_sync_buf(gl, flags | DIO_START | DIO_WAIT | DIO_CHECK);
++ } else if (data)
++ gfs_sync_page(gl, flags | DIO_START | DIO_WAIT | DIO_CHECK);
++ }
++
++ if (meta && data) {
++ if (!(flags & DIO_INVISIBLE))
++ clear_bit(GLF_DIRTY, &gl->gl_flags);
++ clear_bit(GLF_SYNC, &gl->gl_flags);
++ }
++}
++
++/**
++ * inode_go_inval - prepare an inode glock to be released
++ * @gl: the glock
++ * @flags:
++ *
++ */
++
++static void
++inode_go_inval(struct gfs_glock *gl, int flags)
++{
++ int meta = (flags & DIO_METADATA);
++ int data = (flags & DIO_DATA);
++
++ if (meta) {
++ gfs_inval_buf(gl);
++ gl->gl_vn++;
++ }
++ if (data)
++ gfs_inval_page(gl);
++}
++
++/**
++ * inode_go_demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++inode_go_demote_ok(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ int demote = FALSE;
++
++ if (!gl2ip(gl) && !gl->gl_aspace->i_mapping->nrpages)
++ demote = TRUE;
++ else if (!sdp->sd_args.ar_localcaching &&
++ time_after_eq(jiffies, gl->gl_stamp + sdp->sd_tune.gt_demote_secs * HZ))
++ demote = TRUE;
++
++ return demote;
++}
++
++/**
++ * inode_go_lock - operation done after an inode lock is locked by a process
++ * @gl: the glock
++ * @flags: the flags passed into gfs_glock()
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++inode_go_lock(struct gfs_glock *gl, int flags)
++{
++ struct gfs_inode *ip = gl2ip(gl);
++ int error = 0;
++
++ if (ip && ip->i_vn != gl->gl_vn) {
++ error = gfs_copyin_dinode(ip);
++ if (!error)
++ gfs_inode_attr_in(ip);
++ }
++
++ return error;
++}
++
++/**
++ * inode_go_unlock - operation done before an inode lock is unlocked by a process
++ * @gl: the glock
++ * @flags: the flags passed into gfs_gunlock()
++ *
++ */
++
++static void
++inode_go_unlock(struct gfs_glock *gl, int flags)
++{
++ struct gfs_inode *ip = gl2ip(gl);
++
++ if (ip && test_bit(GLF_DIRTY, &gl->gl_flags))
++ gfs_inode_attr_in(ip);
++
++ if (ip)
++ gfs_flush_meta_cache(ip);
++}
++
++/**
++ * rgrp_go_xmote_th - promote/demote a glock
++ * @gl: the glock
++ * @state: the requested state
++ * @flags: the flags passed into gfs_glock()
++ *
++ */
++
++static void
++rgrp_go_xmote_th(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ struct gfs_rgrpd *rgd = gl2rgd(gl);
++
++ GFS_ASSERT_GLOCK(rgd && gl->gl_lvb, gl,);
++
++ gfs_mhc_zap(rgd);
++ gfs_depend_sync(rgd);
++ gfs_glock_xmote_th(gl, state, flags);
++}
++
++/**
++ * rgrp_go_drop_th - unlock a glock
++ * @gl: the glock
++ *
++ */
++
++static void
++rgrp_go_drop_th(struct gfs_glock *gl)
++{
++ struct gfs_rgrpd *rgd = gl2rgd(gl);
++
++ GFS_ASSERT_GLOCK(rgd && gl->gl_lvb, gl,);
++
++ gfs_mhc_zap(rgd);
++ gfs_depend_sync(rgd);
++ gfs_glock_drop_th(gl);
++}
++
++/**
++ * rgrp_go_demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++rgrp_go_demote_ok(struct gfs_glock *gl)
++{
++ struct gfs_rgrpd *rgd = gl2rgd(gl);
++ int demote = TRUE;
++
++ if (gl->gl_aspace->i_mapping->nrpages)
++ demote = FALSE;
++ else if (rgd && !list_empty(&rgd->rd_mhc)) /* Don't bother with lock here */
++ demote = FALSE;
++
++ return demote;
++}
++
++/**
++ * rgrp_go_lock - operation done after an rgrp lock is locked by a process
++ * @gl: the glock
++ * @flags: the flags passed into gfs_glock()
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++rgrp_go_lock(struct gfs_glock *gl, int flags)
++{
++ struct gfs_rgrpd *rgd = gl2rgd(gl);
++ int error = 0;
++
++ GFS_ASSERT_GLOCK(rgd && gl->gl_lvb, gl,);
++
++ if (!(flags & GL_SKIP))
++ error = gfs_rgrp_read(rgd);
++
++ return error;
++}
++
++/**
++ * rgrp_go_unlock - operation done before an rgrp lock is unlocked by a process
++ * @gl: the glock
++ * @flags: the flags passed into gfs_gunlock()
++ *
++ */
++
++static void
++rgrp_go_unlock(struct gfs_glock *gl, int flags)
++{
++ struct gfs_rgrpd *rgd = gl2rgd(gl);
++
++ GFS_ASSERT_GLOCK(rgd && gl->gl_lvb, gl,);
++
++ if (!(flags & GL_SKIP)) {
++ gfs_rgrp_relse(rgd);
++ if (test_bit(GLF_DIRTY, &gl->gl_flags))
++ gfs_rgrp_lvb_fill(rgd);
++ }
++}
++
++/**
++ * trans_go_xmote_th - promote/demote a metadata glock
++ * @gl: the glock
++ * @state: the requested state
++ * @flags: the flags passed into gfs_glock()
++ *
++ */
++
++static void
++trans_go_xmote_th(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ int error;
++
++ if (gl->gl_state != LM_ST_UNLOCKED &&
++ test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
++ gfs_sync_meta(sdp);
++
++ error = gfs_log_shutdown(sdp);
++ if (error)
++ gfs_io_error(sdp);
++ }
++
++ gfs_glock_xmote_th(gl, state, flags);
++}
++
++/**
++ * trans_go_xmote_bh - promote/demote a metadata glock
++ * @gl: the glock
++ *
++ */
++
++static void
++trans_go_xmote_bh(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_glock *j_gl = sdp->sd_journal_gh.gh_gl;
++ struct gfs_log_header head;
++ int error;
++
++ if (gl->gl_state != LM_ST_UNLOCKED &&
++ test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
++ j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
++
++ error = gfs_find_jhead(sdp, &sdp->sd_jdesc, j_gl, &head);
++ GFS_ASSERT_SBD(!error, sdp,); /* FixMe!!! */
++ GFS_ASSERT_SBD(head.lh_flags & GFS_LOG_HEAD_UNMOUNT, sdp,);
++
++ /* Initialize some head of the log stuff */
++ sdp->sd_sequence = head.lh_sequence;
++ sdp->sd_log_head = head.lh_first + 1;
++ }
++}
++
++/**
++ * trans_go_drop_th - prepare the transaction glock to be released
++ * @gl: the glock
++ *
++ * We want to sync the device even with localcaching. Remember
++ * that localcaching journal replay only marks buffers dirty.
++ */
++
++static void
++trans_go_drop_th(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ int error;
++
++ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
++ gfs_sync_meta(sdp);
++
++ error = gfs_log_shutdown(sdp);
++ if (error)
++ gfs_io_error(sdp);
++ }
++
++ gfs_glock_drop_th(gl);
++}
++
++/**
++ * nondisk_go_demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++nondisk_go_demote_ok(struct gfs_glock *gl)
++{
++ return FALSE;
++}
++
++/**
++ * quota_go_demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++quota_go_demote_ok(struct gfs_glock *gl)
++{
++ return !atomic_read(&gl->gl_lvb_count);
++}
++
++struct gfs_glock_operations gfs_meta_glops = {
++ .go_xmote_th = gfs_glock_xmote_th,
++ .go_drop_th = gfs_glock_drop_th,
++ .go_sync = meta_go_sync,
++ .go_inval = meta_go_inval,
++ .go_demote_ok = meta_go_demote_ok,
++ .go_type = LM_TYPE_META
++};
++
++struct gfs_glock_operations gfs_inode_glops = {
++ .go_xmote_th = inode_go_xmote_th,
++ .go_xmote_bh = inode_go_xmote_bh,
++ .go_drop_th = inode_go_drop_th,
++ .go_sync = inode_go_sync,
++ .go_inval = inode_go_inval,
++ .go_demote_ok = inode_go_demote_ok,
++ .go_lock = inode_go_lock,
++ .go_unlock = inode_go_unlock,
++ .go_type = LM_TYPE_INODE
++};
++
++struct gfs_glock_operations gfs_rgrp_glops = {
++ .go_xmote_th = rgrp_go_xmote_th,
++ .go_drop_th = rgrp_go_drop_th,
++ .go_sync = meta_go_sync,
++ .go_inval = meta_go_inval,
++ .go_demote_ok = rgrp_go_demote_ok,
++ .go_lock = rgrp_go_lock,
++ .go_unlock = rgrp_go_unlock,
++ .go_type = LM_TYPE_RGRP
++};
++
++struct gfs_glock_operations gfs_trans_glops = {
++ .go_xmote_th = trans_go_xmote_th,
++ .go_xmote_bh = trans_go_xmote_bh,
++ .go_drop_th = trans_go_drop_th,
++ .go_type = LM_TYPE_NONDISK
++};
++
++struct gfs_glock_operations gfs_iopen_glops = {
++ .go_xmote_th = gfs_glock_xmote_th,
++ .go_drop_th = gfs_glock_drop_th,
++ .go_callback = gfs_iopen_go_callback,
++ .go_type = LM_TYPE_IOPEN
++};
++
++struct gfs_glock_operations gfs_flock_glops = {
++ .go_xmote_th = gfs_glock_xmote_th,
++ .go_drop_th = gfs_glock_drop_th,
++ .go_type = LM_TYPE_FLOCK
++};
++
++struct gfs_glock_operations gfs_nondisk_glops = {
++ .go_xmote_th = gfs_glock_xmote_th,
++ .go_drop_th = gfs_glock_drop_th,
++ .go_demote_ok = nondisk_go_demote_ok,
++ .go_type = LM_TYPE_NONDISK
++};
++
++struct gfs_glock_operations gfs_quota_glops = {
++ .go_xmote_th = gfs_glock_xmote_th,
++ .go_drop_th = gfs_glock_drop_th,
++ .go_demote_ok = quota_go_demote_ok,
++ .go_type = LM_TYPE_QUOTA
++};
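++
++/*
++ * Dispatch sketch (an editor's assumption; the real helpers live in
++ * glock.c): the glock core calls through these tables with NULL checks,
++ * since each type fills in only the operations it needs. demote_ok(),
++ * used by the glock reclaim code, presumably reduces to something like:
++ *
++ *  static int demote_ok(struct gfs_glock *gl)
++ *  {
++ *   if (gl->gl_ops->go_demote_ok)
++ *    return gl->gl_ops->go_demote_ok(gl);
++ *   return TRUE;
++ *  }
++ */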
+diff -urN linux-orig/fs/gfs/glops.h linux-patched/fs/gfs/glops.h
+--- linux-orig/fs/gfs/glops.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/glops.h 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,26 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __GLOPS_DOT_H__
++#define __GLOPS_DOT_H__
++
++extern struct gfs_glock_operations gfs_meta_glops;
++extern struct gfs_glock_operations gfs_inode_glops;
++extern struct gfs_glock_operations gfs_rgrp_glops;
++extern struct gfs_glock_operations gfs_trans_glops;
++extern struct gfs_glock_operations gfs_iopen_glops;
++extern struct gfs_glock_operations gfs_flock_glops;
++extern struct gfs_glock_operations gfs_nondisk_glops;
++extern struct gfs_glock_operations gfs_quota_glops;
++
++#endif /* __GLOPS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/incore.h linux-patched/fs/gfs/incore.h
+--- linux-orig/fs/gfs/incore.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/incore.h 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,726 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __INCORE_DOT_H__
++#define __INCORE_DOT_H__
++
++#define DIO_NEW (0x00000001)
++#define DIO_FORCE (0x00000002)
++#define DIO_CLEAN (0x00000004)
++#define DIO_DIRTY (0x00000008)
++#define DIO_START (0x00000010)
++#define DIO_WAIT (0x00000020)
++#define DIO_METADATA (0x00000040)
++#define DIO_DATA (0x00000080)
++#define DIO_INVISIBLE (0x00000100)
++#define DIO_CHECK (0x00000200)
++#define DIO_ALL (0x00000400)
++
++/* Structure prototypes */
++
++struct gfs_log_operations;
++struct gfs_log_element;
++struct gfs_meta_header_cache;
++struct gfs_depend;
++struct gfs_bitmap;
++struct gfs_rgrpd;
++struct gfs_bufdata;
++struct gfs_glock_operations;
++struct gfs_holder;
++struct gfs_glock;
++struct gfs_alloc;
++struct gfs_inode;
++struct gfs_file;
++struct gfs_unlinked;
++struct gfs_quota_le;
++struct gfs_quota_data;
++struct gfs_log_buf;
++struct gfs_trans;
++struct gfs_gl_hash_bucket;
++struct gfs_sbd;
++
++typedef void (*gfs_glop_bh_t) (struct gfs_glock * gl, unsigned int ret);
++
++/*
++ * Structure of operations that are associated with each
++ * type of element in the log.
++ */
++
++struct gfs_log_operations {
++ /* Operations specific to a given log element */
++
++ void (*lo_add) (struct gfs_sbd * sdp, struct gfs_log_element * le);
++ void (*lo_trans_end) (struct gfs_sbd * sdp,
++ struct gfs_log_element * le);
++ void (*lo_print) (struct gfs_sbd * sdp, struct gfs_log_element * le,
++ unsigned int where);
++ struct gfs_trans *(*lo_overlap_trans) (struct gfs_sbd * sdp,
++ struct gfs_log_element * le);
++ void (*lo_incore_commit) (struct gfs_sbd * sdp, struct gfs_trans * tr,
++ struct gfs_log_element * le);
++ void (*lo_add_to_ail) (struct gfs_sbd * sdp,
++ struct gfs_log_element * le);
++ void (*lo_clean_dump) (struct gfs_sbd * sdp,
++ struct gfs_log_element * le);
++
++ /* Operations specific to a class of log elements */
++
++ void (*lo_trans_size) (struct gfs_sbd * sdp, struct gfs_trans * tr,
++ unsigned int *mblks, unsigned int *eblks,
++ unsigned int *blocks, unsigned int *bmem);
++ void (*lo_trans_combine) (struct gfs_sbd * sdp, struct gfs_trans * tr,
++ struct gfs_trans * new_tr);
++ void (*lo_build_bhlist) (struct gfs_sbd * sdp, struct gfs_trans * tr);
++ void (*lo_dump_size) (struct gfs_sbd * sdp, unsigned int *elements,
++ unsigned int *blocks, unsigned int *bmem);
++ void (*lo_build_dump) (struct gfs_sbd * sdp, struct gfs_trans * tr);
++
++ /* Operations that happen at recovery time */
++
++ void (*lo_before_scan) (struct gfs_sbd * sdp, unsigned int jid,
++ struct gfs_log_header * head,
++ unsigned int pass);
++ int (*lo_scan_elements) (struct gfs_sbd * sdp,
++ struct gfs_jindex * jdesc,
++ struct gfs_glock * gl, uint64_t start,
++ struct gfs_log_descriptor * desc,
++ unsigned int pass);
++ void (*lo_after_scan) (struct gfs_sbd * sdp, unsigned int jid,
++ unsigned int pass);
++
++ char *lo_name;
++};
++
++/*
++ * Structure that gets added to struct gfs_trans->tr_elements. They
++ * make up the "stuff" in each transaction.
++ */
++
++struct gfs_log_element {
++ struct gfs_log_operations *le_ops;
++
++ struct gfs_trans *le_trans;
++ struct list_head le_list;
++};
++
++struct gfs_meta_header_cache {
++ struct list_head mc_list_hash;
++ struct list_head mc_list_single;
++ struct list_head mc_list_rgd;
++
++ uint64_t mc_block;
++ struct gfs_meta_header mc_mh;
++};
++
++struct gfs_depend {
++ struct list_head gd_list_hash;
++ struct list_head gd_list_rgd;
++
++ struct gfs_rgrpd *gd_rgd;
++ uint64_t gd_formal_ino;
++ unsigned long gd_time;
++};
++
++/*
++ * Structure containing information about the allocation bitmaps.
++ * There is one of these for each fs block that the bitmap for
++ * the resource group header covers.
++ */
++
++struct gfs_bitmap {
++ uint32_t bi_offset; /* The offset in the buffer of the first byte */
++ uint32_t bi_start; /* The position of the first byte in this block */
++ uint32_t bi_len; /* The number of bytes in this block */
++};
++
++/*
++ * Structure containing information about Resource Groups
++ */
++
++struct gfs_rgrpd {
++ struct list_head rd_list; /* Link with superblock */
++ struct list_head rd_list_mru;
++ struct list_head rd_recent; /* Recently used rgrps */
++
++ struct gfs_glock *rd_gl; /* Glock for rgrp */
++
++ unsigned long rd_flags;
++
++ struct gfs_rindex rd_ri; /* Resource Index structure */
++ struct gfs_rgrp rd_rg; /* Resource Group structure */
++ uint64_t rd_rg_vn;
++
++ struct gfs_bitmap *rd_bits;
++ struct buffer_head **rd_bh;
++
++ uint32_t rd_last_alloc_data;
++ uint32_t rd_last_alloc_meta;
++
++ struct list_head rd_mhc;
++ struct list_head rd_depend;
++
++ struct gfs_sbd *rd_sbd;
++};
++
++/*
++ * Per-buffer data
++ */
++
++struct gfs_bufdata {
++ struct buffer_head *bd_bh; /* struct buffer_head which this struct belongs to */
++ struct gfs_glock *bd_gl; /* Pointer to Glock struct for this bh */
++
++ struct gfs_log_element bd_new_le;
++ struct gfs_log_element bd_incore_le;
++
++ char *bd_frozen;
++ struct semaphore bd_lock;
++
++ unsigned int bd_pinned; /* Pin count */
++ struct list_head bd_ail_tr_list; /* List of buffers hanging off tr_ail_bufs */
++ struct list_head bd_ail_gl_list; /* List of buffers hanging off gl_ail_bufs */
++};
++
++/*
++ * Glock operations
++ */
++
++struct gfs_glock_operations {
++ void (*go_xmote_th) (struct gfs_glock * gl, unsigned int state,
++ int flags);
++ void (*go_xmote_bh) (struct gfs_glock * gl);
++ void (*go_drop_th) (struct gfs_glock * gl);
++ void (*go_drop_bh) (struct gfs_glock * gl);
++ void (*go_sync) (struct gfs_glock * gl, int flags);
++ void (*go_inval) (struct gfs_glock * gl, int flags);
++ int (*go_demote_ok) (struct gfs_glock * gl);
++ int (*go_lock) (struct gfs_glock * gl, int flags);
++ void (*go_unlock) (struct gfs_glock * gl, int flags);
++ void (*go_callback) (struct gfs_glock * gl, unsigned int state);
++ int go_type;
++};
++
++/* Actions */
++#define HIF_MUTEX (0)
++#define HIF_PROMOTE (1)
++#define HIF_DEMOTE (2)
++
++/* States */
++#define HIF_ALLOCED (3)
++#define HIF_DEALLOC (4)
++#define HIF_HOLDER (5)
++#define HIF_FIRST (6)
++#define HIF_WAKEUP (7)
++#define HIF_RECURSE (8)
++
++struct gfs_holder {
++ struct list_head gh_list;
++
++ struct gfs_glock *gh_gl;
++ struct task_struct *gh_owner;
++ unsigned int gh_state;
++ int gh_flags;
++
++ int gh_error;
++ unsigned long gh_iflags;
++ struct completion gh_wait;
++};
++
++/*
++ * Glock Structure
++ */
++
++#define GLF_PLUG (0)
++#define GLF_LOCK (1)
++#define GLF_STICKY (2)
++#define GLF_PREFETCH (3)
++#define GLF_SYNC (4)
++#define GLF_DIRTY (5)
++#define GLF_LVB_INVALID (6)
++
++struct gfs_glock {
++ struct list_head gl_list;
++ unsigned long gl_flags;
++ struct lm_lockname gl_name;
++ atomic_t gl_count;
++
++ spinlock_t gl_spin;
++
++ unsigned int gl_state;
++ struct list_head gl_holders;
++ struct list_head gl_waiters1; /* HIF_MUTEX */
++ struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_PROMOTE */
++
++ struct gfs_glock_operations *gl_ops;
++
++ struct gfs_holder *gl_req_gh;
++ gfs_glop_bh_t gl_req_bh;
++
++ lm_lock_t *gl_lock;
++ char *gl_lvb;
++ atomic_t gl_lvb_count;
++
++ uint64_t gl_vn;
++ unsigned long gl_stamp;
++ void *gl_object;
++
++ struct gfs_log_element gl_new_le;
++ struct gfs_log_element gl_incore_le;
++
++ struct gfs_gl_hash_bucket *gl_bucket;
++ struct list_head gl_reclaim;
++
++ struct gfs_sbd *gl_sbd;
++
++ struct inode *gl_aspace;
++ struct list_head gl_dirty_buffers;
++ struct list_head gl_ail_bufs;
++};
++
++/*
++ * In-Place Reservation structure
++ */
++
++struct gfs_alloc {
++ /* Quota stuff */
++
++ unsigned int al_qd_num;
++ struct gfs_quota_data *al_qd[4];
++ struct gfs_holder al_qd_ghs[4];
++
++ /* Filled in by the caller to gfs_inplace_reserve() */
++
++ uint32_t al_requested_di;
++ uint32_t al_requested_meta;
++ uint32_t al_requested_data;
++
++ /* Filled in by gfs_inplace_reserve() */
++
++ char *al_file;
++ unsigned int al_line;
++ struct gfs_holder al_ri_gh;
++ struct gfs_holder al_rgd_gh;
++ struct gfs_rgrpd *al_rgd;
++ uint32_t al_reserved_meta;
++ uint32_t al_reserved_data;
++
++ /* Filled in by gfs_blkalloc() */
++
++ uint32_t al_alloced_di;
++ uint32_t al_alloced_meta;
++ uint32_t al_alloced_data;
++
++ /* Dinode allocation crap */
++
++ struct gfs_unlinked *al_ul;
++};
++
++/*
++ * Incore inode structure
++ */
++
++#define GIF_QD_LOCKED (0)
++#define GIF_PAGED (1)
++#define GIF_SW_PAGED (2)
++
++struct gfs_inode {
++ struct gfs_inum i_num;
++
++ atomic_t i_count;
++ unsigned long i_flags;
++
++ uint64_t i_vn;
++ struct gfs_dinode i_di;
++
++ struct gfs_glock *i_gl;
++ struct gfs_sbd *i_sbd;
++ struct inode *i_vnode;
++
++ struct gfs_holder i_iopen_gh;
++
++ struct gfs_alloc *i_alloc;
++ uint64_t i_last_rg_alloc;
++
++ struct task_struct *i_creat_task;
++ pid_t i_creat_pid;
++
++ spinlock_t i_lock;
++ struct buffer_head *i_cache[GFS_MAX_META_HEIGHT];
++};
++
++/*
++ * GFS per-fd structure
++ */
++
++#define GFF_DID_DIRECT_ALLOC (0)
++
++struct gfs_file {
++ unsigned long f_flags;
++
++ struct semaphore f_fl_lock;
++ struct gfs_holder f_fl_gh;
++
++ struct gfs_inode *f_inode;
++ struct file *f_vfile;
++};
++
++/*
++ * Unlinked inode log entry
++ */
++
++#define ULF_NEW_UL (0)
++#define ULF_INCORE_UL (1)
++#define ULF_IC_LIST (2)
++#define ULF_OD_LIST (3)
++#define ULF_LOCK (4)
++
++struct gfs_unlinked {
++ struct list_head ul_list;
++ unsigned int ul_count;
++
++ struct gfs_inum ul_inum;
++ unsigned long ul_flags;
++
++ struct gfs_log_element ul_new_le;
++ struct gfs_log_element ul_incore_le;
++ struct gfs_log_element ul_ondisk_le;
++};
++
++/*
++ * Quota log element
++ */
++
++struct gfs_quota_le {
++ struct gfs_log_element ql_le;
++
++ struct gfs_quota_data *ql_data;
++ struct list_head ql_data_list;
++
++ int64_t ql_change;
++};
++
++#define QDF_USER (0)
++#define QDF_OD_LIST (1)
++#define QDF_LOCK (2)
++
++struct gfs_quota_data {
++ struct list_head qd_list;
++ unsigned int qd_count;
++
++ uint32_t qd_id;
++ unsigned long qd_flags;
++
++ struct list_head qd_le_list;
++
++ int64_t qd_change_new;
++ int64_t qd_change_ic;
++ int64_t qd_change_od;
++ int64_t qd_change_sync;
++
++ struct gfs_quota_le qd_ondisk_ql;
++ uint64_t qd_sync_gen;
++
++ struct gfs_glock *qd_gl;
++ struct gfs_quota_lvb qd_qb;
++
++ unsigned long qd_last_warn;
++};
++
++struct gfs_log_buf {
++ struct list_head lb_list;
++
++ struct buffer_head lb_bh;
++ struct buffer_head *lb_unlock;
++};
++
++/*
++ * Transaction structures
++ */
++
++#define TRF_LOG_DUMP (0x00000001)
++
++struct gfs_trans {
++ struct list_head tr_list;
++
++ /* Initial creation stuff */
++
++ char *tr_file;
++ unsigned int tr_line;
++
++ unsigned int tr_mblks_asked; /* Number of log blocks asked to be reserved */
++ unsigned int tr_eblks_asked;
++ unsigned int tr_seg_reserved; /* Number of segments reserved */
++
++ struct gfs_holder *tr_t_gh;
++
++ /* Stuff filled in during creation */
++
++ unsigned int tr_flags;
++ struct list_head tr_elements;
++
++ /* Stuff modified during the commit */
++
++ unsigned int tr_num_free_bufs;
++ struct list_head tr_free_bufs;
++ unsigned int tr_num_free_bmem;
++ struct list_head tr_free_bmem;
++
++ uint64_t tr_log_head; /* The current log head */
++ uint64_t tr_first_head; /* First header block */
++
++ struct list_head tr_bufs; /* List of buffers going to the log */
++
++ /* Stuff that's part of the AIL */
++
++ struct list_head tr_ail_bufs;
++
++ /* Private data for different log element types */
++
++ unsigned int tr_num_gl;
++ unsigned int tr_num_buf;
++ unsigned int tr_num_iul;
++ unsigned int tr_num_ida;
++ unsigned int tr_num_q;
++};
++
++/*
++ * One bucket of the glock hash table.
++ */
++
++struct gfs_gl_hash_bucket {
++ rwlock_t hb_lock;
++ struct list_head hb_list;
++} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
++
++/*
++ * Super Block Data Structure (One per filesystem)
++ */
++
++#define SDF_JOURNAL_LIVE (0)
++#define SDF_SCAND_RUN (1)
++#define SDF_GLOCKD_RUN (2)
++#define SDF_RECOVERD_RUN (3)
++#define SDF_LOGD_RUN (4)
++#define SDF_QUOTAD_RUN (5)
++#define SDF_INODED_RUN (6)
++#define SDF_NOATIME (7)
++#define SDF_ROFS (8)
++#define SDF_NEED_LOG_DUMP (9)
++#define SDF_FOUND_UL_DUMP (10)
++#define SDF_FOUND_Q_DUMP (11)
++#define SDF_IN_LOG_DUMP (12)
++
++#define GFS_GL_HASH_SHIFT (13)
++#define GFS_GL_HASH_SIZE (1 << GFS_GL_HASH_SHIFT)
++#define GFS_GL_HASH_MASK (GFS_GL_HASH_SIZE - 1)
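++
++/* Editor's note (an assumption): a lock name presumably maps to its
++ bucket as sd_gl_hash[hash(ln_type, ln_number) & GFS_GL_HASH_MASK];
++ the actual hash function lives in glock.c. */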
++
++#define GFS_MHC_HASH_SHIFT (10)
++#define GFS_MHC_HASH_SIZE (1 << GFS_MHC_HASH_SHIFT)
++#define GFS_MHC_HASH_MASK (GFS_MHC_HASH_SIZE - 1)
++
++#define GFS_DEPEND_HASH_SHIFT (10)
++#define GFS_DEPEND_HASH_SIZE (1 << GFS_DEPEND_HASH_SHIFT)
++#define GFS_DEPEND_HASH_MASK (GFS_DEPEND_HASH_SIZE - 1)
++
++struct gfs_sbd {
++ struct gfs_sb sd_sb; /* Super Block */
++
++ struct super_block *sd_vfs; /* FS's device independent sb */
++
++ struct gfs_args sd_args;
++ unsigned long sd_flags;
++
++ struct gfs_tune sd_tune; /* FS tuning structure */
++
++ /* Resource group stuff */
++
++ struct gfs_inode *sd_riinode; /* rindex inode */
++ uint64_t sd_riinode_vn; /* Version number of the resource index inode */
++
++ struct list_head sd_rglist; /* List of resource groups */
++ struct semaphore sd_rindex_lock;
++
++ struct list_head sd_rg_mru_list; /* List of resource groups in MRU order */
++ spinlock_t sd_rg_mru_lock; /* Lock for MRU list */
++ struct list_head sd_rg_recent; /* Recently used rgrps */
++ spinlock_t sd_rg_recent_lock;
++ struct gfs_rgrpd *sd_rg_forward; /* Next new rgrp to try for allocation */
++ spinlock_t sd_rg_forward_lock;
++
++ unsigned int sd_rgcount; /* Count of resource groups */
++
++ /* Constants computed on mount */
++
++ uint32_t sd_fsb2bb;
++ uint32_t sd_fsb2bb_shift; /* Shift FS Block numbers to the left by
++ this to get buffer cache blocks */
++ uint32_t sd_diptrs; /* Number of pointers in a dinode */
++ uint32_t sd_inptrs; /* Number of pointers in an indirect block */
++ uint32_t sd_jbsize; /* Size of a journaled data block */
++ uint32_t sd_hash_bsize; /* sizeof(exhash block) */
++ uint32_t sd_hash_bsize_shift;
++ uint32_t sd_hash_ptrs; /* Number of pointers in a hash block */
++ uint32_t sd_max_dirres; /* Maximum space needed to add a directory entry */
++ uint32_t sd_max_height; /* Maximum height of a file's metadata tree */
++ uint64_t sd_heightsize[GFS_MAX_META_HEIGHT];
++ uint32_t sd_max_jheight; /* Maximum height of a journaled file's metadata tree */
++ uint64_t sd_jheightsize[GFS_MAX_META_HEIGHT];
++
++ /* Lock Stuff */
++
++ struct gfs_gl_hash_bucket sd_gl_hash[GFS_GL_HASH_SIZE];
++
++ struct list_head sd_reclaim_list;
++ spinlock_t sd_reclaim_lock;
++ wait_queue_head_t sd_reclaim_wchan;
++ atomic_t sd_reclaim_count;
++
++ struct lm_lockstruct sd_lockstruct;
++
++ struct list_head sd_mhc[GFS_MHC_HASH_SIZE];
++ struct list_head sd_mhc_single;
++ spinlock_t sd_mhc_lock;
++ atomic_t sd_mhc_count;
++
++ struct list_head sd_depend[GFS_DEPEND_HASH_SIZE];
++ spinlock_t sd_depend_lock;
++ atomic_t sd_depend_count;
++
++ struct gfs_holder sd_live_gh;
++
++ struct gfs_holder sd_freeze_gh;
++ struct semaphore sd_freeze_lock;
++ unsigned int sd_freeze_count;
++
++ /* Inode Stuff */
++
++ struct gfs_inode *sd_rooti; /* FS's root inode */
++
++ struct gfs_glock *sd_rename_gl; /* rename glock */
++
++ /* Daemon stuff */
++
++ struct task_struct *sd_scand_process;
++ unsigned int sd_glockd_num;
++ struct task_struct *sd_recoverd_process;
++ struct task_struct *sd_logd_process;
++ struct task_struct *sd_quotad_process;
++ struct task_struct *sd_inoded_process;
++
++ struct semaphore sd_thread_lock;
++ struct completion sd_thread_completion;
++
++ /* Log stuff */
++
++ struct gfs_glock *sd_trans_gl; /* transaction glock */
++
++ struct gfs_inode *sd_jiinode; /* jindex inode */
++ uint64_t sd_jiinode_vn; /* Version number of the journal index inode */
++
++ unsigned int sd_journals; /* Number of journals in the FS */
++ struct gfs_jindex *sd_jindex; /* Array of Jindex structures describing this FS's journals */
++ struct semaphore sd_jindex_lock;
++ unsigned long sd_jindex_refresh_time;
++
++ struct gfs_jindex sd_jdesc; /* Jindex structure describing this machine's journal */
++ struct gfs_holder sd_journal_gh; /* the glock for this machine's journal */
++
++ uint64_t sd_sequence; /* Assigned to xactions in order they commit */
++ uint64_t sd_log_head; /* Block number of next journal write */
++ uint64_t sd_log_wrap;
++
++ spinlock_t sd_log_seg_lock;
++ unsigned int sd_log_seg_free; /* Free segments in the log */
++ struct list_head sd_log_seg_list;
++ wait_queue_head_t sd_log_seg_wait;
++
++ struct list_head sd_log_ail; /* struct gfs_trans structures that form the Active Items List;
++ "next" is the head, "prev" is the tail */
++
++ struct list_head sd_log_incore; /* transactions that have been committed incore (but not ondisk);
++ "next" is the newest, "prev" is the oldest */
++ unsigned int sd_log_buffers; /* Number of buffers in the incore log */
++
++ struct semaphore sd_log_lock; /* Lock for access to log values */
++
++ uint64_t sd_log_dump_last;
++ uint64_t sd_log_dump_last_wrap;
++
++ /* unlinked crap */
++
++ struct list_head sd_unlinked_list;
++ spinlock_t sd_unlinked_lock;
++
++ atomic_t sd_unlinked_ic_count;
++ atomic_t sd_unlinked_od_count;
++
++ /* quota crap */
++
++ struct list_head sd_quota_list;
++ spinlock_t sd_quota_lock;
++
++ atomic_t sd_quota_count;
++ atomic_t sd_quota_od_count;
++
++ struct gfs_inode *sd_qinode;
++
++ uint64_t sd_quota_sync_gen;
++ unsigned long sd_quota_sync_time;
++
++ /* license crap */
++
++ struct gfs_inode *sd_linode;
++
++ /* Recovery stuff */
++
++ struct list_head sd_dirty_j;
++ spinlock_t sd_dirty_j_lock;
++
++ unsigned int sd_recovery_replays;
++ unsigned int sd_recovery_skips;
++ unsigned int sd_recovery_sames;
++
++ /* Counters */
++
++ atomic_t sd_glock_count;
++ atomic_t sd_glock_held_count;
++ atomic_t sd_inode_count;
++ atomic_t sd_bufdata_count;
++ atomic_t sd_fh2dentry_misses;
++ atomic_t sd_reclaimed;
++ atomic_t sd_glock_nq_calls;
++ atomic_t sd_glock_dq_calls;
++ atomic_t sd_glock_prefetch_calls;
++ atomic_t sd_lm_lock_calls;
++ atomic_t sd_lm_unlock_calls;
++ atomic_t sd_lm_callbacks;
++ atomic_t sd_ops_address;
++ atomic_t sd_ops_dentry;
++ atomic_t sd_ops_export;
++ atomic_t sd_ops_file;
++ atomic_t sd_ops_inode;
++ atomic_t sd_ops_super;
++ atomic_t sd_ops_vm;
++
++ char sd_fsname[256];
++
++ /* Debugging crud */
++
++ unsigned long sd_last_readdirplus;
++ unsigned long sd_last_unlocked_aop;
++
++ spinlock_t sd_ail_lock;
++ struct list_head sd_recovery_bufs;
++};
++
++#endif /* __INCORE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/inode.c linux-patched/fs/gfs/inode.c
+--- linux-orig/fs/gfs/inode.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/inode.c 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,1993 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/xattr_acl.h>
++
++#include "gfs.h"
++#include "acl.h"
++#include "bmap.h"
++#include "dio.h"
++#include "dir.h"
++#include "eattr.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "log.h"
++#include "ops_address.h"
++#include "ops_file.h"
++#include "ops_inode.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++#include "unlinked.h"
++
++/**
++ * inode_attr_in - Copy attributes from the dinode into the VFS inode
++ * @ip: The GFS inode
++ * @ino: The VFS inode to fill in
++ *
++ */
++
++static void
++inode_attr_in(struct gfs_inode *ip, struct inode *ino)
++{
++ unsigned int mode;
++
++ ino->i_ino = ip->i_num.no_formal_ino;
++
++ switch (ip->i_di.di_type) {
++ case GFS_FILE_REG:
++ mode = S_IFREG;
++ ino->i_rdev = 0;
++ break;
++ case GFS_FILE_DIR:
++ mode = S_IFDIR;
++ ino->i_rdev = 0;
++ break;
++ case GFS_FILE_LNK:
++ mode = S_IFLNK;
++ ino->i_rdev = 0;
++ break;
++ case GFS_FILE_BLK:
++ mode = S_IFBLK;
++ ino->i_rdev = MKDEV(ip->i_di.di_major, ip->i_di.di_minor);
++ break;
++ case GFS_FILE_CHR:
++ mode = S_IFCHR;
++ ino->i_rdev = MKDEV(ip->i_di.di_major, ip->i_di.di_minor);
++ break;
++ case GFS_FILE_FIFO:
++ mode = S_IFIFO;
++ ino->i_rdev = 0;
++ break;
++ case GFS_FILE_SOCK:
++ mode = S_IFSOCK;
++ ino->i_rdev = 0;
++ break;
++ default:
++ GFS_ASSERT_INODE(FALSE, ip,
++ printk("type = %u\n", ip->i_di.di_type););
++ break;
++ }
++
++ ino->i_mode = mode | (ip->i_di.di_mode & S_IALLUGO);
++ ino->i_nlink = ip->i_di.di_nlink;
++ ino->i_uid = ip->i_di.di_uid;
++ ino->i_gid = ip->i_di.di_gid;
++ i_size_write(ino, ip->i_di.di_size);
++ ino->i_atime.tv_sec = ip->i_di.di_atime;
++ ino->i_mtime.tv_sec = ip->i_di.di_mtime;
++ ino->i_ctime.tv_sec = ip->i_di.di_ctime;
++ ino->i_atime.tv_nsec = ino->i_mtime.tv_nsec = ino->i_ctime.tv_nsec = 0;
++ ino->i_blksize = PAGE_SIZE;
++ ino->i_blocks = ip->i_di.di_blocks <<
++ (ip->i_sbd->sd_sb.sb_bsize_shift - GFS_BASIC_BLOCK_SHIFT);
++ ino->i_generation = ip->i_di.di_header.mh_incarn;
++}
++
++/**
++ * gfs_inode_attr_in - Copy attributes from the dinode into the VFS inode
++ * @ip: The GFS inode
++ *
++ */
++
++void
++gfs_inode_attr_in(struct gfs_inode *ip)
++{
++ struct inode *inode;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (inode) {
++ inode_attr_in(ip, inode);
++ iput(inode);
++ }
++}
++
++/**
++ * gfs_inode_attr_out - Copy attributes from VFS inode into the dinode
++ * @ip: The GFS inode
++ *
++ * Only copy out the attributes that we want the VFS layer
++ * to be able to modify.
++ */
++
++void
++gfs_inode_attr_out(struct gfs_inode *ip)
++{
++ struct inode *inode;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (inode) {
++ ip->i_di.di_mode = inode->i_mode & S_IALLUGO;
++ ip->i_di.di_uid = inode->i_uid;
++ ip->i_di.di_gid = inode->i_gid;
++ ip->i_di.di_atime = inode->i_atime.tv_sec;
++ ip->i_di.di_mtime = inode->i_mtime.tv_sec;
++ ip->i_di.di_ctime = inode->i_ctime.tv_sec;
++ iput(inode);
++ }
++}
++
++/**
++ * gfs_iget - Get/Create a struct inode for a struct gfs_inode
++ * @ip: the struct gfs_inode to get the struct inode for
++ * @create: whether to allocate a new VFS inode if one doesn't exist
++ *
++ * Returns: An inode
++ */
++
++struct inode *
++gfs_iget(struct gfs_inode *ip, int create)
++{
++ struct inode *inode = NULL, *tmp;
++
++ spin_lock(&ip->i_lock);
++ if (ip->i_vnode)
++ inode = igrab(ip->i_vnode);
++ spin_unlock(&ip->i_lock);
++
++ if (inode || !create)
++ return inode;
++
++ tmp = new_inode(ip->i_sbd->sd_vfs);
++ if (!tmp)
++ return NULL;
++
++ inode_attr_in(ip, tmp);
++
++ if (ip->i_di.di_type == GFS_FILE_REG) {
++ tmp->i_op = &gfs_file_iops;
++ tmp->i_fop = &gfs_file_fops;
++ tmp->i_mapping->a_ops = &gfs_file_aops;
++ } else if (ip->i_di.di_type == GFS_FILE_DIR) {
++ tmp->i_op = &gfs_dir_iops;
++ tmp->i_fop = &gfs_dir_fops;
++ } else if (ip->i_di.di_type == GFS_FILE_LNK) {
++ tmp->i_op = &gfs_symlink_iops;
++ } else {
++ tmp->i_op = &gfs_dev_iops;
++ init_special_inode(tmp, tmp->i_mode, tmp->i_rdev);
++ }
++
++ vn2ip(tmp) = NULL;
++
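++ /* Another task may have raced us and installed its own VFS inode in
++ ip->i_vnode while we were building tmp; if so, grab a reference on
++ that one and discard tmp. Otherwise, install tmp under i_lock. */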
++ for (;;) {
++ spin_lock(&ip->i_lock);
++ if (!ip->i_vnode)
++ break;
++ inode = igrab(ip->i_vnode);
++ spin_unlock(&ip->i_lock);
++
++ if (inode) {
++ iput(tmp);
++ return inode;
++ }
++ yield();
++ }
++
++ inode = tmp;
++
++ gfs_inode_hold(ip);
++ ip->i_vnode = inode;
++ vn2ip(inode) = ip;
++
++ spin_unlock(&ip->i_lock);
++
++ insert_inode_hash(inode);
++
++ return inode;
++}
++
++/**
++ * gfs_copyin_dinode - Refresh the incore copy of the dinode
++ * @ip: The GFS inode
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_copyin_dinode(struct gfs_inode *ip)
++{
++ struct buffer_head *dibh;
++ int error;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ gfs_metatype_check(ip->i_sbd, dibh, GFS_METATYPE_DI);
++ gfs_dinode_in(&ip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ GFS_ASSERT_INODE(ip->i_num.no_formal_ino ==
++ ip->i_di.di_num.no_formal_ino, ip,
++ gfs_dinode_print(&ip->i_di););
++
++ /* Handle a moved inode */
++
++ if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
++ /* Not implemented yet */
++ GFS_ASSERT_INODE(FALSE, ip,);
++ }
++
++ ip->i_vn = ip->i_gl->gl_vn;
++
++ return 0;
++}
++
++/**
++ * inode_create - create a struct gfs_inode
++ * @i_gl: The glock covering the inode
++ * @inum: The inode number
++ * @io_gl: the iopen glock, or NULL
++ * @io_state: the state the iopen glock should be acquire in
++ * @ipp: pointer to put the returned inode in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++inode_create(struct gfs_glock *i_gl, struct gfs_inum *inum,
++ struct gfs_glock *io_gl, unsigned int io_state,
++ struct gfs_inode **ipp)
++{
++ struct gfs_sbd *sdp = i_gl->gl_sbd;
++ struct gfs_inode *ip;
++ int error = 0;
++
++ RETRY_MALLOC(ip = kmem_cache_alloc(gfs_inode_cachep, GFP_KERNEL), ip);
++ memset(ip, 0, sizeof(struct gfs_inode));
++
++ ip->i_num = *inum;
++
++ atomic_set(&ip->i_count, 1);
++
++ ip->i_gl = i_gl;
++ ip->i_sbd = sdp;
++
++ spin_lock_init(&ip->i_lock);
++
++ error = gfs_glock_nq_init(io_gl,
++ io_state, GL_LOCAL_EXCL | GL_EXACT,
++ &ip->i_iopen_gh);
++ if (error)
++ goto fail;
++
++ ip->i_iopen_gh.gh_owner = NULL;
++
++ spin_lock(&io_gl->gl_spin);
++ gfs_glock_hold(i_gl);
++ gl2gl(io_gl) = i_gl;
++ spin_unlock(&io_gl->gl_spin);
++
++ error = gfs_copyin_dinode(ip);
++ if (error)
++ goto fail_iopen;
++
++ gfs_glock_hold(i_gl);
++ gl2ip(i_gl) = ip;
++
++ atomic_inc(&sdp->sd_inode_count);
++
++ *ipp = ip;
++
++ return 0;
++
++ fail_iopen:
++ spin_lock(&io_gl->gl_spin);
++ gl2gl(io_gl) = NULL;
++ gfs_glock_put(i_gl);
++ spin_unlock(&io_gl->gl_spin);
++
++ gfs_glock_dq_uninit(&ip->i_iopen_gh);
++
++ fail:
++ gfs_flush_meta_cache(ip);
++ kmem_cache_free(gfs_inode_cachep, ip);
++ *ipp = NULL;
++
++ return error;
++}
++
++/**
++ * gfs_inode_get - Get an inode given its number
++ * @i_gl: The glock covering the inode
++ * @inum: The inode number
++ * @create: Flag to say if we are allowed to create a new struct gfs_inode
++ * @ipp: pointer to put the returned inode in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_inode_get(struct gfs_glock *i_gl, struct gfs_inum *inum, int create,
++ struct gfs_inode **ipp)
++{
++ struct gfs_glock *io_gl;
++ int error = 0;
++
++ *ipp = gl2ip(i_gl);
++ if (*ipp) {
++ atomic_inc(&(*ipp)->i_count);
++ GFS_ASSERT_INODE((*ipp)->i_num.no_formal_ino ==
++ inum->no_formal_ino,
++ (*ipp),);
++ } else if (create) {
++ error = gfs_glock_get(i_gl->gl_sbd,
++ inum->no_addr, &gfs_iopen_glops,
++ CREATE, &io_gl);
++ if (!error) {
++ error = inode_create(i_gl, inum, io_gl,
++ LM_ST_SHARED, ipp);
++ gfs_glock_put(io_gl);
++ }
++ }
++
++ return error;
++}
++
++/**
++ * gfs_inode_hold - hold a struct gfs_inode structure
++ * @ip: The GFS inode
++ *
++ */
++
++void
++gfs_inode_hold(struct gfs_inode *ip)
++{
++ GFS_ASSERT_INODE(atomic_read(&ip->i_count), ip,);
++ atomic_inc(&ip->i_count);
++}
++
++/**
++ * gfs_inode_put - put a struct gfs_inode structure
++ * @ip: The GFS inode
++ *
++ */
++
++void
++gfs_inode_put(struct gfs_inode *ip)
++{
++ atomic_dec(&ip->i_count);
++ GFS_ASSERT_INODE(atomic_read(&ip->i_count) >= 0, ip,);
++}
++
++/**
++ * gfs_inode_destroy - Destroy an inode structure with no references on it
++ * @ip: The GFS inode
++ *
++ * This function must be called with a glock held on the inode.
++ *
++ */
++
++void
++gfs_inode_destroy(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_glock *io_gl = ip->i_iopen_gh.gh_gl;
++ struct gfs_glock *i_gl = ip->i_gl;
++
++ GFS_ASSERT_INODE(!atomic_read(&ip->i_count), ip,);
++ GFS_ASSERT_INODE(gl2gl(io_gl) == i_gl, ip,);
++
++ spin_lock(&io_gl->gl_spin);
++ gl2gl(io_gl) = NULL;
++ gfs_glock_put(i_gl);
++ spin_unlock(&io_gl->gl_spin);
++
++ gfs_glock_dq_uninit(&ip->i_iopen_gh);
++
++ gfs_flush_meta_cache(ip);
++ kmem_cache_free(gfs_inode_cachep, ip);
++
++ gl2ip(i_gl) = NULL;
++ gfs_glock_put(i_gl);
++
++ atomic_dec(&sdp->sd_inode_count);
++}
++
++/**
++ * dinode_mark_unused - mark an on-disk dinode as unused
++ * @ip: the inode whose dinode should be marked unused
++ *
++ * Returns: errno
++ */
++
++static int
++dinode_mark_unused(struct gfs_inode *ip)
++{
++ struct buffer_head *dibh;
++ struct gfs_dinode *di;
++ uint32_t incarn;
++ uint64_t ctime;
++ uint32_t flags;
++ int error;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ di = (struct gfs_dinode *)dibh->b_data;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ incarn = gfs32_to_cpu(di->di_header.mh_incarn) + 1;
++ di->di_header.mh_incarn = cpu_to_gfs32(incarn);
++
++ ctime = get_seconds();
++ di->di_ctime = cpu_to_gfs64(ctime);
++
++ flags = (gfs32_to_cpu(di->di_flags)) | GFS_DIF_UNUSED;
++ di->di_flags = cpu_to_gfs32(flags);
++
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * dinode_dealloc - Deallocate a dinode
++ * @ip: The GFS inode
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++dinode_dealloc(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_rgrpd *rgd;
++ struct gfs_holder ri_gh, rgd_gh;
++ int error;
++
++ gfs_alloc_get(ip);
++
++ error = gfs_quota_hold_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail;
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ goto fail_qs;
++
++ rgd = gfs_blk2rgrpd(sdp, ip->i_num.no_addr);
++ GFS_ASSERT_INODE(rgd, ip,);
++
++ error = gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
++ if (error)
++ goto fail_rindex_relse;
++
++ GFS_ASSERT_INODE(ip->i_di.di_blocks == 1, ip,
++ gfs_dinode_print(&ip->i_di););
++
++ /* Trans may require:
++ One block for the RG header.
++ One block for the dinode bit.
++ One block for the dinode.
++ We also need a block for the unlinked change.
++ One block for the quota change. */
++
++ error = gfs_trans_begin(sdp, 3, 2);
++ if (error)
++ goto fail_rg_gunlock;
++
++ error = dinode_mark_unused(ip);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_difree(rgd, ip);
++
++ gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IDA, &ip->i_num);
++ clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_uninit(&rgd_gh);
++ gfs_glock_dq_uninit(&ri_gh);
++
++ gfs_quota_unhold_m(ip);
++ gfs_alloc_put(ip);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_rg_gunlock:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ fail_rindex_relse:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ fail_qs:
++ gfs_quota_unhold_m(ip);
++
++ fail:
++ gfs_alloc_put(ip);
++
++ return error;
++}
++
++/**
++ * inode_dealloc - Deallocate an inode
++ * @sdp: the filesystem
++ * @inum: the inode number to deallocate
++ * @io_gh: a holder for the iopen glock for this inode
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++inode_dealloc(struct gfs_sbd *sdp, struct gfs_inum *inum,
++ struct gfs_holder *io_gh)
++{
++ struct gfs_inode *ip;
++ struct gfs_holder i_gh;
++ int error;
++
++ error = gfs_glock_nq_num(sdp,
++ inum->no_formal_ino, &gfs_inode_glops,
++ LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (error)
++ return error;
++
++ /* We reacquire the iopen lock here to avoid a race with the NFS server
++ calling gfs_read_inode() with the inode number of an inode we're in the
++ process of deallocating. And we can't keep our hold on the lock
++ from try_dealloc_inode() for deadlock reasons. */
++
++ gfs_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY, io_gh);
++ error = gfs_glock_nq(io_gh);
++ switch (error) {
++ case 0:
++ break;
++ case GLR_TRYFAILED:
++ error = 0;
++ goto fail;
++ default:
++ GFS_ASSERT_SBD(error < 0, sdp,);
++ goto fail;
++ }
++
++ GFS_ASSERT_GLOCK(!gl2ip(i_gh.gh_gl), i_gh.gh_gl,);
++ error = inode_create(i_gh.gh_gl, inum, io_gh->gh_gl, LM_ST_EXCLUSIVE,
++ &ip);
++
++ gfs_glock_dq(io_gh);
++
++ if (error)
++ goto fail;
++
++ GFS_ASSERT_INODE(!ip->i_di.di_nlink, ip,
++ gfs_dinode_print(&ip->i_di););
++ GFS_ASSERT_INODE(atomic_read(&ip->i_count) == 1, ip,);
++ GFS_ASSERT_INODE(!ip->i_vnode, ip,);
++
++ if (ip->i_di.di_type == GFS_FILE_DIR &&
++ (ip->i_di.di_flags & GFS_DIF_EXHASH)) {
++ error = gfs_dir_exhash_free(ip);
++ if (error)
++ goto fail_iput;
++ }
++
++ if (ip->i_di.di_eattr) {
++ error = gfs_ea_dealloc(ip);
++ if (error)
++ goto fail_iput;
++ }
++
++ error = gfs_shrink(ip, 0, NULL);
++ if (error)
++ goto fail_iput;
++
++ error = dinode_dealloc(ip);
++ if (error)
++ goto fail_iput;
++
++ gfs_inode_put(ip);
++ gfs_inode_destroy(ip);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return 0;
++
++ fail_iput:
++ gfs_inode_put(ip);
++ gfs_inode_destroy(ip);
++
++ fail:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * inode_dealloc_init - Try to deallocate an inode and all its blocks
++ * @sdp: the filesystem
++ * @inum: the inode number to deallocate
++ *
++ * Returns: 0 on success, -errno on error, 1 on busy
++ */
++
++static int
++inode_dealloc_init(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ struct gfs_holder io_gh;
++ int error = 0;
++
++ gfs_try_toss_inode(sdp, inum);
++
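++ /* Try to grab the iopen glock exclusively; any node still using
++ the inode holds it shared (LM_FLAG_TRY_1CB presumably also sends
++ one callback asking holders to release). A failed try means the
++ inode is busy. */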
++ error = gfs_glock_nq_num(sdp,
++ inum->no_addr, &gfs_iopen_glops,
++ LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB, &io_gh);
++ switch (error) {
++ case 0:
++ break;
++ case GLR_TRYFAILED:
++ return 1;
++ default:
++ GFS_ASSERT_SBD(error < 0, sdp,);
++ return error;
++ }
++
++ gfs_glock_dq(&io_gh);
++ error = inode_dealloc(sdp, inum, &io_gh);
++ gfs_holder_uninit(&io_gh);
++
++ return error;
++}
++
++/**
++ * inode_dealloc_uninit - deallocate an uninitialized dinode
++ * @sdp: the filesystem
++ * @inum: the inode number to deallocate
++ *
++ * Returns: 0 on success, -errno on error, 1 on busy
++ */
++
++static int
++inode_dealloc_uninit(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ struct gfs_rgrpd *rgd;
++ struct gfs_holder ri_gh, rgd_gh;
++ int error;
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ return error;
++
++ rgd = gfs_blk2rgrpd(sdp, inum->no_addr);
++ GFS_ASSERT_SBD(rgd, sdp,);
++
++ error = gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
++ if (error)
++ goto fail;
++
++ /* Trans may require:
++ One block for the RG header.
++ One block for the dinode bit.
++ We also need a block for the unlinked change. */
++
++ error = gfs_trans_begin(sdp, 2, 1);
++ if (error)
++ goto fail_gunlock;
++
++ gfs_difree_uninit(rgd, inum->no_addr);
++ gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IDA, inum);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_uninit(&rgd_gh);
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return 0;
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ fail:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return error;
++}
++
++/**
++ * gfs_inode_dealloc - Grab an unlinked inode off the list and try to free it.
++ * @sdp: the filesystem
++ * @inum: the inode number of the unlinked inode
++ *
++ * Returns: 0 on success, -errno on error, 1 on busy
++ */
++
++int
++gfs_inode_dealloc(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ if (inum->no_formal_ino)
++ return inode_dealloc_init(sdp, inum);
++ else
++ return inode_dealloc_uninit(sdp, inum);
++}
++
++/**
++ * gfs_change_nlink - Change nlink count on inode
++ * @ip: The GFS inode
++ * @diff: The change in the nlink count required
++ *
++ * Returns: 0 on success, -EXXXX on failure.
++ */
++
++int
++gfs_change_nlink(struct gfs_inode *ip, int diff)
++{
++ struct buffer_head *dibh;
++ uint32_t nlink;
++ int error;
++
++ nlink = ip->i_di.di_nlink + diff;
++
++ if (diff < 0)
++ GFS_ASSERT_INODE(nlink < ip->i_di.di_nlink, ip,
++ gfs_dinode_print(&ip->i_di););
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ ip->i_di.di_nlink = nlink;
++ ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * gfs_lookupi - Look up a filename in a directory and return its inode
++ * @d_gh: An initialized holder for the directory glock
++ * @name: The name of the inode to look for
++ * @is_root: If TRUE, ignore the caller's permissions
++ * @i_gh: An uninitialized holder for the new inode glock
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++int
++gfs_lookupi(struct gfs_holder *d_gh, struct qstr *name,
++ int is_root, struct gfs_holder *i_gh)
++{
++ struct gfs_inode *dip = gl2ip(d_gh->gh_gl);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_glock *gl;
++ struct gfs_inode *ip;
++ struct gfs_inum inum, inum2;
++ unsigned int type;
++ int error;
++
++ i_gh->gh_gl = NULL;
++
++ if (!name->len || name->len > GFS_FNAMESIZE)
++ return -ENAMETOOLONG;
++
++ if (gfs_filecmp(name, ".", 1)) {
++ gfs_holder_reinit(LM_ST_SHARED, 0, d_gh);
++ error = gfs_glock_nq(d_gh);
++ if (!error) {
++ error = gfs_glock_nq_init(dip->i_gl,
++ LM_ST_SHARED, 0,
++ i_gh);
++ GFS_ASSERT_INODE(!error, dip,);
++ gfs_inode_hold(dip);
++ }
++
++ return error;
++ }
++
++ if (gfs_glock_is_locked_by_me(d_gh->gh_gl))
++ bitch_about(sdp, &sdp->sd_last_readdirplus,
++ "readdirplus-type behavior");
++
++ gfs_holder_reinit(LM_ST_SHARED, 0, d_gh);
++ error = gfs_glock_nq(d_gh);
++ if (error)
++ return error;
++
++ if (!is_root) {
++ struct inode *dir = gfs_iget(dip, NO_CREATE);
++ if (dir) {
++ error = permission(dir, MAY_EXEC, NULL);
++ iput(dir);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ return error;
++ }
++ }
++ }
++
++ error = gfs_dir_search(dip, name, &inum, &type);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ if (error == -ENOENT)
++ error = 0;
++ return error;
++ }
++
++ restart:
++ error = gfs_glock_get(sdp, inum.no_formal_ino, &gfs_inode_glops,
++ CREATE, &gl);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ return error;
++ }
++
++ /* Acquire the second lock. Glocks must be taken in ascending
++ lock-number order to avoid deadlock, so if the target inode's
++ lock sorts below the directory's, drop the directory glock and
++ reacquire both in order. */
++
++ if (gl->gl_name.ln_number < dip->i_gl->gl_name.ln_number) {
++ gfs_glock_dq(d_gh);
++
++ error = gfs_glock_nq_init(gl, LM_ST_SHARED,
++ LM_FLAG_ANY | GL_LOCAL_EXCL,
++ i_gh);
++ if (error)
++ goto out;
++
++ gfs_holder_reinit(LM_ST_SHARED, 0, d_gh);
++ error = gfs_glock_nq(d_gh);
++ if (error) {
++ gfs_glock_dq_uninit(i_gh);
++ goto out;
++ }
++
++ if (!is_root) {
++ struct inode *dir = gfs_iget(dip, NO_CREATE);
++ if (dir) {
++ error = permission(dir, MAY_EXEC, NULL);
++ iput(dir);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ gfs_glock_dq_uninit(i_gh);
++ goto out;
++ }
++ }
++ }
++
++ error = gfs_dir_search(dip, name, &inum2, &type);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ gfs_glock_dq_uninit(i_gh);
++ if (error == -ENOENT)
++ error = 0;
++ goto out;
++ }
++
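++ /* The directory was unlocked while the locks were reordered, so
++ the entry may have changed. If it now points at a different
++ inode, start over with the new inode number. */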
++ if (!gfs_inum_equal(&inum, &inum2)) {
++ gfs_glock_dq_uninit(i_gh);
++ gfs_glock_put(gl);
++ inum = inum2;
++ goto restart;
++ }
++ } else {
++ error = gfs_glock_nq_init(gl, LM_ST_SHARED,
++ LM_FLAG_ANY | GL_LOCAL_EXCL,
++ i_gh);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ goto out;
++ }
++ }
++
++ error = gfs_inode_get(gl, &inum, CREATE, &ip);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ gfs_glock_dq_uninit(i_gh);
++ } else
++ GFS_ASSERT_INODE(ip->i_di.di_type == type, ip,);
++
++ out:
++ gfs_glock_put(gl);
++
++ return error;
++}
++
++/**
++ * create_ok - Check that a new entry may be created in a directory
++ * @dip: the directory inode
++ * @name: the name of the new entry
++ * @type: the file type of the new entry
++ *
++ * Returns: errno
++ */
++
++static int
++create_ok(struct gfs_inode *dip, struct qstr *name, unsigned int type)
++{
++ int error;
++
++ {
++ struct inode *dir = gfs_iget(dip, NO_CREATE);
++ if (dir) {
++ error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
++ iput(dir);
++ if (error)
++ return error;
++ }
++ }
++
++ /* Don't create entries in an unlinked directory */
++
++ if (!dip->i_di.di_nlink)
++ return -EPERM;
++
++ error = gfs_dir_search(dip, name, NULL, NULL);
++ switch (error) {
++ case -ENOENT:
++ error = 0;
++ break;
++ case 0:
++ return -EEXIST;
++ default:
++ return error;
++ }
++
++ if (dip->i_di.di_entries == (uint32_t)-1)
++ return -EFBIG;
++ if (type == GFS_FILE_DIR && dip->i_di.di_nlink == (uint32_t)-1)
++ return -EMLINK;
++
++ return 0;
++}
++
++/**
++ * dinode_alloc - Allocate a new dinode block
++ * @dip: the directory in which the inode is being created
++ * @ul: returns the unlinked structure tagging the new dinode
++ *
++ * Returns: errno
++ */
++
++static int
++dinode_alloc(struct gfs_inode *dip, struct gfs_unlinked **ul)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_alloc *al;
++ struct gfs_inum inum;
++ int error;
++
++ al = gfs_alloc_get(dip);
++
++ al->al_requested_di = 1;
++
++ error = gfs_inplace_reserve(dip);
++ if (error)
++ goto out;
++
++ error = gfs_trans_begin(sdp, al->al_rgd->rd_ri.ri_length, 1);
++ if (error)
++ goto out_inplace;
++
++ inum.no_formal_ino = 0;
++ error = gfs_dialloc(dip, &inum.no_addr);
++ if (error)
++ goto out_end_trans;
++
++ *ul = gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IUL, &inum);
++ gfs_unlinked_lock(sdp, *ul);
++
++ gfs_trans_add_gl(dip->i_gl);
++
++ out_end_trans:
++ gfs_trans_end(sdp);
++
++ out_inplace:
++ gfs_inplace_release(dip);
++
++ out:
++ gfs_alloc_put(dip);
++
++ return error;
++}
++
++/**
++ * pick_formal_ino - Pick a formal inode number for a given inode
++ * @sdp: the filesystem
++ * @inum: the inode number structure
++ *
++ */
++
++static void
++pick_formal_ino(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ /* This won't always be true */
++ inum->no_formal_ino = inum->no_addr;
++}
++
++/**
++ * make_dinode - Fill in a new dinode structure
++ * @dip: the directory this inode is being created in
++ * @gl: The glock covering the new inode
++ * @inum: the inode number
++ * @type: the file type
++ * @mode: the file permissions
++ * @uid:
++ * @gid:
++ *
++ */
++
++static int
++make_dinode(struct gfs_inode *dip,
++ struct gfs_glock *gl, struct gfs_inum *inum,
++ unsigned int type, unsigned int mode,
++ unsigned int uid, unsigned int gid)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_dinode di;
++ struct buffer_head *dibh;
++ struct gfs_rgrpd *rgd;
++ int error;
++
++ error = gfs_dread(sdp, inum->no_addr, gl,
++ DIO_NEW | DIO_START | DIO_WAIT,
++ &dibh);
++ if (error)
++ return error;
++
++ gfs_trans_add_bh(gl, dibh);
++ gfs_metatype_set(sdp, dibh, GFS_METATYPE_DI, GFS_FORMAT_DI);
++ gfs_buffer_clear_tail(dibh, sizeof(struct gfs_dinode));
++
++ memset(&di, 0, sizeof(struct gfs_dinode));
++
++ gfs_meta_header_in(&di.di_header, dibh->b_data);
++
++ di.di_num = *inum;
++
++ di.di_mode = mode & S_IALLUGO;
++ di.di_uid = uid;
++ di.di_gid = gid;
++ di.di_nlink = 1;
++ di.di_blocks = 1;
++ di.di_atime = di.di_mtime = di.di_ctime = get_seconds();
++
++ rgd = gfs_blk2rgrpd(sdp, inum->no_addr);
++ GFS_ASSERT_SBD(rgd, sdp,
++ printk("block = %"PRIu64"\n", inum->no_addr););
++
++ di.di_rgrp = rgd->rd_ri.ri_addr;
++ di.di_goal_rgrp = di.di_rgrp;
++ di.di_goal_dblk = di.di_goal_mblk = inum->no_addr - rgd->rd_ri.ri_data1;
++
++ if (type == GFS_FILE_REG) {
++ if ((dip->i_di.di_flags & GFS_DIF_INHERIT_JDATA) ||
++ sdp->sd_tune.gt_new_files_jdata)
++ di.di_flags |= GFS_DIF_JDATA;
++ if ((dip->i_di.di_flags & GFS_DIF_INHERIT_DIRECTIO) ||
++ sdp->sd_tune.gt_new_files_directio)
++ di.di_flags |= GFS_DIF_DIRECTIO;
++ } else if (type == GFS_FILE_DIR) {
++ di.di_flags |= (dip->i_di.di_flags & GFS_DIF_INHERIT_DIRECTIO);
++ di.di_flags |= (dip->i_di.di_flags & GFS_DIF_INHERIT_JDATA);
++ }
++
++ di.di_type = type;
++
++ gfs_dinode_out(&di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * inode_init_and_link - Initialize a new dinode and link it into a directory
++ * @dip: the directory inode
++ * @name: the name of the new entry
++ * @inum: the inode number of the new inode
++ * @gl: the glock covering the new inode
++ * @type: the file type
++ * @mode: the file permissions
++ *
++ * Returns: errno
++ */
++
++static int
++inode_init_and_link(struct gfs_inode *dip, struct qstr *name,
++ struct gfs_inum *inum, struct gfs_glock *gl,
++ unsigned int type, unsigned int mode)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct posix_acl *acl = NULL;
++ struct gfs_alloc *al;
++ struct gfs_inode *ip;
++ unsigned int gid;
++ int alloc_required;
++ int error;
++
++ error = gfs_setup_new_acl(dip, type, &mode, &acl);
++ if (error)
++ return error;
++
++ if (dip->i_di.di_mode & S_ISGID) {
++ if (type == GFS_FILE_DIR)
++ mode |= S_ISGID;
++ gid = dip->i_di.di_gid;
++ } else
++ gid = current->fsgid;
++
++ al = gfs_alloc_get(dip);
++
++ error = gfs_quota_lock_m(dip,
++ current->fsuid,
++ gid);
++ if (error)
++ goto fail;
++
++ error = gfs_quota_check(dip, current->fsuid, gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ if (acl)
++ alloc_required = TRUE;
++ else {
++ error = gfs_diradd_alloc_required(dip, name, &alloc_required);
++ if (error)
++ goto fail_gunlock_q;
++ }
++
++ if (alloc_required) {
++ error = gfs_quota_check(dip, dip->i_di.di_uid, dip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ al->al_requested_meta = sdp->sd_max_dirres + GFS_MAX_EA_ACL_BLKS;
++
++ error = gfs_inplace_reserve(dip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ blocks for two dinodes, the directory blocks necessary for
++ a new entry, RG bitmap blocks for an allocation,
++ and one block for a quota change and
++ one block for an unlinked tag. */
++
++ error = gfs_trans_begin(sdp,
++ 2 + sdp->sd_max_dirres +
++ al->al_rgd->rd_ri.ri_length +
++ GFS_MAX_EA_ACL_BLKS, 2);
++ if (error)
++ goto fail_inplace;
++ } else {
++ /* Trans may require:
++ blocks for two dinodes, a leaf block,
++ and one block for a quota change and
++ one block for an unlinked tag. */
++
++ error = gfs_trans_begin(sdp, 3, 2);
++ if (error)
++ goto fail_gunlock_q;
++ }
++
++ error = gfs_dir_add(dip, name, inum, type);
++ if (error)
++ goto fail_end_trans;
++
++ error = make_dinode(dip, gl, inum, type, mode, current->fsuid, gid);
++ if (error)
++ goto fail_end_trans;
++
++ al->al_ul = gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IDA,
++ &(struct gfs_inum){0, inum->no_addr});
++ gfs_trans_add_quota(sdp, +1, current->fsuid, gid);
++
++ /* Gfs_inode_get() can't fail here. But then again, it shouldn't be
++ here (it should be in gfs_createi()). Gfs_init_acl() has no
++ business needing a memory-resident inode. */
++
++ gfs_inode_get(gl, inum, CREATE, &ip);
++
++ if (acl) {
++ error = gfs_init_acl(dip, ip, type, acl);
++ GFS_ASSERT(!error, ); /* Sigh. */
++ }
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_inplace:
++ if (alloc_required)
++ gfs_inplace_release(dip);
++
++ fail_gunlock_q:
++ gfs_quota_unlock_m(dip);
++
++ fail:
++ gfs_alloc_put(dip);
++ if (acl)
++ posix_acl_release(acl);
++
++ return error;
++}
++
++/**
++ * gfs_createi - Create a new inode
++ * @d_gh: An initialized holder for the directory glock
++ * @name: The name of the new file
++ * @type: The type of dinode (GFS_FILE_REG, GFS_FILE_DIR, GFS_FILE_LNK, ...)
++ * @mode: the permissions on the new inode
++ * @i_gh: An uninitialized holder for the new inode glock
++ *
++ * If the return value is 0, the glocks on both the directory and the new
++ * file are held. A transaction has been started and an inplace reservation
++ * is held, as well.
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++int
++gfs_createi(struct gfs_holder *d_gh, struct qstr *name,
++ unsigned int type, unsigned int mode,
++ struct gfs_holder *i_gh)
++{
++ struct gfs_inode *dip = gl2ip(d_gh->gh_gl);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_unlinked *ul;
++ struct gfs_inum inum;
++ struct gfs_holder io_gh;
++ int error;
++
++ if (!name->len || name->len > GFS_FNAMESIZE)
++ return -ENAMETOOLONG;
++
++ gfs_holder_reinit(LM_ST_EXCLUSIVE, 0, d_gh);
++ error = gfs_glock_nq(d_gh);
++ if (error)
++ return error;
++
++ error = create_ok(dip, name, type);
++ if (error)
++ goto fail;
++
++ error = dinode_alloc(dip, &ul);
++ if (error)
++ goto fail;
++
++ inum.no_addr = ul->ul_inum.no_addr;
++ pick_formal_ino(sdp, &inum);
++
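++ /* Honor glock ordering by lock number: if the new inode's lock
++ sorts below the directory's, drop the directory glock, take the
++ inode glock first, reacquire the directory, and re-check that
++ the create is still valid. */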
++ if (inum.no_formal_ino < dip->i_num.no_formal_ino) {
++ gfs_glock_dq(d_gh);
++
++ error = gfs_glock_nq_num(sdp,
++ inum.no_formal_ino, &gfs_inode_glops,
++ LM_ST_EXCLUSIVE, GL_SKIP, i_gh);
++ if (error) {
++ gfs_unlinked_unlock(sdp, ul);
++ return error;
++ }
++
++ gfs_holder_reinit(LM_ST_EXCLUSIVE, 0, d_gh);
++ error = gfs_glock_nq(d_gh);
++ if (error) {
++ gfs_glock_dq_uninit(i_gh);
++ gfs_unlinked_unlock(sdp, ul);
++ return error;
++ }
++
++ error = create_ok(dip, name, type);
++ if (error)
++ goto fail_gunlock_i;
++ } else {
++ error = gfs_glock_nq_num(sdp,
++ inum.no_formal_ino, &gfs_inode_glops,
++ LM_ST_EXCLUSIVE, GL_SKIP, i_gh);
++ if (error)
++ goto fail_ul;
++ }
++
++ error = gfs_glock_nq_num(sdp,
++ inum.no_addr, &gfs_iopen_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL | GL_EXACT,
++ &io_gh);
++ if (error)
++ goto fail_gunlock_i;
++
++ error = inode_init_and_link(dip, name, &inum, i_gh->gh_gl, type, mode);
++ if (error)
++ goto fail_gunlock_io;
++
++ gfs_glock_dq_uninit(&io_gh);
++
++ return 0;
++
++ fail_gunlock_io:
++ gfs_glock_dq_uninit(&io_gh);
++
++ fail_gunlock_i:
++ gfs_glock_dq_uninit(i_gh);
++
++ fail_ul:
++ gfs_unlinked_unlock(sdp, ul);
++
++ fail:
++ gfs_glock_dq(d_gh);
++
++ return error;
++}
++
++/**
++ * gfs_unlinki - Unlink a file
++ * @dip: The inode of the directory
++ * @name: The name of the file to be unlinked
++ * @ip: The inode of the file to be removed
++ *
++ * Assumes Glocks on both dip and ip are held.
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++int
++gfs_unlinki(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ int error;
++
++ error = gfs_dir_del(dip, name);
++ if (error)
++ return error;
++
++ error = gfs_change_nlink(ip, -1);
++ if (error)
++ return error;
++
++ /* If this inode is being unlinked from the directory structure,
++ we need to mark that in the log so that it isn't lost during
++ a crash. */
++
++ if (!ip->i_di.di_nlink) {
++ gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IUL, &ip->i_num);
++ set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_rmdiri - Remove a directory
++ * @dip: The parent directory of the directory to be removed
++ * @name: The name of the directory to be removed
++ * @ip: The GFS inode of the directory to be removed
++ *
++ * Assumes Glocks on dip and ip are held
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++int
++gfs_rmdiri(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct qstr dotname;
++ int error;
++
++ GFS_ASSERT_INODE(ip->i_di.di_entries == 2, ip,
++ gfs_dinode_print(&ip->i_di););
++
++ error = gfs_dir_del(dip, name);
++ if (error)
++ return error;
++
++ error = gfs_change_nlink(dip, -1);
++ if (error)
++ return error;
++
++ dotname.len = 1;
++ dotname.name = ".";
++ error = gfs_dir_del(ip, &dotname);
++ if (error)
++ return error;
++
++ dotname.len = 2;
++ dotname.name = "..";
++ error = gfs_dir_del(ip, &dotname);
++ if (error)
++ return error;
++
++ error = gfs_change_nlink(ip, -2);
++ if (error)
++ return error;
++
++ /* This inode is being unlinked from the directory structure and
++ we need to mark that in the log so that it isn't lost during
++ a crash. */
++
++ gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IUL, &ip->i_num);
++ set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
++
++ return 0;
++}
++
++/**
++ * gfs_revalidate - check to see that an inode is still in a directory
++ * @dip: the directory
++ * @name: the name of the file
++ * @ip: the inode
++ *
++ * Assumes that the lock on (at least) @dip is held.
++ *
++ * Returns: 0 if the parent/child relationship is correct, -ENOENT if it isn't
++ */
++
++int
++gfs_revalidate(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip)
++{
++ struct gfs_inum inum;
++ unsigned int type;
++ int error;
++
++ error = gfs_dir_search(dip, name, &inum, &type);
++ if (!error) {
++ if (inum.no_formal_ino == ip->i_num.no_formal_ino)
++ GFS_ASSERT_INODE(ip->i_di.di_type == type, ip,);
++ else
++ error = -ENOENT;
++ }
++
++ return error;
++}
++
++/**
++ * gfs_ok_to_move - check if it's ok to move a directory to another directory
++ * @this: move this
++ * @to: to here
++ *
++ * Follow @to back to the root and make sure we don't encounter @this
++ * Assumes we already hold the rename lock.
++ *
++ * Returns: 0 if it's ok to move, -EXXX if it isn't
++ */
++
++int
++gfs_ok_to_move(struct gfs_inode *this, struct gfs_inode *to)
++{
++ struct gfs_sbd *sdp = this->i_sbd;
++ struct gfs_inode *tmp;
++ struct gfs_holder to_gh, tmp_gh;
++ struct qstr dotdot;
++ int error = 0;
++
++ memset(&dotdot, 0, sizeof (struct qstr));
++ dotdot.name = "..";
++ dotdot.len = 2;
++
++ gfs_inode_hold(to);
++
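++ /* Walk up the ".." chain from @to. Reaching the root means the
++ move is safe; encountering @this means the move would create a
++ disconnected loop. */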
++ for (;;) {
++ if (to == this) {
++ error = -EINVAL;
++ break;
++ }
++ if (to == sdp->sd_rooti) {
++ error = 0;
++ break;
++ }
++
++ gfs_holder_init(to->i_gl, 0, 0, &to_gh);
++
++ error = gfs_lookupi(&to_gh, &dotdot, TRUE, &tmp_gh);
++ if (error) {
++ gfs_holder_uninit(&to_gh);
++ break;
++ }
++ if (!tmp_gh.gh_gl) {
++ gfs_holder_uninit(&to_gh);
++ error = -ENOENT;
++ break;
++ }
++
++ tmp = gl2ip(tmp_gh.gh_gl);
++
++ gfs_glock_dq_uninit(&to_gh);
++ gfs_glock_dq_uninit(&tmp_gh);
++
++ gfs_inode_put(to);
++ to = tmp;
++ }
++
++ gfs_inode_put(to);
++
++ return error;
++}
++
++/**
++ * gfs_readlinki - return the contents of a symlink
++ * @ip: the symlink's inode
++ * @buf: a pointer to the buffer to be filled
++ * @len: a pointer to the length of @buf
++ *
++ * If @buf is too small, a piece of memory is gmalloc()ed and needs
++ * to be freed by the caller.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_readlinki(struct gfs_inode *ip, char **buf, unsigned int *len)
++{
++ struct gfs_holder i_gh;
++ struct buffer_head *dibh;
++ unsigned int x;
++ int error;
++
++ gfs_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
++ error = gfs_glock_nq_atime(&i_gh);
++ if (error) {
++ gfs_holder_uninit(&i_gh);
++ return error;
++ }
++
++ GFS_ASSERT_INODE(ip->i_di.di_size, ip,);
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto out;
++
++ x = ip->i_di.di_size + 1;
++ if (x > *len)
++ *buf = gmalloc(x);
++
++ memcpy(*buf, dibh->b_data + sizeof(struct gfs_dinode), x);
++ *len = x;
++
++ brelse(dibh);
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_glock_nq_atime - Acquire the glock and conditionally update the atime on an inode
++ * @gh: the holder to acquire
++ *
++ * Tests atime for gfs_read, gfs_readdir and gfs_test_mmap
++ * Update if the difference between the current time and the current atime
++ * is greater than an interval specified at mount.
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_glock_nq_atime(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_inode *ip;
++ int64_t curtime, quantum = sdp->sd_tune.gt_atime_quantum;
++ unsigned int state;
++ int flags;
++ int error;
++
++ GFS_ASSERT_GLOCK(gh->gh_flags & GL_ATIME, gl,);
++ GFS_ASSERT_GLOCK(!(gh->gh_flags & GL_ASYNC), gl,);
++ GFS_ASSERT_GLOCK(gl->gl_ops == &gfs_inode_glops, gl,);
++
++ ip = gl2ip(gl);
++ GFS_ASSERT_GLOCK(ip, gl,);
++
++ state = gh->gh_state;
++ flags = gh->gh_flags;
++
++ error = gfs_glock_nq(gh);
++ if (error)
++ return error;
++
++ if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
++ test_bit(SDF_ROFS, &sdp->sd_flags))
++ return 0;
++
++ curtime = get_seconds();
++ if (curtime - ip->i_di.di_atime >= quantum) {
++ int was_exclusive = (gl->gl_state == LM_ST_EXCLUSIVE);
++
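++ /* The atime looks stale; trade the hold for an exclusive one so
++ the dinode can be updated, then re-check under that lock. */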
++ gfs_glock_dq(gh);
++ gfs_holder_reinit(LM_ST_EXCLUSIVE,
++ gh->gh_flags & ~LM_FLAG_ANY,
++ gh);
++ error = gfs_glock_nq(gh);
++ if (error)
++ return error;
++
++ /* Verify this hasn't been updated while we were
++ trying to get exclusive lock. */
++
++ curtime = get_seconds();
++ if (curtime - ip->i_di.di_atime >= quantum) {
++ struct buffer_head *dibh;
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error == -EROFS)
++ return 0;
++ if (error)
++ goto fail;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ ip->i_di.di_atime = curtime;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++ }
++
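++ /* The caller asked for something weaker than exclusive; drop back
++ and reacquire in the originally requested state. GL_EXACT
++ presumably prevents LM_FLAG_ANY-style state substitutions here. */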
++ if (!was_exclusive) {
++ gfs_glock_dq(gh);
++ flags &= ~LM_FLAG_ANY;
++ flags |= GL_EXACT;
++ gfs_holder_reinit(state, flags, gh);
++ error = gfs_glock_nq(gh);
++ return error;
++ }
++ }
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail:
++ gfs_glock_dq(gh);
++
++ return error;
++}
++
++/**
++ * glock_compare_atime - Compare two struct gfs_holder structures for sorting
++ * @arg_a: the first structure
++ * @arg_b: the second structure
++ *
++ */
++
++static int
++glock_compare_atime(void *arg_a, void *arg_b)
++{
++ struct gfs_holder *gh_a = *(struct gfs_holder **)arg_a;
++ struct gfs_holder *gh_b = *(struct gfs_holder **)arg_b;
++ struct lm_lockname *a = &gh_a->gh_gl->gl_name;
++ struct lm_lockname *b = &gh_b->gh_gl->gl_name;
++ int ret = 0;
++
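++ /* Sort by lock number first. For holders of the same lock,
++ exclusive requests and GL_ATIME requests (which may upgrade to
++ exclusive) sort ahead of plain shared requests. */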
++ if (a->ln_number > b->ln_number)
++ ret = 1;
++ else if (a->ln_number < b->ln_number)
++ ret = -1;
++ else {
++ if (gh_a->gh_state == LM_ST_SHARED &&
++ gh_b->gh_state == LM_ST_EXCLUSIVE)
++ ret = 1;
++ else if (gh_a->gh_state == LM_ST_SHARED &&
++ (gh_b->gh_flags & GL_ATIME))
++ ret = 1;
++ }
++
++ return ret;
++}
++
++/**
++ * gfs_glock_nq_m_atime - acquire multiple glocks where one may need an atime update
++ * @num_gh: the number of structures
++ * @ghs: an array of struct gfs_holder structures
++ *
++ * Returns: 0 on success (all glocks acquired), -EXXX on failure (no glocks acquired)
++ */
++
++int
++gfs_glock_nq_m_atime(unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct gfs_holder *p[num_gh];
++ unsigned int x;
++ int error = 0;
++
++ GFS_ASSERT(num_gh,);
++
++ if (num_gh == 1) {
++ ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
++ if (ghs->gh_flags & GL_ATIME)
++ error = gfs_glock_nq_atime(ghs);
++ else
++ error = gfs_glock_nq(ghs);
++ return error;
++ }
++
++ for (x = 0; x < num_gh; x++)
++ p[x] = &ghs[x];
++
++ gfs_sort(p, num_gh, sizeof(struct gfs_holder *), glock_compare_atime);
++
++ for (x = 0; x < num_gh; x++) {
++ p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
++
++ if (p[x]->gh_flags & GL_ATIME)
++ error = gfs_glock_nq_atime(p[x]);
++ else
++ error = gfs_glock_nq(p[x]);
++
++ if (error) {
++ while (x--)
++ gfs_glock_dq(p[x]);
++ break;
++ }
++ }
++
++ return error;
++}
++
++/**
++ * gfs_try_toss_vnode - See if we can toss a vnode from memory
++ * @ip: the inode
++ *
++ */
++
++void
++gfs_try_toss_vnode(struct gfs_inode *ip)
++{
++ struct inode *inode;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (!inode)
++ return;
++
++ d_prune_aliases(inode);
++
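++ /* For a directory, also try to shrink its cached dentry subtree,
++ unless something is mounted beneath it. */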
++ if (ip->i_di.di_type == GFS_FILE_DIR) {
++ struct list_head *head = &inode->i_dentry;
++ struct dentry *d = NULL;
++
++ spin_lock(&dcache_lock);
++ if (list_empty(head))
++ spin_unlock(&dcache_lock);
++ else {
++ d = list_entry(head->next, struct dentry, d_alias);
++ dget_locked(d);
++ spin_unlock(&dcache_lock);
++
++ if (have_submounts(d))
++ dput(d);
++ else {
++ shrink_dcache_parent(d);
++ dput(d);
++ d_prune_aliases(inode);
++ }
++ }
++ }
++
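++ /* Zeroing i_nlink before the final iput() presumably makes the
++ VFS treat the inode as deleted, so it is evicted immediately
++ instead of lingering in the inode cache. */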
++ inode->i_nlink = 0;
++ iput(inode);
++}
++
++/**
++ * iah_make_jdata - Set the journaled-data flag on a hidden inode's dinode
++ * @gl: the glock covering the inode
++ * @inum: the inode number
++ *
++ */
++
++static void
++iah_make_jdata(struct gfs_glock *gl, struct gfs_inum *inum)
++{
++ struct buffer_head *bh;
++ struct gfs_dinode *di;
++ uint32_t flags;
++ int error;
++
++ error = gfs_dread(gl->gl_sbd, inum->no_addr, gl, DIO_START | DIO_WAIT, &bh);
++ GFS_ASSERT_GLOCK(!error, gl,); /* Already pinned */
++
++ di = (struct gfs_dinode *)bh->b_data;
++
++ flags = di->di_flags;
++ flags = gfs32_to_cpu(flags) | GFS_DIF_JDATA;
++ di->di_flags = cpu_to_gfs32(flags);
++
++ brelse(bh);
++}
++
++/**
++ * iah_super_update - Write the in-core superblock back to disk
++ * @sdp: the filesystem
++ *
++ * Returns: errno
++ */
++
++static int
++iah_super_update(struct gfs_sbd *sdp)
++{
++ struct gfs_glock *gl;
++ struct buffer_head *bh;
++ int error;
++
++ error = gfs_glock_get(sdp,
++ GFS_SB_LOCK, &gfs_meta_glops,
++ NO_CREATE, &gl);
++ GFS_ASSERT_SBD(!error && gl, sdp,); /* This should already be held. */
++
++ error = gfs_dread(sdp,
++ GFS_SB_ADDR >> sdp->sd_fsb2bb_shift, gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (!error) {
++ gfs_trans_add_bh(gl, bh);
++ gfs_sb_out(&sdp->sd_sb, bh->b_data);
++ brelse(bh);
++ }
++
++ gfs_glock_put(gl);
++
++ return error;
++}
++
++/**
++ * inode_alloc_hidden - Allocate one of the special hidden inodes
++ * @sdp: the filesystem
++ * @inum: returns the inode number of the new hidden inode
++ *
++ * Returns: errno
++ */
++
++static int
++inode_alloc_hidden(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ struct gfs_inode *dip = sdp->sd_rooti;
++ struct gfs_holder d_gh, i_gh;
++ struct gfs_unlinked *ul;
++ int error;
++
++ error = gfs_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh);
++ if (error)
++ return error;
++
++ error = dinode_alloc(dip, &ul);
++ if (error)
++ goto fail;
++
++ inum->no_addr = ul->ul_inum.no_addr;
++ pick_formal_ino(sdp, inum);
++
++ /* Don't worry about deadlock ordering here. We're the first
++ mounter and still under the mount lock (i.e. there is no
++ contention). */
++
++ error = gfs_glock_nq_num(sdp,
++ inum->no_formal_ino, &gfs_inode_glops,
++ LM_ST_EXCLUSIVE, GL_SKIP, &i_gh);
++ if (error)
++ goto fail_ul;
++
++ gfs_alloc_get(dip);
++
++ error = gfs_quota_hold_m(dip, 0, 0);
++ if (error)
++ goto fail_al;
++
++ /* Trans may require:
++ The new inode, the superblock,
++ and one block for a quota change and
++ one block for an unlinked tag. */
++
++ error = gfs_trans_begin(sdp, 2, 2);
++ if (error)
++ goto fail_unhold;
++
++ error = make_dinode(dip, i_gh.gh_gl, inum, GFS_FILE_REG, 0600, 0, 0);
++ if (error)
++ goto fail_end_trans;
++
++ iah_make_jdata(i_gh.gh_gl, inum);
++
++ error = iah_super_update(sdp);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IDA,
++ &(struct gfs_inum){0, inum->no_addr});
++ gfs_trans_add_quota(sdp, +1, 0, 0);
++ gfs_trans_add_gl(dip->i_gl);
++
++ gfs_trans_end(sdp);
++ gfs_quota_unhold_m(dip);
++ gfs_alloc_put(dip);
++
++ gfs_glock_dq_uninit(&i_gh);
++ gfs_glock_dq_uninit(&d_gh);
++
++ gfs_unlinked_unlock(sdp, ul);
++
++ gfs_log_flush(sdp);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_unhold:
++ gfs_quota_unhold_m(dip);
++
++ fail_al:
++ gfs_alloc_put(dip);
++ gfs_glock_dq_uninit(&i_gh);
++
++ fail_ul:
++ gfs_unlinked_unlock(sdp, ul);
++
++ fail:
++ gfs_glock_dq_uninit(&d_gh);
++
++ return error;
++}
++
++/**
++ * gfs_alloc_qinode - allocate a quota inode
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_alloc_qinode(struct gfs_sbd *sdp)
++{
++ return inode_alloc_hidden(sdp, &sdp->sd_sb.sb_quota_di);
++}
++
++/**
++ * gfs_alloc_linode - allocate a license inode
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_alloc_linode(struct gfs_sbd *sdp)
++{
++ return inode_alloc_hidden(sdp, &sdp->sd_sb.sb_license_di);
++}
+diff -urN linux-orig/fs/gfs/inode.h linux-patched/fs/gfs/inode.h
+--- linux-orig/fs/gfs/inode.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/inode.h 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,68 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __INODE_DOT_H__
++#define __INODE_DOT_H__
++
++void gfs_inode_attr_in(struct gfs_inode *ip);
++void gfs_inode_attr_out(struct gfs_inode *ip);
++struct inode *gfs_iget(struct gfs_inode *ip, int create);
++
++int gfs_copyin_dinode(struct gfs_inode *ip);
++
++int gfs_inode_get(struct gfs_glock *i_gl, struct gfs_inum *inum, int create,
++ struct gfs_inode **ipp);
++void gfs_inode_hold(struct gfs_inode *ip);
++void gfs_inode_put(struct gfs_inode *ip);
++void gfs_inode_destroy(struct gfs_inode *ip);
++
++int gfs_inode_dealloc(struct gfs_sbd *sdp, struct gfs_inum *inum);
++
++int gfs_change_nlink(struct gfs_inode *ip, int diff);
++int gfs_lookupi(struct gfs_holder *d_gh, struct qstr *name,
++ int is_root, struct gfs_holder *i_gh);
++int gfs_createi(struct gfs_holder *d_gh, struct qstr *name,
++ unsigned int type, unsigned int mode,
++ struct gfs_holder *i_gh);
++int gfs_unlinki(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip);
++int gfs_rmdiri(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip);
++int gfs_revalidate(struct gfs_inode *dip, struct qstr *name,
++ struct gfs_inode *ip);
++int gfs_ok_to_move(struct gfs_inode *this, struct gfs_inode *to);
++int gfs_readlinki(struct gfs_inode *ip, char **buf, unsigned int *len);
++
++int gfs_glock_nq_atime(struct gfs_holder *gh);
++int gfs_glock_nq_m_atime(unsigned int num_gh, struct gfs_holder *ghs);
++
++void gfs_try_toss_vnode(struct gfs_inode *ip);
++
++/* Backwards compatibility functions */
++
++int gfs_alloc_qinode(struct gfs_sbd *sdp);
++int gfs_alloc_linode(struct gfs_sbd *sdp);
++
++/* Inlines */
++
++static __inline__ int
++gfs_is_stuffed(struct gfs_inode *ip)
++{
++ return !ip->i_di.di_height;
++}
++
++static __inline__ int
++gfs_is_jdata(struct gfs_inode *ip)
++{
++ return ip->i_di.di_flags & GFS_DIF_JDATA;
++}
++
++#endif /* __INODE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ioctl.c linux-patched/fs/gfs/ioctl.c
+--- linux-orig/fs/gfs/ioctl.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ioctl.c 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,983 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <asm/uaccess.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "dio.h"
++#include "dir.h"
++#include "eattr.h"
++#include "file.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "ioctl.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "super.h"
++#include "trans.h"
++
++/**
++ * gfs_add_bh_to_ub - copy a buffer up to user space
++ * @ub: the structure representing where to copy
++ * @bh: the buffer
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_add_bh_to_ub(struct gfs_user_buffer *ub, struct buffer_head *bh)
++{
++ uint64_t blkno = bh->b_blocknr;
++
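++ /* Each buffer is exported as its 64-bit block number followed by
++ the raw block contents; fail rather than truncate if the user
++ buffer can't hold both. */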
++ if (ub->ub_count + sizeof(uint64_t) + bh->b_size > ub->ub_size)
++ return -ENOMEM;
++
++ if (copy_to_user(ub->ub_data + ub->ub_count,
++ &blkno,
++ sizeof(uint64_t)))
++ return -EFAULT;
++ ub->ub_count += sizeof(uint64_t);
++
++ if (copy_to_user(ub->ub_data + ub->ub_count,
++ bh->b_data,
++ bh->b_size))
++ return -EFAULT;
++ ub->ub_count += bh->b_size;
++
++ return 0;
++}
++
++/**
++ * get_meta - Read out all the metadata for a file
++ * @ip: the file
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_meta(struct gfs_inode *ip, void *arg)
++{
++ struct gfs_holder i_gh;
++ struct gfs_user_buffer ub;
++ int error;
++
++ if (copy_from_user(&ub, arg, sizeof(struct gfs_user_buffer)))
++ return -EFAULT;
++ ub.ub_count = 0;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_get_file_meta(ip, &ub);
++ if (error)
++ goto out;
++
++ if (ip->i_di.di_type == GFS_FILE_DIR &&
++ (ip->i_di.di_flags & GFS_DIF_EXHASH)) {
++ error = gfs_get_dir_meta(ip, &ub);
++ if (error)
++ goto out;
++ }
++
++ if (ip->i_di.di_eattr) {
++ error = gfs_get_eattr_meta(ip, &ub);
++ if (error)
++ goto out;
++ }
++
++ if (copy_to_user(arg, &ub, sizeof(struct gfs_user_buffer)))
++ error = -EFAULT;
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * file_stat - return the struct gfs_dinode of a file to user space
++ * @ip: the inode
++ * @arg: where to copy to
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++file_stat(struct gfs_inode *ip, void *arg)
++{
++ struct gfs_holder i_gh;
++ struct gfs_dinode di;
++ int error;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (error)
++ return error;
++
++ memcpy(&di, &ip->i_di, sizeof(struct gfs_dinode));
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ if (copy_to_user(arg, &di, sizeof(struct gfs_dinode)))
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * do_get_super - Dump the superblock into a buffer
++ * @sdp: the filesystem
++ * @arg: the user buffer to fill with the superblock
++ *
++ * Returns: 0 or error code
++ */
++
++static int
++do_get_super(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_sb *sb;
++ struct gfs_holder sb_gh;
++ struct buffer_head *bh;
++ int error;
++
++ sb = gmalloc(sizeof(struct gfs_sb));
++
++ error = gfs_glock_nq_num(sdp,
++ GFS_SB_LOCK, &gfs_meta_glops,
++ LM_ST_SHARED, 0, &sb_gh);
++ if (error)
++ goto out;
++
++ error = gfs_dread(sdp, GFS_SB_ADDR >> sdp->sd_fsb2bb_shift, sb_gh.gh_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (error) {
++ gfs_glock_dq_uninit(&sb_gh);
++ goto out;
++ }
++
++ gfs_sb_in(sb, bh->b_data);
++ brelse(bh);
++
++ gfs_glock_dq_uninit(&sb_gh);
++
++ if (copy_to_user(arg, sb, sizeof(struct gfs_sb)))
++ error = -EFAULT;
++
++ out:
++ kfree(sb);
++
++ return error;
++}
++
++/**
++ * jt2ip - convert the file type in a jio struct to the right hidden ip
++ * @sdp: the filesystem
++ * @jt: the gfs_jio structure
++ *
++ * Returns: The inode structure for the correct hidden file
++ */
++
++static struct gfs_inode *
++jt2ip(struct gfs_sbd *sdp, struct gfs_jio *jt)
++{
++ struct gfs_inode *ip = NULL;
++
++ switch (jt->jio_file) {
++ case GFS_HIDDEN_JINDEX:
++ ip = sdp->sd_jiinode;
++ break;
++
++ case GFS_HIDDEN_RINDEX:
++ ip = sdp->sd_riinode;
++ break;
++
++ case GFS_HIDDEN_QUOTA:
++ ip = sdp->sd_qinode;
++ break;
++
++ case GFS_HIDDEN_LICENSE:
++ ip = sdp->sd_linode;
++ break;
++ }
++
++ return ip;
++}
++
++/**
++ * jread_ioctl - Read from a journaled data file via ioctl
++ * @sdp: the filesystem
++ * @arg: The argument from ioctl
++ *
++ * Returns: 0 on success (byte count returned in @arg), -EXXX on failure
++ */
++
++static int
++jread_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_jio jt;
++ struct gfs_inode *ip;
++ struct gfs_holder i_gh;
++ int error;
++
++ if (copy_from_user(&jt, arg, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ ip = jt2ip(sdp, &jt);
++ if (!ip)
++ return -EINVAL;
++
++ GFS_ASSERT_INODE(gfs_is_jdata(ip), ip,);
++
++ if (!access_ok(VERIFY_WRITE, jt.jio_data, jt.jio_size))
++ return -EFAULT;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_readi(ip, jt.jio_data, jt.jio_offset, jt.jio_size,
++ gfs_copy2user);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ if (error < 0)
++ return error;
++ jt.jio_count = error;
++
++ if (copy_to_user(arg, &jt, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * jwrite_ioctl - Write to a journaled file via ioctl
++ * @sdp: the filesystem
++ * @arg: The argument from ioctl
++ *
++ * Returns: 0 on success (byte count returned in @arg), -EXXX on failure
++ */
++
++static int
++jwrite_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_jio jt;
++ struct gfs_inode *ip;
++ struct gfs_alloc *al = NULL;
++ struct gfs_holder i_gh;
++ unsigned int data_blocks, ind_blocks;
++ int alloc_required;
++ int error;
++
++ if (copy_from_user(&jt, arg, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ ip = jt2ip(sdp, &jt);
++ if (!ip)
++ return -EINVAL;
++
++ GFS_ASSERT_INODE(gfs_is_jdata(ip), ip,);
++
++ if (!access_ok(VERIFY_READ, jt.jio_data, jt.jio_size))
++ return -EFAULT;
++
++ gfs_write_calc_reserv(ip, jt.jio_size, &data_blocks, &ind_blocks);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE,
++ LM_FLAG_PRIORITY | GL_SYNC, &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_write_alloc_required(ip, jt.jio_offset, jt.jio_size,
++ &alloc_required);
++ if (error)
++ goto out;
++
++ if (alloc_required) {
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_hold_m(ip, NO_QUOTA_CHANGE,
++ NO_QUOTA_CHANGE);
++ if (error)
++ goto out_alloc;
++
++ al->al_requested_meta = ind_blocks + data_blocks;
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto out_qs;
++
++ /* Trans may require:
++ All blocks for a RG bitmap, all the "data" blocks, whatever
++ indirect blocks we need, a modified dinode, and a quota change */
++
++ error = gfs_trans_begin(sdp,
++ 1 + al->al_rgd->rd_ri.ri_length +
++ ind_blocks + data_blocks, 1);
++ if (error)
++ goto out_relse;
++ } else {
++ /* Trans may require:
++ All the "data" blocks and a modified dinode. */
++
++ error = gfs_trans_begin(sdp, 1 + data_blocks, 0);
++ if (error)
++ goto out_relse;
++ }
++
++ error = gfs_writei(ip, jt.jio_data, jt.jio_offset, jt.jio_size,
++ gfs_copy_from_user);
++ if (error >= 0) {
++ jt.jio_count = error;
++ error = 0;
++ }
++
++ gfs_trans_end(sdp);
++
++ out_relse:
++ if (alloc_required) {
++ GFS_ASSERT_INODE(error || al->al_alloced_meta, ip,);
++ gfs_inplace_release(ip);
++ }
++
++ out_qs:
++ if (alloc_required)
++ gfs_quota_unhold_m(ip);
++
++ out_alloc:
++ if (alloc_required)
++ gfs_alloc_put(ip);
++
++ out:
++ ip->i_gl->gl_vn++;
++ gfs_glock_dq_uninit(&i_gh);
++
++ if (!error && copy_to_user(arg, &jt, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ return error;
++}
++
++/**
++ * jstat_ioctl - Stat a journaled file via ioctl
++ * @sdp: the filesystem
++ * @arg: The argument from ioctl
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++jstat_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_jio jt;
++ struct gfs_inode *ip;
++ struct gfs_holder i_gh;
++ int error;
++
++ if (copy_from_user(&jt, arg, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ ip = jt2ip(sdp, &jt);
++ if (!ip)
++ return -EINVAL;
++
++ if (jt.jio_size < sizeof(struct gfs_dinode))
++ return -EINVAL;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (error)
++ return error;
++
++ error = copy_to_user(jt.jio_data, &ip->i_di, sizeof(struct gfs_dinode));
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ if (error)
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * jtrunc_ioctl - Truncate a journaled file via ioctl
++ * @sdp: the filesystem
++ * @arg: The argument from ioctl
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++jtrunc_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_jio jt;
++ struct gfs_inode *ip;
++ struct gfs_holder i_gh;
++ int error;
++
++ if (copy_from_user(&jt, arg, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ ip = jt2ip(sdp, &jt);
++ if (!ip)
++ return -EINVAL;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SYNC, &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_truncatei(ip, jt.jio_offset, NULL);
++
++ ip->i_gl->gl_vn++;
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * lock_dump - copy out info about the GFS lock space
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_user_buffer in user space
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++lock_dump(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_user_buffer ub;
++ int error;
++
++ if (copy_from_user(&ub, arg, sizeof(struct gfs_user_buffer)))
++ return -EFAULT;
++ ub.ub_count = 0;
++
++ error = gfs_dump_lockstate(sdp, &ub);
++ if (error)
++ return error;
++
++ if (copy_to_user(arg, &ub, sizeof(struct gfs_user_buffer)))
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * stat_gfs_ioctl - Do a GFS specific statfs
++ * @sdp: the filesystem
++ * @arg: the struct gfs_usage structure
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++stat_gfs_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_usage *u;
++ int error;
++
++ u = gmalloc(sizeof(struct gfs_usage));
++
++ error = gfs_stat_gfs(sdp, u, TRUE);
++ if (!error && copy_to_user(arg, u, sizeof(struct gfs_usage)))
++ error = -EFAULT;
++
++ kfree(u);
++
++ return error;
++}
++
++/**
++ * reclaim_ioctl - ioctl called to perform metadata reclamation
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_reclaim_stats in user space
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++reclaim_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_reclaim_stats stats;
++ int error;
++
++ memset(&stats, 0, sizeof(struct gfs_reclaim_stats));
++
++ error = gfs_reclaim_metadata(sdp, &stats);
++ if (error)
++ return error;
++
++ if (copy_to_user(arg, &stats, sizeof(struct gfs_reclaim_stats)))
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * get_tune - pass the current tuneable parameters up to user space
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_tune in user space
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_tune(struct gfs_sbd *sdp, void *arg)
++{
++ if (copy_to_user(arg, &sdp->sd_tune, sizeof(struct gfs_tune)))
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * set_tune - replace the current tuneable parameters with a set from user space
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_tune in user space
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++set_tune(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_tune *gt;
++ int error = 0;
++
++ gt = gmalloc(sizeof(struct gfs_tune));
++
++ if (copy_from_user(gt, arg, sizeof(struct gfs_tune)))
++ error = -EFAULT;
++ else {
++ if (gt->gt_tune_version != GFS_TUNE_VERSION) {
++ printk("GFS: fsid=%s: invalid version of tuneable parameters\n",
++ sdp->sd_fsname);
++ error = -EINVAL;
++ } else
++ memcpy(&sdp->sd_tune, gt, sizeof(struct gfs_tune));
++ }
++
++ kfree(gt);
++
++ return error;
++}
++
++/**
++ * gfs_set_flag - set/clear a flag on an inode
++ * @ip: the inode
++ * @cmd: GFS_SET_FLAG or GFS_CLEAR_FLAG
++ * @arg: the flag to change (in user space)
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_set_flag(struct gfs_inode *ip, unsigned int cmd, void *arg)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_holder i_gh;
++ struct buffer_head *dibh;
++ uint32_t flag;
++ int error;
++
++ if (copy_from_user(&flag, arg, sizeof(uint32_t)))
++ return -EFAULT;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (error)
++ return error;
++
++ error = -EACCES;
++ if (ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
++ goto out;
++
++ error = -EINVAL;
++
++ switch (flag) {
++ case GFS_DIF_EXHASH:
++ case GFS_DIF_UNUSED:
++ case GFS_DIF_EA_INDIRECT:
++ goto out;
++
++ case GFS_DIF_JDATA:
++ if (ip->i_di.di_type != GFS_FILE_REG || ip->i_di.di_size)
++ goto out;
++ break;
++
++ case GFS_DIF_DIRECTIO:
++ if (ip->i_di.di_type != GFS_FILE_REG)
++ goto out;
++ break;
++
++ case GFS_DIF_IMMUTABLE:
++ case GFS_DIF_APPENDONLY:
++ case GFS_DIF_NOATIME:
++ case GFS_DIF_SYNC:
++ /* FixMe!!! */
++ error = -ENOSYS;
++ goto out;
++
++ case GFS_DIF_INHERIT_DIRECTIO:
++ case GFS_DIF_INHERIT_JDATA:
++ if (ip->i_di.di_type != GFS_FILE_DIR)
++ goto out;
++ break;
++
++ default:
++ goto out;
++ }
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error)
++ goto out;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto out_trans_end;
++
++ if (cmd == GFS_SET_FLAG)
++ ip->i_di.di_flags |= flag;
++ else
++ ip->i_di.di_flags &= ~flag;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ out_trans_end:
++ gfs_trans_end(sdp);
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * handle_roll - Read an atomic_t as an unsigned int
++ * @a: a counter
++ *
++ * if @a is negative, reset it to zero
++ *
++ * Returns: the value of the counter
++ */
++
++static unsigned int
++handle_roll(atomic_t *a)
++{
++ int x = atomic_read(a);
++ if (x < 0) {
++ atomic_set(a, 0);
++ return 0;
++ }
++ return (unsigned int)x;
++}
++
++/**
++ * fill_counters - Write a FS' counters into a buffer
++ * @sdp: the filesystem
++ * @buf: the buffer
++ * @size: the size of the buffer
++ * @count: where we are in the buffer
++ *
++ * Returns: errno
++ */
++
++static int
++fill_counters(struct gfs_sbd *sdp,
++ char *buf, unsigned int size, unsigned int *count)
++{
++ int error = 0;
++
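++ /* gfs_sprintf() is presumably a macro that appends formatted text
++ to @buf at *count, bounded by @size, and jumps to the out: label
++ with an error once the buffer fills. */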
++ gfs_sprintf("sd_glock_count:locks::%d\n",
++ atomic_read(&sdp->sd_glock_count));
++ gfs_sprintf("sd_glock_held_count:locks held::%d\n",
++ atomic_read(&sdp->sd_glock_held_count));
++ gfs_sprintf("sd_inode_count:incore inodes::%d\n",
++ atomic_read(&sdp->sd_inode_count));
++ gfs_sprintf("sd_bufdata_count:metadata buffers::%d\n",
++ atomic_read(&sdp->sd_bufdata_count));
++ gfs_sprintf("sd_unlinked_ic_count:unlinked inodes::%d\n",
++ atomic_read(&sdp->sd_unlinked_ic_count));
++ gfs_sprintf("sd_quota_count:quota IDs::%d\n",
++ atomic_read(&sdp->sd_quota_count));
++ gfs_sprintf("sd_log_buffers:incore log buffers::%u\n",
++ sdp->sd_log_buffers);
++ gfs_sprintf("sd_log_seg_free:log segments free::%u\n",
++ sdp->sd_log_seg_free);
++ gfs_sprintf("ji_nsegment:log segments total::%u\n",
++ sdp->sd_jdesc.ji_nsegment);
++ gfs_sprintf("sd_mhc_count:meta header cache entries::%d\n",
++ atomic_read(&sdp->sd_mhc_count));
++ gfs_sprintf("sd_depend_count:glock dependencies::%d\n",
++ atomic_read(&sdp->sd_depend_count));
++ gfs_sprintf("sd_reclaim_count:glocks on reclaim list::%d\n",
++ atomic_read(&sdp->sd_reclaim_count));
++ gfs_sprintf("sd_log_wrap:log wraps::%"PRIu64"\n",
++ sdp->sd_log_wrap);
++ gfs_sprintf("sd_fh2dentry_misses:fh2dentry misses:diff:%u\n",
++ handle_roll(&sdp->sd_fh2dentry_misses));
++ gfs_sprintf("sd_reclaimed:glocks reclaimed:diff:%u\n",
++ handle_roll(&sdp->sd_reclaimed));
++ gfs_sprintf("sd_glock_nq_calls:glock nq calls:diff:%u\n",
++ handle_roll(&sdp->sd_glock_nq_calls));
++ gfs_sprintf("sd_glock_dq_calls:glock dq calls:diff:%u\n",
++ handle_roll(&sdp->sd_glock_dq_calls));
++ gfs_sprintf("sd_glock_prefetch_calls:glock prefetch calls:diff:%u\n",
++ handle_roll(&sdp->sd_glock_prefetch_calls));
++ gfs_sprintf("sd_lm_lock_calls:lm_lock calls:diff:%u\n",
++ handle_roll(&sdp->sd_lm_lock_calls));
++ gfs_sprintf("sd_lm_unlock_calls:lm_unlock calls:diff:%u\n",
++ handle_roll(&sdp->sd_lm_unlock_calls));
++ gfs_sprintf("sd_lm_callbacks:lm callbacks:diff:%u\n",
++ handle_roll(&sdp->sd_lm_callbacks));
++ gfs_sprintf("sd_ops_address:address operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_address));
++ gfs_sprintf("sd_ops_dentry:dentry operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_dentry));
++ gfs_sprintf("sd_ops_export:export operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_export));
++ gfs_sprintf("sd_ops_file:file operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_file));
++ gfs_sprintf("sd_ops_inode:inode operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_inode));
++ gfs_sprintf("sd_ops_super:super operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_super));
++ gfs_sprintf("sd_ops_vm:vm operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_vm));
++
++ out:
++ return error;
++}
++
++/**
++ * get_counters - return usage counters to user space
++ * @sdp: the filesystem
++ * @arg: the counter structure to fill
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_counters(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_user_buffer ub;
++ unsigned int size = sdp->sd_tune.gt_lockdump_size;
++ char *buf;
++ int error;
++
++ if (copy_from_user(&ub, arg, sizeof(struct gfs_user_buffer)))
++ return -EFAULT;
++ ub.ub_count = 0;
++
++ if (size > ub.ub_size)
++ size = ub.ub_size;
++
++ buf = kmalloc(size, GFP_KERNEL);
++ if (!buf)
++ return -ENOMEM;
++
++ error = fill_counters(sdp, buf, size, &ub.ub_count);
++ if (!error) {
++ if (copy_to_user(ub.ub_data, buf, ub.ub_count) ||
++ copy_to_user(arg, &ub, sizeof(struct gfs_user_buffer)))
++ error = -EFAULT;
++ }
++
++ kfree(buf);
++
++ return error;
++}
++
++/**
++ * gfs_ioctli - filesystem independent ioctl function
++ * @ip: the inode the ioctl was on
++ * @cmd: the ioctl number
++ * @arg: the argument (still in user space)
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_ioctli(struct gfs_inode *ip, unsigned int cmd, void *arg)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ int error = 0;
++
++ switch (cmd) {
++ case GFS_GET_META:
++ error = get_meta(ip, arg);
++ break;
++
++ case GFS_FILE_STAT:
++ error = file_stat(ip, arg);
++ break;
++
++ case GFS_SHRINK:
++ if (capable(CAP_SYS_ADMIN))
++ gfs_gl_hash_clear(sdp, FALSE);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_GET_ARGS:
++ if (copy_to_user(arg, &sdp->sd_args,
++ sizeof(struct gfs_args)))
++ error = -EFAULT;
++ break;
++
++ case GFS_GET_LOCKSTRUCT:
++ if (copy_to_user(arg, &sdp->sd_lockstruct,
++ sizeof(struct lm_lockstruct)))
++ error = -EFAULT;
++ break;
++
++ case GFS_GET_SUPER:
++ error = do_get_super(sdp, arg);
++ break;
++
++ case GFS_JREAD:
++ if (capable(CAP_SYS_ADMIN))
++ error = jread_ioctl(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_JWRITE:
++ if (capable(CAP_SYS_ADMIN))
++ error = jwrite_ioctl(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_JSTAT:
++ error = jstat_ioctl(sdp, arg);
++ break;
++
++ case GFS_JTRUNC:
++ if (capable(CAP_SYS_ADMIN))
++ error = jtrunc_ioctl(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_LOCK_DUMP:
++ if (capable(CAP_SYS_ADMIN))
++ error = lock_dump(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_STATGFS:
++ error = stat_gfs_ioctl(sdp, arg);
++ break;
++
++ case GFS_FREEZE:
++ if (capable(CAP_SYS_ADMIN))
++ error = gfs_freeze_fs(sdp);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_UNFREEZE:
++ if (capable(CAP_SYS_ADMIN))
++ gfs_unfreeze_fs(sdp);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_RECLAIM_METADATA:
++ if (capable(CAP_SYS_ADMIN))
++ error = reclaim_ioctl(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_QUOTA_SYNC:
++ if (capable(CAP_SYS_ADMIN))
++ error = gfs_quota_sync(sdp);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_QUOTA_REFRESH:
++ if (capable(CAP_SYS_ADMIN))
++ error = gfs_quota_refresh(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_QUOTA_READ:
++ /* Permissions handled later */
++ error = gfs_quota_read(sdp, arg);
++ break;
++
++ case GFS_GET_TUNE:
++ error = get_tune(sdp, arg);
++ break;
++
++ case GFS_SET_TUNE:
++ if (capable(CAP_SYS_ADMIN))
++ error = set_tune(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_EATTR_GET:
++ /* Permissions handled later */
++ error = gfs_get_eattr_ioctl(sdp, ip, arg);
++ break;
++
++ case GFS_EATTR_SET:
++ /* Permissions handled later */
++ error = gfs_set_eattr_ioctl(sdp, ip, arg);
++ break;
++
++ case GFS_WHERE_ARE_YOU:
++ {
++ unsigned int x = GFS_MAGIC;
++ if (copy_to_user(arg, &x, sizeof(unsigned int)))
++ error = -EFAULT;
++ }
++ break;
++
++ case GFS_SET_FLAG:
++ case GFS_CLEAR_FLAG:
++ /* Permissions handled later */
++ error = gfs_set_flag(ip, cmd, arg);
++ break;
++
++ case GFS_GET_COUNTERS:
++ error = get_counters(sdp, arg);
++ break;
++
++ case GFS_FILE_FLUSH:
++ gfs_glock_force_drop(ip->i_gl);
++ break;
++
++ default:
++ error = -ENOTTY;
++ break;
++ }
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/ioctl.h linux-patched/fs/gfs/ioctl.h
+--- linux-orig/fs/gfs/ioctl.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ioctl.h 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,21 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __IOCTL_DOT_H__
++#define __IOCTL_DOT_H__
++
++int gfs_add_bh_to_ub(struct gfs_user_buffer *ub, struct buffer_head *bh);
++
++int gfs_ioctli(struct gfs_inode *ip, unsigned int cmd, void *arg);
++
++#endif /* __IOCTL_DOT_H__ */
+diff -urN linux-orig/fs/gfs/locking.c linux-patched/fs/gfs/locking.c
+--- linux-orig/fs/gfs/locking.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/locking.c 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,114 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "locking.h"
++#include "super.h"
++
++/**
++ * gfs_mount_lockproto - mount a locking protocol
++ * @sdp: the filesystem
++ * @silent: if TRUE, don't complain if the FS isn't a GFS fs
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_mount_lockproto(struct gfs_sbd *sdp, int silent)
++{
++ struct gfs_sb *sb = NULL;
++ struct buffer_head *bh;
++ char *proto, *table, *p = NULL;
++ int error = 0;
++
++ proto = sdp->sd_args.ar_lockproto;
++ table = sdp->sd_args.ar_locktable;
++
++ /* Try to autodetect */
++
++ if (!proto[0] || !table[0]) {
++ error = gfs_dread(sdp, GFS_SB_ADDR >> sdp->sd_fsb2bb_shift, NULL,
++ DIO_FORCE | DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto out;
++
++ sb = gmalloc(sizeof(struct gfs_sb));
++ gfs_sb_in(sb, bh->b_data);
++ brelse(bh);
++
++ error = gfs_check_sb(sdp, sb, silent);
++ if (error)
++ goto out;
++
++ if (!proto[0])
++ proto = sb->sb_lockproto;
++
++ if (!table[0])
++ table = sb->sb_locktable;
++ }
++
++ error = lm_mount(proto, table, sdp->sd_args.ar_hostdata,
++ gfs_glock_cb, sdp,
++ GFS_MIN_LVB_SIZE, &sdp->sd_lockstruct);
++ if (error) {
++ printk("GFS: can't mount proto = %s, table = %s, hostdata = %s\n",
++ proto, table, sdp->sd_args.ar_hostdata);
++ goto out;
++ }
++
++ GFS_ASSERT_SBD(sdp->sd_lockstruct.ls_lockspace, sdp,);
++ GFS_ASSERT_SBD(sdp->sd_lockstruct.ls_ops, sdp,);
++ GFS_ASSERT_SBD(sdp->sd_lockstruct.ls_lvb_size >= GFS_MIN_LVB_SIZE,
++ sdp,);
++
++ if (!*table) {
++ table = p = gmalloc(sizeof(sdp->sd_vfs->s_id) + 1);
++ strncpy(table, sdp->sd_vfs->s_id, sizeof(sdp->sd_vfs->s_id));
++ table[sizeof(sdp->sd_vfs->s_id)] = 0;
++ }
++
++ snprintf(sdp->sd_fsname, 256, "%s.%u", table,
++ sdp->sd_lockstruct.ls_jid);
++
++ if (p)
++ kfree(p);
++
++ out:
++ if (sb)
++ kfree(sb);
++
++ return error;
++}
++
++/**
++ * gfs_unmount_lockproto - Unmount lock protocol
++ * @sdp: The GFS superblock
++ *
++ */
++
++void
++gfs_unmount_lockproto(struct gfs_sbd *sdp)
++{
++ lm_unmount(&sdp->sd_lockstruct);
++}
+diff -urN linux-orig/fs/gfs/locking.h linux-patched/fs/gfs/locking.h
+--- linux-orig/fs/gfs/locking.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/locking.h 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOCKING_DOT_H__
++#define __LOCKING_DOT_H__
++
++int gfs_mount_lockproto(struct gfs_sbd *sdp, int silent);
++void gfs_unmount_lockproto(struct gfs_sbd *sdp);
++
++#endif /* __LOCKING_DOT_H__ */
+diff -urN linux-orig/fs/gfs/log.c linux-patched/fs/gfs/log.c
+--- linux-orig/fs/gfs/log.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/log.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,1315 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ What rolls down stairs
++ Alone or in pairs
++ Rolls over your neighbor's dog.
++ What's great for a snack
++ And fits on your back
++ It's log, log, log!
++ It's lo-og, lo-og,
++ It's big, it's heavy, it's wood.
++ It's lo-og, lo-og,
++ It's better than bad, it's good.
++ Everyone wants a log,
++ You're gonna love it, log
++ Come on and get your log,
++ Everyone needs a log...
++ LOG... FROM BLAMMO!
++
++ -- The Ren and Stimpy Show
++*/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "log.h"
++#include "lops.h"
++
++/**
++ * gfs_struct2blk - compute the number of log blocks needed for structures
++ * @sdp: the filesystem
++ * @nstruct: the number of structures
++ * @ssize: the size of the structures
++ *
++ * Compute the number of log descriptor blocks needed to hold a certain number
++ * of structures of a certain size.
++ *
++ * Returns: the number of blocks needed
++ */
++
++unsigned int
++gfs_struct2blk(struct gfs_sbd *sdp, unsigned int nstruct, unsigned int ssize)
++{
++ unsigned int blks;
++ unsigned int first, second;
++
++ blks = 1;
++ first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs_log_descriptor)) / ssize;
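++	/* The first block also holds the log descriptor, so it fits
++	   fewer structures than the fully-packed blocks that follow. */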
++
++ if (nstruct > first) {
++ second = sdp->sd_sb.sb_bsize / ssize;
++ blks += DIV_RU(nstruct - first, second);
++ }
++
++ return blks;
++}
++
++/**
++ * gfs_blk2seg - Convert number of blocks into number of segments
++ * @sdp: The GFS superblock
++ * @blocks: The number of blocks
++ *
++ * Returns: The number of journal segments
++ */
++
++unsigned int
++gfs_blk2seg(struct gfs_sbd *sdp, unsigned int blocks)
++{
++ return DIV_RU(blocks, sdp->sd_sb.sb_seg_size - 1);
++}
++
++/**
++ * log_distance - Compute distance between two journal blocks
++ * @sdp: The GFS superblock
++ * @newer: The most recent journal block of the pair
++ * @older: The older journal block of the pair
++ *
++ * Compute the distance (in the journal direction) between two
++ * blocks in the journal
++ *
++ * Returns: the distance in blocks
++ */
++
++static __inline__ unsigned int
++log_distance(struct gfs_sbd *sdp, uint64_t newer, uint64_t older)
++{
++ int64_t dist;
++
++ dist = newer - older;
++ if (dist < 0)
++ dist += sdp->sd_jdesc.ji_nsegment * sdp->sd_sb.sb_seg_size;
++
++ return dist;
++}
++
++/**
++ * log_incr_head - Increment journal head
++ * @sdp: The GFS superblock
++ * @head: the variable holding the head of the journal
++ *
++ * Increment journal head by one.
++ * At the end of the journal, wrap head back to the start.
++ *
++ */
++
++static __inline__ void
++log_incr_head(struct gfs_sbd *sdp, uint64_t * head)
++{
++ struct gfs_jindex *jdesc = &sdp->sd_jdesc;
++
++ if (++*head ==
++ jdesc->ji_addr + jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size)
++ *head = jdesc->ji_addr;
++}
++
++/**
++ * gfs_ail_start - Start I/O on the AIL
++ * @sdp: the filesystem
++ * @flags: DIO_ALL to start I/O on every transaction in the AIL
++ *
++ */
++
++void
++gfs_ail_start(struct gfs_sbd *sdp, int flags)
++{
++ struct list_head *head = &sdp->sd_log_ail;
++ struct list_head *first, *tmp;
++ struct gfs_trans *first_tr, *tr;
++
++ gfs_log_lock(sdp);
++
++ if (list_empty(head)) {
++ gfs_log_unlock(sdp);
++ return;
++ }
++
++ first = head->prev;
++ first_tr = list_entry(first, struct gfs_trans, tr_list);
++ gfs_ail_start_trans(sdp, first_tr);
++
++ if (flags & DIO_ALL)
++ first_tr = NULL;
++
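++	/* Start I/O on the oldest transaction first, then walk toward the
++	   newer ones.  Unless DIO_ALL was requested, stop as soon as the
++	   oldest transaction's I/O has completed. */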
++ for (tmp = first->prev; tmp != head; tmp = tmp->prev) {
++ if (first_tr && gfs_ail_empty_trans(sdp, first_tr))
++ break;
++
++ tr = list_entry(tmp, struct gfs_trans, tr_list);
++ gfs_ail_start_trans(sdp, tr);
++ }
++
++ gfs_log_unlock(sdp);
++}
++
++/**
++ * current_tail - Find block number of current log tail
++ * @sdp: The GFS superblock
++ *
++ * Find the block number of the current tail of the log.
++ * Assumes that the log lock is held.
++ *
++ * Returns: The tail's block number
++ */
++
++static uint64_t
++current_tail(struct gfs_sbd *sdp)
++{
++ struct gfs_trans *tr;
++ uint64_t tail;
++
++ if (list_empty(&sdp->sd_log_ail)) {
++ tail = sdp->sd_log_head;
++
++ if (!gfs_log_is_header(sdp, tail)) {
++ tail--;
++ GFS_ASSERT_SBD(gfs_log_is_header(sdp, tail), sdp,);
++ }
++ } else {
++ tr = list_entry(sdp->sd_log_ail.prev,
++ struct gfs_trans, tr_list);
++ tail = tr->tr_first_head;
++ }
++
++ return tail;
++}
++
++/**
++ * gfs_ail_empty - move the tail of the log forward (if possible)
++ * @sdp: the filesystem
++ *
++ * Returns: TRUE if the AIL is empty
++ */
++
++int
++gfs_ail_empty(struct gfs_sbd *sdp)
++{
++ struct list_head *head, *tmp, *prev;
++ struct gfs_trans *tr;
++ uint64_t oldtail, newtail;
++ unsigned int dist;
++ unsigned int segments;
++ int ret;
++
++ gfs_log_lock(sdp);
++
++ oldtail = current_tail(sdp);
++
++ for (head = &sdp->sd_log_ail, tmp = head->prev, prev = tmp->prev;
++ tmp != head;
++ tmp = prev, prev = tmp->prev) {
++ tr = list_entry(tmp, struct gfs_trans, tr_list);
++
++ if (gfs_ail_empty_trans(sdp, tr)) {
++ list_del(&tr->tr_list);
++ kfree(tr);
++ }
++ }
++
++ newtail = current_tail(sdp);
++
++ if (oldtail != newtail) {
++ dist = log_distance(sdp, newtail, oldtail);
++
++ segments = dist / sdp->sd_sb.sb_seg_size;
++ GFS_ASSERT_SBD(segments * sdp->sd_sb.sb_seg_size == dist, sdp,);
++
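++		/* The tail moved, so return the freed segments to the pool.
++		   The current log head always occupies at least one segment,
++		   so the free count stays strictly below the total. */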
++ spin_lock(&sdp->sd_log_seg_lock);
++ sdp->sd_log_seg_free += segments;
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free < sdp->sd_jdesc.ji_nsegment,
++ sdp,);
++ spin_unlock(&sdp->sd_log_seg_lock);
++ }
++
++ ret = list_empty(head);
++
++ gfs_log_unlock(sdp);
++
++ return ret;
++}
++
++/**
++ * gfs_log_reserve - Make a log reservation
++ * @sdp: The GFS superblock
++ * @segments: The number of segments to reserve
++ * @jump_queue: if TRUE, don't care about fairness ordering
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_log_reserve(struct gfs_sbd *sdp, unsigned int segments, int jump_queue)
++{
++ unsigned long start;
++ struct list_head list;
++ unsigned int try = 0;
++
++ GFS_ASSERT_SBD(segments, sdp,);
++
++ if (segments >= sdp->sd_jdesc.ji_nsegment) {
++ printk("GFS: fsid=%s: error reserving log space (%u, %u)\n",
++ sdp->sd_fsname, segments, sdp->sd_jdesc.ji_nsegment);
++ return -EINVAL;
++ }
++
++ INIT_LIST_HEAD(&list);
++ start = jiffies;
++
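++	/* Reservations are granted in FIFO order: each caller queues a node
++	   on sd_log_seg_list and only the head of the queue may take
++	   segments.  jump_queue callers push in at the front instead. */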
++ for (;;) {
++ spin_lock(&sdp->sd_log_seg_lock);
++
++ if (list_empty(&list)) {
++ if (jump_queue)
++ list_add(&list, &sdp->sd_log_seg_list);
++ else {
++ list_add_tail(&list, &sdp->sd_log_seg_list);
++ while (sdp->sd_log_seg_list.next != &list) {
++ DECLARE_WAITQUEUE(__wait_chan, current);
++ current->state = TASK_UNINTERRUPTIBLE;
++ add_wait_queue(&sdp->sd_log_seg_wait,
++ &__wait_chan);
++ spin_unlock(&sdp->sd_log_seg_lock);
++ schedule();
++ spin_lock(&sdp->sd_log_seg_lock);
++ remove_wait_queue(&sdp->sd_log_seg_wait,
++ &__wait_chan);
++ current->state = TASK_RUNNING;
++ }
++ }
++ }
++
++ if (sdp->sd_log_seg_free >= segments) {
++ sdp->sd_log_seg_free -= segments;
++ list_del(&list);
++ spin_unlock(&sdp->sd_log_seg_lock);
++ wake_up(&sdp->sd_log_seg_wait);
++ break;
++ }
++
++ spin_unlock(&sdp->sd_log_seg_lock);
++
++ if (try) {
++ gfs_log_flush(sdp);
++ gfs_ail_start(sdp, 0);
++ }
++
++ gfs_ail_empty(sdp);
++
++ try++;
++ if (time_after_eq(jiffies, start + 60 * HZ))
++ printk("GFS: fsid=%s: pid %d can't make log reservation (asking for %u segments)\n",
++ sdp->sd_fsname, current->pid, segments);
++ yield();
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_log_release - Release a given number of log segments
++ * @sdp: The GFS superblock
++ * @segments: The number of segments
++ *
++ */
++
++void
++gfs_log_release(struct gfs_sbd *sdp, unsigned int segments)
++{
++ spin_lock(&sdp->sd_log_seg_lock);
++ sdp->sd_log_seg_free += segments;
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free < sdp->sd_jdesc.ji_nsegment, sdp,);
++ spin_unlock(&sdp->sd_log_seg_lock);
++}
++
++/**
++ * log_get_header - Get the journal header buffer
++ * @sdp: The GFS superblock
++ * @tr: The transaction
++ * @next: TRUE if this is not a continuation of an existing transaction
++ *
++ * Returns: the log buffer
++ */
++
++static struct gfs_log_buf *
++log_get_header(struct gfs_sbd *sdp, struct gfs_trans *tr, int next)
++{
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++ struct gfs_log_header header;
++
++ GFS_ASSERT_SBD(gfs_log_is_header(sdp, tr->tr_log_head), sdp,);
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bufs &&
++ !list_empty(&tr->tr_free_bufs), sdp,);
++ lb = list_entry(tr->tr_free_bufs.next, struct gfs_log_buf, lb_list);
++ list_del(&lb->lb_list);
++ tr->tr_num_free_bufs--;
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bmem &&
++ !list_empty(&tr->tr_free_bmem), sdp,);
++ bmem = tr->tr_free_bmem.next;
++ list_del(bmem);
++ tr->tr_num_free_bmem--;
++
++ gfs_logbh_init(sdp, &lb->lb_bh, tr->tr_log_head, (char *)bmem);
++ memset(bmem, 0, sdp->sd_sb.sb_bsize);
++
++ memset(&header, 0, sizeof (header));
++
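++	/* A "next" header starts a new sequence and commits the transaction;
++	   otherwise this is a continuation header inside the transaction
++	   and is queued with the rest of the transaction's buffers. */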
++ if (next) {
++ header.lh_header.mh_magic = GFS_MAGIC;
++ header.lh_header.mh_type = GFS_METATYPE_LH;
++ header.lh_header.mh_format = GFS_FORMAT_LH;
++ header.lh_first = tr->tr_log_head;
++ header.lh_sequence = sdp->sd_sequence + 1;
++ header.lh_tail = current_tail(sdp);
++ header.lh_last_dump = sdp->sd_log_dump_last;
++ } else {
++ header.lh_header.mh_magic = GFS_MAGIC;
++ header.lh_header.mh_type = GFS_METATYPE_LH;
++ header.lh_header.mh_format = GFS_FORMAT_LH;
++ header.lh_first = tr->tr_first_head;
++ header.lh_sequence = sdp->sd_sequence;
++ header.lh_tail = current_tail(sdp);
++ header.lh_last_dump = sdp->sd_log_dump_last;
++
++ list_add(&lb->lb_list, &tr->tr_bufs);
++ }
++
++ gfs_log_header_out(&header, lb->lb_bh.b_data);
++ gfs_log_header_out(&header,
++ lb->lb_bh.b_data + GFS_BASIC_BLOCK -
++ sizeof(struct gfs_log_header));
++
++ log_incr_head(sdp, &tr->tr_log_head);
++
++ return lb;
++}
++
++/**
++ * gfs_log_get_buf - Get a buffer to use for control data
++ * @sdp: The GFS superblock
++ * @tr: The GFS transaction
++ *
++ * Generate a regular buffer for use in the journal as control data.
++ *
++ * Returns: the buffer
++ */
++
++struct gfs_log_buf *
++gfs_log_get_buf(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++
++ if (gfs_log_is_header(sdp, tr->tr_log_head))
++ log_get_header(sdp, tr, FALSE);
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bufs &&
++ !list_empty(&tr->tr_free_bufs), sdp,);
++ lb = list_entry(tr->tr_free_bufs.next, struct gfs_log_buf, lb_list);
++ list_del(&lb->lb_list);
++ tr->tr_num_free_bufs--;
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bmem
++ && !list_empty(&tr->tr_free_bmem), sdp,);
++ bmem = tr->tr_free_bmem.next;
++ list_del(bmem);
++ tr->tr_num_free_bmem--;
++
++ gfs_logbh_init(sdp, &lb->lb_bh, tr->tr_log_head, (char *)bmem);
++ memset(bmem, 0, sdp->sd_sb.sb_bsize);
++
++ list_add(&lb->lb_list, &tr->tr_bufs);
++
++ log_incr_head(sdp, &tr->tr_log_head);
++
++ return lb;
++}
++
++/**
++ * gfs_log_fake_buf - Build a fake buffer head
++ * @sdp: the filesystem
++ * @tr: the transaction this is part of
++ * @data: the data the buffer should point to
++ * @unlock: a buffer that is unlocked as this struct gfs_log_buf is torn down
++ *
++ */
++
++void
++gfs_log_fake_buf(struct gfs_sbd *sdp, struct gfs_trans *tr, char *data,
++ struct buffer_head *unlock)
++{
++ struct gfs_log_buf *lb;
++
++ if (gfs_log_is_header(sdp, tr->tr_log_head))
++ log_get_header(sdp, tr, FALSE);
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bufs &&
++ !list_empty(&tr->tr_free_bufs), sdp,);
++ lb = list_entry(tr->tr_free_bufs.next, struct gfs_log_buf, lb_list);
++ list_del(&lb->lb_list);
++ tr->tr_num_free_bufs--;
++
++ gfs_logbh_init(sdp, &lb->lb_bh, tr->tr_log_head, data);
++ lb->lb_unlock = unlock;
++
++ list_add(&lb->lb_list, &tr->tr_bufs);
++
++ log_incr_head(sdp, &tr->tr_log_head);
++}
++
++/**
++ * check_seg_usage - Check that we didn't use too many segments
++ * @sdp: The GFS superblock
++ * @tr: The transaction
++ *
++ * Also, make sure we don't ever get to a point where there are no
++ * dumps in the log (which would corrupt the log). Panic before we
++ * let that happen.
++ *
++ */
++
++static void
++check_seg_usage(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_jindex *jdesc = &sdp->sd_jdesc;
++ unsigned int dist;
++ unsigned int segments;
++ uint64_t head_off, head_wrap;
++ uint64_t dump_off, dump_wrap;
++
++ dist = log_distance(sdp, tr->tr_log_head, tr->tr_first_head);
++
++ segments = dist / sdp->sd_sb.sb_seg_size;
++ GFS_ASSERT_SBD(segments * sdp->sd_sb.sb_seg_size == dist, sdp,);
++ GFS_ASSERT_SBD(segments == tr->tr_seg_reserved, sdp,);
++
++ if (sdp->sd_log_dump_last) {
++ head_off = tr->tr_first_head +
++ tr->tr_seg_reserved * sdp->sd_sb.sb_seg_size;
++ head_wrap = sdp->sd_log_wrap;
++ if (head_off >= jdesc->ji_addr +
++ jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size) {
++ head_off -= jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size;
++ head_wrap++;
++ }
++
++ dump_off = sdp->sd_log_dump_last;
++ dump_wrap = sdp->sd_log_dump_last_wrap;
++
++ switch (head_wrap - dump_wrap) {
++ case 0:
++ break;
++
++ case 1:
++ if (head_off < dump_off)
++ break;
++ else if (head_off == dump_off &&
++ (tr->tr_flags & TRF_LOG_DUMP))
++ break;
++
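++		/* fall through: the head has caught up with the last dump */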
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,
++ printk("head_off = %"PRIu64", head_wrap = %"PRIu64"\n",
++ head_off, head_wrap);
++ printk("dump_off = %"PRIu64", dump_wrap = %"PRIu64"\n",
++ dump_off, dump_wrap););
++ break;
++ }
++ }
++}
++
++/**
++ * log_free_buf - Free a struct gfs_log_buf (and possibly the data it points to)
++ * @sdp: the filesystem
++ * @lb: the log buffer
++ *
++ */
++
++static void
++log_free_buf(struct gfs_sbd *sdp, struct gfs_log_buf *lb)
++{
++ char *bmem;
++
++ bmem = lb->lb_bh.b_data;
++ gfs_logbh_uninit(sdp, &lb->lb_bh);
++
++ if (lb->lb_unlock)
++ gfs_unlock_buffer(lb->lb_unlock);
++ else
++ kfree(bmem);
++
++ kfree(lb);
++}
++
++/**
++ * sync_trans - Add "last" descriptor to transaction and sync to disk
++ * @sdp: The GFS superblock
++ * @tr: The transaction
++ *
++ * Add the "last" descriptor on to the end of the current transaction
++ * and sync it out to disk. Don't commit it yet, though.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++sync_trans(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head, *prev;
++ struct gfs_log_descriptor desc;
++ struct gfs_log_buf *lb;
++ uint64_t blk;
++ int error = 0, e;
++
++ /* Build LAST descriptor */
++
++ lb = gfs_log_get_buf(sdp, tr);
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = GFS_LOG_DESC_LAST;
++ desc.ld_length = 1;
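++	/* Claim everything from here to the next segment boundary, padding
++	   the transaction out to a whole number of segments. */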
++ for (blk = tr->tr_log_head; !gfs_log_is_header(sdp, blk); blk++)
++ desc.ld_length++;
++ gfs_desc_out(&desc, lb->lb_bh.b_data);
++
++ while (!gfs_log_is_header(sdp, tr->tr_log_head))
++ log_incr_head(sdp, &tr->tr_log_head);
++
++ check_seg_usage(sdp, tr);
++
++ /* Start I/O
++ Go in "prev" direction to start the I/O in order. */
++
++ for (head = &tr->tr_bufs, tmp = head->prev, prev = tmp->prev;
++ tmp != head;
++ tmp = prev, prev = tmp->prev) {
++ lb = list_entry(tmp, struct gfs_log_buf, lb_list);
++
++ if (error) {
++ list_del(&lb->lb_list);
++ log_free_buf(sdp, lb);
++ } else {
++ e = gfs_logbh_start(sdp, &lb->lb_bh);
++ if (e) {
++ list_del(&lb->lb_list);
++ log_free_buf(sdp, lb);
++ error = e;
++ }
++ }
++ }
++
++ /* Wait on I/O
++ Go in "next" direction to minimize sleeps/wakeups. */
++
++ while (!list_empty(&tr->tr_bufs)) {
++ lb = list_entry(tr->tr_bufs.next, struct gfs_log_buf, lb_list);
++
++ e = gfs_logbh_wait(sdp, &lb->lb_bh);
++ if (e)
++ error = e;
++
++ list_del(&lb->lb_list);
++ log_free_buf(sdp, lb);
++ }
++
++ return error;
++}
++
++/**
++ * commit_trans - Commit the current transaction
++ * @sdp: The GFS superblock
++ * @tr: The transaction
++ *
++ * Write out the next journal header to commit the transaction.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++commit_trans(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_log_buf *lb;
++ int error;
++
++ lb = log_get_header(sdp, tr, TRUE);
++
++ error = gfs_logbh_start(sdp, &lb->lb_bh);
++ if (!error)
++ error = gfs_logbh_wait(sdp, &lb->lb_bh);
++
++ log_free_buf(sdp, lb);
++
++ return error;
++}
++
++/**
++ * disk_commit - Write a transaction to the on-disk journal
++ * @sdp: The GFS superblock
++ * @tr: The transaction
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++disk_commit(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ uint64_t last_dump, last_dump_wrap;
++ int error = 0;
++
++ GFS_ASSERT_SBD(!test_bit(SDF_ROFS, &sdp->sd_flags), sdp,);
++ tr->tr_log_head = sdp->sd_log_head;
++ tr->tr_first_head = tr->tr_log_head - 1;
++ GFS_ASSERT_SBD(gfs_log_is_header(sdp, tr->tr_first_head), sdp,);
++
++ LO_BUILD_BHLIST(sdp, tr);
++
++ GFS_ASSERT_SBD(!list_empty(&tr->tr_bufs), sdp,);
++
++ error = sync_trans(sdp, tr);
++ if (error) {
++ /* Eat unusable commit buffer */
++ log_free_buf(sdp, log_get_header(sdp, tr, TRUE));
++ goto out;
++ }
++
++ if (tr->tr_flags & TRF_LOG_DUMP) {
++ /* This commit header should point to the log dump we're
++		   committing as the current one.  But save a copy of the
++		   old one in case we have problems committing the dump. */
++
++ last_dump = sdp->sd_log_dump_last;
++ last_dump_wrap = sdp->sd_log_dump_last_wrap;
++
++ sdp->sd_log_dump_last = tr->tr_first_head;
++ sdp->sd_log_dump_last_wrap = sdp->sd_log_wrap;
++
++ error = commit_trans(sdp, tr);
++ if (error) {
++ sdp->sd_log_dump_last = last_dump;
++ sdp->sd_log_dump_last_wrap = last_dump_wrap;
++ goto out;
++ }
++ } else {
++ error = commit_trans(sdp, tr);
++ if (error)
++ goto out;
++ }
++
++ if (sdp->sd_log_head > tr->tr_log_head)
++ sdp->sd_log_wrap++;
++ sdp->sd_log_head = tr->tr_log_head;
++ sdp->sd_sequence++;
++
++ out:
++ GFS_ASSERT_SBD(!tr->tr_num_free_bufs &&
++ list_empty(&tr->tr_free_bufs), sdp,);
++ GFS_ASSERT_SBD(!tr->tr_num_free_bmem &&
++ list_empty(&tr->tr_free_bmem), sdp,);
++
++ return error;
++}
++
++/**
++ * add_trans_to_ail - Add an on-disk committed transaction to the AIL
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ */
++
++static void
++add_trans_to_ail(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_log_element *le;
++
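++	/* Each LO_ADD_TO_AIL call unlinks the element from tr_elements
++	   (see the per-type add_to_ail handlers), so this loop terminates. */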
++ while (!list_empty(&tr->tr_elements)) {
++ le = list_entry(tr->tr_elements.next,
++ struct gfs_log_element, le_list);
++ LO_ADD_TO_AIL(sdp, le);
++ }
++
++ list_add(&tr->tr_list, &sdp->sd_log_ail);
++}
++
++/**
++ * log_refund - Refund log segments to the free pool
++ * @sdp: The GFS superblock
++ * @tr: The transaction to examine
++ *
++ * Look at the number of segments reserved for this transaction and the
++ * number of segments actually needed for it. If they aren't the
++ * same, refund the difference to the free segment pool.
++ *
++ * Called with the log lock held
++ */
++
++static void
++log_refund(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++ unsigned int num_bufs = 0, num_bmem = 0;
++ unsigned int segments;
++
++ LO_TRANS_SIZE(sdp, tr, NULL, NULL, &num_bufs, &num_bmem);
++
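++	/* Every segment written needs its own header block, plus one more
++	   block for the commit header that follows the transaction. */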
++ segments = gfs_blk2seg(sdp, num_bufs + 1);
++ num_bufs += segments + 1;
++ num_bmem += segments + 1;
++
++ if (tr->tr_seg_reserved > segments) {
++ spin_lock(&sdp->sd_log_seg_lock);
++ sdp->sd_log_seg_free += tr->tr_seg_reserved - segments;
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free < sdp->sd_jdesc.ji_nsegment,
++ sdp,);
++ spin_unlock(&sdp->sd_log_seg_lock);
++
++ tr->tr_seg_reserved = segments;
++ } else
++ GFS_ASSERT_SBD(tr->tr_seg_reserved == segments, sdp,);
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bufs >= num_bufs, sdp,);
++ while (tr->tr_num_free_bufs > num_bufs) {
++ lb = list_entry(tr->tr_free_bufs.next,
++ struct gfs_log_buf, lb_list);
++ list_del(&lb->lb_list);
++ kfree(lb);
++ tr->tr_num_free_bufs--;
++ }
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bmem >= num_bmem, sdp,);
++ while (tr->tr_num_free_bmem > num_bmem) {
++ bmem = tr->tr_free_bmem.next;
++ list_del(bmem);
++ kfree(bmem);
++ tr->tr_num_free_bmem--;
++ }
++}
++
++/**
++ * trans_combine - combine two transactions
++ * @sdp: the filesystem
++ * @tr: the surviving transaction
++ * @new_tr: the transaction that gets freed
++ *
++ * Assumes that the two transactions are independent.
++ */
++
++static void
++trans_combine(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_trans *new_tr)
++{
++ struct gfs_log_element *le;
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++
++ tr->tr_file = __FILE__;
++ tr->tr_line = __LINE__;
++ tr->tr_seg_reserved += new_tr->tr_seg_reserved;
++ tr->tr_flags |= new_tr->tr_flags;
++ tr->tr_num_free_bufs += new_tr->tr_num_free_bufs;
++ tr->tr_num_free_bmem += new_tr->tr_num_free_bmem;
++
++ /* Combine the elements of the two transactions */
++
++ while (!list_empty(&new_tr->tr_elements)) {
++ le = list_entry(new_tr->tr_elements.next,
++ struct gfs_log_element, le_list);
++ GFS_ASSERT_SBD(le->le_trans == new_tr, sdp,);
++ le->le_trans = tr;
++ list_move(&le->le_list, &tr->tr_elements);
++ }
++
++ LO_TRANS_COMBINE(sdp, tr, new_tr);
++
++ while (!list_empty(&new_tr->tr_free_bufs)) {
++ lb = list_entry(new_tr->tr_free_bufs.next,
++ struct gfs_log_buf, lb_list);
++ list_move(&lb->lb_list, &tr->tr_free_bufs);
++ new_tr->tr_num_free_bufs--;
++ }
++ while (!list_empty(&new_tr->tr_free_bmem)) {
++ bmem = new_tr->tr_free_bmem.next;
++ list_move(bmem, &tr->tr_free_bmem);
++ new_tr->tr_num_free_bmem--;
++ }
++
++ GFS_ASSERT_SBD(!new_tr->tr_num_free_bufs, sdp,);
++ GFS_ASSERT_SBD(!new_tr->tr_num_free_bmem, sdp,);
++
++ kfree(new_tr);
++}
++
++/**
++ * log_flush_internal - flush incore transactions
++ * @sdp: the filesystem
++ * @gl: The glock structure to flush. If NULL, flush the whole incore log
++ *
++ */
++
++static void
++log_flush_internal(struct gfs_sbd *sdp, struct gfs_glock *gl)
++{
++ struct gfs_trans *trans = NULL, *tr;
++ int error;
++
++ gfs_log_lock(sdp);
++
++ if (list_empty(&sdp->sd_log_incore))
++ goto out;
++
++ if (gl) {
++ if (!gl->gl_incore_le.le_trans)
++ goto out;
++
++ trans = gl->gl_incore_le.le_trans;
++
++ list_del(&trans->tr_list);
++ } else {
++ while (!list_empty(&sdp->sd_log_incore)) {
++ tr = list_entry(sdp->sd_log_incore.next,
++ struct gfs_trans, tr_list);
++
++ list_del(&tr->tr_list);
++
++ if (trans)
++ trans_combine(sdp, trans, tr);
++ else
++ trans = tr;
++ }
++ }
++
++ GFS_ASSERT_SBD(trans, sdp,);
++
++ log_refund(sdp, trans);
++
++ /* Actually do the stuff to commit the transaction */
++
++ error = disk_commit(sdp, trans);
++ if (error)
++ gfs_io_error(sdp);
++
++ add_trans_to_ail(sdp, trans);
++
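++	/* Request a log dump once the head has moved more than
++	   1/GFS_DUMPS_PER_LOG of the journal past the last dump. */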
++ if (log_distance(sdp, sdp->sd_log_head, sdp->sd_log_dump_last) * GFS_DUMPS_PER_LOG >=
++ sdp->sd_jdesc.ji_nsegment * sdp->sd_sb.sb_seg_size)
++ set_bit(SDF_NEED_LOG_DUMP, &sdp->sd_flags);
++
++ out:
++ if (list_empty(&sdp->sd_log_incore))
++ sdp->sd_vfs->s_dirt = FALSE;
++
++ gfs_log_unlock(sdp);
++
++ /* Dump if we need to. */
++
++ if (test_bit(SDF_NEED_LOG_DUMP, &sdp->sd_flags))
++ gfs_log_dump(sdp, FALSE);
++}
++
++/**
++ * gfs_log_flush - flush the whole incore log
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_log_flush(struct gfs_sbd *sdp)
++{
++ log_flush_internal(sdp, NULL);
++}
++
++/**
++ * gfs_log_flush_glock - flush the incore log for a glock
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_log_flush_glock(struct gfs_glock *gl)
++{
++ log_flush_internal(gl->gl_sbd, gl);
++}
++
++/**
++ * incore_commit - commit a transaction in-core
++ * @sdp: the filesystem
++ * @new_tr: the transaction to commit
++ *
++ * Add the transaction @new_tr to the end of the incore commit list.
++ * Pull up and merge any previously committed transactions that share
++ * locks. Also pull up any rename transactions that need it.
++ */
++
++static void
++incore_commit(struct gfs_sbd *sdp, struct gfs_trans *new_tr)
++{
++ struct gfs_log_element *le;
++ struct gfs_trans *trans = NULL, *exist_tr;
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++ struct list_head *tmp, *head, *next;
++
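++	/* First, find any incore transactions that share log elements with
++	   the new one and combine them into a single transaction. */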
++ for (head = &new_tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++
++ exist_tr = LO_OVERLAP_TRANS(sdp, le);
++ if (!exist_tr)
++ continue;
++
++ if (exist_tr != trans) {
++ list_del(&exist_tr->tr_list);
++ if (trans)
++ trans_combine(sdp, trans, exist_tr);
++ else
++ trans = exist_tr;
++ }
++ }
++
++ if (trans) {
++ trans->tr_file = __FILE__;
++ trans->tr_line = __LINE__;
++ trans->tr_seg_reserved += new_tr->tr_seg_reserved;
++ trans->tr_flags |= new_tr->tr_flags;
++ trans->tr_num_free_bufs += new_tr->tr_num_free_bufs;
++ trans->tr_num_free_bmem += new_tr->tr_num_free_bmem;
++
++ while (!list_empty(&new_tr->tr_free_bufs)) {
++ lb = list_entry(new_tr->tr_free_bufs.next,
++ struct gfs_log_buf, lb_list);
++ list_move(&lb->lb_list, &trans->tr_free_bufs);
++ new_tr->tr_num_free_bufs--;
++ }
++ while (!list_empty(&new_tr->tr_free_bmem)) {
++ bmem = new_tr->tr_free_bmem.next;
++ list_move(bmem, &trans->tr_free_bmem);
++ new_tr->tr_num_free_bmem--;
++ }
++ } else
++ trans = new_tr;
++
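++	/* Now commit each of the new transaction's elements into the
++	   (possibly combined) incore transaction. */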
++ for (head = &new_tr->tr_elements, tmp = head->next, next = tmp->next;
++ tmp != head;
++ tmp = next, next = next->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ LO_INCORE_COMMIT(sdp, trans, le);
++ }
++
++ if (trans != new_tr) {
++ GFS_ASSERT_SBD(!new_tr->tr_num_free_bufs, sdp,);
++ GFS_ASSERT_SBD(!new_tr->tr_num_free_bmem, sdp,);
++ GFS_ASSERT_SBD(list_empty(&new_tr->tr_elements), sdp,);
++ kfree(new_tr);
++ }
++
++ log_refund(sdp, trans);
++
++ list_add(&trans->tr_list, &sdp->sd_log_incore);
++}
++
++/**
++ * gfs_log_commit - Commit a transaction to the log
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ */
++
++void
++gfs_log_commit(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++ unsigned int num_mblks = 0, num_eblks = 0, num_bufs = 0, num_bmem = 0;
++ unsigned int segments;
++
++ LO_TRANS_SIZE(sdp, tr, &num_mblks, &num_eblks, &num_bufs, &num_bmem);
++
++ GFS_ASSERT_SBD(num_mblks <= tr->tr_mblks_asked &&
++ num_eblks <= tr->tr_eblks_asked, sdp,
++ printk("type = (%s, %u)\n",
++ tr->tr_file, tr->tr_line);
++ printk("num_mblks = %u, tr->tr_mblks_asked = %u\n",
++ num_mblks, tr->tr_mblks_asked);
++ printk("num_eblks = %u, tr->tr_eblks_asked = %u\n",
++ num_eblks, tr->tr_eblks_asked););
++
++ segments = gfs_blk2seg(sdp, num_bufs + 1);
++ num_bufs += segments + 1;
++ num_bmem += segments + 1;
++
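++	/* Preallocate the log buffer descriptors and block-sized buffers
++	   before taking the log lock. */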
++ while (num_bufs--) {
++ lb = gmalloc(sizeof(struct gfs_log_buf));
++ memset(lb, 0, sizeof(struct gfs_log_buf));
++ list_add(&lb->lb_list, &tr->tr_free_bufs);
++ tr->tr_num_free_bufs++;
++ }
++ while (num_bmem--) {
++ bmem = gmalloc(sdp->sd_sb.sb_bsize);
++ list_add(bmem, &tr->tr_free_bmem);
++ tr->tr_num_free_bmem++;
++ }
++
++ gfs_log_lock(sdp);
++
++ incore_commit(sdp, tr);
++
++ if (sdp->sd_log_buffers > sdp->sd_tune.gt_incore_log_blocks) {
++ gfs_log_unlock(sdp);
++ gfs_log_flush(sdp);
++ } else {
++ sdp->sd_vfs->s_dirt = TRUE;
++ gfs_log_unlock(sdp);
++ }
++}
++
++/**
++ * gfs_log_dump - make a Log Dump entry in the log
++ * @sdp: the filesystem
++ * @force: if TRUE, always make the dump even if one has been made recently
++ *
++ */
++
++void
++gfs_log_dump(struct gfs_sbd *sdp, int force)
++{
++ struct gfs_log_element *le;
++ struct gfs_trans tr;
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++ unsigned int num_bufs, num_bmem;
++ unsigned int segments;
++ int error;
++
++ if (test_and_set_bit(SDF_IN_LOG_DUMP, &sdp->sd_flags)) {
++ GFS_ASSERT_SBD(!force, sdp,);
++ return;
++ }
++
++ memset(&tr, 0, sizeof(struct gfs_trans));
++ INIT_LIST_HEAD(&tr.tr_elements);
++ INIT_LIST_HEAD(&tr.tr_free_bufs);
++ INIT_LIST_HEAD(&tr.tr_free_bmem);
++ INIT_LIST_HEAD(&tr.tr_bufs);
++ tr.tr_flags = TRF_LOG_DUMP;
++ tr.tr_file = __FILE__;
++ tr.tr_line = __LINE__;
++
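++	/* Size the dump under the log lock, then drop the lock to reserve
++	   segments and allocate buffers.  Loop, because the required size
++	   may have changed while the lock was released. */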
++ for (;;) {
++ gfs_log_lock(sdp);
++
++ if (!force && !test_bit(SDF_NEED_LOG_DUMP, &sdp->sd_flags))
++ goto out;
++
++ num_bufs = num_bmem = 0;
++ LO_DUMP_SIZE(sdp, NULL, &num_bufs, &num_bmem);
++ GFS_ASSERT_SBD(num_bufs, sdp,);
++ segments = gfs_blk2seg(sdp, num_bufs + 1);
++ num_bufs += segments + 1;
++ num_bmem += segments + 1;
++
++ if (tr.tr_seg_reserved >= segments &&
++ tr.tr_num_free_bufs >= num_bufs &&
++ tr.tr_num_free_bmem >= num_bmem)
++ break;
++
++ gfs_log_unlock(sdp);
++
++ if (tr.tr_seg_reserved < segments) {
++ error = gfs_log_reserve(sdp,
++ segments - tr.tr_seg_reserved,
++ TRUE);
++ GFS_ASSERT_SBD(!error, sdp,);
++ tr.tr_seg_reserved = segments;
++ }
++ while (tr.tr_num_free_bufs < num_bufs) {
++ lb = gmalloc(sizeof(struct gfs_log_buf));
++ memset(lb, 0, sizeof(struct gfs_log_buf));
++ list_add(&lb->lb_list, &tr.tr_free_bufs);
++ tr.tr_num_free_bufs++;
++ }
++ while (tr.tr_num_free_bmem < num_bmem) {
++ bmem = gmalloc(sdp->sd_sb.sb_bsize);
++ list_add(bmem, &tr.tr_free_bmem);
++ tr.tr_num_free_bmem++;
++ }
++ }
++
++ if (tr.tr_seg_reserved > segments) {
++ spin_lock(&sdp->sd_log_seg_lock);
++ sdp->sd_log_seg_free += tr.tr_seg_reserved - segments;
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free < sdp->sd_jdesc.ji_nsegment,
++ sdp,);
++ spin_unlock(&sdp->sd_log_seg_lock);
++ tr.tr_seg_reserved = segments;
++ }
++ while (tr.tr_num_free_bufs > num_bufs) {
++ lb = list_entry(tr.tr_free_bufs.next,
++ struct gfs_log_buf, lb_list);
++ list_del(&lb->lb_list);
++ kfree(lb);
++ tr.tr_num_free_bufs--;
++ }
++ while (tr.tr_num_free_bmem > num_bmem) {
++ bmem = tr.tr_free_bmem.next;
++ list_del(bmem);
++ kfree(bmem);
++ tr.tr_num_free_bmem--;
++ }
++
++ LO_BUILD_DUMP(sdp, &tr);
++
++ error = disk_commit(sdp, &tr);
++ if (error)
++ gfs_io_error(sdp);
++
++ while (!list_empty(&tr.tr_elements)) {
++ le = list_entry(tr.tr_elements.next,
++ struct gfs_log_element, le_list);
++ LO_CLEAN_DUMP(sdp, le);
++ }
++
++	/* If there isn't anything in the AIL, we won't get back the log
++ space we reserved unless we do it ourselves. */
++
++ if (list_empty(&sdp->sd_log_ail)) {
++ spin_lock(&sdp->sd_log_seg_lock);
++ sdp->sd_log_seg_free += tr.tr_seg_reserved;
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free < sdp->sd_jdesc.ji_nsegment,
++ sdp,);
++ spin_unlock(&sdp->sd_log_seg_lock);
++ }
++
++ clear_bit(SDF_NEED_LOG_DUMP, &sdp->sd_flags);
++
++ out:
++ gfs_log_unlock(sdp);
++ clear_bit(SDF_IN_LOG_DUMP, &sdp->sd_flags);
++}
++
++/**
++ * gfs_log_shutdown - write a shutdown header into a journal
++ * @sdp: the filesystem
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_log_shutdown(struct gfs_sbd *sdp)
++{
++ struct gfs_log_buf *lb;
++ char *bmem;
++ struct gfs_log_header head;
++ struct gfs_log_descriptor desc;
++ unsigned int elements = 0;
++ int error;
++
++ lb = gmalloc(sizeof(struct gfs_log_buf));
++ memset(lb, 0, sizeof(struct gfs_log_buf));
++ bmem = gmalloc(sdp->sd_sb.sb_bsize);
++
++ gfs_log_lock(sdp);
++
++ GFS_ASSERT_SBD(list_empty(&sdp->sd_log_ail), sdp,);
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free == sdp->sd_jdesc.ji_nsegment - 1,
++ sdp,);
++ GFS_ASSERT_SBD(!sdp->sd_log_buffers, sdp,);
++ GFS_ASSERT_SBD(gfs_log_is_header(sdp, sdp->sd_log_head - 1), sdp,);
++
++ /* Build a "last" log descriptor */
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = GFS_LOG_DESC_LAST;
++ desc.ld_length = sdp->sd_sb.sb_seg_size - 1;
++
++ /* Write the descriptor */
++
++ gfs_logbh_init(sdp, &lb->lb_bh, sdp->sd_log_head, bmem);
++ memset(bmem, 0, sdp->sd_sb.sb_bsize);
++ gfs_desc_out(&desc, lb->lb_bh.b_data);
++ error = gfs_logbh_start(sdp, &lb->lb_bh);
++ if (!error)
++ error = gfs_logbh_wait(sdp, &lb->lb_bh);
++ gfs_logbh_uninit(sdp, &lb->lb_bh);
++
++ if (error)
++ goto out;
++
++ /* Move to the next header */
++
++ while (!gfs_log_is_header(sdp, sdp->sd_log_head))
++ log_incr_head(sdp, &sdp->sd_log_head);
++
++ LO_DUMP_SIZE(sdp, &elements, NULL, NULL);
++
++ /* Build the shutdown header */
++
++ memset(&head, 0, sizeof (struct gfs_log_header));
++ head.lh_header.mh_magic = GFS_MAGIC;
++ head.lh_header.mh_type = GFS_METATYPE_LH;
++ head.lh_header.mh_format = GFS_FORMAT_LH;
++ head.lh_flags = GFS_LOG_HEAD_UNMOUNT;
++ head.lh_first = sdp->sd_log_head;
++ head.lh_sequence = sdp->sd_sequence + 1;
++ /* Don't care about tail */
++ head.lh_last_dump = (elements) ? sdp->sd_log_dump_last : 0;
++
++ /* Write out the shutdown header */
++
++ gfs_logbh_init(sdp, &lb->lb_bh, sdp->sd_log_head, bmem);
++ memset(bmem, 0, sdp->sd_sb.sb_bsize);
++ gfs_log_header_out(&head, lb->lb_bh.b_data);
++ gfs_log_header_out(&head,
++ lb->lb_bh.b_data + GFS_BASIC_BLOCK -
++ sizeof(struct gfs_log_header));
++ error = gfs_logbh_start(sdp, &lb->lb_bh);
++ if (!error)
++ error = gfs_logbh_wait(sdp, &lb->lb_bh);
++ gfs_logbh_uninit(sdp, &lb->lb_bh);
++
++ out:
++ gfs_log_unlock(sdp);
++
++ kfree(lb);
++ kfree(bmem);
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/log.h linux-patched/fs/gfs/log.h
+--- linux-orig/fs/gfs/log.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/log.h 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,79 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOG_DOT_H__
++#define __LOG_DOT_H__
++
++/**
++ * gfs_log_lock - acquire the right to mess with the log manager
++ * @sdp: the filesystem
++ *
++ */
++
++static __inline__ void
++gfs_log_lock(struct gfs_sbd *sdp)
++{
++ down(&sdp->sd_log_lock);
++}
++
++/**
++ * gfs_log_unlock - release the right to mess with the log manager
++ * @sdp: the filesystem
++ *
++ */
++
++static __inline__ void
++gfs_log_unlock(struct gfs_sbd *sdp)
++{
++ up(&sdp->sd_log_lock);
++}
++
++unsigned int gfs_struct2blk(struct gfs_sbd *sdp, unsigned int nstruct,
++ unsigned int ssize);
++unsigned int gfs_blk2seg(struct gfs_sbd *sdp, unsigned int blocks);
++
++int gfs_log_reserve(struct gfs_sbd *sdp, unsigned int segments, int jump_queue);
++void gfs_log_release(struct gfs_sbd *sdp, unsigned int segments);
++
++void gfs_ail_start(struct gfs_sbd *sdp, int flags);
++int gfs_ail_empty(struct gfs_sbd *sdp);
++
++void gfs_log_commit(struct gfs_sbd *sdp, struct gfs_trans *trans);
++void gfs_log_flush(struct gfs_sbd *sdp);
++void gfs_log_flush_glock(struct gfs_glock *gl);
++
++int gfs_log_shutdown(struct gfs_sbd *sdp);
++
++void gfs_log_dump(struct gfs_sbd *sdp, int force);
++
++/* Internal crap used by the log operations */
++
++/**
++ * gfs_log_is_header - Discover if a block is a journal header
++ * @sdp: The GFS superblock
++ * @block: The block number
++ *
++ * Returns: TRUE if the block is on a journal segment boundary, FALSE otherwise
++ */
++
++static __inline__ int
++gfs_log_is_header(struct gfs_sbd *sdp, uint64_t block)
++{
++ return !do_mod(block, sdp->sd_sb.sb_seg_size);
++}
++
++struct gfs_log_buf *gfs_log_get_buf(struct gfs_sbd *sdp, struct gfs_trans *tr);
++void gfs_log_fake_buf(struct gfs_sbd *sdp, struct gfs_trans *tr, char *data,
++ struct buffer_head *unlock);
++
++#endif /* __LOG_DOT_H__ */
+diff -urN linux-orig/fs/gfs/lops.c linux-patched/fs/gfs/lops.c
+--- linux-orig/fs/gfs/lops.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/lops.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,1563 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "log.h"
++#include "lops.h"
++#include "quota.h"
++#include "recovery.h"
++#include "trans.h"
++#include "unlinked.h"
++
++/**
++ * generic_le_add - generic routine to add a log element to a transaction
++ * @sdp: the filesystem
++ * @le: the log entry
++ *
++ */
++
++static void
++generic_le_add(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_trans *tr;
++
++ GFS_ASSERT_SBD(le->le_ops &&
++ !le->le_trans &&
++ list_empty(&le->le_list), sdp,);
++
++ tr = current_transaction;
++ GFS_ASSERT_SBD(tr, sdp,);
++
++ le->le_trans = tr;
++ list_add(&le->le_list, &tr->tr_elements);
++}
++
++/**
++ * glock_trans_end - drop a glock reference
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++glock_trans_end(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_glock *gl = container_of(le, struct gfs_glock, gl_new_le);
++
++ GFS_ASSERT_GLOCK(gfs_glock_is_locked_by_me(gl) &&
++ gfs_glock_is_held_excl(gl), gl,);
++ gfs_glock_put(gl);
++}
++
++/**
++ * glock_print - print debug info about a log element
++ * @sdp: the filesystem
++ * @le: the log element
++ * @where: is this a new transaction or an incore transaction
++ *
++ */
++
++static void
++glock_print(struct gfs_sbd *sdp, struct gfs_log_element *le, unsigned int where)
++{
++ struct gfs_glock *gl;
++
++ switch (where) {
++ case TRANS_IS_NEW:
++ gl = container_of(le, struct gfs_glock, gl_new_le);
++ break;
++ case TRANS_IS_INCORE:
++ gl = container_of(le, struct gfs_glock, gl_incore_le);
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,);
++ }
++
++ printk(" Glock: (%u, %"PRIu64")\n",
++ gl->gl_name.ln_type,
++ gl->gl_name.ln_number);
++}
++
++/**
++ * glock_overlap_trans - Find any incore transactions that might overlap with this LE
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static struct gfs_trans *
++glock_overlap_trans(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_glock *gl = container_of(le, struct gfs_glock, gl_new_le);
++
++ return gl->gl_incore_le.le_trans;
++}
++
++/**
++ * glock_incore_commit - commit this LE to the incore log
++ * @sdp: the filesystem
++ * @tr: the incore transaction this LE is a part of
++ * @le: the log element
++ *
++ */
++
++static void
++glock_incore_commit(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_log_element *le)
++{
++ struct gfs_glock *gl = container_of(le, struct gfs_glock, gl_new_le);
++
++ if (gl->gl_incore_le.le_trans)
++ GFS_ASSERT_GLOCK(gl->gl_incore_le.le_trans == tr, gl,);
++ else {
++ gl->gl_incore_le.le_trans = tr;
++ list_add(&gl->gl_incore_le.le_list, &tr->tr_elements);
++ if (tr != le->le_trans)
++ tr->tr_num_gl++;
++ }
++
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++}
++
++/**
++ * glock_add_to_ail - Add this LE to the AIL
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++glock_add_to_ail(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++}
++
++/**
++ * glock_trans_combine - combine two incore transactions
++ * @sdp: the filesystem
++ * @tr: the surviving transaction
++ * @new_tr: the transaction that's going to disappear
++ *
++ */
++
++static void
++glock_trans_combine(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_trans *new_tr)
++{
++ tr->tr_num_gl += new_tr->tr_num_gl;
++}
++
++/**
++ * buf_print - print debug info about a log element
++ * @sdp: the filesystem
++ * @le: the log element
++ * @where: is this a new transaction or an incore transaction
++ *
++ */
++
++static void
++buf_print(struct gfs_sbd *sdp, struct gfs_log_element *le, unsigned int where)
++{
++ struct gfs_bufdata *bd;
++
++ switch (where) {
++ case TRANS_IS_NEW:
++ bd = container_of(le, struct gfs_bufdata, bd_new_le);
++ break;
++ case TRANS_IS_INCORE:
++ bd = container_of(le, struct gfs_bufdata, bd_incore_le);
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,);
++ }
++
++ printk(" Buffer: %"PRIu64"\n", (uint64_t)bd->bd_bh->b_blocknr);
++}
++
++/**
++ * buf_incore_commit - commit this LE to the incore log
++ * @sdp: the filesystem
++ * @tr: the incore transaction this LE is a part of
++ * @le: the log element
++ *
++ */
++
++static void
++buf_incore_commit(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_log_element *le)
++{
++ struct gfs_bufdata *bd = container_of(le, struct gfs_bufdata, bd_new_le);
++
++ if (bd->bd_frozen) {
++ kfree(bd->bd_frozen);
++ bd->bd_frozen = NULL;
++ }
++
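++	/* If the buffer is already part of this incore transaction, just
++	   drop the extra pin; otherwise attach it to the transaction and
++	   count it against the incore log. */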
++ if (bd->bd_incore_le.le_trans) {
++ GFS_ASSERT_SBD(bd->bd_incore_le.le_trans == tr, sdp,);
++ gfs_dunpin(sdp, bd->bd_bh, NULL);
++ } else {
++ bd->bd_incore_le.le_trans = tr;
++ list_add(&bd->bd_incore_le.le_list, &tr->tr_elements);
++ if (tr != le->le_trans)
++ tr->tr_num_buf++;
++
++ sdp->sd_log_buffers++;
++ }
++
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++}
++
++/**
++ * buf_add_to_ail - Add this LE to the AIL
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++buf_add_to_ail(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_bufdata *bd = container_of(le,
++ struct gfs_bufdata,
++ bd_incore_le);
++
++ gfs_dunpin(sdp, bd->bd_bh, le->le_trans);
++
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++
++ GFS_ASSERT_SBD(sdp->sd_log_buffers, sdp,);
++ sdp->sd_log_buffers--;
++}
++
++/**
++ * buf_trans_size - compute how much space the LE class takes up in a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ * @mblks: the number of regular metadata blocks
++ * @eblks: the number of extra blocks
++ * @blocks: the number of log blocks
++ * @bmem: the number of buffer-sized chunks of memory we need
++ *
++ */
++
++static void
++buf_trans_size(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ unsigned int *mblks, unsigned int *eblks,
++ unsigned int *blocks, unsigned int *bmem)
++{
++ unsigned int cblks;
++
++ if (tr->tr_num_buf) {
++ cblks = gfs_struct2blk(sdp, tr->tr_num_buf,
++ sizeof(struct gfs_block_tag));
++
++ if (mblks)
++ *mblks += tr->tr_num_buf;
++ if (blocks)
++ *blocks += tr->tr_num_buf + cblks;
++ if (bmem)
++ *bmem += cblks;
++ }
++}
++
++/**
++ * buf_trans_combine - combine two incore transactions
++ * @sdp: the filesystem
++ * @tr: the surviving transaction
++ * @new_tr: the transaction that's going to disappear
++ *
++ */
++
++static void
++buf_trans_combine(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_trans *new_tr)
++{
++ tr->tr_num_buf += new_tr->tr_num_buf;
++}
++
++/**
++ * increment_generation - increment the generation number in a metadata buffer
++ * @sdp: the filesystem
++ * @bd: the struct gfs_bufdata structure associated with the buffer
++ *
++ */
++
++static void
++increment_generation(struct gfs_sbd *sdp, struct gfs_bufdata *bd)
++{
++ struct gfs_meta_header *mh, *mh2;
++ uint64_t tmp64;
++
++ mh = (struct gfs_meta_header *)bd->bd_bh->b_data;
++
++ tmp64 = gfs64_to_cpu(mh->mh_generation) + 1;
++ tmp64 = cpu_to_gfs64(tmp64);
++
++ if (bd->bd_frozen) {
++ mh2 = (struct gfs_meta_header *)bd->bd_frozen;
++ GFS_ASSERT_SBD(mh->mh_generation == mh2->mh_generation, sdp,);
++ mh2->mh_generation = tmp64;
++ }
++ mh->mh_generation = tmp64;
++}
++
++/**
++ * buf_build_bhlist - create the buffers that will make up the ondisk part of a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ */
++
++static void
++buf_build_bhlist(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head;
++ struct gfs_log_element *le;
++ struct gfs_bufdata *bd;
++ struct gfs_log_descriptor desc;
++ struct gfs_block_tag tag;
++ struct gfs_log_buf *clb = NULL;
++ unsigned int num_ctl;
++ unsigned int offset = sizeof(struct gfs_log_descriptor);
++ unsigned int x, bufs;
++
++ if (!tr->tr_num_buf)
++ return;
++
++ /* set up control buffers for descriptor and tags */
++
++ num_ctl = gfs_struct2blk(sdp, tr->tr_num_buf,
++ sizeof(struct gfs_block_tag));
++
++ for (x = 0; x < num_ctl; x++) {
++ if (clb)
++ gfs_log_get_buf(sdp, tr);
++ else
++ clb = gfs_log_get_buf(sdp, tr);
++ }
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = GFS_LOG_DESC_METADATA;
++ desc.ld_length = num_ctl + tr->tr_num_buf;
++ desc.ld_data1 = tr->tr_num_buf;
++ gfs_desc_out(&desc, clb->lb_bh.b_data);
++
++ x = 1;
++ bufs = 0;
++
++ for (head = &tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ if (le->le_ops != &gfs_buf_lops)
++ continue;
++ bd = container_of(le, struct gfs_bufdata, bd_incore_le);
++
++ gfs_meta_check(sdp, bd->bd_bh);
++
++ gfs_lock_buffer(bd->bd_bh);
++
++ increment_generation(sdp, bd);
++
++ gfs_log_fake_buf(sdp, tr,
++ (bd->bd_frozen) ? bd->bd_frozen : bd->bd_bh->b_data,
++ bd->bd_bh);
++
++ if (offset + sizeof(struct gfs_block_tag) > sdp->sd_sb.sb_bsize) {
++ clb = list_entry(clb->lb_list.prev,
++ struct gfs_log_buf, lb_list);
++ if (gfs_log_is_header(sdp, clb->lb_bh.b_blocknr))
++ clb = list_entry(clb->lb_list.prev,
++ struct gfs_log_buf, lb_list);
++ x++;
++ offset = 0;
++ }
++
++ memset(&tag, 0, sizeof(struct gfs_block_tag));
++ tag.bt_blkno = bd->bd_bh->b_blocknr;
++
++ gfs_block_tag_out(&tag, clb->lb_bh.b_data + offset);
++
++ offset += sizeof(struct gfs_block_tag);
++ bufs++;
++ }
++
++ GFS_ASSERT_SBD(x == num_ctl, sdp,);
++ GFS_ASSERT_SBD(bufs == tr->tr_num_buf, sdp,);
++}
++
++/**
++ * buf_before_scan - called before journal replay
++ * @sdp: the filesystem
++ * @jid: the journal ID about to be replayed
++ * @head: the current head of the log
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++buf_before_scan(struct gfs_sbd *sdp, unsigned int jid,
++ struct gfs_log_header *head, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_A1)
++ sdp->sd_recovery_replays =
++ sdp->sd_recovery_skips =
++ sdp->sd_recovery_sames = 0;
++}
++
++/**
++ * replay_block - Replay a single metadata block
++ * @sdp: the filesystem
++ * @jdesc: the struct gfs_jindex structure for the journal being replayed
++ * @gl: the journal's glock
++ * @tag: the block tag describing the inplace location of the block
++ * @blkno: the location of the log's copy of the block
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++replay_block(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, struct gfs_block_tag *tag, uint64_t blkno)
++{
++ struct buffer_head *inplace_bh, *log_bh;
++ struct gfs_meta_header inplace_mh, log_mh;
++ int replay_block = TRUE;
++ int error = 0;
++
++ gfs_replay_check(sdp);
++
++ /* Warning: Using a real buffer here instead of a tempbh can be bad
++	   on an OS that won't support multiple simultaneous buffers for the
++ same block on different glocks. */
++
++ error = gfs_dread(sdp, tag->bt_blkno, gl,
++ DIO_START | DIO_WAIT, &inplace_bh);
++ if (error)
++ return error;
++ gfs_meta_check(sdp, inplace_bh);
++ gfs_meta_header_in(&inplace_mh, inplace_bh->b_data);
++
++ error = gfs_dread(sdp, blkno, gl, DIO_START | DIO_WAIT, &log_bh);
++ if (error) {
++ brelse(inplace_bh);
++ return error;
++ }
++ gfs_meta_check(sdp, log_bh);
++ gfs_meta_header_in(&log_mh, log_bh->b_data);
++
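++	/* Replay only if the journal's copy is at least as new as the
++	   in-place block; identical same-generation blocks are skipped. */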
++ if (log_mh.mh_generation < inplace_mh.mh_generation) {
++ replay_block = FALSE;
++ sdp->sd_recovery_skips++;
++ } else if (log_mh.mh_generation == inplace_mh.mh_generation) {
++ if (memcmp(log_bh->b_data,
++ inplace_bh->b_data,
++ sdp->sd_sb.sb_bsize) == 0) {
++ replay_block = FALSE;
++ sdp->sd_recovery_sames++;
++ }
++ }
++
++ if (replay_block) {
++ memcpy(inplace_bh->b_data,
++ log_bh->b_data,
++ sdp->sd_sb.sb_bsize);
++
++ error = gfs_replay_buf(gl, inplace_bh);
++ if (!error)
++ sdp->sd_recovery_replays++;
++ }
++
++ brelse(log_bh);
++ brelse(inplace_bh);
++
++ return error;
++}
++
++/**
++ * buf_scan_elements - Replay a metadata log descriptor
++ * @sdp: the filesystem
++ * @jdesc: the struct gfs_jindex structure for the journal being replayed
++ * @gl: the journal's glock
++ * @start: the starting block of the descriptor
++ * @desc: the descriptor structure
++ * @pass: the pass through the journal
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++buf_scan_elements(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t start,
++ struct gfs_log_descriptor *desc, unsigned int pass)
++{
++ struct gfs_block_tag tag;
++ struct buffer_head *bh;
++ uint64_t cblk = start;
++ unsigned int num_tags = desc->ld_data1;
++ unsigned int offset = sizeof(struct gfs_log_descriptor);
++ unsigned int x;
++ int error;
++
++ if (pass != GFS_RECPASS_A1)
++ return 0;
++ if (desc->ld_type != GFS_LOG_DESC_METADATA)
++ return 0;
++
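++	/* Skip past the blocks holding the tags themselves to find the
++	   first of the metadata blocks they describe. */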
++ x = gfs_struct2blk(sdp, num_tags, sizeof(struct gfs_block_tag));
++ while (x--) {
++ error = gfs_increment_blkno(sdp, jdesc, gl, &start, TRUE);
++ if (error)
++ return error;
++ }
++
++ for (;;) {
++ GFS_ASSERT_SBD(num_tags, sdp,);
++
++ error = gfs_dread(sdp, cblk, gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++
++ /* Do readahead for the inplace blocks in this control block */
++ {
++ unsigned int o2 = offset;
++ unsigned int nt2 = num_tags;
++
++ while (o2 + sizeof(struct gfs_block_tag) <=
++ sdp->sd_sb.sb_bsize) {
++ gfs_block_tag_in(&tag, bh->b_data + o2);
++ gfs_start_ra(gl, tag.bt_blkno, 1);
++ if (!--nt2)
++ break;
++ o2 += sizeof(struct gfs_block_tag);
++ }
++ }
++
++ while (offset + sizeof(struct gfs_block_tag) <=
++ sdp->sd_sb.sb_bsize) {
++ gfs_block_tag_in(&tag, bh->b_data + offset);
++
++ error = replay_block(sdp, jdesc, gl, &tag, start);
++ if (error)
++ goto out_drelse;
++
++ if (!--num_tags)
++ goto out_drelse;
++
++ error = gfs_increment_blkno(sdp, jdesc, gl, &start, TRUE);
++ if (error)
++ goto out_drelse;
++
++ offset += sizeof(struct gfs_block_tag);
++ }
++
++ brelse(bh);
++
++ error = gfs_increment_blkno(sdp, jdesc, gl, &cblk, TRUE);
++ if (error)
++ return error;
++
++ offset = 0;
++ }
++
++ return 0;
++
++ out_drelse:
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * buf_after_scan - called after journal replay
++ * @sdp: the filesystem
++ * @jid: the journal ID that was just replayed
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++buf_after_scan(struct gfs_sbd *sdp, unsigned int jid, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_A1) {
++ printk("GFS: fsid=%s: jid=%u: Replayed %u of %u blocks\n",
++ sdp->sd_fsname, jid,
++ sdp->sd_recovery_replays,
++ sdp->sd_recovery_replays + sdp->sd_recovery_skips +
++ sdp->sd_recovery_sames);
++ printk("GFS: fsid=%s: jid=%u: replays = %u, skips = %u, sames = %u\n",
++ sdp->sd_fsname, jid, sdp->sd_recovery_replays,
++ sdp->sd_recovery_skips, sdp->sd_recovery_sames);
++ }
++}
++
++/**
++ * unlinked_print - print debug info about a log element
++ * @sdp: the filesystem
++ * @le: the log element
++ * @where: is this a new transaction or an incore transaction
++ *
++ */
++
++static void
++unlinked_print(struct gfs_sbd *sdp, struct gfs_log_element *le,
++ unsigned int where)
++{
++ struct gfs_unlinked *ul;
++ char *type;
++
++ switch (where) {
++ case TRANS_IS_NEW:
++ ul = container_of(le, struct gfs_unlinked, ul_new_le);
++ type = (test_bit(ULF_NEW_UL, &ul->ul_flags)) ?
++ "unlink" : "dealloc";
++ break;
++ case TRANS_IS_INCORE:
++ ul = container_of(le, struct gfs_unlinked, ul_incore_le);
++ type = (test_bit(ULF_INCORE_UL, &ul->ul_flags)) ?
++ "unlink" : "dealloc";
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,);
++ }
++
++ printk(" unlinked: %"PRIu64"/%"PRIu64", %s\n",
++ ul->ul_inum.no_formal_ino, ul->ul_inum.no_addr,
++ type);
++}
++
++/**
++ * unlinked_incore_commit - commit this LE to the incore log
++ * @sdp: the filesystem
++ * @tr: the incore transaction this LE is a part of
++ * @le: the log element
++ *
++ */
++
++static void
++unlinked_incore_commit(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_log_element *le)
++{
++ struct gfs_unlinked *ul = container_of(le,
++ struct gfs_unlinked,
++ ul_new_le);
++ int n = !!test_bit(ULF_NEW_UL, &ul->ul_flags);
++ int i = !!test_bit(ULF_INCORE_UL, &ul->ul_flags);
++
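++ /* "n" is the state asserted by the new LE (unlink vs. dealloc);
++ "i" is the state currently in the incore log.  If this inode
++ already has an incore LE in the transaction, the two changes
++ must be opposites and cancel each other out. */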
++ if (ul->ul_incore_le.le_trans) {
++ GFS_ASSERT_SBD(ul->ul_incore_le.le_trans == tr, sdp,);
++ GFS_ASSERT_SBD(n != i, sdp,);
++
++ ul->ul_incore_le.le_trans = NULL;
++ list_del_init(&ul->ul_incore_le.le_list);
++ gfs_unlinked_put(sdp, ul);
++
++ if (i) {
++ GFS_ASSERT_SBD(tr->tr_num_iul, sdp,);
++ tr->tr_num_iul--;
++ } else {
++ GFS_ASSERT_SBD(tr->tr_num_ida, sdp,);
++ tr->tr_num_ida--;
++ }
++ } else {
++ gfs_unlinked_hold(sdp, ul);
++ ul->ul_incore_le.le_trans = tr;
++ list_add(&ul->ul_incore_le.le_list, &tr->tr_elements);
++
++ if (n) {
++ set_bit(ULF_INCORE_UL, &ul->ul_flags);
++ if (tr != le->le_trans)
++ tr->tr_num_iul++;
++ } else {
++ clear_bit(ULF_INCORE_UL, &ul->ul_flags);
++ if (tr != le->le_trans)
++ tr->tr_num_ida++;
++ }
++ }
++
++ if (n) {
++ gfs_unlinked_hold(sdp, ul);
++ GFS_ASSERT_SBD(!test_bit(ULF_IC_LIST, &ul->ul_flags), sdp,);
++ set_bit(ULF_IC_LIST, &ul->ul_flags);
++ atomic_inc(&sdp->sd_unlinked_ic_count);
++ } else {
++ GFS_ASSERT_SBD(test_bit(ULF_IC_LIST, &ul->ul_flags), sdp,);
++ clear_bit(ULF_IC_LIST, &ul->ul_flags);
++ gfs_unlinked_put(sdp, ul);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_ic_count), sdp,);
++ atomic_dec(&sdp->sd_unlinked_ic_count);
++ }
++
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++ gfs_unlinked_put(sdp, ul);
++}
++
++/**
++ * unlinked_add_to_ail - Add this LE to the AIL
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++unlinked_add_to_ail(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_unlinked *ul = container_of(le,
++ struct gfs_unlinked,
++ ul_incore_le);
++ int i = !!test_bit(ULF_INCORE_UL, &ul->ul_flags);
++
++ if (i) {
++ gfs_unlinked_hold(sdp, ul);
++ GFS_ASSERT_SBD(!test_bit(ULF_OD_LIST, &ul->ul_flags), sdp,);
++ set_bit(ULF_OD_LIST, &ul->ul_flags);
++ atomic_inc(&sdp->sd_unlinked_od_count);
++ } else {
++ GFS_ASSERT_SBD(test_bit(ULF_OD_LIST, &ul->ul_flags), sdp,);
++ clear_bit(ULF_OD_LIST, &ul->ul_flags);
++ gfs_unlinked_put(sdp, ul);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++ atomic_dec(&sdp->sd_unlinked_od_count);
++ }
++
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++ gfs_unlinked_put(sdp, ul);
++}
++
++/**
++ * unlinked_clean_dump - clean up a LE after a log dump
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++unlinked_clean_dump(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++}
++
++/**
++ * unlinked_trans_size - compute how much space the LE class takes up in a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ * @mblks: the number of regular metadata blocks
++ * @eblks: the number of extra blocks
++ * @blocks: the number of log blocks
++ * @bmem: the number of buffer-sized chunks of memory we need
++ *
++ */
++
++static void
++unlinked_trans_size(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ unsigned int *mblks, unsigned int *eblks,
++ unsigned int *blocks, unsigned int *bmem)
++{
++ unsigned int ublks = 0;
++
++ if (tr->tr_num_iul)
++ ublks = gfs_struct2blk(sdp, tr->tr_num_iul,
++ sizeof(struct gfs_inum));
++ if (tr->tr_num_ida)
++ ublks += gfs_struct2blk(sdp, tr->tr_num_ida,
++ sizeof(struct gfs_inum));
++
++ if (eblks)
++ *eblks += ublks;
++ if (blocks)
++ *blocks += ublks;
++ if (bmem)
++ *bmem += ublks;
++}
++
++/**
++ * unlinked_trans_combine - combine two incore transactions
++ * @sdp: the filesystem
++ * @tr: the surviving transaction
++ * @new_tr: the transaction that's going to disappear
++ *
++ */
++
++static void
++unlinked_trans_combine(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_trans *new_tr)
++{
++ tr->tr_num_iul += new_tr->tr_num_iul;
++ tr->tr_num_ida += new_tr->tr_num_ida;
++}
++
++/**
++ * unlinked_build_bhlist - create the buffers that will make up the ondisk part of a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ */
++
++static void
++unlinked_build_bhlist(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head;
++ struct gfs_log_element *le;
++ struct gfs_unlinked *ul;
++ struct gfs_log_descriptor desc;
++ struct gfs_log_buf *lb;
++ unsigned int pass = 2;
++ unsigned int type, number;
++ unsigned int offset, entries;
++
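++ /* Two passes: the first emits the unlinked (IUL) entries, the
++ second the deallocated (IDA) entries.  A log dump carries only
++ IUL entries, so the second pass is skipped for dumps. */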
++ while (pass--) {
++ if (tr->tr_flags & TRF_LOG_DUMP) {
++ if (pass) {
++ type = GFS_LOG_DESC_IUL;
++ number = tr->tr_num_iul;
++ } else
++ break;
++ } else {
++ if (pass) {
++ type = GFS_LOG_DESC_IUL;
++ number = tr->tr_num_iul;
++ } else {
++ type = GFS_LOG_DESC_IDA;
++ number = tr->tr_num_ida;
++ }
++
++ if (!number)
++ continue;
++ }
++
++ lb = gfs_log_get_buf(sdp, tr);
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = type;
++ desc.ld_length = gfs_struct2blk(sdp, number, sizeof(struct gfs_inum));
++ desc.ld_data1 = (tr->tr_flags & TRF_LOG_DUMP) ? TRUE : FALSE;
++ gfs_desc_out(&desc, lb->lb_bh.b_data);
++
++ offset = sizeof(struct gfs_log_descriptor);
++ entries = 0;
++
++ for (head = &tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ if (le->le_ops != &gfs_unlinked_lops)
++ continue;
++ if (tr->tr_flags & TRF_LOG_DUMP)
++ ul = container_of(le,
++ struct gfs_unlinked,
++ ul_ondisk_le);
++ else {
++ ul = container_of(le,
++ struct gfs_unlinked,
++ ul_incore_le);
++ if (!!test_bit(ULF_INCORE_UL, &ul->ul_flags) != pass)
++ continue;
++ }
++
++ if (offset + sizeof(struct gfs_inum) > sdp->sd_sb.sb_bsize) {
++ offset = 0;
++ lb = gfs_log_get_buf(sdp, tr);
++ }
++
++ gfs_inum_out(&ul->ul_inum,
++ lb->lb_bh.b_data + offset);
++
++ offset += sizeof(struct gfs_inum);
++ entries++;
++ }
++
++ GFS_ASSERT_SBD(entries == number, sdp,);
++ }
++}
++
++/**
++ * unlinked_dump_size - compute how much space the LE class takes up in a log dump
++ * @sdp: the filesystem
++ * @elements: the number of log elements in the dump
++ * @blocks: the number of blocks in the dump
++ * @bmem: the number of buffer-sized chunks of memory we need
++ *
++ */
++
++static void
++unlinked_dump_size(struct gfs_sbd *sdp, unsigned int *elements,
++ unsigned int *blocks, unsigned int *bmem)
++{
++ unsigned int c = atomic_read(&sdp->sd_unlinked_od_count);
++ unsigned int b = gfs_struct2blk(sdp, c, sizeof(struct gfs_inum));
++
++ if (elements)
++ *elements += c;
++ if (blocks)
++ *blocks += b;
++ if (bmem)
++ *bmem += b;
++}
++
++/**
++ * unlinked_build_dump - create a transaction that represents a log dump for this LE class
++ * @sdp: the filesystem
++ * @tr: the transaction to fill
++ *
++ */
++
++static void
++unlinked_build_dump(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head;
++ struct gfs_unlinked *ul;
++ unsigned int x = 0;
++
++ tr->tr_num_iul = atomic_read(&sdp->sd_unlinked_od_count);
++
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ for (head = &sdp->sd_unlinked_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ ul = list_entry(tmp, struct gfs_unlinked, ul_list);
++ if (!test_bit(ULF_OD_LIST, &ul->ul_flags))
++ continue;
++
++ GFS_ASSERT_SBD(!ul->ul_ondisk_le.le_trans, sdp,);
++ ul->ul_ondisk_le.le_trans = tr;
++ list_add(&ul->ul_ondisk_le.le_list, &tr->tr_elements);
++
++ x++;
++ }
++
++ spin_unlock(&sdp->sd_unlinked_lock);
++
++ GFS_ASSERT_SBD(x == atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++}
++
++/**
++ * unlinked_before_scan - called before a log dump is recovered
++ * @sdp: the filesystem
++ * @jid: the journal ID about to be scanned
++ * @head: the current head of the log
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++unlinked_before_scan(struct gfs_sbd *sdp, unsigned int jid,
++ struct gfs_log_header *head, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_B1)
++ clear_bit(SDF_FOUND_UL_DUMP, &sdp->sd_flags);
++}
++
++/**
++ * unlinked_scan_elements - scan unlinked inodes from the journal
++ * @sdp: the filesystem
++ * @jdesc: the struct gfs_jindex structure for the journal being scanned
++ * @gl: the journal's glock
++ * @start: the starting block of the descriptor
++ * @desc: the descriptor structure
++ * @pass: the pass through the journal
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++unlinked_scan_elements(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t start,
++ struct gfs_log_descriptor *desc, unsigned int pass)
++{
++ struct gfs_inum inum;
++ struct buffer_head *bh;
++ unsigned int offset = sizeof(struct gfs_log_descriptor);
++ unsigned int x;
++ int error;
++
++ if (pass != GFS_RECPASS_B1)
++ return 0;
++
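++ /* ld_data1 marks a descriptor that begins a log dump.  The first
++ IUL descriptor in a journal must be a dump, later ones must not
++ be, and IDA descriptors may only follow the dump. */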
++ switch (desc->ld_type) {
++ case GFS_LOG_DESC_IUL:
++ if (test_bit(SDF_FOUND_UL_DUMP, &sdp->sd_flags))
++ GFS_ASSERT_SBD(!desc->ld_data1, sdp,);
++ else {
++ GFS_ASSERT_SBD(desc->ld_data1, sdp,);
++ set_bit(SDF_FOUND_UL_DUMP, &sdp->sd_flags);
++ }
++ break;
++
++ case GFS_LOG_DESC_IDA:
++ GFS_ASSERT_SBD(test_bit(SDF_FOUND_UL_DUMP, &sdp->sd_flags),
++ sdp,);
++ break;
++
++ default:
++ return 0;
++ }
++
++ for (x = 0; x < desc->ld_length; x++) {
++ error = gfs_dread(sdp, start, gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++
++ for (;
++ offset + sizeof(struct gfs_inum) <= sdp->sd_sb.sb_bsize;
++ offset += sizeof(struct gfs_inum)) {
++ gfs_inum_in(&inum, bh->b_data + offset);
++
++ if (inum.no_addr)
++ gfs_unlinked_merge(sdp, desc->ld_type, &inum);
++ }
++
++ brelse(bh);
++
++ error = gfs_increment_blkno(sdp, jdesc, gl, &start, TRUE);
++ if (error)
++ return error;
++
++ offset = 0;
++ }
++
++ return 0;
++}
++
++/**
++ * unlinked_after_scan - called after a log dump is recovered
++ * @sdp: the filesystem
++ * @jid: the journal ID that was just scanned
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++unlinked_after_scan(struct gfs_sbd *sdp, unsigned int jid, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_B1) {
++ GFS_ASSERT_SBD(test_bit(SDF_FOUND_UL_DUMP, &sdp->sd_flags),
++ sdp,);
++ printk("GFS: fsid=%s: Found %d unlinked inodes\n",
++ sdp->sd_fsname, atomic_read(&sdp->sd_unlinked_ic_count));
++ }
++}
++
++/**
++ * quota_print - print debug info about a log element
++ * @sdp: the filesystem
++ * @le: the log element
++ * @where: is this a new transaction or an incore transaction
++ *
++ */
++
++static void
++quota_print(struct gfs_sbd *sdp, struct gfs_log_element *le, unsigned int where)
++{
++ struct gfs_quota_le *ql;
++
++ ql = container_of(le, struct gfs_quota_le, ql_le);
++ printk(" quota: %s %u: %"PRId64" blocks\n",
++ (test_bit(QDF_USER, &ql->ql_data->qd_flags)) ? "user" : "group",
++ ql->ql_data->qd_id, ql->ql_change);
++}
++
++/**
++ * quota_incore_commit - commit this LE to the incore log
++ * @sdp: the filesystem
++ * @tr: the incore transaction this LE is a part of
++ * @le: the log element
++ *
++ */
++
++static void
++quota_incore_commit(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_log_element *le)
++{
++ struct gfs_quota_le *ql = container_of(le, struct gfs_quota_le, ql_le);
++ struct gfs_quota_data *qd = ql->ql_data;
++
++ GFS_ASSERT_SBD(ql->ql_change, sdp,);
++
++ /* Make this change under the sd_quota_lock, so other processes
++ checking qd_change_ic don't have to acquire the log lock. */
++
++ spin_lock(&sdp->sd_quota_lock);
++ qd->qd_change_new -= ql->ql_change;
++ qd->qd_change_ic += ql->ql_change;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ if (le->le_trans == tr)
++ list_add(&ql->ql_data_list, &qd->qd_le_list);
++ else {
++ struct list_head *tmp, *head;
++ struct gfs_quota_le *tmp_ql;
++ int found = FALSE;
++
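++		/* If the target transaction already has an LE for this
++		   quota data, fold the new change into it and free the
++		   duplicate; a resulting net change of zero drops the
++		   element entirely. */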
++ for (head = &qd->qd_le_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ tmp_ql = list_entry(tmp, struct gfs_quota_le, ql_data_list);
++ if (tmp_ql->ql_le.le_trans != tr)
++ continue;
++
++ tmp_ql->ql_change += ql->ql_change;
++
++ list_del(&le->le_list);
++ gfs_quota_put(sdp, qd);
++ kfree(ql);
++
++ if (!tmp_ql->ql_change) {
++ list_del(&tmp_ql->ql_data_list);
++ list_del(&tmp_ql->ql_le.le_list);
++ gfs_quota_put(sdp, tmp_ql->ql_data);
++ kfree(tmp_ql);
++ tr->tr_num_q--;
++ }
++
++ found = TRUE;
++ break;
++ }
++
++ if (!found) {
++ le->le_trans = tr;
++ list_move(&le->le_list, &tr->tr_elements);
++ tr->tr_num_q++;
++ list_add(&ql->ql_data_list, &qd->qd_le_list);
++ }
++ }
++}
++
++/**
++ * quota_add_to_ail - Add this LE to the AIL
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++quota_add_to_ail(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_quota_le *ql = container_of(le, struct gfs_quota_le, ql_le);
++ struct gfs_quota_data *qd = ql->ql_data;
++
++ qd->qd_change_od += ql->ql_change;
++ if (qd->qd_change_od) {
++ if (!test_bit(QDF_OD_LIST, &qd->qd_flags)) {
++ gfs_quota_hold(sdp, qd);
++ set_bit(QDF_OD_LIST, &qd->qd_flags);
++ atomic_inc(&sdp->sd_quota_od_count);
++ }
++ } else {
++ GFS_ASSERT_SBD(test_bit(QDF_OD_LIST, &qd->qd_flags), sdp,);
++ clear_bit(QDF_OD_LIST, &qd->qd_flags);
++ gfs_quota_put(sdp, qd);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_quota_od_count), sdp,);
++ atomic_dec(&sdp->sd_quota_od_count);
++ }
++
++ list_del(&ql->ql_data_list);
++ list_del(&le->le_list);
++ gfs_quota_put(sdp, qd);
++ kfree(ql);
++}
++
++/**
++ * quota_clean_dump - clean up a LE after a log dump
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++quota_clean_dump(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++}
++
++/**
++ * quota_trans_size - compute how much space the LE class takes up in a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ * @mblks: the number of regular metadata blocks
++ * @eblks: the number of extra blocks
++ * @blocks: the number of log blocks
++ * @bmem: the number of buffer-sized chunks of memory we need
++ *
++ */
++
++static void
++quota_trans_size(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ unsigned int *mblks, unsigned int *eblks,
++ unsigned int *blocks, unsigned int *bmem)
++{
++ unsigned int qblks;
++
++ if (tr->tr_num_q) {
++ qblks = gfs_struct2blk(sdp, tr->tr_num_q,
++ sizeof(struct gfs_quota_tag));
++
++ if (eblks)
++ *eblks += qblks;
++ if (blocks)
++ *blocks += qblks;
++ if (bmem)
++ *bmem += qblks;
++ }
++}
++
++/**
++ * quota_trans_combine - combine two incore transactions
++ * @sdp: the filesystem
++ * @tr: the surviving transaction
++ * @new_tr: the transaction that's going to disappear
++ *
++ */
++
++static void
++quota_trans_combine(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_trans *new_tr)
++{
++ tr->tr_num_q += new_tr->tr_num_q;
++}
++
++/**
++ * quota_build_bhlist - create the buffers that will make up the ondisk part of a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ */
++
++static void
++quota_build_bhlist(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head;
++ struct gfs_log_element *le;
++ struct gfs_quota_le *ql;
++ struct gfs_log_descriptor desc;
++ struct gfs_quota_tag tag;
++ struct gfs_log_buf *lb;
++ unsigned int offset = sizeof(struct gfs_log_descriptor), entries = 0;
++
++ if (!tr->tr_num_q && !(tr->tr_flags & TRF_LOG_DUMP))
++ return;
++
++ lb = gfs_log_get_buf(sdp, tr);
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = GFS_LOG_DESC_Q;
++ desc.ld_length = gfs_struct2blk(sdp, tr->tr_num_q,
++ sizeof(struct gfs_quota_tag));
++ desc.ld_data1 = tr->tr_num_q;
++ desc.ld_data2 = (tr->tr_flags & TRF_LOG_DUMP) ? TRUE : FALSE;
++ gfs_desc_out(&desc, lb->lb_bh.b_data);
++
++ for (head = &tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ if (le->le_ops != &gfs_quota_lops)
++ continue;
++
++ ql = container_of(le, struct gfs_quota_le, ql_le);
++
++ if (offset + sizeof(struct gfs_quota_tag) >
++ sdp->sd_sb.sb_bsize) {
++ offset = 0;
++ lb = gfs_log_get_buf(sdp, tr);
++ }
++
++ memset(&tag, 0, sizeof(struct gfs_quota_tag));
++ tag.qt_change = ql->ql_change;
++ tag.qt_flags = (test_bit(QDF_USER, &ql->ql_data->qd_flags)) ?
++ GFS_QTF_USER : 0;
++ tag.qt_id = ql->ql_data->qd_id;
++
++ gfs_quota_tag_out(&tag, lb->lb_bh.b_data + offset);
++
++ offset += sizeof(struct gfs_quota_tag);
++ entries++;
++ }
++
++ GFS_ASSERT_SBD(entries == tr->tr_num_q, sdp,);
++}
++
++/**
++ * quota_dump_size - compute how much space the LE class takes up in a log dump
++ * @sdp: the filesystem
++ * @elements: the number of log elements in the dump
++ * @blocks: the number of blocks in the dump
++ * @bmem: the number of buffer-sized chunks of memory we need
++ *
++ */
++
++static void
++quota_dump_size(struct gfs_sbd *sdp, unsigned int *elements,
++ unsigned int *blocks, unsigned int *bmem)
++{
++ unsigned int c = atomic_read(&sdp->sd_quota_od_count);
++ unsigned int b = gfs_struct2blk(sdp, c, sizeof(struct gfs_quota_tag));
++
++ if (elements)
++ *elements += c;
++ if (blocks)
++ *blocks += b;
++ if (bmem)
++ *bmem += b;
++}
++
++/**
++ * quota_build_dump - create a transaction that represents a log dump for this LE class
++ * @sdp: the filesystem
++ * @tr: the transaction to fill
++ *
++ */
++
++static void
++quota_build_dump(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head;
++ struct gfs_quota_data *qd;
++ struct gfs_quota_le *ql;
++ unsigned int x = 0;
++
++ tr->tr_num_q = atomic_read(&sdp->sd_quota_od_count);
++
++ spin_lock(&sdp->sd_quota_lock);
++
++ for (head = &sdp->sd_quota_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ qd = list_entry(tmp, struct gfs_quota_data, qd_list);
++ if (!test_bit(QDF_OD_LIST, &qd->qd_flags))
++ continue;
++
++ ql = &qd->qd_ondisk_ql;
++
++ ql->ql_le.le_ops = &gfs_quota_lops;
++ GFS_ASSERT_SBD(!ql->ql_le.le_trans, sdp,);
++ ql->ql_le.le_trans = tr;
++ list_add(&ql->ql_le.le_list, &tr->tr_elements);
++
++ ql->ql_data = qd;
++ ql->ql_change = qd->qd_change_od;
++
++ x++;
++ }
++
++ spin_unlock(&sdp->sd_quota_lock);
++
++ GFS_ASSERT_SBD(x == atomic_read(&sdp->sd_quota_od_count), sdp,);
++}
++
++/**
++ * quota_before_scan - called before a log dump is recovered
++ * @sdp: the filesystem
++ * @jid: the journal ID about to be scanned
++ * @head: the current head of the log
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++quota_before_scan(struct gfs_sbd *sdp, unsigned int jid,
++ struct gfs_log_header *head, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_B1)
++ clear_bit(SDF_FOUND_Q_DUMP, &sdp->sd_flags);
++}
++
++/**
++ * quota_scan_elements - scan quota change tags from the journal
++ * @sdp: the filesystem
++ * @jdesc: the struct gfs_jindex structure for the journal being scanned
++ * @gl: the journal's glock
++ * @start: the starting block of the descriptor
++ * @desc: the descriptor structure
++ * @pass: the pass through the journal
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++quota_scan_elements(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t start,
++ struct gfs_log_descriptor *desc, unsigned int pass)
++{
++ struct gfs_quota_tag tag;
++ struct buffer_head *bh;
++ unsigned int num_tags = desc->ld_data1;
++ unsigned int offset = sizeof(struct gfs_log_descriptor);
++ unsigned int x;
++ int error;
++
++ if (pass != GFS_RECPASS_B1)
++ return 0;
++ if (desc->ld_type != GFS_LOG_DESC_Q)
++ return 0;
++
++ if (test_bit(SDF_FOUND_Q_DUMP, &sdp->sd_flags))
++ GFS_ASSERT_SBD(!desc->ld_data2, sdp,);
++ else {
++ GFS_ASSERT_SBD(desc->ld_data2, sdp,);
++ set_bit(SDF_FOUND_Q_DUMP, &sdp->sd_flags);
++ }
++
++ if (!num_tags)
++ return 0;
++
++ for (x = 0; x < desc->ld_length; x++) {
++ error = gfs_dread(sdp, start, gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++
++ while (offset + sizeof(struct gfs_quota_tag) <=
++ sdp->sd_sb.sb_bsize) {
++ gfs_quota_tag_in(&tag, bh->b_data + offset);
++
++ error = gfs_quota_merge(sdp, &tag);
++ if (error)
++ goto out_drelse;
++
++ if (!--num_tags)
++ goto out_drelse;
++
++ offset += sizeof(struct gfs_quota_tag);
++ }
++
++ brelse(bh);
++
++ error = gfs_increment_blkno(sdp, jdesc, gl, &start, TRUE);
++ if (error)
++ return error;
++
++ offset = 0;
++ }
++
++ return 0;
++
++ out_drelse:
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * quota_after_scan - called after a log dump is recovered
++ * @sdp: the filesystem
++ * @jid: the journal ID that was just scanned
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++quota_after_scan(struct gfs_sbd *sdp, unsigned int jid, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_B1) {
++ GFS_ASSERT_SBD(!sdp->sd_sb.sb_quota_di.no_formal_ino ||
++ test_bit(SDF_FOUND_Q_DUMP, &sdp->sd_flags),
++ sdp,);
++ printk("GFS: fsid=%s: Found quota changes for %d IDs\n",
++ sdp->sd_fsname, atomic_read(&sdp->sd_quota_od_count));
++ }
++}
++
++struct gfs_log_operations gfs_glock_lops = {
++ .lo_add = generic_le_add,
++ .lo_trans_end = glock_trans_end,
++ .lo_print = glock_print,
++ .lo_overlap_trans = glock_overlap_trans,
++ .lo_incore_commit = glock_incore_commit,
++ .lo_add_to_ail = glock_add_to_ail,
++ .lo_trans_combine = glock_trans_combine,
++ .lo_name = "glock"
++};
++
++struct gfs_log_operations gfs_buf_lops = {
++ .lo_add = generic_le_add,
++ .lo_print = buf_print,
++ .lo_incore_commit = buf_incore_commit,
++ .lo_add_to_ail = buf_add_to_ail,
++ .lo_trans_size = buf_trans_size,
++ .lo_trans_combine = buf_trans_combine,
++ .lo_build_bhlist = buf_build_bhlist,
++ .lo_before_scan = buf_before_scan,
++ .lo_scan_elements = buf_scan_elements,
++ .lo_after_scan = buf_after_scan,
++ .lo_name = "buf"
++};
++
++struct gfs_log_operations gfs_unlinked_lops = {
++ .lo_add = generic_le_add,
++ .lo_print = unlinked_print,
++ .lo_incore_commit = unlinked_incore_commit,
++ .lo_add_to_ail = unlinked_add_to_ail,
++ .lo_clean_dump = unlinked_clean_dump,
++ .lo_trans_size = unlinked_trans_size,
++ .lo_trans_combine = unlinked_trans_combine,
++ .lo_build_bhlist = unlinked_build_bhlist,
++ .lo_dump_size = unlinked_dump_size,
++ .lo_build_dump = unlinked_build_dump,
++ .lo_before_scan = unlinked_before_scan,
++ .lo_scan_elements = unlinked_scan_elements,
++ .lo_after_scan = unlinked_after_scan,
++ .lo_name = "unlinked"
++};
++
++struct gfs_log_operations gfs_quota_lops = {
++ .lo_add = generic_le_add,
++ .lo_print = quota_print,
++ .lo_incore_commit = quota_incore_commit,
++ .lo_add_to_ail = quota_add_to_ail,
++ .lo_clean_dump = quota_clean_dump,
++ .lo_trans_size = quota_trans_size,
++ .lo_trans_combine = quota_trans_combine,
++ .lo_build_bhlist = quota_build_bhlist,
++ .lo_dump_size = quota_dump_size,
++ .lo_build_dump = quota_build_dump,
++ .lo_before_scan = quota_before_scan,
++ .lo_scan_elements = quota_scan_elements,
++ .lo_after_scan = quota_after_scan,
++ .lo_name = "quota"
++};
++
++struct gfs_log_operations *gfs_log_ops[] = {
++ &gfs_glock_lops,
++ &gfs_buf_lops,
++ &gfs_unlinked_lops,
++ &gfs_quota_lops,
++ NULL
++};
+diff -urN linux-orig/fs/gfs/lops.h linux-patched/fs/gfs/lops.h
+--- linux-orig/fs/gfs/lops.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/lops.h 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,179 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOPS_DOT_H__
++#define __LOPS_DOT_H__
++
++extern struct gfs_log_operations gfs_glock_lops;
++extern struct gfs_log_operations gfs_buf_lops;
++extern struct gfs_log_operations gfs_unlinked_lops;
++extern struct gfs_log_operations gfs_quota_lops;
++
++extern struct gfs_log_operations *gfs_log_ops[];
++
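++/* The LO_* wrappers below come in two flavors: per-element ones that
++   dispatch through a single log element's ops vector, and class-wide
++   ones that walk every LE class registered in gfs_log_ops[]. */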
++#define INIT_LE(le, lops) \
++do \
++{ \
++ (le)->le_ops = (lops); \
++ (le)->le_trans = NULL; \
++ INIT_LIST_HEAD(&(le)->le_list); \
++} \
++while (0)
++
++#define LO_ADD(sdp, le) \
++do \
++{ \
++ if ((le)->le_ops->lo_add) \
++ (le)->le_ops->lo_add((sdp), (le)); \
++} \
++while (0)
++
++#define LO_TRANS_END(sdp, le) \
++do \
++{ \
++ if ((le)->le_ops->lo_trans_end) \
++ (le)->le_ops->lo_trans_end((sdp), (le)); \
++} \
++while (0)
++
++#define LO_PRINT(sdp, le, where) \
++do \
++{ \
++ if ((le)->le_ops->lo_print) \
++ (le)->le_ops->lo_print((sdp), (le), (where)); \
++} \
++while (0)
++
++static __inline__ struct gfs_trans *
++LO_OVERLAP_TRANS(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ if (le->le_ops->lo_overlap_trans)
++ return le->le_ops->lo_overlap_trans(sdp, le);
++ else
++ return NULL;
++}
++
++#define LO_INCORE_COMMIT(sdp, tr, le) \
++do \
++{ \
++ if ((le)->le_ops->lo_incore_commit) \
++ (le)->le_ops->lo_incore_commit((sdp), (tr), (le)); \
++} \
++while (0)
++
++#define LO_ADD_TO_AIL(sdp, le) \
++do \
++{ \
++ if ((le)->le_ops->lo_add_to_ail) \
++ (le)->le_ops->lo_add_to_ail((sdp), (le)); \
++} \
++while (0)
++
++#define LO_CLEAN_DUMP(sdp, le) \
++do \
++{ \
++ if ((le)->le_ops->lo_clean_dump) \
++ (le)->le_ops->lo_clean_dump((sdp), (le)); \
++} \
++while (0)
++
++#define LO_TRANS_SIZE(sdp, tr, mblks, eblks, blocks, bmem) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_trans_size) \
++ gfs_log_ops[__lops_x]->lo_trans_size((sdp), (tr), (mblks), (eblks), (blocks), (bmem)); \
++} \
++while (0)
++
++#define LO_TRANS_COMBINE(sdp, tr, new_tr) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_trans_combine) \
++ gfs_log_ops[__lops_x]->lo_trans_combine((sdp), (tr), (new_tr)); \
++} \
++while (0)
++
++#define LO_BUILD_BHLIST(sdp, tr) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_build_bhlist) \
++ gfs_log_ops[__lops_x]->lo_build_bhlist((sdp), (tr)); \
++} \
++while (0)
++
++#define LO_DUMP_SIZE(sdp, elements, blocks, bmem) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_dump_size) \
++ gfs_log_ops[__lops_x]->lo_dump_size((sdp), (elements), (blocks), (bmem)); \
++} \
++while (0)
++
++#define LO_BUILD_DUMP(sdp, tr) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_build_dump) \
++ gfs_log_ops[__lops_x]->lo_build_dump((sdp), (tr)); \
++} \
++while (0)
++
++#define LO_BEFORE_SCAN(sdp, jid, head, pass) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_before_scan) \
++ gfs_log_ops[__lops_x]->lo_before_scan((sdp), (jid), (head), (pass)); \
++} \
++while (0)
++
++static __inline__ int
++LO_SCAN_ELEMENTS(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t start,
++ struct gfs_log_descriptor *desc, unsigned int pass)
++{
++ int x;
++ int error;
++
++ for (x = 0; gfs_log_ops[x]; x++)
++ if (gfs_log_ops[x]->lo_scan_elements) {
++ error = gfs_log_ops[x]->lo_scan_elements(sdp, jdesc, gl,
++ start, desc, pass);
++ if (error)
++ return error;
++ }
++
++ return 0;
++}
++
++#define LO_AFTER_SCAN(sdp, jid, pass) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_after_scan) \
++ gfs_log_ops[__lops_x]->lo_after_scan((sdp), (jid), (pass)); \
++} \
++while (0)
++
++#endif /* __LOPS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/lvb.c linux-patched/fs/gfs/lvb.c
+--- linux-orig/fs/gfs/lvb.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/lvb.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,148 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++
++#define pv(struct, member, fmt) printk(" "#member" = "fmt"\n", struct->member);
++
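++/* CPIN_* copy a field from the on-wire lvb into the cpu-order
++   structure, byte-swapping as needed; CPOUT_* go the other way. */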
++#define CPIN_08(s1, s2, member, count) {memcpy((s1->member), (s2->member), (count));}
++#define CPOUT_08(s1, s2, member, count) {memcpy((s2->member), (s1->member), (count));}
++#define CPIN_16(s1, s2, member) {(s1->member) = gfs16_to_cpu((s2->member));}
++#define CPOUT_16(s1, s2, member) {(s2->member) = cpu_to_gfs16((s1->member));}
++#define CPIN_32(s1, s2, member) {(s1->member) = gfs32_to_cpu((s2->member));}
++#define CPOUT_32(s1, s2, member) {(s2->member) = cpu_to_gfs32((s1->member));}
++#define CPIN_64(s1, s2, member) {(s1->member) = gfs64_to_cpu((s2->member));}
++#define CPOUT_64(s1, s2, member) {(s2->member) = cpu_to_gfs64((s1->member));}
++
++/**
++ * gfs_rgrp_lvb_in - Read in rgrp data
++ * @rb: the cpu-order structure
++ * @lvb: the lvb
++ *
++ */
++
++void
++gfs_rgrp_lvb_in(struct gfs_rgrp_lvb *rb, char *lvb)
++{
++ struct gfs_rgrp_lvb *str = (struct gfs_rgrp_lvb *)lvb;
++
++ CPIN_32(rb, str, rb_magic);
++ CPIN_32(rb, str, rb_free);
++ CPIN_32(rb, str, rb_useddi);
++ CPIN_32(rb, str, rb_freedi);
++ CPIN_32(rb, str, rb_usedmeta);
++ CPIN_32(rb, str, rb_freemeta);
++}
++
++/**
++ * gfs_rgrp_lvb_out - Write out rgrp data
++ * @rb: the cpu-order structure
++ * @lvb: the lvb
++ *
++ */
++
++void
++gfs_rgrp_lvb_out(struct gfs_rgrp_lvb *rb, char *lvb)
++{
++ struct gfs_rgrp_lvb *str = (struct gfs_rgrp_lvb *)lvb;
++
++ CPOUT_32(rb, str, rb_magic);
++ CPOUT_32(rb, str, rb_free);
++ CPOUT_32(rb, str, rb_useddi);
++ CPOUT_32(rb, str, rb_freedi);
++ CPOUT_32(rb, str, rb_usedmeta);
++ CPOUT_32(rb, str, rb_freemeta);
++}
++
++/**
++ * gfs_rgrp_lvb_print - Print out rgrp data
++ * @rb: the cpu-order structure
++ *
++ */
++
++void
++gfs_rgrp_lvb_print(struct gfs_rgrp_lvb *rb)
++{
++ pv(rb, rb_magic, "%u");
++ pv(rb, rb_free, "%u");
++ pv(rb, rb_useddi, "%u");
++ pv(rb, rb_freedi, "%u");
++ pv(rb, rb_usedmeta, "%u");
++ pv(rb, rb_freemeta, "%u");
++}
++
++/**
++ * gfs_quota_lvb_in - Read in quota data
++ * @qb: the cpu-order structure
++ * @lvb: the lvb
++ *
++ */
++
++void
++gfs_quota_lvb_in(struct gfs_quota_lvb *qb, char *lvb)
++{
++ struct gfs_quota_lvb *str = (struct gfs_quota_lvb *)lvb;
++
++ CPIN_32(qb, str, qb_magic);
++ CPIN_32(qb, str, qb_pad);
++ CPIN_64(qb, str, qb_limit);
++ CPIN_64(qb, str, qb_warn);
++ CPIN_64(qb, str, qb_value);
++}
++
++/**
++ * gfs_quota_lvb_out - Write out quota data
++ * @qb: the cpu-order structure
++ * @lvb: the lvb
++ *
++ */
++
++void
++gfs_quota_lvb_out(struct gfs_quota_lvb *qb, char *lvb)
++{
++ struct gfs_quota_lvb *str = (struct gfs_quota_lvb *)lvb;
++
++ CPOUT_32(qb, str, qb_magic);
++ CPOUT_32(qb, str, qb_pad);
++ CPOUT_64(qb, str, qb_limit);
++ CPOUT_64(qb, str, qb_warn);
++ CPOUT_64(qb, str, qb_value);
++}
++
++/**
++ * gfs_quota_lvb_print - Print out quota data
++ * @qb: the cpu-order structure
++ *
++ */
++
++void
++gfs_quota_lvb_print(struct gfs_quota_lvb *qb)
++{
++ pv(qb, qb_magic, "%u");
++ pv(qb, qb_pad, "%u");
++ pv(qb, qb_limit, "%"PRIu64);
++ pv(qb, qb_warn, "%"PRIu64);
++ pv(qb, qb_value, "%"PRId64);
++}
+diff -urN linux-orig/fs/gfs/lvb.h linux-patched/fs/gfs/lvb.h
+--- linux-orig/fs/gfs/lvb.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/lvb.h 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,48 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LVB_DOT_H__
++#define __LVB_DOT_H__
++
++#define GFS_MIN_LVB_SIZE (32)
++
++struct gfs_rgrp_lvb {
++ uint32_t rb_magic;
++ uint32_t rb_free;
++ uint32_t rb_useddi;
++ uint32_t rb_freedi;
++ uint32_t rb_usedmeta;
++ uint32_t rb_freemeta;
++};
++
++struct gfs_quota_lvb {
++ uint32_t qb_magic;
++ uint32_t qb_pad;
++ uint64_t qb_limit;
++ uint64_t qb_warn;
++ int64_t qb_value;
++};
++
++/* Translation functions */
++
++void gfs_rgrp_lvb_in(struct gfs_rgrp_lvb *rb, char *lvb);
++void gfs_rgrp_lvb_out(struct gfs_rgrp_lvb *rb, char *lvb);
++void gfs_quota_lvb_in(struct gfs_quota_lvb *qb, char *lvb);
++void gfs_quota_lvb_out(struct gfs_quota_lvb *qb, char *lvb);
++
++/* Printing functions */
++
++void gfs_rgrp_lvb_print(struct gfs_rgrp_lvb *rb);
++void gfs_quota_lvb_print(struct gfs_quota_lvb *qb);
++
++#endif /* __LVB_DOT_H__ */
+diff -urN linux-orig/fs/gfs/main.c linux-patched/fs/gfs/main.c
+--- linux-orig/fs/gfs/main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/main.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,142 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/proc_fs.h>
++#include <linux/module.h>
++#include <linux/init.h>
++
++#include "gfs.h"
++#include "mount.h"
++#include "ops_fstype.h"
++
++struct proc_dir_entry *gfs_proc_entry = NULL;
++
++/**
++ * init_gfs_fs - Register GFS as a filesystem
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++int __init
++init_gfs_fs(void)
++{
++ int error = 0;
++
++ init_MUTEX(&gfs_mount_args_lock);
++
++ gfs_proc_entry = create_proc_read_entry("fs/gfs", S_IFREG | 0200, NULL, NULL, NULL);
++ if (!gfs_proc_entry) {
++ printk("GFS: can't register /proc/fs/gfs\n");
++ error = -EINVAL;
++ goto fail;
++ }
++ gfs_proc_entry->write_proc = gfs_proc_write;
++
++ gfs_random_number = xtime.tv_nsec;
++
++ gfs_glock_cachep = kmem_cache_create("gfs_glock", sizeof(struct gfs_glock),
++ 0, 0,
++ NULL, NULL);
++ if (!gfs_glock_cachep)
++ goto fail2;
++
++ gfs_inode_cachep = kmem_cache_create("gfs_inode", sizeof(struct gfs_inode),
++ 0, 0,
++ NULL, NULL);
++ if (!gfs_inode_cachep)
++ goto fail2;
++
++ gfs_bufdata_cachep = kmem_cache_create("gfs_bufdata", sizeof(struct gfs_bufdata),
++ 0, 0,
++ NULL, NULL);
++ if (!gfs_bufdata_cachep)
++ goto fail2;
++
++ gfs_mhc_cachep = kmem_cache_create("gfs_meta_header_cache", sizeof(struct gfs_meta_header_cache),
++ 0, 0,
++ NULL, NULL);
++ if (!gfs_mhc_cachep)
++ goto fail2;
++
++ error = register_filesystem(&gfs_fs_type);
++ if (error)
++ goto fail2;
++
++ printk("GFS %s (built %s %s) installed\n",
++ GFS_RELEASE_NAME, __DATE__, __TIME__);
++
++ return 0;
++
++ fail2:
++ if (gfs_mhc_cachep)
++ kmem_cache_destroy(gfs_mhc_cachep);
++
++ if (gfs_bufdata_cachep)
++ kmem_cache_destroy(gfs_bufdata_cachep);
++
++ if (gfs_inode_cachep)
++ kmem_cache_destroy(gfs_inode_cachep);
++
++ if (gfs_glock_cachep)
++ kmem_cache_destroy(gfs_glock_cachep);
++
++ down(&gfs_mount_args_lock);
++ if (gfs_mount_args) {
++ kfree(gfs_mount_args);
++ gfs_mount_args = NULL;
++ }
++ up(&gfs_mount_args_lock);
++ remove_proc_entry("fs/gfs", NULL);
++
++ fail:
++ return error;
++}
++
++/**
++ * exit_gfs_fs - Unregister the file system
++ *
++ */
++
++void __exit
++exit_gfs_fs(void)
++{
++ unregister_filesystem(&gfs_fs_type);
++
++ kmem_cache_destroy(gfs_mhc_cachep);
++ kmem_cache_destroy(gfs_bufdata_cachep);
++ kmem_cache_destroy(gfs_inode_cachep);
++ kmem_cache_destroy(gfs_glock_cachep);
++
++ down(&gfs_mount_args_lock);
++ if (gfs_mount_args) {
++ kfree(gfs_mount_args);
++ gfs_mount_args = NULL;
++ }
++ up(&gfs_mount_args_lock);
++ remove_proc_entry("fs/gfs", NULL);
++}
++
++MODULE_DESCRIPTION("Global File System " GFS_RELEASE_NAME);
++MODULE_AUTHOR("Red Hat, Inc.");
++MODULE_LICENSE("GPL");
++
++module_init(init_gfs_fs);
++module_exit(exit_gfs_fs);
++
+diff -urN linux-orig/fs/gfs/mount.c linux-patched/fs/gfs/mount.c
+--- linux-orig/fs/gfs/mount.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/mount.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,212 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/module.h>
++#include <asm/uaccess.h>
++
++#include "gfs.h"
++#include "mount.h"
++
++char *gfs_mount_args = NULL;
++struct semaphore gfs_mount_args_lock;
++
++/**
++ * gfs_make_args - Parse mount arguments
++ * @data: the comma-separated mount option string passed to mount
++ * @args: the gfs_args structure to fill in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_make_args(char *data, struct gfs_args *args)
++{
++ char *options, *x, *y;
++ int do_free = FALSE;
++ int error = 0;
++
++ /* If someone preloaded options, use those instead */
++
++ down(&gfs_mount_args_lock);
++ if (gfs_mount_args) {
++ data = gfs_mount_args;
++ gfs_mount_args = NULL;
++ do_free = TRUE;
++ }
++ up(&gfs_mount_args_lock);
++
++ /* Set some defaults */
++
++ memset(args, 0, sizeof(struct gfs_args));
++ args->ar_num_glockd = GFS_GLOCKD_DEFAULT;
++
++ /* Split the options into tokens with the "," character and
++ process them */
++
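++ /* An illustrative option string (values are examples only):
++ "lockproto=lock_dlm,locktable=alpha:gfs1,num_glockd=8,acl" */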
++ for (options = data; (x = strsep(&options, ",")); ) {
++ if (!*x)
++ continue;
++
++ y = strchr(x, '=');
++ if (y)
++ *y++ = 0;
++
++ if (!strcmp(x, "lockproto")) {
++ if (!y) {
++ printk("GFS: need argument to lockproto\n");
++ error = -EINVAL;
++ break;
++ }
++ strncpy(args->ar_lockproto, y, 256);
++ args->ar_lockproto[255] = 0;
++ }
++
++ else if (!strcmp(x, "locktable")) {
++ if (!y) {
++ printk("GFS: need argument to locktable\n");
++ error = -EINVAL;
++ break;
++ }
++ strncpy(args->ar_locktable, y, 256);
++ args->ar_locktable[255] = 0;
++ }
++
++ else if (!strcmp(x, "hostdata")) {
++ if (!y) {
++ printk("GFS: need argument to hostdata\n");
++ error = -EINVAL;
++ break;
++ }
++ strncpy(args->ar_hostdata, y, 256);
++ args->ar_hostdata[255] = 0;
++ }
++
++ else if (!strcmp(x, "ignore_local_fs"))
++ args->ar_ignore_local_fs = TRUE;
++
++ else if (!strcmp(x, "localflocks"))
++ args->ar_localflocks = TRUE;
++
++ else if (!strcmp(x, "localcaching"))
++ args->ar_localcaching = TRUE;
++
++ else if (!strcmp(x, "upgrade"))
++ args->ar_upgrade = TRUE;
++
++ else if (!strcmp(x, "num_glockd")) {
++ if (!y) {
++ printk("GFS: need argument to num_glockd\n");
++ error = -EINVAL;
++ break;
++ }
++ sscanf(y, "%u", &args->ar_num_glockd);
++ if (!args->ar_num_glockd || args->ar_num_glockd > GFS_GLOCKD_MAX) {
++ printk("GFS: 0 < num_glockd <= %u (not %u)\n",
++ GFS_GLOCKD_MAX, args->ar_num_glockd);
++ error = -EINVAL;
++ break;
++ }
++ }
++
++ else if (!strcmp(x, "acl"))
++ args->ar_posixacls = TRUE;
++
++ /* Unknown */
++
++ else {
++ printk("GFS: unknown option: %s\n", x);
++ error = -EINVAL;
++ break;
++ }
++ }
++
++ if (error)
++ printk("GFS: invalid mount option(s)\n");
++
++ if (do_free)
++ kfree(data);
++
++ return error;
++}
++
++/**
++ * gfs_proc_write - Read in some mount options
++ * @file: unused
++ * @buffer: a buffer of mount options
++ * @count: the length of the mount options
++ * @data: unused
++ *
++ * Called when someone writes to /proc/fs/gfs.
++ * It allows you to specify mount options when you can't do it
++ * from mount(8), e.g. from an initial ramdisk.
++ *
++ * Returns: count on success, -EXXX on failure
++ */
++
++int
++gfs_proc_write(struct file *file,
++ const char *buffer, unsigned long count,
++ void *data)
++{
++ int error;
++ char *p;
++
++ if (!try_module_get(THIS_MODULE))
++ return -EAGAIN; /* Huh!?! */
++ down(&gfs_mount_args_lock);
++
++ if (gfs_mount_args) {
++ kfree(gfs_mount_args);
++ gfs_mount_args = NULL;
++ }
++
++ if (!count) {
++ error = 0;
++ goto fail;
++ }
++
++ gfs_mount_args = gmalloc(count + 1);
++
++ error = -EFAULT;
++ if (copy_from_user(gfs_mount_args, buffer, count))
++ goto fail_free;
++
++ gfs_mount_args[count] = 0;
++
++ /* Get rid of extra newlines */
++
++ for (p = gfs_mount_args; *p; p++)
++ if (*p == '\n')
++ *p = 0;
++
++ up(&gfs_mount_args_lock);
++ module_put(THIS_MODULE);
++
++ return count;
++
++ fail_free:
++ kfree(gfs_mount_args);
++ gfs_mount_args = NULL;
++
++ fail:
++ up(&gfs_mount_args_lock);
++ module_put(THIS_MODULE);
++ return error;
++}
+diff -urN linux-orig/fs/gfs/mount.h linux-patched/fs/gfs/mount.h
+--- linux-orig/fs/gfs/mount.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/mount.h 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,27 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __MOUNT_DOT_H__
++#define __MOUNT_DOT_H__
++
++int gfs_make_args(char *data, struct gfs_args *args);
++
++/* Allow args to be passed to GFS when using an initial ram disk */
++
++extern char *gfs_mount_args;
++extern struct semaphore gfs_mount_args_lock;
++
++int gfs_proc_write(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++
++#endif /* __MOUNT_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ondisk.c linux-patched/fs/gfs/ondisk.c
+--- linux-orig/fs/gfs/ondisk.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ondisk.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,28 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++
++#define pv(struct, member, fmt) printk(" "#member" = "fmt"\n", struct->member);
++
++#define WANT_GFS_CONVERSION_FUNCTIONS
++#include <linux/gfs_ondisk.h>
++
+diff -urN linux-orig/fs/gfs/ops_address.c linux-patched/fs/gfs/ops_address.c
+--- linux-orig/fs/gfs/ops_address.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_address.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,476 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/pagemap.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "dio.h"
++#include "file.h"
++#include "glock.h"
++#include "inode.h"
++#include "ops_address.h"
++#include "page.h"
++#include "quota.h"
++#include "trans.h"
++
++/**
++ * get_block - Fills in a buffer head with details about a block
++ * @inode: The inode
++ * @lblock: The block number to look up
++ * @bh_result: The buffer head to return the result in
++ * @create: Non-zero if we may add a block to the file
++ *
++ * Returns: errno
++ */
++
++static int
++get_block(struct inode *inode, sector_t lblock,
++ struct buffer_head *bh_result, int create)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ int new = create;
++ uint64_t dblock;
++ int error;
++
++ error = gfs_block_map(ip, lblock, &new, &dblock, NULL);
++ if (error)
++ return error;
++
++ GFS_ASSERT_INODE(dblock || !create, ip,);
++
++ if (!dblock)
++ return 0;
++
++ map_bh(bh_result, inode->i_sb, dblock);
++ if (new)
++ set_buffer_new(bh_result);
++
++ return 0;
++}
++
++/**
++ * get_block_noalloc - Fills in a buffer head with details about a block
++ * @inode: The inode
++ * @lblock: The block number to look up
++ * @bh_result: The buffer head to return the result in
++ * @create: Non-zero if we may add a block to the file
++ *
++ * Returns: errno
++ */
++
++static int
++get_block_noalloc(struct inode *inode, sector_t lblock,
++ struct buffer_head *bh_result, int create)
++{
++ int error;
++
++ error = get_block(inode, lblock, bh_result, FALSE);
++
++ GFS_ASSERT_INODE(!create || buffer_mapped(bh_result),
++ vn2ip(inode),);
++
++ return error;
++}
++
++/**
++ * get_blocks - Fills in buffer heads with details about an extent of blocks
++ * @inode: The inode
++ * @lblock: The first block number to look up
++ * @max_blocks: The maximum number of blocks to map
++ * @bh_result: The buffer head to return the result in
++ * @create: Non-zero if we may add a block to the file
++ *
++ * Returns: errno
++ */
++
++static int
++get_blocks(struct inode *inode, sector_t lblock,
++ unsigned long max_blocks,
++ struct buffer_head *bh_result, int create)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ int new = create;
++ uint64_t dblock;
++ uint32_t extlen;
++ int error;
++
++ error = gfs_block_map(ip, lblock, &new, &dblock, &extlen);
++ if (error)
++ return error;
++
++ GFS_ASSERT_INODE(dblock || !create, ip,);
++
++ if (!dblock)
++ return 0;
++
++ map_bh(bh_result, inode->i_sb, dblock);
++ if (new)
++ set_buffer_new(bh_result);
++
++ if (extlen > max_blocks)
++ extlen = max_blocks;
++ bh_result->b_size = extlen << inode->i_blkbits;
++
++ return 0;
++}
++
++/**
++ * get_blocks_noalloc - Fills in buffer heads; asserts nothing needs allocating
++ * @inode: The inode
++ * @lblock: The first block number to look up
++ * @max_blocks: The maximum number of blocks to map
++ * @bh_result: The buffer head to return the result in
++ * @create: Non-zero if we may add a block to the file
++ *
++ * Returns: errno
++ */
++
++static int
++get_blocks_noalloc(struct inode *inode, sector_t lblock,
++ unsigned long max_blocks,
++ struct buffer_head *bh_result, int create)
++{
++ int error;
++
++ error = get_blocks(inode, lblock, max_blocks, bh_result, FALSE);
++
++ GFS_ASSERT_INODE(!create || buffer_mapped(bh_result),
++ vn2ip(inode),);
++
++ return error;
++}
++
++/**
++ * gfs_writepage - Write complete page
++ * @page: Page to write
++ * @wbc: the writeback control for this page
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_writepage(struct page *page, struct writeback_control *wbc)
++{
++ struct gfs_inode *ip = vn2ip(page->mapping->host);
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_address);
++
++ GFS_ASSERT_INODE(gfs_glock_is_held_excl(ip->i_gl) &&
++ !gfs_is_stuffed(ip), ip,);
++
++ error = block_write_full_page(page, get_block_noalloc, wbc);
++
++ gfs_flush_meta_cache(ip);
++
++ if (error == -EIO)
++ gfs_io_error_inode(ip);
++
++ return error;
++}
++
++/**
++ * stuffed_readpage - Fill in a Linux page with stuffed file data
++ * @ip: the inode
++ * @page: the page
++ *
++ * Returns: errno
++ */
++
++static int
++stuffed_readpage(struct gfs_inode *ip, struct page *page)
++{
++ struct buffer_head *dibh;
++ void *kaddr;
++ int error;
++
++ GFS_ASSERT_INODE(PageLocked(page), ip,);
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (!error) {
++ kaddr = kmap(page);
++ memcpy((char *)kaddr,
++ dibh->b_data + sizeof(struct gfs_dinode),
++ ip->i_di.di_size);
++ memset((char *)kaddr + ip->i_di.di_size,
++ 0,
++ PAGE_CACHE_SIZE - ip->i_di.di_size);
++ kunmap(page);
++
++ brelse(dibh);
++
++ SetPageUptodate(page);
++ }
++
++ return error;
++}
++
++/**
++ * readi_readpage - readpage that goes through gfs_internal_read()
++ * @page: The page to read
++ *
++ * Returns: errno
++ */
++
++static int
++readi_readpage(struct page *page)
++{
++ struct gfs_inode *ip = vn2ip(page->mapping->host);
++ void *kaddr;
++ int ret;
++
++ kaddr = kmap(page);
++
++ ret = gfs_internal_read(ip, kaddr,
++ (uint64_t)page->index << PAGE_CACHE_SHIFT,
++ PAGE_CACHE_SIZE);
++ if (ret >= 0) {
++ if (ret < PAGE_CACHE_SIZE)
++ memset(kaddr + ret, 0, PAGE_CACHE_SIZE - ret);
++ SetPageUptodate(page);
++ ret = 0;
++ }
++
++ kunmap(page);
++
++ unlock_page(page);
++
++ return ret;
++}
++
++/**
++ * gfs_readpage - readpage with locking
++ * @file: The file to read a page for
++ * @page: The page to read
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_readpage(struct file *file, struct page *page)
++{
++ struct gfs_inode *ip = vn2ip(page->mapping->host);
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_address);
++
++ if (!gfs_glock_is_locked_by_me(ip->i_gl)) {
++ unlock_page(page);
++ bitch_about(ip->i_sbd, &ip->i_sbd->sd_last_unlocked_aop,
++ "unlocked readpage request");
++ return -ENOSYS;
++ }
++
++ if (!gfs_is_jdata(ip)) {
++ if (gfs_is_stuffed(ip) && !page->index) {
++ error = stuffed_readpage(ip, page);
++ unlock_page(page);
++ } else
++ error = block_read_full_page(page, get_block);
++ } else
++ error = readi_readpage(page);
++
++ if (error == -EIO)
++ gfs_io_error_inode(ip);
++
++ return error;
++}
++
++/**
++ * gfs_prepare_write - Prepare to write to a file
++ * @file: The file to write to
++ * @page: The page which is to be prepared for writing
++ * @from: From (byte range within page)
++ * @to: To (byte range within page)
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_prepare_write(struct file *file, struct page *page,
++ unsigned from, unsigned to)
++{
++ struct gfs_inode *ip = vn2ip(page->mapping->host);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_address);
++
++ if (!gfs_glock_is_locked_by_me(ip->i_gl)) {
++ bitch_about(sdp, &sdp->sd_last_unlocked_aop,
++ "unlocked prepare_write request");
++ return -ENOSYS;
++ }
++
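++ /* A write that would overflow the space available in the dinode
++ unstuffs the file into real data blocks first; otherwise the
++ stuffed contents are simply pulled into the page. */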
++ if (gfs_is_stuffed(ip)) {
++ uint64_t file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to;
++
++ if (file_size > sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode)) {
++ error = gfs_unstuff_dinode(ip, gfs_unstuffer_page, page);
++ if (!error)
++ error = block_prepare_write(page, from, to, get_block);
++ } else if (!PageUptodate(page))
++ error = stuffed_readpage(ip, page);
++ } else
++ error = block_prepare_write(page, from, to, get_block);
++
++ if (error == -EIO)
++ gfs_io_error_inode(ip);
++
++ return error;
++}
++
++/**
++ * gfs_commit_write - Commit write to a file
++ * @file: The file to write to
++ * @page: The page containing the data
++ * @from: From (byte range within page)
++ * @to: To (byte range within page)
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_commit_write(struct file *file, struct page *page,
++ unsigned from, unsigned to)
++{
++ struct inode *inode = page->mapping->host;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_address);
++
++ if (gfs_is_stuffed(ip)) {
++ struct buffer_head *dibh;
++ uint64_t file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to;
++ void *kaddr;
++
++ GFS_ASSERT_INODE(PageLocked(page), ip,);
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ kaddr = kmap(page);
++ memcpy(dibh->b_data + sizeof(struct gfs_dinode) + from,
++ (char *)kaddr + from,
++ to - from);
++ kunmap(page);
++
++ brelse(dibh);
++
++ SetPageUptodate(page);
++
++ if (inode->i_size < file_size)
++ i_size_write(inode, file_size);
++ } else {
++ error = generic_commit_write(file, page, from, to);
++ if (error)
++ goto fail;
++ }
++
++ return 0;
++
++ fail:
++ ClearPageUptodate(page);
++
++ return error;
++}
++
++/**
++ * gfs_bmap - Block map function
++ * @mapping: Address space info
++ * @lblock: The block to map
++ *
++ * Returns: The disk address for the block or 0 on hole or error
++ */
++
++static sector_t
++gfs_bmap(struct address_space *mapping, sector_t lblock)
++{
++ struct gfs_inode *ip = vn2ip(mapping->host);
++ struct gfs_holder i_gh;
++	sector_t dblock = 0;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_address);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (error)
++ return 0;
++
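++	/* A stuffed file has no data blocks to map, so leave dblock at 0. */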
++ if (!gfs_is_stuffed(ip))
++ dblock = generic_block_bmap(mapping, lblock, get_block);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return dblock;
++}
++
++/**
++ * gfs_direct_IO - direct (uncached) I/O to/from a file
++ * @rw: READ or WRITE
++ * @iocb: the kernel I/O control block
++ * @iov: the vector of user buffers
++ * @offset: the file offset
++ * @nr_segs: the number of segments in @iov
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
++ loff_t offset, unsigned long nr_segs)
++{
++ struct file *file = iocb->ki_filp;
++ struct inode *inode = file->f_mapping->host;
++ struct gfs_inode *ip = vn2ip(inode);
++ get_blocks_t *gb = get_blocks;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_address);
++
++ GFS_ASSERT_INODE(gfs_glock_is_locked_by_me(ip->i_gl), ip,);
++ GFS_ASSERT_INODE(!gfs_is_stuffed(ip), ip,);
++
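++	/* A write done outside of a transaction must not allocate blocks
++	   here; the allocating write path starts its own transaction first. */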
++ if (rw == WRITE && !current_transaction)
++ gb = get_blocks_noalloc;
++
++ error = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
++ offset, nr_segs, gb, NULL);
++
++ if (error == -EIO)
++ gfs_io_error_inode(ip);
++
++ return error;
++}
++
++struct address_space_operations gfs_file_aops = {
++ .writepage = gfs_writepage,
++ .readpage = gfs_readpage,
++ .sync_page = block_sync_page,
++ .prepare_write = gfs_prepare_write,
++ .commit_write = gfs_commit_write,
++ .bmap = gfs_bmap,
++ .direct_IO = gfs_direct_IO,
++};
+diff -urN linux-orig/fs/gfs/ops_address.h linux-patched/fs/gfs/ops_address.h
+--- linux-orig/fs/gfs/ops_address.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_address.h 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_ADDRESS_DOT_H__
++#define __OPS_ADDRESS_DOT_H__
++
++extern struct address_space_operations gfs_file_aops;
++
++#endif /* __OPS_ADDRESS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_dentry.c linux-patched/fs/gfs/ops_dentry.c
+--- linux-orig/fs/gfs/ops_dentry.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_dentry.c 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,124 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dir.h"
++#include "glock.h"
++#include "ops_dentry.h"
++
++/**
++ * gfs_drevalidate - Check directory lookup consistency
++ * @dentry: the dentry to revalidate
++ * @nd: the nameidata from the lookup (unused)
++ *
++ * Check to make sure the lookup necessary to arrive at this inode from its
++ * parent is still good.
++ *
++ * Returns: 1 if the dentry is ok, 0 if it isn't
++ */
++
++static int
++gfs_drevalidate(struct dentry *dentry, struct nameidata *nd)
++{
++ struct dentry *parent = dget_parent(dentry);
++ struct gfs_inode *dip;
++ struct inode *inode;
++ struct gfs_holder d_gh;
++ struct gfs_inode *ip;
++ struct gfs_inum inum;
++ unsigned int type;
++ int error;
++
++ lock_kernel();
++
++ dip = vn2ip(parent->d_inode);
++ GFS_ASSERT(dip,);
++
++ atomic_inc(&dip->i_sbd->sd_ops_dentry);
++
++ if (dip->i_sbd->sd_args.ar_localcaching)
++ goto valid;
++
++ inode = dentry->d_inode;
++ if (inode && is_bad_inode(inode))
++ goto invalid;
++
++ error = gfs_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
++ if (error)
++ goto fail;
++
++ error = gfs_dir_search(dip, &dentry->d_name, &inum, &type);
++ switch (error) {
++ case 0:
++ if (!inode)
++ goto invalid_gunlock;
++ break;
++ case -ENOENT:
++ if (!inode)
++ goto valid_gunlock;
++ goto invalid_gunlock;
++ default:
++ goto fail_gunlock;
++ }
++
++ ip = vn2ip(inode);
++ GFS_ASSERT_SBD(ip, dip->i_sbd,);
++
++ if (ip->i_num.no_formal_ino != inum.no_formal_ino)
++ goto invalid_gunlock;
++
++ GFS_ASSERT_INODE(ip->i_di.di_type == type, ip,);
++
++ valid_gunlock:
++ gfs_glock_dq_uninit(&d_gh);
++
++ valid:
++ unlock_kernel();
++ dput(parent);
++ return 1;
++
++ invalid_gunlock:
++ gfs_glock_dq_uninit(&d_gh);
++
++ invalid:
++ if (inode && S_ISDIR(inode->i_mode)) {
++ if (have_submounts(dentry))
++ goto valid;
++ shrink_dcache_parent(dentry);
++ }
++ d_drop(dentry);
++
++ unlock_kernel();
++ dput(parent);
++ return 0;
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&d_gh);
++
++ fail:
++ unlock_kernel();
++ dput(parent);
++ return 0;
++}
++
++struct dentry_operations gfs_dops = {
++ .d_revalidate = gfs_drevalidate,
++};
+diff -urN linux-orig/fs/gfs/ops_dentry.h linux-patched/fs/gfs/ops_dentry.h
+--- linux-orig/fs/gfs/ops_dentry.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_dentry.h 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_DENTRY_DOT_H__
++#define __OPS_DENTRY_DOT_H__
++
++extern struct dentry_operations gfs_dops;
++
++#endif /* __OPS_DENTRY_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_export.c linux-patched/fs/gfs/ops_export.c
+--- linux-orig/fs/gfs/ops_export.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_export.c 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,415 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "dir.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "ops_export.h"
++#include "rgrp.h"
++
++struct inode_cookie
++{
++ uint64_t formal_ino;
++ uint32_t gen;
++ int gen_valid;
++};
++
++struct get_name_filldir
++{
++ uint64_t formal_ino;
++ char *name;
++};
++
++/**
++ * gfs_decode_fh - Decode an NFS file handle into a dentry
++ * @sb: the filesystem's superblock
++ * @fh: the file handle data
++ * @fh_len: the length of @fh, in 32-bit words
++ * @fh_type: the type of handle (3, 5, or 6 words)
++ * @acceptable: test whether a candidate dentry is acceptable
++ * @context: opaque data passed through to @acceptable
++ *
++ * Returns: the dentry on success, NULL on failure
++ */
++
++struct dentry *
++gfs_decode_fh(struct super_block *sb, __u32 *fh, int fh_len, int fh_type,
++ int (*acceptable)(void *context, struct dentry *dentry),
++ void *context)
++{
++ struct inode_cookie this, parent;
++
++ atomic_inc(&vfs2sdp(sb)->sd_ops_export);
++
++ if (fh_type != fh_len)
++ return NULL;
++
++ memset(&parent, 0, sizeof(struct inode_cookie));
++
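++	/* The cases below fall through intentionally: a connectable
++	   (5- or 6-word) handle also contains the 3-word "this" part. */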
++ switch (fh_type) {
++ case 6:
++ parent.gen_valid = TRUE;
++		parent.gen = gfs32_to_cpu(fh[5]);
++ case 5:
++ parent.formal_ino = ((uint64_t)gfs32_to_cpu(fh[3])) << 32;
++ parent.formal_ino |= (uint64_t)gfs32_to_cpu(fh[4]);
++ case 3:
++ this.gen_valid = TRUE;
++ this.gen = gfs32_to_cpu(fh[2]);
++ this.formal_ino = ((uint64_t)gfs32_to_cpu(fh[0])) << 32;
++ this.formal_ino |= (uint64_t)gfs32_to_cpu(fh[1]);
++ break;
++ default:
++ return NULL;
++ }
++
++ return gfs_export_ops.find_exported_dentry(sb, &this, &parent,
++ acceptable, context);
++}
++
++/**
++ * gfs_encode_fh - Encode a dentry into an NFS file handle
++ * @dentry: the dentry to encode
++ * @fh: the buffer to fill in
++ * @len: in: the size of @fh in 32-bit words; out: the words used
++ * @connectable: TRUE if the parent directory should be encoded too
++ *
++ * Returns: the handle type on success, 255 if the buffer is too small
++ */
++
++int
++gfs_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
++ int connectable)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ int maxlen = *len;
++
++ atomic_inc(&ip->i_sbd->sd_ops_export);
++
++ if (maxlen < 3)
++ return 255;
++
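++	/* Words 0-1 hold this inode's formal number and word 2 its
++	   generation; connectable handles append the parent's number
++	   (words 3-4) and generation (word 5). */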
++ fh[0] = cpu_to_gfs32((uint32_t)(ip->i_num.no_formal_ino >> 32));
++ fh[1] = cpu_to_gfs32((uint32_t)(ip->i_num.no_formal_ino & 0xFFFFFFFF));
++ fh[2] = cpu_to_gfs32(inode->i_generation);
++ *len = 3;
++
++ if (maxlen < 5 || !connectable)
++ return 3;
++
++ spin_lock(&dentry->d_lock);
++
++ inode = dentry->d_parent->d_inode;
++ ip = vn2ip(inode);
++
++ fh[3] = cpu_to_gfs32((uint32_t)(ip->i_num.no_formal_ino >> 32));
++ fh[4] = cpu_to_gfs32((uint32_t)(ip->i_num.no_formal_ino & 0xFFFFFFFF));
++ *len = 5;
++
++ if (maxlen < 6) {
++ spin_unlock(&dentry->d_lock);
++ return 5;
++ }
++
++ fh[5] = cpu_to_gfs32(inode->i_generation);
++
++ spin_unlock(&dentry->d_lock);
++
++ *len = 6;
++
++ return 6;
++}
++
++/**
++ * get_name_filldir - filldir callback used by gfs_get_name()
++ * @opaque: the struct get_name_filldir being filled in
++ * @name: the name of the current directory entry
++ * @length: the length of the name
++ * @offset: the entry's offset in the directory
++ * @inum: the inode number the entry points to
++ * @type: the type of inode the entry points to
++ *
++ * Returns: 0 to keep scanning, 1 once the sought inode has been found
++ */
++
++static int
++get_name_filldir(void *opaque,
++ const char *name, unsigned int length,
++ uint64_t offset,
++ struct gfs_inum *inum, unsigned int type)
++{
++ struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
++
++ if (inum->no_formal_ino != gnfd->formal_ino)
++ return 0;
++
++ memcpy(gnfd->name, name, length);
++ gnfd->name[length] = 0;
++
++ return 1;
++}
++
++/**
++ * gfs_get_name - find the name of a directory entry by inode number
++ * @parent: the directory to search
++ * @name: the buffer to fill with the name
++ * @child: the dentry whose inode is being searched for
++ *
++ * Scans @parent for an entry that points to @child's inode.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int gfs_get_name(struct dentry *parent, char *name,
++ struct dentry *child)
++{
++ struct inode *dir = parent->d_inode;
++ struct inode *inode = child->d_inode;
++ struct gfs_inode *dip, *ip;
++ struct get_name_filldir gnfd;
++ struct gfs_holder gh;
++ uint64_t offset = 0;
++ int error;
++
++ if (!dir)
++ return -EINVAL;
++
++ atomic_inc(&vfs2sdp(dir->i_sb)->sd_ops_export);
++
++ if (!S_ISDIR(dir->i_mode) || !inode)
++ return -EINVAL;
++
++ dip = vn2ip(dir);
++ ip = vn2ip(inode);
++
++ *name = 0;
++ gnfd.formal_ino = ip->i_num.no_formal_ino;
++ gnfd.name = name;
++
++ error = gfs_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
++ if (error)
++ return error;
++
++ error = gfs_dir_read(dip, &offset, &gnfd, get_name_filldir);
++
++ gfs_glock_dq_uninit(&gh);
++
++	if (!error && !*name)
++ error = -ENOENT;
++
++ return error;
++}
++
++/**
++ * gfs_get_parent - get the parent directory of a dentry
++ * @child: the dentry whose parent is wanted
++ *
++ * Looks up ".." in @child's inode and returns an anonymous
++ * dentry for the result.
++ *
++ * Returns: the parent's dentry, or ERR_PTR() on failure
++ */
++
++struct dentry *
++gfs_get_parent(struct dentry *child)
++{
++ struct gfs_inode *dip = vn2ip(child->d_inode);
++ struct gfs_holder d_gh, i_gh;
++ struct qstr dotdot = { .name = "..", .len = 2 };
++ struct gfs_inode *ip;
++ struct inode *inode;
++ struct dentry *dentry;
++ int error;
++
++ atomic_inc(&dip->i_sbd->sd_ops_export);
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++ error = gfs_lookupi(&d_gh, &dotdot, TRUE, &i_gh);
++ if (error)
++ goto fail;
++
++ error = -ENOENT;
++ if (!i_gh.gh_gl)
++ goto fail;
++
++ ip = gl2ip(i_gh.gh_gl);
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return ERR_PTR(-ENOMEM);
++
++ dentry = d_alloc_anon(inode);
++ if (!dentry) {
++ iput(inode);
++ return ERR_PTR(-ENOMEM);
++ }
++
++ return dentry;
++
++ fail:
++ gfs_holder_uninit(&d_gh);
++ return ERR_PTR(error);
++}
++
++/**
++ * gfs_get_dentry - look up an inode from a file handle cookie
++ * @sb: the filesystem's superblock
++ * @inump: the struct inode_cookie decoded from the file handle
++ *
++ * If the inode isn't in memory, the block is checked on disk to
++ * make sure it's still an in-use dinode.
++ *
++ * Returns: the dentry, or ERR_PTR() on failure
++ */
++
++struct dentry *
++gfs_get_dentry(struct super_block *sb, void *inump)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++ struct inode_cookie *cookie = (struct inode_cookie *)inump;
++ struct gfs_inum inum;
++ struct gfs_holder i_gh, ri_gh, rgd_gh;
++ struct gfs_rgrpd *rgd;
++ struct buffer_head *bh;
++ struct gfs_dinode *di;
++ struct gfs_inode *ip;
++ struct inode *inode;
++ struct dentry *dentry;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_export);
++
++ if (!cookie->formal_ino ||
++ cookie->formal_ino == sdp->sd_jiinode->i_num.no_formal_ino ||
++ cookie->formal_ino == sdp->sd_riinode->i_num.no_formal_ino ||
++ cookie->formal_ino == sdp->sd_qinode->i_num.no_formal_ino ||
++ cookie->formal_ino == sdp->sd_linode->i_num.no_formal_ino)
++ return ERR_PTR(-EINVAL);
++
++ inum.no_formal_ino = cookie->formal_ino;
++ inum.no_addr = cookie->formal_ino;
++
++ error = gfs_glock_nq_num(sdp,
++ inum.no_formal_ino, &gfs_inode_glops,
++ LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
++ &i_gh);
++ if (error)
++ return ERR_PTR(error);
++
++ error = gfs_inode_get(i_gh.gh_gl, &inum, NO_CREATE, &ip);
++ if (error)
++ goto fail;
++ if (ip)
++ goto out;
++
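++	/* The inode isn't in memory, so verify that the block is still
++	   an in-use dinode before instantiating it; this is what catches
++	   stale file handles and turns them into -ESTALE. */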
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ goto fail;
++
++ error = -EINVAL;
++ rgd = gfs_blk2rgrpd(sdp, inum.no_addr);
++ if (!rgd)
++ goto fail_rindex;
++
++ error = gfs_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
++ if (error)
++ goto fail_rindex;
++
++ error = -ESTALE;
++ if (gfs_get_block_type(rgd, inum.no_addr) != GFS_BLKST_USEDMETA)
++ goto fail_rgd;
++
++ error = gfs_dread(sdp, inum.no_addr, i_gh.gh_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto fail_rgd;
++
++ di = (struct gfs_dinode *)bh->b_data;
++
++ error = -ESTALE;
++ if (gfs32_to_cpu(di->di_header.mh_magic) != GFS_MAGIC ||
++ gfs32_to_cpu(di->di_header.mh_type) != GFS_METATYPE_DI ||
++ (gfs32_to_cpu(di->di_flags) & GFS_DIF_UNUSED))
++ goto fail_relse;
++
++ brelse(bh);
++ gfs_glock_dq_uninit(&rgd_gh);
++ gfs_glock_dq_uninit(&ri_gh);
++
++ error = gfs_inode_get(i_gh.gh_gl, &inum, CREATE, &ip);
++ if (error)
++ goto fail;
++
++ atomic_inc(&sdp->sd_fh2dentry_misses);
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return ERR_PTR(-ENOMEM);
++
++ if (cookie->gen_valid && cookie->gen != inode->i_generation) {
++ iput(inode);
++ return ERR_PTR(-ESTALE);
++ }
++
++ dentry = d_alloc_anon(inode);
++ if (!dentry) {
++ iput(inode);
++ return ERR_PTR(-ENOMEM);
++ }
++
++ return dentry;
++
++ fail_relse:
++ brelse(bh);
++
++ fail_rgd:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ fail_rindex:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ fail:
++ gfs_glock_dq_uninit(&i_gh);
++ return ERR_PTR(error);
++}
++
++struct export_operations gfs_export_ops = {
++ .decode_fh = gfs_decode_fh,
++ .encode_fh = gfs_encode_fh,
++ .get_name = gfs_get_name,
++ .get_parent = gfs_get_parent,
++ .get_dentry = gfs_get_dentry,
++};
++
+diff -urN linux-orig/fs/gfs/ops_export.h linux-patched/fs/gfs/ops_export.h
+--- linux-orig/fs/gfs/ops_export.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_export.h 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_EXPORT_DOT_H__
++#define __OPS_EXPORT_DOT_H__
++
++extern struct export_operations gfs_export_ops;
++
++#endif /* __OPS_EXPORT_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_file.c linux-patched/fs/gfs/ops_file.c
+--- linux-orig/fs/gfs/ops_file.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_file.c 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,1552 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <asm/uaccess.h>
++#include <linux/pagemap.h>
++#include <linux/uio.h>
++#include <linux/blkdev.h>
++#include <linux/mm.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "dio.h"
++#include "dir.h"
++#include "file.h"
++#include "flock.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "ioctl.h"
++#include "log.h"
++#include "ops_file.h"
++#include "ops_vm.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++
++struct filldir_bad_entry {
++ char *fbe_name;
++ unsigned int fbe_length;
++ uint64_t fbe_offset;
++ struct gfs_inum fbe_inum;
++ unsigned int fbe_type;
++};
++
++struct filldir_bad {
++ struct gfs_sbd *fdb_sbd;
++ int fdb_prefetch;
++
++ struct filldir_bad_entry *fdb_entry;
++ unsigned int fdb_entry_num;
++ unsigned int fdb_entry_off;
++
++ char *fdb_name;
++ unsigned int fdb_name_size;
++ unsigned int fdb_name_off;
++};
++
++struct filldir_reg {
++ struct gfs_sbd *fdr_sbd;
++ int fdr_prefetch;
++
++ filldir_t fdr_filldir;
++ void *fdr_opaque;
++};
++
++typedef ssize_t (*do_rw_t) (struct file *file,
++			    char *buf,
++			    size_t size, loff_t *offset,
++			    unsigned int num_gh, struct gfs_holder *ghs);
++
++/**
++ * gfs_llseek - seek to a location in a file
++ * @file: the file
++ * @offset: the offset
++ * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
++ *
++ * SEEK_END requires the glock for the file because it references the
++ * file's size.
++ *
++ * Returns: The new offset, or -EXXX on error
++ */
++
++static loff_t
++gfs_llseek(struct file *file, loff_t offset, int origin)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ struct gfs_holder i_gh;
++ loff_t error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++
++	if (origin == 2) { /* SEEK_END */
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (!error) {
++ error = remote_llseek(file, offset, origin);
++ gfs_glock_dq_uninit(&i_gh);
++ }
++ } else
++ error = remote_llseek(file, offset, origin);
++
++ return error;
++}
++
++#define vma2state(vma) \
++((((vma)->vm_flags & (VM_MAYWRITE | VM_MAYSHARE)) == \
++ (VM_MAYWRITE | VM_MAYSHARE)) ? \
++  LM_ST_EXCLUSIVE : LM_ST_SHARED)
++
++/**
++ * walk_vm_hard - acquire a glock for each GFS mapping under a buffer
++ * @file: the file to read/write from/to
++ * @buf: the userspace buffer
++ * @size: the amount of data requested
++ * @offset: the current file offset
++ * @operation: the read or write worker function
++ *
++ * Builds a holder for every GFS file mapped into @buf so that
++ * @operation can acquire all of the locks in the correct order.
++ *
++ * Returns: the number of bytes read/written, -EXXX on failure
++ */
++
++static ssize_t
++walk_vm_hard(struct file *file, char *buf, size_t size, loff_t *offset,
++ do_rw_t operation)
++{
++ struct gfs_holder *ghs;
++ unsigned int num_gh = 0;
++ ssize_t count;
++
++ {
++ struct super_block *sb = file->f_dentry->d_inode->i_sb;
++ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
++ unsigned long start = (unsigned long)buf;
++ unsigned long end = start + size;
++ int dumping = (current->flags & PF_DUMPCORE);
++ unsigned int x = 0;
++
++ for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
++ if (end <= vma->vm_start)
++ break;
++ if (vma->vm_file &&
++ vma->vm_file->f_dentry->d_inode->i_sb == sb) {
++ num_gh++;
++ }
++ }
++
++ ghs = kmalloc((num_gh + 1) * sizeof(struct gfs_holder), GFP_KERNEL);
++ if (!ghs) {
++ if (!dumping)
++ up_read(&mm->mmap_sem);
++ return -ENOMEM;
++ }
++
++ for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
++ if (end <= vma->vm_start)
++ break;
++ if (vma->vm_file) {
++ struct inode *inode = vma->vm_file->f_dentry->d_inode;
++ if (inode->i_sb == sb)
++ gfs_holder_init(vn2ip(inode)->i_gl,
++ vma2state(vma),
++ 0, &ghs[x++]);
++ }
++ }
++
++ if (!dumping)
++ up_read(&mm->mmap_sem);
++
++ GFS_ASSERT_SBD(x == num_gh, vfs2sdp(sb),);
++ }
++
++ count = operation(file, buf, size, offset, num_gh, ghs);
++
++ while (num_gh--)
++ gfs_holder_uninit(&ghs[num_gh]);
++ kfree(ghs);
++
++ return count;
++}
++
++/**
++ * walk_vm - Walk the vmas associated with a buffer for read or write.
++ * If any of them are gfs, pass the gfs inode down to the read/write
++ * worker function so that locks can be acquired in the correct order.
++ * @file: The file to read/write from/to
++ * @buf: The buffer to copy to/from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ * @operation: The read or write worker function
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -errno on failure
++ */
++
++static ssize_t
++walk_vm(struct file *file, char *buf, size_t size, loff_t *offset,
++ do_rw_t operation)
++{
++ if (current->mm) {
++ struct super_block *sb = file->f_dentry->d_inode->i_sb;
++ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
++ unsigned long start = (unsigned long)buf;
++ unsigned long end = start + size;
++ int dumping = (current->flags & PF_DUMPCORE);
++
++ if (!dumping)
++ down_read(&mm->mmap_sem);
++
++ for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
++ if (end <= vma->vm_start)
++ break;
++ if (vma->vm_file &&
++ vma->vm_file->f_dentry->d_inode->i_sb == sb)
++ goto do_locks;
++ }
++
++ if (!dumping)
++ up_read(&mm->mmap_sem);
++ }
++
++ {
++ struct gfs_holder gh;
++ return operation(file, buf, size, offset, 0, &gh);
++ }
++
++ do_locks:
++ return walk_vm_hard(file, buf, size, offset, operation);
++}
++
++/**
++ * do_read_readi - read from a file through gfs_readi()
++ * @file: the file to read from
++ * @buf: the buffer to copy into
++ * @size: the amount of data requested
++ * @offset: the current file offset
++ *
++ * Used for journaled-data and stuffed files, which bypass the page cache.
++ *
++ * Returns: the number of bytes read, -EXXX on failure
++ */
++
++static ssize_t
++do_read_readi(struct file *file, char *buf, size_t size, loff_t *offset)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ ssize_t count = 0;
++
++ if (*offset < 0)
++ return -EINVAL;
++ if (!access_ok(VERIFY_WRITE, buf, size))
++ return -EFAULT;
++
++ if (!(file->f_flags & O_LARGEFILE)) {
++ if (*offset >= 0x7FFFFFFFull)
++ return -EFBIG;
++ if (*offset + size > 0x7FFFFFFFull)
++ size = 0x7FFFFFFFull - *offset;
++ }
++
++ count = gfs_readi(ip, buf, *offset, size, gfs_copy2user);
++
++ if (count > 0)
++ *offset += count;
++
++ return count;
++}
++
++/**
++ * do_read_direct - Read bytes from a file
++ * @file: The file to read from
++ * @buf: The buffer to copy into
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ * @num_gh: The number of other locks we need to do the read
++ * @ghs: the locks we need plus one for our lock
++ *
++ * Outputs: Offset - updated according to number of bytes read
++ *
++ * Returns: The number of bytes read, -EXXX on failure
++ */
++
++static ssize_t
++do_read_direct(struct file *file, char *buf, size_t size, loff_t *offset,
++ unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct inode *inode = file->f_mapping->host;
++ struct gfs_inode *ip = vn2ip(inode);
++ unsigned int state = LM_ST_DEFERRED;
++ int flags = 0;
++ unsigned int x;
++ ssize_t count = 0;
++ int error;
++
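++	/* Direct I/O normally uses the deferred state, which lets all
++	   nodes do uncached I/O concurrently while keeping cached
++	   (buffered) holders out. If this inode's glock is already in
++	   the holder set, fall back to shared and locally exclusive. */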
++ for (x = 0; x < num_gh; x++)
++ if (ghs[x].gh_gl == ip->i_gl) {
++ state = LM_ST_SHARED;
++ flags |= GL_LOCAL_EXCL;
++ break;
++ }
++
++ gfs_holder_init(ip->i_gl, state, flags, &ghs[num_gh]);
++
++ error = gfs_glock_nq_m(num_gh + 1, ghs);
++ if (error)
++ goto out;
++
++ error = -EINVAL;
++ if (gfs_is_jdata(ip))
++ goto out_gunlock;
++
++ if (gfs_is_stuffed(ip)) {
++ size_t mask = bdev_hardsect_size(inode->i_sb->s_bdev) - 1;
++
++ if (((*offset) & mask) || (((unsigned long)buf) & mask))
++ goto out_gunlock;
++
++ count = do_read_readi(file, buf, size & ~mask, offset);
++	} else
++ count = generic_file_read(file, buf, size, offset);
++
++ error = 0;
++
++ out_gunlock:
++ gfs_glock_dq_m(num_gh + 1, ghs);
++
++ out:
++ gfs_holder_uninit(&ghs[num_gh]);
++
++ return (count) ? count : error;
++}
++
++/**
++ * do_read_buf - Read bytes from a file
++ * @file: The file to read from
++ * @buf: The buffer to copy into
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ * @num_gh: The number of other locks we need to do the read
++ * @ghs: the locks we need plus one for our lock
++ *
++ * Outputs: Offset - updated according to number of bytes read
++ *
++ * Returns: The number of bytes read, -EXXX on failure
++ */
++
++static ssize_t
++do_read_buf(struct file *file, char *buf, size_t size, loff_t *offset,
++ unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ ssize_t count = 0;
++ int error;
++
++ gfs_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &ghs[num_gh]);
++
++ error = gfs_glock_nq_m_atime(num_gh + 1, ghs);
++ if (error)
++ goto out;
++
++ if (gfs_is_jdata(ip) ||
++ (gfs_is_stuffed(ip) && !test_bit(GIF_PAGED, &ip->i_flags)))
++ count = do_read_readi(file, buf, size, offset);
++ else
++ count = generic_file_read(file, buf, size, offset);
++
++ gfs_glock_dq_m(num_gh + 1, ghs);
++
++ out:
++ gfs_holder_uninit(&ghs[num_gh]);
++
++ return (count) ? count : error;
++}
++
++/**
++ * gfs_read - Read bytes from a file
++ * @file: The file to read from
++ * @buf: The buffer to copy into
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ *
++ * Outputs: Offset - updated according to number of bytes read
++ *
++ * Returns: The number of bytes read, -EXXX on failure
++ */
++
++static ssize_t
++gfs_read(struct file *file, char *buf, size_t size, loff_t *offset)
++{
++ atomic_inc(&vfs2sdp(file->f_mapping->host->i_sb)->sd_ops_file);
++
++ if (file->f_flags & O_DIRECT)
++ return walk_vm(file, buf, size, offset, do_read_direct);
++ else
++ return walk_vm(file, buf, size, offset, do_read_buf);
++}
++
++/**
++ * grope_mapping - feel up a mapping that needs to be written
++ * @buf: the start of the memory to be written
++ * @size: the size of the memory to be written
++ *
++ * We do this after acquiring the locks on the mapping,
++ * but before starting the write transaction. We need to make
++ * sure that we don't cause recursive transactions if blocks
++ * need to be allocated to the file backing the mapping.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++grope_mapping(char *buf, size_t size)
++{
++ unsigned long start = (unsigned long)buf;
++ unsigned long stop = start + size;
++ char c;
++
++ while (start < stop) {
++ if (copy_from_user(&c, (char *)start, 1))
++ return -EFAULT;
++
++ start += PAGE_CACHE_SIZE;
++ start &= PAGE_CACHE_MASK;
++ }
++
++ return 0;
++}
++
++/**
++ * do_write_direct_alloc - Write bytes to a file
++ * @file: The file to write to
++ * @buf: The buffer to copy from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -EXXX on failure
++ */
++
++static ssize_t
++do_write_direct_alloc(struct file *file, char *buf, size_t size, loff_t *offset)
++{
++ struct inode *inode = file->f_mapping->host;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = NULL;
++ struct iovec local_iov = { .iov_base = buf, .iov_len = size };
++ struct buffer_head *dibh;
++ unsigned int data_blocks, ind_blocks;
++ ssize_t count;
++ int error;
++
++ gfs_write_calc_reserv(ip, size, &data_blocks, &ind_blocks);
++
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail;
++
++ error = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ al->al_requested_meta = ind_blocks;
++ al->al_requested_data = data_blocks;
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ All blocks for a RG bitmap, whatever indirect blocks we
++ need, a modified dinode, and a quota change. */
++
++ error = gfs_trans_begin(sdp,
++ 1 + al->al_rgd->rd_ri.ri_length + ind_blocks,
++ 1);
++ if (error)
++ goto fail_ipres;
++
++ if ((ip->i_di.di_mode & (S_ISUID | S_ISGID)) && !capable(CAP_FSETID)) {
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ ip->i_di.di_mode &= (ip->i_di.di_mode & S_IXGRP) ? (~(S_ISUID | S_ISGID)) : (~S_ISUID);
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++ }
++
++ if (gfs_is_stuffed(ip)) {
++ error = gfs_unstuff_dinode(ip, gfs_unstuffer_sync, NULL);
++ if (error)
++ goto fail_end_trans;
++ }
++
++ count = generic_file_write_nolock(file, &local_iov, 1, offset);
++ if (count < 0) {
++ error = count;
++ goto fail_end_trans;
++ }
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ if (ip->i_di.di_size < inode->i_size)
++ ip->i_di.di_size = inode->i_size;
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++
++ if (file->f_flags & O_SYNC)
++ gfs_log_flush_glock(ip->i_gl);
++
++ gfs_inplace_release(ip);
++ gfs_quota_unlock_m(ip);
++ gfs_alloc_put(ip);
++
++ return count;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ gfs_inplace_release(ip);
++
++ fail_gunlock_q:
++ gfs_quota_unlock_m(ip);
++
++ fail:
++ gfs_alloc_put(ip);
++
++ return error;
++}
++
++/**
++ * do_write_direct - Write bytes to a file
++ * @file: The file to write to
++ * @buf: The buffer to copy from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ * @num_gh: The number of other locks we need to do the write
++ * @ghs: the locks we need plus one for our lock
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -EXXX on failure
++ */
++
++static ssize_t
++do_write_direct(struct file *file, char *buf, size_t size, loff_t *offset,
++ unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_file *fp = vf2fp(file);
++ unsigned int state = LM_ST_DEFERRED;
++ int alloc_required;
++ unsigned int x;
++ size_t s;
++ ssize_t count = 0;
++ int error;
++
++ if (test_bit(GFF_DID_DIRECT_ALLOC, &fp->f_flags))
++ state = LM_ST_EXCLUSIVE;
++ else
++ for (x = 0; x < num_gh; x++)
++ if (ghs[x].gh_gl == ip->i_gl) {
++ state = LM_ST_EXCLUSIVE;
++ break;
++ }
++
++ restart:
++ gfs_holder_init(ip->i_gl, state, 0, &ghs[num_gh]);
++
++ error = gfs_glock_nq_m(num_gh + 1, ghs);
++ if (error)
++ goto out;
++
++ error = -EINVAL;
++ if (gfs_is_jdata(ip))
++ goto out_gunlock;
++
++ if (num_gh) {
++ error = grope_mapping(buf, size);
++ if (error)
++ goto out_gunlock;
++ }
++
++ if (file->f_flags & O_APPEND)
++ *offset = ip->i_di.di_size;
++
++ if (!(file->f_flags & O_LARGEFILE)) {
++ error = -EFBIG;
++ if (*offset >= 0x7FFFFFFFull)
++ goto out_gunlock;
++ if (*offset + size > 0x7FFFFFFFull)
++ size = 0x7FFFFFFFull - *offset;
++ }
++
++ if (gfs_is_stuffed(ip) ||
++ *offset + size > ip->i_di.di_size ||
++ ((ip->i_di.di_mode & (S_ISUID | S_ISGID)) && !capable(CAP_FSETID)))
++ alloc_required = TRUE;
++ else {
++ error = gfs_write_alloc_required(ip, *offset, size,
++ &alloc_required);
++ if (error)
++ goto out_gunlock;
++ }
++
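++	/* Allocation requires an exclusive glock; drop all of the locks
++	   and retry the whole sequence with the stronger state. */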
++ if (alloc_required && state != LM_ST_EXCLUSIVE) {
++ gfs_glock_dq_m(num_gh + 1, ghs);
++ gfs_holder_uninit(&ghs[num_gh]);
++ state = LM_ST_EXCLUSIVE;
++ goto restart;
++ }
++
++ if (alloc_required) {
++ set_bit(GFF_DID_DIRECT_ALLOC, &fp->f_flags);
++
++ while (size) {
++ s = sdp->sd_tune.gt_max_atomic_write;
++ if (s > size)
++ s = size;
++
++ error = do_write_direct_alloc(file, buf, s, offset);
++ if (error < 0)
++ goto out_gunlock;
++
++ buf += error;
++ size -= error;
++ count += error;
++ }
++ } else {
++ struct iovec local_iov = { .iov_base = buf, .iov_len = size };
++ struct gfs_holder t_gh;
++
++ clear_bit(GFF_DID_DIRECT_ALLOC, &fp->f_flags);
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
++ if (error)
++ goto out_gunlock;
++
++ count = generic_file_write_nolock(file, &local_iov, 1, offset);
++
++ gfs_glock_dq_uninit(&t_gh);
++ }
++
++ error = 0;
++
++ out_gunlock:
++ gfs_glock_dq_m(num_gh + 1, ghs);
++
++ out:
++ gfs_holder_uninit(&ghs[num_gh]);
++
++ return (count) ? count : error;
++}
++
++/**
++ * do_do_write_buf - Write bytes to a file
++ * @file: The file to write to
++ * @buf: The buffer to copy from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -EXXX on failure
++ */
++
++static ssize_t
++do_do_write_buf(struct file *file, char *buf, size_t size, loff_t *offset)
++{
++ struct inode *inode = file->f_mapping->host;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = NULL;
++ struct buffer_head *dibh;
++ unsigned int data_blocks, ind_blocks;
++ int alloc_required, journaled;
++ ssize_t count;
++ int error;
++
++ journaled = gfs_is_jdata(ip);
++
++ gfs_write_calc_reserv(ip, size, &data_blocks, &ind_blocks);
++
++ error = gfs_write_alloc_required(ip, *offset, size, &alloc_required);
++ if (error)
++ return error;
++
++ if (alloc_required) {
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail;
++
++ error = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ if (journaled)
++ al->al_requested_meta = ind_blocks + data_blocks;
++ else {
++ al->al_requested_meta = ind_blocks;
++ al->al_requested_data = data_blocks;
++ }
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ All blocks for a RG bitmap, whatever indirect blocks we
++ need, a modified dinode, and a quota change. */
++
++ error = gfs_trans_begin(sdp,
++ 1 + al->al_rgd->rd_ri.ri_length +
++ ind_blocks +
++ ((journaled) ? data_blocks : 0), 1);
++ if (error)
++ goto fail_ipres;
++ } else {
++ /* Trans may require:
++ A modified dinode. */
++
++ error = gfs_trans_begin(sdp,
++ 1 + ((journaled) ? data_blocks : 0), 0);
++ if (error)
++ goto fail_ipres;
++ }
++
++ if ((ip->i_di.di_mode & (S_ISUID | S_ISGID)) && !capable(CAP_FSETID)) {
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ ip->i_di.di_mode &= (ip->i_di.di_mode & S_IXGRP) ? (~(S_ISUID | S_ISGID)) : (~S_ISUID);
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++ }
++
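++	/* Journaled data, and small writes that still fit in the stuffed
++	   dinode of an unmapped file, go through gfs_writei(); everything
++	   else goes through the page cache. */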
++ if (journaled ||
++ (gfs_is_stuffed(ip) && !test_bit(GIF_PAGED, &ip->i_flags) &&
++ *offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode))) {
++
++ count = gfs_writei(ip, buf, *offset, size, gfs_copy_from_user);
++ if (count < 0) {
++ error = count;
++ goto fail_end_trans;
++ }
++
++ *offset += count;
++ } else {
++ struct iovec local_iov = { .iov_base = buf, .iov_len = size };
++
++ count = generic_file_write_nolock(file, &local_iov, 1, offset);
++ if (count < 0) {
++ error = count;
++ goto fail_end_trans;
++ }
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ if (ip->i_di.di_size < inode->i_size)
++ ip->i_di.di_size = inode->i_size;
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++ }
++
++ gfs_trans_end(sdp);
++
++ if (file->f_flags & O_SYNC)
++ gfs_log_flush_glock(ip->i_gl);
++
++ if (alloc_required) {
++ GFS_ASSERT_INODE(count != size ||
++ al->al_alloced_meta ||
++ al->al_alloced_data, ip,);
++ gfs_inplace_release(ip);
++ gfs_quota_unlock_m(ip);
++ gfs_alloc_put(ip);
++ }
++
++ return count;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ if (alloc_required)
++ gfs_inplace_release(ip);
++
++ fail_gunlock_q:
++ if (alloc_required)
++ gfs_quota_unlock_m(ip);
++
++ fail:
++ if (alloc_required)
++ gfs_alloc_put(ip);
++
++ return error;
++}
++
++/**
++ * do_write_buf - Write bytes to a file
++ * @file: The file to write to
++ * @buf: The buffer to copy from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ * @num_gh: The number of other locks we need to do the write
++ * @ghs: the locks we need plus one for our lock
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -EXXX on failure
++ */
++
++static ssize_t
++do_write_buf(struct file *file,
++ char *buf, size_t size, loff_t *offset,
++ unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ size_t s;
++ ssize_t count = 0;
++ int error;
++
++ gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[num_gh]);
++
++ error = gfs_glock_nq_m(num_gh + 1, ghs);
++ if (error)
++ goto out;
++
++ if (num_gh) {
++ error = grope_mapping(buf, size);
++ if (error)
++ goto out_gunlock;
++ }
++
++ if (file->f_flags & O_APPEND)
++ *offset = ip->i_di.di_size;
++
++ if (!(file->f_flags & O_LARGEFILE)) {
++ error = -EFBIG;
++ if (*offset >= 0x7FFFFFFFull)
++ goto out_gunlock;
++ if (*offset + size > 0x7FFFFFFFull)
++ size = 0x7FFFFFFFull - *offset;
++ }
++
++ while (size) {
++ s = sdp->sd_tune.gt_max_atomic_write;
++ if (s > size)
++ s = size;
++
++ error = do_do_write_buf(file, buf, s, offset);
++ if (error < 0)
++ goto out_gunlock;
++
++ buf += error;
++ size -= error;
++ count += error;
++ }
++
++ error = 0;
++
++ out_gunlock:
++ gfs_glock_dq_m(num_gh + 1, ghs);
++
++ out:
++ gfs_holder_uninit(&ghs[num_gh]);
++
++ return (count) ? count : error;
++}
++
++/**
++ * gfs_write - Write bytes to a file
++ * @file: The file to write to
++ * @buf: The buffer to copy from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -EXXX on failure
++ */
++
++static ssize_t
++gfs_write(struct file *file, const char *buf, size_t size, loff_t *offset)
++{
++ struct inode *inode = file->f_mapping->host;
++ ssize_t count;
++
++ atomic_inc(&vfs2sdp(inode->i_sb)->sd_ops_file);
++
++ if (*offset < 0)
++ return -EINVAL;
++ if (!access_ok(VERIFY_READ, buf, size))
++ return -EFAULT;
++
++ down(&inode->i_sem);
++ if (file->f_flags & O_DIRECT)
++ count = walk_vm(file, (char *)buf, size, offset, do_write_direct);
++ else
++ count = walk_vm(file, (char *)buf, size, offset, do_write_buf);
++ up(&inode->i_sem);
++
++ return count;
++}
++
++/**
++ * filldir_reg_func - Report a directory entry to the caller of gfs_dir_read()
++ * @opaque: opaque data used by the function
++ * @name: the name of the directory entry
++ * @length: the length of the name
++ * @offset: the entry's offset in the directory
++ * @inum: the inode number the entry points to
++ * @type: the type of inode the entry points to
++ *
++ * Returns: 0 on success, 1 if buffer full
++ */
++
++static int
++filldir_reg_func(void *opaque,
++ const char *name, unsigned int length,
++ uint64_t offset,
++ struct gfs_inum *inum, unsigned int type)
++{
++ struct filldir_reg *fdr = (struct filldir_reg *)opaque;
++ struct gfs_sbd *sdp = fdr->fdr_sbd;
++ unsigned int vfs_type;
++ int error;
++
++ switch (type) {
++ case GFS_FILE_NON:
++ vfs_type = DT_UNKNOWN;
++ break;
++ case GFS_FILE_REG:
++ vfs_type = DT_REG;
++ break;
++ case GFS_FILE_DIR:
++ vfs_type = DT_DIR;
++ break;
++ case GFS_FILE_LNK:
++ vfs_type = DT_LNK;
++ break;
++ case GFS_FILE_BLK:
++ vfs_type = DT_BLK;
++ break;
++ case GFS_FILE_CHR:
++ vfs_type = DT_CHR;
++ break;
++ case GFS_FILE_FIFO:
++ vfs_type = DT_FIFO;
++ break;
++ case GFS_FILE_SOCK:
++ vfs_type = DT_SOCK;
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,
++ printk("type = %u\n", type););
++ }
++
++ error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
++ inum->no_formal_ino, vfs_type);
++ if (error)
++ return 1;
++
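++	/* Prefetch each entry's inode and iopen glocks (skipping ".")
++	   so later lookups and stats don't stall on the lock manager. */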
++ if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
++ gfs_glock_prefetch_num(sdp,
++ inum->no_formal_ino, &gfs_inode_glops,
++ LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
++ gfs_glock_prefetch_num(sdp,
++ inum->no_addr, &gfs_iopen_glops,
++ LM_ST_SHARED, LM_FLAG_TRY);
++ }
++
++ return 0;
++}
++
++/**
++ * readdir_reg - Read directory entries from a directory
++ * @file: The directory to read from
++ * @dirent: Buffer for dirents
++ * @filldir: Function used to do the copying
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++readdir_reg(struct file *file, void *dirent, filldir_t filldir)
++{
++ struct gfs_inode *dip = vn2ip(file->f_mapping->host);
++ struct filldir_reg fdr;
++ struct gfs_holder d_gh;
++ uint64_t offset = file->f_pos;
++ int error;
++
++ fdr.fdr_sbd = dip->i_sbd;
++ fdr.fdr_prefetch = GFS_ASYNC_LM(dip->i_sbd);
++ fdr.fdr_filldir = filldir;
++ fdr.fdr_opaque = dirent;
++
++ gfs_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
++ error = gfs_glock_nq_atime(&d_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++
++ error = gfs_dir_read(dip, &offset, &fdr, filldir_reg_func);
++
++ gfs_glock_dq_uninit(&d_gh);
++
++ file->f_pos = offset;
++
++ return error;
++}
++
++/**
++ * filldir_bad_func - Report a directory entry to the caller of gfs_dir_read()
++ * @opaque: opaque data used by the function
++ * @name: the name of the directory entry
++ * @length: the length of the name
++ * @offset: the entry's offset in the directory
++ * @inum: the inode number the entry points to
++ * @type: the type of inode the entry points to
++ *
++ * Returns: 0 on success, 1 if buffer full
++ */
++
++static int
++filldir_bad_func(void *opaque,
++ const char *name, unsigned int length,
++ uint64_t offset,
++ struct gfs_inum *inum, unsigned int type)
++{
++ struct filldir_bad *fdb = (struct filldir_bad *)opaque;
++ struct gfs_sbd *sdp = fdb->fdb_sbd;
++ struct filldir_bad_entry *fbe;
++
++ if (fdb->fdb_entry_off == fdb->fdb_entry_num ||
++ fdb->fdb_name_off + length > fdb->fdb_name_size)
++ return 1;
++
++ fbe = &fdb->fdb_entry[fdb->fdb_entry_off];
++ fbe->fbe_name = fdb->fdb_name + fdb->fdb_name_off;
++ memcpy(fbe->fbe_name, name, length);
++ fbe->fbe_length = length;
++ fbe->fbe_offset = offset;
++ fbe->fbe_inum = *inum;
++ fbe->fbe_type = type;
++
++ fdb->fdb_entry_off++;
++ fdb->fdb_name_off += length;
++
++ if (fdb->fdb_prefetch && !(length == 1 && *name == '.')) {
++ gfs_glock_prefetch_num(sdp,
++ inum->no_formal_ino, &gfs_inode_glops,
++ LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
++ gfs_glock_prefetch_num(sdp,
++ inum->no_addr, &gfs_iopen_glops,
++ LM_ST_SHARED, LM_FLAG_TRY);
++ }
++
++ return 0;
++}
++
++/**
++ * readdir_bad - Read directory entries from a directory
++ * @file: The directory to read from
++ * @dirent: Buffer for dirents
++ * @filldir: Function used to do the copying
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++readdir_bad(struct file *file, void *dirent, filldir_t filldir)
++{
++ struct gfs_inode *dip = vn2ip(file->f_mapping->host);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct filldir_reg fdr;
++ unsigned int entries, size;
++ struct filldir_bad *fdb;
++ struct gfs_holder d_gh;
++ uint64_t offset = file->f_pos;
++ unsigned int x;
++ struct filldir_bad_entry *fbe;
++ int error;
++
++ entries = sdp->sd_tune.gt_entries_per_readdir;
++ size = sizeof(struct filldir_bad) +
++ entries * (sizeof(struct filldir_bad_entry) + GFS_FAST_NAME_SIZE);
++
++ fdb = gmalloc(size);
++ memset(fdb, 0, size);
++
++ fdb->fdb_sbd = sdp;
++ fdb->fdb_prefetch = GFS_ASYNC_LM(sdp);
++ fdb->fdb_entry = (struct filldir_bad_entry *)(fdb + 1);
++ fdb->fdb_entry_num = entries;
++ fdb->fdb_name = ((char *)fdb) + sizeof(struct filldir_bad) +
++ entries * sizeof(struct filldir_bad_entry);
++ fdb->fdb_name_size = entries * GFS_FAST_NAME_SIZE;
++
++ gfs_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
++ error = gfs_glock_nq_atime(&d_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ goto out;
++ }
++
++ error = gfs_dir_read(dip, &offset, fdb, filldir_bad_func);
++
++ gfs_glock_dq_uninit(&d_gh);
++
++ fdr.fdr_sbd = sdp;
++ fdr.fdr_prefetch = FALSE;
++ fdr.fdr_filldir = filldir;
++ fdr.fdr_opaque = dirent;
++
++ for (x = 0; x < fdb->fdb_entry_off; x++) {
++ fbe = &fdb->fdb_entry[x];
++
++ error = filldir_reg_func(&fdr,
++ fbe->fbe_name, fbe->fbe_length,
++ fbe->fbe_offset,
++ &fbe->fbe_inum, fbe->fbe_type);
++ if (error) {
++ file->f_pos = fbe->fbe_offset;
++ error = 0;
++ goto out;
++ }
++ }
++
++ file->f_pos = offset;
++
++ out:
++ kfree(fdb);
++
++ return error;
++}
++
++/**
++ * gfs_readdir - Read directory entries from a directory
++ * @file: The directory to read from
++ * @dirent: Buffer for dirents
++ * @filldir: Function used to do the copying
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_readdir(struct file *file, void *dirent, filldir_t filldir)
++{
++ int error;
++
++ atomic_inc(&vfs2sdp(file->f_mapping->host->i_sb)->sd_ops_file);
++
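++	/* nfsd's filldir may call back into the filesystem, which would
++	   deadlock on the directory glock held across gfs_dir_read(), so
++	   buffer the entries first and deliver them with the lock dropped. */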
++ if (strcmp(current->comm, "nfsd") != 0)
++ error = readdir_reg(file, dirent, filldir);
++ else
++ error = readdir_bad(file, dirent, filldir);
++
++ return error;
++}
++
++/**
++ * gfs_ioctl - do an ioctl on a file
++ * @inode: the inode
++ * @file: the file pointer
++ * @cmd: the ioctl command
++ * @arg: the argument
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_ioctl(struct inode *inode, struct file *file,
++ unsigned int cmd, unsigned long arg)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++ return gfs_ioctli(ip, cmd, (void *)arg);
++}
++
++/**
++ * gfs_open - open a file
++ * @inode: the inode to open
++ * @file: the struct file for this opening
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_open(struct inode *inode, struct file *file)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_holder i_gh;
++ struct gfs_file *fp;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++
++ fp = gmalloc(sizeof(struct gfs_file));
++ memset(fp, 0, sizeof(struct gfs_file));
++
++ init_MUTEX(&fp->f_fl_lock);
++
++ fp->f_inode = ip;
++ fp->f_vfile = file;
++
++ GFS_ASSERT_INODE(!vf2fp(file), ip,);
++ vf2fp(file) = fp;
++
++ if (ip->i_di.di_type == GFS_FILE_REG) {
++ error = gfs_glock_nq_init(ip->i_gl,
++ LM_ST_SHARED, LM_FLAG_ANY,
++ &i_gh);
++ if (error)
++ goto fail;
++
++ if (!(file->f_flags & O_LARGEFILE) &&
++ ip->i_di.di_size > 0x7FFFFFFFull) {
++ error = -EFBIG;
++ goto fail_gunlock;
++ }
++
++ /* If this is an exclusive create, make sure our gfs_create()
++ says we created the file. The O_EXCL flag isn't passed
++ to gfs_create(), so we have to check it here. */
++
++ if (file->f_flags & O_CREAT) {
++ if (ip->i_creat_task == current &&
++ ip->i_creat_pid == current->pid) {
++ ip->i_creat_task = NULL;
++ ip->i_creat_pid = 0;
++ } else if (file->f_flags & O_EXCL) {
++ error = -EEXIST;
++ goto fail_gunlock;
++ }
++ }
++
++ /* Listen to the Direct I/O flag */
++
++ if (ip->i_di.di_flags & GFS_DIF_DIRECTIO)
++ file->f_flags |= O_DIRECT;
++
++ /* Don't let the user open O_DIRECT on a jdata file */
++
++ if ((file->f_flags & O_DIRECT) && gfs_is_jdata(ip)) {
++ error = -EINVAL;
++ goto fail_gunlock;
++ }
++
++ gfs_glock_dq_uninit(&i_gh);
++ }
++
++ return 0;
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&i_gh);
++
++ fail:
++ vf2fp(file) = NULL;
++ kfree(fp);
++
++ return error;
++}
++
++/**
++ * gfs_close - called to close a struct file
++ * @inode: the inode the struct file belongs to
++ * @file: the struct file being closed
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_close(struct inode *inode, struct file *file)
++{
++ struct gfs_file *fp;
++
++ atomic_inc(&vfs2sdp(inode->i_sb)->sd_ops_file);
++
++ fp = vf2fp(file);
++ vf2fp(file) = NULL;
++
++ GFS_ASSERT(fp,);
++
++ kfree(fp);
++
++ return 0;
++}
++
++/**
++ * gfs_fsync - sync the dirty data for a file (across the cluster)
++ * @file: the file to sync
++ * @dentry: the dentry that points to the inode to sync
++ * @datasync: TRUE to sync only the file's data, not its metadata
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_fsync(struct file *file, struct dentry *dentry, int datasync)
++{
++ struct gfs_inode *ip = vn2ip(dentry->d_inode);
++ struct gfs_holder i_gh;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (error)
++ return error;
++
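++	/* Journaled data is made safe by flushing the log; otherwise,
++	   marking the holder GL_SYNC should cause the dirty data to be
++	   flushed when the glock is released below. */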
++ if (gfs_is_jdata(ip))
++ gfs_log_flush_glock(ip->i_gl);
++ else
++ i_gh.gh_flags |= GL_SYNC;
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_lock - acquire/release a flock or posix lock on a file
++ * @file: the file pointer
++ * @cmd: either modify or retrieve lock state, possibly wait
++ * @fl: type and range of lock
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_lock(struct file *file, int cmd, struct file_lock *fl)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct lm_lockname name;
++ uint64_t start = fl->fl_start, end = fl->fl_end;
++ pid_t pid = fl->fl_pid;
++ int plock = (fl->fl_flags & FL_POSIX);
++ int flock = (fl->fl_flags & FL_FLOCK);
++ int get, set, wait, ex, sh, un;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_file);
++
++ if (sdp->sd_args.ar_localflocks)
++ return LOCK_USE_CLNT;
++
++ if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
++ return -ENOLCK;
++
++ if (!flock && !plock)
++ return -ENOLCK;
++
++ get = (IS_GETLK(cmd)) ? TRUE : FALSE;
++ set = (IS_SETLK(cmd)) ? TRUE : FALSE;
++ wait = (IS_SETLKW(cmd)) ? TRUE : FALSE;
++
++ if ((flock && (get || (!set && !wait))) ||
++ (plock && (!get && !set && !wait)))
++ return -EINVAL;
++
++ ex = (fl->fl_type == F_WRLCK) ? TRUE : FALSE;
++ sh = (fl->fl_type == F_RDLCK) ? TRUE : FALSE;
++ un = (fl->fl_type == F_UNLCK) ? TRUE : FALSE;
++
++ if (!ex && !sh && !un)
++ return -EINVAL;
++
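++	/* flocks are implemented within GFS itself; posix locks are
++	   handed to the lock module so they are visible cluster-wide. */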
++ if (flock) {
++ struct gfs_file *fp = vf2fp(file);
++ GFS_ASSERT(fp,);
++
++ if (un)
++ error = gfs_funlock(fp);
++ else
++ error = gfs_flock(fp, ex, wait);
++ } else {
++ name.ln_number = ip->i_num.no_formal_ino;
++ name.ln_type = LM_TYPE_PLOCK;
++ if (get) {
++ error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
++ sdp->sd_lockstruct.ls_lockspace,
++ &name, (unsigned long)fl->fl_owner,
++ &start, &end, &ex, (unsigned long*)&pid);
++ if (error < 0)
++ return error;
++
++ fl->fl_type = F_UNLCK;
++ if (!error)
++ return error;
++
++ fl->fl_start = start;
++ fl->fl_end = end;
++ fl->fl_pid = pid;
++ fl->fl_type = (ex) ? F_WRLCK : F_RDLCK;
++
++ error = 0;
++ } else if (un)
++ error = sdp->sd_lockstruct.ls_ops->lm_punlock(
++ sdp->sd_lockstruct.ls_lockspace,
++ &name, (unsigned long)fl->fl_owner,
++ start, end);
++ else
++ error = sdp->sd_lockstruct.ls_ops->lm_plock(
++ sdp->sd_lockstruct.ls_lockspace,
++ &name, (unsigned long)fl->fl_owner,
++ wait, ex, start, end);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_sendfile - Send bytes to a file or socket
++ * @in_file: The file to read from
++ * @offset: The beginning file offset
++ * @count: The amount of data
++ * @actor: The function that does the copying
++ * @target: The opaque destination passed to @actor
++ *
++ * Outputs: offset - updated according to number of bytes read
++ *
++ * Returns: The number of bytes sent, -EXXX on failure
++ */
++
++static ssize_t
++gfs_sendfile(struct file *in_file, loff_t *offset, size_t count, read_actor_t actor, void __user *target)
++{
++ struct gfs_inode *ip = vn2ip(in_file->f_mapping->host);
++ struct gfs_holder gh;
++ ssize_t retval;
++
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++
++ gfs_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
++
++ retval = gfs_glock_nq_atime(&gh);
++ if (retval)
++ goto out;
++
++ if (gfs_is_jdata(ip))
++ retval = -ENOSYS;
++ else
++ retval = generic_file_sendfile(in_file, offset, count, actor, target);
++
++ gfs_glock_dq(&gh);
++
++ out:
++ gfs_holder_uninit(&gh);
++
++ return retval;
++}
++
++/**
++ * gfs_mmap - set up a memory mapping of a file
++ * @file: The file to map
++ * @vma: The VMA which described the mapping
++ *
++ * Returns: 0 or error code
++ */
++
++static int
++gfs_mmap(struct file *file, struct vm_area_struct *vma)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ struct gfs_holder i_gh;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++
++ gfs_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
++ error = gfs_glock_nq_atime(&i_gh);
++ if (error) {
++ gfs_holder_uninit(&i_gh);
++ return error;
++ }
++
++ if (gfs_is_jdata(ip)) {
++ if (vma->vm_flags & VM_MAYSHARE)
++ error = -ENOSYS;
++ else
++ vma->vm_ops = &gfs_vm_ops_private;
++ } else {
++ /* This is VM_MAYWRITE instead of VM_WRITE because a call
++ to mprotect() can turn on VM_WRITE later. */
++
++ if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) == (VM_MAYSHARE | VM_MAYWRITE))
++ vma->vm_ops = &gfs_vm_ops_sharewrite;
++ else
++ vma->vm_ops = &gfs_vm_ops_private;
++ }
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++struct file_operations gfs_file_fops = {
++ .llseek = gfs_llseek,
++ .read = gfs_read,
++ .write = gfs_write,
++ .ioctl = gfs_ioctl,
++ .mmap = gfs_mmap,
++ .open = gfs_open,
++ .release = gfs_close,
++ .fsync = gfs_fsync,
++ .lock = gfs_lock,
++ .sendfile = gfs_sendfile,
++};
++
++struct file_operations gfs_dir_fops = {
++ .readdir = gfs_readdir,
++ .ioctl = gfs_ioctl,
++ .open = gfs_open,
++ .release = gfs_close,
++ .fsync = gfs_fsync,
++ .lock = gfs_lock,
++};
+diff -urN linux-orig/fs/gfs/ops_file.h linux-patched/fs/gfs/ops_file.h
+--- linux-orig/fs/gfs/ops_file.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_file.h 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_FILE_DOT_H__
++#define __OPS_FILE_DOT_H__
++
++extern struct file_operations gfs_file_fops;
++extern struct file_operations gfs_dir_fops;
++
++#endif /* __OPS_FILE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_fstype.c linux-patched/fs/gfs/ops_fstype.c
+--- linux-orig/fs/gfs/ops_fstype.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_fstype.c 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,626 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/vmalloc.h>
++#include <linux/blkdev.h>
++
++#include "gfs.h"
++#include "daemon.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "locking.h"
++#include "mount.h"
++#include "ops_export.h"
++#include "ops_fstype.h"
++#include "ops_super.h"
++#include "quota.h"
++#include "recovery.h"
++#include "rgrp.h"
++#include "super.h"
++#include "unlinked.h"
++
++/**
++ * fill_super - Read in the superblock and set up the VFS super_block
++ * @sb: The VFS superblock
++ * @data: Mount options
++ * @silent: Don't complain if it's not a GFS filesystem
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++fill_super(struct super_block *sb, void *data, int silent)
++{
++ struct gfs_sbd *sdp;
++ struct gfs_holder mount_gh, sb_gh, ji_gh;
++ struct inode *inode;
++ int super = TRUE, jindex = TRUE;
++ unsigned int x;
++ int error;
++
++ error = -ENOMEM;
++ sdp = vmalloc(sizeof(struct gfs_sbd));
++ if (!sdp)
++ goto fail;
++
++ memset(sdp, 0, sizeof(struct gfs_sbd));
++
++ vfs2sdp(sb) = sdp;
++ sdp->sd_vfs = sb;
++
++ /* Init rgrp variables */
++
++ INIT_LIST_HEAD(&sdp->sd_rglist);
++ init_MUTEX(&sdp->sd_rindex_lock);
++ INIT_LIST_HEAD(&sdp->sd_rg_mru_list);
++ spin_lock_init(&sdp->sd_rg_mru_lock);
++ INIT_LIST_HEAD(&sdp->sd_rg_recent);
++ spin_lock_init(&sdp->sd_rg_recent_lock);
++ spin_lock_init(&sdp->sd_rg_forward_lock);
++
++ for (x = 0; x < GFS_GL_HASH_SIZE; x++) {
++ sdp->sd_gl_hash[x].hb_lock = RW_LOCK_UNLOCKED;
++ INIT_LIST_HEAD(&sdp->sd_gl_hash[x].hb_list);
++ }
++
++ INIT_LIST_HEAD(&sdp->sd_reclaim_list);
++ spin_lock_init(&sdp->sd_reclaim_lock);
++ init_waitqueue_head(&sdp->sd_reclaim_wchan);
++
++ for (x = 0; x < GFS_MHC_HASH_SIZE; x++)
++ INIT_LIST_HEAD(&sdp->sd_mhc[x]);
++ INIT_LIST_HEAD(&sdp->sd_mhc_single);
++ spin_lock_init(&sdp->sd_mhc_lock);
++
++ for (x = 0; x < GFS_DEPEND_HASH_SIZE; x++)
++ INIT_LIST_HEAD(&sdp->sd_depend[x]);
++ spin_lock_init(&sdp->sd_depend_lock);
++
++ init_MUTEX(&sdp->sd_freeze_lock);
++
++ init_MUTEX(&sdp->sd_thread_lock);
++ init_completion(&sdp->sd_thread_completion);
++
++ spin_lock_init(&sdp->sd_log_seg_lock);
++ INIT_LIST_HEAD(&sdp->sd_log_seg_list);
++ init_waitqueue_head(&sdp->sd_log_seg_wait);
++ INIT_LIST_HEAD(&sdp->sd_log_ail);
++ INIT_LIST_HEAD(&sdp->sd_log_incore);
++ init_MUTEX(&sdp->sd_log_lock);
++ INIT_LIST_HEAD(&sdp->sd_unlinked_list);
++ spin_lock_init(&sdp->sd_unlinked_lock);
++ INIT_LIST_HEAD(&sdp->sd_quota_list);
++ spin_lock_init(&sdp->sd_quota_lock);
++
++ INIT_LIST_HEAD(&sdp->sd_dirty_j);
++ spin_lock_init(&sdp->sd_dirty_j_lock);
++
++ spin_lock_init(&sdp->sd_ail_lock);
++ INIT_LIST_HEAD(&sdp->sd_recovery_bufs);
++
++ gfs_init_tune_data(sdp);
++
++ error = gfs_make_args((char *)data, &sdp->sd_args);
++ if (error) {
++ printk("GFS: can't parse mount arguments\n");
++ goto fail_vfree;
++ }
++
++ /* Copy out mount flags */
++
++ if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
++ set_bit(SDF_NOATIME, &sdp->sd_flags);
++ if (sb->s_flags & MS_RDONLY)
++ set_bit(SDF_ROFS, &sdp->sd_flags);
++
++	/* Set up the virtual superblock */
++
++ sb->s_magic = GFS_MAGIC;
++ sb->s_op = &gfs_super_ops;
++ sb->s_export_op = &gfs_export_ops;
++ sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
++ sb->s_maxbytes = ~0ULL;
++
++ if (sdp->sd_args.ar_posixacls)
++ sb->s_flags |= MS_POSIXACL;
++
++ /* Set up the buffer cache and fill in some fake values
++ to allow us to read in the superblock. */
++
++ sdp->sd_sb.sb_bsize = bdev_hardsect_size(sb->s_bdev);
++ if (sdp->sd_sb.sb_bsize < GFS_BASIC_BLOCK)
++ sdp->sd_sb.sb_bsize = GFS_BASIC_BLOCK;
++ sdp->sd_sb.sb_bsize_shift = ffs(sdp->sd_sb.sb_bsize) - 1;
++ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS_BASIC_BLOCK_SHIFT;
++ sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
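++	/* Worked example, assuming the usual GFS_BASIC_BLOCK of 512 bytes
++	   (GFS_BASIC_BLOCK_SHIFT == 9): a device with 512-byte sectors
++	   gives sb_bsize = 512 and sb_bsize_shift = ffs(512) - 1 = 9, so
++	   sd_fsb2bb_shift = 0 and sd_fsb2bb = 1 -- one basic block per FS
++	   block until the real superblock is read in below. */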
++
++ GFS_ASSERT_SBD(sizeof(struct gfs_sb) <= sdp->sd_sb.sb_bsize, sdp,);
++
++ set_blocksize(sb->s_bdev, sdp->sd_sb.sb_bsize);
++ sb->s_blocksize = sdp->sd_sb.sb_bsize;
++ sb->s_blocksize_bits = sdp->sd_sb.sb_bsize_shift;
++
++ error = gfs_mount_lockproto(sdp, silent);
++ if (error)
++ goto fail_vfree;
++
++ printk("GFS: fsid=%s: Joined cluster. Now mounting FS...\n",
++ sdp->sd_fsname);
++
++ if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
++ !sdp->sd_args.ar_ignore_local_fs) {
++ /* Force local [p|f]locks */
++ sdp->sd_args.ar_localflocks = TRUE;
++
++ /* Force local read ahead and caching */
++ sdp->sd_args.ar_localcaching = TRUE;
++ }
++
++ /* Start up the scand thread */
++
++ error = kernel_thread(gfs_scand, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start scand thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_lockproto;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Start up the glockd thread */
++
++ for (sdp->sd_glockd_num = 0;
++ sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
++ sdp->sd_glockd_num++) {
++ error = kernel_thread(gfs_glockd, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start glockd thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_glockd;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++ }
++
++ error = gfs_glock_nq_num(sdp,
++ GFS_MOUNT_LOCK, &gfs_nondisk_glops,
++ LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
++ &mount_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't acquire mount glock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_glockd;
++ }
++
++ error = gfs_glock_nq_num(sdp,
++ GFS_LIVE_LOCK, &gfs_nondisk_glops,
++ LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT,
++ &sdp->sd_live_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't acquire live glock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_mount;
++ }
++
++ sdp->sd_live_gh.gh_owner = NULL;
++
++ error = gfs_glock_nq_num(sdp,
++ GFS_SB_LOCK, &gfs_meta_glops,
++ (sdp->sd_args.ar_upgrade) ? LM_ST_EXCLUSIVE : LM_ST_SHARED,
++ 0, &sb_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't acquire superblock glock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_live;
++ }
++
++ error = gfs_read_sb(sdp, sb_gh.gh_gl, silent);
++ if (error) {
++ printk("GFS: fsid=%s: can't read superblock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_sb;
++ }
++
++ /* Set up the buffer cache and SB for real */
++
++ error = -EINVAL;
++ if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
++ printk("GFS: fsid=%s: FS block size (%u) is too small for device block size (%u)\n",
++ sdp->sd_fsname, sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
++ goto fail_gunlock_sb;
++ }
++ if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
++ printk("GFS: fsid=%s: FS block size (%u) is too big for machine page size (%u)\n",
++ sdp->sd_fsname, sdp->sd_sb.sb_bsize,
++ (unsigned int)PAGE_SIZE);
++ goto fail_gunlock_sb;
++ }
++
++ /* Get rid of buffers from the original block size */
++ sb_gh.gh_gl->gl_ops->go_inval(sb_gh.gh_gl, DIO_METADATA | DIO_DATA);
++ sb_gh.gh_gl->gl_aspace->i_blkbits = sdp->sd_sb.sb_bsize_shift;
++
++ set_blocksize(sb->s_bdev, sdp->sd_sb.sb_bsize);
++ sb->s_blocksize = sdp->sd_sb.sb_bsize;
++ sb->s_blocksize_bits = sdp->sd_sb.sb_bsize_shift;
++
++ /* Read in journal index inode */
++
++ error = gfs_get_jiinode(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't get journal index inode: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_sb;
++ }
++
++ init_MUTEX(&sdp->sd_jindex_lock);
++
++ /* Get a handle on the transaction glock */
++
++ error = gfs_glock_get(sdp, GFS_TRANS_LOCK, &gfs_trans_glops,
++ CREATE, &sdp->sd_trans_gl);
++ if (error)
++ goto fail_ji_free;
++ set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
++
++ /* Upgrade version numbers if we need to */
++
++ if (sdp->sd_args.ar_upgrade) {
++ error = gfs_do_upgrade(sdp, sb_gh.gh_gl);
++ if (error)
++ goto fail_trans_gl;
++ }
++
++ /* Load in the journal index */
++
++ error = gfs_jindex_hold(sdp, &ji_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't read journal index: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_trans_gl;
++ }
++
++ error = -EINVAL;
++ if (sdp->sd_lockstruct.ls_jid >= sdp->sd_journals) {
++ printk("GFS: fsid=%s: can't mount journal #%u\n",
++ sdp->sd_fsname, sdp->sd_lockstruct.ls_jid);
++ printk("GFS: fsid=%s: there are only %u journals (0 - %u)\n",
++ sdp->sd_fsname, sdp->sd_journals, sdp->sd_journals - 1);
++ goto fail_gunlock_ji;
++ }
++ sdp->sd_jdesc = sdp->sd_jindex[sdp->sd_lockstruct.ls_jid];
++ sdp->sd_log_seg_free = sdp->sd_jdesc.ji_nsegment - 1;
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_jdesc.ji_addr, &gfs_meta_glops,
++ LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
++ &sdp->sd_journal_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't acquire the journal glock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_ji;
++ }
++
++ if (sdp->sd_lockstruct.ls_first) {
++ for (x = 0; x < sdp->sd_journals; x++) {
++ error = gfs_recover_journal(sdp,
++ x, sdp->sd_jindex + x,
++ TRUE);
++ if (error) {
++ printk("GFS: fsid=%s: error recovering journal %u: %d\n",
++ sdp->sd_fsname, x, error);
++ goto fail_gunlock_journal;
++ }
++ }
++
++ sdp->sd_lockstruct.ls_ops->lm_others_may_mount(sdp->sd_lockstruct.ls_lockspace);
++ sdp->sd_lockstruct.ls_first = FALSE;
++ } else {
++ error = gfs_recover_journal(sdp,
++ sdp->sd_lockstruct.ls_jid, &sdp->sd_jdesc,
++ TRUE);
++ if (error) {
++ printk("GFS: fsid=%s: error recovering my journal: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_journal;
++ }
++ }
++
++ gfs_glock_dq_uninit(&ji_gh);
++ jindex = FALSE;
++
++ /* Disown my Journal glock */
++
++ sdp->sd_journal_gh.gh_owner = NULL;
++
++ /* Drop our cache and reread all the things we read before the replay. */
++
++ error = gfs_read_sb(sdp, sb_gh.gh_gl, FALSE);
++ if (error) {
++ printk("GFS: fsid=%s: can't read superblock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_journal;
++ }
++
++ gfs_glock_force_drop(sdp->sd_jiinode->i_gl);
++
++ error = gfs_jindex_hold(sdp, &ji_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't read journal index: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_journal;
++ }
++ gfs_glock_dq_uninit(&ji_gh);
++
++ /* Make the FS read/write */
++
++ if (!test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ error = gfs_make_fs_rw(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't make FS RW: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_journal;
++ }
++ }
++
++ /* Start up the recover thread */
++
++ error = kernel_thread(gfs_recoverd, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start recoverd thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_recover_dump;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Read in the resource index inode */
++
++ error = gfs_get_riinode(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't get resource index inode: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_recoverd;
++ }
++
++ /* Get the root inode */
++
++ error = gfs_get_rootinode(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't read in root inode: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_ri_free;
++ }
++
++ /* Read in the quota inode */
++
++ error = gfs_get_qinode(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't get quota file inode: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_root_free;
++ }
++
++ /* Read in the license inode */
++
++ error = gfs_get_linode(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't get license file inode: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_qi_free;
++ }
++
++ /* We're through with the superblock lock */
++
++ gfs_glock_dq_uninit(&sb_gh);
++ super = FALSE;
++
++ /* Get the inode/dentry */
++
++ inode = gfs_iget(sdp->sd_rooti, CREATE);
++ if (!inode) {
++ printk("GFS: fsid=%s: can't get root inode\n", sdp->sd_fsname);
++ error = -ENOMEM;
++ goto fail_li_free;
++ }
++
++ sb->s_root = d_alloc_root(inode);
++ if (!sb->s_root) {
++ iput(inode);
++ printk("GFS: fsid=%s: can't get root dentry\n", sdp->sd_fsname);
++ error = -ENOMEM;
++ goto fail_li_free;
++ }
++
++ /* Start up the logd thread */
++
++ sdp->sd_jindex_refresh_time = jiffies;
++
++ error = kernel_thread(gfs_logd, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start logd thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_dput;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Start up the quotad thread */
++
++ error = kernel_thread(gfs_quotad, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start quotad thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_logd;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Start up the inoded thread */
++
++ error = kernel_thread(gfs_inoded, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start inoded thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_quotad;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Get a handle on the rename lock */
++
++ error = gfs_glock_get(sdp, GFS_RENAME_LOCK, &gfs_nondisk_glops,
++ CREATE, &sdp->sd_rename_gl);
++ if (error)
++ goto fail_inoded;
++
++ gfs_glock_dq_uninit(&mount_gh);
++
++ return 0;
++
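++	/* Error unwinding: each fail_* label below undoes one of the setup
++	   steps above and falls through to the next label, so teardown
++	   happens in exactly the reverse order of setup. */
++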
++ fail_inoded:
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_INODED_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_inoded_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ fail_quotad:
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_QUOTAD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_quotad_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ fail_logd:
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_LOGD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_logd_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ fail_dput:
++ dput(sb->s_root);
++
++ fail_li_free:
++ gfs_inode_put(sdp->sd_linode);
++
++ fail_qi_free:
++ gfs_inode_put(sdp->sd_qinode);
++
++ fail_root_free:
++ gfs_inode_put(sdp->sd_rooti);
++
++ fail_ri_free:
++ gfs_inode_put(sdp->sd_riinode);
++ gfs_clear_rgrpd(sdp);
++
++ fail_recoverd:
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_RECOVERD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_recoverd_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ fail_recover_dump:
++ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
++ gfs_unlinked_cleanup(sdp);
++ gfs_quota_cleanup(sdp);
++
++ fail_gunlock_journal:
++ gfs_glock_dq_uninit(&sdp->sd_journal_gh);
++
++ fail_gunlock_ji:
++ if (jindex)
++ gfs_glock_dq_uninit(&ji_gh);
++
++ fail_trans_gl:
++ gfs_glock_put(sdp->sd_trans_gl);
++
++ fail_ji_free:
++ gfs_inode_put(sdp->sd_jiinode);
++ gfs_clear_journals(sdp);
++
++ fail_gunlock_sb:
++ if (super)
++ gfs_glock_dq_uninit(&sb_gh);
++
++ fail_gunlock_live:
++ gfs_glock_dq_uninit(&sdp->sd_live_gh);
++
++ fail_gunlock_mount:
++ gfs_glock_dq_uninit(&mount_gh);
++
++ fail_glockd:
++ clear_bit(SDF_GLOCKD_RUN, &sdp->sd_flags);
++ wake_up(&sdp->sd_reclaim_wchan);
++ while (sdp->sd_glockd_num--)
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_SCAND_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_scand_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ fail_lockproto:
++ gfs_gl_hash_clear(sdp, TRUE);
++ gfs_unmount_lockproto(sdp);
++ gfs_clear_dirty_j(sdp);
++ while (invalidate_inodes(sb))
++ yield();
++
++ fail_vfree:
++ vfree(sdp);
++
++ fail:
++ vfs2sdp(sb) = NULL;
++ return error;
++}
++
++/**
++ * gfs_get_sb - Get a superblock for the "gfs" filesystem type
++ * @fs_type: describes the filesystem ("gfs")
++ * @flags: mount flags
++ * @dev_name: path to the block device to mount
++ * @data: mount options string
++ *
++ * Returns: the new superblock
++ */
++
++struct super_block *gfs_get_sb(struct file_system_type *fs_type, int flags,
++ const char *dev_name, void *data)
++{
++ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super);
++}
++
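++/*
++ * Mount-time flow, for reference: mount(2) with type "gfs" leads the VFS
++ * to call gfs_get_sb(); get_sb_bdev() opens the block device and calls
++ * fill_super() above on a fresh super_block. From userspace that is
++ * simply (options shown are illustrative):
++ *
++ *	mount -t gfs -o lockproto=lock_dlm /dev/sda1 /mnt/gfs
++ */
++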
++/**
++ * gfs_kill_sb - Tear down a superblock at unmount time
++ * @sb: the superblock being killed
++ *
++ */
++
++void gfs_kill_sb(struct super_block *sb)
++{
++ kill_block_super(sb);
++}
++
++struct file_system_type gfs_fs_type = {
++ .name = "gfs",
++ .fs_flags = FS_REQUIRES_DEV /*| FS_REVAL_DOT*/,
++ .get_sb = gfs_get_sb,
++ .kill_sb = gfs_kill_sb,
++ .owner = THIS_MODULE,
++};
+diff -urN linux-orig/fs/gfs/ops_fstype.h linux-patched/fs/gfs/ops_fstype.h
+--- linux-orig/fs/gfs/ops_fstype.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_fstype.h 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_FSTYPE_DOT_H__
++#define __OPS_FSTYPE_DOT_H__
++
++extern struct file_system_type gfs_fs_type;
++
++#endif /* __OPS_FSTYPE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_inode.c linux-patched/fs/gfs/ops_inode.c
+--- linux-orig/fs/gfs/ops_inode.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_inode.c 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,1723 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/namei.h>
++#include <linux/utsname.h>
++#include <asm/uaccess.h>
++#include <linux/xattr.h>
++#include <linux/mm.h>
++#include <linux/posix_acl.h>
++
++#include "gfs.h"
++#include "acl.h"
++#include "bmap.h"
++#include "dio.h"
++#include "dir.h"
++#include "eattr.h"
++#include "glock.h"
++#include "inode.h"
++#include "ops_dentry.h"
++#include "ops_inode.h"
++#include "page.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++#include "unlinked.h"
++
++/**
++ * gfs_create - Create a file
++ * @dir: The directory in which to create the file
++ * @dentry: The dentry of the new file
++ * @mode: The mode of the new file
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_create(struct inode *dir, struct dentry *dentry,
++ int mode, struct nameidata *nd)
++{
++ struct gfs_inode *dip = vn2ip(dir), *ip;
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_holder d_gh, i_gh;
++ struct inode *inode;
++ int new = TRUE;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++
++ for (;;) {
++ error = gfs_createi(&d_gh, &dentry->d_name,
++ GFS_FILE_REG, mode,
++ &i_gh);
++ if (!error)
++ break;
++ else if (error != -EEXIST) {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++
++ error = gfs_lookupi(&d_gh, &dentry->d_name,
++ FALSE, &i_gh);
++ if (!error) {
++ if (i_gh.gh_gl) {
++ new = FALSE;
++ break;
++ }
++ } else {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++ }
++
++ GFS_ASSERT_SBD(i_gh.gh_gl, sdp,);
++ ip = gl2ip(i_gh.gh_gl);
++
++ if (new) {
++ gfs_trans_end(sdp);
++ if (dip->i_alloc->al_rgd)
++ gfs_inplace_release(dip);
++ gfs_quota_unlock_m(dip);
++ gfs_unlinked_unlock(sdp, dip->i_alloc->al_ul);
++ gfs_alloc_put(dip);
++
++ ip->i_creat_task = current;
++ ip->i_creat_pid = current->pid;
++ }
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return -ENOMEM;
++
++ d_instantiate(dentry, inode);
++ if (new)
++ mark_inode_dirty(inode);
++
++ return 0;
++}
++
++/**
++ * lookup_cdpn_sub_at - Maybe look up a Context Dependent Pathname
++ * @sdp: the filesystem
++ * @dentry: the original dentry to lookup
++ * @new_dentry: the new dentry, if this was a substitutable path.
++ *
++ */
++
++static void
++lookup_cdpn_sub_at(struct gfs_sbd *sdp, struct dentry *dentry,
++ struct dentry **new_dentry)
++{
++ struct dentry *parent = dget_parent(dentry);
++ char *buf = gmalloc(2 * __NEW_UTS_LEN + 2);
++
++ if (gfs_filecmp(&dentry->d_name, "@hostname", 9))
++ *new_dentry = lookup_one_len(system_utsname.nodename,
++ parent,
++ strlen(system_utsname.nodename));
++ else if (gfs_filecmp(&dentry->d_name, "@mach", 5))
++ *new_dentry = lookup_one_len(system_utsname.machine,
++ parent,
++ strlen(system_utsname.machine));
++ else if (gfs_filecmp(&dentry->d_name, "@os", 3))
++ *new_dentry = lookup_one_len(system_utsname.sysname,
++ parent,
++ strlen(system_utsname.sysname));
++ else if (gfs_filecmp(&dentry->d_name, "@uid", 4))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u", current->fsuid));
++ else if (gfs_filecmp(&dentry->d_name, "@gid", 4))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u", current->fsgid));
++ else if (gfs_filecmp(&dentry->d_name, "@sys", 4))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%s_%s",
++ system_utsname.machine,
++ system_utsname.sysname));
++ else if (gfs_filecmp(&dentry->d_name, "@jid", 4))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u",
++ sdp->sd_lockstruct.ls_jid));
++
++ kfree(buf);
++ dput(parent);
++}
++
++/**
++ * lookup_cdpn_sub_brace - Maybe look up a Context Dependent Pathname
++ * @sdp: the filesystem
++ * @dentry: the original dentry to lookup
++ * @new_dentry: the new dentry, if this was a substitutable path.
++ *
++ */
++
++static void
++lookup_cdpn_sub_brace(struct gfs_sbd *sdp, struct dentry *dentry,
++ struct dentry **new_dentry)
++{
++ struct dentry *parent = dget_parent(dentry);
++ char *buf = gmalloc(2 * __NEW_UTS_LEN + 2);
++
++ if (gfs_filecmp(&dentry->d_name, "{hostname}", 10))
++ *new_dentry = lookup_one_len(system_utsname.nodename,
++ parent,
++ strlen(system_utsname.nodename));
++ else if (gfs_filecmp(&dentry->d_name, "{mach}", 6))
++ *new_dentry = lookup_one_len(system_utsname.machine,
++ parent,
++ strlen(system_utsname.machine));
++ else if (gfs_filecmp(&dentry->d_name, "{os}", 4))
++ *new_dentry = lookup_one_len(system_utsname.sysname,
++ parent,
++ strlen(system_utsname.sysname));
++ else if (gfs_filecmp(&dentry->d_name, "{uid}", 5))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u", current->fsuid));
++ else if (gfs_filecmp(&dentry->d_name, "{gid}", 5))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u", current->fsgid));
++ else if (gfs_filecmp(&dentry->d_name, "{sys}", 5))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%s_%s",
++ system_utsname.machine,
++ system_utsname.sysname));
++ else if (gfs_filecmp(&dentry->d_name, "{jid}", 5))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u",
++ sdp->sd_lockstruct.ls_jid));
++
++ kfree(buf);
++ dput(parent);
++}
++
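++/*
++ * Example of CDPN substitution (illustrative): an administrator creates
++ *
++ *	ln -s @hostname /gfs/log
++ *
++ * along with per-node directories /gfs/node1, /gfs/node2, ... When the
++ * symlink is followed, the lookup of the "@hostname" component lands in
++ * gfs_lookup() below, which substitutes the node's own utsname nodename,
++ * so every node transparently sees a private directory through the one
++ * shared path. The "{...}" forms work the same way.
++ */
++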
++/**
++ * gfs_lookup - Look up a filename in a directory and return its inode
++ * @dir: The directory inode
++ * @dentry: The dentry of the new inode
++ *
++ * Called by the VFS layer. Lock dir and call gfs_lookupi()
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static struct dentry *
++gfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
++{
++ struct gfs_inode *dip = vn2ip(dir), *ip;
++ struct gfs_holder d_gh, i_gh;
++ struct inode *inode = NULL;
++ int error;
++
++ atomic_inc(&dip->i_sbd->sd_ops_inode);
++
++ /* Do Context Dependent Path Name expansion */
++
++ if (*dentry->d_name.name == '@' && dentry->d_name.len > 1) {
++ struct dentry *new_dentry = NULL;
++ lookup_cdpn_sub_at(dip->i_sbd, dentry, &new_dentry);
++ if (new_dentry)
++ return new_dentry;
++ } else if (*dentry->d_name.name == '{' && dentry->d_name.len > 2) {
++ struct dentry *new_dentry = NULL;
++ lookup_cdpn_sub_brace(dip->i_sbd, dentry, &new_dentry);
++ if (new_dentry)
++ return new_dentry;
++ }
++
++ dentry->d_op = &gfs_dops;
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++
++ error = gfs_lookupi(&d_gh, &dentry->d_name, FALSE, &i_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ return ERR_PTR(error);
++ }
++
++ if (i_gh.gh_gl) {
++ ip = gl2ip(i_gh.gh_gl);
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return ERR_PTR(-ENOMEM);
++ } else
++ gfs_holder_uninit(&d_gh);
++
++ if (inode)
++ return d_splice_alias(inode, dentry);
++ d_add(dentry, inode);
++ return NULL;
++}
++
++/**
++ * gfs_link - Link to a file
++ * @old_dentry: The inode to link
++ * @dir: Add link to this directory
++ * @dentry: The name of the link
++ *
++ * Link the inode in "old_dentry" into the directory "dir" with the
++ * name in "dentry".
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
++{
++ struct gfs_inode *dip = vn2ip(dir);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct inode *inode = old_dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_alloc *al = NULL;
++ struct gfs_holder ghs[2];
++ int alloc_required;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ if (ip->i_di.di_type == GFS_FILE_DIR)
++ return -EPERM;
++
++ gfs_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[0]);
++ gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[1]);
++
++ error = gfs_glock_nq_m(2, ghs);
++ if (error)
++ goto fail;
++
++ error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
++ if (error)
++ goto fail_gunlock;
++
++ error = gfs_dir_search(dip, &dentry->d_name, NULL, NULL);
++ switch (error) {
++ case -ENOENT:
++ break;
++ case 0:
++ error = -EEXIST;
++ default:
++ goto fail_gunlock;
++ }
++
++ if (!dip->i_di.di_nlink) {
++ error = -EINVAL;
++ goto fail_gunlock;
++ }
++ if (dip->i_di.di_entries == (uint32_t)-1) {
++ error = -EFBIG;
++ goto fail_gunlock;
++ }
++ if (!ip->i_di.di_nlink) {
++ error = -EINVAL;
++ goto fail_gunlock;
++ }
++ if (ip->i_di.di_nlink == (uint32_t)-1) {
++ error = -EMLINK;
++ goto fail_gunlock;
++ }
++
++ error = gfs_diradd_alloc_required(dip, &dentry->d_name, &alloc_required);
++ if (error)
++ goto fail_gunlock;
++
++ if (alloc_required) {
++ al = gfs_alloc_get(dip);
++
++ error = gfs_quota_lock_m(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail_alloc;
++
++ error = gfs_quota_check(dip, dip->i_di.di_uid, dip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ al->al_requested_meta = sdp->sd_max_dirres;
++
++ error = gfs_inplace_reserve(dip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ two dinode blocks, directory modifications to add an entry,
++ RG bitmap blocks to allocate from, and quota change */
++
++ error = gfs_trans_begin(sdp,
++ 2 + sdp->sd_max_dirres +
++ al->al_rgd->rd_ri.ri_length,
++ 1);
++ if (error)
++ goto fail_ipres;
++ } else {
++ /* Trans may require:
++ Two dinode blocks and a leaf block. */
++
++ error = gfs_trans_begin(sdp, 3, 0);
++ if (error)
++ goto fail_ipres;
++ }
++
++ error = gfs_dir_add(dip, &dentry->d_name, &ip->i_num, ip->i_di.di_type);
++ if (error)
++ goto fail_end_trans;
++
++ error = gfs_change_nlink(ip, +1);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_end(sdp);
++
++ if (alloc_required) {
++ GFS_ASSERT_INODE(al->al_alloced_meta, dip,);
++ gfs_inplace_release(dip);
++ gfs_quota_unlock_m(dip);
++ gfs_alloc_put(dip);
++ }
++
++ gfs_glock_dq_m(2, ghs);
++
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ atomic_inc(&inode->i_count);
++
++ d_instantiate(dentry, inode);
++ mark_inode_dirty(inode);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ if (alloc_required)
++ gfs_inplace_release(dip);
++
++ fail_gunlock_q:
++ if (alloc_required)
++ gfs_quota_unlock_m(dip);
++
++ fail_alloc:
++ if (alloc_required)
++ gfs_alloc_put(dip);
++
++ fail_gunlock:
++ gfs_glock_dq_m(2, ghs);
++
++ fail:
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ return error;
++}
++
++/**
++ * gfs_unlink - Unlink a file
++ * @dir: The inode of the directory containing the file to unlink
++ * @dentry: The file itself
++ *
++ * Unlink a file. Call gfs_unlinki()
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_unlink(struct inode *dir, struct dentry *dentry)
++{
++ struct gfs_inode *dip = vn2ip(dir);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_inode *ip = vn2ip(dentry->d_inode);
++ struct gfs_holder ghs[2];
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ gfs_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[0]);
++ gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[1]);
++
++ error = gfs_glock_nq_m(2, ghs);
++ if (error)
++ goto fail;
++
++ error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
++ if (error)
++ goto fail_gunlock;
++
++ if ((dip->i_di.di_mode & S_ISVTX) &&
++ dip->i_di.di_uid != current->fsuid &&
++ ip->i_di.di_uid != current->fsuid &&
++ !capable(CAP_FOWNER)) {
++ error = -EPERM;
++ goto fail_gunlock;
++ }
++
++ error = gfs_revalidate(dip, &dentry->d_name, ip);
++ if (error)
++ goto fail_gunlock;
++
++ /* Trans may require:
++ Two dinode blocks and one modified directory leaf block
++ and one unlinked tag. */
++
++ error = gfs_trans_begin(sdp, 3, 1);
++ if (error)
++ goto fail_gunlock;
++
++ error = gfs_unlinki(dip, &dentry->d_name, ip);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_m(2, ghs);
++
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_gunlock:
++ gfs_glock_dq_m(2, ghs);
++
++ fail:
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ return error;
++}
++
++/**
++ * gfs_symlink - Create a symlink
++ * @dir: The directory to create the symlink in
++ * @dentry: The dentry to put the symlink in
++ * @symname: The thing which the link points to
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
++{
++ struct gfs_inode *dip = vn2ip(dir), *ip;
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_holder d_gh, i_gh;
++ struct inode *inode;
++ struct buffer_head *dibh;
++ int size;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ /* Must be stuffed with a null terminator for gfs_follow_link() */
++ size = strlen(symname);
++ if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode) - 1)
++ return -ENAMETOOLONG;
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++
++ error = gfs_createi(&d_gh, &dentry->d_name,
++ GFS_FILE_LNK, 0777,
++ &i_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++
++ GFS_ASSERT_SBD(i_gh.gh_gl, sdp,);
++ ip = gl2ip(i_gh.gh_gl);
++
++ ip->i_di.di_size = size;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ GFS_ASSERT_INODE(!error, ip,);
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ memcpy(dibh->b_data + sizeof(struct gfs_dinode), symname, size);
++
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++ if (dip->i_alloc->al_rgd)
++ gfs_inplace_release(dip);
++ gfs_quota_unlock_m(dip);
++ gfs_unlinked_unlock(sdp, dip->i_alloc->al_ul);
++ gfs_alloc_put(dip);
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return -ENOMEM;
++
++ d_instantiate(dentry, inode);
++ mark_inode_dirty(inode);
++
++ return 0;
++}
++
++/**
++ * gfs_mkdir - Make a directory
++ * @dir: The parent directory of the new one
++ * @dentry: The dentry of the new directory
++ * @mode: The mode of the new directory
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
++{
++ struct gfs_inode *dip = vn2ip(dir), *ip;
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_holder d_gh, i_gh;
++ struct inode *inode;
++ struct buffer_head *dibh;
++ struct gfs_dinode *di;
++ struct gfs_dirent *dent;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++
++ error = gfs_createi(&d_gh, &dentry->d_name,
++ GFS_FILE_DIR, mode,
++ &i_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++
++ GFS_ASSERT_SBD(i_gh.gh_gl, sdp,);
++ ip = gl2ip(i_gh.gh_gl);
++
++ ip->i_di.di_nlink = 2;
++ ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode);
++ ip->i_di.di_flags |= GFS_DIF_JDATA;
++ ip->i_di.di_payload_format = GFS_FORMAT_DE;
++ ip->i_di.di_entries = 2;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ GFS_ASSERT_INODE(!error, ip,);
++
++ di = (struct gfs_dinode *)dibh->b_data;
++
++ error = gfs_dirent_alloc(ip, dibh, 1, &dent);
++ GFS_ASSERT_INODE(!error, ip,); /* This should never fail */
++
++ dent->de_inum = di->di_num; /* already GFS endian */
++ dent->de_hash = gfs_dir_hash(".", 1);
++ dent->de_hash = cpu_to_gfs32(dent->de_hash);
++ dent->de_type = cpu_to_gfs16(GFS_FILE_DIR);
++ memcpy((char *) (dent + 1), ".", 1);
++ di->di_entries = cpu_to_gfs32(1);
++
++ error = gfs_dirent_alloc(ip, dibh, 2, &dent);
++ GFS_ASSERT_INODE(!error, ip,); /* This should never fail */
++
++ gfs_inum_out(&dip->i_num, (char *) &dent->de_inum);
++ dent->de_hash = gfs_dir_hash("..", 2);
++ dent->de_hash = cpu_to_gfs32(dent->de_hash);
++ dent->de_type = cpu_to_gfs16(GFS_FILE_DIR);
++ memcpy((char *) (dent + 1), "..", 2);
++
++ gfs_dinode_out(&ip->i_di, (char *)di);
++
++ brelse(dibh);
++
++ error = gfs_change_nlink(dip, +1);
++ GFS_ASSERT_INODE(!error, dip,); /* dip already pinned */
++
++ gfs_trans_end(sdp);
++ if (dip->i_alloc->al_rgd)
++ gfs_inplace_release(dip);
++ gfs_quota_unlock_m(dip);
++ gfs_unlinked_unlock(sdp, dip->i_alloc->al_ul);
++ gfs_alloc_put(dip);
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return -ENOMEM;
++
++ d_instantiate(dentry, inode);
++ mark_inode_dirty(inode);
++
++ return 0;
++}
++
++/**
++ * gfs_rmdir - Remove a directory
++ * @dir: The parent directory of the directory to be removed
++ * @dentry: The dentry of the directory to remove
++ *
++ * Remove a directory. Call gfs_rmdiri()
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_rmdir(struct inode *dir, struct dentry *dentry)
++{
++ struct gfs_inode *dip = vn2ip(dir);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_inode *ip = vn2ip(dentry->d_inode);
++ struct gfs_holder ghs[2];
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ gfs_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[0]);
++ gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[1]);
++
++ error = gfs_glock_nq_m(2, ghs);
++ if (error)
++ goto fail;
++
++ error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
++ if (error)
++ goto fail_gunlock;
++
++ if ((dip->i_di.di_mode & S_ISVTX) &&
++ dip->i_di.di_uid != current->fsuid &&
++ ip->i_di.di_uid != current->fsuid &&
++ !capable(CAP_FOWNER)) {
++ error = -EPERM;
++ goto fail_gunlock;
++ }
++
++ error = gfs_revalidate(dip, &dentry->d_name, ip);
++ if (error)
++ goto fail_gunlock;
++
++ GFS_ASSERT_INODE(ip->i_di.di_entries >= 2, ip,
++ gfs_dinode_print(&ip->i_di););
++
++ if (ip->i_di.di_entries > 2) {
++ error = -ENOTEMPTY;
++ goto fail_gunlock;
++ }
++
++ /* Trans may require:
++ Two dinode blocks, one directory leaf block containing the
++ entry to be rmdired, two leaf blocks containing . and .. of
++ the directory being rmdired, and one unlinked tag */
++
++ error = gfs_trans_begin(sdp, 5, 1);
++ if (error)
++ goto fail_gunlock;
++
++ error = gfs_rmdiri(dip, &dentry->d_name, ip);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_m(2, ghs);
++
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_gunlock:
++ gfs_glock_dq_m(2, ghs);
++
++ fail:
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ return error;
++}
++
++/**
++ * gfs_mknod - Make a special file
++ * @dir: The directory in which the special file will reside
++ * @dentry: The dentry of the special file
++ * @mode: The mode of the special file
++ * @dev: The device number of the special file
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
++{
++ struct gfs_inode *dip = vn2ip(dir), *ip;
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_holder d_gh, i_gh;
++ struct inode *inode;
++ struct buffer_head *dibh;
++ uint16_t type = 0;
++ uint32_t major = 0, minor = 0;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ switch (mode & S_IFMT) {
++ case S_IFBLK:
++ type = GFS_FILE_BLK;
++ major = MAJOR(dev);
++ minor = MINOR(dev);
++ break;
++ case S_IFCHR:
++ type = GFS_FILE_CHR;
++ major = MAJOR(dev);
++ minor = MINOR(dev);
++ break;
++ case S_IFIFO:
++ type = GFS_FILE_FIFO;
++ break;
++ case S_IFSOCK:
++ type = GFS_FILE_SOCK;
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,
++ printk("mode = %d\n", mode););
++ break;
++	}
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++
++ error = gfs_createi(&d_gh, &dentry->d_name,
++ type, mode,
++ &i_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++
++ GFS_ASSERT_SBD(i_gh.gh_gl, sdp,);
++ ip = gl2ip(i_gh.gh_gl);
++
++ ip->i_di.di_major = major;
++ ip->i_di.di_minor = minor;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ GFS_ASSERT_INODE(!error, ip,);
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++ if (dip->i_alloc->al_rgd)
++ gfs_inplace_release(dip);
++ gfs_quota_unlock_m(dip);
++ gfs_unlinked_unlock(sdp, dip->i_alloc->al_ul);
++ gfs_alloc_put(dip);
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return -ENOMEM;
++
++ d_instantiate(dentry, inode);
++ mark_inode_dirty(inode);
++
++ return 0;
++}
++
++/**
++ * gfs_rename - Rename a file
++ * @odir: Parent directory of old file name
++ * @odentry: The old dentry of the file
++ * @ndir: Parent directory of new file name
++ * @ndentry: The new dentry of the file
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_rename(struct inode *odir, struct dentry *odentry,
++ struct inode *ndir, struct dentry *ndentry)
++{
++ struct gfs_inode *odip = vn2ip(odir);
++ struct gfs_inode *ndip = vn2ip(ndir);
++ struct gfs_inode *ip = vn2ip(odentry->d_inode);
++ struct gfs_inode *nip = NULL;
++ struct gfs_sbd *sdp = odip->i_sbd;
++ struct qstr name;
++ struct gfs_alloc *al;
++ struct gfs_holder ghs[4], r_gh;
++ unsigned int num_gh;
++ int dir_rename = FALSE;
++ int alloc_required;
++ unsigned int x;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ if (ndentry->d_inode) {
++ nip = vn2ip(ndentry->d_inode);
++ if (ip == nip)
++ return 0;
++ }
++
++	/* Make sure we aren't trying to move a directory into one of its own subdirectories */
++
++ if (ip->i_di.di_type == GFS_FILE_DIR && odip != ndip) {
++ dir_rename = TRUE;
++
++ error = gfs_glock_nq_init(sdp->sd_rename_gl,
++ LM_ST_EXCLUSIVE, 0,
++ &r_gh);
++ if (error)
++ return error;
++
++ error = gfs_ok_to_move(ip, ndip);
++ if (error)
++ goto fail;
++ }
++
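++	/* sd_rename_gl is a single filesystem-wide glock, so holding it
++	   exclusively serializes every cross-directory directory rename in
++	   the cluster; gfs_ok_to_move()'s walk up the destination's
++	   ancestors therefore can't race with another node moving a
++	   directory underneath it. */
++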
++ gfs_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[0]);
++ gfs_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[1]);
++ num_gh = 2;
++
++ if (nip)
++ gfs_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[num_gh++]);
++
++ if (dir_rename)
++ gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[num_gh++]);
++
++ error = gfs_glock_nq_m(num_gh, ghs);
++ if (error)
++ goto fail_uninit;
++
++ /* Check out the old directory */
++
++ error = permission(odir, MAY_WRITE | MAY_EXEC, NULL);
++ if (error)
++ goto fail_gunlock;
++
++ if ((odip->i_di.di_mode & S_ISVTX) &&
++ odip->i_di.di_uid != current->fsuid &&
++ ip->i_di.di_uid != current->fsuid &&
++ !capable(CAP_FOWNER)) {
++ error = -EPERM;
++ goto fail_gunlock;
++ }
++
++ error = gfs_revalidate(odip, &odentry->d_name, ip);
++ if (error)
++ goto fail_gunlock;
++
++ /* Check out the new directory */
++
++ error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
++ if (error)
++ goto fail_gunlock;
++
++ if (nip) {
++ if ((ndip->i_di.di_mode & S_ISVTX) &&
++ ndip->i_di.di_uid != current->fsuid &&
++ nip->i_di.di_uid != current->fsuid &&
++ !capable(CAP_FOWNER)) {
++ error = -EPERM;
++ goto fail_gunlock;
++ }
++
++ error = gfs_revalidate(ndip, &ndentry->d_name, nip);
++ if (error)
++ goto fail_gunlock;
++
++ if (nip->i_di.di_type == GFS_FILE_DIR) {
++ GFS_ASSERT_INODE(nip->i_di.di_entries >= 2, ip,
++ gfs_dinode_print(&nip->i_di););
++ if (nip->i_di.di_entries > 2) {
++ error = -ENOTEMPTY;
++ goto fail_gunlock;
++ }
++ }
++ } else {
++ error = gfs_dir_search(ndip, &ndentry->d_name, NULL, NULL);
++ switch (error) {
++ case -ENOENT:
++ error = 0;
++ break;
++ case 0:
++ error = -EEXIST;
++ default:
++ goto fail_gunlock;
++		}
++
++ if (odip != ndip) {
++ if (!ndip->i_di.di_nlink) {
++ error = -EINVAL;
++ goto fail_gunlock;
++ }
++ if (ndip->i_di.di_entries == (uint32_t)-1) {
++ error = -EFBIG;
++ goto fail_gunlock;
++ }
++ if (ip->i_di.di_type == GFS_FILE_DIR &&
++ ndip->i_di.di_nlink == (uint32_t)-1) {
++ error = -EMLINK;
++ goto fail_gunlock;
++ }
++ }
++ }
++
++ error = gfs_diradd_alloc_required(ndip, &ndentry->d_name, &alloc_required);
++ if (error)
++ goto fail_gunlock;
++
++ if (alloc_required) {
++ al = gfs_alloc_get(ndip);
++
++ error = gfs_quota_lock_m(ndip,
++ NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail_alloc;
++
++ error = gfs_quota_check(ndip, ndip->i_di.di_uid, ndip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ al->al_requested_meta = sdp->sd_max_dirres;
++
++ error = gfs_inplace_reserve(ndip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ Dinodes for the srcdir, srcino, dstdir, dstino. Blocks for
++ adding the entry to dstdir. RG bitmaps for that allocation.
++ One leaf block in the srcdir for removal of the entry.
++ One leaf block for changing .. in srcino (if it's a directory).
++ Two leaf blocks for removing . and .. from dstino (if it exists
++ and it's a directory), one unlinked tag, and one quota block. */
++
++ error = gfs_trans_begin(sdp,
++ 8 + sdp->sd_max_dirres +
++ al->al_rgd->rd_ri.ri_length,
++ 2);
++ if (error)
++ goto fail_ipres;
++ } else {
++ /* Trans may require:
++ Dinodes for the srcdir, srcino, dstdir, dstino. One block for
++ adding the entry to dstdir.
++ One leaf block in the srcdir for removal of the entry.
++ One leaf block for changing .. in srcino (if it's a directory).
++ Two leaf blocks for removing . and .. from dstino (if it exists
++ and it's a directory), and one unlinked tag. */
++
++ error = gfs_trans_begin(sdp, 9, 1);
++ if (error)
++ goto fail_ipres;
++ }
++
++ /* Remove the target file, if it exists */
++
++ if (nip) {
++ if (nip->i_di.di_type == GFS_FILE_DIR)
++ error = gfs_rmdiri(ndip, &ndentry->d_name, nip);
++ else
++ error = gfs_unlinki(ndip, &ndentry->d_name, nip);
++
++ if (error)
++ goto fail_end_trans;
++ }
++
++ if (dir_rename) {
++ error = gfs_change_nlink(ndip, +1);
++ if (error)
++ goto fail_end_trans;
++ error = gfs_change_nlink(odip, -1);
++ if (error)
++ goto fail_end_trans;
++
++ name.len = 2;
++ name.name = "..";
++
++ error = gfs_dir_mvino(ip, &name, &ndip->i_num, GFS_FILE_DIR);
++ if (error)
++ goto fail_end_trans;
++ }
++
++ error = gfs_dir_del(odip, &odentry->d_name);
++ if (error)
++ goto fail_end_trans;
++
++ error = gfs_dir_add(ndip, &ndentry->d_name, &ip->i_num, ip->i_di.di_type);
++ if (error)
++ goto fail_end_trans;
++
++ if (dir_rename)
++ gfs_trans_add_gl(sdp->sd_rename_gl);
++
++ gfs_trans_end(sdp);
++
++ if (alloc_required) {
++ /* Don't check al->al_alloced_meta and friends. */
++ gfs_inplace_release(ndip);
++ gfs_quota_unlock_m(ndip);
++ gfs_alloc_put(ndip);
++ }
++
++ gfs_glock_dq_m(num_gh, ghs);
++
++ for (x = 0; x < num_gh; x++)
++ gfs_holder_uninit(&ghs[x]);
++
++ if (dir_rename)
++ gfs_glock_dq_uninit(&r_gh);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ if (alloc_required)
++ gfs_inplace_release(ndip);
++
++ fail_gunlock_q:
++ if (alloc_required)
++ gfs_quota_unlock_m(ndip);
++
++ fail_alloc:
++ if (alloc_required)
++ gfs_alloc_put(ndip);
++
++ fail_gunlock:
++ gfs_glock_dq_m(num_gh, ghs);
++
++ fail_uninit:
++ for (x = 0; x < num_gh; x++)
++ gfs_holder_uninit(&ghs[x]);
++
++ fail:
++ if (dir_rename)
++ gfs_glock_dq_uninit(&r_gh);
++
++ return error;
++}
++
++/**
++ * gfs_readlink - Read the value of a symlink
++ * @dentry: the symlink
++ * @user_buf: the user buffer to read the symlink data into
++ * @user_size: the size of the buffer
++ *
++ * Returns: the number of bytes copied to the user, -EXXX on failure
++ */
++
++static int
++gfs_readlink(struct dentry *dentry, char *user_buf, int user_size)
++{
++ struct gfs_inode *ip = vn2ip(dentry->d_inode);
++ char array[GFS_FAST_NAME_SIZE], *buf = array;
++ unsigned int len = GFS_FAST_NAME_SIZE;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_inode);
++
++ error = gfs_readlinki(ip, &buf, &len);
++ if (error)
++ return error;
++
++ GFS_ASSERT_INODE(len, ip,);
++
++ if (user_size > len - 1)
++ user_size = len - 1;
++
++ if (copy_to_user(user_buf, buf, user_size))
++ error = -EFAULT;
++ else
++ error = user_size;
++
++ if (buf != array)
++ kfree(buf);
++
++ return error;
++}
++
++/**
++ * gfs_follow_link - Follow a symbolic link
++ * @dentry: The dentry of the link
++ * @nd: Data that we pass to vfs_follow_link()
++ *
++ * This can handle symlinks of any size. It is optimised for symlinks
++ * under GFS_FAST_NAME_SIZE.
++ *
++ * Returns: 0 on success or error code
++ */
++
++static int
++gfs_follow_link(struct dentry *dentry, struct nameidata *nd)
++{
++ struct gfs_inode *ip = vn2ip(dentry->d_inode);
++ char array[GFS_FAST_NAME_SIZE], *buf = array;
++ unsigned int len = GFS_FAST_NAME_SIZE;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_inode);
++
++ error = gfs_readlinki(ip, &buf, &len);
++ if (!error) {
++ error = vfs_follow_link(nd, buf);
++ if (buf != array)
++ kfree(buf);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_permission -
++ * @inode:
++ * @mask:
++ * @nd:
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_permission(struct inode *inode, int mask, struct nameidata *nd)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_holder i_gh;
++ struct posix_acl *acl;
++ umode_t mode = inode->i_mode;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_inode);
++
++ error = gfs_glock_nq_init(ip->i_gl,
++ LM_ST_SHARED, LM_FLAG_ANY,
++ &i_gh);
++ if (error)
++ return error;
++
++ if (mask & MAY_WRITE) {
++ if (IS_RDONLY(inode) &&
++ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
++ error = -EROFS;
++ goto out;
++ }
++ if (IS_IMMUTABLE(inode)) {
++ error = -EACCES;
++ goto out;
++ }
++ }
++
++ if (capable(CAP_DAC_OVERRIDE))
++ if (!(mask & MAY_EXEC) || (mode & S_IXUGO))
++ goto out;
++
++ if (capable(CAP_DAC_READ_SEARCH) &&
++ (mask == MAY_READ ||
++ (!(mask & MAY_WRITE) && S_ISDIR(mode))))
++ goto out;
++
++ if (inode->i_uid == current->fsuid) {
++ if ((mask & (mode >> 6)) != mask)
++ error = -EACCES;
++ goto out;
++ }
++
++ if ((mask & (mode >> 3)) == mask) {
++ error = gfs_getacl(inode, TRUE, &acl);
++ if (acl) {
++ error = posix_acl_permission(inode, acl, mask);
++ goto out;
++ } else if (error && error != -ENODATA)
++ goto out;
++ error = 0;
++ if (in_group_p(inode->i_gid)) {
++ error = 0;
++ goto out;
++ }
++ } else if (in_group_p(inode->i_gid)) {
++ error = -EACCES;
++ goto out;
++ }
++
++ if ((mask & mode) == mask)
++ goto out;
++
++ error = -EACCES;
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
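++/*
++ * Order of the checks above, for reference: write access to a read-only
++ * or immutable inode fails first; CAP_DAC_OVERRIDE and
++ * CAP_DAC_READ_SEARCH short-circuit to success; otherwise the owner is
++ * checked against the "user" mode bits, group access against the "group"
++ * bits (consulting a POSIX ACL if one exists), and everyone else against
++ * the "other" bits.
++ */
++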
++/**
++ * gfs_setattr - Change attributes on an inode
++ * @dentry: The dentry which is changing
++ * @attr: The structure describing the change
++ *
++ * The VFS layer wants to change one or more of an inode's attributes. Write
++ * that change out to disk.
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_setattr(struct dentry *dentry, struct iattr *attr)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_holder i_gh;
++ struct gfs_alloc *al;
++ struct buffer_head *dibh;
++ uint32_t ouid, ogid, nuid, ngid;
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (error)
++ return error;
++
++ error = inode_change_ok(inode, attr);
++ if (error)
++ goto fail;
++
++ if (attr->ia_valid & ATTR_SIZE) {
++ error = permission(inode, MAY_WRITE, NULL);
++ if (error)
++ goto fail;
++
++ if (attr->ia_size != ip->i_di.di_size) {
++ error = vmtruncate(inode, attr->ia_size);
++ if (error)
++ goto fail;
++ }
++
++ error = gfs_truncatei(ip, attr->ia_size, gfs_truncator_page);
++ if (error)
++ goto fail;
++
++ if ((sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) &&
++ !gfs_is_jdata(ip))
++ i_gh.gh_flags |= GL_SYNC;
++ }
++
++ else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) {
++ ouid = ip->i_di.di_uid;
++ ogid = ip->i_di.di_gid;
++ nuid = attr->ia_uid;
++ ngid = attr->ia_gid;
++
++ if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
++ ouid = nuid = NO_QUOTA_CHANGE;
++ if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
++ ogid = ngid = NO_QUOTA_CHANGE;
++
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_lock_m(ip, nuid, ngid);
++ if (error)
++ goto fail_alloc;
++
++ if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
++ error = gfs_quota_check(ip, nuid, ngid);
++ if (error)
++ goto fail_gunlock_q;
++ }
++
++ /* Trans may require:
++ one dinode block and one quota change block */
++
++ error = gfs_trans_begin(sdp, 1, 1);
++ if (error)
++ goto fail_gunlock_q;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
++ gfs_trans_add_quota(sdp, -ip->i_di.di_blocks,
++ ouid, ogid);
++ gfs_trans_add_quota(sdp, ip->i_di.di_blocks,
++ nuid, ngid);
++ }
++
++ inode_setattr(inode, attr);
++ gfs_inode_attr_out(ip);
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++
++ gfs_quota_unlock_m(ip);
++ gfs_alloc_put(ip);
++ }
++
++ else {
++ /* Trans may require:
++ one dinode block plus changes for acl. */
++
++ error = gfs_trans_begin(sdp,
++ 1 + GFS_MAX_EA_ACL_BLKS, 0);
++ if (error)
++ goto fail;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (!error) {
++ inode_setattr(inode, attr);
++ gfs_inode_attr_out(ip);
++
++ if (attr->ia_valid & ATTR_MODE)
++ error = gfs_acl_setattr(inode);
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++ }
++
++ gfs_trans_end(sdp);
++ }
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ mark_inode_dirty(inode);
++
++ return error;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_gunlock_q:
++ gfs_quota_unlock_m(ip);
++
++ fail_alloc:
++ gfs_alloc_put(ip);
++
++ fail:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_getattr - Read out an inode's attributes
++ * @mnt: the vfsmount the dentry was found on (unused)
++ * @dentry: The dentry to stat
++ * @stat: The inode's stats
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_holder gh;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_inode);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
++ if (!error)
++ {
++ generic_fillattr(inode, stat);
++ gfs_glock_dq_uninit(&gh);
++ }
++
++ return error;
++}
++
++/**
++ * get_eatype - get the type of an EA, and strip the type prefix from its name
++ * @name: EA name, possibly with a type prefix ("system." or "user.")
++ * @truncated_name: set to the unprefixed name (NULL if the prefix is unknown)
++ *
++ * Returns: GFS_EATYPE_XXX
++ */
++
++int
++get_eatype(const char *name, char **truncated_name)
++{
++ int type;
++
++ if (strncmp(name, "system.", 7) == 0) {
++ type = GFS_EATYPE_SYS;
++ *truncated_name = strchr(name, '.') + 1;
++ } else if (strncmp(name, "user.", 5) == 0) {
++ type = GFS_EATYPE_USR;
++ *truncated_name = strchr(name, '.') + 1;
++ } else {
++ type = GFS_EATYPE_UNUSED;
++ *truncated_name = NULL;
++ }
++
++ return type;
++}
++
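++/*
++ * For example: get_eatype("user.foo", &p) returns GFS_EATYPE_USR with p
++ * pointing at "foo", while an unprefixed name such as "foo" returns
++ * GFS_EATYPE_UNUSED and sets p to NULL; the callers below turn that into
++ * -EOPNOTSUPP.
++ */
++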
++/**
++ * gfs_setxattr - Set (or create or replace) an inode's extended attribute
++ * @dentry: inode's dentry
++ * @name: name of the extended attribute
++ * @data: the value of the extended attribute
++ * @size: the size of data
++ * @flags: used to specify create or replace actions
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_setxattr(struct dentry *dentry, const char *name,
++ const void *data, size_t size,
++ int flags)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_easet_io req;
++ char *truncated_name;
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ req.es_type = get_eatype(name, &truncated_name);
++
++ if (req.es_type == GFS_EATYPE_UNUSED)
++ error = -EOPNOTSUPP;
++ else {
++ req.es_data = data;
++ req.es_name = truncated_name;
++ req.es_data_len = size;
++ req.es_name_len = strlen(truncated_name);
++ if (flags & XATTR_CREATE)
++ req.es_cmd = GFS_EACMD_CREATE;
++ else if (flags & XATTR_REPLACE)
++ req.es_cmd = GFS_EACMD_REPLACE;
++ else
++ req.es_cmd = GFS_EACMD_SET;
++ error = gfs_set_eattr(sdp, ip, &req);
++ }
++
++ return error;
++}
++
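++/*
++ * The XATTR_* flags map onto the GFS EA commands as follows: XATTR_CREATE
++ * (fail if the attribute already exists) becomes GFS_EACMD_CREATE,
++ * XATTR_REPLACE (fail if it doesn't exist) becomes GFS_EACMD_REPLACE, and
++ * no flag at all becomes GFS_EACMD_SET. From userspace (illustrative):
++ *
++ *	setfattr -n user.color -v blue /gfs/file	-> GFS_EACMD_SET
++ */
++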
++/**
++ * gfs_getxattr - Get the value of an inode's extended attribute
++ * @dentry: inode's dentry
++ * @name: name of the extended attribute
++ * @data: buffer to copy the value into
++ * @size: the size of data
++ *
++ * Returns: the size of the value on success, -EXXX on error
++ */
++
++ssize_t
++gfs_getxattr(struct dentry *dentry, const char *name,
++ void *data, size_t size)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_eaget_io req;
++ char *truncated_name;
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ req.eg_type = get_eatype(name, &truncated_name);
++
++ if (req.eg_type == GFS_EATYPE_UNUSED)
++ error = -EOPNOTSUPP;
++ else {
++ req.eg_name = truncated_name;
++ req.eg_name_len = strlen(truncated_name);
++ req.eg_data = data;
++ req.eg_data_len = size;
++ req.eg_len = NULL;
++ error = gfs_get_eattr(sdp, ip, &req, gfs_ea_memcpy);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_listxattr - List the names of an inode's extended attributes
++ * @dentry: inode's dentry
++ * @buffer: buffer in which to return the list of names
++ * @size: the size of @buffer
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++ssize_t
++gfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_eaget_io req;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ req.eg_type = 0;
++ req.eg_name = NULL;
++ req.eg_name_len = 0;
++ req.eg_data = buffer;
++ req.eg_data_len = size;
++ req.eg_len = NULL;
++
++ return gfs_get_eattr(sdp, ip, &req, gfs_ea_memcpy);
++}
++
++/**
++ * gfs_removexattr - Remove an inode's extended attribute
++ * @dentry: inode's dentry
++ * @name: name of the extended attribute to remove
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_removexattr(struct dentry *dentry, const char *name)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_easet_io req;
++ char *truncated_name;
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ req.es_type = get_eatype(name, &truncated_name);
++
++ if (req.es_type == GFS_EATYPE_UNUSED)
++ error = -EOPNOTSUPP;
++ else {
++ req.es_name = truncated_name;
++ req.es_data = NULL;
++ req.es_data_len = 0;
++ req.es_name_len = strlen(truncated_name);
++ req.es_cmd = GFS_EACMD_REMOVE;
++ error = gfs_set_eattr(sdp, ip, &req);
++ }
++
++ return error;
++}
++
++struct inode_operations gfs_file_iops = {
++ .permission = gfs_permission,
++ .setattr = gfs_setattr,
++ .getattr = gfs_getattr,
++ .setxattr = gfs_setxattr,
++ .getxattr = gfs_getxattr,
++ .listxattr = gfs_listxattr,
++ .removexattr = gfs_removexattr,
++};
++
++struct inode_operations gfs_dev_iops = {
++ .permission = gfs_permission,
++ .setattr = gfs_setattr,
++ .getattr = gfs_getattr,
++ .setxattr = gfs_setxattr,
++ .getxattr = gfs_getxattr,
++ .listxattr = gfs_listxattr,
++ .removexattr = gfs_removexattr,
++};
++
++struct inode_operations gfs_dir_iops = {
++ .create = gfs_create,
++ .lookup = gfs_lookup,
++ .link = gfs_link,
++ .unlink = gfs_unlink,
++ .symlink = gfs_symlink,
++ .mkdir = gfs_mkdir,
++ .rmdir = gfs_rmdir,
++ .mknod = gfs_mknod,
++ .rename = gfs_rename,
++ .permission = gfs_permission,
++ .setattr = gfs_setattr,
++ .getattr = gfs_getattr,
++ .setxattr = gfs_setxattr,
++ .getxattr = gfs_getxattr,
++ .listxattr = gfs_listxattr,
++ .removexattr = gfs_removexattr,
++};
++
++struct inode_operations gfs_symlink_iops = {
++ .readlink = gfs_readlink,
++ .follow_link = gfs_follow_link,
++ .permission = gfs_permission,
++ .setattr = gfs_setattr,
++ .getattr = gfs_getattr,
++ .setxattr = gfs_setxattr,
++ .getxattr = gfs_getxattr,
++ .listxattr = gfs_listxattr,
++ .removexattr = gfs_removexattr,
++};
++
+diff -urN linux-orig/fs/gfs/ops_inode.h linux-patched/fs/gfs/ops_inode.h
+--- linux-orig/fs/gfs/ops_inode.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_inode.h 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_INODE_DOT_H__
++#define __OPS_INODE_DOT_H__
++
++extern struct inode_operations gfs_file_iops;
++extern struct inode_operations gfs_dir_iops;
++extern struct inode_operations gfs_symlink_iops;
++extern struct inode_operations gfs_dev_iops;
++
++#endif /* __OPS_INODE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_super.c linux-patched/fs/gfs/ops_super.c
+--- linux-orig/fs/gfs/ops_super.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_super.c 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,416 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/vmalloc.h>
++#include <linux/statfs.h>
++#include <linux/seq_file.h>
++#include <linux/mount.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "inode.h"
++#include "locking.h"
++#include "log.h"
++#include "ops_super.h"
++#include "page.h"
++#include "quota.h"
++#include "recovery.h"
++#include "rgrp.h"
++#include "super.h"
++
++/**
++ * gfs_write_inode - Make sure the inode is stable on the disk
++ * @inode: The inode
++ * @sync: synchronous write flag
++ *
++ */
++
++static void
++gfs_write_inode(struct inode *inode, int sync)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++
++ if (!ip)
++ return;
++
++ atomic_inc(&ip->i_sbd->sd_ops_super);
++
++ if (sync && !gfs_in_panic)
++ gfs_log_flush_glock(ip->i_gl);
++}
++
++/**
++ * gfs_put_inode - put an inode
++ * @inode: The inode
++ *
++ * If i_nlink is zero, any dirty data for the inode is thrown away.
++ * If a process on another machine has the file open, it may need that
++ * data. So, sync it out.
++ */
++
++static void
++gfs_put_inode(struct inode *inode)
++{
++ struct gfs_sbd *sdp = vfs2sdp(inode->i_sb);
++ struct gfs_inode *ip = vn2ip(inode);
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ if (ip &&
++ !inode->i_nlink &&
++ S_ISREG(inode->i_mode) &&
++ !sdp->sd_args.ar_localcaching)
++ gfs_sync_page_i(inode, DIO_START | DIO_WAIT);
++}
++
++/**
++ * gfs_put_super - Unmount the filesystem
++ * @sb: The VFS superblock
++ *
++ */
++
++static void
++gfs_put_super(struct super_block *sb)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++ int error;
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ /* Unfreeze the filesystem, if we need to */
++
++ down(&sdp->sd_freeze_lock);
++ if (sdp->sd_freeze_count)
++ gfs_glock_dq_uninit(&sdp->sd_freeze_gh);
++ up(&sdp->sd_freeze_lock);
++
++ /* Kill off the inode thread */
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_INODED_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_inoded_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Kill off the quota thread */
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_QUOTAD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_quotad_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Kill off the log thread */
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_LOGD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_logd_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Kill off the recoverd thread */
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_RECOVERD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_recoverd_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Kill off the glockd threads */
++ clear_bit(SDF_GLOCKD_RUN, &sdp->sd_flags);
++ wake_up(&sdp->sd_reclaim_wchan);
++ while (sdp->sd_glockd_num--)
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Kill off the scand thread */
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_SCAND_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_scand_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ if (!test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ gfs_log_flush(sdp);
++ gfs_quota_sync(sdp);
++ gfs_quota_sync(sdp);
++
++ error = gfs_make_fs_ro(sdp);
++ if (error)
++ gfs_io_error(sdp);
++ }
++
++ /* At this point, we're through modifying the disk */
++
++ /* Release stuff */
++
++ gfs_inode_put(sdp->sd_riinode);
++ gfs_inode_put(sdp->sd_jiinode);
++ gfs_inode_put(sdp->sd_rooti);
++ gfs_inode_put(sdp->sd_qinode);
++ gfs_inode_put(sdp->sd_linode);
++
++ gfs_glock_put(sdp->sd_trans_gl);
++ gfs_glock_put(sdp->sd_rename_gl);
++
++ gfs_glock_dq_uninit(&sdp->sd_journal_gh);
++
++ gfs_glock_dq_uninit(&sdp->sd_live_gh);
++
++ /* Get rid of rgrp bitmap structures */
++ gfs_clear_rgrpd(sdp);
++ gfs_clear_journals(sdp);
++
++ /* Take apart glock structures and buffer lists */
++ gfs_gl_hash_clear(sdp, TRUE);
++
++ /* Unmount the locking protocol */
++ gfs_unmount_lockproto(sdp);
++
++ /* At this point, we're through participating in the lockspace */
++
++ gfs_clear_dirty_j(sdp);
++
++ /* Get rid of any extra inodes */
++ while (invalidate_inodes(sb))
++ yield();
++
++ vfree(sdp);
++
++ vfs2sdp(sb) = NULL;
++}
++
++/**
++ * gfs_write_super - disk commit all incore transactions
++ * @sb: the filesystem
++ *
++ * This function is called every time sync(2) is called.
++ * After this exits, all dirty buffers are synced.
++ */
++
++static void
++gfs_write_super(struct super_block *sb)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ if (!gfs_in_panic)
++ gfs_log_flush(sdp);
++}
++
++/**
++ * gfs_write_super_lockfs - prevent further writes to the filesystem
++ * @sb: the VFS structure for the filesystem
++ *
++ */
++
++static void
++gfs_write_super_lockfs(struct super_block *sb)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++ int error;
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ for (;;) {
++ error = gfs_freeze_fs(sdp);
++ if (!error)
++ break;
++
++ switch (error) {
++ case -EBUSY:
++ printk("GFS: fsid=%s: waiting for recovery before freeze\n",
++ sdp->sd_fsname);
++ break;
++
++ default:
++ printk("GFS: fsid=%s: error freezing FS: %d\n",
++ sdp->sd_fsname, error);
++ break;
++ }
++
++ printk("GFS: fsid=%s: retrying...\n", sdp->sd_fsname);
++
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ schedule_timeout(HZ);
++ }
++}
++
++/**
++ * gfs_unlockfs - reallow writes to the filesystem
++ * @sb: the VFS structure for the filesystem
++ *
++ */
++
++static void
++gfs_unlockfs(struct super_block *sb)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ gfs_unfreeze_fs(sdp);
++}
++
++/**
++ * gfs_statfs - Gather and return stats about the filesystem
++ * @sb: The superblock
++ * @buf: The buffer to fill in
++ *
++ * Returns: 0 on success or error code
++ */
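++/*
++ * GFS has no static inode table; any free block can be allocated as a
++ * dinode. The f_files/f_ffree values below therefore count blocks that do
++ * or could hold dinodes rather than fixed inode slots: with 1000 used
++ * dinodes and 5000 free blocks of all kinds, f_files is 6000 and f_ffree
++ * is 5000.
++ */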
++
++static int
++gfs_statfs(struct super_block *sb, struct kstatfs *buf)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++ struct gfs_usage usage;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ error = gfs_stat_gfs(sdp, &usage, TRUE);
++ if (error)
++ return error;
++
++ memset(buf, 0, sizeof(struct kstatfs));
++
++ buf->f_type = GFS_MAGIC;
++ buf->f_bsize = usage.gu_block_size;
++ buf->f_blocks = usage.gu_total_blocks;
++ buf->f_bfree = usage.gu_free + usage.gu_free_dinode + usage.gu_free_meta;
++ buf->f_bavail = usage.gu_free + usage.gu_free_dinode + usage.gu_free_meta;
++ buf->f_files = usage.gu_used_dinode + usage.gu_free_dinode + usage.gu_free_meta + usage.gu_free;
++ buf->f_ffree = usage.gu_free_dinode + usage.gu_free_meta + usage.gu_free;
++ buf->f_namelen = GFS_FNAMESIZE;
++
++ return 0;
++}
++
++/**
++ * gfs_remount_fs - called when the FS is remounted
++ * @sb: the filesystem
++ * @flags: the remount flags
++ * @data: extra data passed in (not used right now)
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_remount_fs(struct super_block *sb, int *flags, char *data)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ if (*flags & (MS_NOATIME | MS_NODIRATIME))
++ set_bit(SDF_NOATIME, &sdp->sd_flags);
++ else
++ clear_bit(SDF_NOATIME, &sdp->sd_flags);
++
++ if (*flags & MS_RDONLY) {
++ if (!test_bit(SDF_ROFS, &sdp->sd_flags))
++ error = gfs_make_fs_ro(sdp);
++ } else if (!(*flags & MS_RDONLY) &&
++ test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ error = gfs_make_fs_rw(sdp);
++ }
++
++ /* Don't let the VFS update atimes. */
++ *flags |= MS_NOATIME | MS_NODIRATIME;
++
++ return error;
++}
++
++/**
++ * gfs_clear_inode - Deallocate an inode when VFS is done with it
++ * @inode: The VFS inode
++ *
++ */
++
++static void
++gfs_clear_inode(struct inode *inode)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++
++ atomic_inc(&vfs2sdp(inode->i_sb)->sd_ops_super);
++
++ if (ip) {
++ spin_lock(&ip->i_lock);
++ ip->i_vnode = NULL;
++ vn2ip(inode) = NULL;
++ spin_unlock(&ip->i_lock);
++
++ gfs_glock_schedule_for_reclaim(ip->i_gl);
++ gfs_inode_put(ip);
++ }
++}
++
++/**
++ * gfs_show_options - Show mount options for /proc/mounts
++ * @s: seq_file structure
++ * @mnt: vfsmount
++ *
++ * Returns: 0 on success or error code
++ */
++
++static int
++gfs_show_options(struct seq_file *s, struct vfsmount *mnt)
++{
++ struct gfs_sbd *sdp = vfs2sdp(mnt->mnt_sb);
++ struct gfs_args *args = &sdp->sd_args;
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ if (args->ar_lockproto[0]) {
++ seq_printf(s, ",lockproto=");
++ seq_puts(s, args->ar_lockproto);
++ }
++ if (args->ar_locktable[0]) {
++ seq_printf(s, ",locktable=");
++ seq_puts(s, args->ar_locktable);
++ }
++ if (args->ar_hostdata[0]) {
++ seq_printf(s, ",hostdata=");
++ seq_puts(s, args->ar_hostdata);
++ }
++ if (args->ar_ignore_local_fs)
++ seq_printf(s, ",ignore_local_fs");
++ if (args->ar_localflocks)
++ seq_printf(s, ",localflocks");
++ if (args->ar_localcaching)
++ seq_printf(s, ",localcaching");
++ if (args->ar_upgrade)
++ seq_printf(s, ",upgrade");
++ if (args->ar_num_glockd != GFS_GLOCKD_DEFAULT)
++ seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
++ if (args->ar_posixacls)
++ seq_printf(s, ",acl");
++
++ return 0;
++}
++
++struct super_operations gfs_super_ops = {
++ .write_inode = gfs_write_inode,
++ .put_inode = gfs_put_inode,
++ .put_super = gfs_put_super,
++ .write_super = gfs_write_super,
++ .write_super_lockfs = gfs_write_super_lockfs,
++ .unlockfs = gfs_unlockfs,
++ .statfs = gfs_statfs,
++ .remount_fs = gfs_remount_fs,
++ .clear_inode = gfs_clear_inode,
++ .show_options = gfs_show_options,
++};
+diff -urN linux-orig/fs/gfs/ops_super.h linux-patched/fs/gfs/ops_super.h
+--- linux-orig/fs/gfs/ops_super.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_super.h 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_SUPER_DOT_H__
++#define __OPS_SUPER_DOT_H__
++
++extern struct super_operations gfs_super_ops;
++
++#endif /* __OPS_SUPER_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_vm.c linux-patched/fs/gfs/ops_vm.c
+--- linux-orig/fs/gfs/ops_vm.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_vm.c 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,212 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/mm.h>
++#include <linux/pagemap.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "glock.h"
++#include "inode.h"
++#include "ops_vm.h"
++#include "page.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++
++/**
++ * gfs_private_nopage - fault in a page for a private mapping
++ * @area: the VM area the fault occurred in
++ * @address: the faulting address
++ * @type: the type of fault, filled in by filemap_nopage()
++ *
++ * Returns: the page, or NULL on error
++ */
++
++static struct page *
++gfs_private_nopage(struct vm_area_struct *area,
++ unsigned long address, int *type)
++{
++ struct gfs_inode *ip = vn2ip(area->vm_file->f_mapping->host);
++ struct gfs_holder i_gh;
++ struct page *result;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_vm);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
++ if (error)
++ return NULL;
++
++ set_bit(GIF_PAGED, &ip->i_flags);
++
++ result = filemap_nopage(area, address, type);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return result;
++}
++
++/**
++ * alloc_page_backing - allocate the disk blocks backing a page
++ * @ip: the inode
++ * @index: the index of the page within the file
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
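++/*
++ * Block arithmetic used below: with 4096-byte pages and a 1024-byte
++ * filesystem block (sb_bsize_shift == 10), each page covers blocks == 4
++ * filesystem blocks, and page index N starts at logical block
++ * lblock == N * 4. Since gfs_block_map() can map a multi-block extent in
++ * one call, the mapping loop advances by extlen rather than one block at
++ * a time.
++ */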
++
++static int
++alloc_page_backing(struct gfs_inode *ip, unsigned long index)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t lblock = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift);
++ unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
++ struct gfs_alloc *al;
++ unsigned int x;
++ int error;
++
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto out;
++
++ error = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (error)
++ goto out_gunlock_q;
++
++ gfs_write_calc_reserv(ip, PAGE_CACHE_SIZE,
++ &al->al_requested_data, &al->al_requested_meta);
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto out_gunlock_q;
++
++ /* Trans may require:
++ a dinode block, RG bitmaps to allocate from,
++ indirect blocks, and a quota block */
++
++ error = gfs_trans_begin(sdp,
++ 1 + al->al_rgd->rd_ri.ri_length +
++ al->al_requested_meta, 1);
++ if (error)
++ goto out_ipres;
++
++ if (gfs_is_stuffed(ip)) {
++ error = gfs_unstuff_dinode(ip, gfs_unstuffer_page, NULL);
++ if (error)
++ goto out_trans;
++ }
++
++ for (x = 0; x < blocks; ) {
++ uint64_t dblock;
++ unsigned int extlen;
++ int new = TRUE;
++
++ error = gfs_block_map(ip, lblock, &new, &dblock, &extlen);
++ if (error)
++ goto out_trans;
++ GFS_ASSERT_INODE(dblock, ip,);
++
++ lblock += extlen;
++ x += extlen;
++ }
++
++ GFS_ASSERT_INODE(al->al_alloced_meta || al->al_alloced_data, ip,);
++
++ out_trans:
++ gfs_trans_end(sdp);
++
++ out_ipres:
++ gfs_inplace_release(ip);
++
++ out_gunlock_q:
++ gfs_quota_unlock_m(ip);
++
++ out:
++ gfs_alloc_put(ip);
++
++ return error;
++}
++
++/**
++ * gfs_sharewrite_nopage - fault in a page for a shared-writable mapping
++ * @area: the VM area the fault occurred in
++ * @address: the faulting address
++ * @type: the type of fault, filled in by filemap_nopage()
++ *
++ * Returns: the page, or NULL on error
++ */
++
++static struct page *
++gfs_sharewrite_nopage(struct vm_area_struct *area,
++ unsigned long address, int *type)
++{
++ struct gfs_inode *ip = vn2ip(area->vm_file->f_mapping->host);
++ struct gfs_holder i_gh;
++ struct page *result = NULL;
++ unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
++ int alloc_required;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_vm);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (error)
++ return NULL;
++
++ if (gfs_is_jdata(ip))
++ goto out;
++
++ set_bit(GIF_PAGED, &ip->i_flags);
++ set_bit(GIF_SW_PAGED, &ip->i_flags);
++
++ error = gfs_write_alloc_required(ip, (uint64_t)index << PAGE_CACHE_SHIFT,
++ PAGE_CACHE_SIZE, &alloc_required);
++ if (error)
++ goto out;
++
++ result = filemap_nopage(area, address, type);
++ if (!result || result == NOPAGE_OOM)
++ goto out;
++
++ if (alloc_required) {
++ error = alloc_page_backing(ip, index);
++ if (error) {
++ page_cache_release(result);
++ result = NULL;
++ goto out;
++ }
++ set_page_dirty(result);
++ }
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return result;
++}
++
++struct vm_operations_struct gfs_vm_ops_private = {
++ .nopage = gfs_private_nopage,
++};
++
++struct vm_operations_struct gfs_vm_ops_sharewrite = {
++ .nopage = gfs_sharewrite_nopage,
++};
++
+diff -urN linux-orig/fs/gfs/ops_vm.h linux-patched/fs/gfs/ops_vm.h
+--- linux-orig/fs/gfs/ops_vm.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_vm.h 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_VM_DOT_H__
++#define __OPS_VM_DOT_H__
++
++extern struct vm_operations_struct gfs_vm_ops_private;
++extern struct vm_operations_struct gfs_vm_ops_sharewrite;
++
++#endif /* __OPS_VM_DOT_H__ */
+diff -urN linux-orig/fs/gfs/page.c linux-patched/fs/gfs/page.c
+--- linux-orig/fs/gfs/page.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/page.c 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,276 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/pagemap.h>
++#include <linux/mm.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "inode.h"
++#include "page.h"
++
++/**
++ * gfs_inval_pte - Sync and invalidate all PTEs associated with a glock
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_inval_pte(struct gfs_glock *gl)
++{
++ struct gfs_inode *ip;
++ struct inode *inode;
++
++ ip = gl2ip(gl);
++ if (!ip ||
++ ip->i_di.di_type != GFS_FILE_REG)
++ return;
++
++ if (!test_bit(GIF_PAGED, &ip->i_flags))
++ return;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (inode) {
++ unmap_shared_mapping_range(inode->i_mapping, 0, 0);
++ iput(inode);
++
++ if (test_bit(GIF_SW_PAGED, &ip->i_flags))
++ set_bit(GLF_DIRTY, &gl->gl_flags);
++ }
++
++ clear_bit(GIF_SW_PAGED, &ip->i_flags);
++}
++
++/**
++ * gfs_inval_page - Invalidate all pages associated with a glock
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_inval_page(struct gfs_glock *gl)
++{
++ struct gfs_inode *ip;
++ struct inode *inode;
++
++ ip = gl2ip(gl);
++ if (!ip ||
++ ip->i_di.di_type != GFS_FILE_REG)
++ return;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (inode) {
++ struct address_space *mapping = inode->i_mapping;
++
++ truncate_inode_pages(mapping, 0);
++ GFS_ASSERT_INODE(!mapping->nrpages, ip,);
++
++ iput(inode);
++ }
++
++ clear_bit(GIF_PAGED, &ip->i_flags);
++}
++
++/**
++ * gfs_sync_page_i - Sync the pages for a struct inode
++ * @inode: the inode
++ * @flags: DIO_START | DIO_WAIT
++ *
++ */
++
++void
++gfs_sync_page_i(struct inode *inode, int flags)
++{
++ struct address_space *mapping = inode->i_mapping;
++ int error = 0;
++
++ if (flags & DIO_START)
++ error = filemap_fdatawrite(mapping);
++ if (!error && (flags & DIO_WAIT))
++ filemap_fdatawait(mapping);
++
++ if (error)
++ gfs_io_error_inode(vn2ip(inode));
++}
++
++/**
++ * gfs_sync_page - sync the pages associated with a glock
++ * @gl: the glock
++ * @flags: DIO_START | DIO_WAIT
++ *
++ */
++
++void
++gfs_sync_page(struct gfs_glock *gl, int flags)
++{
++ struct gfs_inode *ip;
++ struct inode *inode;
++
++ ip = gl2ip(gl);
++ if (!ip ||
++ ip->i_di.di_type != GFS_FILE_REG)
++ return;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (inode) {
++ gfs_sync_page_i(inode, flags);
++ iput(inode);
++ }
++}
++
++/**
++ * gfs_unstuffer_page - unstuff a stuffed inode into a block cached by a page
++ * @ip: the inode
++ * @dibh: the dinode buffer
++ * @block: the block number that was allocated
++ * @private: any locked page held by the caller process
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
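++/*
++ * A "stuffed" dinode holds its file data in the dinode block itself,
++ * directly after the struct gfs_dinode header. Unstuffing copies those
++ * bytes into page 0 of the inode's address space, zeroes the rest of the
++ * page, and maps the page's buffer to the newly allocated block @block so
++ * that writeback lands in the right place.
++ */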
++
++int
++gfs_unstuffer_page(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private)
++{
++ struct inode *inode = ip->i_vnode;
++ struct page *page = (struct page *)private;
++ struct buffer_head *bh;
++ int release = FALSE;
++
++ if (!page || page->index) {
++ RETRY_MALLOC(page = grab_cache_page(inode->i_mapping, 0), page);
++ release = TRUE;
++ }
++
++ GFS_ASSERT_INODE(PageLocked(page), ip,);
++
++ if (!PageUptodate(page)) {
++ void *kaddr = kmap(page);
++
++ memcpy(kaddr,
++ dibh->b_data + sizeof(struct gfs_dinode),
++ ip->i_di.di_size);
++ memset(kaddr + ip->i_di.di_size,
++ 0,
++ PAGE_CACHE_SIZE - ip->i_di.di_size);
++ kunmap(page);
++
++ SetPageUptodate(page);
++ }
++
++ if (!page_has_buffers(page))
++ create_empty_buffers(page, 1 << inode->i_blkbits,
++ (1 << BH_Uptodate));
++
++ bh = page_buffers(page);
++
++ if (!buffer_mapped(bh))
++ map_bh(bh, inode->i_sb, block);
++ else
++ GFS_ASSERT_INODE(bh->b_bdev == inode->i_sb->s_bdev &&
++ bh->b_blocknr == block,
++ ip,);
++
++ set_buffer_uptodate(bh);
++ mark_buffer_dirty(bh);
++
++ if (release) {
++ unlock_page(page);
++ page_cache_release(page);
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_truncator_page - truncate a partial data block in the page cache
++ * @ip: the inode
++ * @size: the size the file should be
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_truncator_page(struct gfs_inode *ip, uint64_t size)
++{
++ struct inode *inode = ip->i_vnode;
++ struct page *page;
++ struct buffer_head *bh;
++ void *kaddr;
++ uint64_t lbn, dbn;
++ unsigned long index;
++ unsigned int offset;
++ unsigned int bufnum;
++ int not_new = 0;
++ int error;
++
++ lbn = size >> inode->i_blkbits;
++ error = gfs_block_map(ip,
++ lbn, &not_new,
++ &dbn, NULL);
++ if (error || !dbn)
++ return error;
++
++ index = size >> PAGE_CACHE_SHIFT;
++ offset = size & (PAGE_CACHE_SIZE - 1);
++ bufnum = lbn - (index << (PAGE_CACHE_SHIFT - inode->i_blkbits));
++
++ /* Not in a transaction here -- a non-disk-I/O error is ok. */
++
++ page = read_cache_page(inode->i_mapping, index,
++ (filler_t *)inode->i_mapping->a_ops->readpage,
++ NULL);
++ if (IS_ERR(page))
++ return PTR_ERR(page);
++
++ lock_page(page);
++
++ if (!PageUptodate(page) || PageError(page)) {
++ error = -EIO;
++ goto out;
++ }
++
++ kaddr = kmap(page);
++ memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
++ kunmap(page);
++
++ if (!page_has_buffers(page))
++ create_empty_buffers(page, 1 << inode->i_blkbits,
++ (1 << BH_Uptodate));
++
++ for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
++ /* Do nothing */;
++
++ if (!buffer_mapped(bh))
++ map_bh(bh, inode->i_sb, dbn);
++ else
++ GFS_ASSERT_INODE(bh->b_bdev == inode->i_sb->s_bdev &&
++ bh->b_blocknr == dbn,
++ ip,);
++
++ set_buffer_uptodate(bh);
++ mark_buffer_dirty(bh);
++
++ out:
++ unlock_page(page);
++ page_cache_release(page);
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/page.h linux-patched/fs/gfs/page.h
+--- linux-orig/fs/gfs/page.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/page.h 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,26 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __PAGE_DOT_H__
++#define __PAGE_DOT_H__
++
++void gfs_inval_pte(struct gfs_glock *gl);
++void gfs_inval_page(struct gfs_glock *gl);
++void gfs_sync_page_i(struct inode *inode, int flags);
++void gfs_sync_page(struct gfs_glock *gl, int flags);
++
++int gfs_unstuffer_page(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private);
++int gfs_truncator_page(struct gfs_inode *ip, uint64_t size);
++
++#endif /* __PAGE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/quota.c linux-patched/fs/gfs/quota.c
+--- linux-orig/fs/gfs/quota.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/quota.c 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,1146 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/tty.h>
++#include <asm/uaccess.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "file.h"
++#include "glock.h"
++#include "glops.h"
++#include "log.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "super.h"
++#include "trans.h"
++
++/**
++ * gfs_quota_get - Get a structure to represent a quota change
++ * @sdp: the filesystem
++ * @user: TRUE if this is a user quota
++ * @id: the uid or gid
++ * @create: if TRUE, create the structure, otherwise return NULL
++ * @qdp: the returned quota structure
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
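++/*
++ * The loop below exists because the new structure must be allocated
++ * outside sd_quota_lock (gmalloc and gfs_glock_get can sleep). After
++ * allocating new_qd, the list is searched again; if another process added
++ * the same ID in the meantime, the existing entry wins and new_qd is
++ * freed, otherwise new_qd is inserted on the next pass.
++ */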
++
++int
++gfs_quota_get(struct gfs_sbd *sdp, int user, uint32_t id, int create,
++ struct gfs_quota_data **qdp)
++{
++ struct gfs_quota_data *qd = NULL, *new_qd = NULL;
++ struct list_head *tmp, *head;
++ int error = 0;
++
++ for (;;) {
++ spin_lock(&sdp->sd_quota_lock);
++
++ for (head = &sdp->sd_quota_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ qd = list_entry(tmp, struct gfs_quota_data, qd_list);
++ if (qd->qd_id == id &&
++ !test_bit(QDF_USER, &qd->qd_flags) == !user) {
++ qd->qd_count++;
++ break;
++ }
++ }
++
++ if (tmp == head)
++ qd = NULL;
++
++ if (!qd && new_qd) {
++ qd = new_qd;
++ list_add(&qd->qd_list, &sdp->sd_quota_list);
++ new_qd = NULL;
++ }
++
++ spin_unlock(&sdp->sd_quota_lock);
++
++ if (qd || !create) {
++ if (new_qd) {
++ gfs_lvb_unhold(new_qd->qd_gl);
++ kfree(new_qd);
++ atomic_dec(&sdp->sd_quota_count);
++ }
++ goto out;
++ }
++
++ new_qd = gmalloc(sizeof(struct gfs_quota_data));
++ memset(new_qd, 0, sizeof(struct gfs_quota_data));
++
++ new_qd->qd_count = 1;
++
++ new_qd->qd_id = id;
++ if (user)
++ set_bit(QDF_USER, &new_qd->qd_flags);
++
++ INIT_LIST_HEAD(&new_qd->qd_le_list);
++
++ error = gfs_glock_get(sdp, 2 * (uint64_t)id + ((user) ? 0 : 1),
++ &gfs_quota_glops, CREATE,
++ &new_qd->qd_gl);
++ if (error) {
++ kfree(new_qd);
++ goto out;
++ }
++
++ error = gfs_lvb_hold(new_qd->qd_gl);
++
++ gfs_glock_put(new_qd->qd_gl);
++
++ if (error) {
++ kfree(new_qd);
++ goto out;
++ }
++
++ atomic_inc(&sdp->sd_quota_count);
++ }
++
++ out:
++ *qdp = qd;
++
++ return error;
++}
++
++/**
++ * gfs_quota_hold - increment the usage count on a struct gfs_quota_data
++ * @sdp: the filesystem
++ * @qd: the structure
++ *
++ */
++
++void
++gfs_quota_hold(struct gfs_sbd *sdp, struct gfs_quota_data *qd)
++{
++ spin_lock(&sdp->sd_quota_lock);
++ qd->qd_count++;
++ spin_unlock(&sdp->sd_quota_lock);
++}
++
++/**
++ * gfs_quota_put - decrement the usage count on a struct gfs_quota_data
++ * @sdp: the filesystem
++ * @qd: the structure
++ *
++ * Free the structure if its reference count hits zero.
++ *
++ */
++
++void
++gfs_quota_put(struct gfs_sbd *sdp, struct gfs_quota_data *qd)
++{
++ spin_lock(&sdp->sd_quota_lock);
++ GFS_ASSERT_SBD(qd->qd_count, sdp,);
++ qd->qd_count--;
++ spin_unlock(&sdp->sd_quota_lock);
++}
++
++/**
++ * quota_find - Find a quota change to sync to the quota file
++ * @sdp: the filesystem
++ *
++ * The returned structure is locked and needs to be unlocked
++ * with quota_unlock().
++ *
++ * Returns: A quota structure, or NULL
++ */
++
++static struct gfs_quota_data *
++quota_find(struct gfs_sbd *sdp)
++{
++ struct list_head *tmp, *head;
++ struct gfs_quota_data *qd = NULL;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags))
++ return NULL;
++
++ gfs_log_lock(sdp);
++ spin_lock(&sdp->sd_quota_lock);
++
++ if (!atomic_read(&sdp->sd_quota_od_count))
++ goto out;
++
++ for (head = &sdp->sd_quota_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ qd = list_entry(tmp, struct gfs_quota_data, qd_list);
++
++ if (test_bit(QDF_LOCK, &qd->qd_flags))
++ continue;
++ if (!test_bit(QDF_OD_LIST, &qd->qd_flags))
++ continue;
++ if (qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
++ continue;
++
++ list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
++
++ set_bit(QDF_LOCK, &qd->qd_flags);
++ qd->qd_count++;
++ qd->qd_change_sync = qd->qd_change_od;
++
++ goto out;
++ }
++
++ qd = NULL;
++
++ out:
++ spin_unlock(&sdp->sd_quota_lock);
++ gfs_log_unlock(sdp);
++
++ return qd;
++}
++
++/**
++ * quota_trylock - Try to lock a given quota entry
++ * @sdp: the filesystem
++ * @qd: the quota data structure
++ *
++ * Returns: TRUE if the lock was successful, FALSE otherwise
++ */
++
++static int
++quota_trylock(struct gfs_sbd *sdp, struct gfs_quota_data *qd)
++{
++ int ret = FALSE;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags))
++ return FALSE;
++
++ gfs_log_lock(sdp);
++ spin_lock(&sdp->sd_quota_lock);
++
++ if (test_bit(QDF_LOCK, &qd->qd_flags))
++ goto out;
++ if (!test_bit(QDF_OD_LIST, &qd->qd_flags))
++ goto out;
++
++ list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
++
++ set_bit(QDF_LOCK, &qd->qd_flags);
++ qd->qd_count++;
++ qd->qd_change_sync = qd->qd_change_od;
++
++ ret = TRUE;
++
++ out:
++ spin_unlock(&sdp->sd_quota_lock);
++ gfs_log_unlock(sdp);
++
++ return ret;
++}
++
++/**
++ * quota_unlock - unlock and drop a reference on a quota structure
++ * @sdp: the filesystem
++ * @qd: the quota data structure
++ *
++ */
++
++static void
++quota_unlock(struct gfs_sbd *sdp, struct gfs_quota_data *qd)
++{
++ spin_lock(&sdp->sd_quota_lock);
++
++ GFS_ASSERT_SBD(test_bit(QDF_LOCK, &qd->qd_flags), sdp,);
++ clear_bit(QDF_LOCK, &qd->qd_flags);
++
++ GFS_ASSERT_SBD(qd->qd_count, sdp,);
++ qd->qd_count--;
++
++ spin_unlock(&sdp->sd_quota_lock);
++}
++
++/**
++ * gfs_quota_merge - add/remove a quota change from the in-memory list
++ * @sdp: the filesystem
++ * @tag: the quota change tag
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
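++/*
++ * A quota_data with a nonzero on-disk change (qd_change_od) must remain
++ * visible to the sync code, so it sits on the on-disk (OD) list and holds
++ * an extra reference. When a merge brings qd_change_od back to zero, the
++ * entry leaves the list and that reference is dropped.
++ */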
++
++int
++gfs_quota_merge(struct gfs_sbd *sdp, struct gfs_quota_tag *tag)
++{
++ struct gfs_quota_data *qd;
++ int error;
++
++ error = gfs_quota_get(sdp,
++ tag->qt_flags & GFS_QTF_USER, tag->qt_id,
++ CREATE, &qd);
++ if (error)
++ return error;
++
++ GFS_ASSERT_SBD(qd->qd_change_ic == qd->qd_change_od, sdp,);
++
++ gfs_log_lock(sdp);
++
++ qd->qd_change_ic += tag->qt_change;
++ qd->qd_change_od += tag->qt_change;
++
++ if (qd->qd_change_od) {
++ if (!test_bit(QDF_OD_LIST, &qd->qd_flags)) {
++ gfs_quota_hold(sdp, qd);
++ set_bit(QDF_OD_LIST, &qd->qd_flags);
++ atomic_inc(&sdp->sd_quota_od_count);
++ }
++ } else {
++ GFS_ASSERT_SBD(test_bit(QDF_OD_LIST, &qd->qd_flags), sdp,);
++ clear_bit(QDF_OD_LIST, &qd->qd_flags);
++ gfs_quota_put(sdp, qd);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_quota_od_count), sdp,);
++ atomic_dec(&sdp->sd_quota_od_count);
++ }
++
++ gfs_log_unlock(sdp);
++
++ gfs_quota_put(sdp, qd);
++
++ return 0;
++}
++
++/**
++ * gfs_quota_scan - Look for unused struct gfs_quota_data structures to throw away
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_quota_scan(struct gfs_sbd *sdp)
++{
++ struct list_head *head, *tmp, *next;
++ struct gfs_quota_data *qd;
++ LIST_HEAD(dead);
++
++ spin_lock(&sdp->sd_quota_lock);
++
++ for (head = &sdp->sd_quota_list, tmp = head->next, next = tmp->next;
++ tmp != head;
++ tmp = next, next = next->next) {
++ qd = list_entry(tmp, struct gfs_quota_data, qd_list);
++ if (!qd->qd_count)
++ list_move(&qd->qd_list, &dead);
++ }
++
++ spin_unlock(&sdp->sd_quota_lock);
++
++ while (!list_empty(&dead)) {
++ qd = list_entry(dead.next, struct gfs_quota_data, qd_list);
++
++ GFS_ASSERT_SBD(!qd->qd_count, sdp,);
++ GFS_ASSERT_SBD(!test_bit(QDF_OD_LIST, &qd->qd_flags) &&
++ !test_bit(QDF_LOCK, &qd->qd_flags), sdp,);
++ GFS_ASSERT_SBD(!qd->qd_change_new && !qd->qd_change_ic &&
++ !qd->qd_change_od, sdp,);
++
++ list_del(&qd->qd_list);
++ gfs_lvb_unhold(qd->qd_gl);
++ kfree(qd);
++ atomic_dec(&sdp->sd_quota_count);
++ }
++}
++
++/**
++ * gfs_quota_cleanup - get rid of any extra struct gfs_quota_data structures
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_quota_cleanup(struct gfs_sbd *sdp)
++{
++ struct gfs_quota_data *qd;
++
++ restart:
++ gfs_log_lock(sdp);
++
++ spin_lock(&sdp->sd_quota_lock);
++
++ while (!list_empty(&sdp->sd_quota_list)) {
++ qd = list_entry(sdp->sd_quota_list.next,
++ struct gfs_quota_data,
++ qd_list);
++
++ if (qd->qd_count > 1) {
++ spin_unlock(&sdp->sd_quota_lock);
++ gfs_log_unlock(sdp);
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ schedule_timeout(HZ);
++ goto restart;
++
++ } else if (qd->qd_count) {
++ GFS_ASSERT_SBD(test_bit(QDF_OD_LIST, &qd->qd_flags) &&
++ !test_bit(QDF_LOCK, &qd->qd_flags),
++ sdp,);
++ GFS_ASSERT_SBD(qd->qd_change_od &&
++ qd->qd_change_od == qd->qd_change_ic,
++ sdp,);
++ GFS_ASSERT_SBD(!qd->qd_change_new, sdp,);
++
++ list_del(&qd->qd_list);
++ atomic_dec(&sdp->sd_quota_od_count);
++
++ spin_unlock(&sdp->sd_quota_lock);
++ gfs_lvb_unhold(qd->qd_gl);
++ kfree(qd);
++ atomic_dec(&sdp->sd_quota_count);
++ spin_lock(&sdp->sd_quota_lock);
++
++ } else {
++ GFS_ASSERT_SBD(!test_bit(QDF_OD_LIST, &qd->qd_flags) &&
++ !test_bit(QDF_LOCK, &qd->qd_flags), sdp,);
++ GFS_ASSERT_SBD(!qd->qd_change_new &&
++ !qd->qd_change_ic &&
++ !qd->qd_change_od, sdp,);
++
++ list_del(&qd->qd_list);
++
++ spin_unlock(&sdp->sd_quota_lock);
++ gfs_lvb_unhold(qd->qd_gl);
++ kfree(qd);
++ atomic_dec(&sdp->sd_quota_count);
++ spin_lock(&sdp->sd_quota_lock);
++ }
++ }
++
++ spin_unlock(&sdp->sd_quota_lock);
++
++ GFS_ASSERT_SBD(!atomic_read(&sdp->sd_quota_od_count), sdp,);
++
++ gfs_log_unlock(sdp);
++}
++
++/**
++ * sort_qd - figure out the order between two quota data structures
++ * @a: first quota data structure
++ * @b: second quota data structure
++ *
++ * Returns: -1 if @a comes before @b, 0 if @a equals @b, 1 if @b comes before @a
++ */
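++/*
++ * The resulting order puts all user quotas ahead of all group quotas,
++ * with ties broken by ascending ID, e.g.
++ * (user, 0) < (user, 500) < (group, 0) < (group, 500).
++ * Sorting into one canonical order before acquiring the glocks means
++ * every process locks quota entries in the same sequence, avoiding
++ * deadlock.
++ */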
++
++static int
++sort_qd(void *a, void *b)
++{
++ struct gfs_quota_data *qd_a = *(struct gfs_quota_data **)a;
++ struct gfs_quota_data *qd_b = *(struct gfs_quota_data **)b;
++ int ret = 0;
++
++ if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
++ !test_bit(QDF_USER, &qd_b->qd_flags)) {
++ if (test_bit(QDF_USER, &qd_a->qd_flags))
++ ret = -1;
++ else
++ ret = 1;
++ } else {
++ if (qd_a->qd_id < qd_b->qd_id)
++ ret = -1;
++ else if (qd_a->qd_id > qd_b->qd_id)
++ ret = 1;
++ }
++
++ return ret;
++}
++
++/**
++ * do_quota_sync - Sync a bunch of quota changes to the quota file
++ * @sdp: the filesystem
++ * @qda: an array of struct gfs_quota_data structures to be synced
++ * @num_qd: the number of elements in @qda
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++do_quota_sync(struct gfs_sbd *sdp, struct gfs_quota_data **qda,
++ unsigned int num_qd)
++{
++ struct gfs_inode *ip = sdp->sd_qinode;
++ struct gfs_alloc *al = NULL;
++ struct gfs_holder i_gh, *ghs;
++ struct gfs_quota q;
++ char buf[sizeof(struct gfs_quota)];
++ uint64_t offset;
++ unsigned int qx, x;
++ int ar;
++ unsigned int nalloc = 0;
++ unsigned int data_blocks, ind_blocks;
++ int error;
++
++ gfs_write_calc_reserv(ip, sizeof(struct gfs_quota), &data_blocks,
++ &ind_blocks);
++
++ ghs = gmalloc(num_qd * sizeof(struct gfs_holder));
++
++ gfs_sort(qda, num_qd, sizeof (struct gfs_quota_data *), sort_qd);
++ for (qx = 0; qx < num_qd; qx++) {
++ error = gfs_glock_nq_init(qda[qx]->qd_gl,
++ LM_ST_EXCLUSIVE,
++ GL_NOCACHE, &ghs[qx]);
++ if (error)
++ goto fail;
++ }
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (error)
++ goto fail;
++
++ for (x = 0; x < num_qd; x++) {
++ offset = (2 * (uint64_t)qda[x]->qd_id +
++ ((test_bit(QDF_USER, &qda[x]->qd_flags)) ? 0 : 1)) *
++ sizeof(struct gfs_quota);
++
++ error = gfs_write_alloc_required(ip, offset,
++ sizeof(struct gfs_quota),
++ &ar);
++ if (error)
++ goto fail_gunlock;
++
++ if (ar)
++ nalloc++;
++ }
++
++ if (nalloc) {
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_hold_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail_alloc;
++
++ al->al_requested_meta = nalloc * (data_blocks + ind_blocks);
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto fail_qs;
++
++ /* Trans may require:
++ two (journaled) data blocks, a dinode block, RG bitmaps to allocate from,
++ indirect blocks, and a quota block */
++
++ error = gfs_trans_begin(sdp,
++ 1 + al->al_rgd->rd_ri.ri_length +
++ num_qd * data_blocks +
++ nalloc * ind_blocks,
++ gfs_struct2blk(sdp, num_qd + 2,
++ sizeof(struct gfs_quota_tag)));
++ if (error)
++ goto fail_ipres;
++ } else {
++ /* Trans may require:
++ Data blocks, a dinode block, and quota blocks */
++
++ error = gfs_trans_begin(sdp,
++ 1 + data_blocks * num_qd,
++ gfs_struct2blk(sdp, num_qd,
++ sizeof(struct gfs_quota_tag)));
++ if (error)
++ goto fail_gunlock;
++ }
++
++ for (x = 0; x < num_qd; x++) {
++ offset = (2 * (uint64_t)qda[x]->qd_id +
++ ((test_bit(QDF_USER, &qda[x]->qd_flags)) ? 0 : 1)) *
++ sizeof(struct gfs_quota);
++
++ /* The quota file may not be a multiple of sizeof(struct gfs_quota) bytes. */
++ memset(buf, 0, sizeof(struct gfs_quota));
++
++ error = gfs_internal_read(ip, buf, offset,
++ sizeof(struct gfs_quota));
++ if (error < 0)
++ goto fail_end_trans;
++
++ gfs_quota_in(&q, buf);
++ q.qu_value += qda[x]->qd_change_sync;
++ gfs_quota_out(&q, buf);
++
++ error = gfs_internal_write(ip, buf, offset,
++ sizeof(struct gfs_quota));
++ if (error < 0)
++ goto fail_end_trans;
++ else if (error != sizeof(struct gfs_quota)) {
++ error = -EIO;
++ goto fail_end_trans;
++ }
++
++ if (test_bit(QDF_USER, &qda[x]->qd_flags))
++ gfs_trans_add_quota(sdp, -qda[x]->qd_change_sync,
++ qda[x]->qd_id, NO_QUOTA_CHANGE);
++ else
++ gfs_trans_add_quota(sdp, -qda[x]->qd_change_sync,
++ NO_QUOTA_CHANGE, qda[x]->qd_id);
++
++ memset(&qda[x]->qd_qb, 0, sizeof(struct gfs_quota_lvb));
++ qda[x]->qd_qb.qb_magic = GFS_MAGIC;
++ qda[x]->qd_qb.qb_limit = q.qu_limit;
++ qda[x]->qd_qb.qb_warn = q.qu_warn;
++ qda[x]->qd_qb.qb_value = q.qu_value;
++
++ gfs_quota_lvb_out(&qda[x]->qd_qb, qda[x]->qd_gl->gl_lvb);
++ clear_bit(GLF_LVB_INVALID, &qda[x]->qd_gl->gl_flags);
++ }
++
++ gfs_trans_end(sdp);
++
++ if (nalloc) {
++ GFS_ASSERT_SBD(al->al_alloced_meta, sdp,);
++ gfs_inplace_release(ip);
++ gfs_quota_unhold_m(ip);
++ gfs_alloc_put(ip);
++ }
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ for (x = 0; x < num_qd; x++)
++ gfs_glock_dq_uninit(&ghs[x]);
++
++ kfree(ghs);
++
++ gfs_log_flush_glock(ip->i_gl);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ if (nalloc)
++ gfs_inplace_release(ip);
++
++ fail_qs:
++ if (nalloc)
++ gfs_quota_unhold_m(ip);
++
++ fail_alloc:
++ if (nalloc)
++ gfs_alloc_put(ip);
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&i_gh);
++
++ fail:
++ while (qx--)
++ gfs_glock_dq_uninit(&ghs[qx]);
++
++ kfree(ghs);
++
++ return error;
++}
++
++/**
++ * glock_q - Acquire a lock for a quota entry
++ * @sdp: the filesystem
++ * @qd: the quota data structure to glock
++ * @force_refresh: If TRUE, always read from the quota file
++ * @q_gh: the glock holder for the quota lock
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
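++/*
++ * Locking pattern used below: the lock value block (LVB) caches the
++ * limit/warn/value triple cluster-wide, so the common case is a shared
++ * lock plus an LVB read. If the LVB is stale (wrong magic, marked
++ * invalid, or @force_refresh), the lock is upgraded to exclusive, the
++ * authoritative values are read from the quota file and published back
++ * into the LVB, and the function loops to retry the cheap shared path.
++ */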
++
++static int
++glock_q(struct gfs_sbd *sdp, struct gfs_quota_data *qd, int force_refresh,
++ struct gfs_holder *q_gh)
++{
++ struct gfs_holder i_gh;
++ struct gfs_quota q;
++ char buf[sizeof(struct gfs_quota)];
++ int error;
++
++ restart:
++ error = gfs_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
++ if (error)
++ return error;
++
++ gfs_quota_lvb_in(&qd->qd_qb, qd->qd_gl->gl_lvb);
++
++ if (force_refresh ||
++ qd->qd_qb.qb_magic != GFS_MAGIC ||
++ test_bit(GLF_LVB_INVALID, &qd->qd_gl->gl_flags)) {
++ gfs_glock_dq_uninit(q_gh);
++ error = gfs_glock_nq_init(qd->qd_gl,
++ LM_ST_EXCLUSIVE, GL_NOCACHE,
++ q_gh);
++ if (error)
++ return error;
++
++ error = gfs_glock_nq_init(sdp->sd_qinode->i_gl,
++ LM_ST_SHARED, 0,
++ &i_gh);
++ if (error)
++ goto fail;
++
++ memset(buf, 0, sizeof(struct gfs_quota));
++
++ error = gfs_internal_read(sdp->sd_qinode, buf,
++ (2 * (uint64_t)qd->qd_id +
++ ((test_bit(QDF_USER, &qd->qd_flags)) ? 0 : 1)) *
++ sizeof(struct gfs_quota),
++ sizeof(struct gfs_quota));
++ if (error < 0)
++ goto fail_gunlock;
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ gfs_quota_in(&q, buf);
++
++ memset(&qd->qd_qb, 0, sizeof(struct gfs_quota_lvb));
++ qd->qd_qb.qb_magic = GFS_MAGIC;
++ qd->qd_qb.qb_limit = q.qu_limit;
++ qd->qd_qb.qb_warn = q.qu_warn;
++ qd->qd_qb.qb_value = q.qu_value;
++
++ gfs_quota_lvb_out(&qd->qd_qb, qd->qd_gl->gl_lvb);
++ clear_bit(GLF_LVB_INVALID, &qd->qd_gl->gl_flags);
++
++ gfs_glock_dq_uninit(q_gh);
++ force_refresh = FALSE;
++ goto restart;
++ }
++
++ return 0;
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&i_gh);
++
++ fail:
++ gfs_glock_dq_uninit(q_gh);
++
++ return error;
++}
++
++/**
++ * gfs_quota_hold_m - Hold the quota structures for up to 4 IDs
++ * @ip: Two of the IDs are the UID and GID from this file
++ * @uid: a UID or the constant NO_QUOTA_CHANGE
++ * @gid: a GID or the constant NO_QUOTA_CHANGE
++ *
++ * The struct gfs_quota_data structures representing the locks are
++ * stored in the ip->i_alloc->al_qd array.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_quota_hold_m(struct gfs_inode *ip, uint32_t uid, uint32_t gid)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ unsigned int x = 0;
++ int error;
++
++ GFS_ASSERT_INODE(al && !al->al_qd_num &&
++ !test_bit(GIF_QD_LOCKED, &ip->i_flags), ip,);
++
++ if (!sdp->sd_tune.gt_quota_account)
++ return 0;
++
++ error = gfs_quota_get(sdp, TRUE, ip->i_di.di_uid,
++ CREATE, &al->al_qd[x]);
++ if (error)
++ goto fail;
++ x++;
++
++ error = gfs_quota_get(sdp, FALSE, ip->i_di.di_gid,
++ CREATE, &al->al_qd[x]);
++ if (error)
++ goto fail;
++ x++;
++
++ if (uid != NO_QUOTA_CHANGE) {
++ error = gfs_quota_get(sdp, TRUE, uid,
++ CREATE, &al->al_qd[x]);
++ if (error)
++ goto fail;
++ x++;
++ }
++
++ if (gid != NO_QUOTA_CHANGE) {
++ error = gfs_quota_get(sdp, FALSE, gid,
++ CREATE, &al->al_qd[x]);
++ if (error)
++ goto fail;
++ x++;
++ }
++
++ al->al_qd_num = x;
++
++ return 0;
++
++ fail:
++ if (x) {
++ al->al_qd_num = x;
++ gfs_quota_unhold_m(ip);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_quota_unhold_m - throw away some quota locks
++ * @ip: the inode whose ip->i_alloc->al_qd array holds the structures
++ *
++ */
++
++void
++gfs_quota_unhold_m(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ unsigned int x;
++
++ GFS_ASSERT_INODE(al &&
++ !test_bit(GIF_QD_LOCKED, &ip->i_flags), ip,);
++
++ for (x = 0; x < al->al_qd_num; x++) {
++ gfs_quota_put(sdp, al->al_qd[x]);
++ al->al_qd[x] = NULL;
++ }
++ al->al_qd_num = 0;
++}
++
++/**
++ * gfs_quota_lock_m - Acquire the quota locks for up to 4 IDs
++ * @ip: Two of the IDs are the UID and GID from this file
++ * @uid: a UID or the constant NO_QUOTA_CHANGE
++ * @gid: a GID or the constant NO_QUOTA_CHANGE
++ *
++ * The struct gfs_quota_data structures representing the locks are
++ * stored in the ip->i_alloc->al_qd array.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_quota_lock_m(struct gfs_inode *ip, uint32_t uid, uint32_t gid)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ unsigned int x;
++ int error;
++
++ error = gfs_quota_hold_m(ip, uid, gid);
++ if (error)
++ return error;
++
++ if (!sdp->sd_tune.gt_quota_enforce)
++ return 0;
++ if (capable(CAP_SYS_RESOURCE))
++ return 0;
++
++ gfs_sort(al->al_qd, al->al_qd_num,
++ sizeof(struct gfs_quota_data *), sort_qd);
++
++ for (x = 0; x < al->al_qd_num; x++) {
++ error = glock_q(sdp, al->al_qd[x], FALSE, &al->al_qd_ghs[x]);
++ if (error)
++ goto fail;
++ }
++
++ set_bit(GIF_QD_LOCKED, &ip->i_flags);
++
++ return 0;
++
++ fail:
++ while (x--)
++ gfs_glock_dq_uninit(&al->al_qd_ghs[x]);
++
++ return error;
++}
++
++/**
++ * gfs_quota_unlock_m - drop some quota locks
++ * @ip: the inode whose ip->i_alloc->al_qd array holds the locks
++ *
++ */
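++/*
++ * The do_sync heuristic below estimates the cluster-wide usage as
++ *
++ * v = qb_value + local_change * num_journals * scale_num / scale_den
++ *
++ * and syncs this node's change to the quota file once v reaches the
++ * limit. For example: limit 1000, qb_value 900, a local change of 30,
++ * 4 journals, and a 1/1 scale give v = 900 + 30 * 4 = 1020 >= 1000, so
++ * the change is synced; with a single journal, v = 930 and the sync is
++ * deferred.
++ */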
++
++void
++gfs_quota_unlock_m(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ struct gfs_quota_data *qd, *qda[4];
++ int64_t value;
++ unsigned int count = 0;
++ unsigned int x;
++ int do_sync;
++
++ if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
++ goto out;
++
++ for (x = 0; x < al->al_qd_num; x++) {
++ qd = al->al_qd[x];
++
++ spin_lock(&sdp->sd_quota_lock);
++ value = qd->qd_change_new + qd->qd_change_ic;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ do_sync = TRUE;
++ if (!qd->qd_qb.qb_limit)
++ do_sync = FALSE;
++ else if (qd->qd_qb.qb_value >= (int64_t)qd->qd_qb.qb_limit)
++ do_sync = FALSE;
++ else {
++ int64_t v;
++ v = value * gfs_num_journals(sdp) * sdp->sd_tune.gt_quota_scale_num;
++ do_div(v, sdp->sd_tune.gt_quota_scale_den);
++ v += qd->qd_qb.qb_value;
++ if (v < (int64_t)qd->qd_qb.qb_limit)
++ do_sync = FALSE;
++ }
++
++ gfs_glock_dq_uninit(&al->al_qd_ghs[x]);
++
++ if (do_sync) {
++ gfs_log_flush(sdp);
++ if (quota_trylock(sdp, qd))
++ qda[count++] = qd;
++ }
++ }
++
++ if (count) {
++ do_quota_sync(sdp, qda, count);
++
++ for (x = 0; x < count; x++)
++ quota_unlock(sdp, qda[x]);
++ }
++
++ out:
++ gfs_quota_unhold_m(ip);
++}
++
++/**
++ * print_quota_message - print a message to the user's tty about quotas
++ * @sdp: the filesystem
++ * @qd: the quota ID that the message is about
++ * @type: the type of message ("exceeded" or "warning")
++ *
++ */
++
++static void
++print_quota_message(struct gfs_sbd *sdp, struct gfs_quota_data *qd, char *type)
++{
++ char *line = gmalloc(256);
++ int len;
++ struct tty_struct *tty;
++
++ len = snprintf(line, 256, "GFS: fsid=%s: quota %s for %s %u\r\n",
++ sdp->sd_fsname, type,
++ (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
++ qd->qd_id);
++
++ if (current->signal) {
++ tty = current->signal->tty;
++ if (tty && tty->driver->write)
++ tty->driver->write(tty, 0, line, len);
++ }
++
++ kfree(line);
++}
++
++/**
++ * gfs_quota_check - Check to see if a block allocation is possible
++ * @ip: the inode whose ip->i_alloc->al_qd array holds the quota locks
++ * @uid: the UID the block is allocated for
++ * @gid: the GID the block is allocated for
++ *
++ * Returns: 0 if the allocation is permitted, -EDQUOT if a quota is exceeded
++ */
++
++int
++gfs_quota_check(struct gfs_inode *ip, uint32_t uid, uint32_t gid)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ struct gfs_quota_data *qd;
++ int64_t value;
++ unsigned int x;
++ int error = 0;
++
++ if (!al)
++ return 0;
++
++ for (x = 0; x < al->al_qd_num; x++) {
++ qd = al->al_qd[x];
++
++ if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
++ (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
++ continue;
++
++ spin_lock(&sdp->sd_quota_lock);
++ value = qd->qd_change_new + qd->qd_change_ic;
++ spin_unlock(&sdp->sd_quota_lock);
++ value += qd->qd_qb.qb_value;
++
++ if (qd->qd_qb.qb_limit && (int64_t)qd->qd_qb.qb_limit < value) {
++ print_quota_message(sdp, qd, "exceeded");
++ error = -EDQUOT;
++ break;
++ } else if (qd->qd_qb.qb_warn &&
++ (int64_t)qd->qd_qb.qb_warn < value &&
++ time_after_eq(jiffies,
++ qd->qd_last_warn +
++ sdp->sd_tune.gt_quota_warn_period * HZ)) {
++ print_quota_message(sdp, qd, "warning");
++ qd->qd_last_warn = jiffies;
++ }
++ }
++
++ return error;
++}
++
++/**
++ * gfs_quota_sync - Sync quota changes to the quota file
++ * @sdp: the filesystem
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_quota_sync(struct gfs_sbd *sdp)
++{
++ struct gfs_quota_data **qda;
++ unsigned int max_qd = sdp->sd_tune.gt_quota_simul_sync;
++ unsigned int num_qd;
++ unsigned int x;
++ int error = 0;
++
++ sdp->sd_quota_sync_gen++;
++
++ qda = gmalloc(max_qd * sizeof(struct gfs_quota_data *));
++
++ memset(qda, 0, max_qd * sizeof(struct gfs_quota_data *));
++
++ do {
++ num_qd = 0;
++
++ for (;;) {
++ qda[num_qd] = quota_find(sdp);
++ if (!qda[num_qd])
++ break;
++
++ if (++num_qd == max_qd)
++ break;
++ }
++
++ if (num_qd) {
++ error = do_quota_sync(sdp, qda, num_qd);
++ if (!error)
++ for (x = 0; x < num_qd; x++)
++ qda[x]->qd_sync_gen =
++ sdp->sd_quota_sync_gen;
++
++ for (x = 0; x < num_qd; x++)
++ quota_unlock(sdp, qda[x]);
++ }
++ } while (!error && num_qd == max_qd);
++
++ kfree(qda);
++
++ return error;
++}
++
++/**
++ * gfs_quota_refresh - Refresh the LVB for a given quota ID
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_quota_name in user space
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_quota_refresh(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_quota_name qn;
++ struct gfs_quota_data *qd;
++ struct gfs_holder q_gh;
++ int error;
++
++ if (copy_from_user(&qn, arg, sizeof(struct gfs_quota_name)))
++ return -EFAULT;
++
++ error = gfs_quota_get(sdp, qn.qn_user, qn.qn_id, CREATE, &qd);
++ if (error)
++ return error;
++
++ error = glock_q(sdp, qd, TRUE, &q_gh);
++ if (!error)
++ gfs_glock_dq_uninit(&q_gh);
++
++ gfs_quota_put(sdp, qd);
++
++ return error;
++}
++
++/**
++ * gfs_quota_read - Read the info for a given quota ID
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_quota_name in user space, followed by
++ *       space for the struct gfs_quota to be filled in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_quota_read(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_quota_name qn;
++ struct gfs_quota_data *qd;
++ struct gfs_holder q_gh;
++ struct gfs_quota q;
++ int error;
++
++ if (copy_from_user(&qn, arg, sizeof(struct gfs_quota_name)))
++ return -EFAULT;
++
++ if (((qn.qn_user) ?
++ (qn.qn_id != current->fsuid) :
++ (!in_group_p(qn.qn_id))) &&
++ !capable(CAP_SYS_ADMIN))
++ return -EACCES;
++
++ error = gfs_quota_get(sdp, qn.qn_user, qn.qn_id, CREATE, &qd);
++ if (error)
++ return error;
++
++ error = glock_q(sdp, qd, FALSE, &q_gh);
++ if (error)
++ goto out;
++
++ memset(&q, 0, sizeof(struct gfs_quota));
++ q.qu_limit = qd->qd_qb.qb_limit;
++ q.qu_warn = qd->qd_qb.qb_warn;
++ q.qu_value = qd->qd_qb.qb_value;
++
++ spin_lock(&sdp->sd_quota_lock);
++ q.qu_value += qd->qd_change_new + qd->qd_change_ic;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ gfs_glock_dq_uninit(&q_gh);
++
++ out:
++ gfs_quota_put(sdp, qd);
++
++ if (!error &&
++ copy_to_user((char *)arg + sizeof(struct gfs_quota_name),
++ &q, sizeof(struct gfs_quota)))
++ error = -EFAULT;
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/quota.h linux-patched/fs/gfs/quota.h
+--- linux-orig/fs/gfs/quota.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/quota.h 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,40 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __QUOTA_DOT_H__
++#define __QUOTA_DOT_H__
++
++#define NO_QUOTA_CHANGE ((uint32_t)-1)
++
++int gfs_quota_get(struct gfs_sbd *sdp, int user, uint32_t id, int create,
++ struct gfs_quota_data **qdp);
++void gfs_quota_hold(struct gfs_sbd *sdp, struct gfs_quota_data *qd);
++void gfs_quota_put(struct gfs_sbd *sdp, struct gfs_quota_data *qd);
++
++int gfs_quota_merge(struct gfs_sbd *sdp, struct gfs_quota_tag *tag);
++void gfs_quota_scan(struct gfs_sbd *sdp);
++void gfs_quota_cleanup(struct gfs_sbd *sdp);
++
++int gfs_quota_hold_m(struct gfs_inode *ip, uint32_t uid, uint32_t gid);
++void gfs_quota_unhold_m(struct gfs_inode *ip);
++
++int gfs_quota_lock_m(struct gfs_inode *ip, uint32_t uid, uint32_t gid);
++void gfs_quota_unlock_m(struct gfs_inode *ip);
++
++int gfs_quota_check(struct gfs_inode *ip, uint32_t uid, uint32_t gid);
++
++int gfs_quota_sync(struct gfs_sbd *sdp);
++int gfs_quota_refresh(struct gfs_sbd *sdp, void *arg);
++int gfs_quota_read(struct gfs_sbd *sdp, void *arg);
++
++#endif /* __QUOTA_DOT_H__ */
+diff -urN linux-orig/fs/gfs/recovery.c linux-patched/fs/gfs/recovery.c
+--- linux-orig/fs/gfs/recovery.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/recovery.c 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,749 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "glops.h"
++#include "lops.h"
++#include "recovery.h"
++
++#define bn2seg(bn) (((uint32_t)((bn) - jdesc->ji_addr)) / sdp->sd_sb.sb_seg_size)
++#define seg2bn(seg) ((seg) * sdp->sd_sb.sb_seg_size + jdesc->ji_addr)
++
++struct dirty_j {
++ struct list_head dj_list;
++ unsigned int dj_jid;
++ struct gfs_jindex dj_desc;
++};
++
++/**
++ * gfs_add_dirty_j - add a jid to the list of dirty journals
++ * @sdp: the filesystem
++ * @jid: the journal ID number
++ *
++ */
++
++void
++gfs_add_dirty_j(struct gfs_sbd *sdp, unsigned int jid)
++{
++ struct dirty_j *dj;
++
++ dj = gmalloc(sizeof(struct dirty_j));
++ memset(dj, 0, sizeof(struct dirty_j));
++
++ dj->dj_jid = jid;
++
++ spin_lock(&sdp->sd_dirty_j_lock);
++ list_add(&dj->dj_list, &sdp->sd_dirty_j);
++ spin_unlock(&sdp->sd_dirty_j_lock);
++}
++
++/**
++ * get_dirty_j - return a dirty journal from the list
++ * @sdp: the filesystem
++ *
++ * Returns: a struct dirty_j or NULL
++ */
++
++static struct dirty_j *
++get_dirty_j(struct gfs_sbd *sdp)
++{
++ struct dirty_j *dj = NULL;
++
++ spin_lock(&sdp->sd_dirty_j_lock);
++ if (!list_empty(&sdp->sd_dirty_j)) {
++ dj = list_entry(sdp->sd_dirty_j.prev, struct dirty_j, dj_list);
++ list_del(&dj->dj_list);
++ }
++ spin_unlock(&sdp->sd_dirty_j_lock);
++
++ return dj;
++}
++
++/**
++ * gfs_clear_dirty_j - destroy the list of dirty journals
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_clear_dirty_j(struct gfs_sbd *sdp)
++{
++ struct dirty_j *dj;
++ for (;;) {
++ dj = get_dirty_j(sdp);
++ if (!dj)
++ break;
++ kfree(dj);
++ }
++}
++
++/**
++ * get_log_header - read the log header for a given segment
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @seg: the segment to look at
++ * @lh: the log header to return
++ *
++ * Read the log header for a given segment in a given journal.  Do a few
++ * sanity checks on it.
++ *
++ * Returns: 0 on success, 1 if the header was invalid or incomplete, -EXXX on error
++ */
++
++static int
++get_log_header(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint32_t seg, struct gfs_log_header *lh)
++{
++ struct buffer_head *bh;
++ struct gfs_log_header lh2;
++ int error;
++
++ error = gfs_dread(sdp, seg2bn(seg), gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++
++ gfs_log_header_in(lh, bh->b_data);
++ gfs_log_header_in(&lh2,
++ bh->b_data + GFS_BASIC_BLOCK -
++ sizeof(struct gfs_log_header));
++
++ brelse(bh);
++
++ if (memcmp(lh, &lh2, sizeof(struct gfs_log_header)) != 0 ||
++ lh->lh_header.mh_magic != GFS_MAGIC ||
++ lh->lh_header.mh_type != GFS_METATYPE_LH)
++ error = 1;
++
++ return error;
++}
++
++/**
++ * find_good_lh - find a good log header
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @seg: the segment to start searching from (also filled in with a new value)
++ * @lh: the log header to fill in
++ * @forward: if true search forward in the log, else search backward
++ *
++ * Call get_log_header() to get a log header for a segment, but if the
++ * segment is bad, either scan forward or backward until we find a good one.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++find_good_lh(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint32_t *seg, struct gfs_log_header *lh,
++ int forward)
++{
++ int error;
++ uint32_t orig_seg = *seg;
++
++ for (;;) {
++ error = get_log_header(sdp, jdesc, gl, *seg, lh);
++ if (error <= 0)
++ return error;
++
++ if (forward) {
++ if (++*seg == jdesc->ji_nsegment)
++ *seg = 0;
++ } else {
++ if (*seg-- == 0)
++ *seg = jdesc->ji_nsegment - 1;
++ }
++
++ GFS_ASSERT_SBD(*seg != orig_seg, sdp,);
++ }
++}
++
++/**
++ * verify_jhead - make sure we've found the head of the log
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @head: this is filled in with the log descriptor of the head
++ *
++ * At this point, seg and lh should be either the head of the log or just
++ * before. Scan forward until we find the head.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++verify_jhead(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, struct gfs_log_header *head)
++{
++ struct gfs_log_header lh;
++ uint32_t seg;
++ int error;
++
++ seg = bn2seg(head->lh_first);
++
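++ /* Scan forward from the candidate head.  Any valid header with a
++    higher sequence number becomes the new head; the first header
++    with a lower sequence number means we've passed the true head. */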
++ for (;;) {
++ if (++seg == jdesc->ji_nsegment)
++ seg = 0;
++
++ error = get_log_header(sdp, jdesc, gl, seg, &lh);
++ if (error < 0)
++ return error;
++
++ if (error == 1)
++ continue;
++ if (lh.lh_sequence == head->lh_sequence)
++ continue;
++
++ if (lh.lh_sequence < head->lh_sequence)
++ break;
++
++ memcpy(head, &lh, sizeof(struct gfs_log_header));
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_find_jhead - find the head of a log
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @head: the log descriptor for the head of the log is returned here
++ *
++ * Do a binary search of a journal and find the valid log entry with the
++ * highest sequence number. (i.e. the log head)
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_find_jhead(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, struct gfs_log_header *head)
++{
++ struct gfs_log_header lh1, lh_m;
++ uint32_t seg1, seg2, seg_m;
++ int error;
++
++ seg1 = 0;
++ seg2 = jdesc->ji_nsegment - 1;
++
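++ /* Binary search: sequence numbers increase from the log's tail to
++    its head and then drop at the wrap point.  Keep seg1 at or before
++    the head and seg2 after it until they converge. */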
++ for (;;) {
++ seg_m = (seg1 + seg2) / 2;
++
++ error = find_good_lh(sdp, jdesc, gl, &seg1, &lh1, TRUE);
++ if (error)
++ break;
++
++ if (seg1 == seg_m) {
++ error = verify_jhead(sdp, jdesc, gl, &lh1);
++ memcpy(head, &lh1, sizeof(struct gfs_log_header));
++ break;
++ }
++
++ error = find_good_lh(sdp, jdesc, gl, &seg_m, &lh_m, FALSE);
++ if (error)
++ break;
++
++ if (lh1.lh_sequence <= lh_m.lh_sequence)
++ seg1 = seg_m;
++ else
++ seg2 = seg_m;
++ }
++
++ return error;
++}
++
++/**
++ * gfs_increment_blkno - move to the next block in a journal
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @addr: the block number to increment
++ * @skip_headers: if this is TRUE, skip log headers
++ *
++ * Replace @addr with the location of the next block in the log.
++ * Take care of journal wrap and skip of log header if necessary.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_increment_blkno(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t *addr, int skip_headers)
++{
++ struct gfs_log_header header;
++ int error;
++
++ (*addr)++;
++
++ /* Handle journal wrap */
++
++ if (*addr == seg2bn(jdesc->ji_nsegment))
++ *addr -= jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size;
++
++ gfs_start_ra(gl, *addr,
++ jdesc->ji_addr +
++ jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size - *addr);
++
++ /* Handle landing on a header block */
++
++ if (skip_headers && !do_mod(*addr, sdp->sd_sb.sb_seg_size)) {
++ error = get_log_header(sdp, jdesc, gl, bn2seg(*addr), &header);
++ if (error < 0)
++ return error;
++
++ GFS_ASSERT_SBD(!error, sdp,); /* Corrupt headers here are bad */
++ GFS_ASSERT_SBD(header.lh_first != *addr, sdp,
++ gfs_log_header_print(&header);
++ printk("*addr = %"PRIu64"\n", *addr););
++
++ (*addr)++;
++ /* Can't wrap here */
++ }
++
++ return 0;
++}
++
++/**
++ * foreach_descriptor - go through the active part of the log
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @start: the first log header in the active region
++ * @end: the last log header (don't process the contents of this entry)
++ * @pass: the recovery pass
++ *
++ * Call a given function once for every log descriptor in the active
++ * portion of the log.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++foreach_descriptor(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t start, uint64_t end,
++ unsigned int pass)
++{
++ struct gfs_log_header header;
++ struct gfs_log_descriptor desc;
++ struct buffer_head *bh;
++ int error = 0;
++
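++ /* Each pass through this loop consumes one log header and the
++    chain of descriptors that follows it; the LAST descriptor marks
++    the end of that transaction and leaves us at the next header. */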
++ while (start != end) {
++ GFS_ASSERT_SBD(!do_mod(start, sdp->sd_sb.sb_seg_size), sdp,);
++
++ error = get_log_header(sdp, jdesc, gl, bn2seg(start), &header);
++ if (error < 0)
++ return error;
++
++ GFS_ASSERT_SBD(!error, sdp,); /* Corrupt headers are bad */
++ GFS_ASSERT_SBD(header.lh_first == start, sdp,
++ gfs_log_header_print(&header);
++ printk("start = %"PRIu64"\n", start););
++
++ start++;
++
++ for (;;) {
++ error = gfs_dread(sdp, start, gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_LD);
++ gfs_desc_in(&desc, bh->b_data);
++
++ brelse(bh);
++
++ if (desc.ld_type != GFS_LOG_DESC_LAST) {
++ error = LO_SCAN_ELEMENTS(sdp, jdesc, gl, start,
++ &desc, pass);
++ if (error)
++ return error;
++
++ while (desc.ld_length--) {
++ error = gfs_increment_blkno(sdp, jdesc, gl,
++ &start, TRUE);
++ if (error)
++ return error;
++ }
++ } else {
++ while (desc.ld_length--) {
++ error = gfs_increment_blkno(sdp, jdesc, gl,
++ &start,
++ !!desc.ld_length);
++ if (error)
++ return error;
++ }
++
++ break;
++ }
++ }
++ }
++
++ return error;
++}
++
++/**
++ * clean_journal - mark a dirty journal as being clean
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @head: the current head of the log to start from
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++clean_journal(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, struct gfs_log_header *head)
++{
++ struct gfs_log_header lh;
++ struct gfs_log_descriptor desc;
++ struct buffer_head *bh;
++ uint32_t seg;
++ uint64_t blkno;
++ int error;
++
++ seg = bn2seg(head->lh_first);
++
++ for (;;) {
++ if (++seg == jdesc->ji_nsegment)
++ seg = 0;
++
++ error = get_log_header(sdp, jdesc, gl, seg, &lh);
++ if (error < 0)
++ return error;
++
++ /* Rewrite corrupt header blocks */
++
++ if (error == 1) {
++ bh = gfs_dgetblk(sdp, seg2bn(seg), gl);
++
++ gfs_prep_new_buffer(bh);
++ gfs_buffer_clear(bh);
++ gfs_log_header_out(head, bh->b_data);
++ gfs_log_header_out(head,
++ bh->b_data + GFS_BASIC_BLOCK -
++ sizeof(struct gfs_log_header));
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY | DIO_START | DIO_WAIT);
++ brelse(bh);
++ if (error)
++ return error;
++ }
++
++ /* Stop when we get to the end of the log. */
++
++ if (lh.lh_sequence < head->lh_sequence)
++ break;
++ }
++
++ /* Build a "last" descriptor for the transaction we are
++ about to commit by writing the shutdown header. */
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = GFS_LOG_DESC_LAST;
++ desc.ld_length = 0;
++
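++ /* Count the non-header blocks between the old head and the new
++    one; that count becomes the length of the "last" descriptor. */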
++ for (blkno = head->lh_first + 1; blkno != seg2bn(seg);) {
++ if (do_mod(blkno, sdp->sd_sb.sb_seg_size))
++ desc.ld_length++;
++ if (++blkno == seg2bn(jdesc->ji_nsegment))
++ blkno -= jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size;
++ }
++
++ /* Write the descriptor */
++
++ bh = gfs_dgetblk(sdp, head->lh_first + 1, gl);
++
++ gfs_prep_new_buffer(bh);
++ gfs_buffer_clear(bh);
++ gfs_desc_out(&desc, bh->b_data);
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY | DIO_START | DIO_WAIT);
++ brelse(bh);
++ if (error)
++ return error;
++
++ /* Build a log header that says the journal is clean */
++
++ memset(&lh, 0, sizeof(struct gfs_log_header));
++ lh.lh_header.mh_magic = GFS_MAGIC;
++ lh.lh_header.mh_type = GFS_METATYPE_LH;
++ lh.lh_header.mh_format = GFS_FORMAT_LH;
++ lh.lh_flags = GFS_LOG_HEAD_UNMOUNT;
++ lh.lh_first = seg2bn(seg);
++ lh.lh_sequence = head->lh_sequence + 1;
++ /* Don't care about tail */
++ lh.lh_last_dump = head->lh_last_dump;
++
++ /* Write the header */
++
++ bh = gfs_dgetblk(sdp, lh.lh_first, gl);
++
++ gfs_prep_new_buffer(bh);
++ gfs_buffer_clear(bh);
++ gfs_log_header_out(&lh, bh->b_data);
++ gfs_log_header_out(&lh,
++ bh->b_data + GFS_BASIC_BLOCK -
++ sizeof(struct gfs_log_header));
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY | DIO_START | DIO_WAIT);
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * gfs_recover_journal - recover a given journal
++ * @sdp: the filesystem
++ * @jid: the number of the journal to recover
++ * @jdesc: the struct gfs_jindex describing the journal
++ * @wait: Don't return until the journal is clean (or an error is encountered)
++ *
++ * Acquire the journal's lock, check whether the journal is clean, and
++ * do recovery if necessary.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_recover_journal(struct gfs_sbd *sdp,
++ unsigned int jid, struct gfs_jindex *jdesc,
++ int wait)
++{
++ struct gfs_log_header head;
++ struct gfs_holder j_gh, t_gh;
++ unsigned long t;
++ int error;
++
++ printk("GFS: fsid=%s: jid=%u: Trying to acquire journal lock...\n",
++ sdp->sd_fsname, jid);
++
++ /* Acquire the journal lock so we can do recovery */
++
++ error = gfs_glock_nq_num(sdp,
++ jdesc->ji_addr, &gfs_meta_glops,
++ LM_ST_EXCLUSIVE,
++ LM_FLAG_NOEXP |
++ ((wait) ? 0 : LM_FLAG_TRY) |
++ GL_NOCACHE, &j_gh);
++ switch (error) {
++ case 0:
++ break;
++
++ case GLR_TRYFAILED:
++ GFS_ASSERT_SBD(!wait, sdp,);
++ printk("GFS: fsid=%s: jid=%u: Busy\n", sdp->sd_fsname, jid);
++ error = 0;
++
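++ /* fall through */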
++ default:
++ goto fail;
++ }
++
++ printk("GFS: fsid=%s: jid=%u: Looking at journal...\n",
++ sdp->sd_fsname, jid);
++
++ error = gfs_find_jhead(sdp, jdesc, j_gh.gh_gl, &head);
++ if (error)
++ goto fail_gunlock;
++
++ if (!(head.lh_flags & GFS_LOG_HEAD_UNMOUNT)) {
++ if (test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ printk("GFS: fsid=%s: jid=%u: Can't replay: read-only FS\n",
++ sdp->sd_fsname, jid);
++ error = -EROFS;
++ goto fail_gunlock;
++ }
++
++ printk("GFS: fsid=%s: jid=%u: Acquiring the transaction lock...\n",
++ sdp->sd_fsname, jid);
++
++ t = jiffies;
++
++ /* Acquire an exclusive hold on the transaction lock */
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl,
++ LM_ST_EXCLUSIVE,
++ LM_FLAG_NOEXP |
++ LM_FLAG_PRIORITY |
++ GL_NOCACHE,
++ &t_gh);
++ if (error)
++ goto fail_gunlock;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ printk("GFS: fsid=%s: jid=%u: Can't replay: read-only FS\n",
++ sdp->sd_fsname, jid);
++ error = -EROFS;
++ goto fail_gunlock_tr;
++ }
++
++ printk("GFS: fsid=%s: jid=%u: Replaying journal...\n",
++ sdp->sd_fsname, jid);
++
++ set_bit(GLF_DIRTY, &j_gh.gh_gl->gl_flags);
++
++ LO_BEFORE_SCAN(sdp, jid, &head, GFS_RECPASS_A1);
++
++ error = foreach_descriptor(sdp, jdesc, j_gh.gh_gl,
++ head.lh_tail, head.lh_first,
++ GFS_RECPASS_A1);
++ if (error)
++ goto fail_gunlock_tr;
++
++ LO_AFTER_SCAN(sdp, jid, GFS_RECPASS_A1);
++
++ gfs_replay_wait(sdp);
++
++ error = clean_journal(sdp, jdesc, j_gh.gh_gl, &head);
++ if (error)
++ goto fail_gunlock_tr;
++
++ gfs_glock_dq_uninit(&t_gh);
++
++ t = DIV_RU(jiffies - t, HZ);
++
++ printk("GFS: fsid=%s: jid=%u: Journal replayed in %lus\n",
++ sdp->sd_fsname, jid, t);
++ }
++
++ sdp->sd_lockstruct.ls_ops->lm_recovery_done(sdp->sd_lockstruct.ls_lockspace,
++ jid,
++ LM_RD_SUCCESS);
++
++ gfs_glock_dq_uninit(&j_gh);
++
++ printk("GFS: fsid=%s: jid=%u: Done\n", sdp->sd_fsname, jid);
++
++ return 0;
++
++ fail_gunlock_tr:
++ gfs_replay_wait(sdp);
++ gfs_glock_dq_uninit(&t_gh);
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&j_gh);
++
++ printk("GFS: fsid=%s: jid=%u: %s\n",
++ sdp->sd_fsname, jid, (error) ? "Failed" : "Done");
++
++ fail:
++ sdp->sd_lockstruct.ls_ops->lm_recovery_done(sdp->sd_lockstruct.ls_lockspace,
++ jid,
++ LM_RD_GAVEUP);
++
++ return error;
++}
++
++/**
++ * gfs_check_journals - Recover any dirty journals
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_check_journals(struct gfs_sbd *sdp)
++{
++ struct dirty_j *dj;
++
++ for (;;) {
++ dj = get_dirty_j(sdp);
++ if (!dj)
++ break;
++
++ down(&sdp->sd_jindex_lock);
++
++ if (dj->dj_jid != sdp->sd_lockstruct.ls_jid &&
++ dj->dj_jid < sdp->sd_journals) {
++ memcpy(&dj->dj_desc,
++ sdp->sd_jindex + dj->dj_jid,
++ sizeof(struct gfs_jindex));
++ up(&sdp->sd_jindex_lock);
++
++ gfs_recover_journal(sdp,
++ dj->dj_jid, &dj->dj_desc,
++ FALSE);
++
++ } else {
++ up(&sdp->sd_jindex_lock);
++ sdp->sd_lockstruct.ls_ops->lm_recovery_done(sdp->sd_lockstruct.ls_lockspace,
++ dj->dj_jid, LM_RD_GAVEUP);
++ }
++
++ kfree(dj);
++ }
++}
++
++/**
++ * gfs_recover_dump - recover the log elements in this machine's journal
++ * @sdp: the filesystem
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_recover_dump(struct gfs_sbd *sdp)
++{
++ struct gfs_log_header head;
++ int error;
++
++ error = gfs_find_jhead(sdp, &sdp->sd_jdesc, sdp->sd_journal_gh.gh_gl,
++ &head);
++ if (error)
++ goto fail;
++
++ GFS_ASSERT_SBD(head.lh_flags & GFS_LOG_HEAD_UNMOUNT, sdp,);
++ if (!head.lh_last_dump)
++ return error;
++
++ printk("GFS: fsid=%s: Scanning for log elements...\n",
++ sdp->sd_fsname);
++
++ LO_BEFORE_SCAN(sdp, sdp->sd_lockstruct.ls_jid, &head, GFS_RECPASS_B1);
++
++ error = foreach_descriptor(sdp, &sdp->sd_jdesc, sdp->sd_journal_gh.gh_gl,
++ head.lh_last_dump, head.lh_first,
++ GFS_RECPASS_B1);
++ if (error)
++ goto fail;
++
++ LO_AFTER_SCAN(sdp, sdp->sd_lockstruct.ls_jid, GFS_RECPASS_B1);
++
++ /* If we crash during the next log dump, all the intermediate headers
++ in that transaction must point to the last log dump before the
++ one we're making, so that we don't lose it. */
++
++ sdp->sd_log_dump_last = head.lh_last_dump;
++
++ printk("GFS: fsid=%s: Done\n", sdp->sd_fsname);
++
++ return 0;
++
++ fail:
++ printk("GFS: fsid=%s: Failed\n", sdp->sd_fsname);
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/recovery.h linux-patched/fs/gfs/recovery.h
+--- linux-orig/fs/gfs/recovery.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/recovery.h 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,36 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RECOVERY_DOT_H__
++#define __RECOVERY_DOT_H__
++
++#define GFS_RECPASS_A1 (12)
++#define GFS_RECPASS_B1 (14)
++
++void gfs_add_dirty_j(struct gfs_sbd *sdp, unsigned int jid);
++void gfs_clear_dirty_j(struct gfs_sbd *sdp);
++
++int gfs_find_jhead(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, struct gfs_log_header *head);
++int gfs_increment_blkno(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t *addr,
++ int skip_headers);
++
++int gfs_recover_journal(struct gfs_sbd *sdp,
++ unsigned int jid, struct gfs_jindex *jdesc,
++ int wait);
++void gfs_check_journals(struct gfs_sbd *sdp);
++
++int gfs_recover_dump(struct gfs_sbd *sdp);
++
++#endif /* __RECOVERY_DOT_H__ */
+diff -urN linux-orig/fs/gfs/rgrp.c linux-patched/fs/gfs/rgrp.c
+--- linux-orig/fs/gfs/rgrp.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/rgrp.c 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,1932 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "bits.h"
++#include "dio.h"
++#include "file.h"
++#include "glock.h"
++#include "glops.h"
++#include "rgrp.h"
++#include "super.h"
++#include "trans.h"
++
++/**
++ * mhc_hash: find the mhc hash bucket for a buffer
++ * @bh: the buffer
++ *
++ * Returns: The bucket number
++ */
++
++static unsigned int
++mhc_hash(struct buffer_head *bh)
++{
++ uint64_t blkno;
++ unsigned int h;
++
++ blkno = bh->b_blocknr;
++ h = gfs_hash(&blkno, sizeof(uint64_t)) & GFS_MHC_HASH_MASK;
++
++ return h;
++}
++
++/**
++ * mhc_trim - throw away cached metadata headers until at most @max remain
++ * @sdp: the filesystem
++ * @max: the maximum number of cached headers to keep
++ *
++ */
++
++static void
++mhc_trim(struct gfs_sbd *sdp, unsigned int max)
++{
++ struct gfs_meta_header_cache *mc;
++
++ for (;;) {
++ spin_lock(&sdp->sd_mhc_lock);
++ if (list_empty(&sdp->sd_mhc_single)) {
++ spin_unlock(&sdp->sd_mhc_lock);
++ return;
++ } else {
++ mc = list_entry(sdp->sd_mhc_single.prev,
++ struct gfs_meta_header_cache,
++ mc_list_single);
++ list_del(&mc->mc_list_hash);
++ list_del(&mc->mc_list_single);
++ list_del(&mc->mc_list_rgd);
++ spin_unlock(&sdp->sd_mhc_lock);
++
++ kmem_cache_free(gfs_mhc_cachep, mc);
++ atomic_dec(&sdp->sd_mhc_count);
++
++ if (atomic_read(&sdp->sd_mhc_count) <= max)
++ return;
++ }
++ }
++}
++
++/**
++ * gfs_mhc_add - add buffers to the cache of metadata
++ * @rgd: a RG
++ * @bh: an array of buffers
++ * @num: the number of buffers in the array
++ *
++ */
++
++void
++gfs_mhc_add(struct gfs_rgrpd *rgd,
++ struct buffer_head **bh, unsigned int num)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_meta_header_cache *mc;
++ unsigned int x;
++ uint64_t gen;
++ struct list_head *head;
++
++ for (x = 0; x < num; x++) {
++ gfs_meta_check(sdp, bh[x]);
++
++ RETRY_MALLOC(mc = kmem_cache_alloc(gfs_mhc_cachep, GFP_KERNEL), mc);
++ memset(mc, 0, sizeof(struct gfs_meta_header_cache));
++
++ mc->mc_block = bh[x]->b_blocknr;
++ memcpy(&mc->mc_mh, bh[x]->b_data,
++ sizeof(struct gfs_meta_header));
++
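++ /* Cache the header with its generation bumped by two, so that a
++    block rebuilt from this cache (see gfs_mhc_fish) carries a newer
++    generation number than the copy on disk. */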
++ gen = gfs64_to_cpu(mc->mc_mh.mh_generation) + 2;
++ mc->mc_mh.mh_generation = cpu_to_gfs64(gen);
++
++ head = &sdp->sd_mhc[mhc_hash(bh[x])];
++
++ spin_lock(&sdp->sd_mhc_lock);
++ list_add(&mc->mc_list_hash, head);
++ list_add(&mc->mc_list_single, &sdp->sd_mhc_single);
++ list_add(&mc->mc_list_rgd, &rgd->rd_mhc);
++ spin_unlock(&sdp->sd_mhc_lock);
++
++ atomic_inc(&sdp->sd_mhc_count);
++ }
++
++ if (atomic_read(&sdp->sd_mhc_count) > sdp->sd_tune.gt_max_mhc)
++ mhc_trim(sdp, sdp->sd_tune.gt_max_mhc);
++}
++
++/**
++ * gfs_mhc_fish - Try to fill in a buffer with data from the cache
++ * @sdp: the filesystem
++ * @bh: the buffer to fill in
++ *
++ * Returns: TRUE if the buffer was cached, FALSE otherwise
++ */
++
++int
++gfs_mhc_fish(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ struct list_head *tmp, *head;
++ struct gfs_meta_header_cache *mc;
++
++ head = &sdp->sd_mhc[mhc_hash(bh)];
++
++ spin_lock(&sdp->sd_mhc_lock);
++
++ for (tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ mc = list_entry(tmp, struct gfs_meta_header_cache, mc_list_hash);
++ if (mc->mc_block != bh->b_blocknr)
++ continue;
++
++ list_del(&mc->mc_list_hash);
++ list_del(&mc->mc_list_single);
++ list_del(&mc->mc_list_rgd);
++ spin_unlock(&sdp->sd_mhc_lock);
++
++ gfs_prep_new_buffer(bh);
++ memcpy(bh->b_data, &mc->mc_mh,
++ sizeof(struct gfs_meta_header));
++
++ kmem_cache_free(gfs_mhc_cachep, mc);
++ atomic_dec(&sdp->sd_mhc_count);
++
++ return TRUE;
++ }
++
++ spin_unlock(&sdp->sd_mhc_lock);
++
++ return FALSE;
++}
++
++/**
++ * gfs_mhc_zap - Get rid of the data in the cache of metadata headers
++ * @rgd: a RG
++ *
++ */
++
++void
++gfs_mhc_zap(struct gfs_rgrpd *rgd)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_meta_header_cache *mc;
++
++ spin_lock(&sdp->sd_mhc_lock);
++
++ while (!list_empty(&rgd->rd_mhc)) {
++ mc = list_entry(rgd->rd_mhc.next,
++ struct gfs_meta_header_cache,
++ mc_list_rgd);
++
++ list_del(&mc->mc_list_hash);
++ list_del(&mc->mc_list_single);
++ list_del(&mc->mc_list_rgd);
++ spin_unlock(&sdp->sd_mhc_lock);
++
++ kmem_cache_free(gfs_mhc_cachep, mc);
++ atomic_dec(&sdp->sd_mhc_count);
++
++ spin_lock(&sdp->sd_mhc_lock);
++ }
++
++ spin_unlock(&sdp->sd_mhc_lock);
++}
++
++/**
++ * depend_hash - Turn a formal inode number into a hash bucket number
++ * @formal_ino: the formal inode number
++ *
++ * Returns: The number of the corresponding hash bucket
++ */
++
++static unsigned int
++depend_hash(uint64_t formal_ino)
++{
++ unsigned int h;
++
++ h = gfs_hash(&formal_ino, sizeof(uint64_t));
++ h &= GFS_DEPEND_HASH_MASK;
++
++ return h;
++}
++
++/**
++ * depend_sync_one - sync the inode behind a dependency and discard it
++ * @sdp: the filesystem
++ * @gd: the dependency
++ *
++ */
++
++static void
++depend_sync_one(struct gfs_sbd *sdp, struct gfs_depend *gd)
++{
++ struct gfs_glock *gl;
++
++ spin_lock(&sdp->sd_depend_lock);
++ list_del(&gd->gd_list_hash);
++ spin_unlock(&sdp->sd_depend_lock);
++ list_del(&gd->gd_list_rgd);
++
++ gl = gfs_glock_find(sdp,
++ &(struct lm_lockname){gd->gd_formal_ino,
++ LM_TYPE_INODE});
++ if (gl) {
++ if (gl->gl_ops->go_sync)
++ gl->gl_ops->go_sync(gl,
++ DIO_METADATA |
++ DIO_INVISIBLE);
++ gfs_glock_put(gl);
++ }
++
++ kfree(gd);
++ atomic_dec(&sdp->sd_depend_count);
++}
++
++/**
++ * depend_sync_old - sync dependencies that have passed their timeout
++ * @rgd: the resource group
++ *
++ */
++
++static void
++depend_sync_old(struct gfs_rgrpd *rgd)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_depend *gd;
++
++ for (;;) {
++ gd = list_entry(rgd->rd_depend.prev,
++ struct gfs_depend,
++ gd_list_rgd);
++
++ if (time_before(jiffies,
++ gd->gd_time +
++ sdp->sd_tune.gt_depend_secs * HZ))
++ return;
++
++ depend_sync_one(sdp, gd);
++ }
++}
++
++/**
++ * gfs_depend_add - add a dependency on an inode to a resource group
++ * @rgd: the resource group
++ * @formal_ino: the formal inode number of the inode
++ *
++ */
++
++void
++gfs_depend_add(struct gfs_rgrpd *rgd, uint64_t formal_ino)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct list_head *head, *tmp;
++ struct gfs_depend *gd;
++
++ head = &sdp->sd_depend[depend_hash(formal_ino)];
++
++ spin_lock(&sdp->sd_depend_lock);
++
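++ /* If this inode already has a dependency on this RG, just move it
++    to the front of both lists and refresh its timestamp. */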
++ for (tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gd = list_entry(tmp, struct gfs_depend, gd_list_hash);
++ if (gd->gd_rgd == rgd &&
++ gd->gd_formal_ino == formal_ino) {
++ list_move(&gd->gd_list_hash, head);
++ spin_unlock(&sdp->sd_depend_lock);
++ list_move(&gd->gd_list_rgd, &rgd->rd_depend);
++ gd->gd_time = jiffies;
++ return;
++ }
++ }
++
++ spin_unlock(&sdp->sd_depend_lock);
++
++ gd = gmalloc(sizeof(struct gfs_depend));
++ memset(gd, 0, sizeof(struct gfs_depend));
++
++ gd->gd_rgd = rgd;
++ gd->gd_formal_ino = formal_ino;
++ gd->gd_time = jiffies;
++
++ spin_lock(&sdp->sd_depend_lock);
++ list_add(&gd->gd_list_hash, head);
++ spin_unlock(&sdp->sd_depend_lock);
++ list_add(&gd->gd_list_rgd, &rgd->rd_depend);
++
++ atomic_inc(&sdp->sd_depend_count);
++
++ depend_sync_old(rgd);
++}
++
++/**
++ * gfs_depend_sync - sync all dependencies for a resource group
++ * @rgd: the resource group
++ *
++ */
++
++void
++gfs_depend_sync(struct gfs_rgrpd *rgd)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_depend *gd;
++
++ while (!list_empty(&rgd->rd_depend)) {
++ gd = list_entry(rgd->rd_depend.next,
++ struct gfs_depend,
++ gd_list_rgd);
++ depend_sync_one(sdp, gd);
++ }
++}
++
++/**
++ * rgrp_verify - Verify that a resource group is consistent
++ * @rgd: the rgrp
++ *
++ * The caller should already be holding the rgrp's glock.
++ */
++
++static void
++rgrp_verify(struct gfs_rgrpd *rgd)
++{
++ struct gfs_bitmap *bits = NULL;
++ uint32_t length = rgd->rd_ri.ri_length;
++ uint32_t count[4], tmp;
++ int buf, x;
++
++ memset(count, 0, 4 * sizeof(uint32_t));
++
++ for (buf = 0; buf < length; buf++) {
++ bits = &rgd->rd_bits[buf];
++ for (x = 0; x < 4; x++)
++ count[x] += gfs_bitcount(rgd,
++ rgd->rd_bh[buf]->b_data +
++ bits->bi_offset,
++ bits->bi_len, x);
++ }
++
++ GFS_ASSERT_RGRPD(count[0] == rgd->rd_rg.rg_free, rgd,
++ printk("free data mismatch: %u != %u\n",
++ count[0], rgd->rd_rg.rg_free););
++
++ tmp = rgd->rd_ri.ri_data -
++ (rgd->rd_rg.rg_usedmeta + rgd->rd_rg.rg_freemeta) -
++ (rgd->rd_rg.rg_useddi + rgd->rd_rg.rg_freedi) -
++ rgd->rd_rg.rg_free;
++ GFS_ASSERT_RGRPD(count[1] == tmp, rgd,
++ printk("used data mismatch: %u != %u\n",
++ count[1], tmp););
++
++ GFS_ASSERT_RGRPD(count[2] == rgd->rd_rg.rg_freemeta, rgd,
++ printk("free metadata mismatch: %u != %u\n",
++ count[2], rgd->rd_rg.rg_freemeta););
++
++ tmp = rgd->rd_rg.rg_usedmeta +
++ (rgd->rd_rg.rg_useddi + rgd->rd_rg.rg_freedi);
++ GFS_ASSERT_RGRPD(count[3] == tmp, rgd,
++ printk("used metadata mismatch: %u != %u\n",
++ count[3], tmp););
++}
++
++/**
++ * gfs_blk2rgrpd - Find resource group for a given data block number
++ * @sdp: The GFS superblock
++ * @blk: The data block number
++ *
++ * Returns: The resource group, or NULL if not found
++ */
++
++struct gfs_rgrpd *
++gfs_blk2rgrpd(struct gfs_sbd *sdp, uint64_t blk)
++{
++ struct list_head *tmp, *head;
++ struct gfs_rgrpd *rgd = NULL;
++ struct gfs_rindex *ri;
++
++ spin_lock(&sdp->sd_rg_mru_lock);
++
++ for (head = &sdp->sd_rg_mru_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ rgd = list_entry(tmp, struct gfs_rgrpd, rd_list_mru);
++ ri = &rgd->rd_ri;
++
++ if (ri->ri_data1 <= blk && blk < ri->ri_data1 + ri->ri_data) {
++ list_move(&rgd->rd_list_mru, &sdp->sd_rg_mru_list);
++ spin_unlock(&sdp->sd_rg_mru_lock);
++ return rgd;
++ }
++ }
++
++ spin_unlock(&sdp->sd_rg_mru_lock);
++
++ return NULL;
++}
++
++/**
++ * gfs_rgrpd_get_first - get the first RG
++ * @sdp: The GFS superblock
++ *
++ * Returns: The first rgrp in the filesystem
++ */
++
++struct gfs_rgrpd *
++gfs_rgrpd_get_first(struct gfs_sbd *sdp)
++{
++ GFS_ASSERT_SBD(!list_empty(&sdp->sd_rglist), sdp,);
++ return list_entry(sdp->sd_rglist.next, struct gfs_rgrpd, rd_list);
++}
++
++/**
++ * gfs_rgrpd_get_next - get the next RG
++ * @rgd: A RG
++ *
++ * Returns: The next rgrp
++ */
++
++struct gfs_rgrpd *
++gfs_rgrpd_get_next(struct gfs_rgrpd *rgd)
++{
++ if (rgd->rd_list.next == &rgd->rd_sbd->sd_rglist)
++ return NULL;
++ return list_entry(rgd->rd_list.next, struct gfs_rgrpd, rd_list);
++}
++
++/**
++ * clear_rgrpdi - Clear up rgrps
++ * @sdp: The GFS superblock
++ *
++ */
++
++void
++clear_rgrpdi(struct gfs_sbd *sdp)
++{
++ struct gfs_rgrpd *rgd;
++ struct gfs_glock *gl;
++
++ sdp->sd_rg_forward = NULL;
++
++ while (!list_empty(&sdp->sd_rg_recent)) {
++ rgd = list_entry(sdp->sd_rg_recent.next,
++ struct gfs_rgrpd, rd_recent);
++ list_del(&rgd->rd_recent);
++ }
++
++ while (!list_empty(&sdp->sd_rglist)) {
++ rgd = list_entry(sdp->sd_rglist.next,
++ struct gfs_rgrpd, rd_list);
++ gl = rgd->rd_gl;
++
++ list_del(&rgd->rd_list);
++ list_del(&rgd->rd_list_mru);
++
++ if (gl) {
++ gfs_glock_force_drop(gl);
++ if (atomic_read(&gl->gl_lvb_count))
++ gfs_lvb_unhold(gl);
++ gl2rgd(gl) = NULL;
++ gfs_glock_put(gl);
++ }
++
++ if (rgd->rd_bits)
++ kfree(rgd->rd_bits);
++ if (rgd->rd_bh)
++ kfree(rgd->rd_bh);
++
++ kfree(rgd);
++ }
++}
++
++/**
++ * gfs_clear_rgrpd - Clear up rgrps
++ * @sdp: The GFS superblock
++ *
++ */
++
++void
++gfs_clear_rgrpd(struct gfs_sbd *sdp)
++{
++ down(&sdp->sd_rindex_lock);
++ clear_rgrpdi(sdp);
++ up(&sdp->sd_rindex_lock);
++}
++
++/**
++ * compute_bitstructs - Compute the bitmap sizes
++ * @rgd: The resource group descriptor
++ *
++ */
++
++static void
++compute_bitstructs(struct gfs_rgrpd *rgd)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_bitmap *bits;
++ uint32_t length = rgd->rd_ri.ri_length;
++ uint32_t bytes_left, bytes;
++ int x;
++
++ rgd->rd_bits = gmalloc(length * sizeof(struct gfs_bitmap));
++ memset(rgd->rd_bits, 0, length * sizeof(struct gfs_bitmap));
++
++ bytes_left = rgd->rd_ri.ri_bitbytes;
++
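++ /* Block 0 of the RG starts with a struct gfs_rgrp; the remaining
++    bitmap blocks start with a plain struct gfs_meta_header; the
++    final block holds whatever bitmap bytes are left over. */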
++ for (x = 0; x < length; x++) {
++ bits = &rgd->rd_bits[x];
++
++ if (length == 1) {
++ bytes = bytes_left;
++ bits->bi_offset = sizeof(struct gfs_rgrp);
++ bits->bi_start = 0;
++ bits->bi_len = bytes;
++ } else if (x == 0) {
++ bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs_rgrp);
++ bits->bi_offset = sizeof(struct gfs_rgrp);
++ bits->bi_start = 0;
++ bits->bi_len = bytes;
++ } else if (x + 1 == length) {
++ bytes = bytes_left;
++ bits->bi_offset = sizeof(struct gfs_meta_header);
++ bits->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
++ bits->bi_len = bytes;
++ } else {
++ bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs_meta_header);
++ bits->bi_offset = sizeof(struct gfs_meta_header);
++ bits->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
++ bits->bi_len = bytes;
++ }
++
++ bytes_left -= bytes;
++ }
++
++ GFS_ASSERT_RGRPD(!bytes_left, rgd,);
++ GFS_ASSERT_RGRPD((rgd->rd_bits[length - 1].bi_start +
++ rgd->rd_bits[length - 1].bi_len) * GFS_NBBY ==
++ rgd->rd_ri.ri_data, rgd,
++ printk("start=%u len=%u offset=%u\n",
++ rgd->rd_bits[length - 1].bi_start,
++ rgd->rd_bits[length - 1].bi_len,
++ rgd->rd_bits[length - 1].bi_offset);
++ gfs_rindex_print(&rgd->rd_ri););
++
++ rgd->rd_bh = gmalloc(length * sizeof(struct buffer_head *));
++ memset(rgd->rd_bh, 0, length * sizeof(struct buffer_head *));
++}
++
++/**
++ * gfs_ri_update - Pull in a new resource index from the disk
++ * @ip: the rindex inode
++ *
++ * Returns: 0 on successful update, error code otherwise
++ */
++
++static int
++gfs_ri_update(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_rgrpd *rgd;
++ char buf[sizeof(struct gfs_rindex)];
++ int error;
++
++ GFS_ASSERT_SBD(!do_mod(ip->i_di.di_size, sizeof(struct gfs_rindex)),
++ sdp,);
++
++ clear_rgrpdi(sdp);
++
++ for (sdp->sd_rgcount = 0;; sdp->sd_rgcount++) {
++ error = gfs_internal_read(ip, buf,
++ sdp->sd_rgcount *
++ sizeof(struct gfs_rindex),
++ sizeof(struct gfs_rindex));
++ if (!error)
++ break;
++ if (error != sizeof(struct gfs_rindex)) {
++ if (error > 0)
++ error = -EIO;
++ goto fail;
++ }
++
++ rgd = gmalloc(sizeof(struct gfs_rgrpd));
++ memset(rgd, 0, sizeof(struct gfs_rgrpd));
++
++ INIT_LIST_HEAD(&rgd->rd_mhc);
++ INIT_LIST_HEAD(&rgd->rd_depend);
++ rgd->rd_sbd = sdp;
++
++ list_add_tail(&rgd->rd_list, &sdp->sd_rglist);
++ list_add_tail(&rgd->rd_list_mru, &sdp->sd_rg_mru_list);
++
++ gfs_rindex_in(&rgd->rd_ri, buf);
++
++ compute_bitstructs(rgd);
++
++ error = gfs_glock_get(sdp, rgd->rd_ri.ri_addr, &gfs_rgrp_glops,
++ CREATE, &rgd->rd_gl);
++ if (error)
++ goto fail;
++
++ error = gfs_lvb_hold(rgd->rd_gl);
++ if (error)
++ goto fail;
++
++ gl2rgd(rgd->rd_gl) = rgd;
++ rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
++ }
++
++ sdp->sd_riinode_vn = ip->i_gl->gl_vn;
++
++ return 0;
++
++ fail:
++ clear_rgrpdi(sdp);
++
++ return error;
++}
++
++/**
++ * gfs_rindex_hold - Grab a lock on the rindex
++ * @sdp: The GFS superblock
++ * @ri_gh: the glock holder
++ *
++ * We grab a lock on the rindex inode to make sure that it doesn't
++ * change whilst we are performing an operation. We keep this lock
++ * for quite long periods of time compared to other locks. This
++ * doesn't matter, since it's shared and it is very, very rarely
++ * accessed in the exclusive mode.
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_rindex_hold(struct gfs_sbd *sdp, struct gfs_holder *ri_gh)
++{
++ struct gfs_inode *ip = sdp->sd_riinode;
++ struct gfs_glock *gl = ip->i_gl;
++ int error;
++
++ error = gfs_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
++ if (error)
++ return error;
++
++ if (sdp->sd_riinode_vn != gl->gl_vn) {
++ down(&sdp->sd_rindex_lock);
++ if (sdp->sd_riinode_vn != gl->gl_vn) {
++ error = gfs_ri_update(ip);
++ if (error)
++ gfs_glock_dq_uninit(ri_gh);
++ }
++ up(&sdp->sd_rindex_lock);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_rgrp_read - Read in a RG's bitmaps
++ * @rgd: the struct gfs_rgrpd describing the RG to read in
++ *
++ * Read in RG bitmaps. Must call gfs_rgrp_relse() to free the bitmaps.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_rgrp_read(struct gfs_rgrpd *rgd)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_glock *gl = rgd->rd_gl;
++ unsigned int x, length = rgd->rd_ri.ri_length;
++ int error;
++
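++ /* Grab all the buffers first, then start all the reads at once,
++    and only then wait on them, so the I/O can overlap. */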
++ for (x = 0; x < length; x++) {
++ GFS_ASSERT_RGRPD(!rgd->rd_bh[x], rgd,);
++ rgd->rd_bh[x] = gfs_dgetblk(sdp, rgd->rd_ri.ri_addr + x, gl);
++ }
++
++ for (x = 0; x < length; x++) {
++ error = gfs_dreread(sdp, rgd->rd_bh[x], DIO_START);
++ if (error)
++ goto fail;
++ }
++
++ for (x = length; x--;) {
++ error = gfs_dreread(sdp, rgd->rd_bh[x], DIO_WAIT);
++ if (error)
++ goto fail;
++ gfs_metatype_check(sdp, rgd->rd_bh[x],
++ (x) ? GFS_METATYPE_RB : GFS_METATYPE_RG);
++ }
++
++ if (rgd->rd_rg_vn != gl->gl_vn) {
++ gfs_rgrp_in(&rgd->rd_rg, (rgd->rd_bh[0])->b_data);
++ rgd->rd_rg_vn = gl->gl_vn;
++ }
++
++ return 0;
++
++ fail:
++ for (x = 0; x < length; x++) {
++ brelse(rgd->rd_bh[x]);
++ rgd->rd_bh[x] = NULL;
++ }
++
++ return error;
++}
++
++/**
++ * gfs_rgrp_relse - Release RG bitmaps read in with gfs_rgrp_read()
++ * @rgd: the struct gfs_rgrpd describing the RG to read in
++ *
++ */
++
++void
++gfs_rgrp_relse(struct gfs_rgrpd *rgd)
++{
++ int x, length = rgd->rd_ri.ri_length;
++
++ for (x = 0; x < length; x++) {
++ brelse(rgd->rd_bh[x]);
++ rgd->rd_bh[x] = NULL;
++ }
++}
++
++/**
++ * gfs_rgrp_lvb_fill - copy RG usage data out of the struct gfs_rgrp into the struct gfs_rgrp_lvb
++ * @rgd: the resource group data structure
++ *
++ */
++
++void
++gfs_rgrp_lvb_fill(struct gfs_rgrpd *rgd)
++{
++ struct gfs_rgrp *rg = &rgd->rd_rg;
++ struct gfs_rgrp_lvb *rb = (struct gfs_rgrp_lvb *)rgd->rd_gl->gl_lvb;
++
++ rb->rb_magic = cpu_to_gfs32(GFS_MAGIC);
++ rb->rb_free = cpu_to_gfs32(rg->rg_free);
++ rb->rb_useddi = cpu_to_gfs32(rg->rg_useddi);
++ rb->rb_freedi = cpu_to_gfs32(rg->rg_freedi);
++ rb->rb_usedmeta = cpu_to_gfs32(rg->rg_usedmeta);
++ rb->rb_freemeta = cpu_to_gfs32(rg->rg_freemeta);
++
++ clear_bit(GLF_LVB_INVALID, &rgd->rd_gl->gl_flags);
++}
++
++/**
++ * gfs_rgrp_lvb_init - Init the data of a RG LVB
++ * @rgd: the resource group data structure
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_rgrp_lvb_init(struct gfs_rgrpd *rgd)
++{
++ struct gfs_glock *gl = rgd->rd_gl;
++ struct gfs_holder rgd_gh;
++ int error;
++
++ error = gfs_glock_nq_init(gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
++ if (!error) {
++ gfs_rgrp_lvb_fill(rgd);
++ gfs_glock_dq_uninit(&rgd_gh);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_alloc_get - allocate a struct gfs_alloc structure for an inode
++ * @ip: the inode
++ *
++ * Returns: the struct gfs_alloc
++ */
++
++struct gfs_alloc *
++gfs_alloc_get(struct gfs_inode *ip)
++{
++ struct gfs_alloc *al = ip->i_alloc;
++
++ GFS_ASSERT_INODE(!al, ip,);
++
++ al = gmalloc(sizeof(struct gfs_alloc));
++ memset(al, 0, sizeof(struct gfs_alloc));
++
++ ip->i_alloc = al;
++
++ return al;
++}
++
++/**
++ * gfs_alloc_put - throw away the struct gfs_alloc for an inode
++ * @ip: the inode
++ *
++ */
++
++void
++gfs_alloc_put(struct gfs_inode *ip)
++{
++ struct gfs_alloc *al = ip->i_alloc;
++
++ GFS_ASSERT_INODE(al, ip,);
++
++ ip->i_alloc = NULL;
++ kfree(al);
++}
++
++/**
++ * try_rgrp_fit - See if a given reservation will fit in a given RG
++ * @rgd: the RG data
++ * @al: the struct gfs_alloc structure describing the reservation
++ *
++ * Sets al_rgd, al_reserved_data, and al_reserved_meta in @al on success.
++ *
++ * Returns: 1 on success, 0 on failure
++ */
++
++static int
++try_rgrp_fit(struct gfs_rgrpd *rgd, struct gfs_alloc *al)
++{
++ uint32_t freeblks = rgd->rd_rg.rg_free;
++ uint32_t freemeta = rgd->rd_rg.rg_freemeta;
++ uint32_t metares = al->al_requested_meta;
++ uint32_t datares = al->al_requested_data;
++
++ /* First take care of the data blocks required */
++
++ if (freeblks < al->al_requested_data)
++ return 0;
++
++ freeblks -= al->al_requested_data;
++
++ /* Then take care of the dinodes */
++
++ metares += al->al_requested_di;
++
++ /* Then take care of the metadata blocks */
++
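++ /* Metadata is carved out of free data blocks in fixed-size clumps;
++    reserve enough whole clumps to cover the metadata shortfall. */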
++ while (freemeta < metares) {
++ if (freeblks < GFS_META_CLUMP)
++ return 0;
++
++ freeblks -= GFS_META_CLUMP;
++ freemeta += GFS_META_CLUMP;
++
++ datares += GFS_META_CLUMP;
++ }
++
++ al->al_rgd = rgd;
++ al->al_reserved_meta = metares;
++ al->al_reserved_data = datares;
++
++ return 1;
++}
++
++/**
++ * recent_rgrp_first - get first RG from recent list
++ * @sdp: The GFS superblock
++ * @rglast: address of the rgrp used last
++ *
++ * Returns: The first rgrp in the recent list
++ */
++
++static struct gfs_rgrpd *
++recent_rgrp_first(struct gfs_sbd *sdp, uint64_t rglast)
++{
++ struct list_head *tmp, *head;
++ struct gfs_rgrpd *rgd = NULL;
++
++ spin_lock(&sdp->sd_rg_recent_lock);
++
++ if (list_empty(&sdp->sd_rg_recent))
++ goto out;
++
++ if (!rglast)
++ goto first;
++
++ for (head = &sdp->sd_rg_recent, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ rgd = list_entry(tmp, struct gfs_rgrpd, rd_recent);
++ if (rgd->rd_ri.ri_addr == rglast)
++ goto out;
++ }
++
++ first:
++ rgd = list_entry(sdp->sd_rg_recent.next, struct gfs_rgrpd, rd_recent);
++
++ out:
++ spin_unlock(&sdp->sd_rg_recent_lock);
++
++ return rgd;
++}
++
++/**
++ * recent_rgrp_next - get next RG from recent list
++ * @cur_rgd: current rgrp
++ *
++ * Returns: The next rgrp in the recent list
++ */
++
++static struct gfs_rgrpd *
++recent_rgrp_next(struct gfs_rgrpd *cur_rgd)
++{
++ struct gfs_sbd *sdp = cur_rgd->rd_sbd;
++ struct list_head *tmp, *head;
++ struct gfs_rgrpd *rgd;
++
++ spin_lock(&sdp->sd_rg_recent_lock);
++
++ for (head = &sdp->sd_rg_recent, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ rgd = list_entry(tmp, struct gfs_rgrpd, rd_recent);
++ if (rgd == cur_rgd) {
++ if (cur_rgd->rd_recent.next != &sdp->sd_rg_recent)
++ rgd = list_entry(cur_rgd->rd_recent.next,
++ struct gfs_rgrpd, rd_recent);
++ else
++ rgd = NULL;
++
++ goto out;
++ }
++ }
++
++ rgd = NULL;
++
++ out:
++ spin_unlock(&sdp->sd_rg_recent_lock);
++
++ return rgd;
++}
++
++/**
++ * recent_rgrp_remove - remove an RG from recent list
++ * @rgd: The rgrp to remove
++ *
++ */
++
++static void
++recent_rgrp_remove(struct gfs_rgrpd *rgd)
++{
++ spin_lock(&rgd->rd_sbd->sd_rg_recent_lock);
++ list_del(&rgd->rd_recent);
++ spin_unlock(&rgd->rd_sbd->sd_rg_recent_lock);
++}
++
++/**
++ * recent_rgrp_add - add an RG to recent list
++ * @new_rgd: The rgrp to add
++ *
++ */
++
++static void
++recent_rgrp_add(struct gfs_rgrpd *new_rgd)
++{
++ struct gfs_sbd *sdp = new_rgd->rd_sbd;
++ struct list_head *tmp, *head;
++ struct gfs_rgrpd *rgd = NULL;
++ unsigned int count = 0;
++ unsigned int max = sdp->sd_rgcount / gfs_num_journals(sdp);
++
++ spin_lock(&sdp->sd_rg_recent_lock);
++
++ for (head = &sdp->sd_rg_recent, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ rgd = list_entry(tmp, struct gfs_rgrpd, rd_recent);
++ if (rgd == new_rgd)
++ goto out;
++
++ if (++count >= max)
++ goto out;
++ }
++ list_add_tail(&new_rgd->rd_recent, &sdp->sd_rg_recent);
++
++ out:
++ spin_unlock(&sdp->sd_rg_recent_lock);
++}
++
++/**
++ * forward_rgrp_get - get an rgrp to try next from full list
++ * @sdp: The GFS superblock
++ *
++ * Returns: The rgrp to try next
++ */
++
++static struct gfs_rgrpd *
++forward_rgrp_get(struct gfs_sbd *sdp)
++{
++ struct gfs_rgrpd *rgd;
++ unsigned int journals = gfs_num_journals(sdp);
++ unsigned int rg = 0, x;
++
++ spin_lock(&sdp->sd_rg_forward_lock);
++
++ rgd = sdp->sd_rg_forward;
++ if (!rgd) {
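++ /* No forward pointer yet: start this node a journal-dependent
++    fraction of the way through the RG list, so different nodes
++    begin their searches in different places. */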
++ if (sdp->sd_rgcount >= journals)
++ rg = sdp->sd_rgcount *
++ sdp->sd_lockstruct.ls_jid /
++ journals;
++
++ for (x = 0, rgd = gfs_rgrpd_get_first(sdp);
++ x < rg;
++ x++, rgd = gfs_rgrpd_get_next(rgd))
++ /* Do Nothing */;
++
++ sdp->sd_rg_forward = rgd;
++ }
++
++ spin_unlock(&sdp->sd_rg_forward_lock);
++
++ return rgd;
++}
++
++/**
++ * forward_rgrp_set - set the forward rgrp pointer
++ * @sdp: the filesystem
++ * @rgd: The new forward rgrp
++ *
++ */
++
++static void
++forward_rgrp_set(struct gfs_sbd *sdp, struct gfs_rgrpd *rgd)
++{
++ spin_lock(&sdp->sd_rg_forward_lock);
++ sdp->sd_rg_forward = rgd;
++ spin_unlock(&sdp->sd_rg_forward_lock);
++}
++
++/**
++ * get_local_rgrp - Choose and lock a rgrp for allocation
++ * @ip: the inode to reserve space for
++ *
++ * Try to acquire an rgrp in a way which avoids contending with other
++ * nodes; the chosen and locked rgrp is set in @ip's allocation structure.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_local_rgrp(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_rgrpd *rgd, *begin, *next = NULL;
++ struct gfs_alloc *al = ip->i_alloc;
++ int flags = LM_FLAG_TRY;
++ int error = 0;
++ int skipped = 0;
++ int loops = 0;
++ int update_recent = FALSE;
++
++ /* Try recently successful rgrps */
++
++ rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
++
++ while (rgd) {
++ error = gfs_glock_nq_init(rgd->rd_gl,
++ LM_ST_EXCLUSIVE, LM_FLAG_TRY,
++ &al->al_rgd_gh);
++ switch (error) {
++ case 0:
++ if (try_rgrp_fit(rgd, al))
++ goto out;
++
++ next = recent_rgrp_next(rgd);
++ recent_rgrp_remove(rgd);
++ gfs_glock_dq_uninit(&al->al_rgd_gh);
++ rgd = next;
++ break;
++
++ case GLR_TRYFAILED:
++ rgd = recent_rgrp_next(rgd);
++ break;
++
++ default:
++ GFS_ASSERT_RGRPD(error < 0, rgd,);
++ return error;
++ }
++ }
++
++ /* Go through full list of rgrps */
++
++ update_recent = TRUE;
++ begin = rgd = forward_rgrp_get(sdp);
++
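++ /* First time around, LM_FLAG_TRY lets us skip RGs that other nodes
++    hold; if we get all the way around having skipped some, go around
++    again without TRY and wait for the locks. */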
++ for (;;) {
++ error = gfs_glock_nq_init(rgd->rd_gl,
++ LM_ST_EXCLUSIVE, flags,
++ &al->al_rgd_gh);
++ switch (error) {
++ case 0:
++ if (try_rgrp_fit(rgd, al))
++ goto out;
++ gfs_glock_dq_uninit(&al->al_rgd_gh);
++ break;
++
++ case GLR_TRYFAILED:
++ GFS_ASSERT_RGRPD(flags == LM_FLAG_TRY, rgd,);
++ skipped++;
++ break;
++
++ default:
++ GFS_ASSERT_RGRPD(error < 0, rgd,);
++ return error;
++ }
++
++ rgd = gfs_rgrpd_get_next(rgd);
++ if (!rgd)
++ rgd = gfs_rgrpd_get_first(sdp);
++
++ if (rgd == begin) {
++ if (++loops >= 2 || !skipped) {
++ return -ENOSPC;
++ }
++ flags = 0;
++ }
++ }
++
++ out:
++ ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
++
++ if (update_recent) {
++ recent_rgrp_add(rgd);
++ rgd = gfs_rgrpd_get_next(rgd);
++ forward_rgrp_set(sdp, rgd);
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_inplace_reserve_i - Reserve space in the filesystem
++ * @ip: the inode to reserve space for
++ *
++ * Acquire resource group locks to allow for the maximum allocation
++ * described by "res".
++ *
++ * This should probably become more complex again, but for now, let's go
++ * for simple (one resource group) reservations.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_inplace_reserve_i(struct gfs_inode *ip,
++ char *file, unsigned int line)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ int error;
++
++ GFS_ASSERT_INODE(al->al_requested_di ||
++ al->al_requested_data ||
++ al->al_requested_meta, ip,);
++
++ error = gfs_rindex_hold(sdp, &al->al_ri_gh);
++ if (error)
++ return error;
++
++ error = get_local_rgrp(ip);
++ if (error) {
++ gfs_glock_dq_uninit(&al->al_ri_gh);
++ return error;
++ }
++
++ gfs_depend_sync(al->al_rgd);
++
++ al->al_file = file;
++ al->al_line = line;
++
++ return 0;
++}
++
++/**
++ * gfs_inplace_release - release an inplace reservation
++ * @ip: the inode the reservation was taken out on
++ *
++ * Release a reservation made by gfs_inplace_reserve().
++ */
++
++void
++gfs_inplace_release(struct gfs_inode *ip)
++{
++ struct gfs_alloc *al = ip->i_alloc;
++
++ GFS_ASSERT_INODE(al->al_alloced_di <= al->al_requested_di, ip,
++ printk("al_alloced_di = %u, al_requested_di = %u\n",
++ al->al_alloced_di, al->al_requested_di);
++ printk("al_file = %s, al_line = %u\n",
++ al->al_file, al->al_line););
++ GFS_ASSERT_INODE(al->al_alloced_meta <= al->al_reserved_meta, ip,
++ printk("al_alloced_meta = %u, al_reserved_meta = %u\n",
++ al->al_alloced_meta, al->al_reserved_meta);
++ printk("al_file = %s, al_line = %u\n",
++ al->al_file, al->al_line););
++ GFS_ASSERT_INODE(al->al_alloced_data <= al->al_reserved_data, ip,
++ printk("al_alloced_data = %u, al_reserved_data = %u\n",
++ al->al_alloced_data, al->al_reserved_data);
++ printk("al_file = %s, al_line = %u\n",
++ al->al_file, al->al_line););
++
++ al->al_rgd = NULL;
++ gfs_glock_dq_uninit(&al->al_rgd_gh);
++ gfs_glock_dq_uninit(&al->al_ri_gh);
++}
++
++/**
++ * gfs_get_block_type - Determine the type of a given block in an RG
++ * @rgd: the resource group holding the block
++ * @block: the block number
++ *
++ * Returns: The block type (GFS_BLKST_*)
++ */
++
++unsigned char
++gfs_get_block_type(struct gfs_rgrpd *rgd, uint64_t block)
++{
++ struct gfs_bitmap *bits = NULL;
++ uint32_t length, rgrp_block, buf_block;
++ unsigned int buf;
++ unsigned char type;
++
++ length = rgd->rd_ri.ri_length;
++ rgrp_block = block - rgd->rd_ri.ri_data1;
++
++ for (buf = 0; buf < length; buf++) {
++ bits = &rgd->rd_bits[buf];
++ if (rgrp_block < (bits->bi_start + bits->bi_len) * GFS_NBBY)
++ break;
++ }
++
++ GFS_ASSERT_RGRPD(buf < length, rgd,);
++ buf_block = rgrp_block - bits->bi_start * GFS_NBBY;
++
++ type = gfs_testbit(rgd,
++ rgd->rd_bh[buf]->b_data + bits->bi_offset,
++ bits->bi_len, buf_block);
++
++ return type;
++}
++
++/**
++ * blkalloc_internal - allocate a single block
++ * @rgd: the resource group descriptor
++ * @goal: the goal block in the RG
++ * @old_state: the type of block to find
++ * @new_state: the resulting block type
++ *
++ * This function never fails.
++ *
++ * Returns: returns the block allocated
++ */
++
++static uint32_t
++blkalloc_internal(struct gfs_rgrpd *rgd,
++ uint32_t goal,
++ unsigned char old_state, unsigned char new_state)
++{
++ struct gfs_bitmap *bits = NULL;
++ uint32_t length = rgd->rd_ri.ri_length;
++ uint32_t blk = 0;
++ unsigned int buf, x;
++
++ for (buf = 0; buf < length; buf++) {
++ bits = &rgd->rd_bits[buf];
++ if (goal < (bits->bi_start + bits->bi_len) * GFS_NBBY)
++ break;
++ }
++
++ GFS_ASSERT_RGRPD(buf < length, rgd,);
++ goal -= bits->bi_start * GFS_NBBY;
++
++ /* "x <= length" because we're skipping over some of the first
++ buffer when the goal is non-zero. */
++
++ for (x = 0; x <= length; x++) {
++ blk = gfs_bitfit(rgd,
++ rgd->rd_bh[buf]->b_data + bits->bi_offset,
++ bits->bi_len, goal, old_state);
++ if (blk != BFITNOENT)
++ break;
++
++ buf = (buf + 1) % length;
++ bits = &rgd->rd_bits[buf];
++ goal = 0;
++ }
++
++ GFS_ASSERT_RGRPD(x <= length, rgd,);
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[buf]);
++ gfs_setbit(rgd,
++ rgd->rd_bh[buf]->b_data + bits->bi_offset,
++ bits->bi_len, blk, new_state);
++
++ return bits->bi_start * GFS_NBBY + blk;
++}
++
++/**
++ * blkfree_internal - Free a block
++ * @sdp: the filesystem
++ * @bstart: the start of a run of blocks to free
++ * @blen: the length of the block run
++ * @new_state: the new state of the block
++ *
++ * Returns: the rgrp containing the blocks
++ */
++
++static struct gfs_rgrpd *
++blkfree_internal(struct gfs_sbd *sdp, uint64_t bstart, uint32_t blen,
++ unsigned char new_state)
++{
++ struct gfs_rgrpd *rgd;
++ struct gfs_bitmap *bits = NULL;
++ uint32_t length, rgrp_blk, buf_blk;
++ unsigned int buf;
++
++ rgd = gfs_blk2rgrpd(sdp, bstart);
++ GFS_ASSERT_SBD(rgd, sdp,
++ printk("block = %"PRIu64"\n", bstart););
++
++ length = rgd->rd_ri.ri_length;
++ rgrp_blk = bstart - rgd->rd_ri.ri_data1;
++
++ while (blen--) {
++ for (buf = 0; buf < length; buf++) {
++ bits = &rgd->rd_bits[buf];
++ if (rgrp_blk < (bits->bi_start + bits->bi_len) * GFS_NBBY)
++ break;
++ }
++
++ GFS_ASSERT_RGRPD(buf < length, rgd,);
++ buf_blk = rgrp_blk - bits->bi_start * GFS_NBBY;
++ rgrp_blk++;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[buf]);
++ gfs_setbit(rgd,
++ rgd->rd_bh[buf]->b_data + bits->bi_offset,
++ bits->bi_len, buf_blk, new_state);
++ }
++
++ return rgd;
++}
++
++/**
++ * clump_alloc - Allocate a clump of metadata
++ * @rgd: the resource group descriptor
++ * @first: returns the first block allocated
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++clump_alloc(struct gfs_rgrpd *rgd, uint32_t *first)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_meta_header mh;
++ struct buffer_head **bh;
++ uint32_t goal, blk;
++ unsigned int x;
++ int error = 0;
++
++ memset(&mh, 0, sizeof(struct gfs_meta_header));
++ mh.mh_magic = GFS_MAGIC;
++ mh.mh_type = GFS_METATYPE_NONE;
++
++ bh = gmalloc(GFS_META_CLUMP * sizeof(struct buffer_head *));
++ memset(bh, 0, GFS_META_CLUMP * sizeof(struct buffer_head *));
++
++ goal = rgd->rd_last_alloc_data;
++
++ for (x = 0; x < GFS_META_CLUMP; x++) {
++ blk = blkalloc_internal(rgd, goal, GFS_BLKST_FREE,
++ GFS_BLKST_FREEMETA);
++ if (!x)
++ *first = blk;
++
++ bh[x] = gfs_dgetblk(sdp, rgd->rd_ri.ri_data1 + blk, rgd->rd_gl);
++
++ gfs_prep_new_buffer(bh[x]);
++
++ gfs_meta_header_out(&mh, bh[x]->b_data);
++ ((struct gfs_meta_header *)bh[x]->b_data)->mh_generation = 0;
++
++ error = gfs_dwrite(sdp, bh[x], DIO_DIRTY | DIO_START);
++ if (error)
++ goto out;
++
++ goal = blk;
++ }
++
++ rgd->rd_last_alloc_data = goal;
++
++ for (x = 0; x < GFS_META_CLUMP; x++) {
++ error = gfs_dwrite(sdp, bh[x], DIO_WAIT);
++ if (error)
++ goto out;
++ }
++
++ gfs_mhc_add(rgd, bh, GFS_META_CLUMP);
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_free >= GFS_META_CLUMP, rgd,);
++ rgd->rd_rg.rg_free -= GFS_META_CLUMP;
++ rgd->rd_rg.rg_freemeta += GFS_META_CLUMP;
++
++ out:
++ for (x = 0; x < GFS_META_CLUMP; x++)
++ if (bh[x]) {
++ gfs_dwrite(sdp, bh[x], DIO_WAIT);
++ brelse(bh[x]);
++ }
++ kfree(bh);
++
++ return error;
++}
++
++/**
++ * gfs_blkalloc - Allocate a data block
++ * @ip: the inode to allocate the data block for
++ * @block: the block allocated
++ *
++ */
++
++void
++gfs_blkalloc(struct gfs_inode *ip, uint64_t *block)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ struct gfs_rgrpd *rgd = al->al_rgd;
++ uint32_t goal, blk;
++ int same;
++
++ GFS_ASSERT_INODE(rgd, ip,);
++
++ same = (rgd->rd_ri.ri_addr == ip->i_di.di_goal_rgrp);
++ goal = (same) ? ip->i_di.di_goal_dblk : rgd->rd_last_alloc_data;
++
++ blk = blkalloc_internal(rgd, goal,
++ GFS_BLKST_FREE, GFS_BLKST_USED);
++ rgd->rd_last_alloc_data = blk;
++
++ if (!same) {
++ ip->i_di.di_goal_rgrp = rgd->rd_ri.ri_addr;
++ ip->i_di.di_goal_mblk = 0;
++ }
++ ip->i_di.di_goal_dblk = blk;
++
++ *block = rgd->rd_ri.ri_data1 + blk;
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_free, rgd,);
++ rgd->rd_rg.rg_free--;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++
++ al->al_alloced_data++;
++
++ gfs_trans_add_quota(sdp, +1, ip->i_di.di_uid, ip->i_di.di_gid);
++}
++
++/**
++ * gfs_metaalloc - Allocate a metadata block to a file
++ * @ip: the file
++ * @block: the block allocated
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_metaalloc(struct gfs_inode *ip, uint64_t *block)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ struct gfs_rgrpd *rgd = al->al_rgd;
++ uint32_t goal, blk;
++ int same;
++ int error;
++
++ GFS_ASSERT_INODE(rgd, ip,);
++
++ same = (rgd->rd_ri.ri_addr == ip->i_di.di_goal_rgrp);
++
++ if (!rgd->rd_rg.rg_freemeta) {
++ error = clump_alloc(rgd, &goal);
++ if (error)
++ return error;
++
++ al->al_alloced_data += GFS_META_CLUMP;
++ } else
++ goal = (same) ? ip->i_di.di_goal_mblk : rgd->rd_last_alloc_meta;
++
++ blk = blkalloc_internal(rgd, goal,
++ GFS_BLKST_FREEMETA, GFS_BLKST_USEDMETA);
++ rgd->rd_last_alloc_meta = blk;
++
++ if (!same) {
++ ip->i_di.di_goal_rgrp = rgd->rd_ri.ri_addr;
++ ip->i_di.di_goal_dblk = 0;
++ }
++ ip->i_di.di_goal_mblk = blk;
++
++ *block = rgd->rd_ri.ri_data1 + blk;
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_freemeta, rgd,);
++ rgd->rd_rg.rg_freemeta--;
++ rgd->rd_rg.rg_usedmeta++;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++
++ al->al_alloced_meta++;
++
++ gfs_trans_add_quota(sdp, +1, ip->i_di.di_uid, ip->i_di.di_gid);
++
++ return 0;
++}
++
++/**
++ * gfs_dialloc - Allocate a dinode
++ * @dip: the directory that the inode is going in
++ * @block: the block
++ *
++ * Returns: errno
++ */
++
++int
++gfs_dialloc(struct gfs_inode *dip, uint64_t *block)
++{
++ struct gfs_alloc *al = dip->i_alloc;
++ struct gfs_rgrpd *rgd = al->al_rgd;
++ uint32_t goal, blk;
++ int error = 0;
++
++ GFS_ASSERT_INODE(rgd, dip,);
++
++ if (rgd->rd_rg.rg_freemeta)
++ goal = rgd->rd_last_alloc_meta;
++ else {
++ error = clump_alloc(rgd, &goal);
++ if (error)
++ return error;
++
++ al->al_alloced_data += GFS_META_CLUMP;
++ }
++
++ blk = blkalloc_internal(rgd, goal,
++ GFS_BLKST_FREEMETA, GFS_BLKST_USEDMETA);
++ rgd->rd_last_alloc_meta = blk;
++
++ *block = rgd->rd_ri.ri_data1 + blk;
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_freemeta, rgd,);
++ rgd->rd_rg.rg_freemeta--;
++ rgd->rd_rg.rg_useddi++;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++
++ al->al_alloced_di++;
++ al->al_alloced_meta++;
++
++ return error;
++}
++
++/**
++ * gfs_blkfree - free a piece of data
++ * @ip: the inode these blocks are being freed from
++ * @bstart: the start of a run of blocks to free
++ * @blen: the length of the block run
++ *
++ */
++
++void
++gfs_blkfree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_rgrpd *rgd;
++
++ rgd = blkfree_internal(sdp, bstart, blen, GFS_BLKST_FREE);
++
++ rgd->rd_rg.rg_free += blen;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++
++ gfs_trans_add_quota(sdp, -(int64_t)blen,
++ ip->i_di.di_uid,
++ ip->i_di.di_gid);
++}
++
++/**
++ * gfs_metafree - free a piece of metadata
++ * @ip: the inode these blocks are being freed from
++ * @bstart: the start of a run of blocks to free
++ * @blen: the length of the block run
++ *
++ */
++
++void
++gfs_metafree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_rgrpd *rgd;
++
++ rgd = blkfree_internal(sdp, bstart, blen, GFS_BLKST_FREEMETA);
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_usedmeta >= blen, rgd,);
++ rgd->rd_rg.rg_usedmeta -= blen;
++ rgd->rd_rg.rg_freemeta += blen;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++
++ gfs_trans_add_quota(sdp, -(int64_t)blen,
++ ip->i_di.di_uid,
++ ip->i_di.di_gid);
++ gfs_wipe_buffers(ip, rgd, bstart, blen);
++}
++
++/**
++ * gfs_difree_uninit - free a dinode given only its block address
++ * @rgd: the resource group that contains the dinode
++ * @addr: the dinode address
++ *
++ */
++
++void
++gfs_difree_uninit(struct gfs_rgrpd *rgd, uint64_t addr)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_rgrpd *tmp_rgd;
++
++ tmp_rgd = blkfree_internal(sdp, addr, 1,
++ GFS_BLKST_FREEMETA);
++ GFS_ASSERT_RGRPD(rgd == tmp_rgd, rgd,);
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_useddi, rgd,);
++ rgd->rd_rg.rg_useddi--;
++ rgd->rd_rg.rg_freemeta++;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++}
++
++/**
++ * gfs_difree - free a dinode
++ * @rgd: the resource group that contains the dinode
++ * @ip: the inode representing the dinode to free
++ *
++ */
++
++void
++gfs_difree(struct gfs_rgrpd *rgd, struct gfs_inode *ip)
++{
++ gfs_difree_uninit(rgd, ip->i_num.no_addr);
++
++ gfs_trans_add_quota(ip->i_sbd, -1, ip->i_di.di_uid, ip->i_di.di_gid);
++ gfs_wipe_buffers(ip, rgd, ip->i_num.no_addr, 1);
++}
++
++/**
++ * gfs_rlist_add - add an RG to a list of RGs
++ * @sdp: the filesystem
++ * @rlist: the list of resource groups
++ * @block: the block
++ *
++ * Figure out what RG a block belongs to and add that RG to the list
++ *
++ */
++
++void
++gfs_rlist_add(struct gfs_sbd *sdp, struct gfs_rgrp_list *rlist, uint64_t block)
++{
++ struct gfs_rgrpd *rgd;
++ struct gfs_rgrpd **tmp;
++ unsigned int new_space;
++ unsigned int x;
++
++ GFS_ASSERT_SBD(rlist->rl_rgrps <= rlist->rl_space, sdp,);
++ GFS_ASSERT_SBD(!rlist->rl_ghs, sdp,);
++
++ rgd = gfs_blk2rgrpd(sdp, block);
++ GFS_ASSERT_SBD(rgd, sdp,
++ printk("block = %"PRIu64"\n", block););
++
++ for (x = 0; x < rlist->rl_rgrps; x++)
++ if (rlist->rl_rgd[x] == rgd)
++ return;
++
++ if (rlist->rl_rgrps == rlist->rl_space) {
++ new_space = rlist->rl_space + 10;
++
++ tmp = gmalloc(new_space * sizeof(struct gfs_rgrpd *));
++
++ if (rlist->rl_rgd) {
++ memcpy(tmp, rlist->rl_rgd,
++ rlist->rl_space * sizeof(struct gfs_rgrpd *));
++ kfree(rlist->rl_rgd);
++ }
++
++ rlist->rl_space = new_space;
++ rlist->rl_rgd = tmp;
++ }
++
++ rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
++}
++
++/**
++ * gfs_rlist_alloc - allocate holders for the RGs that have been added to the rlist
++ * @rlist: the list of resource groups
++ * @state: the lock state to acquire the RG lock in
++ * @flags: the modifier flags for the holder structures
++ *
++ */
++
++void
++gfs_rlist_alloc(struct gfs_rgrp_list *rlist, unsigned int state, int flags)
++{
++ unsigned int x;
++
++ rlist->rl_ghs = gmalloc(rlist->rl_rgrps * sizeof(struct gfs_holder));
++ for (x = 0; x < rlist->rl_rgrps; x++)
++ gfs_holder_init(rlist->rl_rgd[x]->rd_gl,
++ state, flags,
++ &rlist->rl_ghs[x]);
++}
++
++/**
++ * gfs_rlist_free - free a resource group list
++ * @list: the list of resource groups
++ *
++ */
++
++void
++gfs_rlist_free(struct gfs_rgrp_list *rlist)
++{
++ unsigned int x;
++
++ if (rlist->rl_rgd)
++ kfree(rlist->rl_rgd);
++
++ if (rlist->rl_ghs) {
++ for (x = 0; x < rlist->rl_rgrps; x++)
++ gfs_holder_uninit(&rlist->rl_ghs[x]);
++ kfree(rlist->rl_ghs);
++ }
++}
++
++/**
++ * gfs_reclaim_metadata - reclaims unused metadata
++ * @sdp: the file system
++ * @stats: stats on the reclamation
++ *
++ * This function will look through the resource groups and
++ * free the unused metadata.
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_reclaim_metadata(struct gfs_sbd *sdp, struct gfs_reclaim_stats *stats)
++{
++ struct gfs_holder ji_gh, ri_gh, rgd_gh, t_gh;
++ struct gfs_rgrpd *rgd;
++ struct gfs_rgrp *rg;
++ struct gfs_dinode *di;
++ struct gfs_inum next;
++ struct buffer_head *bh;
++ uint32_t flags;
++ uint32_t goal;
++ unsigned int x;
++ int error = 0;
++
++ /* Acquire the jindex lock here so we don't deadlock with a
++ process writing the jindex inode. :-( */
++
++ error = gfs_jindex_hold(sdp, &ji_gh);
++ if (error)
++ goto fail;
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ goto fail_jindex_relse;
++
++ for (rgd = gfs_rgrpd_get_first(sdp);
++ rgd;
++ rgd = gfs_rgrpd_get_next(rgd)) {
++ error = gfs_glock_nq_init(rgd->rd_gl,
++ LM_ST_EXCLUSIVE, GL_NOCACHE,
++ &rgd_gh);
++ if (error)
++ goto fail_rindex_relse;
++
++ rgrp_verify(rgd);
++
++ rg = &rgd->rd_rg;
++
++ if (!rg->rg_freedi && !rg->rg_freemeta) {
++ gfs_glock_dq_uninit(&rgd_gh);
++ continue;
++ }
++
++ gfs_mhc_zap(rgd);
++ gfs_depend_sync(rgd);
++
++ error = gfs_lock_fs_check_clean(sdp, LM_ST_EXCLUSIVE, &t_gh);
++ if (error)
++ goto fail_gunlock_rg;
++
++ error = gfs_trans_begin(sdp, rgd->rd_ri.ri_length, 0);
++ if (error)
++ goto fail_unlock_fs;
++
++ next = rg->rg_freedi_list;
++
++ for (x = rg->rg_freedi; x--;) {
++ GFS_ASSERT_RGRPD(next.no_formal_ino &&
++ next.no_addr, rgd,);
++
++ blkfree_internal(sdp, next.no_addr, 1, GFS_BLKST_FREE);
++
++ error = gfs_dread(sdp, next.no_addr, rgd->rd_gl,
++ DIO_FORCE | DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto fail_end_trans;
++
++ di = (struct gfs_dinode *)bh->b_data;
++ flags = di->di_flags;
++ flags = gfs32_to_cpu(flags);
++ GFS_ASSERT_RGRPD(flags & GFS_DIF_UNUSED, rgd,);
++
++ gfs_inum_in(&next, (char *)&di->di_next_unused);
++
++ brelse(bh);
++
++ rg->rg_freedi--;
++ rg->rg_free++;
++ stats->rc_inodes++;
++ }
++
++ GFS_ASSERT_RGRPD(!next.no_formal_ino && !next.no_addr, rgd,);
++ rg->rg_freedi_list = next;
++
++ goal = 0;
++ for (x = rg->rg_freemeta; x--;) {
++ goal = blkalloc_internal(rgd, goal,
++ GFS_BLKST_FREEMETA, GFS_BLKST_FREE);
++ rg->rg_freemeta--;
++ rg->rg_free++;
++ stats->rc_metadata++;
++ }
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(rg, rgd->rd_bh[0]->b_data);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_uninit(&t_gh);
++
++ gfs_glock_dq_uninit(&rgd_gh);
++ }
++
++ gfs_glock_dq_uninit(&ri_gh);
++
++ gfs_glock_dq_uninit(&ji_gh);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_unlock_fs:
++ gfs_glock_dq_uninit(&t_gh);
++
++ fail_gunlock_rg:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ fail_rindex_relse:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ fail_jindex_relse:
++ gfs_glock_dq_uninit(&ji_gh);
++
++ fail:
++ return error;
++}
+diff -urN linux-orig/fs/gfs/rgrp.h linux-patched/fs/gfs/rgrp.h
+--- linux-orig/fs/gfs/rgrp.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/rgrp.h 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,75 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RGRP_DOT_H__
++#define __RGRP_DOT_H__
++
++void gfs_mhc_add(struct gfs_rgrpd *rgd, struct buffer_head **bh,
++ unsigned int num);
++int gfs_mhc_fish(struct gfs_sbd *sdp, struct buffer_head *bh);
++void gfs_mhc_zap(struct gfs_rgrpd *rgd);
++
++void gfs_depend_add(struct gfs_rgrpd *rgd, uint64_t formal_ino);
++void gfs_depend_sync(struct gfs_rgrpd *rgd);
++
++struct gfs_rgrpd *gfs_blk2rgrpd(struct gfs_sbd *sdp, uint64_t blk);
++struct gfs_rgrpd *gfs_rgrpd_get_first(struct gfs_sbd *sdp);
++struct gfs_rgrpd *gfs_rgrpd_get_next(struct gfs_rgrpd *rgd);
++
++void gfs_clear_rgrpd(struct gfs_sbd *sdp);
++
++int gfs_rindex_hold(struct gfs_sbd *sdp, struct gfs_holder *ri_gh);
++
++int gfs_rgrp_read(struct gfs_rgrpd *rgd);
++void gfs_rgrp_relse(struct gfs_rgrpd *rgd);
++
++void gfs_rgrp_lvb_fill(struct gfs_rgrpd *rgd);
++int gfs_rgrp_lvb_init(struct gfs_rgrpd *rgd);
++
++struct gfs_alloc *gfs_alloc_get(struct gfs_inode *ip);
++void gfs_alloc_put(struct gfs_inode *ip);
++
++int gfs_inplace_reserve_i(struct gfs_inode *ip,
++ char *file, unsigned int line);
++#define gfs_inplace_reserve(ip) \
++gfs_inplace_reserve_i((ip), __FILE__, __LINE__)
++
++void gfs_inplace_release(struct gfs_inode *ip);
++
++unsigned char gfs_get_block_type(struct gfs_rgrpd *rgd, uint64_t block);
++
++void gfs_blkalloc(struct gfs_inode *ip, uint64_t *block);
++int gfs_metaalloc(struct gfs_inode *ip, uint64_t *block);
++int gfs_dialloc(struct gfs_inode *dip, uint64_t *block);
++
++void gfs_blkfree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen);
++void gfs_metafree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen);
++void gfs_difree_uninit(struct gfs_rgrpd *rgd, uint64_t addr);
++void gfs_difree(struct gfs_rgrpd *rgd, struct gfs_inode *ip);
++
++struct gfs_rgrp_list {
++ unsigned int rl_rgrps;
++ unsigned int rl_space;
++ struct gfs_rgrpd **rl_rgd;
++ struct gfs_holder *rl_ghs;
++};
++
++void gfs_rlist_add(struct gfs_sbd *sdp, struct gfs_rgrp_list *rlist,
++ uint64_t block);
++void gfs_rlist_alloc(struct gfs_rgrp_list *rlist, unsigned int state,
++ int flags);
++void gfs_rlist_free(struct gfs_rgrp_list *rlist);
++
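++/*
++ * Sketch of the intended rlist life cycle (illustrative only; how the
++ * rl_ghs holders are actually enqueued is left to the caller):
++ *
++ *	struct gfs_rgrp_list rlist;
++ *	memset(&rlist, 0, sizeof(struct gfs_rgrp_list));
++ *	gfs_rlist_add(sdp, &rlist, block);   (once per block of interest)
++ *	gfs_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
++ *	(acquire rlist.rl_ghs[0 .. rl_rgrps - 1], do the work, release)
++ *	gfs_rlist_free(&rlist);
++ */
++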
++int gfs_reclaim_metadata(struct gfs_sbd *sdp, struct gfs_reclaim_stats *stats);
++
++#endif /* __RGRP_DOT_H__ */
+diff -urN linux-orig/fs/gfs/super.c linux-patched/fs/gfs/super.c
+--- linux-orig/fs/gfs/super.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/super.c 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,1035 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "file.h"
++#include "format.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "log.h"
++#include "quota.h"
++#include "recovery.h"
++#include "rgrp.h"
++#include "super.h"
++#include "unlinked.h"
++
++/**
++ * gfs_init_tune_data - Fill in the struct gfs_tune (sd_tune) in the struct gfs_sbd.
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_init_tune_data(struct gfs_sbd *sdp)
++{
++ struct gfs_tune *gt = &sdp->sd_tune;
++
++ gt->gt_tune_version = GFS_TUNE_VERSION;
++
++ gt->gt_ilimit1 = 100;
++ gt->gt_ilimit1_tries = 3;
++ gt->gt_ilimit1_min = 1;
++ gt->gt_ilimit2 = 500;
++ gt->gt_ilimit2_tries = 10;
++ gt->gt_ilimit2_min = 3;
++ gt->gt_demote_secs = 300;
++ gt->gt_incore_log_blocks = 1024;
++ gt->gt_jindex_refresh_secs = 60;
++ gt->gt_depend_secs = 60;
++ gt->gt_scand_secs = 5;
++ gt->gt_recoverd_secs = 60;
++ gt->gt_logd_secs = 1;
++ gt->gt_quotad_secs = 5;
++ gt->gt_inoded_secs = 15;
++ gt->gt_quota_simul_sync = 64;
++ gt->gt_quota_warn_period = 10;
++ gt->gt_atime_quantum = 3600;
++ gt->gt_quota_quantum = 60;
++ gt->gt_quota_scale_num = 1;
++ gt->gt_quota_scale_den = 1;
++ gt->gt_quota_enforce = 1;
++ gt->gt_quota_account = 1;
++ gt->gt_new_files_jdata = 0;
++ gt->gt_new_files_directio = 0;
++ gt->gt_max_atomic_write = 4 << 20;
++ gt->gt_max_readahead = 1 << 18;
++ gt->gt_lockdump_size = 131072;
++ gt->gt_stall_secs = 600;
++ gt->gt_complain_secs = 10;
++ gt->gt_reclaim_limit = 5000;
++ gt->gt_entries_per_readdir = 32;
++ gt->gt_prefetch_secs = 10;
++ gt->gt_statfs_slots = 64;
++ gt->gt_max_mhc = 10000;
++}
++
++/**
++ * gfs_check_sb - Check superblock
++ * @sdp: the filesystem
++ * @sb: The superblock
++ * @silent: Don't print a message if the check fails
++ *
++ * Checks that the version code of the FS is one that we understand how
++ * to read and that the sizes of the various on-disk structures have not
++ * changed.
++ */
++
++int
++gfs_check_sb(struct gfs_sbd *sdp, struct gfs_sb *sb, int silent)
++{
++ unsigned int x;
++
++ if (sb->sb_header.mh_magic != GFS_MAGIC ||
++ sb->sb_header.mh_type != GFS_METATYPE_SB) {
++ if (!silent)
++ printk("GFS: not a GFS filesystem\n");
++ return -EINVAL;
++ }
++
++ /* If format numbers match exactly, we're done. */
++
++ if (sb->sb_fs_format == GFS_FORMAT_FS &&
++ sb->sb_multihost_format == GFS_FORMAT_MULTI)
++ return 0;
++
++ if (sb->sb_fs_format != GFS_FORMAT_FS) {
++ for (x = 0; gfs_old_fs_formats[x]; x++)
++ if (gfs_old_fs_formats[x] == sb->sb_fs_format)
++ break;
++
++ if (!gfs_old_fs_formats[x]) {
++ printk("GFS: code version (%u, %u) is incompatible with ondisk format (%u, %u)\n",
++ GFS_FORMAT_FS, GFS_FORMAT_MULTI,
++ sb->sb_fs_format, sb->sb_multihost_format);
++ printk("GFS: I don't know how to upgrade this FS\n");
++ return -EINVAL;
++ }
++ }
++
++ if (sb->sb_multihost_format != GFS_FORMAT_MULTI) {
++ for (x = 0; gfs_old_multihost_formats[x]; x++)
++ if (gfs_old_multihost_formats[x] == sb->sb_multihost_format)
++ break;
++
++ if (!gfs_old_multihost_formats[x]) {
++ printk("GFS: code version (%u, %u) is incompatible with ondisk format (%u, %u)\n",
++ GFS_FORMAT_FS, GFS_FORMAT_MULTI,
++ sb->sb_fs_format, sb->sb_multihost_format);
++ printk("GFS: I don't know how to upgrade this FS\n");
++ return -EINVAL;
++ }
++ }
++
++ if (!sdp->sd_args.ar_upgrade) {
++ printk("GFS: code version (%u, %u) is incompatible with ondisk format (%u, %u)\n",
++ GFS_FORMAT_FS, GFS_FORMAT_MULTI,
++ sb->sb_fs_format, sb->sb_multihost_format);
++ printk("GFS: Use the \"upgrade\" mount option to upgrade the FS\n");
++ printk("GFS: See the manual for more details\n");
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_read_sb - Read super block
++ * @sdp: The GFS superblock
++ * @gl: the glock for the superblock (assumed to be held)
++ * @silent: Don't print message if mount fails
++ *
++ */
++
++int
++gfs_read_sb(struct gfs_sbd *sdp, struct gfs_glock *gl, int silent)
++{
++ struct buffer_head *bh;
++ uint32_t hash_blocks, ind_blocks, leaf_blocks;
++ uint32_t tmp_blocks;
++ uint64_t space = 0;
++ unsigned int x;
++ int error;
++
++ error = gfs_dread(sdp, GFS_SB_ADDR >> sdp->sd_fsb2bb_shift,
++ gl, DIO_FORCE | DIO_START | DIO_WAIT, &bh);
++ if (error) {
++ if (!silent)
++ printk("GFS: fsid=%s: can't read superblock\n",
++ sdp->sd_fsname);
++ return error;
++ }
++
++ GFS_ASSERT_SBD(sizeof(struct gfs_sb) <= bh->b_size, sdp,);
++
++ gfs_sb_in(&sdp->sd_sb, bh->b_data);
++
++ brelse(bh);
++
++ error = gfs_check_sb(sdp, &sdp->sd_sb, silent);
++ if (error)
++ return error;
++
++ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
++ GFS_BASIC_BLOCK_SHIFT;
++ sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
++ sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode)) /
++ sizeof(uint64_t);
++ sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs_indirect)) /
++ sizeof(uint64_t);
++ sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs_meta_header);
++ sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
++ sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
++ sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t);
++
++ /* Compute maximum reservation required to add an entry to a directory */
++
++ hash_blocks = DIV_RU(sizeof(uint64_t) * (1 << GFS_DIR_MAX_DEPTH),
++ sdp->sd_jbsize);
++
++ ind_blocks = 0;
++ for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
++ tmp_blocks = DIV_RU(tmp_blocks, sdp->sd_inptrs);
++ ind_blocks += tmp_blocks;
++ }
++
++ leaf_blocks = 2 + GFS_DIR_MAX_DEPTH;
++
++ sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
++
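++ /* Compute the maximum file size representable at each metadata tree
++    height; stop once the multiplication wraps 64 bits, which the
++    do_div() consistency check below detects. */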
++ sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode);
++ sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
++ for (x = 2;; x++) {
++ uint64_t d;
++ uint32_t m;
++ space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
++ d = space;
++ m = do_div(d, sdp->sd_inptrs);
++
++ if (d != sdp->sd_heightsize[x - 1] || m)
++ break;
++ sdp->sd_heightsize[x] = space;
++ }
++ sdp->sd_max_height = x;
++ GFS_ASSERT_SBD(sdp->sd_max_height <= GFS_MAX_META_HEIGHT, sdp,);
++
++ sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode);
++ sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
++ for (x = 2;; x++) {
++ uint64_t d;
++ uint32_t m;
++ space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
++ d = space;
++ m = do_div(d, sdp->sd_inptrs);
++
++ if (d != sdp->sd_jheightsize[x - 1] || m)
++ break;
++ sdp->sd_jheightsize[x] = space;
++ }
++ sdp->sd_max_jheight = x;
++ GFS_ASSERT_SBD(sdp->sd_max_jheight <= GFS_MAX_META_HEIGHT, sdp,);
++
++ return 0;
++}
++
++/**
++ * gfs_do_upgrade - upgrade a filesystem
++ * @sdp: The GFS superblock
++ * @sb_gl: the glock for the superblock
++ *
++ */
++
++int
++gfs_do_upgrade(struct gfs_sbd *sdp, struct gfs_glock *sb_gl)
++{
++ struct gfs_holder ji_gh, t_gh, j_gh;
++ struct gfs_log_header lh;
++ struct buffer_head *bh;
++ unsigned int x;
++ int error;
++
++ /* If format numbers match exactly, we're done. */
++
++ if (sdp->sd_sb.sb_fs_format == GFS_FORMAT_FS &&
++ sdp->sd_sb.sb_multihost_format == GFS_FORMAT_MULTI) {
++ printk("GFS: fsid=%s: no upgrade necessary\n",
++ sdp->sd_fsname);
++ sdp->sd_args.ar_upgrade = FALSE;
++ return 0;
++ }
++
++ error = gfs_jindex_hold(sdp, &ji_gh);
++ if (error)
++ goto fail;
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl,
++ LM_ST_EXCLUSIVE, GL_NOCACHE,
++ &t_gh);
++ if (error)
++ goto fail_ji_relse;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ printk("GFS: fsid=%s: can't upgrade: read-only FS\n",
++ sdp->sd_fsname);
++ error = -EROFS;
++ goto fail_gunlock_tr;
++ }
++
++ for (x = 0; x < sdp->sd_journals; x++) {
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_jindex[x].ji_addr,
++ &gfs_meta_glops, LM_ST_SHARED,
++ LM_FLAG_TRY | GL_NOCACHE, &j_gh);
++ switch (error) {
++ case 0:
++ break;
++
++ case GLR_TRYFAILED:
++ printk("GFS: fsid=%s: journal %u is busy\n",
++ sdp->sd_fsname, x);
++ error = -EBUSY;
++
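++ /* fall through */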
++ default:
++ goto fail_gunlock_tr;
++ }
++
++ error = gfs_find_jhead(sdp, &sdp->sd_jindex[x],
++ j_gh.gh_gl, &lh);
++
++ gfs_glock_dq_uninit(&j_gh);
++
++ if (error)
++ goto fail_gunlock_tr;
++
++ if (!(lh.lh_flags & GFS_LOG_HEAD_UNMOUNT) || lh.lh_last_dump) {
++ printk("GFS: fsid=%s: journal %u is busy\n",
++ sdp->sd_fsname, x);
++ error = -EBUSY;
++ goto fail_gunlock_tr;
++ }
++ }
++
++ /* We don't need to journal this change because we're changing
++ only one sector of one block. We definitely don't want to have
++ the journaling code running at this point. */
++
++ error = gfs_dread(sdp, GFS_SB_ADDR >> sdp->sd_fsb2bb_shift, sb_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto fail_gunlock_tr;
++
++ gfs_sb_in(&sdp->sd_sb, bh->b_data);
++
++ error = gfs_check_sb(sdp, &sdp->sd_sb, FALSE);
++ GFS_ASSERT_SBD(!error, sdp,);
++
++ sdp->sd_sb.sb_fs_format = GFS_FORMAT_FS;
++ sdp->sd_sb.sb_multihost_format = GFS_FORMAT_MULTI;
++
++ gfs_sb_out(&sdp->sd_sb, bh->b_data);
++
++ set_bit(GLF_DIRTY, &sb_gl->gl_flags);
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY | DIO_START | DIO_WAIT);
++
++ brelse(bh);
++
++ gfs_glock_dq_uninit(&t_gh);
++
++ gfs_glock_dq_uninit(&ji_gh);
++
++ if (!error) {
++ printk("GFS: fsid=%s: upgrade successful\n",
++ sdp->sd_fsname);
++ sdp->sd_args.ar_upgrade = FALSE;
++ }
++
++ return error;
++
++ fail_gunlock_tr:
++ gfs_glock_dq_uninit(&t_gh);
++
++ fail_ji_relse:
++ gfs_glock_dq_uninit(&ji_gh);
++
++ fail:
++ if (error == -EBUSY)
++ printk("GFS: fsid=%s: can't upgrade: the FS is still busy or contains dirty journals\n",
++ sdp->sd_fsname);
++ else
++ printk("GFS: fsid=%s: can't upgrade: %d\n",
++ sdp->sd_fsname, error);
++
++ return error;
++}
++
++/**
++ * clear_journalsi - Clear all the journal index information (without locking)
++ * @sdp: The GFS superblock
++ *
++ */
++
++static void
++clear_journalsi(struct gfs_sbd *sdp)
++{
++ if (sdp->sd_jindex) {
++ kfree(sdp->sd_jindex);
++ sdp->sd_jindex = NULL;
++ }
++ sdp->sd_journals = 0;
++}
++
++/**
++ * gfs_clear_journals - Clear all the journal index information
++ * @sdp: The GFS superblock
++ *
++ */
++
++void
++gfs_clear_journals(struct gfs_sbd *sdp)
++{
++ down(&sdp->sd_jindex_lock);
++ clear_journalsi(sdp);
++ up(&sdp->sd_jindex_lock);
++}
++
++/**
++ * gfs_ji_update - Update the journal index information
++ * @ip: The journal index inode
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++gfs_ji_update(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ char buf[sizeof(struct gfs_jindex)];
++ unsigned int j;
++ int error;
++
++ GFS_ASSERT_SBD(!do_mod(ip->i_di.di_size, sizeof(struct gfs_jindex)),
++ sdp,);
++
++ clear_journalsi(sdp);
++
++ sdp->sd_jindex = gmalloc(ip->i_di.di_size);
++ memset(sdp->sd_jindex, 0, ip->i_di.di_size);
++
++ for (j = 0;; j++) {
++ error = gfs_internal_read(ip, buf,
++ j * sizeof(struct gfs_jindex),
++ sizeof(struct gfs_jindex));
++ if (!error)
++ break;
++ if (error != sizeof(struct gfs_jindex)) {
++ if (error > 0)
++ error = -EIO;
++ goto fail;
++ }
++
++ gfs_jindex_in(sdp->sd_jindex + j, buf);
++ }
++
++ GFS_ASSERT_SBD(j * sizeof(struct gfs_jindex) == ip->i_di.di_size,
++ sdp,);
++
++ sdp->sd_journals = j;
++ sdp->sd_jiinode_vn = ip->i_gl->gl_vn;
++
++ return 0;
++
++ fail:
++ clear_journalsi(sdp);
++ return error;
++}
++
++/**
++ * gfs_jindex_hold - Grab a lock on the jindex
++ * @sdp: The GFS superblock
++ * @ji_gh: the holder for the jindex glock
++ *
++ * This is very similar to gfs_rindex_hold(), except that in general we
++ * hold the jindex lock for longer periods of time and grab it far less
++ * frequently than the rgrp lock.
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_jindex_hold(struct gfs_sbd *sdp, struct gfs_holder *ji_gh)
++{
++ struct gfs_inode *ip = sdp->sd_jiinode;
++ struct gfs_glock *gl = ip->i_gl;
++ int error;
++
++ error = gfs_glock_nq_init(gl, LM_ST_SHARED, 0, ji_gh);
++ if (error)
++ return error;
++
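++ /* The version number is checked twice: once without the semaphore as
++    a fast path, then again under sd_jindex_lock before re-reading. */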
++ if (sdp->sd_jiinode_vn != gl->gl_vn) {
++ down(&sdp->sd_jindex_lock);
++ if (sdp->sd_jiinode_vn != gl->gl_vn)
++ error = gfs_ji_update(ip);
++ up(&sdp->sd_jindex_lock);
++ }
++
++ if (error)
++ gfs_glock_dq_uninit(ji_gh);
++
++ return error;
++}
++
++/**
++ * gfs_get_jiinode - Read in the jindex inode for the superblock
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_get_jiinode(struct gfs_sbd *sdp)
++{
++ struct gfs_holder ji_gh;
++ int error;
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_sb.sb_jindex_di.no_formal_ino,
++ &gfs_inode_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL,
++ &ji_gh);
++ if (error)
++ return error;
++
++ error = gfs_inode_get(ji_gh.gh_gl, &sdp->sd_sb.sb_jindex_di,
++ CREATE, &sdp->sd_jiinode);
++ if (!error) {
++ sdp->sd_jiinode_vn = ji_gh.gh_gl->gl_vn - 1;
++ set_bit(GLF_STICKY, &ji_gh.gh_gl->gl_flags);
++ }
++
++ gfs_glock_dq_uninit(&ji_gh);
++
++ return error;
++}
++
++/**
++ * gfs_get_riinode - Read in the rindex inode for the superblock
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_get_riinode(struct gfs_sbd *sdp)
++{
++ struct gfs_holder ri_gh;
++ int error;
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_sb.sb_rindex_di.no_formal_ino,
++ &gfs_inode_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL,
++ &ri_gh);
++ if (error)
++ return error;
++
++ error = gfs_inode_get(ri_gh.gh_gl, &sdp->sd_sb.sb_rindex_di,
++ CREATE, &sdp->sd_riinode);
++ if (!error) {
++ sdp->sd_riinode_vn = ri_gh.gh_gl->gl_vn - 1;
++ set_bit(GLF_STICKY, &ri_gh.gh_gl->gl_flags);
++ }
++
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return error;
++}
++
++/**
++ * gfs_get_rootinode - Read in the root inode
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_get_rootinode(struct gfs_sbd *sdp)
++{
++ struct gfs_holder i_gh;
++ int error;
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_sb.sb_root_di.no_formal_ino,
++ &gfs_inode_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL,
++ &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_inode_get(i_gh.gh_gl, &sdp->sd_sb.sb_root_di,
++ CREATE, &sdp->sd_rooti);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_get_qinode - Read in the quota inode
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_get_qinode(struct gfs_sbd *sdp)
++{
++ struct gfs_holder i_gh;
++ int error;
++
++ if (!sdp->sd_sb.sb_quota_di.no_formal_ino) {
++ error = gfs_alloc_qinode(sdp);
++ if (error)
++ return error;
++ }
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_sb.sb_quota_di.no_formal_ino,
++ &gfs_inode_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL,
++ &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_inode_get(i_gh.gh_gl, &sdp->sd_sb.sb_quota_di,
++ CREATE, &sdp->sd_qinode);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_get_linode - Read in the license inode
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_get_linode(struct gfs_sbd *sdp)
++{
++ struct gfs_holder i_gh;
++ int error;
++
++ if (!sdp->sd_sb.sb_license_di.no_formal_ino) {
++ error = gfs_alloc_linode(sdp);
++ if (error)
++ return error;
++ }
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_sb.sb_license_di.no_formal_ino,
++ &gfs_inode_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL,
++ &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_inode_get(i_gh.gh_gl, &sdp->sd_sb.sb_license_di,
++ CREATE, &sdp->sd_linode);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_make_fs_rw - Turn a RO FS into a RW one
++ * @sdp: the filesystem
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_make_fs_rw(struct gfs_sbd *sdp)
++{
++ struct gfs_glock *j_gl = sdp->sd_journal_gh.gh_gl;
++ struct gfs_holder t_gh;
++ struct gfs_log_header head;
++ int error;
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl,
++ LM_ST_SHARED,
++ GL_LOCAL_EXCL | GL_EXACT,
++ &t_gh);
++ if (error)
++ return error;
++
++ j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
++
++ error = gfs_find_jhead(sdp, &sdp->sd_jdesc, j_gl, &head);
++ if (error)
++ goto fail;
++
++ GFS_ASSERT_SBD(head.lh_flags & GFS_LOG_HEAD_UNMOUNT, sdp,);
++
++ /* Initialize the head-of-the-log state */
++ sdp->sd_sequence = head.lh_sequence;
++ sdp->sd_log_head = head.lh_first + 1;
++
++ error = gfs_recover_dump(sdp);
++ if (error)
++ goto fail;
++
++ set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
++ clear_bit(SDF_ROFS, &sdp->sd_flags);
++
++ set_bit(GLF_DIRTY, &j_gl->gl_flags);
++ gfs_log_dump(sdp, TRUE);
++
++ gfs_glock_dq_uninit(&t_gh);
++
++ return 0;
++
++ fail:
++ t_gh.gh_flags |= GL_NOCACHE;
++ gfs_glock_dq_uninit(&t_gh);
++
++ return error;
++}
++
++/**
++ * gfs_make_fs_ro - Turn a RW FS into a RO one
++ * @sdp: the filesystem
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_make_fs_ro(struct gfs_sbd *sdp)
++{
++ struct gfs_holder t_gh;
++ int error;
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl,
++ LM_ST_SHARED,
++ GL_LOCAL_EXCL | GL_EXACT | GL_NOCACHE,
++ &t_gh);
++ if (error)
++ return error;
++
++ gfs_sync_meta(sdp);
++ gfs_log_dump(sdp, TRUE);
++
++ error = gfs_log_shutdown(sdp);
++ if (error)
++ gfs_io_error(sdp);
++
++ set_bit(SDF_ROFS, &sdp->sd_flags);
++ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
++
++ gfs_glock_dq_uninit(&t_gh);
++
++ gfs_unlinked_cleanup(sdp);
++ gfs_quota_cleanup(sdp);
++
++ return error;
++}
++
++/**
++ * stat_gfs_async - Stat a filesystem using asynchronous locking
++ * @sdp: the filesystem
++ * @usage: the usage info that will be returned
++ * @interruptible: TRUE if we should look for signals.
++ *
++ * Any error (other than a signal) will cause this routine to fall back
++ * to the synchronous version.
++ *
++ * This really shouldn't busy wait like this.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++stat_gfs_async(struct gfs_sbd *sdp, struct gfs_usage *usage, int interruptible)
++{
++ struct gfs_rgrpd *rgd_next = gfs_rgrpd_get_first(sdp), *rgd;
++ struct gfs_holder *gha, *gh;
++ struct gfs_rgrp_lvb *rb;
++ unsigned int slots = sdp->sd_tune.gt_statfs_slots;
++ unsigned int x;
++ int done;
++ int error = 0, err;
++
++ gha = gmalloc(slots * sizeof(struct gfs_holder));
++ memset(gha, 0, slots * sizeof(struct gfs_holder));
++
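++ /* Keep up to "slots" asynchronous lock requests in flight at once,
++    harvesting each glock as its acquisition completes. */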
++ for (;;) {
++ done = TRUE;
++
++ for (x = 0; x < slots; x++) {
++ gh = gha + x;
++
++ if (gh->gh_gl && gfs_glock_poll(gh)) {
++ err = gfs_glock_wait(gh);
++ if (err) {
++ gfs_holder_uninit(gh);
++ error = err;
++ } else {
++ rgd = gl2rgd(gh->gh_gl);
++
++ rb = (struct gfs_rgrp_lvb *)rgd->rd_gl->gl_lvb;
++ if (gfs32_to_cpu(rb->rb_magic) == GFS_MAGIC &&
++ !test_bit(GLF_LVB_INVALID, &rgd->rd_gl->gl_flags)) {
++ usage->gu_total_blocks += rgd->rd_ri.ri_data;
++ usage->gu_free += gfs32_to_cpu(rb->rb_free);
++ usage->gu_used_dinode += gfs32_to_cpu(rb->rb_useddi);
++ usage->gu_free_dinode += gfs32_to_cpu(rb->rb_freedi);
++ usage->gu_used_meta += gfs32_to_cpu(rb->rb_usedmeta);
++ usage->gu_free_meta += gfs32_to_cpu(rb->rb_freemeta);
++ } else
++ error = -EINVAL;
++
++ gfs_glock_dq_uninit(gh);
++ }
++ }
++
++ if (gh->gh_gl)
++ done = FALSE;
++ else if (rgd_next && !error) {
++ gfs_glock_nq_init(rgd_next->rd_gl,
++ LM_ST_SHARED,
++ GL_LOCAL_EXCL | GL_SKIP | GL_ASYNC,
++ gh);
++ rgd_next = gfs_rgrpd_get_next(rgd_next);
++ done = FALSE;
++ }
++
++ if (interruptible && signal_pending(current))
++ error = -ERESTARTSYS;
++ }
++
++ if (done)
++ break;
++
++ yield();
++ }
++
++ kfree(gha);
++
++ return error;
++}
++
++/**
++ * gfs_stat_gfs - Do a statfs
++ * @sdp: the filesystem
++ * @usage: the usage structure
++ * @interruptible: Stop if there is a signal pending
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_stat_gfs(struct gfs_sbd *sdp, struct gfs_usage *usage, int interruptible)
++{
++ struct gfs_holder ri_gh, rgd_gh;
++ struct gfs_rgrpd *rgd;
++ struct gfs_rgrp_lvb *rb;
++ int error;
++
++ memset(usage, 0, sizeof(struct gfs_usage));
++ usage->gu_block_size = sdp->sd_sb.sb_bsize;
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ return error;
++
++ if (GFS_ASYNC_LM(sdp)) {
++ error = stat_gfs_async(sdp, usage, interruptible);
++ if (!error || error == -ERESTARTSYS)
++ goto out;
++
++ memset(usage, 0, sizeof(struct gfs_usage));
++ usage->gu_block_size = sdp->sd_sb.sb_bsize;
++ }
++
++ for (rgd = gfs_rgrpd_get_first(sdp);
++ rgd;
++ rgd = gfs_rgrpd_get_next(rgd)) {
++ for (;;) {
++ error = gfs_glock_nq_init(rgd->rd_gl,
++ LM_ST_SHARED,
++ GL_LOCAL_EXCL | GL_SKIP,
++ &rgd_gh);
++ if (error)
++ goto out;
++
++ rb = (struct gfs_rgrp_lvb *)rgd->rd_gl->gl_lvb;
++ if (gfs32_to_cpu(rb->rb_magic) == GFS_MAGIC &&
++ !test_bit(GLF_LVB_INVALID, &rgd->rd_gl->gl_flags)) {
++ usage->gu_total_blocks += rgd->rd_ri.ri_data;
++ usage->gu_free += gfs32_to_cpu(rb->rb_free);
++ usage->gu_used_dinode += gfs32_to_cpu(rb->rb_useddi);
++ usage->gu_free_dinode += gfs32_to_cpu(rb->rb_freedi);
++ usage->gu_used_meta += gfs32_to_cpu(rb->rb_usedmeta);
++ usage->gu_free_meta += gfs32_to_cpu(rb->rb_freemeta);
++
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ break;
++ } else {
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ error = gfs_rgrp_lvb_init(rgd);
++ if (error)
++ goto out;
++ }
++ }
++
++ if (interruptible && signal_pending(current)) {
++ error = -ERESTARTSYS;
++ goto out;
++ }
++ }
++
++ out:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return error;
++}
++
++/**
++ * gfs_lock_fs_check_clean - Stop all writes to the FS and check that all journals are clean
++ * @sdp: the file system
++ * @state: the state to put the transaction lock into
++ * @t_gh: the hold on the transaction lock
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_lock_fs_check_clean(struct gfs_sbd *sdp, unsigned int state,
++ struct gfs_holder *t_gh)
++{
++ struct gfs_holder ji_gh, cl_gh;
++ struct gfs_log_header lh;
++ unsigned int x;
++ int error;
++
++ error = gfs_jindex_hold(sdp, &ji_gh);
++ if (error)
++ return error;
++
++ error = gfs_glock_nq_num(sdp,
++ GFS_CRAP_LOCK, &gfs_meta_glops,
++ LM_ST_SHARED, GL_NOCACHE,
++ &cl_gh);
++ if (error)
++ goto fail;
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl, state,
++ LM_FLAG_PRIORITY | GL_EXACT | GL_NOCACHE,
++ t_gh);
++ if (error)
++ goto fail_gunlock_craplock;
++
++ for (x = 0; x < sdp->sd_journals; x++) {
++ error = gfs_find_jhead(sdp, &sdp->sd_jindex[x],
++ cl_gh.gh_gl, &lh);
++ if (error)
++ goto fail_gunlock_trans;
++
++ if (!(lh.lh_flags & GFS_LOG_HEAD_UNMOUNT)) {
++ error = -EBUSY;
++ goto fail_gunlock_trans;
++ }
++ }
++
++ gfs_glock_dq_uninit(&cl_gh);
++ gfs_glock_dq_uninit(&ji_gh);
++
++ return 0;
++
++ fail_gunlock_trans:
++ gfs_glock_dq_uninit(t_gh);
++
++ fail_gunlock_craplock:
++ gfs_glock_dq_uninit(&cl_gh);
++
++ fail:
++ gfs_glock_dq_uninit(&ji_gh);
++
++ return error;
++}
++
++/**
++ * gfs_freeze_fs - freezes the file system
++ * @sdp: the file system
++ *
++ * This function flushes data and metadata for all machines by
++ * acquiring the transaction lock exclusively. All journals are
++ * ensured to be in a clean state as well.
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_freeze_fs(struct gfs_sbd *sdp)
++{
++ int error = 0;
++
++ down(&sdp->sd_freeze_lock);
++
++ if (!sdp->sd_freeze_count++) {
++ error = gfs_lock_fs_check_clean(sdp, LM_ST_DEFERRED,
++ &sdp->sd_freeze_gh);
++ if (error)
++ sdp->sd_freeze_count--;
++ else
++ sdp->sd_freeze_gh.gh_owner = NULL;
++ }
++
++ up(&sdp->sd_freeze_lock);
++
++ return error;
++}
++
++/**
++ * gfs_unfreeze_fs - unfreezes the file system
++ * @sdp: the file system
++ *
++ * This function allows the file system to proceed by unlocking
++ * the exclusively held transaction lock. Other GFS nodes are
++ * now free to acquire the lock shared and go on with their lives.
++ *
++ */
++
++void
++gfs_unfreeze_fs(struct gfs_sbd *sdp)
++{
++ down(&sdp->sd_freeze_lock);
++
++ if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
++ gfs_glock_dq_uninit(&sdp->sd_freeze_gh);
++
++ up(&sdp->sd_freeze_lock);
++}
+diff -urN linux-orig/fs/gfs/super.h linux-patched/fs/gfs/super.h
+--- linux-orig/fs/gfs/super.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/super.h 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,53 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SUPER_DOT_H__
++#define __SUPER_DOT_H__
++
++void gfs_init_tune_data(struct gfs_sbd *sdp);
++
++int gfs_check_sb(struct gfs_sbd *sdp, struct gfs_sb *sb, int silent);
++int gfs_read_sb(struct gfs_sbd *sdp, struct gfs_glock *gl, int silent);
++int gfs_do_upgrade(struct gfs_sbd *sdp, struct gfs_glock *sb_gl);
++
++static __inline__ unsigned int
++gfs_num_journals(struct gfs_sbd *sdp)
++{
++ unsigned int num;
++ down(&sdp->sd_jindex_lock);
++ num = sdp->sd_journals;
++ up(&sdp->sd_jindex_lock);
++ return num;
++}
++
++int gfs_jindex_hold(struct gfs_sbd *sdp, struct gfs_holder *ji_gh);
++void gfs_clear_journals(struct gfs_sbd *sdp);
++
++int gfs_get_jiinode(struct gfs_sbd *sdp);
++int gfs_get_riinode(struct gfs_sbd *sdp);
++int gfs_get_rootinode(struct gfs_sbd *sdp);
++int gfs_get_qinode(struct gfs_sbd *sdp);
++int gfs_get_linode(struct gfs_sbd *sdp);
++
++int gfs_make_fs_rw(struct gfs_sbd *sdp);
++int gfs_make_fs_ro(struct gfs_sbd *sdp);
++
++int gfs_stat_gfs(struct gfs_sbd *sdp, struct gfs_usage *usage,
++ int interruptible);
++
++int gfs_lock_fs_check_clean(struct gfs_sbd *sdp, unsigned int state,
++ struct gfs_holder *t_gh);
++int gfs_freeze_fs(struct gfs_sbd *sdp);
++void gfs_unfreeze_fs(struct gfs_sbd *sdp);
++
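++/*
++ * Freezes nest: every successful gfs_freeze_fs() must be paired with a
++ * gfs_unfreeze_fs().  Illustrative sketch (the snapshot helper is
++ * hypothetical):
++ *
++ *	if (!gfs_freeze_fs(sdp)) {
++ *		snapshot_quiesced_fs(sdp);
++ *		gfs_unfreeze_fs(sdp);
++ *	}
++ */
++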
++#endif /* __SUPER_DOT_H__ */
+diff -urN linux-orig/fs/gfs/trans.c linux-patched/fs/gfs/trans.c
+--- linux-orig/fs/gfs/trans.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/trans.c 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,410 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "log.h"
++#include "lops.h"
++#include "quota.h"
++#include "trans.h"
++#include "unlinked.h"
++
++/**
++ * gfs_trans_print - Print a transaction to the console
++ * @sdp: the filesystem
++ * @tr: The GFS transaction
++ * @where: situation of the transaction (passed through to LO_PRINT())
++ *
++ */
++
++void
++gfs_trans_print(struct gfs_sbd *sdp, struct gfs_trans *tr, unsigned int where)
++{
++ struct gfs_log_element *le;
++ struct list_head *tmp, *head;
++ unsigned int mblks = 0, eblks = 0;
++
++ LO_TRANS_SIZE(sdp, tr, &mblks, &eblks, NULL, NULL);
++
++ printk("Transaction: (%s, %u)\n", tr->tr_file, tr->tr_line);
++ printk(" tr_mblks_asked = %u, tr_eblks_asked = %u, tr_seg_reserved = %u\n",
++ tr->tr_mblks_asked, tr->tr_eblks_asked, tr->tr_seg_reserved);
++ printk(" mblks = %u, eblks = %u\n", mblks, eblks);
++ printk(" tr_flags = 0x%.8X\n", tr->tr_flags);
++
++ for (head = &tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ LO_PRINT(sdp, le, where);
++ }
++
++ printk("End Trans\n");
++}
++
++/**
++ * gfs_trans_begin_i - Prepare to start a transaction
++ * @sdp: The GFS superblock
++ * @meta_blocks: Reserve this many metadata blocks in the log
++ * @extra_blocks: Number of non-metadata blocks to reserve
++ *
++ * Allocate the struct gfs_trans. Do in-place and
++ * log reservations.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_trans_begin_i(struct gfs_sbd *sdp,
++ unsigned int meta_blocks, unsigned int extra_blocks,
++ char *file, unsigned int line)
++{
++ struct gfs_trans *tr;
++ unsigned int blocks;
++ int error;
++
++ tr = gmalloc(sizeof(struct gfs_trans));
++ memset(tr, 0, sizeof(struct gfs_trans));
++
++ INIT_LIST_HEAD(&tr->tr_elements);
++ INIT_LIST_HEAD(&tr->tr_free_bufs);
++ INIT_LIST_HEAD(&tr->tr_free_bmem);
++ INIT_LIST_HEAD(&tr->tr_bufs);
++ INIT_LIST_HEAD(&tr->tr_ail_bufs);
++
++ tr->tr_file = file;
++ tr->tr_line = line;
++ tr->tr_t_gh = gfs_holder_get(sdp->sd_trans_gl, LM_ST_SHARED, 0);
++
++ error = gfs_glock_nq(tr->tr_t_gh);
++ if (error)
++ goto fail;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ tr->tr_t_gh->gh_flags |= GL_NOCACHE;
++ error = -EROFS;
++ goto fail_gunlock;
++ }
++
++ /* Do log reservation */
++
++ tr->tr_mblks_asked = meta_blocks;
++ tr->tr_eblks_asked = extra_blocks;
++
++ blocks = 1;
++ if (meta_blocks)
++ blocks += gfs_struct2blk(sdp, meta_blocks,
++ sizeof(struct gfs_block_tag)) +
++ meta_blocks;
++ blocks += extra_blocks;
++ tr->tr_seg_reserved = gfs_blk2seg(sdp, blocks);
++
++ error = gfs_log_reserve(sdp, tr->tr_seg_reserved, FALSE);
++ if (error)
++ goto fail_gunlock;
++
++ GFS_ASSERT_SBD(!current_transaction, sdp,);
++ current_transaction = tr;
++
++ return 0;
++
++ fail_gunlock:
++ gfs_glock_dq(tr->tr_t_gh);
++
++ fail:
++ gfs_holder_put(tr->tr_t_gh);
++ kfree(tr);
++
++ return error;
++}
++
++/**
++ * gfs_trans_end - End a transaction
++ * @sdp: The GFS superblock
++ *
++ * If buffers were actually added to the transaction,
++ * commit it.
++ */
++
++void
++gfs_trans_end(struct gfs_sbd *sdp)
++{
++ struct gfs_trans *tr;
++ struct gfs_holder *t_gh;
++ struct list_head *tmp, *head;
++ struct gfs_log_element *le;
++
++ tr = current_transaction;
++ GFS_ASSERT_SBD(tr, sdp,);
++ current_transaction = NULL;
++
++ t_gh = tr->tr_t_gh;
++ tr->tr_t_gh = NULL;
++
++ if (list_empty(&tr->tr_elements)) {
++ gfs_log_release(sdp, tr->tr_seg_reserved);
++ kfree(tr);
++
++ gfs_glock_dq(t_gh);
++ gfs_holder_put(t_gh);
++
++ return;
++ }
++
++ for (head = &tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ LO_TRANS_END(sdp, le);
++ }
++
++ gfs_log_commit(sdp, tr);
++
++ gfs_glock_dq(t_gh);
++ gfs_holder_put(t_gh);
++
++ if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
++ gfs_log_flush(sdp);
++}
++
++/**
++ * gfs_trans_add_gl - Add a glock to a transaction
++ * @gl: the glock
++ *
++ * Add the given glock to this process's transaction
++ */
++
++void
++gfs_trans_add_gl(struct gfs_glock *gl)
++{
++ if (!gl->gl_new_le.le_trans) {
++ GFS_ASSERT_GLOCK(gfs_glock_is_locked_by_me(gl) &&
++ gfs_glock_is_held_excl(gl), gl,);
++ gfs_glock_hold(gl); /* Released in glock_trans_end() */
++
++ set_bit(GLF_DIRTY, &gl->gl_flags);
++
++ LO_ADD(gl->gl_sbd, &gl->gl_new_le);
++ gl->gl_new_le.le_trans->tr_num_gl++;
++ }
++}
++
++/**
++ * gfs_trans_add_bh - Add a buffer to the current transaction
++ * @gl: the glock the buffer belongs to
++ * @bh: The buffer to add
++ *
++ * Add a buffer to the current transaction. The glock for the buffer
++ * should be held. This pins the buffer as well.
++ *
++ * Call this as many times as you want during transaction formation.
++ * It only does its work once.
++ *
++ */
++
++void
++gfs_trans_add_bh(struct gfs_glock *gl, struct buffer_head *bh)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_bufdata *bd;
++
++ bd = bh2bd(bh);
++ if (!bd) {
++ gfs_attach_bufdata(bh, gl);
++ bd = bh2bd(bh);
++ }
++
++ if (bd->bd_new_le.le_trans)
++ return;
++
++ gfs_meta_check(sdp, bh);
++
++ GFS_ASSERT_GLOCK(bd->bd_gl == gl, gl,);
++
++ if (!gl->gl_new_le.le_trans)
++ gfs_trans_add_gl(gl);
++
++ gfs_dpin(sdp, bh);
++
++ LO_ADD(sdp, &bd->bd_new_le);
++ bd->bd_new_le.le_trans->tr_num_buf++;
++}
++
++/**
++ * gfs_trans_add_unlinked - Add an unlinked/dealloced tag to the current transaction
++ * @sdp: the filesystem
++ * @type: the type of entry
++ * @inum: the inode number
++ *
++ * Returns: the unlinked structure
++ */
++
++struct gfs_unlinked *
++gfs_trans_add_unlinked(struct gfs_sbd *sdp, unsigned int type,
++ struct gfs_inum *inum)
++{
++ struct gfs_unlinked *ul;
++
++ ul = gfs_unlinked_get(sdp, inum, CREATE);
++
++ LO_ADD(sdp, &ul->ul_new_le);
++
++ switch (type) {
++ case GFS_LOG_DESC_IUL:
++ set_bit(ULF_NEW_UL, &ul->ul_flags);
++ ul->ul_new_le.le_trans->tr_num_iul++;
++ break;
++ case GFS_LOG_DESC_IDA:
++ clear_bit(ULF_NEW_UL, &ul->ul_flags);
++ ul->ul_new_le.le_trans->tr_num_ida++;
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,);
++ break;
++ }
++
++ return ul;
++}
++
++/**
++ * gfs_trans_add_quota - Add quota changes to a transaction
++ * @sdp: the filesystem
++ * @change: The number of blocks allocated (positive) or freed (negative)
++ * @uid: the user ID doing the change
++ * @gid: the group ID doing the change
++ *
++ */
++
++void
++gfs_trans_add_quota(struct gfs_sbd *sdp, int64_t change,
++ uint32_t uid, uint32_t gid)
++{
++ struct gfs_trans *tr;
++ struct list_head *tmp, *head, *next;
++ struct gfs_log_element *le;
++ struct gfs_quota_le *ql;
++ int found_uid, found_gid;
++ int error;
++
++ if (!sdp->sd_tune.gt_quota_account)
++ return;
++
++ GFS_ASSERT_SBD(change, sdp,);
++
++ found_uid = (uid == NO_QUOTA_CHANGE);
++ found_gid = (gid == NO_QUOTA_CHANGE);
++
++ GFS_ASSERT_SBD(!found_uid || !found_gid, sdp,);
++
++ tr = current_transaction;
++ GFS_ASSERT_SBD(tr, sdp,);
++
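++ /* First try to merge the change into a quota log element already in
++    this transaction; anything still unmatched gets a fresh element in
++    the loop that follows. */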
++ for (head = &tr->tr_elements, tmp = head->next, next = tmp->next;
++ tmp != head;
++ tmp = next, next = next->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ if (le->le_ops != &gfs_quota_lops)
++ continue;
++
++ ql = container_of(le, struct gfs_quota_le, ql_le);
++
++ if (test_bit(QDF_USER, &ql->ql_data->qd_flags)) {
++ if (ql->ql_data->qd_id == uid) {
++ ql->ql_change += change;
++
++ spin_lock(&sdp->sd_quota_lock);
++ ql->ql_data->qd_change_new += change;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ list_del(&le->le_list);
++
++ if (ql->ql_change)
++ list_add(&le->le_list,
++ &tr->tr_elements);
++ else {
++ gfs_quota_put(sdp, ql->ql_data);
++ kfree(ql);
++ tr->tr_num_q--;
++ }
++
++ GFS_ASSERT_SBD(!found_uid, sdp,);
++ found_uid = TRUE;
++ if (found_gid)
++ break;
++ }
++ } else {
++ if (ql->ql_data->qd_id == gid) {
++ ql->ql_change += change;
++
++ spin_lock(&sdp->sd_quota_lock);
++ ql->ql_data->qd_change_new += change;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ list_del(&le->le_list);
++
++ if (ql->ql_change)
++ list_add(&le->le_list,
++ &tr->tr_elements);
++ else {
++ gfs_quota_put(sdp, ql->ql_data);
++ kfree(ql);
++ tr->tr_num_q--;
++ }
++
++ GFS_ASSERT_SBD(!found_gid, sdp,);
++ found_gid = TRUE;
++ if (found_uid)
++ break;
++ }
++ }
++ }
++
++ while (!found_uid || !found_gid) {
++ ql = gmalloc(sizeof(struct gfs_quota_le));
++ memset(ql, 0, sizeof(struct gfs_quota_le));
++
++ INIT_LE(&ql->ql_le, &gfs_quota_lops);
++
++ if (found_uid) {
++ error = gfs_quota_get(sdp, FALSE, gid,
++ NO_CREATE,
++ &ql->ql_data);
++ found_gid = TRUE;
++ } else {
++ error = gfs_quota_get(sdp, TRUE, uid,
++ NO_CREATE,
++ &ql->ql_data);
++ found_uid = TRUE;
++ }
++
++ GFS_ASSERT_SBD(!error && ql->ql_data, sdp,);
++
++ ql->ql_change = change;
++
++ spin_lock(&sdp->sd_quota_lock);
++ ql->ql_data->qd_change_new += change;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ LO_ADD(sdp, &ql->ql_le);
++ tr->tr_num_q++;
++ }
++}
+diff -urN linux-orig/fs/gfs/trans.h linux-patched/fs/gfs/trans.h
+--- linux-orig/fs/gfs/trans.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/trans.h 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,37 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __TRANS_DOT_H__
++#define __TRANS_DOT_H__
++
++#define TRANS_IS_NEW (53)
++#define TRANS_IS_INCORE (54)
++void gfs_trans_print(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ unsigned int where);
++
++int gfs_trans_begin_i(struct gfs_sbd *sdp,
++ unsigned int meta_blocks, unsigned int extra_blocks,
++ char *file, unsigned int line);
++#define gfs_trans_begin(sdp, mb, eb) \
++gfs_trans_begin_i((sdp), (mb), (eb), __FILE__, __LINE__)
++
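++/*
++ * Illustrative call pattern (a sketch, not normative):
++ *
++ *	error = gfs_trans_begin(sdp, meta_blocks, extra_blocks);
++ *	if (error)
++ *		return error;
++ *	gfs_trans_add_bh(gl, bh);   (pin each buffer being modified)
++ *	gfs_trans_end(sdp);         (commits only if buffers were added)
++ */
++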
++void gfs_trans_end(struct gfs_sbd *sdp);
++
++void gfs_trans_add_gl(struct gfs_glock *gl);
++void gfs_trans_add_bh(struct gfs_glock *gl, struct buffer_head *bh);
++struct gfs_unlinked *gfs_trans_add_unlinked(struct gfs_sbd *sdp, unsigned int type,
++ struct gfs_inum *inum);
++void gfs_trans_add_quota(struct gfs_sbd *sdp, int64_t change, uint32_t uid,
++ uint32_t gid);
++
++#endif /* __TRANS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/unlinked.c linux-patched/fs/gfs/unlinked.c
+--- linux-orig/fs/gfs/unlinked.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/unlinked.c 2004-06-20 22:48:17.955944714 -0500
+@@ -0,0 +1,427 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "inode.h"
++#include "log.h"
++#include "lops.h"
++#include "unlinked.h"
++
++/**
++ * gfs_unlinked_get - Get a structure to represent an unlinked inode
++ * @sdp: the filesystem
++ * @inum: the inode that's unlinked
++ * @create: if TRUE, create the structure, otherwise return NULL
++ *
++ * Returns: the structure, or NULL
++ */
++
++struct gfs_unlinked *
++gfs_unlinked_get(struct gfs_sbd *sdp, struct gfs_inum *inum, int create)
++{
++ struct gfs_unlinked *ul = NULL, *new_ul = NULL;
++ struct list_head *tmp, *head;
++
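++ /* Search under the spinlock; on a miss, drop the lock, allocate a
++    candidate outside it, and retry so insertion cannot race with
++    another CPU adding the same inum. */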
++ for (;;) {
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ for (head = &sdp->sd_unlinked_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ ul = list_entry(tmp, struct gfs_unlinked, ul_list);
++ if (gfs_inum_equal(&ul->ul_inum, inum)) {
++ ul->ul_count++;
++ break;
++ }
++ }
++
++ if (tmp == head)
++ ul = NULL;
++
++ if (!ul && new_ul) {
++ ul = new_ul;
++ list_add(&ul->ul_list, &sdp->sd_unlinked_list);
++ new_ul = NULL;
++ }
++
++ spin_unlock(&sdp->sd_unlinked_lock);
++
++ if (ul || !create) {
++ if (new_ul)
++ kfree(new_ul);
++ return ul;
++ }
++
++ new_ul = gmalloc(sizeof(struct gfs_unlinked));
++ memset(new_ul, 0, sizeof(struct gfs_unlinked));
++
++ new_ul->ul_count = 1;
++ new_ul->ul_inum = *inum;
++
++ INIT_LE(&new_ul->ul_new_le, &gfs_unlinked_lops);
++ INIT_LE(&new_ul->ul_incore_le, &gfs_unlinked_lops);
++ INIT_LE(&new_ul->ul_ondisk_le, &gfs_unlinked_lops);
++ }
++}
++
++/**
++ * gfs_unlinked_hold - increment the usage count on a struct gfs_unlinked
++ * @sdp: the filesystem
++ * @ul: the structure
++ *
++ */
++
++void
++gfs_unlinked_hold(struct gfs_sbd *sdp, struct gfs_unlinked *ul)
++{
++ spin_lock(&sdp->sd_unlinked_lock);
++ ul->ul_count++;
++ spin_unlock(&sdp->sd_unlinked_lock);
++}
++
++/**
++ * gfs_unlinked_put - decrement the usage count on a struct gfs_unlinked
++ * @sdp: the filesystem
++ * @ul: the structure
++ *
++ * Free the structure if its reference count hits zero.
++ *
++ */
++
++void
++gfs_unlinked_put(struct gfs_sbd *sdp, struct gfs_unlinked *ul)
++{
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ GFS_ASSERT_SBD(ul->ul_count, sdp,);
++ ul->ul_count--;
++
++ if (!ul->ul_count) {
++ GFS_ASSERT_SBD(!test_bit(ULF_IC_LIST, &ul->ul_flags) &&
++ !test_bit(ULF_OD_LIST, &ul->ul_flags) &&
++ !test_bit(ULF_LOCK, &ul->ul_flags),
++ sdp,);
++ list_del(&ul->ul_list);
++ spin_unlock(&sdp->sd_unlinked_lock);
++ kfree(ul);
++ } else
++ spin_unlock(&sdp->sd_unlinked_lock);
++}
++
++/**
++ * unlinked_find - Find an inode to try to deallocate
++ * @sdp: the filesystem
++ *
++ * The returned structure is locked and needs to be unlocked
++ * with gfs_unlinked_unlock().
++ *
++ * Returns: An unlinked structure, or NULL
++ */
++
++struct gfs_unlinked *
++unlinked_find(struct gfs_sbd *sdp)
++{
++ struct list_head *tmp, *head;
++ struct gfs_unlinked *ul = NULL;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags))
++ return NULL;
++
++ gfs_log_lock(sdp);
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ if (!atomic_read(&sdp->sd_unlinked_ic_count))
++ goto out;
++
++ for (head = &sdp->sd_unlinked_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ ul = list_entry(tmp, struct gfs_unlinked, ul_list);
++
++ if (test_bit(ULF_LOCK, &ul->ul_flags))
++ continue;
++ if (!test_bit(ULF_IC_LIST, &ul->ul_flags))
++ continue;
++
++ list_move_tail(&ul->ul_list, &sdp->sd_unlinked_list);
++
++ set_bit(ULF_LOCK, &ul->ul_flags);
++ ul->ul_count++;
++
++ goto out;
++ }
++
++ ul = NULL;
++
++ out:
++ spin_unlock(&sdp->sd_unlinked_lock);
++ gfs_log_unlock(sdp);
++
++ return ul;
++}
++
++/**
++ * gfs_unlinked_lock - lock an unlinked structure
++ * @sdp: the filesystem
++ * @ul: the unlinked inode structure
++ *
++ */
++
++void
++gfs_unlinked_lock(struct gfs_sbd *sdp, struct gfs_unlinked *ul)
++{
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ GFS_ASSERT_SBD(!test_bit(ULF_LOCK, &ul->ul_flags), sdp,);
++ set_bit(ULF_LOCK, &ul->ul_flags);
++
++ ul->ul_count++;
++
++ spin_unlock(&sdp->sd_unlinked_lock);
++}
++
++/**
++ * gfs_unlinked_unlock - unlock and drop a reference on an unlinked structure
++ * @sdp: the filesystem
++ * @ul: the unlinked inode structure
++ *
++ */
++
++void
++gfs_unlinked_unlock(struct gfs_sbd *sdp, struct gfs_unlinked *ul)
++{
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ GFS_ASSERT_SBD(test_bit(ULF_LOCK, &ul->ul_flags), sdp,);
++ clear_bit(ULF_LOCK, &ul->ul_flags);
++
++ GFS_ASSERT_SBD(ul->ul_count, sdp,);
++ ul->ul_count--;
++
++ if (!ul->ul_count) {
++ GFS_ASSERT_SBD(!test_bit(ULF_IC_LIST, &ul->ul_flags) &&
++ !test_bit(ULF_OD_LIST, &ul->ul_flags), sdp,);
++ list_del(&ul->ul_list);
++ spin_unlock(&sdp->sd_unlinked_lock);
++ kfree(ul);
++ } else
++ spin_unlock(&sdp->sd_unlinked_lock);
++}
++
++/**
++ * gfs_unlinked_merge - add/remove an unlinked inode from the in-memory list
++ * @sdp: the filesystem
++ * @type: whether this is an unlink tag or a dealloc tag
++ * @inum: the inode number
++ *
++ */
++
++void
++gfs_unlinked_merge(struct gfs_sbd *sdp, unsigned int type,
++ struct gfs_inum *inum)
++{
++ struct gfs_unlinked *ul;
++
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_ic_count) ==
++ atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++
++ ul = gfs_unlinked_get(sdp, inum, CREATE);
++
++ gfs_log_lock(sdp);
++
++ switch (type) {
++ case GFS_LOG_DESC_IUL:
++ gfs_unlinked_hold(sdp, ul);
++ gfs_unlinked_hold(sdp, ul);
++ GFS_ASSERT_SBD(!test_bit(ULF_IC_LIST, &ul->ul_flags) &&
++ !test_bit(ULF_OD_LIST, &ul->ul_flags), sdp,);
++ set_bit(ULF_IC_LIST, &ul->ul_flags);
++ set_bit(ULF_OD_LIST, &ul->ul_flags);
++ atomic_inc(&sdp->sd_unlinked_ic_count);
++ atomic_inc(&sdp->sd_unlinked_od_count);
++
++ break;
++
++ case GFS_LOG_DESC_IDA:
++ GFS_ASSERT_SBD(test_bit(ULF_IC_LIST, &ul->ul_flags) &&
++ test_bit(ULF_OD_LIST, &ul->ul_flags), sdp,);
++ clear_bit(ULF_IC_LIST, &ul->ul_flags);
++ clear_bit(ULF_OD_LIST, &ul->ul_flags);
++ gfs_unlinked_put(sdp, ul);
++ gfs_unlinked_put(sdp, ul);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_ic_count), sdp,);
++ atomic_dec(&sdp->sd_unlinked_ic_count);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++ atomic_dec(&sdp->sd_unlinked_od_count);
++
++ break;
++ }
++
++ gfs_log_unlock(sdp);
++
++ gfs_unlinked_put(sdp, ul);
++}
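++
++/*
++ * Reference-count bookkeeping in gfs_unlinked_merge(), spelled out:
++ * an IUL tag takes two extra holds, one for each of the ULF_IC_LIST
++ * and ULF_OD_LIST memberships; an IDA tag clears both flags and puts
++ * those two holds back.  The get/put pair bracketing the switch only
++ * covers the lookup itself.
++ */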
++
++/**
++ * gfs_unlinked_cleanup - get rid of any extra struct gfs_unlinked structures
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_unlinked_cleanup(struct gfs_sbd *sdp)
++{
++ struct gfs_unlinked *ul;
++
++ restart:
++ gfs_log_lock(sdp);
++
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_ic_count) ==
++ atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ while (!list_empty(&sdp->sd_unlinked_list)) {
++ ul = list_entry(sdp->sd_unlinked_list.next,
++ struct gfs_unlinked, ul_list);
++
++ if (ul->ul_count > 2) {
++ spin_unlock(&sdp->sd_unlinked_lock);
++ gfs_log_unlock(sdp);
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ goto restart;
++ }
++ GFS_ASSERT_SBD(ul->ul_count == 2, sdp,);
++
++ GFS_ASSERT_SBD(test_bit(ULF_IC_LIST, &ul->ul_flags) &&
++ test_bit(ULF_OD_LIST, &ul->ul_flags) &&
++ !test_bit(ULF_LOCK, &ul->ul_flags), sdp,);
++
++ list_del(&ul->ul_list);
++
++ atomic_dec(&sdp->sd_unlinked_ic_count);
++ atomic_dec(&sdp->sd_unlinked_od_count);
++
++ spin_unlock(&sdp->sd_unlinked_lock);
++ kfree(ul);
++ spin_lock(&sdp->sd_unlinked_lock);
++ }
++
++ spin_unlock(&sdp->sd_unlinked_lock);
++
++ GFS_ASSERT_SBD(!atomic_read(&sdp->sd_unlinked_ic_count) &&
++ !atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++
++ gfs_log_unlock(sdp);
++}
++
++/**
++ * gfs_unlinked_limit - limit the number of inodes waiting to be deallocated
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_unlinked_limit(struct gfs_sbd *sdp)
++{
++ unsigned int tries = 0, min = 0;
++ int error;
++
++ if (atomic_read(&sdp->sd_unlinked_ic_count) >=
++ sdp->sd_tune.gt_ilimit2) {
++ tries = sdp->sd_tune.gt_ilimit2_tries;
++ min = sdp->sd_tune.gt_ilimit2_min;
++ } else if (atomic_read(&sdp->sd_unlinked_ic_count) >=
++ sdp->sd_tune.gt_ilimit1) {
++ tries = sdp->sd_tune.gt_ilimit1_tries;
++ min = sdp->sd_tune.gt_ilimit1_min;
++ }
++
++ while (tries--) {
++ struct gfs_unlinked *ul = unlinked_find(sdp);
++ if (!ul)
++ break;
++
++ error = gfs_inode_dealloc(sdp, &ul->ul_inum);
++
++ gfs_unlinked_unlock(sdp, ul);
++
++ if (!error) {
++ if (!--min)
++ break;
++ } else if (error != 1)
++ break;
++ }
++}
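++
++/*
++ * The two thresholds above act as soft and hard watermarks: crossing
++ * gt_ilimit1 starts a mild pass (up to gt_ilimit1_tries attempts,
++ * stopping after gt_ilimit1_min successes), while crossing gt_ilimit2
++ * starts a more aggressive one.  With hypothetical tunables
++ * ilimit1 = 100 (tries 3, min 1) and ilimit2 = 500 (tries 10, min 3),
++ * an in-core count of 120 makes at most 3 attempts and stops after the
++ * first success; a count of 600 makes at most 10, stopping after 3.
++ */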
++
++/**
++ * gfs_unlinked_dealloc - Go through the list of inodes to be deallocated
++ * @sdp: the filesystem
++ *
++ * Any error other than -EROFS is reported via printk().
++ */
++
++void
++gfs_unlinked_dealloc(struct gfs_sbd *sdp)
++{
++ unsigned int hits, strikes;
++ int error;
++
++ for (;;) {
++ hits = 0;
++ strikes = 0;
++
++ for (;;) {
++ struct gfs_unlinked *ul = unlinked_find(sdp);
++ if (!ul)
++ return;
++
++ error = gfs_inode_dealloc(sdp, &ul->ul_inum);
++
++ gfs_unlinked_unlock(sdp, ul);
++
++ if (!error) {
++ hits++;
++ if (strikes)
++ strikes--;
++ } else if (error == 1) {
++ strikes++;
++ if (strikes >= atomic_read(&sdp->sd_unlinked_ic_count)) {
++ error = 0;
++ break;
++ }
++ } else
++ goto out;
++ }
++
++ if (!hits || !test_bit(SDF_INODED_RUN, &sdp->sd_flags))
++ break;
++
++ cond_resched();
++ }
++
++ out:
++ if (error && error != -EROFS)
++ printk("GFS: fsid=%s: error deallocating inodes: %d\n",
++ sdp->sd_fsname, error);
++}
+diff -urN linux-orig/fs/gfs/unlinked.h linux-patched/fs/gfs/unlinked.h
+--- linux-orig/fs/gfs/unlinked.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/unlinked.h 2004-06-20 22:48:17.955944714 -0500
+@@ -0,0 +1,32 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __UNLINKED_DOT_H__
++#define __UNLINKED_DOT_H__
++
++struct gfs_unlinked *gfs_unlinked_get(struct gfs_sbd *sdp,
++ struct gfs_inum *inum, int create);
++void gfs_unlinked_hold(struct gfs_sbd *sdp, struct gfs_unlinked *ul);
++void gfs_unlinked_put(struct gfs_sbd *sdp, struct gfs_unlinked *ul);
++
++void gfs_unlinked_lock(struct gfs_sbd *sdp, struct gfs_unlinked *ul);
++void gfs_unlinked_unlock(struct gfs_sbd *sdp, struct gfs_unlinked *ul);
++
++void gfs_unlinked_merge(struct gfs_sbd *sdp, unsigned int type,
++ struct gfs_inum *inum);
++void gfs_unlinked_cleanup(struct gfs_sbd *sdp);
++
++void gfs_unlinked_limit(struct gfs_sbd *sdp);
++void gfs_unlinked_dealloc(struct gfs_sbd *sdp);
++
++#endif /* __UNLINKED_DOT_H__ */
+diff -urN linux-orig/fs/gfs/util.c linux-patched/fs/gfs/util.c
+--- linux-orig/fs/gfs/util.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/util.c 2004-06-20 22:48:17.955944714 -0500
+@@ -0,0 +1,317 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "glock.h"
++
++uint32_t gfs_random_number;
++
++volatile int gfs_in_panic = FALSE;
++
++kmem_cache_t *gfs_glock_cachep = NULL;
++kmem_cache_t *gfs_inode_cachep = NULL;
++kmem_cache_t *gfs_bufdata_cachep = NULL;
++kmem_cache_t *gfs_mhc_cachep = NULL;
++
++/**
++ * gfs_random - Generate a random 32-bit number
++ *
++ * Generate a semi-crappy 32-bit pseudo-random number without using
++ * floating point.
++ *
++ * The PRNG is from "Numerical Recipes in C" (second edition), page 284.
++ *
++ * Returns: a 32-bit random number
++ */
++
++uint32_t
++gfs_random(void)
++{
++ gfs_random_number = 0x0019660D * gfs_random_number + 0x3C6EF35F;
++ return gfs_random_number;
++}
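++
++/*
++ * For reference, the constants above are the "Numerical Recipes" LCG
++ * written in hex: 0x0019660D = 1664525 and 0x3C6EF35F = 1013904223, so
++ * the recurrence is
++ *
++ *	x[n+1] = (1664525 * x[n] + 1013904223) mod 2^32
++ *
++ * with the mod 2^32 falling out of the uint32_t arithmetic for free.
++ */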
++
++/**
++ * hash_more_internal - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ * @hash: the hash from a previous call
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * This is the 32-bit FNV-1a hash from:
++ * http://www.isthe.com/chongo/tech/comp/fnv/
++ *
++ * This is the shared core used by gfs_hash() and gfs_hash_more().
++ *
++ * Returns: the hash
++ */
++
++static __inline__ uint32_t
++hash_more_internal(const void *data, unsigned int len, uint32_t hash)
++{
++ unsigned char *p = (unsigned char *)data;
++ unsigned char *e = p + len;
++ uint32_t h = hash;
++
++ while (p < e) {
++ h ^= (uint32_t)(*p++);
++ h *= 0x01000193;
++ }
++
++ return h;
++}
++
++/**
++ * gfs_hash - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * This is the 32-bit FNV-1a hash from:
++ * http://www.isthe.com/chongo/tech/comp/fnv/
++ *
++ * Returns: the hash
++ */
++
++uint32_t
++gfs_hash(const void *data, unsigned int len)
++{
++ uint32_t h = 0x811C9DC5;
++ h = hash_more_internal(data, len, h);
++ return h;
++}
++
++/**
++ * gfs_hash_more - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ * @hash: the hash from a previous call
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * This is the 32-bit FNV-1a hash from:
++ * http://www.isthe.com/chongo/tech/comp/fnv/
++ *
++ * This version lets you hash together discontinuous regions.
++ * For example, to compute the combined hash of the memory in
++ * (data1, len1), (data2, len2), and (data3, len3) you:
++ *
++ * h = gfs_hash(data1, len1);
++ * h = gfs_hash_more(data2, len2, h);
++ * h = gfs_hash_more(data3, len3, h);
++ *
++ * Returns: the hash
++ */
++
++uint32_t
++gfs_hash_more(const void *data, unsigned int len, uint32_t hash)
++{
++ uint32_t h;
++ h = hash_more_internal(data, len, hash);
++ return h;
++}
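++
++/*
++ * A quick sanity sketch of the property documented above (userspace
++ * pseudo-test, not part of the kernel build): hashing a buffer in two
++ * pieces must give the same answer as hashing it in one go, because
++ * FNV-1a is a plain left-to-right fold over the bytes.
++ *
++ *	char buf[] = "abcdefgh";
++ *	uint32_t h1 = gfs_hash(buf, 8);
++ *	uint32_t h2 = gfs_hash_more(buf + 4, 4, gfs_hash(buf, 4));
++ *	assert(h1 == h2);
++ */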
++
++/**
++ * gfs_sort - Sort an array using the bubble sort algorithm
++ * @array: the input array
++ * @num: number of elements in array
++ * @size: size of each element in array
++ * @compare: function to compare array elements (returns negative for
++ *           less-than, 0 for equal, and positive for greater-than)
++ *
++ * Sorts the array passed in, using the compare function to order
++ * elements with the bubble sort algorithm.
++ */
++
++void
++gfs_sort(void *array, unsigned int num, unsigned int size,
++ int (*compare) (void *, void *))
++{
++ char buf[size];
++ char *p1, *p2;
++ int changed;
++ unsigned int x;
++
++ if (num <= 1)
++ return;
++
++ do {
++ changed = FALSE;
++ p1 = (char *)array;
++ p2 = (char *)array + size;
++
++ for (x = num - 1; x--;) {
++ if (compare(p1, p2) > 0) {
++ memcpy(buf, p1, size);
++ memcpy(p1, p2, size);
++ memcpy(p2, buf, size);
++ changed = TRUE;
++ }
++
++ p1 = p2;
++ p2 += size;
++ }
++ }
++ while (changed);
++}
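++
++/*
++ * An illustrative comparator for gfs_sort() (hypothetical, not used
++ * elsewhere in this patch): sorting an array of uint64_t block numbers
++ * into ascending order.
++ *
++ *	static int compare_blkno(void *a, void *b)
++ *	{
++ *		uint64_t x = *(uint64_t *)a, y = *(uint64_t *)b;
++ *		return (x < y) ? -1 : ((x > y) ? 1 : 0);
++ *	}
++ *
++ *	gfs_sort(blknos, count, sizeof(uint64_t), compare_blkno);
++ */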
++
++/**
++ * bitch_about - complain about something, but no more than once per gt_complain_secs
++ * @sdp: the filesystem
++ * @last: the last time we bitched
++ * @about: what to complain about
++ *
++ */
++
++void
++bitch_about(struct gfs_sbd *sdp, unsigned long *last, char *about)
++{
++ if (time_after_eq(jiffies, *last + sdp->sd_tune.gt_complain_secs * HZ)) {
++ printk("GFS: fsid=%s: %s by program \"%s\"\n",
++ sdp->sd_fsname, about, current->comm);
++ *last = jiffies;
++ }
++}
++
++/**
++ * gfs_assert_i - Stop the machine
++ * @assertion: the assertion that failed
++ * @type: the type of structure involved (GFS_ASSERT_TYPE_...)
++ * @ptr: the structure itself (NULL for GFS_ASSERT_TYPE_NONE)
++ * @file: the file that called us
++ * @line: the line number of the file that called us
++ *
++ * Don't do ENTER() and EXIT() here.
++ *
++ */
++
++void
++gfs_assert_i(char *assertion,
++ unsigned int type, void *ptr,
++ char *file, unsigned int line)
++{
++ gfs_in_panic = TRUE;
++
++ printk("\nGFS: Assertion failed on line %d of file %s\n"
++ "GFS: assertion: \"%s\"\n"
++ "GFS: time = %lu\n",
++ line, file, assertion, get_seconds());
++
++ switch (type) {
++ case GFS_ASSERT_TYPE_SBD:
++ {
++ struct gfs_sbd *sdp = (struct gfs_sbd *)ptr;
++ printk("GFS: fsid=%s\n", sdp->sd_fsname);
++ }
++ break;
++
++ case GFS_ASSERT_TYPE_GLOCK:
++ {
++ struct gfs_glock *gl = (struct gfs_glock *)ptr;
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ printk("GFS: fsid=%s: glock = (%u, %"PRIu64")\n",
++ sdp->sd_fsname,
++ gl->gl_name.ln_type,
++ gl->gl_name.ln_number);
++ }
++ break;
++
++ case GFS_ASSERT_TYPE_INODE:
++ {
++ struct gfs_inode *ip = (struct gfs_inode *)ptr;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ printk("GFS: fsid=%s: inode = %"PRIu64"/%"PRIu64"\n",
++ sdp->sd_fsname,
++ ip->i_num.no_formal_ino, ip->i_num.no_addr);
++ }
++ break;
++
++ case GFS_ASSERT_TYPE_RGRPD:
++ {
++ struct gfs_rgrpd *rgd = (struct gfs_rgrpd *)ptr;
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ printk("GFS: fsid=%s: rgroup = %"PRIu64"\n",
++ sdp->sd_fsname, rgd->rd_ri.ri_addr);
++ }
++ break;
++ }
++
++ printk("\n");
++#if 0
++ printk("GFS: Record message above and reboot.\n");
++ BUG();
++#endif
++ panic("GFS: Record message above and reboot.\n");
++}
++
++/**
++ * gfs_io_error_i - handle an I/O error
++ * @sdp: the filesystem
++ * @type: the type of structure the error happened on (GFS_IO_ERROR_TYPE_...)
++ * @ptr: the structure itself (can be NULL)
++ *
++ * This will do something other than panic, eventually.
++ *
++ */
++
++void gfs_io_error_i(struct gfs_sbd *sdp,
++ unsigned int type, void *ptr,
++ char *file, unsigned int line)
++{
++ switch (type) {
++ case GFS_IO_ERROR_TYPE_BH:
++ {
++ struct buffer_head *bh = (struct buffer_head *)ptr;
++ printk("GFS: fsid=%s: I/O error on block %"PRIu64"\n",
++ sdp->sd_fsname, (uint64_t)bh->b_blocknr);
++ }
++ break;
++
++ case GFS_IO_ERROR_TYPE_INODE:
++ {
++ struct gfs_inode *ip = (struct gfs_inode *)ptr;
++ printk("GFS: fsid=%s: I/O error in inode %"PRIu64"/%"PRIu64"\n",
++ sdp->sd_fsname,
++ ip->i_num.no_formal_ino, ip->i_num.no_addr);
++ }
++ break;
++
++ default:
++ printk("GFS: fsid=%s: I/O error\n", sdp->sd_fsname);
++ break;
++ }
++
++ GFS_ASSERT_SBD(FALSE, sdp,);
++}
++
++/**
++ * gmalloc - malloc a small amount of memory
++ * @size: the number of bytes to malloc
++ *
++ * Returns: the memory
++ */
++
++void *
++gmalloc(unsigned int size)
++{
++ void *p;
++ RETRY_MALLOC(p = kmalloc(size, GFP_KERNEL), p);
++ return p;
++}
++
+diff -urN linux-orig/fs/gfs/util.h linux-patched/fs/gfs/util.h
+--- linux-orig/fs/gfs/util.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/util.h 2004-06-20 22:48:17.955944714 -0500
+@@ -0,0 +1,156 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __UTIL_DOT_H__
++#define __UTIL_DOT_H__
++
++
++/* Utility functions */
++
++extern uint32_t gfs_random_number;
++uint32_t gfs_random(void);
++
++uint32_t gfs_hash(const void *data, unsigned int len);
++uint32_t gfs_hash_more(const void *data, unsigned int len, uint32_t hash);
++
++void gfs_sort(void *array, unsigned int num, unsigned int size,
++ int (*compare) (void *, void *));
++
++void bitch_about(struct gfs_sbd *sdp, unsigned long *last, char *about);
++
++
++
++/* Assertion stuff */
++
++#define GFS_ASSERT_TYPE_NONE (18)
++#define GFS_ASSERT_TYPE_SBD (19)
++#define GFS_ASSERT_TYPE_GLOCK (20)
++#define GFS_ASSERT_TYPE_INODE (21)
++#define GFS_ASSERT_TYPE_RGRPD (22)
++
++#define GFS_ASSERT(x, todo) \
++do \
++{ \
++ if (!(x)) \
++ { \
++ {todo} \
++ gfs_assert_i(#x, GFS_ASSERT_TYPE_NONE, NULL, __FILE__, __LINE__); \
++ } \
++} \
++while (0)
++
++#define GFS_ASSERT_SBD(x, sdp, todo) \
++do \
++{ \
++ if (!(x)) \
++ { \
++ struct gfs_sbd *gfs_assert_sbd = (sdp); \
++ {todo} \
++ gfs_assert_i(#x, GFS_ASSERT_TYPE_SBD, gfs_assert_sbd, __FILE__, __LINE__); \
++ } \
++} \
++while (0)
++
++#define GFS_ASSERT_GLOCK(x, gl, todo) \
++do \
++{ \
++ if (!(x)) \
++ { \
++ struct gfs_glock *gfs_assert_glock = (gl); \
++ {todo} \
++ gfs_assert_i(#x, GFS_ASSERT_TYPE_GLOCK, gfs_assert_glock, __FILE__, __LINE__); \
++ } \
++} \
++while (0)
++
++#define GFS_ASSERT_INODE(x, ip, todo) \
++do \
++{ \
++ if (!(x)) \
++ { \
++ struct gfs_inode *gfs_assert_inode = (ip); \
++ {todo} \
++ gfs_assert_i(#x, GFS_ASSERT_TYPE_INODE, gfs_assert_inode, __FILE__, __LINE__); \
++ } \
++} \
++while (0)
++
++#define GFS_ASSERT_RGRPD(x, rgd, todo) \
++do \
++{ \
++ if (!(x)) \
++ { \
++ struct gfs_rgrpd *gfs_assert_rgrpd = (rgd); \
++ {todo} \
++ gfs_assert_i(#x, GFS_ASSERT_TYPE_RGRPD, gfs_assert_rgrpd, __FILE__, __LINE__); \
++ } \
++} \
++while (0)
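++
++/*
++ * Usage sketch for the assertion macros: the "todo" argument is a
++ * statement block run before gfs_assert_i() stops the machine.  It is
++ * usually empty, as in the callers in this patch:
++ *
++ *	GFS_ASSERT_SBD(ul->ul_count, sdp,);
++ *
++ * but it can carry a last-gasp action (dump_state() here is hypothetical):
++ *
++ *	GFS_ASSERT_SBD(error != -EIO, sdp, dump_state(sdp););
++ */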
++
++extern volatile int gfs_in_panic;
++void gfs_assert_i(char *assertion,
++ unsigned int type, void *ptr,
++ char *file, unsigned int line) __attribute__ ((noreturn));
++
++
++/* I/O error stuff */
++
++#define GFS_IO_ERROR_TYPE_NONE (118)
++#define GFS_IO_ERROR_TYPE_BH (119)
++#define GFS_IO_ERROR_TYPE_INODE (120)
++
++#define gfs_io_error(sdp) \
++gfs_io_error_i((sdp), GFS_IO_ERROR_TYPE_NONE, NULL, __FILE__, __LINE__)
++
++#define gfs_io_error_bh(sdp, bh) \
++do \
++{ \
++ struct buffer_head *gfs_io_error_bh = (bh); \
++ gfs_io_error_i((sdp), GFS_IO_ERROR_TYPE_BH, gfs_io_error_bh, __FILE__, __LINE__); \
++} \
++while (0)
++
++#define gfs_io_error_inode(ip) \
++do \
++{ \
++ struct gfs_inode *gfs_io_error_inode = (ip); \
++ gfs_io_error_i((ip)->i_sbd, GFS_IO_ERROR_TYPE_INODE, gfs_io_error_inode, __FILE__, __LINE__); \
++} \
++while (0)
++
++void gfs_io_error_i(struct gfs_sbd *sdp,
++ unsigned int type, void *ptr,
++ char *file, unsigned int line);
++
++
++/* Memory stuff */
++
++#define RETRY_MALLOC(do_this, until_this) \
++for (;;) \
++{ \
++ do { do_this; } while (0); \
++ if (until_this) \
++ break; \
++ printk("GFS: out of memory: %s, %u\n", __FILE__, __LINE__); \
++ yield();\
++}
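++
++/*
++ * RETRY_MALLOC expands to an endless retry loop: run "do_this", and if
++ * "until_this" is still false, complain, yield(), and try again.  The
++ * canonical user is gmalloc() in util.c:
++ *
++ *	RETRY_MALLOC(p = kmalloc(size, GFP_KERNEL), p);
++ *
++ * which loops until kmalloc() finally succeeds.
++ */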
++
++extern kmem_cache_t *gfs_glock_cachep;
++extern kmem_cache_t *gfs_inode_cachep;
++extern kmem_cache_t *gfs_bufdata_cachep;
++extern kmem_cache_t *gfs_mhc_cachep;
++
++void *gmalloc(unsigned int size);
++
++
++#endif /* __UTIL_DOT_H__ */
+diff -urN linux-orig/include/linux/gfs_ioctl.h linux-patched/include/linux/gfs_ioctl.h
+--- linux-orig/include/linux/gfs_ioctl.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/include/linux/gfs_ioctl.h 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,218 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __GFS_IOCTL_DOT_H__
++#define __GFS_IOCTL_DOT_H__
++
++#define GFS_IOCTL_VERSION (0)
++
++#define _GFSC_(x) (('G' << 8) | (x))
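++
++/* For example, _GFSC_(40) == ('G' << 8) | 40 == (0x47 << 8) | 0x28 == 0x4728. */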
++
++/*
++ Ioctls implemented
++
++ Reserved Ioctls: 3, 7, 8, 9, 10, 4, 13
++ Next Ioctl: 44
++ */
++
++#define GFS_STACK_PRINT _GFSC_(40)
++
++#define GFS_GET_META _GFSC_(31)
++#define GFS_FILE_STAT _GFSC_(30)
++
++#define GFS_SHRINK _GFSC_(5)
++
++#define GFS_GET_ARGS _GFSC_(29)
++#define GFS_GET_LOCKSTRUCT _GFSC_(39)
++#define GFS_GET_SUPER _GFSC_(19)
++#define GFS_JREAD _GFSC_(23)
++#define GFS_JWRITE _GFSC_(24)
++#define GFS_JSTAT _GFSC_(20)
++#define GFS_JTRUNC _GFSC_(33)
++
++#define GFS_LOCK_DUMP _GFSC_(11)
++
++#define GFS_STATGFS _GFSC_(12)
++
++#define GFS_FREEZE _GFSC_(14)
++#define GFS_UNFREEZE _GFSC_(15)
++
++#define GFS_RECLAIM_METADATA _GFSC_(16)
++
++#define GFS_QUOTA_SYNC _GFSC_(17)
++#define GFS_QUOTA_REFRESH _GFSC_(18)
++#define GFS_QUOTA_READ _GFSC_(32)
++
++#define GFS_GET_TUNE _GFSC_(21)
++#define GFS_SET_TUNE _GFSC_(22)
++
++#define GFS_EATTR_GET _GFSC_(26)
++#define GFS_EATTR_SET _GFSC_(27)
++
++#define GFS_WHERE_ARE_YOU _GFSC_(35)
++
++#define GFS_SET_FLAG _GFSC_(36)
++#define GFS_CLEAR_FLAG _GFSC_(37)
++
++#define GFS_GET_COUNTERS _GFSC_(43)
++
++#define GFS_FILE_FLUSH _GFSC_(42)
++
++struct gfs_user_buffer {
++ char *ub_data;
++ unsigned int ub_size;
++ unsigned int ub_count;
++};
++
++/* Structure for jread/jwrite */
++
++#define GFS_HIDDEN_JINDEX (0x10342345)
++#define GFS_HIDDEN_RINDEX (0x10342346)
++#define GFS_HIDDEN_QUOTA (0x10342347)
++#define GFS_HIDDEN_LICENSE (0x10342348)
++
++struct gfs_jio {
++ unsigned int jio_file;
++
++ uint32_t jio_size;
++ uint64_t jio_offset;
++ char *jio_data;
++
++ uint32_t jio_count;
++};
++
++/* Structure for better GFS-specific df */
++
++struct gfs_usage {
++ unsigned int gu_block_size;
++ uint64_t gu_total_blocks;
++ uint64_t gu_free;
++ uint64_t gu_used_dinode;
++ uint64_t gu_free_dinode;
++ uint64_t gu_used_meta;
++ uint64_t gu_free_meta;
++};
++
++struct gfs_reclaim_stats {
++ uint64_t rc_inodes;
++ uint64_t rc_metadata;
++};
++
++struct gfs_quota_name {
++ int qn_user;
++ uint32_t qn_id;
++};
++
++/*
++ * You can tune a filesystem, but you can't tune a yak.
++ */
++
++#define GFS_TUNE_VERSION ((GFS_IOCTL_VERSION << 16) | (138))
++
++struct gfs_tune {
++ unsigned int gt_tune_version;
++
++ unsigned int gt_ilimit1;
++ unsigned int gt_ilimit1_tries;
++ unsigned int gt_ilimit1_min;
++ unsigned int gt_ilimit2;
++ unsigned int gt_ilimit2_tries;
++ unsigned int gt_ilimit2_min;
++ unsigned int gt_demote_secs;
++ unsigned int gt_incore_log_blocks;
++ unsigned int gt_jindex_refresh_secs;
++ unsigned int gt_depend_secs;
++ unsigned int gt_scand_secs;
++ unsigned int gt_recoverd_secs;
++ unsigned int gt_logd_secs;
++ unsigned int gt_quotad_secs;
++ unsigned int gt_inoded_secs;
++ unsigned int gt_quota_simul_sync;
++ unsigned int gt_quota_warn_period;
++ unsigned int gt_atime_quantum;
++ unsigned int gt_quota_quantum;
++ unsigned int gt_quota_scale_num;
++ unsigned int gt_quota_scale_den;
++ unsigned int gt_quota_enforce;
++ unsigned int gt_quota_account;
++ unsigned int gt_new_files_jdata;
++ unsigned int gt_new_files_directio;
++ unsigned int gt_max_atomic_write;
++ unsigned int gt_max_readahead;
++ unsigned int gt_lockdump_size;
++ unsigned int gt_stall_secs;
++ unsigned int gt_complain_secs;
++ unsigned int gt_reclaim_limit;
++ unsigned int gt_entries_per_readdir;
++ unsigned int gt_prefetch_secs;
++ unsigned int gt_statfs_slots;
++ unsigned int gt_max_mhc;
++};
++
++/*
++ * Extended Attribute Ioctl structures
++ *
++ * Note: The name_len does not include a null character.
++ *
++ * Getting and setting EAs return the following errors that aren't
++ * what they seem
++ *
++ * ENODATA - No such extended attribute
++ * ERANGE - Extended attribute data is too large for the buffer
++ * ENOSPC - No space left for extended attributes
++ * EEXIST - Extended attribute already exists
++ */
++
++#define GFS_EACMD_SET (0)
++#define GFS_EACMD_CREATE (1)
++#define GFS_EACMD_REPLACE (2)
++#define GFS_EACMD_REMOVE (3)
++
++struct gfs_eaget_io {
++ char *eg_data;
++ char *eg_name;
++ char *eg_len;
++ uint32_t eg_data_len;
++ uint8_t eg_name_len;
++ uint8_t eg_type; /* GFS_EATYPE_... */
++};
++
++struct gfs_easet_io {
++ const char *es_data;
++ char *es_name;
++ uint16_t es_data_len;
++ uint8_t es_name_len; /* not counting the NULL */
++ uint8_t es_cmd; /* GFS_EACMD_... */
++ uint8_t es_type; /* GFS_EATYPE_... */
++};
++
++#define GFS_GLOCKD_DEFAULT (1)
++#define GFS_GLOCKD_MAX (32)
++
++struct gfs_args {
++ char ar_lockproto[256]; /* The name of the Lock Protocol */
++ char ar_locktable[256]; /* The name of the Lock Table */
++ char ar_hostdata[256]; /* The host specific data */
++
++ int ar_ignore_local_fs; /* Ignore the local_fs field in the struct lm_lockops */
++ int ar_localflocks; /* let the VFS do flock|fcntl locks for us */
++ int ar_localcaching; /* Local-style caching (dangerous on mulithost) */
++
++ int ar_upgrade; /* Upgrade ondisk/multihost format */
++
++ unsigned int ar_num_glockd;
++
++ int ar_posixacls; /* Enable posix acls */
++};
++
++#endif /* __GFS_IOCTL_DOT_H__ */
+diff -urN linux-orig/include/linux/gfs_ondisk.h linux-patched/include/linux/gfs_ondisk.h
+--- linux-orig/include/linux/gfs_ondisk.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/include/linux/gfs_ondisk.h 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,1720 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++* NOTE:
++* If you add 8 byte fields to these structures, they must be 8 byte
++* aligned. 4 byte field must be 4 byte aligned, etc...
++*
++* All structures must be a multiple of 8 bytes long.
++*
++* GRIPES:
++* We should have forgotten about supporting 512B FS block sizes
++* and made the di_reserved field in the struct gfs_dinode structure
++* much bigger.
++*
++* de_rec_len in struct gfs_dirent should really have been a 32-bit value
++* as it now limits us to a 64k FS block size (with the current code
++* in dir.c).
++*/
++
++#ifndef __GFS_ONDISK_DOT_H__
++#define __GFS_ONDISK_DOT_H__
++
++#define GFS_MAGIC (0x01161970)
++#define GFS_BASIC_BLOCK (512)
++#define GFS_BASIC_BLOCK_SHIFT (9)
++#define GFS_DUMPS_PER_LOG (4)
++
++/* Lock numbers of the LM_TYPE_NONDISK type */
++
++#define GFS_MOUNT_LOCK (0)
++#define GFS_LIVE_LOCK (1)
++#define GFS_TRANS_LOCK (2)
++#define GFS_RENAME_LOCK (3)
++
++/* Format numbers for various metadata types */
++
++#define GFS_FORMAT_SB (100)
++#define GFS_FORMAT_RG (200)
++#define GFS_FORMAT_RB (300)
++#define GFS_FORMAT_DI (400)
++#define GFS_FORMAT_IN (500)
++#define GFS_FORMAT_LF (600)
++#define GFS_FORMAT_JD (700)
++#define GFS_FORMAT_LH (800)
++#define GFS_FORMAT_LD (900)
++/* These don't have actual struct gfs_meta_header structures to go with them */
++#define GFS_FORMAT_JI (1000)
++#define GFS_FORMAT_RI (1100)
++#define GFS_FORMAT_DE (1200)
++#define GFS_FORMAT_QU (1500)
++#define GFS_FORMAT_EA (1600)
++/* These are part of the superblock */
++#define GFS_FORMAT_FS (1309)
++#define GFS_FORMAT_MULTI (1401)
++
++/*
++ * An on-disk inode number
++ */
++
++#define gfs_inum_equal(ino1, ino2) \
++(((ino1)->no_formal_ino == (ino2)->no_formal_ino) && \
++ ((ino1)->no_addr == (ino2)->no_addr))
++
++struct gfs_inum {
++ uint64_t no_formal_ino;
++ uint64_t no_addr;
++};
++
++/*
++ * Generic metadata head structure
++ *
++ * Every inplace buffer logged in the journal must start with this.
++ */
++
++#define GFS_METATYPE_NONE (0)
++#define GFS_METATYPE_SB (1)
++#define GFS_METATYPE_RG (2)
++#define GFS_METATYPE_RB (3)
++#define GFS_METATYPE_DI (4)
++#define GFS_METATYPE_IN (5)
++#define GFS_METATYPE_LF (6)
++#define GFS_METATYPE_JD (7)
++#define GFS_METATYPE_LH (8)
++#define GFS_METATYPE_LD (9)
++#define GFS_METATYPE_EA (10)
++
++#define GFS_META_CLUMP (64)
++
++struct gfs_meta_header {
++ uint32_t mh_magic; /* Magic number */
++ uint32_t mh_type; /* GFS_METATYPE_XX */
++ uint64_t mh_generation; /* Generation number */
++ uint32_t mh_format; /* GFS_FORMAT_XX */
++ uint32_t mh_incarn;
++};
++
++/*
++ * super-block structure
++ *
++ * It's probably good if SIZEOF_SB <= GFS_BASIC_BLOCK
++ */
++
++/* Address of SuperBlock in GFS basic blocks */
++#define GFS_SB_ADDR (128)
++/* The lock number for the superblock (must be zero) */
++#define GFS_SB_LOCK (0)
++#define GFS_CRAP_LOCK (1)
++
++/* Requirement: GFS_LOCKNAME_LEN % 8 == 0
++ Includes: the fencing zero at the end */
++#define GFS_LOCKNAME_LEN (64)
++
++struct gfs_sb {
++ /* Order is important */
++ struct gfs_meta_header sb_header;
++
++ uint32_t sb_fs_format;
++ uint32_t sb_multihost_format;
++ uint32_t sb_flags;
++
++ /* Important information */
++ uint32_t sb_bsize; /* fundamental fs block size in bytes */
++ uint32_t sb_bsize_shift; /* log2(sb_bsize) */
++ uint32_t sb_seg_size; /* Journal segment size in FS blocks */
++
++ struct gfs_inum sb_jindex_di; /* journal index inode number (GFS_SB_LOCK) */
++ struct gfs_inum sb_rindex_di; /* resource index inode number (GFS_SB_LOCK) */
++ struct gfs_inum sb_root_di; /* root directory inode number (GFS_ROOT_LOCK) */
++
++ char sb_lockproto[GFS_LOCKNAME_LEN]; /* Type of locking this FS uses */
++ char sb_locktable[GFS_LOCKNAME_LEN]; /* Name of lock table for this FS */
++
++ struct gfs_inum sb_quota_di;
++ struct gfs_inum sb_license_di;
++
++ char sb_reserved[96];
++};
++
++/*
++ * journal index structure
++ */
++
++struct gfs_jindex {
++ uint64_t ji_addr; /* starting block of the journal */
++ uint32_t ji_nsegment; /* number of segments in journal */
++ uint32_t ji_pad;
++
++ char ji_reserved[64];
++};
++
++/*
++ * resource index structure
++ */
++
++struct gfs_rindex {
++ uint64_t ri_addr; /* rgrp block disk address */
++ uint32_t ri_length; /* length of rgrp header in fs blocks */
++ uint32_t ri_pad;
++
++ uint64_t ri_data1; /* first data location */
++ uint32_t ri_data; /* num of data blocks in rgrp */
++
++ uint32_t ri_bitbytes; /* number of bytes in data bitmaps */
++
++ char ri_reserved[64];
++};
++
++/*
++ * resource group header structure
++ *
++ */
++
++/* Number of blocks per byte in rgrp */
++#define GFS_NBBY (4)
++#define GFS_BIT_SIZE (2)
++#define GFS_BIT_MASK (0x00000003)
++
++#define GFS_BLKST_FREE (0)
++#define GFS_BLKST_USED (1)
++#define GFS_BLKST_FREEMETA (2)
++#define GFS_BLKST_USEDMETA (3)
++
++struct gfs_rgrp {
++ struct gfs_meta_header rg_header;
++
++ uint32_t rg_flags; /* flags */
++
++ uint32_t rg_free; /* number of free data blocks */
++
++ uint32_t rg_useddi; /* number of dinodes */
++ uint32_t rg_freedi; /* number of unused dinodes */
++ struct gfs_inum rg_freedi_list; /* list of free dinodes */
++
++ uint32_t rg_usedmeta; /* number of used metadata blocks (not including dinodes) */
++ uint32_t rg_freemeta; /* number of unused metadata blocks */
++
++ char rg_reserved[64];
++};
++
++/*
++ * Quota Structures
++ */
++
++struct gfs_quota {
++ uint64_t qu_limit;
++ uint64_t qu_warn;
++ int64_t qu_value;
++
++ char qu_reserved[64];
++};
++
++/*
++ * dinode structure
++ */
++
++#define GFS_MAX_META_HEIGHT (10)
++#define GFS_DIR_MAX_DEPTH (17)
++
++/* Dinode types */
++#define GFS_FILE_NON (0)
++#define GFS_FILE_REG (1)
++#define GFS_FILE_DIR (2)
++#define GFS_FILE_LNK (5)
++#define GFS_FILE_BLK (7)
++#define GFS_FILE_CHR (8)
++#define GFS_FILE_FIFO (101)
++#define GFS_FILE_SOCK (102)
++
++/* Dinode flags */
++#define GFS_DIF_JDATA (0x00000001)
++#define GFS_DIF_EXHASH (0x00000002)
++#define GFS_DIF_UNUSED (0x00000004)
++#define GFS_DIF_EA_INDIRECT (0x00000008)
++#define GFS_DIF_DIRECTIO (0x00000010)
++#define GFS_DIF_IMMUTABLE (0x00000020)
++#define GFS_DIF_APPENDONLY (0x00000040)
++#define GFS_DIF_NOATIME (0x00000080)
++#define GFS_DIF_SYNC (0x00000100)
++#define GFS_DIF_INHERIT_DIRECTIO (0x40000000)
++#define GFS_DIF_INHERIT_JDATA (0x80000000)
++
++struct gfs_dinode {
++ struct gfs_meta_header di_header;
++
++ struct gfs_inum di_num;
++
++ uint32_t di_mode; /* mode of file */
++ uint32_t di_uid; /* owner's user id */
++ uint32_t di_gid; /* owner's group id */
++ uint32_t di_nlink; /* number of links to this file */
++ uint64_t di_size; /* number of bytes in file */
++ uint64_t di_blocks; /* number of blocks in file */
++ int64_t di_atime; /* time last accessed */
++ int64_t di_mtime; /* time last modified */
++ int64_t di_ctime; /* time last changed */
++ uint32_t di_major; /* device major number */
++ uint32_t di_minor; /* device minor number */
++
++ uint64_t di_rgrp; /* dinode rgrp block number */
++ uint64_t di_goal_rgrp; /* rgrp to alloc from next */
++ uint32_t di_goal_dblk; /* data block goal */
++ uint32_t di_goal_mblk; /* metadata block goal */
++ uint32_t di_flags; /* flags */
++ uint32_t di_payload_format; /* struct gfs_rindex, struct gfs_jindex, or struct gfs_dirent */
++ uint16_t di_type; /* type of file */
++ uint16_t di_height; /* height of metadata */
++ uint32_t di_incarn; /* incarnation number */
++ uint16_t di_pad;
++
++ /* These only apply to directories */
++ uint16_t di_depth; /* Number of bits in the table */
++ uint32_t di_entries; /* The number of entries in the directory */
++
++ /* This only applies to unused inodes */
++ struct gfs_inum di_next_unused;
++
++ uint64_t di_eattr; /* extended attribute block number */
++
++ char di_reserved[56];
++};
++
++/*
++ * indirect block header
++ */
++
++struct gfs_indirect {
++ struct gfs_meta_header in_header;
++
++ char in_reserved[64];
++};
++
++/*
++ * directory structure - many of these per directory file
++ */
++
++#define GFS_FNAMESIZE (255)
++#define GFS_DIRENT_SIZE(name_len) ((sizeof(struct gfs_dirent) + (name_len) + 7) & ~7)
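++
++/*
++ * GFS_DIRENT_SIZE rounds the dirent-plus-name length up to the next
++ * multiple of 8.  With the 40-byte struct gfs_dirent below and a
++ * 5-character name, that is (40 + 5 + 7) & ~7 = 48 bytes.
++ */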
++
++struct gfs_dirent {
++ struct gfs_inum de_inum; /* Inode number */
++ uint32_t de_hash; /* hash of the filename */
++ uint16_t de_rec_len; /* the length of the dirent */
++ uint16_t de_name_len; /* the length of the name */
++ uint16_t de_type; /* type of dinode this points to */
++
++ char de_reserved[14];
++};
++
++/*
++ * Header of leaf directory nodes
++ */
++
++struct gfs_leaf {
++ struct gfs_meta_header lf_header;
++
++ uint16_t lf_depth; /* Depth of leaf */
++ uint16_t lf_entries; /* Number of dirents in leaf */
++ uint32_t lf_dirent_format; /* Format of the dirents */
++ uint64_t lf_next; /* Next leaf, if overflow */
++
++ char lf_reserved[64];
++};
++
++/*
++ * Log header structure
++ */
++
++#define GFS_LOG_HEAD_UNMOUNT (0x00000001)
++
++struct gfs_log_header {
++ struct gfs_meta_header lh_header;
++
++ uint32_t lh_flags; /* Flags */
++ uint32_t lh_pad;
++
++ uint64_t lh_first; /* Block number of first header in this trans */
++ uint64_t lh_sequence; /* Sequence number of this transaction */
++
++ uint64_t lh_tail; /* Block number of log tail */
++ uint64_t lh_last_dump; /* block number of last dump */
++
++ char lh_reserved[64];
++};
++
++/*
++ * Log type descriptor
++ */
++
++#define GFS_LOG_DESC_METADATA (300)
++/* ld_data1 is the number of metadata blocks in the descriptor.
++ ld_data2 is unused.
++ */
++
++#define GFS_LOG_DESC_IUL (400)
++/* ld_data1 is TRUE if this is a dump.
++ ld_data2 is unused.
++ FixMe!!! ld_data1 should be the number of entries.
++ ld_data2 should be "TRUE if this is a dump".
++ */
++
++#define GFS_LOG_DESC_IDA (401)
++/* ld_data1 is unused.
++ ld_data2 is unused.
++ FixMe!!! ld_data1 should be the number of entries.
++ */
++
++#define GFS_LOG_DESC_Q (402)
++/* ld_data1 is the number of quota changes in the descriptor.
++ ld_data2 is TRUE if this is a dump.
++ */
++
++#define GFS_LOG_DESC_LAST (500)
++/* ld_data1 is unused.
++ ld_data2 is unused.
++ */
++
++struct gfs_log_descriptor {
++ struct gfs_meta_header ld_header;
++
++ uint32_t ld_type; /* Type of data in this log chunk */
++ uint32_t ld_length; /* Number of buffers in this chunk */
++ uint32_t ld_data1; /* descriptor specific field */
++ uint32_t ld_data2; /* descriptor specific field */
++
++ char ld_reserved[64];
++};
++
++/*
++ * Metadata block tags
++ */
++
++struct gfs_block_tag {
++ uint64_t bt_blkno; /* inplace block number */
++ uint32_t bt_flags; /* flags */
++ uint32_t bt_pad;
++};
++
++/*
++ * Quota Journal Tag
++ */
++
++#define GFS_QTF_USER (0x00000001)
++
++struct gfs_quota_tag {
++ int64_t qt_change;
++ uint32_t qt_flags;
++ uint32_t qt_id;
++};
++
++/*
++ * Extended attribute header format
++ */
++
++#define GFS_EA_MAX_NAME_LEN (255)
++#define GFS_EA_MAX_DATA_LEN (65535)
++
++#define GFS_EATYPE_LAST (2)
++
++#define GFS_EATYPE_UNUSED (0)
++#define GFS_EATYPE_USR (1)
++#define GFS_EATYPE_SYS (2)
++#define GFS_EATYPE_VALID(x) ((x) && (x) <= GFS_EATYPE_LAST) /* this is only
++ for requests */
++
++#define GFS_EAFLAG_LAST (0x01) /* last ea in block */
++
++struct gfs_ea_header {
++ uint32_t ea_rec_len;
++ uint32_t ea_data_len;
++ uint8_t ea_name_len; /* the name is not NUL-terminated */
++ uint8_t ea_type; /* GFS_EATYPE_... */
++ uint8_t ea_flags;
++ uint8_t ea_num_ptrs;
++ uint32_t ea_pad;
++};
++
++/* Endian functions */
++
++#define GFS_ENDIAN_BIG
++
++#ifdef GFS_ENDIAN_BIG
++
++#define gfs16_to_cpu be16_to_cpu
++#define gfs32_to_cpu be32_to_cpu
++#define gfs64_to_cpu be64_to_cpu
++
++#define cpu_to_gfs16 cpu_to_be16
++#define cpu_to_gfs32 cpu_to_be32
++#define cpu_to_gfs64 cpu_to_be64
++
++#else /* GFS_ENDIAN_BIG */
++
++#define gfs16_to_cpu le16_to_cpu
++#define gfs32_to_cpu le32_to_cpu
++#define gfs64_to_cpu le64_to_cpu
++
++#define cpu_to_gfs16 cpu_to_le16
++#define cpu_to_gfs32 cpu_to_le32
++#define cpu_to_gfs64 cpu_to_le64
++
++#endif /* GFS_ENDIAN_BIG */
++
++/* Translation functions */
++
++void gfs_inum_in(struct gfs_inum *no, char *buf);
++void gfs_inum_out(struct gfs_inum *no, char *buf);
++void gfs_meta_header_in(struct gfs_meta_header *mh, char *buf);
++void gfs_meta_header_out(struct gfs_meta_header *mh, char *buf);
++void gfs_sb_in(struct gfs_sb *sb, char *buf);
++void gfs_sb_out(struct gfs_sb *sb, char *buf);
++void gfs_jindex_in(struct gfs_jindex *jindex, char *buf);
++void gfs_jindex_out(struct gfs_jindex *jindex, char *buf);
++void gfs_rindex_in(struct gfs_rindex *rindex, char *buf);
++void gfs_rindex_out(struct gfs_rindex *rindex, char *buf);
++void gfs_rgrp_in(struct gfs_rgrp *rgrp, char *buf);
++void gfs_rgrp_out(struct gfs_rgrp *rgrp, char *buf);
++void gfs_quota_in(struct gfs_quota *quota, char *buf);
++void gfs_quota_out(struct gfs_quota *quota, char *buf);
++void gfs_dinode_in(struct gfs_dinode *dinode, char *buf);
++void gfs_dinode_out(struct gfs_dinode *dinode, char *buf);
++void gfs_indirect_in(struct gfs_indirect *indirect, char *buf);
++void gfs_indirect_out(struct gfs_indirect *indirect, char *buf);
++void gfs_dirent_in(struct gfs_dirent *dirent, char *buf);
++void gfs_dirent_out(struct gfs_dirent *dirent, char *buf);
++void gfs_leaf_in(struct gfs_leaf *leaf, char *buf);
++void gfs_leaf_out(struct gfs_leaf *leaf, char *buf);
++void gfs_log_header_in(struct gfs_log_header *head, char *buf);
++void gfs_log_header_out(struct gfs_log_header *head, char *buf);
++void gfs_desc_in(struct gfs_log_descriptor *desc, char *buf);
++void gfs_desc_out(struct gfs_log_descriptor *desc, char *buf);
++void gfs_block_tag_in(struct gfs_block_tag *btag, char *buf);
++void gfs_block_tag_out(struct gfs_block_tag *btag, char *buf);
++void gfs_quota_tag_in(struct gfs_quota_tag *qtag, char *buf);
++void gfs_quota_tag_out(struct gfs_quota_tag *qtag, char *buf);
++void gfs_ea_header_in(struct gfs_ea_header *qtag, char *buf);
++void gfs_ea_header_out(struct gfs_ea_header *qtag, char *buf);
++
++/* Printing functions */
++
++void gfs_inum_print(struct gfs_inum *no);
++void gfs_meta_header_print(struct gfs_meta_header *mh);
++void gfs_sb_print(struct gfs_sb *sb);
++void gfs_jindex_print(struct gfs_jindex *jindex);
++void gfs_rindex_print(struct gfs_rindex *rindex);
++void gfs_rgrp_print(struct gfs_rgrp *rgrp);
++void gfs_quota_print(struct gfs_quota *quota);
++void gfs_dinode_print(struct gfs_dinode *dinode);
++void gfs_indirect_print(struct gfs_indirect *indirect);
++void gfs_dirent_print(struct gfs_dirent *dirent, char *name);
++void gfs_leaf_print(struct gfs_leaf *leaf);
++void gfs_log_header_print(struct gfs_log_header *head);
++void gfs_desc_print(struct gfs_log_descriptor *desc);
++void gfs_block_tag_print(struct gfs_block_tag *tag);
++void gfs_quota_tag_print(struct gfs_quota_tag *tag);
++void gfs_ea_header_print(struct gfs_ea_header *tag);
++
++/* The hash function for ExHash directories */
++
++uint32_t gfs_dir_hash(const char *data, int len);
++
++#endif /* __GFS_ONDISK_DOT_H__ */
++
++
++
++#ifdef WANT_GFS_CONVERSION_FUNCTIONS
++
++#define CPIN_08(s1, s2, member, count) {memcpy((s1->member), (s2->member), (count));}
++#define CPOUT_08(s1, s2, member, count) {memcpy((s2->member), (s1->member), (count));}
++#define CPIN_16(s1, s2, member) {(s1->member) = gfs16_to_cpu((s2->member));}
++#define CPOUT_16(s1, s2, member) {(s2->member) = cpu_to_gfs16((s1->member));}
++#define CPIN_32(s1, s2, member) {(s1->member) = gfs32_to_cpu((s2->member));}
++#define CPOUT_32(s1, s2, member) {(s2->member) = cpu_to_gfs32((s1->member));}
++#define CPIN_64(s1, s2, member) {(s1->member) = gfs64_to_cpu((s2->member));}
++#define CPOUT_64(s1, s2, member) {(s2->member) = cpu_to_gfs64((s1->member));}
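++
++/*
++ * These copy-in/copy-out macros expand to plain endian-converted member
++ * copies; for example CPIN_32(sb, str, sb_bsize) becomes
++ *
++ *	{(sb->sb_bsize) = gfs32_to_cpu((str->sb_bsize));}
++ */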
++
++#define pa(struct, member, count) print_array(#member, struct->member, count);
++
++/**
++ * print_array - Print out an array of bytes
++ * @title: what to print before the array
++ * @buf: the array
++ * @count: the number of bytes
++ *
++ */
++
++static void
++print_array(char *title, char *buf, int count)
++{
++ int x;
++
++ printk(" %s =\n", title);
++ for (x = 0; x < count; x++) {
++ printk("%.2X ", (unsigned char)buf[x]);
++ if (x % 16 == 15)
++ printk("\n");
++ }
++ if (x % 16)
++ printk("\n");
++}
++
++/**
++ * gfs_inum_in - Read in an inode number
++ * @no: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_inum_in(struct gfs_inum *no, char *buf)
++{
++ struct gfs_inum *str = (struct gfs_inum *)buf;
++
++ CPIN_64(no, str, no_formal_ino);
++ CPIN_64(no, str, no_addr);
++}
++
++/**
++ * gfs_inum_out - Write out an inode number
++ * @no: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_inum_out(struct gfs_inum *no, char *buf)
++{
++ struct gfs_inum *str = (struct gfs_inum *)buf;
++
++ CPOUT_64(no, str, no_formal_ino);
++ CPOUT_64(no, str, no_addr);
++}
++
++/**
++ * gfs_inum_print - Print out an inode number
++ * @no: the cpu-order buffer
++ *
++ */
++
++void
++gfs_inum_print(struct gfs_inum *no)
++{
++ pv(no, no_formal_ino, "%"PRIu64);
++ pv(no, no_addr, "%"PRIu64);
++}
++
++/**
++ * gfs_meta_header_in - Read in a metadata header
++ * @mh: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_meta_header_in(struct gfs_meta_header *mh, char *buf)
++{
++ struct gfs_meta_header *str = (struct gfs_meta_header *)buf;
++
++ CPIN_32(mh, str, mh_magic);
++ CPIN_32(mh, str, mh_type);
++ CPIN_64(mh, str, mh_generation);
++ CPIN_32(mh, str, mh_format);
++ CPIN_32(mh, str, mh_incarn);
++}
++
++/**
++ * gfs_meta_header_out - Write out a metadata header
++ * @mh: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ * Don't ever change the generation number in this routine.
++ * It's done manually in increment_generation().
++ */
++
++void
++gfs_meta_header_out(struct gfs_meta_header *mh, char *buf)
++{
++ struct gfs_meta_header *str = (struct gfs_meta_header *)buf;
++
++ CPOUT_32(mh, str, mh_magic);
++ CPOUT_32(mh, str, mh_type);
++#if 0
++ /* Don't do this!
++ mh_generation should only be changed manually. */
++ CPOUT_64(mh, str, mh_generation);
++#endif
++ CPOUT_32(mh, str, mh_format);
++ CPOUT_32(mh, str, mh_incarn);
++}
++
++/**
++ * gfs_meta_header_print - Print out a metadata header
++ * @mh: the cpu-order buffer
++ *
++ */
++
++void
++gfs_meta_header_print(struct gfs_meta_header *mh)
++{
++ pv(mh, mh_magic, "0x%.8X");
++ pv(mh, mh_type, "%u");
++ pv(mh, mh_generation, "%"PRIu64);
++ pv(mh, mh_format, "%u");
++ pv(mh, mh_incarn, "%u");
++}
++
++/**
++ * gfs_sb_in - Read in a superblock
++ * @sb: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_sb_in(struct gfs_sb *sb, char *buf)
++{
++ struct gfs_sb *str = (struct gfs_sb *)buf;
++
++ gfs_meta_header_in(&sb->sb_header, buf);
++
++ CPIN_32(sb, str, sb_fs_format);
++ CPIN_32(sb, str, sb_multihost_format);
++ CPIN_32(sb, str, sb_flags);
++
++ CPIN_32(sb, str, sb_bsize);
++ CPIN_32(sb, str, sb_bsize_shift);
++ CPIN_32(sb, str, sb_seg_size);
++
++ gfs_inum_in(&sb->sb_jindex_di, (char *)&str->sb_jindex_di);
++ gfs_inum_in(&sb->sb_rindex_di, (char *)&str->sb_rindex_di);
++ gfs_inum_in(&sb->sb_root_di, (char *)&str->sb_root_di);
++
++ CPIN_08(sb, str, sb_lockproto, GFS_LOCKNAME_LEN);
++ CPIN_08(sb, str, sb_locktable, GFS_LOCKNAME_LEN);
++
++ gfs_inum_in(&sb->sb_quota_di, (char *)&str->sb_quota_di);
++ gfs_inum_in(&sb->sb_license_di, (char *)&str->sb_license_di);
++
++ CPIN_08(sb, str, sb_reserved, 96);
++}
++
++/**
++ * gfs_sb_out - Write out a superblock
++ * @sb: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_sb_out(struct gfs_sb *sb, char *buf)
++{
++ struct gfs_sb *str = (struct gfs_sb *)buf;
++
++ gfs_meta_header_out(&sb->sb_header, buf);
++
++ CPOUT_32(sb, str, sb_fs_format);
++ CPOUT_32(sb, str, sb_multihost_format);
++ CPOUT_32(sb, str, sb_flags);
++
++ CPOUT_32(sb, str, sb_bsize);
++ CPOUT_32(sb, str, sb_bsize_shift);
++ CPOUT_32(sb, str, sb_seg_size);
++
++ gfs_inum_out(&sb->sb_jindex_di, (char *)&str->sb_jindex_di);
++ gfs_inum_out(&sb->sb_rindex_di, (char *)&str->sb_rindex_di);
++ gfs_inum_out(&sb->sb_root_di, (char *)&str->sb_root_di);
++
++ CPOUT_08(sb, str, sb_lockproto, GFS_LOCKNAME_LEN);
++ CPOUT_08(sb, str, sb_locktable, GFS_LOCKNAME_LEN);
++
++ gfs_inum_out(&sb->sb_quota_di, (char *)&str->sb_quota_di);
++ gfs_inum_out(&sb->sb_license_di, (char *)&str->sb_license_di);
++
++ CPOUT_08(sb, str, sb_reserved, 96);
++}
++
++/**
++ * gfs_sb_print - Print out a superblock
++ * @sb: the cpu-order buffer
++ *
++ */
++
++void
++gfs_sb_print(struct gfs_sb *sb)
++{
++ gfs_meta_header_print(&sb->sb_header);
++
++ pv(sb, sb_fs_format, "%u");
++ pv(sb, sb_multihost_format, "%u");
++ pv(sb, sb_flags, "%u");
++
++ pv(sb, sb_bsize, "%u");
++ pv(sb, sb_bsize_shift, "%u");
++ pv(sb, sb_seg_size, "%u");
++
++ gfs_inum_print(&sb->sb_jindex_di);
++ gfs_inum_print(&sb->sb_rindex_di);
++ gfs_inum_print(&sb->sb_root_di);
++
++ pv(sb, sb_lockproto, "%s");
++ pv(sb, sb_locktable, "%s");
++
++ gfs_inum_print(&sb->sb_quota_di);
++ gfs_inum_print(&sb->sb_license_di);
++
++ pa(sb, sb_reserved, 96);
++}
++
++/**
++ * gfs_jindex_in - Read in a journal index structure
++ * @jindex: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_jindex_in(struct gfs_jindex *jindex, char *buf)
++{
++ struct gfs_jindex *str = (struct gfs_jindex *)buf;
++
++ CPIN_64(jindex, str, ji_addr);
++ CPIN_32(jindex, str, ji_nsegment);
++ CPIN_32(jindex, str, ji_pad);
++
++ CPIN_08(jindex, str, ji_reserved, 64);
++}
++
++/**
++ * gfs_jindex_out - Write out a journal index structure
++ * @jindex: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_jindex_out(struct gfs_jindex *jindex, char *buf)
++{
++ struct gfs_jindex *str = (struct gfs_jindex *)buf;
++
++ CPOUT_64(jindex, str, ji_addr);
++ CPOUT_32(jindex, str, ji_nsegment);
++ CPOUT_32(jindex, str, ji_pad);
++
++ CPOUT_08(jindex, str, ji_reserved, 64);
++}
++
++/**
++ * gfs_jindex_print - Print out a journal index structure
++ * @ji: the cpu-order buffer
++ *
++ */
++
++void
++gfs_jindex_print(struct gfs_jindex *ji)
++{
++ pv(ji, ji_addr, "%"PRIu64);
++ pv(ji, ji_nsegment, "%u");
++ pv(ji, ji_pad, "%u");
++
++ pa(ji, ji_reserved, 64);
++}
++
++/**
++ * gfs_rindex_in - Read in a resource index structure
++ * @rindex: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_rindex_in(struct gfs_rindex *rindex, char *buf)
++{
++ struct gfs_rindex *str = (struct gfs_rindex *)buf;
++
++ CPIN_64(rindex, str, ri_addr);
++ CPIN_32(rindex, str, ri_length);
++ CPIN_32(rindex, str, ri_pad);
++
++ CPIN_64(rindex, str, ri_data1);
++ CPIN_32(rindex, str, ri_data);
++
++ CPIN_32(rindex, str, ri_bitbytes);
++
++ CPIN_08(rindex, str, ri_reserved, 64);
++}
++
++/**
++ * gfs_rindex_out - Write out a resource index structure
++ * @rindex: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_rindex_out(struct gfs_rindex *rindex, char *buf)
++{
++ struct gfs_rindex *str = (struct gfs_rindex *)buf;
++
++ CPOUT_64(rindex, str, ri_addr);
++ CPOUT_32(rindex, str, ri_length);
++ CPOUT_32(rindex, str, ri_pad);
++
++ CPOUT_64(rindex, str, ri_data1);
++ CPOUT_32(rindex, str, ri_data);
++
++ CPOUT_32(rindex, str, ri_bitbytes);
++
++ CPOUT_08(rindex, str, ri_reserved, 64);
++}
++
++/**
++ * gfs_rindex_print - Print out a resource index structure
++ * @ri: the cpu-order buffer
++ *
++ */
++
++void
++gfs_rindex_print(struct gfs_rindex *ri)
++{
++ pv(ri, ri_addr, "%"PRIu64);
++ pv(ri, ri_length, "%u");
++ pv(ri, ri_pad, "%u");
++
++ pv(ri, ri_data1, "%"PRIu64);
++ pv(ri, ri_data, "%u");
++
++ pv(ri, ri_bitbytes, "%u");
++
++ pa(ri, ri_reserved, 64);
++}
++
++/**
++ * gfs_rgrp_in - Read in a resource group header
++ * @rgrp: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_rgrp_in(struct gfs_rgrp *rgrp, char *buf)
++{
++ struct gfs_rgrp *str = (struct gfs_rgrp *)buf;
++
++ gfs_meta_header_in(&rgrp->rg_header, buf);
++
++ CPIN_32(rgrp, str, rg_flags);
++
++ CPIN_32(rgrp, str, rg_free);
++
++ CPIN_32(rgrp, str, rg_useddi);
++ CPIN_32(rgrp, str, rg_freedi);
++ gfs_inum_in(&rgrp->rg_freedi_list, (char *)&str->rg_freedi_list);
++
++ CPIN_32(rgrp, str, rg_usedmeta);
++ CPIN_32(rgrp, str, rg_freemeta);
++
++ CPIN_08(rgrp, str, rg_reserved, 64);
++}
++
++/**
++ * gfs_rgrp_out - Write out a resource group header
++ * @rgrp: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_rgrp_out(struct gfs_rgrp *rgrp, char *buf)
++{
++ struct gfs_rgrp *str = (struct gfs_rgrp *)buf;
++
++ gfs_meta_header_out(&rgrp->rg_header, buf);
++
++ CPOUT_32(rgrp, str, rg_flags);
++
++ CPOUT_32(rgrp, str, rg_free);
++
++ CPOUT_32(rgrp, str, rg_useddi);
++ CPOUT_32(rgrp, str, rg_freedi);
++ gfs_inum_out(&rgrp->rg_freedi_list, (char *)&str->rg_freedi_list);
++
++ CPOUT_32(rgrp, str, rg_usedmeta);
++ CPOUT_32(rgrp, str, rg_freemeta);
++
++ CPOUT_08(rgrp, str, rg_reserved, 64);
++}
++
++/**
++ * gfs_rgrp_print - Print out a resource group header
++ * @rg: the cpu-order buffer
++ *
++ */
++
++void
++gfs_rgrp_print(struct gfs_rgrp *rg)
++{
++ gfs_meta_header_print(&rg->rg_header);
++
++ pv(rg, rg_flags, "%u");
++
++ pv(rg, rg_free, "%u");
++
++ pv(rg, rg_useddi, "%u");
++ pv(rg, rg_freedi, "%u");
++ gfs_inum_print(&rg->rg_freedi_list);
++
++ pv(rg, rg_usedmeta, "%u");
++ pv(rg, rg_freemeta, "%u");
++
++ pa(rg, rg_reserved, 64);
++}
++
++/**
++ * gfs_quota_in - Read in a quota structure
++ * @quota: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_quota_in(struct gfs_quota *quota, char *buf)
++{
++ struct gfs_quota *str = (struct gfs_quota *)buf;
++
++ CPIN_64(quota, str, qu_limit);
++ CPIN_64(quota, str, qu_warn);
++ CPIN_64(quota, str, qu_value);
++
++ CPIN_08(quota, str, qu_reserved, 64);
++}
++
++/**
++ * gfs_quota_out - Write out a quota structure
++ * @quota: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_quota_out(struct gfs_quota *quota, char *buf)
++{
++ struct gfs_quota *str = (struct gfs_quota *)buf;
++
++ CPOUT_64(quota, str, qu_limit);
++ CPOUT_64(quota, str, qu_warn);
++ CPOUT_64(quota, str, qu_value);
++
++ CPOUT_08(quota, str, qu_reserved, 64);
++}
++
++/**
++ * gfs_quota_print - Print out a quota structure
++ * @quota: the cpu-order buffer
++ *
++ */
++
++void
++gfs_quota_print(struct gfs_quota *quota)
++{
++ pv(quota, qu_limit, "%"PRIu64);
++ pv(quota, qu_warn, "%"PRIu64);
++ pv(quota, qu_value, "%"PRId64);
++
++ pa(quota, qu_reserved, 64);
++}
++
++/**
++ * gfs_dinode_in - Read in a dinode
++ * @dinode: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_dinode_in(struct gfs_dinode *dinode, char *buf)
++{
++ struct gfs_dinode *str = (struct gfs_dinode *)buf;
++
++ gfs_meta_header_in(&dinode->di_header, buf);
++
++ gfs_inum_in(&dinode->di_num, (char *)&str->di_num);
++
++ CPIN_32(dinode, str, di_mode);
++ CPIN_32(dinode, str, di_uid);
++ CPIN_32(dinode, str, di_gid);
++ CPIN_32(dinode, str, di_nlink);
++ CPIN_64(dinode, str, di_size);
++ CPIN_64(dinode, str, di_blocks);
++ CPIN_64(dinode, str, di_atime);
++ CPIN_64(dinode, str, di_mtime);
++ CPIN_64(dinode, str, di_ctime);
++ CPIN_32(dinode, str, di_major);
++ CPIN_32(dinode, str, di_minor);
++
++ CPIN_64(dinode, str, di_rgrp);
++ CPIN_64(dinode, str, di_goal_rgrp);
++ CPIN_32(dinode, str, di_goal_dblk);
++ CPIN_32(dinode, str, di_goal_mblk);
++ CPIN_32(dinode, str, di_flags);
++ CPIN_32(dinode, str, di_payload_format);
++ CPIN_16(dinode, str, di_type);
++ CPIN_16(dinode, str, di_height);
++ CPIN_32(dinode, str, di_incarn);
++ CPIN_16(dinode, str, di_pad);
++
++ CPIN_16(dinode, str, di_depth);
++ CPIN_32(dinode, str, di_entries);
++
++ gfs_inum_in(&dinode->di_next_unused, (char *)&str->di_next_unused);
++
++ CPIN_64(dinode, str, di_eattr);
++
++ CPIN_08(dinode, str, di_reserved, 56);
++}
++
++/**
++ * gfs_dinode_out - Write out a dinode
++ * @dinode: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_dinode_out(struct gfs_dinode *dinode, char *buf)
++{
++ struct gfs_dinode *str = (struct gfs_dinode *)buf;
++
++ gfs_meta_header_out(&dinode->di_header, buf);
++
++ gfs_inum_out(&dinode->di_num, (char *)&str->di_num);
++
++ CPOUT_32(dinode, str, di_mode);
++ CPOUT_32(dinode, str, di_uid);
++ CPOUT_32(dinode, str, di_gid);
++ CPOUT_32(dinode, str, di_nlink);
++ CPOUT_64(dinode, str, di_size);
++ CPOUT_64(dinode, str, di_blocks);
++ CPOUT_64(dinode, str, di_atime);
++ CPOUT_64(dinode, str, di_mtime);
++ CPOUT_64(dinode, str, di_ctime);
++ CPOUT_32(dinode, str, di_major);
++ CPOUT_32(dinode, str, di_minor);
++
++ CPOUT_64(dinode, str, di_rgrp);
++ CPOUT_64(dinode, str, di_goal_rgrp);
++ CPOUT_32(dinode, str, di_goal_dblk);
++ CPOUT_32(dinode, str, di_goal_mblk);
++ CPOUT_32(dinode, str, di_flags);
++ CPOUT_32(dinode, str, di_payload_format);
++ CPOUT_16(dinode, str, di_type);
++ CPOUT_16(dinode, str, di_height);
++ CPOUT_32(dinode, str, di_incarn);
++ CPOUT_16(dinode, str, di_pad);
++
++ CPOUT_16(dinode, str, di_depth);
++ CPOUT_32(dinode, str, di_entries);
++
++ gfs_inum_out(&dinode->di_next_unused, (char *)&str->di_next_unused);
++
++ CPOUT_64(dinode, str, di_eattr);
++
++ CPOUT_08(dinode, str, di_reserved, 56);
++}
++
++/**
++ * gfs_dinode_print - Print out a dinode
++ * @di: the cpu-order buffer
++ *
++ */
++
++void
++gfs_dinode_print(struct gfs_dinode *di)
++{
++ gfs_meta_header_print(&di->di_header);
++
++ gfs_inum_print(&di->di_num);
++
++ pv(di, di_mode, "0%o");
++ pv(di, di_uid, "%u");
++ pv(di, di_gid, "%u");
++ pv(di, di_nlink, "%u");
++ pv(di, di_size, "%"PRIu64);
++ pv(di, di_blocks, "%"PRIu64);
++ pv(di, di_atime, "%"PRId64);
++ pv(di, di_mtime, "%"PRId64);
++ pv(di, di_ctime, "%"PRId64);
++ pv(di, di_major, "%u");
++ pv(di, di_minor, "%u");
++
++ pv(di, di_rgrp, "%"PRIu64);
++ pv(di, di_goal_rgrp, "%"PRIu64);
++ pv(di, di_goal_dblk, "%u");
++ pv(di, di_goal_mblk, "%u");
++ pv(di, di_flags, "0x%.8X");
++ pv(di, di_payload_format, "%u");
++ pv(di, di_type, "%u");
++ pv(di, di_height, "%u");
++ pv(di, di_incarn, "%u");
++ pv(di, di_pad, "%u");
++
++ pv(di, di_depth, "%u");
++ pv(di, di_entries, "%u");
++
++ gfs_inum_print(&di->di_next_unused);
++
++ pv(di, di_eattr, "%"PRIu64);
++
++ pa(di, di_reserved, 56);
++}
++
++/**
++ * gfs_indirect_in - copy in the header of an indirect block
++ * @indirect: the in memory copy
++ * @buf: the buffer copy
++ *
++ */
++
++void
++gfs_indirect_in(struct gfs_indirect *indirect, char *buf)
++{
++ struct gfs_indirect *str = (struct gfs_indirect *)buf;
++
++ gfs_meta_header_in(&indirect->in_header, buf);
++
++ CPIN_08(indirect, str, in_reserved, 64);
++}
++
++/**
++ * gfs_indirect_out - copy out the header of an indirect block
++ * @indirect: the in memory copy
++ * @buf: the buffer copy
++ *
++ */
++
++void
++gfs_indirect_out(struct gfs_indirect *indirect, char *buf)
++{
++ struct gfs_indirect *str = (struct gfs_indirect *)buf;
++
++ gfs_meta_header_out(&indirect->in_header, buf);
++
++ CPOUT_08(indirect, str, in_reserved, 64);
++}
++
++/**
++ * gfs_indirect_print - Print out an indirect block header
++ * @indirect: the cpu-order buffer
++ *
++ */
++
++void
++gfs_indirect_print(struct gfs_indirect *indirect)
++{
++ gfs_meta_header_print(&indirect->in_header);
++
++ pa(indirect, in_reserved, 64);
++}
++
++/**
++ * gfs_dirent_in - Read in a directory entry
++ * @dirent: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_dirent_in(struct gfs_dirent *dirent, char *buf)
++{
++ struct gfs_dirent *str = (struct gfs_dirent *)buf;
++
++ gfs_inum_in(&dirent->de_inum, (char *)&str->de_inum);
++ CPIN_32(dirent, str, de_hash);
++ CPIN_16(dirent, str, de_rec_len);
++ CPIN_16(dirent, str, de_name_len);
++ CPIN_16(dirent, str, de_type);
++
++ CPIN_08(dirent, str, de_reserved, 14);
++}
++
++/**
++ * gfs_dirent_out - Write out a directory entry
++ * @dirent: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_dirent_out(struct gfs_dirent *dirent, char *buf)
++{
++ struct gfs_dirent *str = (struct gfs_dirent *)buf;
++
++ gfs_inum_out(&dirent->de_inum, (char *)&str->de_inum);
++ CPOUT_32(dirent, str, de_hash);
++ CPOUT_16(dirent, str, de_rec_len);
++ CPOUT_16(dirent, str, de_name_len);
++ CPOUT_16(dirent, str, de_type);
++
++ CPOUT_08(dirent, str, de_reserved, 14);
++}
++
++/**
++ * gfs_dirent_print - Print out a directory entry
++ * @de: the cpu-order buffer
++ * @name: the filename
++ *
++ */
++
++void
++gfs_dirent_print(struct gfs_dirent *de, char *name)
++{
++ char buf[GFS_FNAMESIZE + 1];
++
++ gfs_inum_print(&de->de_inum);
++ pv(de, de_hash, "0x%.8X");
++ pv(de, de_rec_len, "%u");
++ pv(de, de_name_len, "%u");
++ pv(de, de_type, "%u");
++
++ pa(de, de_reserved, 14);
++
++ memset(buf, 0, GFS_FNAMESIZE + 1);
++ memcpy(buf, name, de->de_name_len);
++ printk(" name = %s\n", buf);
++}
++
++/**
++ * gfs_leaf_in - Read in a directory leaf header
++ * @leaf: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_leaf_in(struct gfs_leaf *leaf, char *buf)
++{
++ struct gfs_leaf *str = (struct gfs_leaf *)buf;
++
++ gfs_meta_header_in(&leaf->lf_header, buf);
++
++ CPIN_16(leaf, str, lf_depth);
++ CPIN_16(leaf, str, lf_entries);
++ CPIN_32(leaf, str, lf_dirent_format);
++ CPIN_64(leaf, str, lf_next);
++
++ CPIN_08(leaf, str, lf_reserved, 64);
++}
++
++/**
++ * gfs_leaf_out - Write out a directory leaf header
++ * @leaf: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_leaf_out(struct gfs_leaf *leaf, char *buf)
++{
++ struct gfs_leaf *str = (struct gfs_leaf *)buf;
++
++ gfs_meta_header_out(&leaf->lf_header, buf);
++
++ CPOUT_16(leaf, str, lf_depth);
++ CPOUT_16(leaf, str, lf_entries);
++ CPOUT_32(leaf, str, lf_dirent_format);
++ CPOUT_64(leaf, str, lf_next);
++
++ CPOUT_08(leaf, str, lf_reserved, 64);
++}
++
++/**
++ * gfs_leaf_print - Print out a directory leaf header
++ * @lf: the cpu-order buffer
++ *
++ */
++
++void
++gfs_leaf_print(struct gfs_leaf *lf)
++{
++ gfs_meta_header_print(&lf->lf_header);
++
++ pv(lf, lf_depth, "%u");
++ pv(lf, lf_entries, "%u");
++ pv(lf, lf_dirent_format, "%u");
++ pv(lf, lf_next, "%"PRIu64);
++
++ pa(lf, lf_reserved, 64);
++}
++
++/**
++ * gfs_log_header_in - Read in a log header
++ * @head: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_log_header_in(struct gfs_log_header *head, char *buf)
++{
++ struct gfs_log_header *str = (struct gfs_log_header *)buf;
++
++ gfs_meta_header_in(&head->lh_header, buf);
++
++ CPIN_32(head, str, lh_flags);
++ CPIN_32(head, str, lh_pad);
++
++ CPIN_64(head, str, lh_first);
++ CPIN_64(head, str, lh_sequence);
++
++ CPIN_64(head, str, lh_tail);
++ CPIN_64(head, str, lh_last_dump);
++
++ CPIN_08(head, str, lh_reserved, 64);
++}
++
++/**
++ * gfs_log_header_out - Write out a log header
++ * @head: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_log_header_out(struct gfs_log_header *head, char *buf)
++{
++ struct gfs_log_header *str = (struct gfs_log_header *)buf;
++
++ gfs_meta_header_out(&head->lh_header, buf);
++
++ CPOUT_32(head, str, lh_flags);
++ CPOUT_32(head, str, lh_pad);
++
++ CPOUT_64(head, str, lh_first);
++ CPOUT_64(head, str, lh_sequence);
++
++ CPOUT_64(head, str, lh_tail);
++ CPOUT_64(head, str, lh_last_dump);
++
++ CPOUT_08(head, str, lh_reserved, 64);
++}
++
++/**
++ * gfs_log_header_print - Print out a log header
++ * @lh: the cpu-order buffer
++ *
++ */
++
++void
++gfs_log_header_print(struct gfs_log_header *lh)
++{
++ gfs_meta_header_print(&lh->lh_header);
++
++ pv(lh, lh_flags, "0x%.8X");
++ pv(lh, lh_pad, "%u");
++
++ pv(lh, lh_first, "%"PRIu64);
++ pv(lh, lh_sequence, "%"PRIu64);
++
++ pv(lh, lh_tail, "%"PRIu64);
++ pv(lh, lh_last_dump, "%"PRIu64);
++
++ pa(lh, lh_reserved, 64);
++}
++
++/**
++ * gfs_desc_in - Read in a log descriptor
++ * @desc: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_desc_in(struct gfs_log_descriptor *desc, char *buf)
++{
++ struct gfs_log_descriptor *str = (struct gfs_log_descriptor *)buf;
++
++ gfs_meta_header_in(&desc->ld_header, buf);
++
++ CPIN_32(desc, str, ld_type);
++ CPIN_32(desc, str, ld_length);
++ CPIN_32(desc, str, ld_data1);
++ CPIN_32(desc, str, ld_data2);
++
++ CPIN_08(desc, str, ld_reserved, 64);
++}
++
++/**
++ * gfs_desc_out - Write out a log descriptor
++ * @desc: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_desc_out(struct gfs_log_descriptor *desc, char *buf)
++{
++ struct gfs_log_descriptor *str = (struct gfs_log_descriptor *)buf;
++
++ gfs_meta_header_out(&desc->ld_header, buf);
++
++ CPOUT_32(desc, str, ld_type);
++ CPOUT_32(desc, str, ld_length);
++ CPOUT_32(desc, str, ld_data1);
++ CPOUT_32(desc, str, ld_data2);
++
++ CPOUT_08(desc, str, ld_reserved, 64);
++}
++
++/**
++ * gfs_desc_print - Print out a log descriptor
++ * @ld: the cpu-order buffer
++ *
++ */
++
++void
++gfs_desc_print(struct gfs_log_descriptor *ld)
++{
++ gfs_meta_header_print(&ld->ld_header);
++
++ pv(ld, ld_type, "%u");
++ pv(ld, ld_length, "%u");
++ pv(ld, ld_data1, "%u");
++ pv(ld, ld_data2, "%u");
++
++ pa(ld, ld_reserved, 64);
++}
++
++/**
++ * gfs_block_tag_in - Read in a block tag
++ * @tag: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_block_tag_in(struct gfs_block_tag *tag, char *buf)
++{
++ struct gfs_block_tag *str = (struct gfs_block_tag *)buf;
++
++ CPIN_64(tag, str, bt_blkno);
++ CPIN_32(tag, str, bt_flags);
++ CPIN_32(tag, str, bt_pad);
++}
++
++/**
++ * gfs_block_tag_out - Write out a block tag
++ * @tag: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_block_tag_out(struct gfs_block_tag *tag, char *buf)
++{
++ struct gfs_block_tag *str = (struct gfs_block_tag *)buf;
++
++ CPOUT_64(tag, str, bt_blkno);
++ CPOUT_32(tag, str, bt_flags);
++ CPOUT_32(tag, str, bt_pad);
++}
++
++/**
++ * gfs_block_tag_print - Print out a block tag
++ * @tag: the cpu-order buffer
++ *
++ */
++
++void
++gfs_block_tag_print(struct gfs_block_tag *tag)
++{
++ pv(tag, bt_blkno, "%"PRIu64);
++ pv(tag, bt_flags, "%u");
++ pv(tag, bt_pad, "%u");
++}
++
++/**
++ * gfs_quota_tag_in - Read in a quota tag
++ * @tag: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_quota_tag_in(struct gfs_quota_tag *tag, char *buf)
++{
++ struct gfs_quota_tag *str = (struct gfs_quota_tag *)buf;
++
++ CPIN_64(tag, str, qt_change);
++ CPIN_32(tag, str, qt_flags);
++ CPIN_32(tag, str, qt_id);
++}
++
++/**
++ * gfs_quota_tag_out - Write out a quota tag
++ * @tag: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_quota_tag_out(struct gfs_quota_tag *tag, char *buf)
++{
++ struct gfs_quota_tag *str = (struct gfs_quota_tag *)buf;
++
++ CPOUT_64(tag, str, qt_change);
++ CPOUT_32(tag, str, qt_flags);
++ CPOUT_32(tag, str, qt_id);
++}
++
++/**
++ * gfs_quota_tag_print - Print out a quota tag
++ * @tag: the cpu-order buffer
++ *
++ */
++
++void
++gfs_quota_tag_print(struct gfs_quota_tag *tag)
++{
++ pv(tag, qt_change, "%"PRId64);
++ pv(tag, qt_flags, "0x%.8X");
++ pv(tag, qt_id, "%u");
++}
++
++/**
++ * gfs_ea_header_in - Read in an Extended Attribute header
++ * @ea: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_ea_header_in(struct gfs_ea_header *ea, char *buf)
++{
++ struct gfs_ea_header *str = (struct gfs_ea_header *)buf;
++
++ CPIN_32(ea, str, ea_rec_len);
++ CPIN_32(ea, str, ea_data_len);
++ ea->ea_name_len = str->ea_name_len;
++ ea->ea_type = str->ea_type;
++ ea->ea_flags = str->ea_flags;
++ ea->ea_num_ptrs = str->ea_num_ptrs;
++ CPIN_32(ea, str, ea_pad);
++}
++
++/**
++ * gfs_ea_header_out - Write out an Extended Attribute header
++ * @ea: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_ea_header_out(struct gfs_ea_header *ea, char *buf)
++{
++ struct gfs_ea_header *str = (struct gfs_ea_header *)buf;
++
++ CPOUT_32(ea, str, ea_rec_len);
++ CPOUT_32(ea, str, ea_data_len);
++ str->ea_name_len = ea->ea_name_len;
++ str->ea_type = ea->ea_type;
++ str->ea_flags = ea->ea_flags;
++ str->ea_num_ptrs = ea->ea_num_ptrs;
++ CPOUT_32(ea, str, ea_pad);
++}
++
++/**
++ * gfs_ea_header_print - Print out an Extended Attribute header
++ * @ea: the cpu-order buffer
++ *
++ */
++
++void
++gfs_ea_header_print(struct gfs_ea_header *ea)
++{
++ pv(ea, ea_rec_len, "%u");
++ pv(ea, ea_data_len, "%u");
++ pv(ea, ea_name_len, "%u");
++ pv(ea, ea_type, "%u");
++ pv(ea, ea_flags, "%u");
++ pv(ea, ea_num_ptrs, "%u");
++ pv(ea, ea_pad, "%u");
++}
++
++static const uint32_t crc_32_tab[] =
++{
++ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
++ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
++ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
++ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
++ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
++ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
++ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
++ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
++ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
++ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
++ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
++ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
++ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
++ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
++ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
++ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
++ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
++ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
++ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
++ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
++ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
++ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
++ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
++ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
++ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
++ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
++ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
++ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
++ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
++ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
++ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
++ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
++};
++
++/**
++ * gfs_dir_hash - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * The hash function is a 32-bit CRC of the data. The algorithm uses
++ * the crc_32_tab table above.
++ *
++ * This may not be the fastest hash function, but it does a fair bit better
++ * at providing uniform results than the others I've looked at. That's
++ * really important for efficient directories.
++ *
++ * Returns: the hash
++ */
++
++uint32_t
++gfs_dir_hash(const char *data, int len)
++{
++ uint32_t hash = 0xFFFFFFFF;
++
++ for (; len--; data++)
++ hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
++
++ hash = ~hash;
++
++ return hash;
++}
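++
++/*
++ * Illustrative use (a sketch, not part of the patch): a dirent's
++ * de_hash is this CRC of its name, and the hash-table slot is taken
++ * from the top di_depth bits, assumed along the lines of:
++ *
++ *	uint32_t hash = gfs_dir_hash(name, len);
++ *	uint32_t slot = hash >> (32 - di_depth);
++ */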
++
++#endif /* WANT_GFS_CONVERSION_FUNCTIONS */
++
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/group.c linux-patched/fs/gfs_locking/lock_dlm/group.c
+--- linux-orig/fs/gfs_locking/lock_dlm/group.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/group.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,776 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/socket.h>
++#include <net/sock.h>
++
++#include "lock_dlm.h"
++#include <cluster/cnxman.h>
++#include <cluster/service.h>
++
++
++struct kcl_service_ops mg_ops;
++
++/*
++ * Get the node struct for a given nodeid.
++ */
++
++static dlm_node_t *find_node_by_nodeid(dlm_t *dlm, uint32_t nodeid)
++{
++ dlm_node_t *node;
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ if (node->nodeid == nodeid)
++ return node;
++ }
++ return NULL;
++}
++
++/*
++ * Get the node struct for a given journalid.
++ */
++
++static dlm_node_t *find_node_by_jid(dlm_t *dlm, uint32_t jid)
++{
++ dlm_node_t *node;
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ if (node->jid == jid)
++ return node;
++ }
++ return NULL;
++}
++
++/*
++ * If the given ID is clear, get it, setting to the given VALUE. The ID is a
++ * journalid, the VALUE is our nodeid. When successful, the held ID-lock is
++ * returned (in shared mode). As long as this ID-lock is held, the journalid
++ * is owned.
++ */
++
++static int id_test_and_set(dlm_t *dlm, uint32_t id, uint32_t val,
++ dlm_lock_t **lp_set)
++{
++ dlm_lock_t *lp = NULL;
++ struct lm_lockname name;
++ lm_lock_t *lock;
++ char *lvb;
++ uint32_t exist_val, beval;
++ int error;
++
++ name.ln_type = LM_TYPE_JID;
++ name.ln_number = id;
++
++ error = lm_dlm_get_lock(dlm, &name, &lock);
++ if (error)
++ goto fail;
++
++ error = lm_dlm_hold_lvb(lock, &lvb);
++ if (error)
++ goto fail_put;
++
++ lp = (dlm_lock_t *) lock;
++ set_bit(LFL_IDLOCK, &lp->flags);
++
++ retry:
++
++ error = lm_dlm_lock_sync(lock, LM_ST_UNLOCKED, LM_ST_SHARED,
++ LM_FLAG_TRY | LM_FLAG_NOEXP);
++ if (error == -EAGAIN) {
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ goto retry;
++ }
++ if (error)
++ goto fail_unhold;
++
++ memcpy(&beval, lvb, sizeof(beval));
++ exist_val = be32_to_cpu(beval);
++
++ if (!exist_val) {
++ /*
++ * This id is unused. Attempt to claim it by getting EX mode
++ * and writing our nodeid into the lvb.
++ */
++ error = lm_dlm_lock_sync(lock, LM_ST_SHARED, LM_ST_EXCLUSIVE,
++ LM_FLAG_TRY | LM_FLAG_NOEXP);
++ if (error == -EAGAIN) {
++ lm_dlm_unlock_sync(lock, LM_ST_SHARED);
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ goto retry;
++ }
++ if (error)
++ goto fail_unlock;
++
++ beval = cpu_to_be32(val);
++ memcpy(lvb, &beval, sizeof(beval));
++
++ error = lm_dlm_lock_sync(lock, LM_ST_EXCLUSIVE, LM_ST_SHARED,
++ LM_FLAG_NOEXP);
++ DLM_ASSERT(!error,);
++
++ *lp_set = lp;
++ error = 0;
++ } else {
++ /*
++ * This id is already used. It has a non-zero nodeid in the lvb.
++ */
++ lm_dlm_unlock_sync(lock, LM_ST_SHARED);
++ lm_dlm_unhold_lvb(lock, lvb);
++ lm_dlm_put_lock(lock);
++ error = exist_val;
++ }
++
++ return error;
++
++ fail_unlock:
++ lm_dlm_unlock_sync(lock, LM_ST_SHARED);
++
++ fail_unhold:
++ lm_dlm_unhold_lvb(lock, lvb);
++
++ fail_put:
++ lm_dlm_put_lock(lock);
++
++ fail:
++ return error;
++}
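++
++/*
++ * Call pattern for id_test_and_set() (claim_jid() below is the real
++ * caller; this sketch just spells out the three-way return):
++ *
++ *	error = id_test_and_set(dlm, id, dlm->our_nodeid, &lp);
++ *	if (error < 0)	... a lock operation failed ...
++ *	if (error > 0)	... id already taken; error is the owner's nodeid ...
++ *	if (error == 0)	... id claimed; lp holds the shared ID-lock ...
++ */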
++
++/*
++ * Release a held ID-lock clearing its VALUE. We have to acquire the lock in
++ * EX again so we can write out a zeroed lvb.
++ */
++
++static void id_clear(dlm_t *dlm, dlm_lock_t *lp)
++{
++ lm_lock_t *lock = (lm_lock_t *) lp;
++ int error;
++
++ /*
++ * This flag means that DLM_LKF_CONVDEADLK should not be used.
++ */
++ set_bit(LFL_FORCE_PROMOTE, &lp->flags);
++
++ retry:
++
++ error = lm_dlm_lock_sync(lock, LM_ST_SHARED, LM_ST_EXCLUSIVE,
++ LM_FLAG_TRY | LM_FLAG_NOEXP);
++ if (error == -EAGAIN) {
++ schedule();
++ goto retry;
++ }
++ if (error)
++ goto end;
++
++ memset(lp->lvb, 0, DLM_LVB_LEN);
++ lm_dlm_unlock_sync(lock, LM_ST_EXCLUSIVE);
++
++ end:
++ lm_dlm_unhold_lvb(lock, lp->lvb);
++ lm_dlm_put_lock(lock);
++}
++
++/*
++ * Get the VALUE for a given ID. The ID is a journalid, the VALUE is a nodeid.
++ */
++
++static int id_value(dlm_t *dlm, uint32_t id, uint32_t *val)
++{
++ dlm_lock_t *lp = NULL;
++ struct lm_lockname name;
++ lm_lock_t *lock;
++ char *lvb;
++ uint32_t beval;
++ int error;
++
++ name.ln_type = LM_TYPE_JID;
++ name.ln_number = id;
++
++ error = lm_dlm_get_lock(dlm, &name, &lock);
++ if (error)
++ goto out;
++
++ error = lm_dlm_hold_lvb(lock, &lvb);
++ if (error)
++ goto out_put;
++
++ lp = (dlm_lock_t *) lock;
++ set_bit(LFL_IDLOCK, &lp->flags);
++
++ retry:
++
++ error = lm_dlm_lock_sync(lock, LM_ST_UNLOCKED, LM_ST_SHARED,
++ LM_FLAG_TRY | LM_FLAG_NOEXP);
++ if (error == -EAGAIN) {
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ goto retry;
++ }
++ if (error)
++ goto out_unhold;
++
++ memcpy(&beval, lvb, sizeof(beval));
++ *val = be32_to_cpu(beval);
++
++ lm_dlm_unlock_sync(lock, LM_ST_SHARED);
++
++ error = 0;
++
++ out_unhold:
++ lm_dlm_unhold_lvb(lock, lvb);
++
++ out_put:
++ lm_dlm_put_lock(lock);
++
++ out:
++ return error;
++}
++
++/*
++ * Find an ID with a given VALUE. The ID is a journalid, the VALUE is a
++ * nodeid.
++ */
++
++static int id_find(dlm_t *dlm, uint32_t value, uint32_t *id_out)
++{
++ uint32_t val, id;
++ int error = 0, found = FALSE;
++
++ for (id = 0; id < dlm->max_nodes; id++) {
++ error = id_value(dlm, id, &val);
++ if (error)
++ break;
++
++ if (val == value) {
++ *id_out = id;
++ error = 0;
++ found = TRUE;
++ break;
++ }
++ }
++
++ if (!error && !found)
++ error = -ENOENT;
++
++ return error;
++}
++
++/*
++ * Get a journalid to use. The journalid must be owned exclusively as long as
++ * this fs is mounted. Other nodes must be able to discover our nodeid as the
++ * owner of the journalid. The journalid we claim should have the lowest value
++ * of all unused journalids.
++ */
++
++static int claim_jid(dlm_t *dlm)
++{
++ dlm_node_t *node;
++ uint32_t id;
++ int error = 0;
++
++ DLM_ASSERT(dlm->our_nodeid,);
++
++ /*
++ * Search an arbitrary number (8) of ids past max_nodes so we're sure
++ * to find a free one; if the id we end up with is out of range, GFS
++ * will handle the "too big jid" error and fail the mount.
++ */
++
++ for (id = 0; id < dlm->max_nodes + 8; id++) {
++ error = id_test_and_set(dlm, id, dlm->our_nodeid, &dlm->jid_lock);
++ if (error < 0)
++ break;
++ if (error > 0)
++ continue;
++
++ dlm->jid = id;
++ node = find_node_by_nodeid(dlm, dlm->our_nodeid);
++ node->jid = id;
++ set_bit(NFL_HAVE_JID, &node->flags);
++ break;
++ }
++
++ /*
++ * If we have a problem getting a jid, pick a bogus one which should
++ * cause GFS to complain and fail to mount.
++ */
++
++ if (error) {
++ printk("lock_dlm: %s: no journal id available (%d)\n",
++ dlm->fsname, error);
++ dlm->jid = dlm->max_nodes + dlm->our_nodeid;
++ }
++
++ log_debug("claim_jid %u", dlm->jid);
++ return 0;
++}
++
++/*
++ * Release our journalid, allowing it to be used by a node subsequently
++ * mounting the fs.
++ */
++
++static void release_jid(dlm_t *dlm)
++{
++ id_clear(dlm, dlm->jid_lock);
++ dlm->jid_lock = NULL;
++}
++
++/*
++ * For all nodes in the mountgroup, find the journalid being used by each.
++ */
++
++static int discover_jids(dlm_t *dlm)
++{
++ dlm_node_t *node;
++ uint32_t id;
++ int error, notfound = 0;
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ if (test_bit(NFL_HAVE_JID, &node->flags))
++ continue;
++
++ error = id_find(dlm, node->nodeid, &id);
++ if (error) {
++ log_debug("jid for node %d not found", node->nodeid);
++ notfound++;
++ continue;
++ }
++
++ node->jid = id;
++ set_bit(NFL_HAVE_JID, &node->flags);
++ }
++
++ return notfound;
++}
++
++/*
++ * Discover the nodeid that we've been assigned by the cluster manager.
++ */
++
++static int get_our_nodeid(dlm_t *dlm)
++{
++ LIST_HEAD(cur_memb);
++ struct kcl_cluster_node *cur_node;
++
++ kcl_get_members(&cur_memb);
++
++ list_for_each_entry(cur_node, &cur_memb, list) {
++ if (cur_node->us) {
++ dlm->our_nodeid = cur_node->node_id;
++ break;
++ }
++ }
++
++ while (!list_empty(&cur_memb)) {
++ cur_node = list_entry(cur_memb.next, struct kcl_cluster_node,
++ list);
++ list_del(&cur_node->list);
++ kfree(cur_node);
++ }
++
++ return 0;
++}
++
++/*
++ * Run in dlm_async thread
++ */
++
++void process_start(dlm_t *dlm, dlm_start_t *ds)
++{
++ dlm_node_t *node;
++ uint32_t nodeid;
++ int last_stop, last_start, error, i, new = FALSE, found;
++
++
++ log_debug("start c %d type %d e %d", ds->count, ds->type, ds->event_id);
++
++ /*
++ * gfs won't do journal recoveries once it's sent us an unmount
++ */
++
++ if (test_bit(DFL_UMOUNT, &dlm->flags)) {
++ log_debug("process_start %d skip for umount", ds->event_id);
++ kcl_start_done(dlm->mg_local_id, ds->event_id);
++ goto out;
++ }
++
++ /*
++ * check if first start
++ */
++
++ if (!test_and_set_bit(DFL_GOT_NODEID, &dlm->flags)) {
++ get_our_nodeid(dlm);
++ if (ds->count == 1)
++ set_bit(DFL_FIRST_MOUNT, &dlm->flags);
++ }
++
++ down(&dlm->mg_nodes_lock);
++
++ /*
++ * find nodes which are gone
++ */
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ found = FALSE;
++ for (i = 0; i < ds->count; i++) {
++ if (node->nodeid != ds->nodeids[i])
++ continue;
++ found = TRUE;
++ break;
++ }
++
++ /* node is still a member */
++ if (found)
++ continue;
++
++ set_bit(NFL_NOT_MEMBER, &node->flags);
++
++ /* no gfs recovery needed for nodes that left cleanly */
++ if (ds->type != SERVICE_NODE_FAILED)
++ continue;
++
++ /* callbacks sent only for nodes in last completed MG */
++ if (!test_bit(NFL_LAST_FINISH, &node->flags))
++ continue;
++
++ /* only send a single callback per node */
++ if (test_and_set_bit(NFL_SENT_CB, &node->flags))
++ continue;
++
++ dlm->fscb(dlm->fsdata, LM_CB_NEED_RECOVERY, &node->jid);
++ set_bit(DFL_NEED_STARTDONE, &dlm->flags);
++ log_debug("cb_need_recovery jid %u", node->jid);
++ }
++
++ /*
++ * add new nodes
++ */
++
++ for (i = 0; i < ds->count; i++) {
++ nodeid = ds->nodeids[i];
++
++ node = find_node_by_nodeid(dlm, nodeid);
++ if (node)
++ continue;
++
++ DLM_RETRY(node = kmalloc(sizeof(dlm_node_t), GFP_KERNEL), node);
++
++ memset(node, 0, sizeof(dlm_node_t));
++
++ node->nodeid = nodeid;
++ list_add(&node->list, &dlm->mg_nodes);
++ new = TRUE;
++ }
++
++ up(&dlm->mg_nodes_lock);
++
++ /*
++ * get a jid for ourself when started for first time
++ */
++
++ if (!test_and_set_bit(DFL_HAVE_JID, &dlm->flags))
++ claim_jid(dlm);
++ else if (new) {
++ /* give new nodes a little time to claim a jid */
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(HZ);
++ }
++
++ /*
++ * find jid's of new nodes
++ */
++
++ for (;;) {
++ /* we don't need to do these jid lookups if this start has been
++ followed by a stop event (and thus cancelled) */
++
++ spin_lock(&dlm->async_lock);
++ last_stop = dlm->mg_last_stop;
++ last_start = dlm->mg_last_start;
++ spin_unlock(&dlm->async_lock);
++
++ if (last_stop >= ds->event_id)
++ break;
++
++ error = discover_jids(dlm);
++ if (error) {
++ /* Not all jids were found. Wait for a time to let all
++ new nodes claim_jid, then try to scan for jids
++ again. */
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(HZ);
++ continue;
++ }
++ break;
++ }
++
++ /*
++ * tell SM we're done if there are no GFS recoveries to wait for
++ */
++
++ if (last_start > last_stop) {
++ error = 0;
++ down(&dlm->mg_nodes_lock);
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ if (!test_bit(NFL_SENT_CB, &node->flags))
++ continue;
++ error = 1;
++ break;
++ }
++ up(&dlm->mg_nodes_lock);
++
++ if (!error)
++ kcl_start_done(dlm->mg_local_id, ds->event_id);
++ }
++
++ out:
++ kfree(ds->nodeids);
++ kfree(ds);
++}
++
++void process_finish(dlm_t *dlm)
++{
++ struct list_head *tmp, *tmpsafe;
++ dlm_node_t *node;
++ dlm_lock_t *lp;
++
++ spin_lock(&dlm->async_lock);
++ clear_bit(DFL_BLOCK_LOCKS, &dlm->flags);
++
++ list_for_each_safe(tmp, tmpsafe, &dlm->delayed) {
++ lp = list_entry(tmp, dlm_lock_t, dlist);
++
++ if (lp->type != QUEUE_LOCKS_BLOCKED)
++ continue;
++
++ lp->type = 0;
++ list_del(&lp->dlist);
++ list_add_tail(&lp->slist, &dlm->submit);
++
++ clear_bit(LFL_DLIST, &lp->flags);
++ set_bit(LFL_SLIST, &lp->flags);
++ }
++ spin_unlock(&dlm->async_lock);
++
++ down(&dlm->mg_nodes_lock);
++
++ list_for_each_safe(tmp, tmpsafe, &dlm->mg_nodes) {
++ node = list_entry(tmp, dlm_node_t, list);
++
++ if (test_bit(NFL_NOT_MEMBER, &node->flags)) {
++ list_del(&node->list);
++ kfree(node);
++ } else
++ set_bit(NFL_LAST_FINISH, &node->flags);
++ }
++ up(&dlm->mg_nodes_lock);
++
++ wake_up(&dlm->wait);
++}
++
++/*
++ * Run in user process
++ */
++
++int init_mountgroup(dlm_t *dlm)
++{
++ int error;
++ int id;
++
++ error = kcl_register_service(dlm->fsname, dlm->fnlen, SERVICE_LEVEL_GFS,
++ &mg_ops, TRUE, (void *) dlm, &id);
++ if (error)
++ goto out;
++
++ dlm->mg_local_id = id;
++
++ /* BLOCK_LOCKS is cleared when the join is finished */
++ set_bit(DFL_BLOCK_LOCKS, &dlm->flags);
++
++ error = kcl_join_service(id);
++ if (error)
++ goto out_unreg;
++
++ if (test_bit(DFL_START_ERROR, &dlm->flags))
++ goto out_leave;
++
++ return 0;
++
++ out_leave:
++ kcl_leave_service(dlm->mg_local_id);
++
++ out_unreg:
++ kcl_unregister_service(id);
++
++ out:
++ printk("lock_dlm: service error %d\n", error);
++ return error;
++}
++
++void release_mountgroup(dlm_t *dlm)
++{
++ int last_start, last_stop;
++
++ /* this flag causes a kcl_start_done() to be sent right away for
++ any start callbacks we get from SM */
++
++ log_debug("umount flags %lx", dlm->flags);
++ set_bit(DFL_UMOUNT, &dlm->flags);
++
++ /* gfs has done an unmount and will no longer call
++ jid_recovery_done(), so make any necessary kcl_start_done()
++ calls here to let kcl_leave_service() complete */
++
++ spin_lock(&dlm->async_lock);
++ last_start = dlm->mg_last_start;
++ last_stop = dlm->mg_last_stop;
++ spin_unlock(&dlm->async_lock);
++
++ if ((last_start > last_stop) &&
++ test_and_clear_bit(DFL_NEED_STARTDONE, &dlm->flags)) {
++ log_debug("umount doing start_done %d", last_start);
++ kcl_start_done(dlm->mg_local_id, last_start);
++ }
++
++ kcl_leave_service(dlm->mg_local_id);
++ kcl_unregister_service(dlm->mg_local_id);
++ release_jid(dlm);
++}
++
++/*
++ * Run in GFS thread
++ */
++
++void jid_recovery_done(dlm_t *dlm, unsigned int jid, unsigned int message)
++{
++ dlm_node_t *node;
++ int last_start, last_stop;
++ int remain = 0;
++
++ log_debug("recovery_done jid %u msg %u", jid, message);
++
++ node = find_node_by_jid(dlm, jid);
++ if (!node)
++ goto out;
++
++ log_debug("recovery_done %u,%u f %lx", jid, node->nodeid, node->flags);
++
++ if (!test_bit(NFL_SENT_CB, &node->flags))
++ goto out;
++
++ if (!test_bit(NFL_NOT_MEMBER, &node->flags))
++ goto out;
++
++ set_bit(NFL_RECOVERY_DONE, &node->flags);
++
++ /*
++ * when recovery is done for all nodes, we're done with the start
++ */
++
++ down(&dlm->mg_nodes_lock);
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ if (test_bit(NFL_SENT_CB, &node->flags) &&
++ !test_bit(NFL_RECOVERY_DONE, &node->flags))
++ remain++;
++ }
++ up(&dlm->mg_nodes_lock);
++
++ if (!remain) {
++ /* don't send a start_done if there's since been a stop which
++ * cancels this start */
++
++ spin_lock(&dlm->async_lock);
++ last_start = dlm->mg_last_start;
++ last_stop = dlm->mg_last_stop;
++ spin_unlock(&dlm->async_lock);
++
++ if (last_start > last_stop) {
++ log_debug("recovery_done start_done %d", last_start);
++ kcl_start_done(dlm->mg_local_id, last_start);
++ clear_bit(DFL_NEED_STARTDONE, &dlm->flags);
++ }
++ }
++
++ out:
++ return;
++}
++
++/*
++ * Run in CMAN SM thread
++ */
++
++static void queue_start(dlm_t *dlm, uint32_t *nodeids, int count,
++ int event_id, int type)
++{
++ dlm_start_t *ds;
++
++ DLM_RETRY(ds = kmalloc(sizeof(dlm_start_t), GFP_KERNEL), ds);
++
++ memset(ds, 0, sizeof(dlm_start_t));
++
++ ds->nodeids = nodeids;
++ ds->count = count;
++ ds->event_id = event_id;
++ ds->type = type;
++
++ spin_lock(&dlm->async_lock);
++ dlm->mg_last_start = event_id;
++ list_add_tail(&ds->list, &dlm->starts);
++ spin_unlock(&dlm->async_lock);
++
++ wake_up(&dlm->wait);
++}
++
++static int mg_stop(void *data)
++{
++ dlm_t *dlm = (dlm_t *) data;
++
++ spin_lock(&dlm->async_lock);
++ set_bit(DFL_BLOCK_LOCKS, &dlm->flags);
++ dlm->mg_last_stop = dlm->mg_last_start;
++ spin_unlock(&dlm->async_lock);
++
++ return 0;
++}
++
++static int mg_start(void *data, uint32_t *nodeids, int count, int event_id,
++ int type)
++{
++ dlm_t *dlm = (dlm_t *) data;
++
++ queue_start(dlm, nodeids, count, event_id, type);
++
++ return 0;
++}
++
++static void mg_finish(void *data, int event_id)
++{
++ dlm_t *dlm = (dlm_t *) data;
++
++ spin_lock(&dlm->async_lock);
++ dlm->mg_last_finish = event_id;
++ set_bit(DFL_MG_FINISH, &dlm->flags);
++ spin_unlock(&dlm->async_lock);
++
++ wake_up(&dlm->wait);
++}
++
++struct kcl_service_ops mg_ops = {
++ .stop = mg_stop,
++ .start = mg_start,
++ .finish = mg_finish
++};
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/lock.c linux-patched/fs/gfs_locking/lock_dlm/lock.c
+--- linux-orig/fs/gfs_locking/lock_dlm/lock.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/lock.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,561 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "lock_dlm.h"
++
++/*
++ * Run in DLM thread
++ */
++
++static void queue_complete(dlm_lock_t *lp)
++{
++ dlm_t *dlm = lp->dlm;
++
++ clear_bit(LFL_WAIT_COMPLETE, &lp->flags);
++
++ spin_lock(&dlm->async_lock);
++ list_add_tail(&lp->clist, &dlm->complete);
++ set_bit(LFL_CLIST, &lp->flags);
++ spin_unlock(&dlm->async_lock);
++ wake_up(&dlm->wait);
++}
++
++static void queue_blocking(dlm_lock_t *lp, int mode)
++{
++ dlm_t *dlm = lp->dlm;
++
++ if (test_bit(LFL_WAIT_COMPLETE, &lp->flags)) {
++ /* We often receive basts for EX while we're promoting
++ from SH to EX. */
++ /* printk("lock_dlm: bast before complete %x,%"PRIx64" "
++ "gr=%d rq=%d bast=%d\n", lp->lockname.ln_type,
++ lp->lockname.ln_number, lp->cur, lp->req, mode); */
++ return;
++ }
++
++ spin_lock(&dlm->async_lock);
++
++ if (!lp->bast_mode) {
++ list_add_tail(&lp->blist, &dlm->blocking);
++ set_bit(LFL_BLIST, &lp->flags);
++ lp->bast_mode = mode;
++ } else if (lp->bast_mode < mode)
++ lp->bast_mode = mode;
++
++ spin_unlock(&dlm->async_lock);
++ wake_up(&dlm->wait);
++}
++
++static __inline__ void lock_ast(void *astargs)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) astargs;
++ queue_complete(lp);
++}
++
++static __inline__ void lock_bast(void *astargs, int mode)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) astargs;
++ queue_blocking(lp, mode);
++}
++
++/*
++ * Run in GFS or user thread
++ */
++
++/**
++ * queue_delayed - add request to queue to be submitted later
++ * @lp: DLM lock
++ * @type: the reason the lock is blocked
++ *
++ * Queue of locks which need submitting sometime later. Locks queued
++ * here due to QUEUE_LOCKS_BLOCKED are moved to the submit queue when
++ * recovery is done. Locks queued here due to an ERROR are resubmitted
++ * after some delay. This could also be called from the dlm_async thread.
++ */
++
++void queue_delayed(dlm_lock_t *lp, int type)
++{
++ dlm_t *dlm = lp->dlm;
++
++ lp->type = type;
++
++ spin_lock(&dlm->async_lock);
++ list_add_tail(&lp->dlist, &dlm->delayed);
++ set_bit(LFL_DLIST, &lp->flags);
++ spin_unlock(&dlm->async_lock);
++}
++
++/**
++ * make_mode - convert to DLM_LOCK_
++ * @lmstate: GFS lock state
++ *
++ * Returns: DLM lock mode
++ */
++
++static int16_t make_mode(int16_t lmstate)
++{
++ switch (lmstate) {
++ case LM_ST_UNLOCKED:
++ return DLM_LOCK_NL;
++ case LM_ST_EXCLUSIVE:
++ return DLM_LOCK_EX;
++ case LM_ST_DEFERRED:
++ return DLM_LOCK_CW;
++ case LM_ST_SHARED:
++ return DLM_LOCK_PR;
++ default:
++ DLM_ASSERT(0, printk("unknown LM state %d\n", lmstate););
++ }
++}
++
++/**
++ * make_lmstate - convert to LM_ST_
++ * @dlmmode: DLM lock mode
++ *
++ * Returns: GFS lock state
++ */
++
++int16_t make_lmstate(int16_t dlmmode)
++{
++ switch (dlmmode) {
++ case DLM_LOCK_IV:
++ case DLM_LOCK_NL:
++ return LM_ST_UNLOCKED;
++ case DLM_LOCK_EX:
++ return LM_ST_EXCLUSIVE;
++ case DLM_LOCK_CW:
++ return LM_ST_DEFERRED;
++ case DLM_LOCK_PR:
++ return LM_ST_SHARED;
++ default:
++ DLM_ASSERT(0, printk("unknown DLM mode %d\n", dlmmode););
++ }
++}
++
++/**
++ * check_cur_state - verify agreement with GFS on the current lock state
++ * @lp: the DLM lock
++ * @cur_state: the current lock state from GFS
++ *
++ * NB: DLM_LOCK_NL and DLM_LOCK_IV are both considered
++ * LM_ST_UNLOCKED by GFS.
++ *
++ */
++
++static void check_cur_state(dlm_lock_t *lp, unsigned int cur_state)
++{
++ int16_t cur = make_mode(cur_state);
++ if (lp->cur != DLM_LOCK_IV)
++ DLM_ASSERT(lp->cur == cur, printk("%d, %d\n", lp->cur, cur););
++}
++
++/**
++ * make_flags - put together necessary DLM flags
++ * @lp: DLM lock
++ * @gfs_flags: GFS flags
++ * @cur: current DLM lock mode
++ * @req: requested DLM lock mode
++ *
++ * Returns: DLM flags
++ */
++
++static unsigned int make_flags(dlm_lock_t *lp, unsigned int gfs_flags,
++ int16_t cur, int16_t req)
++{
++ unsigned int lkf = 0;
++
++ if (gfs_flags & LM_FLAG_TRY)
++ lkf |= DLM_LKF_NOQUEUE;
++
++ if (gfs_flags & LM_FLAG_TRY_1CB) {
++ lkf |= DLM_LKF_NOQUEUE;
++ lkf |= DLM_LKF_NOQUEUEBAST;
++ }
++
++ if (lp->lksb.sb_lkid != 0) {
++ lkf |= DLM_LKF_CONVERT;
++
++ if (gfs_flags & LM_FLAG_PRIORITY)
++ lkf |= DLM_LKF_EXPEDITE;
++ else if (req > cur)
++ lkf |= DLM_LKF_QUECVT;
++
++ /* Conversion deadlock avoidance by DLM */
++
++ if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
++ cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
++ lkf |= DLM_LKF_CONVDEADLK;
++ }
++
++ if (lp->lvb)
++ lkf |= DLM_LKF_VALBLK;
++
++ return lkf;
++}
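++
++/*
++ * Worked example (derived from the logic above): converting an existing
++ * lock (sb_lkid != 0) from DLM_LOCK_PR up to DLM_LOCK_EX with no GFS
++ * flags yields
++ *
++ *	lkf = DLM_LKF_CONVERT | DLM_LKF_QUECVT | DLM_LKF_CONVDEADLK;
++ *
++ * plus DLM_LKF_VALBLK when an lvb is attached, and without CONVDEADLK
++ * when LFL_FORCE_PROMOTE is set (as id_clear() does in group.c).
++ */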
++
++/**
++ * make_strname - convert GFS lock numbers to string
++ * @lockname: the lock type/number
++ * @str: the lock string/length
++ *
++ */
++
++static __inline__ void make_strname(struct lm_lockname *lockname,
++ strname_t *str)
++{
++ sprintf(str->name, "%8x%16"PRIx64, lockname->ln_type,
++ lockname->ln_number);
++ str->namelen = LOCK_DLM_STRNAME_BYTES;
++}
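++
++/*
++ * For example, lock type 0x3, number 0x17 becomes the 24-character
++ * string "       3              17": 8 hex digits (space-padded) for
++ * the type followed by 16 for the number.
++ */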
++
++int create_lp(dlm_t *dlm, struct lm_lockname *name, dlm_lock_t **lpp)
++{
++ dlm_lock_t *lp;
++
++ lp = kmalloc(sizeof(dlm_lock_t), GFP_KERNEL);
++ if (!lp)
++ return -ENOMEM;
++
++ memset(lp, 0, sizeof(dlm_lock_t));
++ lp->lockname = *name;
++ lp->dlm = dlm;
++ lp->cur = DLM_LOCK_IV;
++ init_completion(&lp->uast_wait);
++ *lpp = lp;
++ return 0;
++}
++
++/**
++ * dlm_get_lock - get an lm_lock_t given a description of the lock
++ * @lockspace: the lockspace the lock lives in
++ * @name: the name of the lock
++ * @lockp: return the lm_lock_t here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int lm_dlm_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ lm_lock_t **lockp)
++{
++ dlm_lock_t *lp;
++ int error;
++
++ error = create_lp((dlm_t *) lockspace, name, &lp);
++
++ *lockp = (lm_lock_t *) lp;
++ return error;
++}
++
++int do_unlock(dlm_lock_t *lp)
++{
++ int error;
++
++ init_completion(&lp->uast_wait);
++
++ set_bit(LFL_DLM_UNLOCK, &lp->flags);
++
++ error = dlm_unlock(lp->dlm->gdlm_lsp, lp->lksb.sb_lkid, 0, &lp->lksb,
++ (void *) lp);
++
++ DLM_ASSERT(!error, printk("%s: error=%d num=%x,%"PRIx64"\n",
++ lp->dlm->fsname, error, lp->lockname.ln_type,
++ lp->lockname.ln_number););
++
++ wait_for_completion(&lp->uast_wait);
++
++ spin_lock(&lp->dlm->async_lock);
++ if (test_bit(LFL_CLIST, &lp->flags)) {
++ printk("lock_dlm: dlm_put_lock lp on clist num=%x,%"PRIx64"\n", lp->lockname.ln_type, lp->lockname.ln_number);
++ list_del(&lp->clist);
++ }
++ if (test_bit(LFL_BLIST, &lp->flags)) {
++ printk("lock_dlm: dlm_put_lock lp on blist num=%x,%"PRIx64"\n",
++ lp->lockname.ln_type, lp->lockname.ln_number);
++ list_del(&lp->blist);
++ }
++ if (test_bit(LFL_DLIST, &lp->flags)) {
++ printk("lock_dlm: dlm_put_lock lp on dlist num=%x,%"PRIx64"\n",
++ lp->lockname.ln_type, lp->lockname.ln_number);
++ list_del(&lp->dlist);
++ }
++ if (test_bit(LFL_SLIST, &lp->flags)) {
++ printk("lock_dlm: dlm_put_lock lp on slist num=%x,%"PRIx64"\n",
++ lp->lockname.ln_type, lp->lockname.ln_number);
++ list_del(&lp->slist);
++ }
++ spin_unlock(&lp->dlm->async_lock);
++
++ return 0;
++}
++
++/**
++ * dlm_put_lock - get rid of a lock structure
++ * @lock: the lock to throw away
++ *
++ */
++
++void lm_dlm_put_lock(lm_lock_t *lock)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ /* only issue a DLM unlock if the lock was ever granted, but free
++ the structure in either case so a never-locked lock doesn't leak */
++ if (lp->cur != DLM_LOCK_IV)
++ do_unlock(lp);
++ kfree(lp);
++}
++
++/**
++ * do_lock - acquire a lock
++ * @lp: the DLM lock
++ * @range: optional range
++ */
++
++void do_lock(dlm_lock_t *lp, struct dlm_range *range)
++{
++ dlm_t *dlm = lp->dlm;
++ strname_t str;
++ int error;
++
++ /*
++ * When recovery is in progress, delay lock requests and submit them
++ * once recovery is done. Requests for recovery (NOEXP) and unlocks
++ * can pass.
++ */
++
++ if (test_bit(DFL_BLOCK_LOCKS, &dlm->flags) &&
++ !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
++ queue_delayed(lp, QUEUE_LOCKS_BLOCKED);
++ return;
++ }
++
++ /*
++ * Submit the actual lock request.
++ */
++
++ make_strname(&lp->lockname, &str);
++
++ set_bit(LFL_WAIT_COMPLETE, &lp->flags);
++
++ error = dlm_lock(dlm->gdlm_lsp, lp->req, &lp->lksb, lp->lkf, str.name,
++ str.namelen, 0, lock_ast, (void *) lp,
++ lp->posix ? NULL : lock_bast, range);
++
++ if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
++ lp->lksb.sb_status = -EAGAIN;
++ queue_complete(lp);
++ error = 0;
++ }
++
++ DLM_ASSERT(!error,
++ printk("%s: num=%x,%"PRIx64" err=%d cur=%d req=%d lkf=%x\n",
++ dlm->fsname, lp->lockname.ln_type,
++ lp->lockname.ln_number, error, lp->cur, lp->req,
++ lp->lkf););
++}
++
++/**
++ * lm_dlm_lock - acquire a lock
++ * @lock: the lock to manipulate
++ * @cur_state: the current state
++ * @req_state: the requested state
++ * @flags: modifier flags
++ *
++ * Returns: A bitmap of LM_OUT_* on success, -EXXX on failure
++ */
++
++unsigned int lm_dlm_lock(lm_lock_t *lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ if (flags & LM_FLAG_NOEXP)
++ set_bit(LFL_NOBLOCK, &lp->flags);
++
++ check_cur_state(lp, cur_state);
++ lp->req = make_mode(req_state);
++ lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
++
++ do_lock(lp, NULL);
++ return LM_OUT_ASYNC;
++}
++
++int lm_dlm_lock_sync(lm_lock_t *lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ init_completion(&lp->uast_wait);
++ lm_dlm_lock(lock, cur_state, req_state, flags);
++ wait_for_completion(&lp->uast_wait);
++
++ return lp->lksb.sb_status;
++}
++
++/**
++ * lm_dlm_unlock - unlock a lock
++ * @lock: the lock to manipulate
++ * @cur_state: the current state
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++unsigned int lm_dlm_unlock(lm_lock_t *lock, unsigned int cur_state)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ check_cur_state(lp, cur_state);
++ lp->req = DLM_LOCK_NL;
++ lp->lkf = make_flags(lp, 0, lp->cur, lp->req);
++
++ do_lock(lp, NULL);
++
++ return LM_OUT_ASYNC;
++}
++
++void lm_dlm_unlock_sync(lm_lock_t *lock, unsigned int cur_state)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ init_completion(&lp->uast_wait);
++ lm_dlm_unlock(lock, cur_state);
++ wait_for_completion(&lp->uast_wait);
++}
++
++/**
++ * dlm_cancel - cancel a request that is blocked due to DFL_BLOCK_LOCKS
++ * @lock: the lock to cancel request for
++ *
++ */
++
++void lm_dlm_cancel(lm_lock_t *lock)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++ int dlist = FALSE;
++
++ printk("lock_dlm: cancel num=%x,%"PRIx64"\n",
++ lp->lockname.ln_type, lp->lockname.ln_number);
++
++ spin_lock(&lp->dlm->async_lock);
++ if (test_and_clear_bit(LFL_DLIST, &lp->flags)) {
++ list_del(&lp->dlist);
++ lp->type = 0;
++ dlist = TRUE;
++ }
++ spin_unlock(&lp->dlm->async_lock);
++
++ if (dlist) {
++ set_bit(LFL_CANCEL, &lp->flags);
++ queue_complete(lp);
++ }
++}
++
++/**
++ * dlm_hold_lvb - hold on to a lock value block
++ * @lock: the lock the LVB is associated with
++ * @lvbp: return the lvb memory here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int lm_dlm_hold_lvb(lm_lock_t *lock, char **lvbp)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++ char *lvb;
++
++ lvb = kmalloc(DLM_LVB_SIZE, GFP_KERNEL);
++ if (!lvb)
++ return -ENOMEM;
++
++ memset(lvb, 0, DLM_LVB_SIZE);
++
++ lp->lksb.sb_lvbptr = lvb;
++ lp->lvb = lvb;
++ *lvbp = lvb;
++
++ return 0;
++}
++
++/**
++ * dlm_unhold_lvb - release a LVB
++ * @lock: the lock the LVB is associated with
++ * @lvb: the lock value block
++ *
++ */
++
++void lm_dlm_unhold_lvb(lm_lock_t *lock, char *lvb)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++ kfree(lvb);
++ lp->lvb = NULL;
++ lp->lksb.sb_lvbptr = NULL;
++}
++
++/**
++ * dlm_sync_lvb - sync out the value of an lvb
++ * @lock: the lock the LVB is associated with
++ * @lvb: the lock value block
++ *
++ */
++
++void lm_dlm_sync_lvb(lm_lock_t *lock, char *lvb)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ if (lp->cur != DLM_LOCK_EX)
++ return;
++
++ init_completion(&lp->uast_wait);
++ set_bit(LFL_SYNC_LVB, &lp->flags);
++
++ lp->req = DLM_LOCK_EX;
++ lp->lkf = make_flags(lp, 0, lp->cur, lp->req);
++
++ do_lock(lp, NULL);
++ wait_for_completion(&lp->uast_wait);
++}
++
++/**
++ * dlm_recovery_done - reset the expired locks for a given jid
++ * @lockspace: the lockspace
++ * @jid: the jid
++ *
++ */
++
++void lm_dlm_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
++ unsigned int message)
++{
++ jid_recovery_done((dlm_t *) lockspace, jid, message);
++}
++
++/*
++ * Run in dlm_async
++ */
++
++/**
++ * process_submit - make DLM lock requests from dlm_async thread
++ * @lp: DLM Lock
++ *
++ */
++
++void process_submit(dlm_lock_t *lp)
++{
++ struct dlm_range range, *r = NULL;
++
++ if (lp->posix) {
++ range.ra_start = lp->posix->start;
++ range.ra_end = lp->posix->end;
++ r = &range;
++ }
++
++ do_lock(lp, r);
++}
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/lock_dlm.h linux-patched/fs/gfs_locking/lock_dlm/lock_dlm.h
+--- linux-orig/fs/gfs_locking/lock_dlm/lock_dlm.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/lock_dlm.h 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,323 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef LOCK_DLM_DOT_H
++#define LOCK_DLM_DOT_H
++
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/spinlock.h>
++#include <linux/module.h>
++#include <linux/types.h>
++#include <linux/string.h>
++#include <linux/list.h>
++#include <linux/lm_interface.h>
++#include <cluster/dlm.h>
++
++/* We take a shortcut and use lm_lockname structs for internal locks. This
++ means we must be careful to keep these types different from those used in
++ lm_interface.h. */
++
++#define LM_TYPE_JID (0x10)
++#define LM_TYPE_PLOCK_UPDATE (0x11)
++
++#define DLM_LVB_SIZE (DLM_LVB_LEN)
++
++/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
++ We sprintf these numbers into a 24 byte string of hex values to make them
++ human-readable (to make debugging simpler). */
++
++#define LOCK_DLM_STRNAME_BYTES (24)
++
++#define LOCK_DLM_MAX_NODES (128)
++
++struct dlm;
++struct dlm_lock;
++struct dlm_node;
++struct dlm_start;
++struct strname;
++
++typedef struct dlm dlm_t;
++typedef struct dlm_lock dlm_lock_t;
++typedef struct dlm_node dlm_node_t;
++typedef struct dlm_start dlm_start_t;
++typedef struct strname strname_t;
++
++#define DFL_FIRST_MOUNT 0
++#define DFL_THREAD_STOP 1
++#define DFL_GOT_NODEID 2
++#define DFL_MG_FINISH 3
++#define DFL_HAVE_JID 4
++#define DFL_BLOCK_LOCKS 5
++#define DFL_START_ERROR 6
++#define DFL_UMOUNT 7
++#define DFL_NEED_STARTDONE 8
++
++struct dlm {
++ uint32_t jid;
++ uint32_t our_nodeid;
++ unsigned long flags;
++
++ int cnlen;
++ char * clustername;
++ int fnlen;
++ char * fsname;
++ int max_nodes;
++
++ dlm_lockspace_t * gdlm_lsp;
++
++ lm_callback_t fscb;
++ lm_fsdata_t * fsdata;
++ dlm_lock_t * jid_lock;
++
++ spinlock_t async_lock;
++ struct list_head complete;
++ struct list_head blocking;
++ struct list_head delayed;
++ struct list_head submit;
++ struct list_head starts;
++
++ wait_queue_head_t wait;
++ atomic_t threads;
++
++ int mg_local_id;
++ int mg_last_start;
++ int mg_last_stop;
++ int mg_last_finish;
++ struct list_head mg_nodes;
++ struct semaphore mg_nodes_lock;
++
++ struct list_head resources;
++ struct semaphore res_lock;
++};
++
++struct dlm_resource {
++ dlm_t * dlm;
++ struct list_head list; /* list of resources */
++ struct lm_lockname name; /* the resource name */
++ struct semaphore sema;
++ struct list_head locks; /* one lock for each range */
++ int count;
++ dlm_lock_t * update;
++ struct list_head async_locks;
++ spinlock_t async_spin;
++};
++
++struct posix_lock {
++ struct list_head list; /* resource locks list */
++ struct list_head async_list; /* resource async_locks list */
++ struct dlm_resource * resource;
++ dlm_lock_t * lp;
++ unsigned long owner;
++ uint64_t start;
++ uint64_t end;
++ int count;
++ int ex;
++};
++
++#define LFL_NOBLOCK 0
++#define LFL_NOCACHE 1
++#define LFL_UNLOCK_RECOVERY 2
++#define LFL_DLM_UNLOCK 3
++#define LFL_TRYFAILED 4
++#define LFL_SYNC_LVB 5
++#define LFL_FORCE_PROMOTE 6
++#define LFL_REREQUEST 7
++#define LFL_WAIT_COMPLETE 8
++#define LFL_CLIST 9
++#define LFL_BLIST 10
++#define LFL_DLIST 11
++#define LFL_SLIST 12
++#define LFL_IDLOCK 13
++#define LFL_CANCEL 14
++
++struct dlm_lock {
++ dlm_t * dlm;
++ struct lm_lockname lockname;
++ char * lvb;
++ struct dlm_lksb lksb;
++
++ int16_t cur;
++ int16_t req;
++ int16_t prev_req;
++ unsigned int lkf;
++ unsigned int type;
++ unsigned long flags;
++
++ int bast_mode; /* protected by async_lock */
++ struct completion uast_wait;
++
++ struct list_head clist; /* complete */
++ struct list_head blist; /* blocking */
++ struct list_head dlist; /* delayed */
++ struct list_head slist; /* submit */
++
++ struct posix_lock * posix;
++};
++
++#define NFL_SENT_CB 0
++#define NFL_NOT_MEMBER 1
++#define NFL_RECOVERY_DONE 2
++#define NFL_LAST_FINISH 3
++#define NFL_HAVE_JID 4
++
++struct dlm_node {
++ uint32_t nodeid;
++ uint32_t jid;
++ unsigned long flags;
++ struct list_head list;
++};
++
++#define QUEUE_LOCKS_BLOCKED 1
++#define QUEUE_ERROR_UNLOCK 2
++#define QUEUE_ERROR_LOCK 3
++#define QUEUE_ERROR_RETRY 4
++
++struct strname {
++ unsigned char name[LOCK_DLM_STRNAME_BYTES];
++ unsigned short namelen;
++};
++
++struct dlm_start {
++ uint32_t * nodeids;
++ int count;
++ int type;
++ int event_id;
++ struct list_head list;
++};
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#if (BITS_PER_LONG == 64)
++#define PRIu64 "lu"
++#define PRId64 "ld"
++#define PRIo64 "lo"
++#define PRIx64 "lx"
++#define PRIX64 "lX"
++#define SCNu64 "lu"
++#define SCNd64 "ld"
++#define SCNo64 "lo"
++#define SCNx64 "lx"
++#define SCNX64 "lX"
++#else
++#define PRIu64 "Lu"
++#define PRId64 "Ld"
++#define PRIo64 "Lo"
++#define PRIx64 "Lx"
++#define PRIX64 "LX"
++#define SCNu64 "Lu"
++#define SCNd64 "Ld"
++#define SCNo64 "Lo"
++#define SCNx64 "Lx"
++#define SCNX64 "LX"
++#endif
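++
++/* These mirror userspace <inttypes.h>, which is unavailable in the
++   kernel; e.g. printk("%"PRIu64"\n", v) expands to "%lu" on 64-bit
++   and "%Lu" on 32-bit kernels. */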
++
++extern struct lm_lockops lock_dlm_ops;
++
++/* group.c */
++
++int init_mountgroup(dlm_t * dlm);
++void release_mountgroup(dlm_t * dlm);
++void process_start(dlm_t * dlm, dlm_start_t * ds);
++void process_finish(dlm_t * dlm);
++void jid_recovery_done(dlm_t * dlm, unsigned int jid, unsigned int message);
++
++/* thread.c */
++
++int init_async_thread(dlm_t * dlm);
++void release_async_thread(dlm_t * dlm);
++
++/* lock.c */
++
++int16_t make_lmstate(int16_t dlmmode);
++void queue_delayed(dlm_lock_t * lp, int type);
++void process_submit(dlm_lock_t * lp);
++int create_lp(dlm_t *dlm, struct lm_lockname *name, dlm_lock_t **lpp);
++void do_lock(dlm_lock_t *lp, struct dlm_range *range);
++int do_unlock(dlm_lock_t *lp);
++
++int lm_dlm_get_lock(lm_lockspace_t * lockspace, struct lm_lockname * name,
++ lm_lock_t ** lockp);
++void lm_dlm_put_lock(lm_lock_t * lock);
++unsigned int lm_dlm_lock(lm_lock_t * lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags);
++int lm_dlm_lock_sync(lm_lock_t * lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags);
++unsigned int lm_dlm_unlock(lm_lock_t * lock, unsigned int cur_state);
++void lm_dlm_unlock_sync(lm_lock_t * lock, unsigned int cur_state);
++void lm_dlm_cancel(lm_lock_t * lock);
++int lm_dlm_hold_lvb(lm_lock_t * lock, char **lvbp);
++void lm_dlm_unhold_lvb(lm_lock_t * lock, char *lvb);
++void lm_dlm_sync_lvb(lm_lock_t * lock, char *lvb);
++void lm_dlm_recovery_done(lm_lockspace_t * lockspace, unsigned int jid,
++ unsigned int message);
++
++/* plock.c */
++
++int lm_dlm_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end);
++
++int lm_dlm_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, uint64_t start, uint64_t end);
++
++int lm_dlm_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, uint64_t *start, uint64_t *end,
++ int *ex, unsigned long *rowner);
++
++/* main.c */
++
++void lock_dlm_debug_log(const char *fmt, ...);
++void lock_dlm_debug_dump(void);
++
++
++#define LOCK_DLM_DEBUG
++
++#ifdef LOCK_DLM_DEBUG
++#define log_debug(fmt, args...) lock_dlm_debug_log(fmt, ##args)
++#else
++#define log_debug(fmt, args...)
++#endif
++
++#define DLM_ASSERT(x, do) \
++{ \
++ if (!(x)) \
++ { \
++ lock_dlm_debug_dump(); \
++ printk("\nlock_dlm: Assertion failed on line %d of file %s\n" \
++ "lock_dlm: assertion: \"%s\"\n" \
++ "lock_dlm: time = %lu\n", \
++ __LINE__, __FILE__, #x, jiffies); \
++ {do} \
++ printk("\n"); \
++ panic("lock_dlm: Record message above and reboot.\n"); \
++ } \
++}
++
++#define DLM_RETRY(do_this, until_this) \
++for (;;) \
++{ \
++ do { do_this; } while (0); \
++ if (until_this) \
++ break; \
++ printk("lock_dlm: out of memory: %s, %u\n", __FILE__, __LINE__); \
++ schedule();\
++}
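++
++/*
++ * Example (matching the uses in group.c): retry an allocation until it
++ * succeeds, yielding the CPU between attempts:
++ *
++ *	DLM_RETRY(node = kmalloc(sizeof(dlm_node_t), GFP_KERNEL), node);
++ */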
++
++#endif
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/main.c linux-patched/fs/gfs_locking/lock_dlm/main.c
+--- linux-orig/fs/gfs_locking/lock_dlm/main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/main.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,192 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "lock_dlm.h"
++#include <linux/init.h>
++#include <linux/proc_fs.h>
++
++#if defined(LOCK_DLM_DEBUG)
++#define LOCK_DLM_DEBUG_SIZE (1024)
++#define MAX_DEBUG_MSG_LEN (64)
++#else
++#define LOCK_DLM_DEBUG_SIZE (0)
++#define MAX_DEBUG_MSG_LEN (0)
++#endif
++
++static char * debug_buf;
++static unsigned int debug_size;
++static unsigned int debug_point;
++static int debug_wrap;
++static spinlock_t debug_lock;
++static struct proc_dir_entry * debug_proc_entry = NULL;
++
++
++void lock_dlm_debug_log(const char *fmt, ...)
++{
++ va_list va;
++ int i, n, size, len;
++ char buf[MAX_DEBUG_MSG_LEN+1];
++
++ spin_lock(&debug_lock);
++
++ if (!debug_buf)
++ goto out;
++
++ size = MAX_DEBUG_MSG_LEN;
++ memset(buf, 0, size+1);
++
++ n = 0;
++ /* n = snprintf(buf, size, "%s ", dlm->fsname); */
++ size -= n;
++
++ va_start(va, fmt);
++ vsnprintf(buf+n, size, fmt, va);
++ va_end(va);
++
++ len = strlen(buf);
++ if (len > MAX_DEBUG_MSG_LEN-1)
++ len = MAX_DEBUG_MSG_LEN-1;
++ buf[len] = '\n';
++ buf[len+1] = '\0';
++
++ for (i = 0; i < strlen(buf); i++) {
++ debug_buf[debug_point++] = buf[i];
++
++ if (debug_point == debug_size) {
++ debug_point = 0;
++ debug_wrap = 1;
++ }
++ }
++ out:
++ spin_unlock(&debug_lock);
++}
++
++static void debug_setup(int size)
++{
++ char *b = NULL;
++
++ if (size > PAGE_SIZE)
++ size = PAGE_SIZE;
++ if (size)
++ b = kmalloc(size, GFP_KERNEL);
++
++ spin_lock(&debug_lock);
++ kfree(debug_buf); /* kfree(NULL) is a no-op */
++ debug_buf = NULL; /* don't leave a dangling pointer if we bail */
++ debug_size = 0;
++ if (!size || !b)
++ goto out;
++ debug_size = size;
++ debug_point = 0;
++ debug_wrap = 0;
++ debug_buf = b;
++ memset(debug_buf, 0, debug_size);
++ out:
++ spin_unlock(&debug_lock);
++}
++
++static void debug_init(void)
++{
++ debug_buf = NULL;
++ debug_size = 0;
++ debug_point = 0;
++ debug_wrap = 0;
++ spin_lock_init(&debug_lock);
++ debug_setup(LOCK_DLM_DEBUG_SIZE);
++}
++
++void lock_dlm_debug_dump(void)
++{
++ int i;
++
++ spin_lock(&debug_lock);
++
++ if (debug_wrap) {
++ for (i = debug_point; i < debug_size; i++)
++ printk("%c", debug_buf[i]);
++ }
++ for (i = 0; i < debug_point; i++)
++ printk("%c", debug_buf[i]);
++
++ spin_unlock(&debug_lock);
++}
++
++#ifdef CONFIG_PROC_FS
++int lock_dlm_debug_info(char *b, char **start, off_t offset, int length)
++{
++ int i, n = 0;
++
++ spin_lock(&debug_lock);
++
++ if (debug_wrap) {
++ for (i = debug_point; i < debug_size; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++ }
++ for (i = 0; i < debug_point; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++
++ spin_unlock(&debug_lock);
++
++ return n;
++}
++#endif
++
++/**
++ * init_lock_dlm - initialize the lock_dlm module
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int __init init_lock_dlm(void)
++{
++ int error;
++
++ error = lm_register_proto(&lock_dlm_ops);
++ if (error) {
++ printk("lock_dlm: can't register protocol: (%d)\n", error);
++ return error;
++ }
++
++#ifdef CONFIG_PROC_FS
++ debug_proc_entry = create_proc_entry("cluster/lock_dlm_debug", S_IRUGO,
++ NULL);
++ if (debug_proc_entry)
++ debug_proc_entry->get_info = &lock_dlm_debug_info;
++#endif
++ debug_init();
++
++ printk("Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
++ return 0;
++}
++
++/**
++ * exit_lock_dlm - clean up the lock_dlm module
++ *
++ */
++
++void __exit exit_lock_dlm(void)
++{
++ lm_unregister_proto(&lock_dlm_ops);
++
++#ifdef CONFIG_PROC_FS
++ if (debug_proc_entry)
++ remove_proc_entry("cluster/lock_dlm_debug", NULL);
++#endif
++ debug_setup(0);
++}
++
++module_init(init_lock_dlm);
++module_exit(exit_lock_dlm);
++
++MODULE_DESCRIPTION("GFS DLM Locking Module");
++MODULE_AUTHOR("Red Hat, Inc.");
++MODULE_LICENSE("GPL");
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/mount.c linux-patched/fs/gfs_locking/lock_dlm/mount.c
+--- linux-orig/fs/gfs_locking/lock_dlm/mount.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/mount.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,335 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/socket.h>
++#include <net/sock.h>
++
++#include "lock_dlm.h"
++#include <cluster/cnxman.h>
++#include <cluster/service.h>
++
++static int init_cman(dlm_t *dlm)
++{
++ int error = -1;
++ char *name = NULL;
++
++ if (!dlm->clustername)
++ goto fail;
++
++ error = kcl_addref_cluster();
++ if (error) {
++ printk("lock_dlm: cannot get cman reference %d\n", error);
++ goto fail;
++ }
++
++ error = kcl_cluster_name(&name);
++ if (error) {
++ printk("lock_dlm: cannot get cman cluster name %d\n", error);
++ goto fail_ref;
++ }
++
++ if (strcmp(name, dlm->clustername)) {
++ error = -1;
++ printk("lock_dlm: cman cluster name \"%s\" does not match "
++ "file system cluster name \"%s\"\n",
++ name, dlm->clustername);
++ goto fail_ref;
++ }
++
++ kfree(name);
++ return 0;
++
++ fail_ref:
++ kcl_releaseref_cluster();
++ fail:
++ if (name)
++ kfree(name);
++ return error;
++}
++
++static int release_cman(dlm_t *dlm)
++{
++ return kcl_releaseref_cluster();
++}
++
++static int init_cluster(dlm_t *dlm, char *table_name)
++{
++ char *buf, *c, *clname, *fsname;
++ int len, error = -1;
++
++ /*
++ * Parse superblock lock table <clustername>:<fsname>
++ */
++
++ len = strlen(table_name) + 1;
++ buf = kmalloc(len, GFP_KERNEL);
++ if (!buf)
++ goto out;
++ memset(buf, 0, len);
++ memcpy(buf, table_name, strlen(table_name));
++
++ c = strchr(buf, ':');
++ if (!c)
++ goto out_buf;
++
++ *c = '\0';
++ clname = buf;
++ fsname = ++c;
++
++ dlm->max_nodes = LOCK_DLM_MAX_NODES;
++
++ len = strlen(clname) + 1;
++ c = kmalloc(len, GFP_KERNEL);
++ if (!c)
++ goto out_buf;
++ memset(c, 0, len);
++ memcpy(c, clname, len-1);
++ dlm->cnlen = len-1;
++ dlm->clustername = c;
++
++ len = strlen(fsname) + 1;
++ c = kmalloc(len, GFP_KERNEL);
++ if (!c)
++ goto out_cn;
++ memset(c, 0, len);
++ memcpy(c, fsname, len-1);
++ dlm->fnlen = len-1;
++ dlm->fsname = c;
++
++ error = init_cman(dlm);
++ if (error)
++ goto out_fn;
++
++ kfree(buf);
++ return 0;
++
++ out_fn:
++ kfree(dlm->fsname);
++ out_cn:
++ kfree(dlm->clustername);
++ out_buf:
++ kfree(buf);
++ out:
++ printk("lock_dlm: init_cluster error %d\n", error);
++ return error;
++}
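++
++/*
++ * Sketch (illustrative; the table name is hypothetical): for a
++ * superblock lock table of "alpha:gfs1", init_cluster() leaves
++ * dlm->clustername set to "alpha" and dlm->fsname set to "gfs1", and
++ * fails if cman was started with a different cluster name.
++ */
++#if 0
++static int example_init_cluster(dlm_t *dlm)
++{
++ return init_cluster(dlm, "alpha:gfs1");
++}
++#endif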
++
++static int release_cluster(dlm_t *dlm)
++{
++ release_cman(dlm);
++ kfree(dlm->clustername);
++ kfree(dlm->fsname);
++ return 0;
++}
++
++static int init_fence(dlm_t *dlm)
++{
++ LIST_HEAD(head);
++ struct kcl_service *s, *safe;
++ int error, found = FALSE;
++
++ error = kcl_get_services(&head, SERVICE_LEVEL_FENCE);
++ if (error < 0)
++ goto out;
++
++ list_for_each_entry_safe(s, safe, &head, list) {
++ list_del(&s->list);
++ if (!found && !strcmp(s->name, "default"))
++ found = TRUE;
++ kfree(s);
++ }
++
++ if (found)
++ return 0;
++
++ error = -1;
++ out:
++ printk("lock_dlm: init_fence error %d\n", error);
++ return error;
++}
++
++static int release_fence(dlm_t *dlm)
++{
++ return 0;
++}
++
++static int init_gdlm(dlm_t *dlm)
++{
++ int error;
++
++ error = dlm_new_lockspace(dlm->fsname, dlm->fnlen, &dlm->gdlm_lsp,
++ DLM_LSF_NOTIMERS);
++ if (error)
++ printk("lock_dlm: new lockspace error %d\n", error);
++
++ return error;
++}
++
++static int release_gdlm(dlm_t *dlm)
++{
++ dlm_release_lockspace(dlm->gdlm_lsp, 1);
++ return 0;
++}
++
++static dlm_t *init_dlm(lm_callback_t cb, lm_fsdata_t *fsdata)
++{
++ dlm_t *dlm;
++
++ dlm = kmalloc(sizeof(dlm_t), GFP_KERNEL);
++ if (!dlm)
++ return NULL;
++
++ memset(dlm, 0, sizeof(dlm_t));
++
++ dlm->fscb = cb;
++ dlm->fsdata = fsdata;
++
++ spin_lock_init(&dlm->async_lock);
++
++ INIT_LIST_HEAD(&dlm->complete);
++ INIT_LIST_HEAD(&dlm->blocking);
++ INIT_LIST_HEAD(&dlm->delayed);
++ INIT_LIST_HEAD(&dlm->submit);
++ INIT_LIST_HEAD(&dlm->starts);
++ INIT_LIST_HEAD(&dlm->resources);
++
++ init_waitqueue_head(&dlm->wait);
++
++ INIT_LIST_HEAD(&dlm->mg_nodes);
++ init_MUTEX(&dlm->mg_nodes_lock);
++ init_MUTEX(&dlm->res_lock);
++
++ return dlm;
++}
++
++/**
++ * lm_dlm_mount - mount a dlm lockspace
++ * @table_name: the name of the space to mount, "<clustername>:<fsname>"
++ * @host_data: host specific data
++ * @cb: the file system callback
++ * @fsdata: private file system data passed back through @cb
++ * @min_lvb_size: the minimum LVB size the file system requires
++ * @lockstruct: the lockstruct to fill in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int lm_dlm_mount(char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t *fsdata,
++ unsigned int min_lvb_size,
++ struct lm_lockstruct *lockstruct)
++{
++ dlm_t *dlm;
++ int error = -ENOMEM;
++
++ if (min_lvb_size > DLM_LVB_SIZE)
++ goto out;
++
++ dlm = init_dlm(cb, fsdata);
++ if (!dlm)
++ goto out;
++
++ error = init_cluster(dlm, table_name);
++ if (error)
++ goto out_free;
++
++ error = init_fence(dlm);
++ if (error)
++ goto out_cluster;
++
++ error = init_gdlm(dlm);
++ if (error)
++ goto out_fence;
++
++ error = init_async_thread(dlm);
++ if (error)
++ goto out_gdlm;
++
++ error = init_mountgroup(dlm);
++ if (error)
++ goto out_thread;
++
++ lockstruct->ls_jid = dlm->jid;
++ lockstruct->ls_first = test_bit(DFL_FIRST_MOUNT, &dlm->flags);
++ lockstruct->ls_lockspace = dlm;
++ lockstruct->ls_ops = &lock_dlm_ops;
++ lockstruct->ls_flags = LM_LSFLAG_ASYNC;
++ lockstruct->ls_lvb_size = DLM_LVB_SIZE;
++ return 0;
++
++ out_thread:
++ release_async_thread(dlm);
++
++ out_gdlm:
++ release_gdlm(dlm);
++
++ out_fence:
++ release_fence(dlm);
++
++ out_cluster:
++ release_cluster(dlm);
++
++ out_free:
++ kfree(dlm);
++
++ out:
++ return error;
++}
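++
++/*
++ * Mount sketch (illustrative only; my_cb and the table name are
++ * hypothetical, the callback signature assumes lm_interface's
++ * (fsdata, type, data) form, and GFS normally reaches lm_dlm_mount()
++ * through the lock harness rather than calling it directly).
++ */
++#if 0
++static void my_cb(lm_fsdata_t *fsdata, unsigned int type, void *data)
++{
++ /* GFS would handle LM_CB_ASYNC, LM_CB_NEED_E, etc. here */
++}
++
++static int example_mount(struct lm_lockstruct *ls)
++{
++ /* 32 is a hypothetical minimum LVB size for this sketch */
++ return lm_dlm_mount("alpha:gfs1", NULL, my_cb, NULL, 32, ls);
++}
++#endif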
++
++/**
++ * lm_dlm_others_may_mount
++ * @lockspace: the lockspace
++ *
++ */
++
++static void lm_dlm_others_may_mount(lm_lockspace_t *lockspace)
++{
++ /* Do nothing. The first node to join the Mount Group will complete
++ * before Service Manager allows another node to join. */
++}
++
++/**
++ * lm_dlm_unmount - unmount a lock space
++ * @lockspace: the lockspace to unmount
++ *
++ */
++
++static void lm_dlm_unmount(lm_lockspace_t *lockspace)
++{
++ dlm_t *dlm = (dlm_t *) lockspace;
++
++ release_mountgroup(dlm);
++ release_async_thread(dlm);
++ release_gdlm(dlm);
++ release_fence(dlm);
++ release_cluster(dlm);
++ kfree(dlm);
++}
++
++struct lm_lockops lock_dlm_ops = {
++ .lm_proto_name = "lock_dlm",
++ .lm_mount = lm_dlm_mount,
++ .lm_others_may_mount = lm_dlm_others_may_mount,
++ .lm_unmount = lm_dlm_unmount,
++ .lm_get_lock = lm_dlm_get_lock,
++ .lm_put_lock = lm_dlm_put_lock,
++ .lm_lock = lm_dlm_lock,
++ .lm_unlock = lm_dlm_unlock,
++ .lm_plock = lm_dlm_plock,
++ .lm_punlock = lm_dlm_punlock,
++ .lm_plock_get = lm_dlm_plock_get,
++ .lm_cancel = lm_dlm_cancel,
++ .lm_hold_lvb = lm_dlm_hold_lvb,
++ .lm_unhold_lvb = lm_dlm_unhold_lvb,
++ .lm_sync_lvb = lm_dlm_sync_lvb,
++ .lm_recovery_done = lm_dlm_recovery_done,
++ .lm_owner = THIS_MODULE,
++};
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/plock.c linux-patched/fs/gfs_locking/lock_dlm/plock.c
+--- linux-orig/fs/gfs_locking/lock_dlm/plock.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/plock.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,1037 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "lock_dlm.h"
++
++#define MIN(a,b) (((a) <= (b)) ? (a) : (b))
++#define MAX(a,b) (((a) >= (b)) ? (a) : (b))
++
++#define CREATE 1
++#define NO_CREATE 0
++
++#define WAIT 1
++#define NO_WAIT 0
++#define X_WAIT -1
++
++#define EX 1
++#define NO_EX 0
++#define SH NO_EX
++
++
++static int check_conflict(dlm_t *dlm, struct dlm_resource *r,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t start, uint64_t end, int ex);
++
++
++static int lock_resource(struct dlm_resource *r)
++{
++ dlm_lock_t *lp;
++ struct lm_lockname name;
++ int error;
++
++ name.ln_type = LM_TYPE_PLOCK_UPDATE;
++ name.ln_number = r->name.ln_number;
++
++ error = create_lp(r->dlm, &name, &lp);
++ if (error)
++ return error;
++
++ set_bit(LFL_IDLOCK, &lp->flags);
++ lp->req = DLM_LOCK_EX;
++ do_lock(lp, NULL);
++ wait_for_completion(&lp->uast_wait);
++
++ error = lp->lksb.sb_status;
++ if (error) {
++ kfree(lp);
++ lp = NULL;
++ }
++
++ r->update = lp;
++ return error;
++}
++
++static void unlock_resource(struct dlm_resource *r)
++{
++ do_unlock(r->update);
++ kfree(r->update);
++}
++
++static struct dlm_resource *search_resource(dlm_t *dlm, struct lm_lockname *name)
++{
++ struct dlm_resource *r;
++
++ list_for_each_entry(r, &dlm->resources, list) {
++ if (lm_name_equal(&r->name, name))
++ return r;
++ }
++ return NULL;
++}
++
++static int get_resource(dlm_t *dlm, struct lm_lockname *name, int create,
++ struct dlm_resource **res)
++{
++ struct dlm_resource *r, *r2;
++ int error = -ENOMEM;
++
++ down(&dlm->res_lock);
++ r = search_resource(dlm, name);
++ if (r)
++ r->count++;
++ up(&dlm->res_lock);
++
++ if (r)
++ goto out;
++
++ if (create == NO_CREATE) {
++ error = -ENOENT;
++ goto fail;
++ }
++
++ r = kmalloc(sizeof(struct dlm_resource), GFP_KERNEL);
++ if (!r)
++ goto fail;
++
++ memset(r, 0, sizeof(struct dlm_resource));
++ r->dlm = dlm;
++ r->name = *name;
++ r->count = 1;
++ INIT_LIST_HEAD(&r->locks);
++ INIT_LIST_HEAD(&r->async_locks);
++ init_MUTEX(&r->sema);
++ spin_lock_init(&r->async_spin);
++
++ down(&dlm->res_lock);
++ r2 = search_resource(dlm, name);
++ if (r2) {
++ r2->count++;
++ up(&dlm->res_lock);
++ kfree(r);
++ r = r2;
++ goto out;
++ }
++
++ list_add_tail(&r->list, &dlm->resources);
++ up(&dlm->res_lock);
++
++ out:
++ *res = r;
++ return 0;
++ fail:
++ return error;
++}
++
++static void put_resource(struct dlm_resource *r)
++{
++ dlm_t *dlm = r->dlm;
++
++ down(&dlm->res_lock);
++ r->count--;
++ if (r->count == 0) {
++ DLM_ASSERT(list_empty(&r->locks), );
++ DLM_ASSERT(list_empty(&r->async_locks), );
++ list_del(&r->list);
++ kfree(r);
++ }
++ up(&dlm->res_lock);
++}
++
++static inline void hold_resource(struct dlm_resource *r)
++{
++ down(&r->dlm->res_lock);
++ r->count++;
++ up(&r->dlm->res_lock);
++}
++
++static inline int ranges_overlap(uint64_t start1, uint64_t end1,
++ uint64_t start2, uint64_t end2)
++{
++ if (end1 < start2 || start1 > end2)
++ return FALSE;
++ return TRUE;
++}
++
++/**
++ * overlap_type - returns a value based on the type of overlap
++ * @s1 - start of new lock range
++ * @e1 - end of new lock range
++ * @s2 - start of existing lock range
++ * @e2 - end of existing lock range
++ *
++ */
++
++static int overlap_type(uint64_t s1, uint64_t e1, uint64_t s2, uint64_t e2)
++{
++ int ret;
++
++ /*
++ * ---r1---
++ * ---r2---
++ */
++
++ if (s1 == s2 && e1 == e2)
++ ret = 0;
++
++ /*
++ * --r1--
++ * ---r2---
++ */
++
++ else if (s1 == s2 && e1 < e2)
++ ret = 1;
++
++ /*
++ * --r1--
++ * ---r2---
++ */
++
++ else if (s1 > s2 && e1 == e2)
++ ret = 1;
++
++ /*
++ * --r1--
++ * ---r2---
++ */
++
++ else if (s1 > s2 && e1 < e2)
++ ret = 2;
++
++ /*
++ * ---r1--- or ---r1--- or ---r1---
++ * --r2-- --r2-- --r2--
++ */
++
++ else if (s1 <= s2 && e1 >= e2)
++ ret = 3;
++
++ /*
++ * ---r1---
++ * ---r2---
++ */
++
++ else if (s1 > s2 && e1 > e2)
++ ret = 4;
++
++ /*
++ * ---r1---
++ * ---r2---
++ */
++
++ else if (s1 < s2 && e1 < e2)
++ ret = 4;
++
++ else
++ ret = -1;
++
++ return ret;
++}
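++
++/*
++ * Worked example (illustrative): against an existing lock at 10-20,
++ * overlap_type() classifies a new request as
++ * (10,20) -> 0 identical range
++ * (10,15) -> 1 inside, sharing a boundary
++ * (12,18) -> 2 strictly inside
++ * (5,25) -> 3 covers the existing range
++ * (15,30) -> 4 partial overlap off one end
++ */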
++
++/* shrink the range start2:end2 by the partially overlapping start:end */
++
++static int shrink_range2(uint64_t *start2, uint64_t *end2,
++ uint64_t start, uint64_t end)
++{
++ int error = 0;
++
++ if (*start2 < start)
++ *end2 = start - 1;
++ else if (*end2 > end)
++ *start2 = end + 1;
++ else
++ error = -1;
++ return error;
++}
++
++static int shrink_range(struct posix_lock *po, uint64_t start, uint64_t end)
++{
++ return shrink_range2(&po->start, &po->end, start, end);
++}
++
++static void put_lock(dlm_lock_t *lp)
++{
++ struct posix_lock *po = lp->posix;
++
++ po->count--;
++ if (po->count == 0) {
++ kfree(po);
++ kfree(lp);
++ }
++}
++
++static int create_lock(struct dlm_resource *r, unsigned long owner, int ex,
++ uint64_t start, uint64_t end, dlm_lock_t **lpp)
++{
++ dlm_lock_t *lp;
++ struct posix_lock *po;
++ int error;
++
++ error = create_lp(r->dlm, &r->name, &lp);
++ if (error)
++ return error;
++
++ po = kmalloc(sizeof(struct posix_lock), GFP_KERNEL);
++ if (!po) {
++ kfree(lp);
++ return -ENOMEM;
++ }
++ memset(po, 0, sizeof(struct posix_lock));
++
++ lp->posix = po;
++ po->lp = lp;
++ po->resource = r;
++ po->count = 1;
++ po->start = start;
++ po->end = end;
++ po->owner = owner;
++ po->ex = ex;
++ list_add_tail(&po->list, &r->locks);
++
++ *lpp = lp;
++ return 0;
++}
++
++static unsigned int make_flags_posix(dlm_lock_t *lp, int wait)
++{
++ unsigned int lkf = 0;
++
++ if (wait == NO_WAIT || wait == X_WAIT)
++ lkf |= DLM_LKF_NOQUEUE;
++
++ if (lp->lksb.sb_lkid != 0) {
++ lkf |= DLM_LKF_CONVERT;
++ if (wait == WAIT)
++ lkf |= DLM_LKF_EXPEDITE;
++ }
++ return lkf;
++}
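++
++/*
++ * For example (illustrative): a fresh request with wait == NO_WAIT maps
++ * to DLM_LKF_NOQUEUE, while converting an existing lock (sb_lkid != 0)
++ * with wait == WAIT maps to DLM_LKF_CONVERT | DLM_LKF_EXPEDITE.
++ */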
++
++static void do_range_lock(dlm_lock_t *lp)
++{
++ struct dlm_range range = { lp->posix->start, lp->posix->end };
++ do_lock(lp, &range);
++}
++
++static void request_lock(dlm_lock_t *lp, int wait)
++{
++ log_debug("req %x,%"PRIx64" %s %"PRIx64"-%"PRIx64" %u w %u",
++ lp->lockname.ln_type, lp->lockname.ln_number,
++ lp->posix->ex ? "ex" : "sh", lp->posix->start,
++ lp->posix->end, current->pid, wait);
++
++ set_bit(LFL_IDLOCK, &lp->flags);
++ lp->req = lp->posix->ex ? DLM_LOCK_EX : DLM_LOCK_PR;
++ lp->lkf = make_flags_posix(lp, wait);
++
++ do_range_lock(lp);
++}
++
++static void add_async(struct posix_lock *po, struct dlm_resource *r)
++{
++ spin_lock(&r->async_spin);
++ list_add_tail(&po->async_list, &r->async_locks);
++ spin_unlock(&r->async_spin);
++}
++
++static void del_async(struct posix_lock *po, struct dlm_resource *r)
++{
++ spin_lock(&r->async_spin);
++ list_del(&po->async_list);
++ spin_unlock(&r->async_spin);
++}
++
++static int wait_async(dlm_lock_t *lp)
++{
++ wait_for_completion(&lp->uast_wait);
++ del_async(lp->posix, lp->posix->resource);
++ return lp->lksb.sb_status;
++}
++
++static void wait_async_list(struct dlm_resource *r, unsigned long owner)
++{
++ struct posix_lock *po;
++ int error, found;
++
++ restart:
++ found = FALSE;
++ spin_lock(&r->async_spin);
++ list_for_each_entry(po, &r->async_locks, async_list) {
++ if (po->owner != owner)
++ continue;
++ found = TRUE;
++ break;
++ }
++ spin_unlock(&r->async_spin);
++
++ if (found) {
++ DLM_ASSERT(po->lp, );
++ error = wait_async(po->lp);
++ DLM_ASSERT(!error, );
++ goto restart;
++ }
++}
++
++static void update_lock(dlm_lock_t *lp, int wait)
++{
++ request_lock(lp, wait);
++ add_async(lp->posix, lp->posix->resource);
++
++ if (wait == NO_WAIT || wait == X_WAIT) {
++ int error = wait_async(lp);
++ DLM_ASSERT(!error, printk("error=%d\n", error););
++ }
++}
++
++static void add_lock(struct dlm_resource *r, unsigned long owner, int wait,
++ int ex, uint64_t start, uint64_t end)
++{
++ dlm_lock_t *lp;
++ int error;
++
++ error = create_lock(r, owner, ex, start, end, &lp);
++ DLM_ASSERT(!error, );
++
++ hold_resource(r);
++ update_lock(lp, wait);
++}
++
++static int remove_lock(dlm_lock_t *lp)
++{
++ struct dlm_resource *r = lp->posix->resource;
++
++ log_debug("remove %x,%"PRIx64" %u",
++ r->name.ln_type, r->name.ln_number, current->pid);
++
++ do_unlock(lp);
++ put_lock(lp);
++ put_resource(r);
++ return 0;
++}
++
++/* RN within RE (and starts or ends on RE boundary)
++ 1. add new lock for non-overlap area of RE, orig mode
++ 2. convert RE to RN range and mode */
++
++static int lock_case1(struct posix_lock *po, struct dlm_resource *r,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end)
++{
++ uint64_t start2, end2;
++
++ /* non-overlapping area start2:end2 */
++ start2 = po->start;
++ end2 = po->end;
++ shrink_range2(&start2, &end2, start, end);
++
++ po->start = start;
++ po->end = end;
++ po->ex = ex;
++
++ if (ex) {
++ add_lock(r, owner, X_WAIT, SH, start2, end2);
++ update_lock(po->lp, wait);
++ } else {
++ add_lock(r, owner, WAIT, EX, start2, end2);
++ update_lock(po->lp, X_WAIT);
++ }
++ return 0;
++}
++
++/* RN within RE (RE overlaps RN on both sides)
++ 1. add new lock for front fragment, orig mode
++ 2. add new lock for back fragment, orig mode
++ 3. convert RE to RN range and mode */
++
++static int lock_case2(struct posix_lock *po, struct dlm_resource *r,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end)
++{
++ if (ex) {
++ add_lock(r, owner, X_WAIT, SH, po->start, start-1);
++ add_lock(r, owner, X_WAIT, SH, end+1, po->end);
++
++ po->start = start;
++ po->end = end;
++ po->ex = ex;
++
++ update_lock(po->lp, wait);
++ } else {
++ add_lock(r, owner, WAIT, EX, po->start, start-1);
++ add_lock(r, owner, WAIT, EX, end+1, po->end);
++
++ po->start = start;
++ po->end = end;
++ po->ex = ex;
++
++ update_lock(po->lp, X_WAIT);
++ }
++ return 0;
++}
++
++/* returns ranges from exist list in order of their start values */
++
++static int next_exist(struct list_head *exist, uint64_t *start, uint64_t *end)
++{
++ struct posix_lock *po;
++ int first = TRUE, first_call = FALSE;
++
++ if (!*start && !*end)
++ first_call = TRUE;
++
++ list_for_each_entry(po, exist, list) {
++ if (!first_call && (po->start <= *start))
++ continue;
++
++ if (first) {
++ *start = po->start;
++ *end = po->end;
++ first = FALSE;
++ } else if (po->start < *start) {
++ *start = po->start;
++ *end = po->end;
++ }
++ }
++
++ return (first ? -1 : 0);
++}
++
++/* adds locks in gaps between existing locks from start to end */
++
++static int fill_gaps(struct list_head *exist, struct dlm_resource *r,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end)
++{
++ uint64_t exist_start = 0, exist_end = 0;
++
++ /* cover gaps in front of each existing lock */
++ for (;;) {
++ if (next_exist(exist, &exist_start, &exist_end))
++ break;
++ if (start < exist_start)
++ add_lock(r, owner, wait, ex, start, exist_start-1);
++ start = exist_end + 1;
++ }
++
++ /* cover gap after last existing lock */
++ if (exist_end < end)
++ add_lock(r, owner, wait, ex, exist_end+1, end);
++
++ return 0;
++}
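++
++/*
++ * Worked example (illustrative): filling 0-100 around existing locks at
++ * 10-20 and 40-50 adds new locks covering 0-9, 21-39 and 51-100.
++ */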
++
++/* RE within RN (possibly more than one RE lock, all within RN) */
++
++static int lock_case3(struct list_head *exist, struct dlm_resource *r,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end)
++{
++ struct posix_lock *po, *safe;
++
++ fill_gaps(exist, r, owner, wait, ex, start, end);
++
++ if (!ex)
++ wait = X_WAIT;
++
++ /* update existing locks to new mode and put back in locks list */
++ list_for_each_entry_safe(po, safe, exist, list) {
++ list_move_tail(&po->list, &r->locks);
++ if (po->ex == ex)
++ continue;
++ po->ex = ex;
++ update_lock(po->lp, wait);
++ }
++
++ return 0;
++}
++
++/* RE within RN (possibly more than one RE lock, one RE partially overlaps RN)
++ 1. add new locks with new mode for RN gaps not covered by RE's
++ 2. convert RE locks' mode to new mode
++ other steps deal with the partial-overlap fragment and depend on whether
++ the request is sh->ex or ex->sh */
++
++static int lock_case4(struct posix_lock *opo, struct list_head *exist,
++ struct dlm_resource *r, unsigned long owner, int wait,
++ int ex, uint64_t start, uint64_t end)
++{
++ struct posix_lock *po, *safe;
++ uint64_t over_start = 0, over_end = 0;
++ uint64_t frag_start = 0, frag_end = 0;
++
++ /* fragment (non-overlap) range of opo */
++ if (opo->start < start) {
++ frag_start = opo->start;
++ frag_end = start - 1;
++ } else {
++ frag_start = end + 1;
++ frag_end = opo->end;
++ }
++
++ /* overlap range of opo */
++ if (opo->start < start) {
++ over_start = start;
++ over_end = opo->end;
++ } else {
++ over_start = opo->start;
++ over_end = end;
++ }
++
++ /* cut off the non-overlap portion of opo so fill_gaps will work */
++ opo->start = over_start;
++ opo->end = over_end;
++
++ fill_gaps(exist, r, owner, wait, ex, start, end);
++
++ /* update existing locks to new mode and put back in locks list */
++ list_for_each_entry_safe(po, safe, exist, list) {
++ list_move_tail(&po->list, &r->locks);
++ if (po == opo)
++ continue;
++ if (po->ex == ex)
++ continue;
++ po->ex = ex;
++ update_lock(po->lp, wait);
++ }
++
++ /* deal with the RE that partially overlaps the requested range */
++
++ if (ex == opo->ex)
++ return 0;
++
++ if (ex) {
++ /* 1. add a shared lock in the non-overlap range
++ 2. convert RE to overlap range and requested mode */
++
++ add_lock(r, owner, X_WAIT, SH, frag_start, frag_end);
++
++ opo->start = over_start;
++ opo->end = over_end;
++ opo->ex = ex;
++
++ update_lock(opo->lp, wait);
++ } else {
++ /* 1. request a shared lock in the overlap range
++ 2. convert RE to non-overlap range
++ 3. wait for shared lock to complete */
++
++ add_lock(r, owner, WAIT, SH, over_start, over_end);
++
++ opo->start = frag_start;
++ opo->end = frag_end;
++
++ update_lock(opo->lp, X_WAIT);
++ }
++
++ return 0;
++}
++
++/* go through r->locks to find what needs to be done to extend,
++ shrink, shift, split, etc. existing locks (this often involves adding
++ new locks in addition to modifying existing locks). */
++
++static int plock_internal(struct dlm_resource *r, unsigned long owner,
++ int wait, int ex, uint64_t start, uint64_t end)
++{
++ LIST_HEAD(exist);
++ struct posix_lock *po, *safe, *case4_po = NULL;
++ int error = 0;
++
++ list_for_each_entry_safe(po, safe, &r->locks, list) {
++ if (po->owner != owner)
++ continue;
++ if (!ranges_overlap(po->start, po->end, start, end))
++ continue;
++
++ /* existing range (RE) overlaps new range (RN) */
++
++ switch(overlap_type(start, end, po->start, po->end)) {
++
++ case 0:
++ if (po->ex == ex)
++ goto out;
++
++ /* ranges the same - just update the existing lock */
++ po->ex = ex;
++ update_lock(po->lp, wait);
++ goto out;
++
++ case 1:
++ if (po->ex == ex)
++ goto out;
++
++ error = lock_case1(po, r, owner, wait, ex, start, end);
++ goto out;
++
++ case 2:
++ if (po->ex == ex)
++ goto out;
++
++ error = lock_case2(po, r, owner, wait, ex, start, end);
++ goto out;
++
++ case 3:
++ list_move_tail(&po->list, &exist);
++ break;
++
++ case 4:
++ DLM_ASSERT(!case4_po, );
++ case4_po = po;
++ list_move_tail(&po->list, &exist);
++ break;
++
++ default:
++ error = -1;
++ goto out;
++ }
++ }
++
++ if (case4_po)
++ error = lock_case4(case4_po, &exist, r, owner, wait, ex,
++ start, end);
++ else if (!list_empty(&exist))
++ error = lock_case3(&exist, r, owner, wait, ex, start, end);
++ else
++ add_lock(r, owner, wait, ex, start, end);
++
++ out:
++ return error;
++}
++
++static int punlock_internal(struct dlm_resource *r, unsigned long owner,
++ uint64_t start, uint64_t end)
++{
++ struct posix_lock *po, *safe;
++ int error = 0;
++
++ list_for_each_entry_safe(po, safe, &r->locks, list) {
++ if (po->owner != owner)
++ continue;
++ if (!ranges_overlap(po->start, po->end, start, end))
++ continue;
++
++ /* existing range (RE) overlaps new range (RN) */
++
++ switch(overlap_type(start, end, po->start, po->end)) {
++
++ case 0:
++ /* ranges the same - just remove the existing lock */
++
++ list_del(&po->list);
++ remove_lock(po->lp);
++ goto out;
++
++ case 1:
++ /* RN within RE and starts or ends on RE boundary -
++ * shrink and update RE */
++
++ shrink_range(po, start, end);
++ update_lock(po->lp, X_WAIT);
++ goto out;
++
++ case 2:
++ /* RN within RE - shrink and update RE to be front
++ * fragment, and add a new lock for back fragment */
++
++ add_lock(r, owner, po->ex ? WAIT : X_WAIT, po->ex,
++ end+1, po->end);
++
++ po->end = start - 1;
++ update_lock(po->lp, X_WAIT);
++ goto out;
++
++ case 3:
++ /* RE within RN - remove RE, then continue checking
++ * because RN could cover other locks */
++
++ list_del(&po->list);
++ remove_lock(po->lp);
++ continue;
++
++ case 4:
++ /* front of RE in RN, or end of RE in RN - shrink and
++ * update RE, then continue because RN could cover
++ * other locks */
++
++ shrink_range(po, start, end);
++ update_lock(po->lp, X_WAIT);
++ continue;
++
++ default:
++ error = -1;
++ goto out;
++ }
++ }
++
++ out:
++ return error;
++}
++
++int lm_dlm_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end)
++{
++ dlm_t *dlm = (dlm_t *) lockspace;
++ struct dlm_resource *r;
++ int error;
++
++ log_debug("en plock %u %x,%"PRIx64"", current->pid,
++ name->ln_type, name->ln_number);
++
++ error = get_resource(dlm, name, CREATE, &r);
++ if (error)
++ goto out;
++
++#if 0
++ /* Wait, without holding any locks, until this plock request is not
++ blocked by plocks of *other* *local* processes. Then, none of the
++ dlm requests below will wait on a lock from a local process.
++
++ This should not be necessary since we wait for completion after
++ up(). This means a local process p1 can unlock lkb X while local p2
++ is waiting for X (in wait_async_list). */
++ error = wait_local(r, owner, wait, ex, start, end);
++ if (error)
++ goto out_put;
++#endif
++
++ down(&r->sema);
++ error = lock_resource(r);
++ if (error)
++ goto out_up;
++
++ /* check_conflict() checks for conflicts with plocks from other local
++ processes and other nodes. */
++
++ if (!wait && check_conflict(dlm, r, name, owner, start, end, ex)) {
++ error = -1;
++ unlock_resource(r);
++ goto out_up;
++ }
++
++ /* If NO_WAIT all requests should return immediately.
++ If WAIT all requests go on r->async_locks which we wait on in
++ wait_async_list(). This means DLM should not return -EAGAIN and we
++ should never block waiting for a plock to be released (by a local or
++ remote process) until we call wait_async_list(). */
++
++ error = plock_internal(r, owner, wait, ex, start, end);
++ unlock_resource(r);
++
++ /* wait_async_list() must follow the up() because we must be able
++ to punlock a range on this resource while there's a blocked plock
++ request to prevent deadlock between nodes (and processes). */
++
++ out_up:
++ up(&r->sema);
++ wait_async_list(r, owner);
++ put_resource(r);
++ out:
++ log_debug("ex plock %u error %d", current->pid, error);
++ return error;
++}
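++
++/*
++ * Usage sketch (illustrative; the lockspace and name would come from a
++ * real mount, and the owner token here is hypothetical): take an
++ * exclusive lock on bytes 0-4095, blocking if needed, then drop it.
++ */
++#if 0
++static int example_plock(lm_lockspace_t *ls, struct lm_lockname *name)
++{
++ unsigned long owner = (unsigned long) current;
++ int error;
++
++ error = lm_dlm_plock(ls, name, owner, WAIT, EX, 0, 4095);
++ if (!error)
++ error = lm_dlm_punlock(ls, name, owner, 0, 4095);
++ return error;
++}
++#endif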
++
++int lm_dlm_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, uint64_t start, uint64_t end)
++{
++ dlm_t *dlm = (dlm_t *) lockspace;
++ struct dlm_resource *r;
++ int error;
++
++ log_debug("en punlock %u %x,%"PRIx64"", current->pid,
++ name->ln_type, name->ln_number);
++
++ error = get_resource(dlm, name, NO_CREATE, &r);
++ if (error)
++ goto out;
++
++ down(&r->sema);
++ error = lock_resource(r);
++ if (error)
++ goto out_up;
++
++ error = punlock_internal(r, owner, start, end);
++ unlock_resource(r);
++
++ out_up:
++ up(&r->sema);
++ wait_async_list(r, owner);
++ put_resource(r);
++ out:
++ log_debug("ex punlock %u error %d", current->pid, error);
++ return error;
++}
++
++static void query_ast(void *astargs)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) astargs;
++ complete(&lp->uast_wait);
++}
++
++static int get_conflict_global(dlm_t *dlm, struct lm_lockname *name,
++ unsigned long owner, uint64_t *start,
++ uint64_t *end, int *ex, unsigned long *rowner)
++{
++ dlm_lock_t *lp;
++ struct dlm_queryinfo qinfo;
++ struct dlm_lockinfo *lki;
++ int query = 0, s, error;
++
++ /* acquire a null lock on which to base the query */
++
++ error = create_lp(dlm, name, &lp);
++ if (error)
++ goto ret;
++
++ lp->req = DLM_LOCK_NL;
++ set_bit(LFL_IDLOCK, &lp->flags);
++ do_lock(lp, NULL);
++ wait_for_completion(&lp->uast_wait);
++
++ /* do query, repeating if insufficient space */
++
++ query = DLM_LOCK_THIS | DLM_QUERY_QUEUE_GRANTED |
++ DLM_QUERY_LOCKS_HIGHER;
++
++ for (s = 16; s < dlm->max_nodes + 1; s += 16) {
++
++ lki = kmalloc(s * sizeof(struct dlm_lockinfo), GFP_KERNEL);
++ if (!lki) {
++ error = -ENOMEM;
++ goto out;
++ }
++ memset(lki, 0, s * sizeof(struct dlm_lockinfo));
++ memset(&qinfo, 0, sizeof(qinfo));
++ qinfo.gqi_locksize = s;
++ qinfo.gqi_lockinfo = lki;
++
++ init_completion(&lp->uast_wait);
++ error = dlm_query(dlm->gdlm_lsp, &lp->lksb, query, &qinfo,
++ query_ast, (void *) lp);
++ if (error) {
++ kfree(lki);
++ goto out;
++ }
++ wait_for_completion(&lp->uast_wait);
++ error = lp->lksb.sb_status;
++
++ if (!error)
++ break;
++ kfree(lki);
++ if (error != -E2BIG)
++ goto out;
++ }
++
++ /* check query results for blocking locks */
++
++ for (s = 0; s < qinfo.gqi_lockcount; s++) {
++
++ lki = &qinfo.gqi_lockinfo[s];
++
++ if (!ranges_overlap(*start, *end, lki->lki_grrange.ra_start,
++ lki->lki_grrange.ra_end))
++ continue;
++
++ if (lki->lki_node == dlm->our_nodeid)
++ continue;
++
++ if (lki->lki_grmode == DLM_LOCK_EX || *ex) {
++ *start = lki->lki_grrange.ra_start;
++ *end = lki->lki_grrange.ra_end;
++ *ex = (lki->lki_grmode == DLM_LOCK_EX) ? 1 : 0;
++ *rowner = lki->lki_node;
++ error = -EAGAIN;
++ break;
++ }
++ }
++
++ kfree(qinfo.gqi_lockinfo);
++
++ out:
++ do_unlock(lp);
++ kfree(lp);
++ ret:
++ return error;
++}
++
++static int get_conflict_local(dlm_t *dlm, struct dlm_resource *r,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t *start, uint64_t *end, int *ex,
++ unsigned long *rowner)
++{
++ struct posix_lock *po;
++ int found = FALSE;
++
++ list_for_each_entry(po, &r->locks, list) {
++ if (po->owner == owner)
++ continue;
++ if (!ranges_overlap(po->start, po->end, *start, *end))
++ continue;
++
++ if (*ex || po->ex) {
++ *start = po->start;
++ *end = po->end;
++ *ex = po->ex;
++ *rowner = po->owner;
++ found = TRUE;
++ break;
++ }
++ }
++ return found;
++}
++
++int lm_dlm_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, uint64_t *start, uint64_t *end,
++ int *ex, unsigned long *rowner)
++{
++ dlm_t *dlm = (dlm_t *) lockspace;
++ struct dlm_resource *r;
++ int error, found;
++
++ error = get_resource(dlm, name, NO_CREATE, &r);
++ if (!error) {
++ down(&r->sema);
++ found = get_conflict_local(dlm, r, name, owner, start, end, ex,
++ rowner);
++ up(&r->sema);
++ put_resource(r);
++ if (found)
++ goto out;
++ }
++
++ error = get_conflict_global(dlm, name, owner, start, end, ex, rowner);
++ out:
++ return error;
++}
++
++static int check_conflict(dlm_t *dlm, struct dlm_resource *r,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t start, uint64_t end, int ex)
++{
++ uint64_t get_start = start, get_end = end;
++ unsigned long get_owner = 0;
++ int get_ex = ex, error;
++
++ error = get_conflict_local(dlm, r, name, owner,
++ &get_start, &get_end, &get_ex, &get_owner);
++ if (error)
++ goto out;
++
++ error = get_conflict_global(dlm, name, owner,
++ &get_start, &get_end, &get_ex, &get_owner);
++ out:
++ log_debug("check_conflict %d %"PRIx64"-%"PRIx64" %"PRIx64"-%"PRIx64" "
++ "ex %d %d own %lu %lu pid %u", error, start, end,
++ get_start, get_end, ex, get_ex, owner, get_owner,
++ current->pid);
++ return error;
++}
++
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/thread.c linux-patched/fs/gfs_locking/lock_dlm/thread.c
+--- linux-orig/fs/gfs_locking/lock_dlm/thread.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/thread.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,388 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "lock_dlm.h"
++
++/*
++ * Run in dlm_async thread
++ */
++
++/**
++ * queue_submit - add lock request to queue for dlm_async thread
++ * @lp: DLM lock
++ *
++ * A lock placed on this queue is re-submitted to DLM as soon as
++ * dlm_async thread gets to it.
++ */
++
++static void queue_submit(dlm_lock_t *lp)
++{
++ dlm_t *dlm = lp->dlm;
++
++ spin_lock(&dlm->async_lock);
++ list_add_tail(&lp->slist, &dlm->submit);
++ set_bit(LFL_SLIST, &lp->flags);
++ spin_unlock(&dlm->async_lock);
++ wake_up(&dlm->wait);
++}
++
++/**
++ * process_blocking - processing of blocking callback
++ * @lp: DLM lock
++ *
++ */
++
++static void process_blocking(dlm_lock_t *lp, int bast_mode)
++{
++ dlm_t *dlm = lp->dlm;
++ unsigned int cb;
++
++ switch (make_lmstate(bast_mode)) {
++ case LM_ST_EXCLUSIVE:
++ cb = LM_CB_NEED_E;
++ break;
++ case LM_ST_DEFERRED:
++ cb = LM_CB_NEED_D;
++ break;
++ case LM_ST_SHARED:
++ cb = LM_CB_NEED_S;
++ break;
++ default:
++ DLM_ASSERT(0, printk("unknown bast mode %u\n", bast_mode););
++ }
++
++ dlm->fscb(dlm->fsdata, cb, &lp->lockname);
++}
++
++/**
++ * process_complete - processing of completion callback for a lock request
++ * @lp: DLM lock
++ *
++ */
++
++static void process_complete(dlm_lock_t *lp)
++{
++ dlm_t *dlm = lp->dlm;
++ struct lm_async_cb acb;
++ int16_t prev_mode = lp->cur;
++
++ memset(&acb, 0, sizeof(acb));
++
++ /*
++ * This is an AST for an unlock.
++ */
++
++ if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
++
++ /* FIXME: Add an assertion to catch NOFAIL promotions from
++ * non-NL modes? */
++
++ if (lp->lksb.sb_status == -DLM_ECANCEL) {
++
++ /* lp->cur remains the same, is there anything to clear
++ * or reset to put this lp into an "ordinary" state? */
++
++ printk("lock_dlm: -DLM_ECANCEL num=%x,%"PRIx64"\n",
++ lp->lockname.ln_type, lp->lockname.ln_number);
++ } else {
++ DLM_ASSERT(lp->lksb.sb_status == -DLM_EUNLOCK,
++ printk("num=%x,%"PRIx64" status=%d\n",
++ lp->lockname.ln_type,
++ lp->lockname.ln_number,
++ lp->lksb.sb_status););
++ lp->cur = DLM_LOCK_IV;
++ }
++
++ complete(&lp->uast_wait);
++ return;
++ }
++
++ /*
++ * A canceled lock request. The lock was just taken off the delayed
++ * list and was never even submitted to dlm.
++ */
++
++ if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
++ lp->req = lp->cur;
++ acb.lc_ret |= LM_OUT_CANCELED;
++ goto out;
++ }
++
++ /*
++ * An error occurred.
++ */
++
++ if (lp->lksb.sb_status) {
++ lp->req = lp->cur;
++ if (lp->cur == DLM_LOCK_IV)
++ lp->lksb.sb_lkid = 0;
++
++ if ((lp->lksb.sb_status == -EAGAIN) &&
++ (lp->lkf & DLM_LKF_NOQUEUE)) {
++ /* a "normal" error */
++ } else
++ printk("lock_dlm: process_complete error id=%x "
++ "status=%d\n", lp->lksb.sb_lkid,
++ lp->lksb.sb_status);
++ goto out;
++ }
++
++ /*
++ * This is an AST for an EX->EX conversion for sync_lvb from GFS.
++ */
++
++ if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
++ complete(&lp->uast_wait);
++ return;
++ }
++
++ /*
++ * A lock has been demoted to NL because it initially completed during
++ * BLOCK_LOCKS. Now it must be requested in the originally requested
++ * mode.
++ */
++
++ if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
++
++ DLM_ASSERT(lp->req == DLM_LOCK_NL,);
++ DLM_ASSERT(lp->prev_req > DLM_LOCK_NL,);
++
++ lp->cur = DLM_LOCK_NL;
++ lp->req = lp->prev_req;
++ lp->prev_req = DLM_LOCK_IV;
++ lp->lkf &= ~DLM_LKF_CONVDEADLK;
++ lp->lkf |= DLM_LKF_QUECVT;
++
++ set_bit(LFL_NOCACHE, &lp->flags);
++
++ if (test_bit(DFL_BLOCK_LOCKS, &dlm->flags) &&
++ !test_bit(LFL_NOBLOCK, &lp->flags))
++ queue_delayed(lp, QUEUE_LOCKS_BLOCKED);
++ else
++ queue_submit(lp);
++ return;
++ }
++
++ /*
++ * A request is granted during dlm recovery. It may be granted
++ * because the locks of a failed node were cleared. In that case,
++ * there may be inconsistent data beneath this lock and we must wait
++ * for recovery to complete to use it. When gfs recovery is done this
++ * granted lock will be converted to NL and then reacquired in this
++ * granted state.
++ */
++
++ if (test_bit(DFL_BLOCK_LOCKS, &dlm->flags) &&
++ !test_bit(LFL_NOBLOCK, &lp->flags) &&
++ lp->req != DLM_LOCK_NL) {
++
++ lp->cur = lp->req;
++ lp->prev_req = lp->req;
++ lp->req = DLM_LOCK_NL;
++ lp->lkf |= DLM_LKF_CONVERT;
++ lp->lkf &= ~DLM_LKF_CONVDEADLK;
++ lp->lkf &= ~DLM_LKF_QUECVT;
++
++ set_bit(LFL_REREQUEST, &lp->flags);
++ queue_submit(lp);
++ return;
++ }
++
++ /*
++ * DLM demoted the lock to NL before it was granted so GFS must be
++ * told it cannot cache data for this lock.
++ */
++
++ if (lp->lksb.sb_flags == DLM_SBF_DEMOTED)
++ set_bit(LFL_NOCACHE, &lp->flags);
++
++ out:
++
++ /*
++ * This is an internal lock_dlm lock used for managing JIDs.
++ */
++
++ if (test_bit(LFL_IDLOCK, &lp->flags)) {
++ clear_bit(LFL_NOBLOCK, &lp->flags);
++ lp->cur = lp->req;
++ complete(&lp->uast_wait);
++ return;
++ }
++
++ /*
++ * Normal completion of a lock request. Tell GFS it now has the lock.
++ */
++
++ clear_bit(LFL_NOBLOCK, &lp->flags);
++ lp->cur = lp->req;
++
++ acb.lc_name = lp->lockname;
++ acb.lc_ret |= make_lmstate(lp->cur);
++
++ if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
++ (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
++ acb.lc_ret |= LM_OUT_CACHEABLE;
++
++ dlm->fscb(dlm->fsdata, LM_CB_ASYNC, &acb);
++}
++
++/**
++ * no_work - determine if there's work for the dlm_async thread
++ * @dlm:
++ *
++ * Returns: 1 if no work, 0 otherwise
++ */
++
++static __inline__ int no_work(dlm_t * dlm)
++{
++ int ret;
++
++ spin_lock(&dlm->async_lock);
++
++ ret = list_empty(&dlm->complete) &&
++ list_empty(&dlm->blocking) &&
++ list_empty(&dlm->submit) &&
++ list_empty(&dlm->starts) && !test_bit(DFL_MG_FINISH, &dlm->flags);
++
++ spin_unlock(&dlm->async_lock);
++
++ return ret;
++}
++
++/**
++ * dlm_async - thread for a variety of asynchronous processing
++ * @data:
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int dlm_async(void *data)
++{
++ dlm_t *dlm = (dlm_t *) data;
++ dlm_lock_t *lp = NULL;
++ dlm_start_t *ds = NULL;
++ uint8_t complete, blocking, submit, start, finish;
++ DECLARE_WAITQUEUE(wait, current);
++
++ daemonize("lock_dlm");
++ atomic_inc(&dlm->threads);
++
++ do {
++ current->state = TASK_INTERRUPTIBLE;
++ add_wait_queue(&dlm->wait, &wait);
++ if (no_work(dlm))
++ schedule();
++ remove_wait_queue(&dlm->wait, &wait);
++ current->state = TASK_RUNNING;
++
++ complete = blocking = submit = start = finish = 0;
++
++ spin_lock(&dlm->async_lock);
++
++ if (!list_empty(&dlm->complete)) {
++ lp = list_entry(dlm->complete.next, dlm_lock_t, clist);
++ list_del(&lp->clist);
++ clear_bit(LFL_CLIST, &lp->flags);
++ complete = 1;
++ } else if (!list_empty(&dlm->blocking)) {
++ lp = list_entry(dlm->blocking.next, dlm_lock_t, blist);
++ list_del(&lp->blist);
++ clear_bit(LFL_BLIST, &lp->flags);
++ blocking = lp->bast_mode;
++ lp->bast_mode = 0;
++ } else if (!list_empty(&dlm->submit)) {
++ lp = list_entry(dlm->submit.next, dlm_lock_t, slist);
++ list_del(&lp->slist);
++ clear_bit(LFL_SLIST, &lp->flags);
++ submit = 1;
++ } else if (!list_empty(&dlm->starts)) {
++ ds = list_entry(dlm->starts.next, dlm_start_t, list);
++ list_del(&ds->list);
++ start = 1;
++ } else if (test_and_clear_bit(DFL_MG_FINISH, &dlm->flags)) {
++ finish = 1;
++ }
++
++ spin_unlock(&dlm->async_lock);
++
++ if (complete)
++ process_complete(lp);
++
++ else if (blocking)
++ process_blocking(lp, blocking);
++
++ else if (submit)
++ process_submit(lp);
++
++ else if (start)
++ process_start(dlm, ds);
++
++ else if (finish)
++ process_finish(dlm);
++
++ schedule();
++ }
++ while (!test_bit(DFL_THREAD_STOP, &dlm->flags));
++
++ atomic_dec(&dlm->threads);
++ return 0;
++}
++
++/**
++ * init_async_thread
++ * @dlm:
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int init_async_thread(dlm_t * dlm)
++{
++ int error;
++
++ clear_bit(DFL_THREAD_STOP, &dlm->flags);
++ atomic_set(&dlm->threads, 0);
++
++ error = kernel_thread(dlm_async, dlm, 0);
++ if (error < 0)
++ goto out;
++
++ error = kernel_thread(dlm_async, dlm, 0);
++ if (error < 0) {
++ release_async_thread(dlm);
++ goto out;
++ }
++
++ while (atomic_read(&dlm->threads) != 2)
++ schedule();
++ error = 0;
++
++ out:
++ if (error)
++ printk("lock_dlm: can't start async thread %d\n", error);
++ return error;
++}
++
++/**
++ * release_async_thread
++ * @dlm:
++ *
++ */
++
++void release_async_thread(dlm_t * dlm)
++{
++ set_bit(DFL_THREAD_STOP, &dlm->flags);
++ while (atomic_read(&dlm->threads)) {
++ wake_up(&dlm->wait);
++ schedule();
++ }
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gio_wiretypes.h linux-patched/fs/gfs_locking/lock_gulm/gio_wiretypes.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gio_wiretypes.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gio_wiretypes.h 2004-06-16 12:03:21.956895230 -0500
+@@ -0,0 +1,404 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++#ifndef __gio_wiretypes_h__
++#define __gio_wiretypes_h__
++
++/* An attempt to do something about tracking changes to the protocol
++ * over the wires.
++ * If I were really cute, this would effectively be a checksum of this
++ * file.
++ */
++#define GIO_WIREPROT_VERS (0x67000010)
++
++/***************** Error codes.
++ * Everyone uses these same error codes.
++ */
++#define gio_Err_Ok (0)
++#define gio_Err_BadLogin (1001)
++#define gio_Err_BadCluster (1003)
++#define gio_Err_BadConfig (1004)
++#define gio_Err_BadGeneration (1005)
++#define gio_Err_BadWireProto (1019)
++
++#define gio_Err_NotAllowed (1006)
++#define gio_Err_Unknown_Cs (1007)
++#define gio_Err_BadStateChg (1008)
++#define gio_Err_MemoryIssues (1009)
++
++#define gio_Err_PushQu (1010) /* client should never see this one */
++#define gio_Err_TryFailed (1011)
++#define gio_Err_AlreadyPend (1013)
++#define gio_Err_Canceled (1015)
++
++#define gio_Err_NoSuchFS (1016)
++#define gio_Err_NoSuchJID (1017)
++#define gio_Err_NoSuchName (1018)
++
++/* next free error code: 1002 1012 1014 1020 */
++
++/*
++ * Error: just sort of a generic error code thing.
++ * uint32: gERR
++ * uint32: opcode that this is in reply to. (can be zeros)
++ * uint32: error code
++ */
++#define gulm_err_reply (0x67455252) /* gERR */
++
++#define gulm_nop (0x674e4f50) /* gNOP */
++
++/********************* Core *****************/
++/*
++ * login request
++ * uint32: gCL0
++ * uint32: proto version
++ * string: cluster ID
++ * string: My Name
++ * uint64: generation number
++ * uint32: config CRC
++ * uint32: rank
++ * login reply
++ * uint32: gCL1
++ * uint64: generation number
++ * uint32: error code
++ * uint32: rank
++ * uint8: ama
++ * If I am the Master or Arbitrating, there are no errors, and a
++ * client or slave (not a resource) is connecting, a serialization
++ * of the current nodelist follows.
++ *
++ * logout request:
++ * uint32: gCL2
++ * string: node name
++ * uint8: S/P/A/M/R
++ * logout reply: we don't seem to use this.
++ * uint32: gCL3
++ * uint32: error code
++ *
++ * resource login request:
++ * uint32: gCL4
++ * uint32: proto version
++ * string: cluster ID
++ * string: resource name
++ * uint32: options
++ * login reply (gCL1) is sent in return.
++ *
++ * beat req
++ * uint32: gCB0
++ * string: My Name
++ * beat rpl
++ * uint32: gCB1
++ * uint32: error code
++ *
++ * Membership Request
++ * uint32: gCMA
++ * string: node name
++ *
++ * Membership update
++ * uint32: gCMU
++ * string: node name
++ * IPv6: IP
++ * uint8: Current State
++ *
++ * Membership list request info.
++ * uint32: gCMl
++ *
++ * Membership list info.
++ * uint32: gCML
++ * list_start_marker
++ * string: node name
++ * IPv6: IP
++ * uint8: state
++ * uint8: laststate
++ * uint8: mode (S/P/A/M/C)
++ * uint32: missed beats
++ * uint64: last beat
++ * uint64: delay avg
++ * uint64: max delay
++ * list_stop_marker
++ *
++ * Request Resource info
++ * uint32: gCR0
++ *
++ * Resource list info
++ * uint32: gCR1
++ * list_start_marker
++ * string: name
++ * list_stop_marker
++ *
++ * Force node into Expired:
++ * uint32: gCFE
++ * string: node name
++ *
++ * Core state request:
++ * uint32: gCSR
++ *
++ * Core state changes:
++ * uint32: gCSC
++ * uint8: state (slave, pending, arbitrating, master)
++ * If state == Slave, then the next two will follow.
++ * IPv6: MasterIP
++ * string: MasterName
++ *
++ * Core shutdown req:
++ * uint32: gCSD
++ *
++ * Switch core from current state into Pending:
++ * uint32: gCSP
++ *
++ */
++#define gulm_core_login_req (0x67434c00) /* gCL0 */
++#define gulm_core_login_rpl (0x67434c01) /* gCL1 */
++#define gulm_core_logout_req (0x67434c02) /* gCL2 */
++#define gulm_core_logout_rpl (0x67434c03) /* gCL3 */
++#define gulm_core_reslgn_req (0x67434c04) /* gCL4 */
++#define gulm_core_beat_req (0x67434200) /* gCB0 */
++#define gulm_core_beat_rpl (0x67434201) /* gCB1 */
++#define gulm_core_mbr_req (0x67434d41) /* gCMA */
++#define gulm_core_mbr_updt (0x67434d55) /* gCMU */
++#define gulm_core_mbr_lstreq (0x67434d6c) /* gCMl */
++#define gulm_core_mbr_lstrpl (0x67434d4c) /* gCML */
++#define gulm_core_mbr_force (0x67434645) /* gCFE */
++#define gulm_core_res_req (0x67435200) /* gCR0 */
++#define gulm_core_res_list (0x67435201) /* gCR1 */
++#define gulm_core_state_req (0x67435352) /* gCSR */
++#define gulm_core_state_chgs (0x67435343) /* gCSC */
++#define gulm_core_shutdown (0x67435344) /* gCSD */
++#define gulm_core_forcepend (0x67435350) /* gCSP */
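++
++/* Each opcode packs its mnemonic into a uint32: alphabetic positions
++ * use the ASCII value ('g' == 0x67) while trailing digits use the raw
++ * number (the "0" in gCL0 is 0x00, not '0'). A hypothetical helper for
++ * the all-ASCII codes (not part of this patch):
++ */
++#if 0
++#define GIO_OP(a, b, c, d) \
++ (((uint32_t)(a) << 24) | ((b) << 16) | ((c) << 8) | (d))
++/* gulm_err_reply == GIO_OP('g', 'E', 'R', 'R') */
++#endif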
++
++/* in the st field */
++#define gio_Mbr_Logged_in (0x05)
++#define gio_Mbr_Logged_out (0x06)
++#define gio_Mbr_Expired (0x07)
++#define gio_Mbr_Killed (0x08)
++#define gio_Mbr_OM_lgin (0x09)
++
++/* in the ama field */
++#define gio_Mbr_ama_Slave (0x01)
++#define gio_Mbr_ama_Master (0x02)
++#define gio_Mbr_ama_Pending (0x03)
++#define gio_Mbr_ama_Arbitrating (0x04)
++#define gio_Mbr_ama_Resource (0x05)
++#define gio_Mbr_ama_Client (0x06)
++/* the Client entry is ONLY for mode tracking.
++ * The nodelist reply is the only place it is used.
++ */
++
++/* options that affect behaviors of services (resources) */
++#define gulm_svc_opt_important (0x00000001)
++
++/********************* Info Traffic *****************
++ *
++ * Note that for many of these, they can be sent to all of the servers and
++ * will get sane replies. Some of these can only be sent to specific
++ * servers.
++ *
++ * stats req:
++ * uint32: gIS0
++ * stats rpl:
++ * uint32: gIS1
++ * list start:
++ * string: key
++ * string: value
++ * list stop:
++ * Notes:
++ * The stats reply is a set of string pairs. This way the server can send
++ * whatever things it wants, and the same client code will work for
++ * anything.
++ *
++ * set verbosity:
++ * uint32: gIV0
++ * string: verb flags (with -/+) to [un]set
++ * Note:
++ * We don't bother with a reply for this. If the server got it, it works.
++ * If it didn't, it cannot send an error back anyway.
++ *
++ * close socket:
++ * uint32: gSC0
++ * Note:
++ * Tells the server to close this connection cleanly. We're done with
++ * it. This is *not* the same as logging out. You must log in before
++ * you can log out. And many commands sent from gulm_tool happen without
++ * logging in. These commands would be useful for clients in many cases,
++ * so I don't want to put a close at the end of them, but if I don't,
++ * there will be error messages printed on the console when gulm_tool
++ * calls them.
++ * So we need a way to close a connection cleanly that has not been
++ * logged in.
++ *
++ * request slave list:
++ * uint32: gIL0
++ * slave list reply:
++ * uint32: gIL1
++ * list start:
++ * string: name
++ * uint32: poller idx
++ * list stop:
++ */
++#define gulm_info_stats_req (0x67495300) /* gIS0 */
++#define gulm_info_stats_rpl (0x67495301) /* gIS1 */
++#define gulm_info_set_verbosity (0x67495600) /* gIV0 */
++#define gulm_socket_close (0x67534300) /* gSC0 */
++#define gulm_info_slave_list_req (0x67494c00) /* gIL0 */
++#define gulm_info_slave_list_rpl (0x67494c01) /* gIL1 */
++
++/********************* Lock Traffic *****************
++ * All lock traffic.
++ *
++ * login req:
++ * uint32: gLL0
++ * uint32: proto version
++ * string: node name
++ * uint8: Client/Slave
++ * login rpl:
++ * uint32: gLL1
++ * uint32: error code
++ * uint8: Slave/Master
++ * xdr of current lock state if no errors and master sending reply
++ * and you're a slave.
++ *
++ * logout req:
++ * uint32: gLL2
++ * logout rpl:
++ * uint32: gLL3
++ *
++ * select lockspace:
++ * uint32: gLS0
++ * raw: usually just four bytes for lockspace name.
++ * but can be most anything.
++ *
++ * lock req:
++ * uint32: gLR0
++ * raw: key
++ * uint8: state
++ * uint32: flags
++ * raw: lvb -- Only exists if hasLVB flag is true.
++ * lock rpl:
++ * uint32: gLR1
++ * raw: key
++ * uint8: state
++ * uint32: flags
++ * uint32: error code
++ * raw: lvb -- Only exists if hasLVB flag is true.
++ *
++ * lock state update:
++ * uint32: gLRU
++ * string: node name
++ * raw: key
++ * uint8: state
++ * uint32: flags
++ * raw: lvb -- Only exists if hasLVB flag is true.
++ *
++ * Action req:
++ * uint32: gLA0
++ * raw: key
++ * uint8: action
++ * raw: lvb -- Only exists if action is SyncLVB
++ * Action Rpl:
++ * uint32: gLA1
++ * raw: key
++ * uint8: action
++ * uint32: error code
++ *
++ * Action update:
++ * uint32: gLAU
++ * string: node name
++ * raw: key
++ * uint8: action
++ * raw: lvb -- Only exists if action is SyncLVB
++ *
++ * Slave Update Rply: -- for both actions and requests.
++ * uint32: gLUR
++ * raw: key
++ *
++ * Drop lock Callback:
++ * uint32: gLC0
++ * raw: key
++ * uint8: state
++ *
++ * Drop all locks callback: This is the highwater locks thing
++ * uint32: gLC2
++ *
++ * Drop expired locks:
++ * uint32: gLEO
++ * string: node name; if NULL, then drop all exp for mask.
++ * raw: keymask; if keymask & key == key, then dropexp on this lock.
++ *
++ * Lock list req:
++ * uint32: gLD0
++ * Lock list rpl:
++ * uint32: gLD1
++ * list start mark
++ * uint8: key length
++ * raw: key
++ * uint8: state
++ * uint8: lvb length
++ * if lvb length > 0, raw: LVB
++ * uint32: Holder count
++ * list start mark
++ * string: holders
++ * list stop mark
++ * uint32: LVB holder count
++ * list start mark
++ * string: LVB Holders
++ * list stop mark
++ * uint32: Expired holder count
++ * list start mark
++ * string: ExpHolders
++ * list stop mark
++ * list stop mark
++ *
++ */
++#define gulm_lock_login_req (0x674C4C00) /* gLL0 */
++#define gulm_lock_login_rpl (0x674C4C01) /* gLL1 */
++#define gulm_lock_logout_req (0x674C4C02) /* gLL2 */
++#define gulm_lock_logout_rpl (0x674C4C03) /* gLL3 */
++#define gulm_lock_sel_lckspc (0x674C5300) /* gLS0 */
++#define gulm_lock_state_req (0x674C5200) /* gLR0 */
++#define gulm_lock_state_rpl (0x674C5201) /* gLR1 */
++#define gulm_lock_state_updt (0x674C5255) /* gLRU */
++#define gulm_lock_action_req (0x674C4100) /* gLA0 */
++#define gulm_lock_action_rpl (0x674C4101) /* gLA1 */
++#define gulm_lock_action_updt (0x674C4155) /* gLAU */
++#define gulm_lock_update_rpl (0x674c5552) /* gLUR */
++#define gulm_lock_cb_state (0x674C4300) /* gLC0 */
++#define gulm_lock_cb_dropall (0x674C4302) /* gLC2 */
++#define gulm_lock_drop_exp (0x674C454F) /* gLEO */
++#define gulm_lock_dump_req (0x674c4400) /* gLD0 */
++#define gulm_lock_dump_rpl (0x674c4401) /* gLD1 */
++#define gulm_lock_rerunqueues (0x674c5152) /* gLQR */
++
++/* marks for the login */
++#define gio_lck_st_Slave (0x00)
++#define gio_lck_st_Client (0x01)
++
++/* state change requests */
++#define gio_lck_st_Unlock (0x00)
++#define gio_lck_st_Exclusive (0x01)
++#define gio_lck_st_Deferred (0x02)
++#define gio_lck_st_Shared (0x03)
++/* actions */
++#define gio_lck_st_Cancel (0x09)
++#define gio_lck_st_HoldLVB (0x0b)
++#define gio_lck_st_UnHoldLVB (0x0c)
++#define gio_lck_st_SyncLVB (0x0d)
++
++/* flags */
++#define gio_lck_fg_Do_CB (0x00000001)
++#define gio_lck_fg_Try (0x00000002)
++#define gio_lck_fg_Any (0x00000004)
++#define gio_lck_fg_NoExp (0x00000008)
++#define gio_lck_fg_hasLVB (0x00000010)
++#define gio_lck_fg_Cachable (0x00000020)
++#define gio_lck_fg_Piority (0x00000040)
++
++#endif /*__gio_wiretypes_h__*/
++/* vim: set ai cin et sw=3 ts=3 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm.h linux-patched/fs/gfs_locking/lock_gulm/gulm.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm.h 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,288 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef GULM_DOT_H
++#define GULM_DOT_H
++
++#define GULM_RELEASE_NAME "v6.0.0"
++
++#ifdef MODVERSIONS
++#include <linux/modversions.h>
++#endif /* MODVERSIONS */
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <asm/uaccess.h>
++#include <linux/spinlock.h>
++#include <asm/atomic.h>
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/smp_lock.h>
++#include <linux/ctype.h>
++#include <linux/string.h>
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#if (BITS_PER_LONG == 64)
++#define PRIu64 "lu"
++#define PRId64 "ld"
++#define PRIo64 "lo"
++#define PRIx64 "lx"
++#define PRIX64 "lX"
++#define SCNu64 "lu"
++#define SCNd64 "ld"
++#define SCNo64 "lo"
++#define SCNx64 "lx"
++#define SCNX64 "lX"
++#else
++#define PRIu64 "Lu"
++#define PRId64 "Ld"
++#define PRIo64 "Lo"
++#define PRIx64 "Lx"
++#define PRIX64 "LX"
++#define SCNu64 "Lu"
++#define SCNd64 "Ld"
++#define SCNo64 "Lo"
++#define SCNx64 "Lx"
++#define SCNX64 "LX"
++#endif
++
++#include <linux/list.h>
++
++#undef MAX
++#define MAX(a,b) (((a)>(b))?(a):(b))
++
++#undef MIN
++#define MIN(a,b) (((a)<(b))?(a):(b))
++
++/* Extern Macro */
++
++#ifndef EXTERN
++#define EXTERN extern
++#define INIT(X)
++#else
++#undef EXTERN
++#define EXTERN
++#define INIT(X) =X
++#endif
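++/* That is, exactly one .c file does:
++ * #define EXTERN
++ * #include "gulm.h"
++ * making it the translation unit that actually defines the globals
++ * (INIT(X) expands to "=X" there); everywhere else EXTERN expands to
++ * "extern" and INIT(X) to nothing, giving plain declarations.
++ */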
++
++/* Static Macro */
++#ifndef DEBUG_SYMBOLS
++#define STATIC static
++#else
++#define STATIC
++#endif
++
++/* Divide x by y. Round up if there is a remainder. */
++#define DIV_RU(x, y) (((x) + (y) - 1) / (y))
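++/* e.g. DIV_RU(10, 3) == 4 */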
++
++#include <linux/lm_interface.h>
++
++#include "gulm_prints.h"
++
++#include "libgulm.h"
++
++#include "handler.h"
++
++/* Some fixed length constants.
++ * Some of these should be made dynamic in size in the future.
++ */
++#define GIO_KEY_SIZE (46)
++#define GIO_LVB_SIZE (32)
++#define GIO_NAME_SIZE (32)
++#define GIO_NAME_LEN (GIO_NAME_SIZE-1)
++
++/* What we know about this filesystem */
++struct gulm_fs_s {
++ struct list_head fs_list;
++ char fs_name[GIO_NAME_SIZE]; /* lock table name */
++
++ lm_callback_t cb; /* file system callback function */
++ lm_fsdata_t *fsdata; /* private file system data */
++
++ callback_qu_t cq;
++
++ uint32_t fsJID;
++ uint32_t lvb_size;
++
++ struct semaphore get_lock; /* I am not 100% sure this is needed.
++ * But if it is useless, it only
++ * hurts performance, not
++ * correctness. Sometime post52,
++ * need to investigate.
++ */
++
++ /* Stuff for the first mounter lock and state */
++ int firstmounting;
++ /* the recovery done func needs to behave slightly differently when
++ * we are the first node in an fs.
++ */
++
++ void *mountlock; /* this lock holds the Firstmounter state of the FS */
++ /* This is because all lock traffic is async, and at this point we
++ * really want sync behavior, so I'm left with doing something to
++ * achieve that.
++ *
++ * This works, but it is crufty; still, I don't want to build a huge
++ * queuing system for one lock that we touch twice at the beginning and
++ * once at the end.
++ *
++ * I should change the firstmounter lock to work like the journal locks
++ * and the node locks do. Things are a lot cleaner now with the libgulm
++ * interface than before (when the firstmounter lock code was written).
++ */
++ struct completion sleep;
++
++ /* Stuff for JID mapping locks */
++ uint32_t JIDcount; /* how many JID locks are there. */
++};
++typedef struct gulm_fs_s gulm_fs_t;
++
++/* What we know about each locktable.
++ * Only one nowadays (the LTPX).
++ * */
++typedef struct lock_table_s {
++ uint32_t magic_one;
++
++ int running;
++ struct task_struct *recver_task;
++ struct completion startup;
++ struct semaphore sender;
++
++ struct task_struct *sender_task;
++ wait_queue_head_t send_wchan;
++ spinlock_t queue_sender;
++ struct list_head to_be_sent;
++
++ int hashbuckets;
++ spinlock_t *hshlk;
++ struct list_head *lkhsh;
++
++ /* stats
++ * it may be wise to make some of these into atomic numbers.
++ * or something. or not.
++ * */
++ uint32_t locks_total;
++ uint32_t locks_unl;
++ uint32_t locks_exl;
++ uint32_t locks_shd;
++ uint32_t locks_dfr;
++ uint32_t locks_lvbs;
++ atomic_t locks_pending;
++ /* cannot count expired locks here; clients don't know about them */
++
++ uint32_t lops; /* just incr on each op */
++
++} lock_table_t;
++
++typedef struct gulm_cm_s {
++ uint8_t myName[64];
++ uint8_t clusterID[256]; /* doesn't need to be 256. */
++ uint8_t loaded; /* True|False whether we grabbed the config data */
++ uint8_t starts;
++
++ uint32_t handler_threads; /* how many to have */
++ uint32_t verbosity;
++
++ uint64_t GenerationID;
++
++ lock_table_t ltpx;
++
++ gulm_interface_p hookup;
++
++} gulm_cm_t;
++
++/* things about each lock. */
++typedef struct gulm_lock_s {
++ struct list_head gl_list;
++ atomic_t count;
++
++ uint32_t magic_one;
++ gulm_fs_t *fs; /* which filesystem we belong to. */
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++ uint8_t last_suc_state; /* last state we successfully got. */
++ char *lvb;
++
++ /* this is true when there is a lock request sent out for this lock.
++ * All it really means is that if we've lost the master, and reconnect
++ * to another, this lock needs to have its request resent.
++ *
++ * This now has two stages, since a lock could be pending but still in
++ * the send queue, and we don't want to resend requests that haven't
++ * been sent yet.
++ *
++ * We don't handle master losses here any more; LTPX does that for
++ * us. Should consider removing the duplicated code then.
++ */
++ int actuallypending; /* may need to be atomic */
++ int in_to_be_sent;
++
++ enum { glck_nothing, glck_action, glck_state } req_type;
++ /* these three for the lock req. We save them here so we can rebuild
++ * the lock request if there was a server failover. (?still needed?)
++ */
++ unsigned int cur_state;
++ unsigned int req_state;
++ unsigned int flags;
++
++ /* these three for actions. First is the action, next is result, last is
++ * what threads wait on for the reply.
++ */
++ int action;
++ int result; /* ok, both are using this. */
++ struct completion actsleep;
++
++} gulm_lock_t;
++
++/*****************************************************************************/
++/* cross-pollinate prototypes */
++
++/* from gulm_lt.c */
++void lt_logout (void);
++int lt_login (void);
++int get_mount_lock (gulm_fs_t * fs, int *first);
++int downgrade_mount_lock (gulm_fs_t * fs);
++int drop_mount_lock (gulm_fs_t * fs);
++int send_drop_all_exp (lock_table_t * lt);
++int send_drop_exp (gulm_fs_t * fs, lock_table_t * lt, char *name);
++
++/*from gulm_core.c */
++void cm_logout (void);
++int cm_login (void);
++void delete_ipnames (struct list_head *namelist);
++
++/* from gulm_fs.c */
++void init_gulm_fs (void);
++void request_journal_replay (uint8_t * name);
++void passup_droplocks (void);
++gulm_fs_t *get_fs_by_name (uint8_t * name);
++void dump_internal_lists (void);
++void gulm_recovery_done (lm_lockspace_t * lockspace,
++ unsigned int jid, unsigned int message);
++void gulm_unmount (lm_lockspace_t * lockspace);
++void gulm_others_may_mount (lm_lockspace_t * lockspace);
++int gulm_mount (char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t * fsdata,
++ unsigned int min_lvb_size, struct lm_lockstruct *lockstruct);
++
++extern struct lm_lockops gulm_ops;
++
++#endif /* GULM_DOT_H */
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_core.c linux-patched/fs/gfs_locking/lock_gulm/gulm_core.c
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_core.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_core.c 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,255 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include "util.h"
++#include "utils_tostr.h"
++
++extern gulm_cm_t gulm_cm;
++
++/* private vars. */
++int cm_thd_running;
++struct completion cm_thd_startup;
++struct task_struct *cm_thd_task;
++
++/**
++ */
++int
++gulm_core_login_reply (void *misc, uint64_t gen, uint32_t error,
++ uint32_t rank, uint8_t corestate)
++{
++ if (error != 0) {
++ log_err ("Core returned error %d:%s.\n", error,
++ gio_Err_to_str (error));
++ cm_thd_running = FALSE;
++ return error;
++ }
++
++ if( gulm_cm.GenerationID != 0 ) {
++ GULM_ASSERT(gulm_cm.GenerationID == gen,
++ printk("us: %"PRIu64" them: %"PRIu64"\n",
++ gulm_cm.GenerationID,gen);
++ );
++ }
++ gulm_cm.GenerationID = gen;
++
++ error = lt_login ();
++ if (error != 0) {
++ log_err ("lt_login failed. %d\n", error);
++ lg_core_logout (gulm_cm.hookup); /* XXX is this safe? */
++ return error;
++ }
++
++ log_msg (lgm_Network2, "Logged into local core.\n");
++
++ return 0;
++}
++
++/**
++ * gulm_core_logout_reply -
++ * @misc:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_core_logout_reply (void *misc)
++{
++ log_msg (lgm_Network2, "Logged out of local core.\n");
++ return 0;
++}
++
++/**
++ */
++int
++gulm_core_nodechange (void *misc, char *nodename,
++ struct in6_addr *nodeip, uint8_t nodestate)
++{
++ if (nodestate == lg_core_Fenced) {
++ request_journal_replay (nodename);
++ }
++ /* if it is us and the state is logout, we need to close things out
++ * if we can.
++ */
++ if (gulm_cm.starts && nodestate == lg_core_Logged_out &&
++ strcmp(gulm_cm.myName, nodename) == 0 ) {
++ lt_logout();
++ cm_thd_running = FALSE;
++ lg_core_logout (gulm_cm.hookup);
++ return -1;
++ }
++ return 0;
++}
++
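++/* If the caller passed an int* through @misc, it is set TRUE while the
++ * local core is still neither Slave nor Master (i.e. the caller should
++ * keep waiting), and FALSE once it is one of the two.
++ */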
++int gulm_core_statechange (void *misc, uint8_t corestate,
++ struct in6_addr *masterip, char *mastername)
++{
++ int *cst = (int *)misc;
++ if( misc != NULL ) {
++ if( corestate != lg_core_Slave &&
++ corestate != lg_core_Master ) {
++ *cst = TRUE;
++ }else{
++ *cst = FALSE;
++ }
++ }
++ return 0;
++}
++
++/**
++ */
++int
++gulm_core_error (void *misc, uint32_t err)
++{
++ log_err ("Got error code %d %#x back fome some reason!\n", err, err);
++ return 0;
++}
++
++static lg_core_callbacks_t core_cb = {
++ login_reply:gulm_core_login_reply,
++ logout_reply:gulm_core_logout_reply,
++ nodechange:gulm_core_nodechange,
++ statechange:gulm_core_statechange,
++ error:gulm_core_error
++};
++
++/**
++ * cm_io_recving_thread -
++ * @data:
++ *
++ *
++ * Returns: int
++ */
++int
++cm_io_recving_thread (void *data)
++{
++ int err;
++
++ daemonize ("gulm_res_recvd");
++ cm_thd_task = current;
++ complete (&cm_thd_startup);
++
++ while (cm_thd_running) {
++ err = lg_core_handle_messages (gulm_cm.hookup, &core_cb, NULL);
++ if (err != 0) {
++ log_err
++ ("Got an error in gulm_res_recvd err: %d\n", err);
++ if (!cm_thd_running)
++ break;
++ /*
++ * Pause a bit, then try to log back into the local
++ * lock_gulmd. Keep doing this until an outside force
++ * stops us. (I don't think there is any such force at
++ * this point; forceunmount would be one, if we ever do
++ * that.)
++ *
++ * If we are still in the gulm_mount() function, we
++ * should not retry. We should just exit.
++ */
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout (3 * HZ);
++
++ while ((err =
++ lg_core_login (gulm_cm.hookup, TRUE)) != 0) {
++ log_err
++ ("Got a %d trying to login to lock_gulmd. Is it running?\n",
++ err);
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout (3 * HZ);
++ }
++ }
++ } /* while (cm_thd_running) */
++
++ complete (&cm_thd_startup);
++ return 0;
++}
++
++/**
++ * cm_logout -
++ */
++void
++cm_logout (void)
++{
++
++ if (cm_thd_running) {
++ cm_thd_running = FALSE;
++ lg_core_logout (gulm_cm.hookup);
++
++ /* wait for thread to finish */
++ wait_for_completion (&cm_thd_startup);
++ }
++
++}
++
++/**
++ * cm_login -
++ *
++ * Returns: int
++ */
++int
++cm_login (void)
++{
++ int err = -1;
++ int cst=TRUE;
++
++ cm_thd_running = FALSE;
++ init_completion (&cm_thd_startup);
++
++ err = lg_core_login (gulm_cm.hookup, TRUE);
++ if (err != 0) {
++ log_err
++ ("Got a %d trying to login to lock_gulmd. Is it running?\n",
++ err);
++ goto exit;
++ }
++ /* handle login reply. which will start the lt thread. */
++ err = lg_core_handle_messages (gulm_cm.hookup, &core_cb, NULL);
++ if (err != 0) {
++ goto exit;
++ }
++
++ /* do not pass go until Slave(client) or Master */
++ while(cst) {
++ lg_core_corestate(gulm_cm.hookup);
++ err = lg_core_handle_messages (gulm_cm.hookup, &core_cb, &cst);
++ if (err != 0) {
++ goto exit;
++ }
++ if(cst) {
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout (3 * HZ);
++ /* if interrupted, exit */
++ }
++ }
++
++ /* start recver thread. */
++ cm_thd_running = TRUE;
++ err = kernel_thread (cm_io_recving_thread, NULL, 0);
++ if (err < 0) {
++ log_err ("Failed to start gulm_res_recvd. (%d)\n", err);
++ goto exit;
++ }
++ wait_for_completion (&cm_thd_startup);
++
++ err = 0;
++ exit:
++ return err;
++}
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_fs.c linux-patched/fs/gfs_locking/lock_gulm/gulm_fs.c
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_fs.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_fs.c 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,613 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include "util.h"
++#include "load_info.h"
++#include "handler.h"
++#include "gulm_procinfo.h"
++#include "gulm_jid.h"
++
++/* things about myself */
++extern gulm_cm_t gulm_cm;
++
++/* globals for this file.*/
++uint32_t filesystems_count = 0;
++LIST_HEAD (filesystems_list);
++struct semaphore filesystem_lck; /* we use a sema instead of a spinlock
++ * here because of all the interruptible
++ * things we do while holding it.
++ * If I stop doing nasty things within
++ * it, it doesn't need to be a sema.
++ */
++struct semaphore start_stop_lock;
++atomic_t start_stop_cnt;
++
++/**
++ * init_gulm_fs -
++ */
++void
++init_gulm_fs (void)
++{
++ init_MUTEX (&filesystem_lck);
++ init_MUTEX (&start_stop_lock);
++ atomic_set (&start_stop_cnt, 0);
++}
++
++/*****************************************************************************/
++struct rjrpf_s {
++ gulm_fs_t *fs;
++ uint8_t *name;
++};
++
++void
++request_journal_replay_per_fs (void *d)
++{
++ struct rjrpf_s *rf = (struct rjrpf_s *) d;
++ uint32_t jid;
++ unsigned int ujid;
++
++ /* lookup jid <=> name mapping */
++ if (find_jid_by_name_and_mark_replay (rf->fs, rf->name, &jid) != 0) {
++ log_msg (lgm_JIDMap,
++ "In fs (%s), no jid for name (%s) was found.\n",
++ rf->fs->fs_name, rf->name);
++ } else {
++ log_msg (lgm_JIDMap,
++ "In fs (%s), jid %d was found for name (%s).\n",
++ rf->fs->fs_name, jid, rf->name);
++
++ /* All that the replay-journal callback into gfs does is malloc
++ * some memory and add it to a list, so we really don't need to
++ * queue that action ourselves.
++ *
++ * This will need to change if gfs changes.
++ *
++ * Basically, we assume that the callback is non-blocking.
++ */
++ ujid = jid;
++ rf->fs->cb (rf->fs->fsdata, LM_CB_NEED_RECOVERY, &ujid);
++ }
++
++ kfree (rf->name);
++ kfree (rf);
++
++}
++
++/**
++ * request_journal_replay - give a journal replay request to mounted filesystems
++ * @name: < the name of the node that died.
++ *
++ *
++ * Returns: void
++ */
++void
++request_journal_replay (uint8_t * name)
++{
++ struct list_head *tmp;
++ gulm_fs_t *fs;
++ struct rjrpf_s *rf;
++
++ log_msg (lgm_Always, "Checking for journals for node \"%s\"\n",
++ name);
++
++ down (&filesystem_lck);
++
++ list_for_each (tmp, &filesystems_list) {
++ fs = list_entry (tmp, gulm_fs_t, fs_list);
++
++ /* we don't want to process replay requests when we are
++ * still in the first mounter state. All the journals are
++ * getting replayed anyway, and there could be some issue
++ * with stuff happening twice.
++ */
++ if (fs->firstmounting)
++ continue;
++
++ /* due to the way the new jid mapping code works, we had to
++ * move it out of here.
++ */
++
++ rf = kmalloc (sizeof (struct rjrpf_s), GFP_KERNEL);
++ GULM_ASSERT (rf != NULL,);
++
++ rf->fs = fs;
++ rf->name = kmalloc (strlen (name) + 1, GFP_KERNEL);
++ GULM_ASSERT (rf->name != NULL,);
++ memcpy (rf->name, name, strlen (name) + 1);
++
++ qu_function_call (&fs->cq, request_journal_replay_per_fs, rf);
++
++ }
++ up (&filesystem_lck);
++}
++
++/**
++ * passup_droplocks -
++ */
++void
++passup_droplocks (void)
++{
++ struct list_head *tmp;
++ gulm_fs_t *fs;
++ down (&filesystem_lck);
++ list_for_each (tmp, &filesystems_list) {
++ fs = list_entry (tmp, gulm_fs_t, fs_list);
++ qu_drop_req (&fs->cq, fs->cb, fs->fsdata, LM_CB_DROPLOCKS, 0,
++ 0);
++ /* If this decides to block someday, we need to change this function.
++ */
++ }
++ up (&filesystem_lck);
++}
++
++/**
++ * dump_internal_lists -
++ *
++ */
++void
++dump_internal_lists (void)
++{
++ struct list_head *tmp;
++ gulm_fs_t *fs;
++ down (&filesystem_lck);
++ list_for_each (tmp, &filesystems_list) {
++ fs = list_entry (tmp, gulm_fs_t, fs_list);
++ log_msg (lgm_Always, "Handler queue for %s\n", fs->fs_name);
++ display_handler_queue (&fs->cq);
++ /* other lists? */
++ }
++ up (&filesystem_lck);
++}
++
++/**
++ * get_fs_by_name -
++ * @name:
++ *
++ *
++ * Returns: gulm_fs_t
++ */
++gulm_fs_t *
++get_fs_by_name (uint8_t * name)
++{
++ struct list_head *tmp;
++ gulm_fs_t *fs = NULL;
++ down (&filesystem_lck);
++ list_for_each (tmp, &filesystems_list) {
++ fs = list_entry (tmp, gulm_fs_t, fs_list);
++ if (strcmp (name, fs->fs_name) == 0) {
++ up (&filesystem_lck);
++ return fs;
++ }
++ }
++ up (&filesystem_lck);
++ return NULL;
++}
++
++/*****************************************************************************/
++
++/**
++ * clear_locks -
++ *
++ * quick check to see if there was leaking
++ * should I panic on these? or just complain?
++ *
++ * Returns: void
++ */
++void
++clear_locks (void)
++{
++ int i;
++ lock_table_t *lt = &gulm_cm.ltpx;
++
++ for (i = 0; i < lt->hashbuckets; i++) {
++ struct list_head *lcktmp, *lckfoo;
++ spin_lock (<->hshlk[i]);
++ list_for_each_safe (lcktmp, lckfoo, <->lkhsh[i]) {
++ gulm_lock_t *lck = NULL;
++ lck = list_entry (lcktmp, gulm_lock_t, gl_list);
++ /* need to release it. umm, should any even exist? */
++ log_err ("AH! Rogue lock buffer! refcount:%d\n",
++ atomic_read (&lck->count));
++
++ if (lck->lvb) {
++ log_err ("AH! Rogue lock buffer with LVB!\n");
++ kfree (lck->lvb);
++ }
++
++ list_del (lcktmp);
++ kfree (lck);
++
++ }
++ spin_unlock (<->hshlk[i]);
++ }
++ kfree (lt->hshlk);
++ lt->hshlk = NULL;
++ kfree (lt->lkhsh);
++ lt->lkhsh = NULL;
++}
++
++/*****************************************************************************/
++/**
++ * start_gulm_threads -
++ * @host_data:
++ *
++ *
++ * Returns: int
++ */
++int
++start_gulm_threads (char *csnm, char *host_data)
++{
++ int error = 0;
++
++ down (&start_stop_lock);
++ atomic_inc (&start_stop_cnt);
++ if (atomic_read (&start_stop_cnt) == 1) {
++ /* first one. get stuff going */
++ strncpy (gulm_cm.clusterID, csnm, 255);
++ gulm_cm.clusterID[255] = '\0';
++
++ error = lg_initialize (&gulm_cm.hookup, gulm_cm.clusterID,
++ "GFS Kernel Interface");
++ if (error != 0) {
++ log_err ("lg_initialize failed, %d\n", error);
++ goto fail;
++ }
++ gulm_cm.starts = TRUE;
++
++ error = load_info (host_data);
++ if (error != 0) {
++ log_err ("load_info failed. %d\n", error);
++ goto fail;
++ }
++
++ jid_init ();
++
++ error = cm_login ();
++ if (error != 0) {
++ log_err ("cm_login failed. %d\n", error);
++ goto fail;
++ }
++
++ /* lt_login() is called after the success packet for cm_login()
++ * returns.
++ */
++ }
++ fail:
++ up (&start_stop_lock);
++ return error;
++}
++
++/**
++ * stop_gulm_threads -
++ */
++void
++stop_gulm_threads (void)
++{
++ down (&start_stop_lock);
++ atomic_dec (&start_stop_cnt);
++ if (atomic_read (&start_stop_cnt) == 0) {
++ /* last one, put it all away. */
++ lt_logout ();
++ cm_logout ();
++ clear_locks ();
++ lg_release (gulm_cm.hookup);
++ gulm_cm.hookup = NULL;
++ gulm_cm.loaded = FALSE;
++ gulm_cm.GenerationID = 0;
++ }
++ up (&start_stop_lock);
++}
++
++/*****************************************************************************/
++
++/**
++ * gulm_mount
++ * @table_name: clusterID:FS_Name (e.g. "mycluster:gfs1")
++ * @host_data:
++ * @cb: GFS callback function
++ * @fsdata: opaque GFS handle
++ * @lockstruct: the structure of crap to fill in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++int
++gulm_mount (char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t * fsdata,
++ unsigned int min_lvb_size, struct lm_lockstruct *lockstruct)
++{
++ gulm_fs_t *gulm;
++ char work[256], *tbln;
++ int first;
++ int error = -1;
++ struct list_head *lltmp;
++
++ strncpy (work, table_name, sizeof (work) - 1);
++ work[sizeof (work) - 1] = '\0'; /* strncpy does not guarantee termination */
++
++ tbln = strstr (work, ":");
++ if (tbln == NULL) {
++ log_err
++ ("Malformed table name. Couldn't find separator ':' between "
++ "clusterID and lockspace name.\n");
++ error = -1;
++ goto fail;
++ }
++ *tbln++ = '\0';
++
++ /* make sure that the cluster name exists. */
++ if (strlen (work) <= 0) {
++ log_err ("Cluster name \"%s\" is too short.\n", work);
++ error = -EPROTO;
++ goto fail;
++ }
++ if (strlen (work) > 16) {
++ log_err ("Cluster name \"%s\" is too long.\n", work);
++ error = -EPROTO;
++ goto fail;
++ }
++
++ /* the second one is an artifact of the way I use the name.
++ * A better fix to this will happen when I actually get dynamic key
++ * lengths working.
++ */
++ if (strlen (tbln) > MIN (GIO_NAME_LEN, (GIO_KEY_SIZE - 13))) {
++ log_err
++ ("Warning! lockspace name (%s) is longer than %d chars!\n",
++ tbln, MIN (GIO_NAME_LEN, (GIO_KEY_SIZE - 13)));
++ error = -EPROTO;
++ goto fail;
++ }
++ if (strlen (tbln) <= 0) {
++ log_err ("Table name \"%s\" is too short.\n", tbln);
++ error = -EPROTO;
++ goto fail;
++ }
++
++ /* Check to make sure this lock table isn't already being used */
++ down (&filesystem_lck);
++ list_for_each (lltmp, &filesystems_list) {
++ gulm = list_entry (lltmp, gulm_fs_t, fs_list);
++ if (!strncmp (gulm->fs_name, tbln, GIO_NAME_LEN)) {
++ log_err ("\"%s\" is already in use\n", tbln);
++ error = -EEXIST;
++ up (&filesystem_lck);
++ goto fail;
++ }
++ }
++ up (&filesystem_lck);
++
++ /* Set up our main structure */
++
++ gulm = kmalloc (sizeof (gulm_fs_t), GFP_KERNEL);
++ if (!gulm) {
++ log_err ("out of memory\n");
++ error = -ENOMEM;
++ goto fail;
++ }
++ memset (gulm, 0, sizeof (gulm_fs_t));
++
++ INIT_LIST_HEAD (&gulm->fs_list);
++
++ strncpy (gulm->fs_name, tbln, GIO_NAME_LEN);
++ gulm->cb = cb;
++ gulm->fsdata = fsdata;
++ gulm->lvb_size = min_lvb_size;
++ init_completion (&gulm->sleep);
++ init_MUTEX (&gulm->get_lock);
++
++ if ((error = start_gulm_threads (work, host_data)) != 0) {
++ log_err ("Got a %d trying to start the threads.\n", error);
++ goto fail_free_gulm;
++ }
++
++ if ((error =
++ start_callback_qu (&gulm->cq, gulm_cm.handler_threads)) < 0) {
++ log_err ("fsid=%s: Failed to start the callback handler.\n",
++ gulm->fs_name);
++ goto fail_free_gulm;
++ }
++
++ /* the mount lock HAS to be the first thing done in the LTs for this fs. */
++ error = get_mount_lock (gulm, &first);
++ if (error != 0) {
++ log_err
++ ("fsid=%s: Error %d while trying to get the mount lock\n",
++ gulm->fs_name, error);
++ goto fail_callback;
++ }
++
++ jid_lockstate_reserve (gulm, first);
++ jid_fs_init (gulm);
++ get_journalID (gulm);
++
++ /* things act a bit different until the first mounter is finished.
++ */
++ if (first)
++ gulm->firstmounting = TRUE;
++
++ /* Success */
++ down (&filesystem_lck);
++ list_add (&gulm->fs_list, &filesystems_list);
++ filesystems_count++;
++ up (&filesystem_lck);
++
++ log_msg (lgm_JIDMap, "fsid=%s: We will be using jid %d\n",
++ gulm->fs_name, gulm->fsJID);
++
++ if (add_to_proc (gulm) != 0) {
++ /* ignored for now */
++ }
++
++ lockstruct->ls_jid = gulm->fsJID;
++ lockstruct->ls_first = first;
++ lockstruct->ls_lvb_size = gulm->lvb_size;
++ lockstruct->ls_lockspace = gulm;
++ lockstruct->ls_ops = &gulm_ops;
++#ifdef USE_SYNC_LOCKING
++ lockstruct->ls_flags = 0;
++
++ log_msg (lgm_Network2, "Done: %s, sync mode\n", table_name);
++#else
++ lockstruct->ls_flags = LM_LSFLAG_ASYNC;
++
++ log_msg (lgm_Network2, "Done: %s, async mode\n", table_name);
++#endif
++
++ gulm_cm.starts = FALSE;
++ return 0;
++
++ fail_callback:
++ stop_callback_qu (&gulm->cq);
++
++ fail_free_gulm:
++ kfree (gulm);
++ stop_gulm_threads ();
++
++ fail:
++
++ gulm_cm.starts = FALSE;
++ log_msg (lgm_Always, "fsid=%s: Exiting gulm_mount with errors %d\n",
++ table_name, error);
++ return error;
++}
++
++/**
++ * gulm_others_may_mount
++ * @lockspace: handle to specific lock space
++ *
++ * GFS calls this function if it was the first mounter after it's done
++ * checking all the journals.
++ *
++ */
++void
++gulm_others_may_mount (lm_lockspace_t * lockspace)
++{
++ gulm_fs_t *fs = (gulm_fs_t *) lockspace;
++ int err = 0;
++ lock_table_t *lt = &gulm_cm.ltpx;
++
++ /* first send the drop all exp message.
++ * */
++ err = send_drop_exp (fs, lt, NULL);
++ if (err < 0)
++ log_err
++ ("fsid=%s: Problems sending DropExp request to LTPX: %d\n",
++ fs->fs_name, err);
++
++ /* then move the FirstMountLock to shared so others can mount. */
++ err = downgrade_mount_lock (fs);
++
++ if (err < 0) {
++ log_err ("fsid=%s: error sending Fs_FinMount_Req.(%d)\n",
++ fs->fs_name, err);
++ }
++
++ /* first mounter is all done. let the gulm_recovery_done function
++ * behave as normal now.
++ */
++ fs->firstmounting = FALSE;
++}
++
++/**
++ * gulm_umount
++ * @lockspace: handle to specific lock space
++ *
++ */
++void
++gulm_unmount (lm_lockspace_t * lockspace)
++{
++ gulm_fs_t *gulm_fs = (gulm_fs_t *) lockspace;
++
++ down (&filesystem_lck);
++ list_del (&gulm_fs->fs_list);
++ --filesystems_count;
++ up (&filesystem_lck);
++
++ /* close and release stuff */
++ drop_mount_lock (gulm_fs);
++ put_journalID (gulm_fs);
++ jid_fs_release (gulm_fs);
++ jid_lockstate_release (gulm_fs);
++
++ stop_callback_qu (&gulm_fs->cq);
++
++ remove_from_proc (gulm_fs);
++
++ kfree (gulm_fs);
++
++ stop_gulm_threads ();
++
++}
++
++/**
++ * gulm_recovery_done -
++ * @lockspace:
++ * @jid:
++ *
++ * Returns: void
++ */
++void
++gulm_recovery_done (lm_lockspace_t * lockspace, unsigned int jid,
++ unsigned int message)
++{
++ gulm_fs_t *fs = (gulm_fs_t *) lockspace;
++ int err;
++ uint8_t name[256];
++
++ if (message != LM_RD_SUCCESS) {
++ /* Need to start thinking about how I want to use this... */
++ return;
++ }
++
++ if (jid == fs->fsJID) { /* this may be drifting crud through. */
++ /* hey! it's me! */
++ strncpy (name, gulm_cm.myName, 256);
++ } else if (lookup_name_by_jid (fs, jid, name) != 0) {
++ log_msg (lgm_JIDMap,
++ "fsid=%s: Could not find a client for jid %d\n",
++ fs->fs_name, jid);
++ return;
++ }
++ if (strlen (name) == 0) {
++ log_msg (lgm_JIDMap, "fsid=%s: No one mapped to jid %d\n",
++ fs->fs_name, jid);
++ return;
++ }
++ log_msg (lgm_JIDMap, "fsid=%s: Found %s for jid %d\n",
++ fs->fs_name, name, jid);
++
++ err = send_drop_exp (fs, &gulm_cm.ltpx, name);
++
++ if (jid != fs->fsJID) {
++ /* rather dumb to do this to ourselves right after we mount... */
++ log_msg (lgm_JIDMap,
++ "fsid=%s: Clearing JID %d for use by others\n",
++ fs->fs_name, jid);
++ release_JID (fs, jid, FALSE);
++ }
++
++ /* If someone died while replaying someone else's journal, there will be
++ * stale expired jids.
++ */
++ check_for_stale_expires (fs);
++
++}
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_jid.c linux-patched/fs/gfs_locking/lock_gulm/gulm_jid.c
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_jid.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_jid.c 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,806 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include "util.h"
++
++extern gulm_cm_t gulm_cm;
++
++/****************************************************************************/
++
++/* jid locks:
++ *
++ * Header lock: "JHeader" + \0\0\0 + fsname
++ * lvb: <uint32> :number of JIDs
++ * Mappinglock: "JM" + <uint32> + \0\0\0\0 + fsname
++ * lvb: [012] + <node name>
++ * 0: unused
++ * 1: replaying journal
++ * 2: Mounted
++ * list lock : "JL" + "listlock" + fsname
++ * Node Locks : "JN" + <nodename[8]> + fsname
++ *
++ */
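++/* A worked example (derived from jid_get_lock_name() below): for
++ * fsname "gfs1" and jid 3, the mapping lock key is the bytes
++ * 'J' 'M' 0x03 0x00 0x00 0x00 0x00 0x00 0x00 0x00 'g' 'f' 's' '1' '\0'
++ * -- the jid sits little-endian in bytes 2-5, bytes 6-9 are zero
++ * padding, and the NUL-terminated fsname starts at byte 10. Its lvb
++ * would hold 0x02 "nodename\0" while that node has the journal mounted.
++ */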
++#define jid_header_lvb_size (8)
++
++struct jid_lookup_item_s {
++ struct list_head jp_list;
++ uint8_t *key;
++ uint16_t keylen;
++ uint8_t *lvb;
++ uint16_t lvblen;
++ struct completion waitforit;
++};
++typedef struct jid_lookup_item_s jid_lookup_item_t;
++
++LIST_HEAD (jid_pending_locks);
++spinlock_t jid_pending;
++struct semaphore jid_listlock;
++
++/**
++ * jid_init -
++ */
++void
++jid_init (void)
++{
++ spin_lock_init (&jid_pending);
++ init_MUTEX (&jid_listlock);
++}
++
++/**
++ * jid_get_header_name -
++ * @fs: <
++ * @key: <>
++ * @keylen: <>
++ *
++ * key is buffer to write to, keylen is size of buffer on input, and real
++ * length on output.
++ *
++ * Returns: int
++ */
++int
++jid_get_header_name (uint8_t * fsname, uint8_t * key, uint16_t * keylen)
++{
++ int len;
++ len = strlen (fsname);
++ if ((len + 11) > *keylen)
++ return -EINVAL;
++ memcpy (key, "JHeader\0\0\0", 10);
++ memcpy (&key[10], fsname, len + 1);
++ *keylen = len + 11;
++ return 0;
++}
++
++int
++jid_get_listlock_name (uint8_t * fsname, uint8_t * key, uint16_t * keylen)
++{
++ int len;
++ len = strlen (fsname);
++ if ((len + 11) > *keylen)
++ return -EINVAL;
++ memcpy (key, "JLlistlock", 10);
++ memcpy (&key[10], fsname, len + 1);
++ *keylen = len + 11;
++ return 0;
++}
++
++/**
++ * jid_get_lock_name -
++ * @fs: <
++ * @jid: <
++ * @key: <>
++ * @keylen: <>
++ *
++ * key is buffer to write to, keylen is size of buffer on input, and real
++ * length on output.
++ *
++ * Returns: int
++ */
++int
++jid_get_lock_name (uint8_t * fsname, uint32_t jid, uint8_t * key,
++ uint16_t * keylen)
++{
++ int len;
++ len = strlen (fsname);
++ if ((len + 11) > *keylen)
++ return -EINVAL;
++ key[0] = 'J';
++ key[1] = 'M';
++ key[5] = (jid >> 24) & 0xff;
++ key[4] = (jid >> 16) & 0xff;
++ key[3] = (jid >> 8) & 0xff;
++ key[2] = (jid >> 0) & 0xff;
++ key[6] = 0;
++ key[7] = 0;
++ key[8] = 0;
++ key[9] = 0;
++ memcpy (&key[10], fsname, len + 1);
++ *keylen = len + 11;
++ return 0;
++}
++
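++/* The helpers below (jid_hold_lvb, jid_unhold_lvb, jid_sync_lvb, and
++ * jid_get_lock_state_inr) all follow the same pattern to make the async
++ * libgulm calls synchronous: queue a completion on jid_pending_locks
++ * keyed by the lock name, fire off the request, and sleep until
++ * jid_action_reply()/jid_state_reply() matches the key and completes us.
++ */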
++/**
++ * jid_hold_lvb -
++ * @key:
++ * @keylen:
++ *
++ *
++ */
++void
++jid_hold_lvb (uint8_t * key, uint16_t keylen)
++{
++ jid_lookup_item_t jp;
++ GULM_ASSERT (keylen > 6,);
++ jp.key = key;
++ jp.keylen = keylen;
++ jp.lvb = NULL;
++ jp.lvblen = 0;
++ INIT_LIST_HEAD (&jp.jp_list);
++ init_completion (&jp.waitforit);
++
++ spin_lock (&jid_pending);
++ list_add (&jp.jp_list, &jid_pending_locks);
++ spin_unlock (&jid_pending);
++
++ lg_lock_action_req (gulm_cm.hookup, key, keylen, lg_lock_act_HoldLVB,
++ NULL, 0);
++
++ wait_for_completion (&jp.waitforit);
++}
++
++void
++jid_unhold_lvb (uint8_t * key, uint16_t keylen)
++{
++ jid_lookup_item_t jp;
++ GULM_ASSERT (keylen > 6,);
++ jp.key = key;
++ jp.keylen = keylen;
++ jp.lvb = NULL;
++ jp.lvblen = 0;
++ INIT_LIST_HEAD (&jp.jp_list);
++ init_completion (&jp.waitforit);
++
++ spin_lock (&jid_pending);
++ list_add (&jp.jp_list, &jid_pending_locks);
++ spin_unlock (&jid_pending);
++
++ lg_lock_action_req (gulm_cm.hookup, key, keylen, lg_lock_act_UnHoldLVB,
++ NULL, 0);
++
++ wait_for_completion (&jp.waitforit);
++}
++
++void
++jid_sync_lvb (uint8_t * key, uint16_t keylen, uint8_t * lvb, uint16_t lvblen)
++{
++ jid_lookup_item_t jp;
++ GULM_ASSERT (keylen > 6,);
++ jp.key = key;
++ jp.keylen = keylen;
++ jp.lvb = NULL;
++ jp.lvblen = 0;
++ INIT_LIST_HEAD (&jp.jp_list);
++ init_completion (&jp.waitforit);
++
++ spin_lock (&jid_pending);
++ list_add (&jp.jp_list, &jid_pending_locks);
++ spin_unlock (&jid_pending);
++
++ lg_lock_action_req (gulm_cm.hookup, key, keylen, lg_lock_act_SyncLVB,
++ lvb, lvblen);
++
++ wait_for_completion (&jp.waitforit);
++}
++
++/**
++ * jid_action_reply -
++ * @key:
++ * @keylen:
++ *
++ * called from the lock handler callback.
++ *
++ * Returns: void
++ */
++void
++jid_action_reply (uint8_t * key, uint16_t keylen)
++{
++ struct list_head *tmp, *nxt;
++ jid_lookup_item_t *jp, *fnd = NULL;
++ spin_lock (&jid_pending);
++ list_for_each_safe (tmp, nxt, &jid_pending_locks) {
++ jp = list_entry (tmp, jid_lookup_item_t, jp_list);
++ if (memcmp (key, jp->key, MIN (keylen, jp->keylen)) == 0) {
++ fnd = jp;
++ list_del (tmp);
++ break;
++ }
++ }
++ spin_unlock (&jid_pending);
++
++ if (fnd != NULL)
++ complete (&fnd->waitforit);
++}
++
++/**
++ * jid_get_lock_state_inr -
++ * @key:
++ * @keylen:
++ * @state:
++ * @flags:
++ * @lvb:
++ * @lvblen:
++ *
++ *
++ */
++void
++jid_get_lock_state_inr (uint8_t * key, uint16_t keylen, uint8_t state,
++ uint32_t flags, uint8_t * lvb, uint16_t lvblen)
++{
++ jid_lookup_item_t jp;
++ GULM_ASSERT (keylen > 6,);
++ jp.key = key;
++ jp.keylen = keylen;
++ jp.lvb = lvb;
++ jp.lvblen = lvblen;
++ INIT_LIST_HEAD (&jp.jp_list);
++ init_completion (&jp.waitforit);
++
++ spin_lock (&jid_pending);
++ list_add (&jp.jp_list, &jid_pending_locks);
++ spin_unlock (&jid_pending);
++
++ lg_lock_state_req (gulm_cm.hookup, key, keylen, state, flags, lvb, lvblen);
++
++ wait_for_completion (&jp.waitforit);
++}
++
++/**
++ * jid_get_lock_state_lvb -
++ * @key:
++ * @keylen:
++ * @state:
++ * @lvb:
++ * @lvblen:
++ *
++ *
++ */
++void
++jid_get_lock_state_lvb (uint8_t * key, uint16_t keylen, uint8_t state,
++ uint8_t * lvb, uint16_t lvblen)
++{
++ jid_get_lock_state_inr (key, keylen, state, 0, lvb, lvblen);
++}
++/**
++ * jid_get_lock_state -
++ * @key:
++ * @keylen:
++ * @state:
++ *
++ *
++ */
++void
++jid_get_lock_state (uint8_t * key, uint16_t keylen, uint8_t state)
++{
++ jid_get_lock_state_inr (key, keylen, state, 0, NULL, 0);
++}
++
++/**
++ * jid_state_reply -
++ * @key:
++ * @keylen:
++ * @lvb:
++ * @lvblen:
++ *
++ *
++ */
++void
++jid_state_reply (uint8_t * key, uint16_t keylen, uint8_t * lvb, uint16_t lvblen)
++{
++ struct list_head *tmp, *nxt;
++ jid_lookup_item_t *jp, *fnd = NULL;
++ spin_lock (&jid_pending);
++ list_for_each_safe (tmp, nxt, &jid_pending_locks) {
++ jp = list_entry (tmp, jid_lookup_item_t, jp_list);
++ if (memcmp (key, jp->key, MIN (keylen, jp->keylen)) == 0) {
++ fnd = jp;
++ list_del (tmp);
++ break;
++ }
++ }
++ spin_unlock (&jid_pending);
++
++ if (fnd != NULL) {
++ if (lvb != NULL && fnd->lvb != NULL)
++ memcpy (fnd->lvb, lvb, MIN (fnd->lvblen, lvblen));
++ complete (&fnd->waitforit);
++ }
++}
++
++/****************************************************************************/
++
++/**
++ * jid_hold_list_lock -
++ * @fs:
++ *
++ * only make one call to this per node.
++ *
++ * Returns: void
++ */
++void
++jid_hold_list_lock (gulm_fs_t * fs)
++{
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++
++ down (&jid_listlock);
++
++ keylen = sizeof (key);
++ jid_get_listlock_name (fs->fs_name, key, &keylen);
++ jid_get_lock_state (key, keylen, lg_lock_state_Exclusive);
++
++}
++
++/**
++ * jid_release_list_lock -
++ * @fs:
++ *
++ *
++ * Returns: void
++ */
++void
++jid_release_list_lock (gulm_fs_t * fs)
++{
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++
++ keylen = sizeof (key);
++ jid_get_listlock_name (fs->fs_name, key, &keylen);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ up (&jid_listlock);
++}
++
++/**
++ * jid_rehold_lvbs -
++ * @fs:
++ *
++ *
++ */
++void
++jid_rehold_lvbs (gulm_fs_t * fs)
++{
++ int i;
++ uint32_t oldjcnt;
++ uint8_t key[GIO_KEY_SIZE], lvb[jid_header_lvb_size];
++ uint16_t keylen = GIO_KEY_SIZE;
++
++ oldjcnt = fs->JIDcount;
++
++ jid_get_header_name (fs->fs_name, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Shared, lvb,
++ jid_header_lvb_size);
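++ /* the JID count is stored little-endian in the header lvb */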
++ fs->JIDcount = (uint32_t) (lvb[0]) << 0;
++ fs->JIDcount |= (uint32_t) (lvb[1]) << 8;
++ fs->JIDcount |= (uint32_t) (lvb[2]) << 16;
++ fs->JIDcount |= (uint32_t) (lvb[3]) << 24;
++
++ for (i = oldjcnt; i < fs->JIDcount; i++) {
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, i, key, &keylen);
++ jid_hold_lvb (key, keylen);
++ }
++
++}
++
++void
++jid_grow_space (gulm_fs_t * fs)
++{
++ uint8_t key[GIO_KEY_SIZE], lvb[jid_header_lvb_size];
++ uint16_t keylen = GIO_KEY_SIZE;
++ uint32_t jidc;
++
++ keylen = sizeof (key);
++ jid_get_header_name (fs->fs_name, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Exclusive, lvb,
++ jid_header_lvb_size);
++ jidc = (uint32_t) (lvb[0]) << 0;
++ jidc |= (uint32_t) (lvb[1]) << 8;
++ jidc |= (uint32_t) (lvb[2]) << 16;
++ jidc |= (uint32_t) (lvb[3]) << 24;
++ jidc += 10; /* grow the jid space ten slots at a time */
++ lvb[3] = (jidc >> 24) & 0xff;
++ lvb[2] = (jidc >> 16) & 0xff;
++ lvb[1] = (jidc >> 8) & 0xff;
++ lvb[0] = (jidc >> 0) & 0xff;
++ jid_sync_lvb (key, keylen, lvb, jid_header_lvb_size);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++ /* do an unlock here, so that when rehold grabs it shared, there is no
++ * lvb writing.
++ */
++
++ jid_rehold_lvbs (fs);
++}
++
++/**
++ * lookup_name_by_jid -
++ * @fs:
++ * @jid:
++ * @name:
++ *
++ *
++ * Returns: int
++ */
++int
++lookup_name_by_jid (gulm_fs_t * fs, uint32_t jid, uint8_t * name)
++{
++ uint8_t key[GIO_KEY_SIZE], lvb[64];
++ uint16_t keylen = sizeof (key);
++ int err = 0;
++
++ if (jid >= fs->JIDcount) {
++ err = -1;
++ goto exit;
++ }
++
++ jid_hold_list_lock (fs);
++
++ jid_get_lock_name (fs->fs_name, jid, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Shared, lvb, 64);
++
++ if (lvb[0] != 0) {
++ memcpy (name, &lvb[1], strlen (&lvb[1]) + 1);
++ } else {
++ err = -1;
++ }
++
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ jid_release_list_lock (fs);
++
++ exit:
++ return err;
++}
++
++/**
++ * Release_JID -
++ * @fs:
++ * @jid:
++ *
++ * actually may only need to set the first byte to zero
++ *
++ * Returns: int
++ */
++int
++release_JID (gulm_fs_t * fs, uint32_t jid, int nop)
++{
++ uint8_t key[GIO_KEY_SIZE], lvb[64];
++ uint16_t keylen = sizeof (key);
++
++ /* there is no such, so this becomes a nop. */
++ if (jid >= fs->JIDcount)
++ goto exit;
++
++ jid_hold_list_lock (fs);
++
++ jid_get_lock_name (fs->fs_name, jid, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Exclusive, lvb, 64);
++ lvb[0] = 0;
++ jid_sync_lvb (key, keylen, lvb, strlen (&lvb[1]) + 2);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ jid_release_list_lock (fs);
++
++ exit:
++ return 0;
++}
++
++void
++put_journalID (gulm_fs_t * fs)
++{
++ release_JID (fs, fs->fsJID, TRUE);
++}
++
++/**
++ * get_journalID -
++ * @fs:
++ * @jid:
++ *
++ * This is broken.
++ *
++ * Returns: int
++ */
++void
++get_journalID (gulm_fs_t * fs)
++{
++ uint32_t i = 0;
++ uint8_t key[GIO_KEY_SIZE], lvb[64];
++ uint16_t keylen;
++ int first_clear = -1;
++
++ retry:
++ jid_hold_list_lock (fs);
++
++ /* find an empty space, or ourselves again */
++ for (i = 0; i < fs->JIDcount; i++) {
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, i, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Exclusive,
++ lvb, 64);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++ if (first_clear == -1 && lvb[0] == 0 ) {
++ first_clear = i;
++ } else if (strcmp (gulm_cm.myName, &lvb[1]) == 0) {
++ first_clear = i;
++ break;
++ }
++ }
++ if (first_clear >= 0) {
++ /* take the jid we have found */
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, first_clear, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Exclusive,
++ lvb, 64);
++ lvb[0] = 2;
++ memcpy (&lvb[1], gulm_cm.myName, strlen (gulm_cm.myName) + 1);
++ jid_sync_lvb (key, keylen, lvb, strlen (gulm_cm.myName) + 2);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ fs->fsJID = first_clear;
++ }
++
++ /* unlock the header lock */
++ jid_release_list_lock (fs);
++
++ if (first_clear < 0) {
++ /* nothing found, grow and try again. */
++ jid_grow_space (fs);
++ goto retry;
++ }
++
++}
++
++/**
++ * find_jid_by_name_and_mark_replay -
++ * @fs:
++ * @name:
++ * @jid:
++ *
++ *
++ * Returns: int
++ */
++int
++find_jid_by_name_and_mark_replay (gulm_fs_t * fs, uint8_t * name,
++ uint32_t * jid)
++{
++ uint32_t i, found = -1;
++ uint8_t key[GIO_KEY_SIZE], lvb[64];
++ uint16_t keylen;
++
++ /* grab list lock */
++ jid_hold_list_lock (fs);
++
++ for (i = 0; i < fs->JIDcount; i++) {
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, i, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Exclusive,
++ lvb, 64);
++ if (strcmp (name, &lvb[1]) == 0) {
++ *jid = i;
++ found = 0;
++ lvb[0] = 1;
++ jid_sync_lvb (key, keylen, lvb, strlen (&lvb[1]) + 2);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++ break;
++ }
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ }
++ /* unlock the list lock */
++ jid_release_list_lock (fs);
++
++ return found;
++}
++
++/**
++ * Check_for_replays -
++ * @fs:
++ *
++ *
++ * Returns: int
++ */
++void
++check_for_stale_expires (gulm_fs_t * fs)
++{
++ uint32_t i;
++ uint8_t key[GIO_KEY_SIZE], lvb[64];
++ uint16_t keylen;
++ unsigned int ujid;
++
++ /* grab list lock */
++ jid_hold_list_lock (fs);
++
++ for (i = 0; i < fs->JIDcount; i++) {
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, i, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Shared, lvb,
++ 64);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ if (lvb[0] == 1) {
++ log_msg (lgm_JIDMap,
++ "fsid=%s: stale JID %d found\n",
++ fs->fs_name, i);
++ ujid = i;
++ fs->cb (fs->fsdata, LM_CB_NEED_RECOVERY, &ujid);
++ }
++ }
++
++ /* unlock the list lock */
++ jid_release_list_lock (fs);
++}
++
++/**
++ * jid_fs_init -
++ * @fs:
++ *
++ */
++void
++jid_fs_init (gulm_fs_t * fs)
++{
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen = GIO_KEY_SIZE;
++
++ fs->JIDcount = 0;
++
++ jid_get_header_name (fs->fs_name, key, &keylen);
++ jid_hold_lvb (key, keylen);
++ jid_rehold_lvbs (fs);
++}
++
++/**
++ * jid_fs_release -
++ * @fs:
++ *
++ */
++void
++jid_fs_release (gulm_fs_t * fs)
++{
++ uint32_t i;
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++ for (i = 0; i < fs->JIDcount; i++) {
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, i, key, &keylen);
++ jid_unhold_lvb (key, keylen);
++ }
++ keylen = sizeof (key);
++ jid_get_header_name (fs->fs_name, key, &keylen);
++ jid_unhold_lvb (key, keylen);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++}
++
++/**
++ * jid_unlock_callback -
++ * @d:
++ *
++ * *MUST* be called from a Handler thread.
++ *
++ * Returns: int
++ */
++void
++jid_unlock_callback (void *d)
++{
++ gulm_fs_t *fs = (gulm_fs_t *) d;
++ jid_rehold_lvbs (fs);
++}
++
++/**
++ * jid_header_lock_drop -
++ * @key:
++ * @keylen:
++ *
++ * Returns: void
++ */
++void
++jid_header_lock_drop (uint8_t * key, uint16_t keylen)
++{
++ gulm_fs_t *fs;
++ /* make sure this is the header lock.... */
++ if (key[1] == 'H' && (fs = get_fs_by_name (&key[10])) != NULL) {
++ qu_function_call (&fs->cq, jid_unlock_callback, fs);
++ }
++}
++
++/****************************************************************************/
++/**
++ * jid_get_lsresv_name -
++ * @fsname:
++ * @key:
++ * @keylen:
++ *
++ *
++ * Returns: int
++ */
++int
++jid_get_lsresv_name (char *fsname, uint8_t * key, uint16_t * keylen)
++{
++ int len;
++
++ key[0] = 'J';
++ key[1] = 'N';
++ len = strlen (gulm_cm.myName) + 1;
++ memset (&key[2], 0, 8);
++ memcpy ((&key[2]), gulm_cm.myName, MIN (len, 8));
++ /* fsname starts at byte 10 so the dropexp pattern will find it. */
++ memcpy ((&key[10]), fsname, strlen (fsname) + 1);
++
++ *keylen = 10 + strlen (fsname) + 1;
++
++ return 0;
++}
++
++/**
++ * jid_lockstate_reserve -
++ * @fs:
++ *
++ *
++ * Returns: void
++ */
++void
++jid_lockstate_reserve (gulm_fs_t * fs, int first)
++{
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++
++ jid_get_lsresv_name (fs->fs_name, key, &keylen);
++
++ /* if we are expired, this will block until someone else has cleaned our
++ * last mess up.
++ *
++ * We may very well need to put in some kind of timeout; otherwise
++ * this could lock up forever, much like the FirstMounter lock did.
++ */
++ jid_get_lock_state_inr (key, keylen, lg_lock_state_Exclusive,
++ first?lg_lock_flag_IgnoreExp:0, NULL, 0);
++
++}
++
++/**
++ * jid_lockstate_release -
++ * @fs:
++ *
++ *
++ * Returns: void
++ */
++void
++jid_lockstate_release (gulm_fs_t * fs)
++{
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++
++ jid_get_lsresv_name (fs->fs_name, key, &keylen);
++
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++}
++
++
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_jid.h linux-patched/fs/gfs_locking/lock_gulm/gulm_jid.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_jid.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_jid.h 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,41 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __GULM_JID_H__
++#define __GULM_JID_H__
++#include "gulm.h"
++void jid_init (void);
++void jid_fs_init (gulm_fs_t * fs);
++void jid_fs_release (gulm_fs_t * fs);
++void get_journalID (gulm_fs_t * fs);
++int lookup_jid_by_name (gulm_fs_t * fs, uint8_t * name, uint32_t * injid);
++int lookup_name_by_jid (gulm_fs_t * fs, uint32_t jid, uint8_t * name);
++int release_JID (gulm_fs_t * fs, uint32_t jid, int owner);
++void put_journalID (gulm_fs_t * fs);
++void check_for_stale_expires (gulm_fs_t * fs);
++
++int find_jid_by_name_and_mark_replay (gulm_fs_t * fs, uint8_t * name,
++ uint32_t * jid);
++
++void jid_start_journal_reply (gulm_fs_t * fs, uint32_t jid);
++void jid_finish_journal_reply (gulm_fs_t * fs, uint32_t jid);
++
++void jid_lockstate_reserve (gulm_fs_t * fs, int first);
++void jid_lockstate_release (gulm_fs_t * fs);
++
++/* to be called from the lg_lock callbacks. */
++void jid_state_reply (uint8_t * key, uint16_t keylen, uint8_t * lvb,
++ uint16_t lvblen);
++void jid_action_reply (uint8_t * key, uint16_t keylen);
++void jid_header_lock_drop (uint8_t * key, uint16_t keylen);
++#endif /*__GULM_JID_H__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_log_msg_bits.h linux-patched/fs/gfs_locking/lock_gulm/gulm_log_msg_bits.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_log_msg_bits.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_log_msg_bits.h 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,40 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __gulm_log_msg_bits_h__
++#define __gulm_log_msg_bits_h__
++/* log_msg bit flags
++ * These got their own file so I can easily include them in both user and
++ * kernel space.
++ * */
++#define lgm_Always (0x00000000) /*Print Message no matter what */
++#define lgm_Network (0x00000001)
++#define lgm_Network2 (0x00000002)
++#define lgm_Stomith (0x00000004)
++#define lgm_Heartbeat (0x00000008)
++#define lgm_locking (0x00000010)
++#define lgm_FuncDebug (0x00000020)
++#define lgm_Forking (0x00000040)
++#define lgm_JIDMap (0x00000080)
++#define lgm_Subscribers (0x00000100)
++#define lgm_LockUpdates (0x00000200)
++#define lgm_LoginLoops (0x00000400)
++#define lgm_Network3 (0x00000800)
++#define lgm_JIDUpdates (0x00001000)
++#define lgm_ServerState (0x00002000)
++
++#define lgm_ReallyAll (0xffffffff)
++
++#define lgm_BitFieldSize (32)
++
++#endif /*__gulm_log_msg_bits_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_lt.c linux-patched/fs/gfs_locking/lock_gulm/gulm_lt.c
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_lt.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_lt.c 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,1937 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include "util.h"
++#include "handler.h"
++#include "utils_tostr.h"
++#include "gulm_jid.h"
++
++extern gulm_cm_t gulm_cm;
++
++/****************************************************************************/
++/* A bunch of prints that hopefully contain more information that is
++ * actually useful.
++ *
++ * These are a mess.
++ */
++
++/**
++ * lck_key_to_hex -
++ * @key:
++ * @len:
++ * @workspace: <> place to put string. !! better be 2x len !!
++ *
++ *
++ * Returns: char
++ */
++static char *
++lck_key_to_hex (uint8_t * key, uint16_t len, char *workspace)
++{
++ int i;
++ for (i = 0; i < len; i++)
++ sprintf (&workspace[i * 2], "%02x", (key[i] & 0xff));
++ return workspace;
++}
++
++static void __inline__
++db_lck_entered (gulm_lock_t * lck)
++{
++ char bb[GIO_KEY_SIZE * 2 + 3];
++ lck_key_to_hex (lck->key, lck->keylen, bb);
++ printk ("Started lock 0x%s cur:%#x req:%#x flags:%#x\n", bb,
++ lck->cur_state, lck->req_state, lck->flags);
++}
++static void __inline__
++db_lck_exited (gulm_lock_t * lck)
++{
++ char bb[GIO_KEY_SIZE * 2 + 3];
++ lck_key_to_hex (lck->key, lck->keylen, bb);
++ printk ("Finished lock 0x%s result:%#x\n", bb, lck->result);
++}
++
++static void __inline__
++dump_gulm_lock_t (gulm_lock_t * lck)
++{
++ char bb[GIO_KEY_SIZE * 2 + 3];
++
++ lck_key_to_hex (lck->key, lck->keylen, bb);
++ log_msg (lgm_Always, " key = 0x%s\n", bb);
++ log_msg (lgm_Always, " req_type = %#x\n", lck->req_type);
++ log_msg (lgm_Always, " last_suc_state = %#x\n", lck->last_suc_state);
++ log_msg (lgm_Always, " actuallypending = %d\n", lck->actuallypending);
++ log_msg (lgm_Always, " in_to_be_sent = %d\n", lck->in_to_be_sent);
++ log_msg (lgm_Always, " cur_state = %d\n", lck->cur_state);
++ log_msg (lgm_Always, " req_state = %d\n", lck->req_state);
++ log_msg (lgm_Always, " flags = %#x\n", lck->flags);
++ log_msg (lgm_Always, " action = %d\n", lck->action);
++ log_msg (lgm_Always, " result = %d\n", lck->result);
++}
++
++/* DEBUG_BY_LOCK is gone. I may later add something back if needed.
++ *
++ * I love the idea of being able to log only certain locks, I just cannot
++ * think of an easy way to do it. The best I can come up with is some
++ * pattern (or set of patterns) used to decide which locks get logged. But
++ * that could be expensive if the pattern is checked every time, and won't
++ * behave as expected if only applied in get_lock.
++ * */
++
++/* The old log functions.
++ * These need their own sort of cleanup someday as well.
++ * */
++#define log_msg_lk(key, keylen, fmt, args...) {\
++ uint8_t bb[GIO_KEY_SIZE*2 +3]; \
++ lck_key_to_hex( key, keylen, bb); \
++ printk(PROTO_NAME ": On lock 0x%s " fmt , bb , ## args ); \
++ }
++
++#define log_err_lk(key, keylen, fmt, args...) {\
++ uint8_t bb[GIO_KEY_SIZE*2 +3]; \
++ lck_key_to_hex( key, keylen, bb); \
++ printk(KERN_ERR PROTO_NAME ": ERROR On lock 0x%s " fmt , bb , ## args ); \
++ }
++
++#define log_msg_lck(lck, fmt, args...) {\
++ uint8_t bb[GIO_KEY_SIZE*2 +3]; \
++ lck_key_to_hex( (lck)->key, (lck)->keylen, bb); \
++ printk(PROTO_NAME ": On lock 0x%s " fmt , bb , ## args ); \
++ }
++
++#define log_err_lck(lck, fmt, args...) {\
++ uint8_t bb[GIO_KEY_SIZE*2 +3]; \
++ lck_key_to_hex( (lck)->key, (lck)->keylen, bb); \
++ printk(KERN_ERR PROTO_NAME ": ERROR On lock 0x%s " fmt , bb , ## args ); \
++ }
++
++#ifdef DEBUG_LVB
++static void __inline__
++print_lk_lvb (uint8_t * key, uint8_t * lvb, uint8_t st, uint8_t * dir)
++{
++ uint8_t bk[GIO_KEY_SIZE * 2 + 3];
++ uint8_t bl[GIO_LVB_SIZE * 2 + 3];
++ int i;
++ for (i = 0; i < GIO_KEY_SIZE; i++)
++ sprintf (&bk[(i * 2)], "%02x", (key[i]) & 0xff);
++ for (i = 0; i < GIO_LVB_SIZE; i++)
++ sprintf (&bl[(i * 2)], "%02x", (lvb[i]) & 0xff);
++ printk (PROTO_NAME ": On lock 0x%s with state %d\n\t%s LVB 0x%s\n",
++ bk, st, dir, bl);
++}
++
++#define lvb_log_msg_lk(k, fmt, args...) log_msg_lk( k , fmt , ## args )
++#define lvb_log_msg(fmt, args...) log_msg(lgm_Always , fmt , ## args )
++#else /*DEBUG_LVB */
++#define print_lk_lvb(k,l,s,d)
++#define lvb_log_msg_lk(k, fmt, args...)
++#define lvb_log_msg(fmt, args...)
++#endif /*DEBUG_LVB */
++
++/****************************************************************************/
++/**
++ * find_and_mark_lock -
++ * @key:
++ * @keylen:
++ * @lockp:
++ *
++ * looks for a lock struct of key. If found, marks it.
++ *
++ * Returns: TRUE or FALSE
++ */
++int
++find_and_mark_lock (uint8_t * key, uint8_t keylen, gulm_lock_t ** lockp)
++{
++ int found = FALSE;
++ uint32_t bkt;
++ gulm_lock_t *lck = NULL;
++ struct list_head *tmp;
++
++ /* now find the lock */
++ bkt = hash_lock_key (key, keylen);
++ bkt %= gulm_cm.ltpx.hashbuckets;
++
++ spin_lock (&gulm_cm.ltpx.hshlk[bkt]);
++ list_for_each (tmp, &gulm_cm.ltpx.lkhsh[bkt]) {
++ lck = list_entry (tmp, gulm_lock_t, gl_list);
++ if (memcmp (lck->key, key, keylen) == 0) {
++ found = TRUE;
++ atomic_inc (&lck->count);
++ break;
++ }
++ }
++ spin_unlock (&gulm_cm.ltpx.hshlk[bkt]);
++
++ if (found)
++ *lockp = lck;
++
++ return found;
++}
++
++/**
++ * mark_lock -
++ * @lck:
++ *
++ * like above, but since we have the lock, don't search for it.
++ *
++ * Returns: void
++ */
++void __inline__
++mark_lock (gulm_lock_t * lck)
++{
++ atomic_inc (&lck->count);
++}
++
++/**
++ * unmark_and_release_lock -
++ * @lck:
++ *
++ * decrement the counter on a lock, freeing it if it reaches 0.
++ * (also removes it from the hash table)
++ *
++ * TRUE if lock was freed.
++ *
++ * Returns: TRUE or FALSE
++ */
++int
++unmark_and_release_lock (gulm_lock_t * lck)
++{
++ uint32_t bkt;
++ int deld = FALSE;
++
++ bkt = hash_lock_key (lck->key, lck->keylen);
++ bkt %= gulm_cm.ltpx.hashbuckets;
++ spin_lock (&gulm_cm.ltpx.hshlk[bkt]);
++ if (atomic_dec_and_test (&lck->count)) {
++ list_del (&lck->gl_list);
++ deld = TRUE;
++ }
++ spin_unlock (&gulm_cm.ltpx.hshlk[bkt]);
++ if (deld) {
++ gulm_cm.ltpx.locks_total--;
++ gulm_cm.ltpx.locks_unl--;
++ if (lck->lvb != NULL) {
++ kfree (lck->lvb);
++ }
++ kfree (lck);
++ }
++
++ return deld;
++}
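++
++/* A minimal usage sketch (illustration, not part of the original code)
++ * of the mark/release pair above: look a lock up, read from it, then
++ * drop the reference so the struct can be freed once the count hits
++ * zero. The function name here is made up for the example.
++ */
++#if 0 /* example only */
++static void example_peek_lock (uint8_t * key, uint8_t keylen)
++{
++ gulm_lock_t *lck;
++ if (!find_and_mark_lock (key, keylen, &lck))
++ return; /* no struct for this key */
++ /* the mark (refcount) keeps lck alive while we read it. */
++ log_msg (lgm_Always, "last_suc_state = %#x\n", lck->last_suc_state);
++ unmark_and_release_lock (lck); /* may free lck */
++}
++#endif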
++
++/****************************************************************************/
++
++void
++gulm_key_to_lm_lockname (uint8_t * key, struct lm_lockname *lockname)
++{
++ lockname->ln_number = (u64) (key[9]) << 0;
++ lockname->ln_number |= (u64) (key[8]) << 8;
++ lockname->ln_number |= (u64) (key[7]) << 16;
++ lockname->ln_number |= (u64) (key[6]) << 24;
++ lockname->ln_number |= (u64) (key[5]) << 32;
++ lockname->ln_number |= (u64) (key[4]) << 40;
++ lockname->ln_number |= (u64) (key[3]) << 48;
++ lockname->ln_number |= (u64) (key[2]) << 56;
++ lockname->ln_type = key[1];
++}
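++
++/* Sketch of the inverse packing (illustration only): the lock number
++ * lives big-endian in key[2..9] with the GFS lock type in key[1], so
++ * packing then unpacking round-trips. gulm_get_lock() below performs
++ * this same packing for real requests.
++ */
++#if 0 /* example only */
++static void example_pack_lockname (struct lm_lockname *name,
++ uint8_t key[GIO_KEY_SIZE])
++{
++ int i;
++ key[1] = name->ln_type & 0xff;
++ for (i = 0; i < 8; i++) /* key[2] gets the most significant byte */
++ key[2 + i] = (name->ln_number >> (8 * (7 - i))) & 0xff;
++}
++#endif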
++
++void
++do_drop_lock_req (gulm_fs_t * fs, uint8_t state, uint8_t key[GIO_KEY_SIZE])
++{
++ unsigned int type;
++ struct lm_lockname lockname;
++ /* I might want to shove most of this function into the new lock-callback
++ * handling queue.
++ * later.
++ */
++
++ /* don't do callbacks on the gulm mount lock.
++ * I need to someday come up with a cleaner way of separating the
++ * firstmounter lock and the rest of gfs's locks.
++ * I dunno, this first byte is pretty clean.
++ * */
++ if (key[0] != 'G') {
++ return;
++ }
++
++ switch (state) {
++ case lg_lock_state_Unlock:
++ type = LM_CB_DROPLOCKS;
++ break;
++ case lg_lock_state_Exclusive:
++ type = LM_CB_NEED_E;
++ break;
++ case lg_lock_state_Shared:
++ type = LM_CB_NEED_S;
++ break;
++ case lg_lock_state_Deferred:
++ type = LM_CB_NEED_D;
++ break;
++ default:
++ type = LM_CB_DROPLOCKS;
++ break;
++ }
++ gulm_key_to_lm_lockname (key, &lockname);
++
++ qu_drop_req (&fs->cq, fs->cb, fs->fsdata, type,
++ lockname.ln_type, lockname.ln_number);
++}
++
++/**
++ * send_async_reply -
++ * @lck:
++ *
++ *
++ * Returns: void
++ */
++void
++send_async_reply (gulm_lock_t * lck)
++{
++ gulm_fs_t *fs = lck->fs;
++ struct lm_lockname lockname;
++
++ if (lck->key[0] == 'F') {
++ /* whee! it is the first mounter lock. two things:
++ * A: gfs couldn't care less about this.
++ * B: we need to up the sleeper in the fs. (hack)
++ */
++ complete (&fs->sleep);
++ return;
++ }
++
++ gulm_key_to_lm_lockname (lck->key, &lockname);
++
++ qu_async_rpl (&fs->cq, fs->cb, fs->fsdata, &lockname, lck->result);
++}
++
++/**
++ * send_drop_exp_inter -
++ * @lt:
++ * @name:
++ *
++ *
++ * Returns: int
++ */
++int
++send_drop_exp_inter (gulm_fs_t * fs, lock_table_t * lt, char *name)
++{
++ int err, len;
++ uint8_t mask[GIO_KEY_SIZE];
++
++ memset (mask, 0, GIO_KEY_SIZE);
++ /* pack key mask */
++ mask[0] = 0xff; /* minor lock type. 'G', 'F', 'J'. */
++ mask[1] = 0xff; /* GFS lock type. */
++ mask[2] = 0xff; /* next 8 are lock number */
++ mask[3] = 0xff;
++ mask[4] = 0xff;
++ mask[5] = 0xff;
++ mask[6] = 0xff;
++ mask[7] = 0xff;
++ mask[8] = 0xff;
++ mask[9] = 0xff;
++ /* Now stick the fsname into the remaining space. */
++ len = strlen (fs->fs_name);
++ strncpy (&mask[10], fs->fs_name, GIO_KEY_SIZE - 16);
++ len = MIN (len, GIO_KEY_SIZE - 16);
++ len += 11; /* 10 for the encoded buf, 1 for the '\0' after the fs name */
++
++ err = lg_lock_drop_exp (gulm_cm.hookup, name, mask, len);
++
++ return err;
++}
++
++/**
++ * send_lock_action -
++ * @lck:
++ *
++ *
++ * Returns: int
++ */
++int
++send_lock_action (gulm_lock_t * lck, uint8_t action)
++{
++ int err;
++
++ GULM_ASSERT (lck->req_type == glck_action, dump_gulm_lock_t (lck););
++
++ err = lg_lock_action_req (gulm_cm.hookup, lck->key, lck->keylen, action,
++ lck->lvb, lck->fs->lvb_size);
++ if (err != 0)
++ log_err ("Issues sending action request. %d\n", err);
++
++ return err;
++}
++
++/**
++ * send_lock_req -
++ * @lck:
++ *
++ *
++ * Returns: int
++ */
++int
++send_lock_req (gulm_lock_t * lck)
++{
++ gulm_fs_t *fs = lck->fs;
++ int err;
++ uint32_t flags = 0;
++ uint8_t state;
++
++ GULM_ASSERT (lck->req_type == glck_state, dump_gulm_lock_t (lck););
++
++ switch (lck->req_state) {
++ case LM_ST_EXCLUSIVE:
++ state = lg_lock_state_Exclusive;
++ break;
++ case LM_ST_DEFERRED:
++ state = lg_lock_state_Deferred;
++ break;
++ case LM_ST_SHARED:
++ state = lg_lock_state_Shared;
++ break;
++ case LM_ST_UNLOCKED:
++ state = lg_lock_state_Unlock;
++ break;
++ default:
++ GULM_ASSERT (0, log_err ("fsid=%s: Ain't no lock state %d.\n",
++ fs->fs_name, lck->req_state););
++ break;
++ }
++ if (lck->flags & LM_FLAG_TRY) {
++ flags |= lg_lock_flag_Try;
++ }
++ if (lck->flags & LM_FLAG_TRY_1CB) {
++ flags |= lg_lock_flag_Try | lg_lock_flag_DoCB;
++ }
++ if (lck->flags & LM_FLAG_NOEXP) {
++ flags |= lg_lock_flag_IgnoreExp;
++ }
++ if (lck->flags & LM_FLAG_ANY) {
++ flags |= lg_lock_flag_Any;
++ }
++ if (lck->flags & LM_FLAG_PRIORITY) {
++ flags |= lg_lock_flag_Piority;
++ }
++ if (lck->lvb != NULL) {
++ print_lk_lvb (lck->key, lck->lvb, lck->req_state, "Sending");
++ }
++
++ err = lg_lock_state_req (gulm_cm.hookup, lck->key, lck->keylen,
++ state, flags, lck->lvb, lck->fs->lvb_size);
++ if (err != 0)
++ log_err ("Issues sending state request. %d\n", err);
++
++ return err;
++}
++
++/**
++ * toggle_lock_counters -
++ *
++ * called after a successful request to change lock state. Decrements
++ * counts for what the lock was, and increments for what it is now.
++ */
++void
++toggle_lock_counters (lock_table_t * lt, int old, int new)
++{
++ /* what we had it in */
++ switch (old) {
++ case LM_ST_EXCLUSIVE:
++ lt->locks_exl--;
++ break;
++ case LM_ST_DEFERRED:
++ lt->locks_dfr--;
++ break;
++ case LM_ST_SHARED:
++ lt->locks_shd--;
++ break;
++ case LM_ST_UNLOCKED:
++ lt->locks_unl--;
++ break;
++ }
++ /* what we have it in */
++ switch (new) {
++ case LM_ST_EXCLUSIVE:
++ lt->locks_exl++;
++ break;
++ case LM_ST_DEFERRED:
++ lt->locks_dfr++;
++ break;
++ case LM_ST_SHARED:
++ lt->locks_shd++;
++ break;
++ case LM_ST_UNLOCKED:
++ lt->locks_unl++;
++ break;
++ }
++}
++
++/**
++ * calc_lock_result -
++ * @lck:
++ * @state:
++ * @error:
++ * @flags:
++ *
++ * This calculates the correct result to return for gfs lock requests.
++ *
++ * Returns: int
++ */
++int
++calc_lock_result (gulm_lock_t * lck,
++ uint8_t state, uint32_t error, uint32_t flags)
++{
++ gulm_fs_t *fs = lck->fs;
++ lock_table_t *lt = &gulm_cm.ltpx;
++ int result = -69;
++
++ /* adjust result based on success status. */
++ switch (error) {
++ case lg_err_Ok:
++ /* set result to current lock state. */
++ if (!(lck->flags & LM_FLAG_ANY)) {
++ /* simple case, we got what we asked for. */
++ result = lck->req_state;
++ } else {
++ /* complex case, we got something else, but we said that was ok */
++ switch (state) {
++ case lg_lock_state_Shared:
++ result = LM_ST_SHARED;
++ break;
++ case lg_lock_state_Deferred:
++ result = LM_ST_DEFERRED;
++ break;
++
++ case lg_lock_state_Exclusive:
++ case lg_lock_state_Unlock:
++ GULM_ASSERT (0,
++ dump_gulm_lock_t (lck);
++ log_err
++ ("fsid=%s: lock state %d is invalid on "
++ "ANY flag return\n", fs->fs_name,
++ state);
++ );
++ break;
++
++ default:
++ GULM_ASSERT (0,
++ dump_gulm_lock_t (lck);
++ log_err_lck (lck,
++ "fsid=%s: Anit no lock state %d.\n",
++ fs->fs_name, state);
++ );
++ break;
++ }
++ }
++
++ /* toggle counters.
++ * due to ANY flag, new state may not be req_state.
++ * */
++ toggle_lock_counters (lt, lck->cur_state, result);
++
++ /* if no internal unlocks, it is cachable. */
++ if (result != LM_ST_UNLOCKED && (flags & lg_lock_flag_Cachable))
++ result |= LM_OUT_CACHEABLE;
++
++ /* record and move on
++ * */
++ lck->last_suc_state = result & LM_OUT_ST_MASK;
++ break;
++ case lg_err_Canceled:
++ result = LM_OUT_CANCELED | lck->cur_state;
++ break;
++ case lg_err_TryFailed:
++ result = lck->cur_state; /* if we didn't get it. */
++ break;
++ default:
++ result = -error;
++ break;
++ }
++
++ return result;
++}
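++
++/* Result summary for the switch above (illustrative): on lg_err_Ok the
++ * granted state is returned, possibly ORed with LM_OUT_CACHEABLE; a
++ * canceled request returns LM_OUT_CANCELED | cur_state; a failed Try
++ * returns cur_state unchanged; any other error is returned negated.
++ */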
++
++/**
++ * my_strdup -
++ * @s:
++ *
++ *
++ * Returns: char *
++ */
++char *
++my_strdup (char *s)
++{
++ char *tmp;
++ int len;
++ len = strlen (s) + 1;
++ tmp = kmalloc (len, GFP_KERNEL);
++ if (tmp == NULL)
++ return NULL;
++ memcpy (tmp, s, len);
++ return tmp;
++}
++
++/* Instead of directly calling the send functions below, the callers will
++ * create one of these.
++ * These exist only because I cannot stick the lock_t onto two lists
++ * at once.
++ *
++ * this could use some clean up.
++ */
++typedef struct send_req_s {
++ struct list_head sr_list;
++ enum { sr_lock, sr_act, sr_cancel, sr_drop } type;
++ gulm_lock_t *who;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++ char *name;
++} send_req_t;
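++
++/* Request flow (illustrative summary): callers never write to the
++ * socket themselves. They queue a send_req_t via send_drop_exp() or
++ * add_lock_to_send_req_queue() below, and the single
++ * lt_io_sender_thread() pops entries off lt->to_be_sent and makes the
++ * actual lg_* calls, re-queueing the entry if the send fails.
++ */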
++
++/**
++ * alloc_send_req -
++ *
++ *
++ * Returns: send_req_t *
++ */
++send_req_t *
++alloc_send_req (void)
++{
++ send_req_t *tmp;
++ tmp = kmalloc (sizeof (send_req_t), GFP_KERNEL);
++ GULM_ASSERT (tmp != NULL,); /* so evil.... */
++ return tmp;
++}
++
++/**
++ * send_drop_exp -
++ * @fs:
++ * @lt:
++ * @name:
++ *
++ *
++ * Returns: int
++ */
++int
++send_drop_exp (gulm_fs_t * fs, lock_table_t * lt, char *name)
++{
++ send_req_t *sr;
++
++ sr = alloc_send_req ();
++ INIT_LIST_HEAD (&sr->sr_list);
++ sr->type = sr_drop;
++ sr->who = NULL;
++ sr->fs = fs;
++ sr->lt = lt;
++ if (name != NULL) {
++ sr->name = my_strdup (name);
++ } else {
++ sr->name = NULL;
++ }
++
++ spin_lock (&lt->queue_sender);
++ list_add (&sr->sr_list, &lt->to_be_sent);
++ spin_unlock (&lt->queue_sender);
++
++ wake_up (&lt->send_wchan);
++ return 0;
++}
++
++/**
++ * add_lock_to_send_req_queue -
++ * @lt:
++ * @lck:
++ *
++ *
++ * Returns: void
++ */
++void
++add_lock_to_send_req_queue (lock_table_t * lt, gulm_lock_t * lck, int type)
++{
++ send_req_t *sr;
++
++ sr = alloc_send_req ();
++ INIT_LIST_HEAD (&sr->sr_list);
++ sr->type = type;
++ sr->who = lck;
++ sr->fs = NULL;
++ sr->lt = NULL;
++ sr->name = NULL;
++ if (type != sr_cancel)
++ lck->in_to_be_sent = TRUE;
++
++ mark_lock (lck);
++
++ spin_lock (&lt->queue_sender);
++ list_add (&sr->sr_list, &lt->to_be_sent);
++ spin_unlock (&lt->queue_sender);
++
++ wake_up (&lt->send_wchan);
++}
++
++/**
++ * queue_empty -
++ * @lt:
++ *
++ *
++ * Returns: int
++ */
++static __inline__ int
++queue_empty (lock_table_t * lt)
++{
++ int ret;
++ spin_lock (&lt->queue_sender);
++ ret = list_empty (&lt->to_be_sent);
++ spin_unlock (&lt->queue_sender);
++ return ret;
++}
++
++/**
++ * lt_io_sender_thread -
++ * @data:
++ *
++ * Right now, only gfs lock requests should go through this thread.
++ * Must look; may not even need this.
++ * Well, it is nice to get the socket io off of whatever process the user
++ * is running that is going through gfs and into here. Is it?
++ *
++ *
++ * Returns: int
++ */
++int
++lt_io_sender_thread (void *data)
++{
++ lock_table_t *lt = (lock_table_t *) data;
++ struct list_head *tmp;
++ send_req_t *sr = NULL;
++ int err = 0;
++
++ daemonize ("gulm_LT_sender");
++ lt->sender_task = current;
++ complete (&lt->startup);
++
++ while (lt->running) {
++ do {
++ DECLARE_WAITQUEUE (__wait_chan, current);
++ current->state = TASK_INTERRUPTIBLE;
++ add_wait_queue (&lt->send_wchan, &__wait_chan);
++ if (queue_empty (lt))
++ schedule ();
++ remove_wait_queue (&lt->send_wchan, &__wait_chan);
++ current->state = TASK_RUNNING;
++ } while (0);
++ if (!lt->running)
++ break;
++
++ /* check to make sure socket is ok. */
++ down (&lt->sender);
++
++ /* pop next item to be sent
++ * (it will get pushed back if there were problems.)
++ */
++ spin_lock (&lt->queue_sender);
++ if (list_empty (&lt->to_be_sent)) {
++ spin_unlock (&lt->queue_sender);
++ up (&lt->sender);
++ continue;
++ }
++ tmp = (&lt->to_be_sent)->prev;
++ list_del (tmp);
++ spin_unlock (&lt->queue_sender);
++ sr = list_entry (tmp, send_req_t, sr_list);
++
++ /* send. */
++ if (sr->type == sr_lock) {
++ err = send_lock_req (sr->who);
++ if (err == 0) {
++ sr->who->in_to_be_sent = FALSE;
++ unmark_and_release_lock (sr->who);
++ }
++ } else if (sr->type == sr_act) {
++ err = send_lock_action (sr->who, sr->who->action);
++ if (err == 0) {
++ sr->who->in_to_be_sent = FALSE;
++ unmark_and_release_lock (sr->who);
++ }
++ } else if (sr->type == sr_cancel) {
++ err =
++ lg_lock_cancel_req (gulm_cm.hookup, sr->who->key,
++ sr->who->keylen);
++ if (err == 0)
++ unmark_and_release_lock (sr->who);
++ } else if (sr->type == sr_drop) {
++ /* XXX sr->lt isn't really needed.
++ * just lt should be fine.
++ * look into it someday.
++ */
++ err = send_drop_exp_inter (sr->fs, sr->lt, sr->name);
++ } else {
++ log_err ("Unknown send_req type! %d\n", sr->type);
++ }
++ up (&lt->sender);
++
++ /* if no errors, remove from queue. */
++ if (err == 0) {
++ if (sr->type == sr_drop && sr->name != NULL)
++ kfree (sr->name);
++ kfree (sr);
++ sr = NULL;
++ } else {
++ /* if errors, re-queue.
++ * the send_* funcs already reported the error, so we won't
++ * repeat that.
++ * */
++ spin_lock (&lt->queue_sender);
++ /* reset the pointers. otherwise things get weird. */
++ INIT_LIST_HEAD (&sr->sr_list);
++ list_add_tail (&sr->sr_list, &lt->to_be_sent);
++ spin_unlock (&lt->queue_sender);
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout (3 * HZ);
++
++ /* gotta break things up,
++ * else this loops hard and fast.
++ */
++ }
++ } /* while( lt->running ) */
++
++ complete (&lt->startup);
++ return 0;
++}
++
++/**
++ * cancel_pending_sender -
++ * @lck:
++ *
++ * want to cancel a lock request that we haven't sent to the server yet.
++ *
++ * this must skip over unlock requests. (never cancel unlocks)
++ *
++ * Returns: int
++ */
++int
++cancel_pending_sender (gulm_lock_t * lck)
++{
++ lock_table_t *lt = &gulm_cm.ltpx;
++ struct list_head *tmp, *nxt;
++ send_req_t *sr;
++ int found = FALSE;
++
++ spin_lock (&lt->queue_sender);
++
++ list_for_each_safe (tmp, nxt, &lt->to_be_sent) {
++ sr = list_entry (tmp, send_req_t, sr_list);
++ if (sr->who == lck) { /* good enough? */
++ if (sr->type == sr_cancel)
++ continue;
++ if (lck->req_state == LM_ST_UNLOCKED)
++ continue; /* do not cancel unlocks */
++ list_del (tmp);
++ kfree (sr);
++ found = TRUE;
++ lck->in_to_be_sent = FALSE;
++
++ /* Now we need to tell the waiting lock req that it got canceled.
++ * basically, we need to fake a lg_err_Canceled return....
++ */
++ lck->result = LM_OUT_CANCELED | lck->cur_state;
++ lck->actuallypending = FALSE;
++ lck->req_type = glck_nothing;
++ atomic_dec (&lt->locks_pending);
++#ifndef USE_SYNC_LOCKING
++ send_async_reply (lck);
++#else
++ complete (&lck->actsleep);
++#endif
++ unmark_and_release_lock (lck);
++ break;
++ }
++ }
++
++ spin_unlock (&lt->queue_sender);
++ return found;
++}
++
++/**
++ * gulm_lt_login_reply -
++ * @misc:
++ * @error:
++ * @which:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_login_reply (void *misc, uint32_t error, uint8_t which)
++{
++ if (error != 0) {
++ gulm_cm.ltpx.running = FALSE;
++ log_err ("LTPX: Got a %d from the login request.\n", error);
++ } else {
++ log_msg (lgm_Network2, "Logged into local LTPX.\n");
++ }
++ return error;
++}
++
++/**
++ * gulm_lt_logout_reply -
++ * @misc:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_logout_reply (void *misc)
++{
++ gulm_cm.ltpx.running = FALSE;
++ log_msg (lgm_Network2, "Logged out of local LTPX.\n");
++ return 0;
++}
++
++/**
++ * gulm_lt_lock_state -
++ * @misc:
++ * @key:
++ * @keylen:
++ * @state:
++ * @flags:
++ * @error:
++ * @LVB:
++ * @LVBlen:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_lock_state (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t state, uint32_t flags, uint32_t error,
++ uint8_t * LVB, uint16_t LVBlen)
++{
++ gulm_lock_t *lck;
++
++ if (key[0] == 'J') {
++ jid_state_reply (key, keylen, LVB, LVBlen);
++ return 0;
++ }
++
++ if (!find_and_mark_lock (key, keylen, &lck)) {
++ log_err_lk (key, keylen, "Got a lock state reply for a lock "
++ "that we don't know of. state:%#x flags:%#x error:%#x\n",
++ state, flags, error);
++ return 0;
++ }
++
++ lck->result = calc_lock_result (lck, state, error, flags);
++
++ if ((lck->result & LM_OUT_ST_MASK) != LM_ST_UNLOCKED &&
++ lck->lvb != NULL) {
++ memcpy (lck->lvb, LVB, MIN (lck->fs->lvb_size, LVBlen));
++ }
++
++ lck->actuallypending = FALSE;
++ lck->req_type = glck_nothing;
++ atomic_dec (&gulm_cm.ltpx.locks_pending);
++#ifndef USE_SYNC_LOCKING
++ send_async_reply (lck);
++#else
++ complete (&lck->actsleep);
++#endif
++
++ if (error != 0 && error != lg_err_TryFailed && error != lg_err_Canceled)
++ log_msg_lck (lck, "Error: %d:%s (req:%#x rpl:%#x lss:%#x)\n",
++ error, gio_Err_to_str (error),
++ lck->req_state, state, lck->last_suc_state);
++
++ unmark_and_release_lock (lck);
++ return 0;
++}
++
++/**
++ * gulm_lt_lock_action -
++ * @misc:
++ * @key:
++ * @keylen:
++ * @action:
++ * @error:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_lock_action (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t action, uint32_t error)
++{
++ gulm_lock_t *lck;
++
++ if (key[0] == 'J') {
++ jid_action_reply (key, keylen);
++ return 0;
++ }
++
++ if (!find_and_mark_lock (key, keylen, &lck)) {
++ log_err_lk (key, keylen, "Got a lock action reply for a lock "
++ "that we don't know of. action:%#x error:%#x\n",
++ action, error);
++ return 0;
++ }
++
++ if (action == lg_lock_act_HoldLVB ||
++ action == lg_lock_act_UnHoldLVB || action == lg_lock_act_SyncLVB) {
++ /* */
++ lck->result = error;
++ if (error != lg_err_Ok) {
++ log_err ("on action reply act:%d err:%d\n", action,
++ error);
++ }
++ lck->req_type = glck_nothing;
++ lck->actuallypending = FALSE;
++ complete (&lck->actsleep);
++ } else {
++ log_err_lck (lck, "Got strange Action %#x\n", action);
++ }
++ unmark_and_release_lock (lck);
++ return 0;
++}
++
++/**
++ * gulm_lt_drop_lock_req -
++ * @misc:
++ * @key:
++ * @keylen:
++ * @state:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_drop_lock_req (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t state)
++{
++ gulm_lock_t *lck;
++
++ if (key[0] == 'J') {
++ jid_header_lock_drop (key, keylen);
++ return 0;
++ }
++
++ if (!find_and_mark_lock (key, keylen, &lck)) {
++ log_err_lk (key, keylen, "Got a drop lcok request for a lock "
++ "that we don't know of. state:%#x\n", state);
++ return 0;
++ }
++
++ do_drop_lock_req (lck->fs, state, key);
++
++ unmark_and_release_lock (lck);
++ return 0;
++}
++
++/**
++ * gulm_lt_drop_all -
++ * @misc:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_drop_all (void *misc)
++{
++ passup_droplocks ();
++ return 0;
++}
++
++/**
++ * gulm_lt_error -
++ * @misc:
++ * @err:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_error (void *misc, uint32_t err)
++{
++ log_err ("LTPX: RANDOM ERROR %d\n", err);
++ return err;
++}
++
++static lg_lockspace_callbacks_t lock_cb = {
++ login_reply:gulm_lt_login_reply,
++ logout_reply:gulm_lt_logout_reply,
++ lock_state:gulm_lt_lock_state,
++ lock_action:gulm_lt_lock_action,
++ drop_lock_req:gulm_lt_drop_lock_req,
++ drop_all:gulm_lt_drop_all,
++ error:gulm_lt_error
++};
++
++/**
++ * lt_io_recving_thread -
++ * @data:
++ *
++ *
++ * Returns: int
++ */
++int
++lt_io_recving_thread (void *data)
++{
++ lock_table_t *lt = &gulm_cm.ltpx;
++ int err;
++
++ daemonize ("gulm_LT_recver");
++ lt->recver_task = current;
++ complete (&lt->startup);
++
++ while (lt->running) {
++ err = lg_lock_handle_messages (gulm_cm.hookup, &lock_cb, NULL);
++ if (err != 0) {
++ log_err ("gulm_LT_recver err %d\n", err);
++ lt->running = FALSE; /* should stop the sender thread. */
++ wake_up (&lt->send_wchan);
++ break;
++ }
++ } /* while( lt->running ) */
++
++ complete (&lt->startup);
++ return 0;
++}
++
++/**
++ * lt_logout - log out of all of the lock tables
++ */
++void
++lt_logout (void)
++{
++ lock_table_t *lt = &gulm_cm.ltpx;
++ int err;
++
++ if (lt->running) {
++ lt->running = FALSE;
++
++ /* stop sender thread */
++ wake_up (&lt->send_wchan);
++ wait_for_completion (&lt->startup);
++
++ /* stop recver thread */
++ down (&lt->sender);
++ err = lg_lock_logout (gulm_cm.hookup);
++ up (&lt->sender);
++
++ /* wait for thread to finish */
++ wait_for_completion (&lt->startup);
++ }
++
++}
++
++/**
++ * lt_login - login to lock tables.
++ *
++ * Returns: int
++ */
++int
++lt_login (void)
++{
++ int err;
++ lock_table_t *lt = &gulm_cm.ltpx;
++
++ if (lt->running)
++ log_err
++ ("Trying to log into LTPX when it appears to be logged in!\n");
++
++ err = lg_lock_login (gulm_cm.hookup, "GFS ");
++ if (err != 0) {
++ log_err ("Failed to send login request. %d\n", err);
++ goto fail;
++ }
++
++ /* start recver thread. */
++ lt->running = TRUE;
++ err = kernel_thread (lt_io_recving_thread, lt, 0);
++ if (err < 0) {
++ log_err ("Failed to start gulm_lt_IOd. (%d)\n", err);
++ goto fail;
++ }
++ wait_for_completion (&lt->startup);
++
++ /* start sender thread */
++ err = kernel_thread (lt_io_sender_thread, lt, 0);
++ if (err < 0) {
++ log_err ("Failed to start gulm_LT_sender. (%d)\n", err);
++ goto fail;
++ }
++ wait_for_completion (&lt->startup);
++
++ return 0;
++ fail:
++ lt_logout ();
++ log_msg (lgm_Always, "Exiting lt_login. err:%d\n", err);
++ return err;
++}
++
++/****************************************************************************/
++
++/**
++ * internal_gulm_get_lock -
++ * @fs:
++ * @key:
++ * @keylen:
++ * @lockp:
++ *
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++int
++internal_gulm_get_lock (gulm_fs_t * fs, uint8_t * key, uint8_t keylen,
++ gulm_lock_t ** lockp)
++{
++ int found = FALSE;
++ uint32_t bkt;
++ gulm_lock_t *lck = NULL;
++
++ found = find_and_mark_lock (key, keylen, &lck);
++
++ /* malloc space */
++ if (found) {
++ GULM_ASSERT (lck->magic_one == 0xAAAAAAAA,);
++ } else {
++ lck = kmalloc (sizeof (gulm_lock_t), GFP_KERNEL);
++ if (lck == NULL) {
++ log_err
++ ("fsid=%s: Out of memory for lock struct in get_lock!\n",
++ fs->fs_name);
++ return -ENOMEM;
++ }
++ memset (lck, 0, sizeof (gulm_lock_t));
++ INIT_LIST_HEAD (&lck->gl_list);
++ atomic_set (&lck->count, 1);
++ lck->magic_one = 0xAAAAAAAA;
++ lck->fs = fs;
++ memcpy (lck->key, key, keylen);
++ lck->keylen = keylen;
++ lck->lvb = NULL;
++ init_completion (&lck->actsleep);
++ lck->actuallypending = FALSE;
++ lck->in_to_be_sent = FALSE;
++ lck->result = 0;
++ lck->action = -1;
++ lck->req_type = glck_nothing;
++ lck->last_suc_state = LM_ST_UNLOCKED;
++
++ gulm_cm.ltpx.locks_total++;
++ gulm_cm.ltpx.locks_unl++;
++
++ bkt = hash_lock_key (key, keylen);
++ bkt %= gulm_cm.ltpx.hashbuckets;
++
++ spin_lock (&gulm_cm.ltpx.hshlk[bkt]);
++ list_add (&lck->gl_list, &gulm_cm.ltpx.lkhsh[bkt]);
++ spin_unlock (&gulm_cm.ltpx.hshlk[bkt]);
++ }
++
++ *lockp = lck;
++
++ return 0;
++}
++
++/**
++ * gulm_get_lock -
++ * @lockspace:
++ * @name:
++ * @lockp:
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++int
++gulm_get_lock (lm_lockspace_t * lockspace, struct lm_lockname *name,
++ lm_lock_t ** lockp)
++{
++ int err, len;
++ gulm_fs_t *fs = (gulm_fs_t *) lockspace;
++ uint8_t key[GIO_KEY_SIZE];
++
++ /* I could add a per-fs lock to force only one gulm_get_lock at a time.
++ */
++ down (&fs->get_lock);
++
++ memset (key, 0, GIO_KEY_SIZE);
++ /* pack lockname */
++ key[0] = 'G'; /* G: fs lock, F: First mounter, J: JID mapping lock */
++ key[1] = name->ln_type & 0xff;
++ key[2] = (name->ln_number >> 56) & 0xff;
++ key[3] = (name->ln_number >> 48) & 0xff;
++ key[4] = (name->ln_number >> 40) & 0xff;
++ key[5] = (name->ln_number >> 32) & 0xff;
++ key[6] = (name->ln_number >> 24) & 0xff;
++ key[7] = (name->ln_number >> 16) & 0xff;
++ key[8] = (name->ln_number >> 8) & 0xff;
++ key[9] = (name->ln_number >> 0) & 0xff;
++
++ /* Now stick the fsname into the remaining space. */
++ len = strlen (fs->fs_name);
++ strncpy (&key[10], fs->fs_name, GIO_KEY_SIZE - 16);
++
++ len = MIN (len, GIO_KEY_SIZE - 16);
++ len += 11; /* 10 for the encoded buf, 1 for the '\0' after the fs name */
++ err = internal_gulm_get_lock (fs, key, len, (gulm_lock_t **) lockp);
++
++ up (&fs->get_lock);
++
++ return err;
++}
++
++/**
++ * gulm_put_lock -
++ * @lock:
++ *
++ *
++ * Returns: void
++ */
++void
++gulm_put_lock (lm_lock_t * lock)
++{
++ gulm_lock_t *lck = (gulm_lock_t *) lock;
++ lock_table_t *lt = &gulm_cm.ltpx;
++ gulm_fs_t *fs = lck->fs;
++
++ down (&fs->get_lock);
++
++ GULM_ASSERT (lt != NULL,);
++
++ if (lck->last_suc_state != LM_ST_UNLOCKED) {
++ log_err_lck (lck,
++ "fsid=%s: gulm_put_lock called on a lock that is not unlocked!"
++ " Current state:%#x\n", lck->fs->fs_name,
++ lck->last_suc_state);
++ /* I'm still not sure about this one. We should never see it, so I
++ * don't think it is that big of a deal, but I dunno.
++ *
++ * Maybe should just make it an assertion.
++ *
++ * with the mark/unmark code, is it even a concern?
++ */
++ }
++
++ unmark_and_release_lock (lck);
++ /* lck = NULL; */
++
++ up (&fs->get_lock);
++
++}
++
++static int
++valid_trasition (unsigned int cur, unsigned int req)
++{
++ int lock_state_changes[16] = { /* unl exl def shr */
++ FALSE, TRUE, TRUE, TRUE, /* unl */
++ TRUE, FALSE, TRUE, TRUE, /* exl */
++ TRUE, TRUE, FALSE, TRUE, /* def */
++ TRUE, TRUE, TRUE, FALSE /* shr */
++ };
++ GULM_ASSERT (cur < 4
++ && req < 4, log_err ("cur:%d req:%d\n", cur, req););
++
++ return (lock_state_changes[4 * cur + req]);
++}
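++
++/* Reading the table above (illustration): rows are the current state
++ * and columns the requested state, in the order unl, exl, def, shr,
++ * assuming LM_ST_UNLOCKED..LM_ST_SHARED map to 0..3 as the bounds
++ * check suggests. Only the no-op diagonal is rejected, e.g.:
++ */
++#if 0 /* example only */
++static void example_transitions (void)
++{
++ GULM_ASSERT (valid_trasition (LM_ST_UNLOCKED, LM_ST_EXCLUSIVE),);
++ GULM_ASSERT (!valid_trasition (LM_ST_SHARED, LM_ST_SHARED),);
++}
++#endif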
++
++/**
++ * verify_gulm_lock_t -
++ * @lck:
++ *
++ * wonder if I should add some other checks.
++ *
++ * Returns: int
++ */
++int
++verify_gulm_lock_t (gulm_lock_t * lck)
++{
++ if (lck == NULL) {
++ log_err ("Lock pointer was NULL!\n");
++ return -1;
++ }
++ if (lck->fs == NULL) {
++ log_err ("This lock has no filesystem!!!\n");
++ return -1;
++ }
++ return 0;
++}
++
++/**
++ * gulm_lock -
++ * @lock:
++ * @cur_state:
++ * @req_state:
++ * @flags:
++ *
++ *
++ * Returns: int
++ */
++unsigned int
++gulm_lock (lm_lock_t * lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags)
++{
++ gulm_lock_t *lck = NULL;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++
++ /* verify vars. */
++ lck = (gulm_lock_t *) lock;
++ if (verify_gulm_lock_t (lck) != 0) {
++ return -EINVAL;
++ }
++ lt = &gulm_cm.ltpx;
++ fs = lck->fs;
++
++ GULM_ASSERT (valid_trasition (cur_state, req_state),
++ log_err_lck (lck, "want %d with %s thinks:%d\n", req_state,
++ (LM_FLAG_TRY & flags) ? "try" : (LM_FLAG_NOEXP
++ & flags) ?
++ "noexp" : "no flags", cur_state);
++ );
++
++ GULM_ASSERT (lck->actuallypending == FALSE, dump_gulm_lock_t (lck););
++
++ /* save the details of this request. */
++ lck->req_type = glck_state;
++ lck->result = 0;
++ lck->cur_state = cur_state;
++ lck->req_state = req_state;
++ lck->flags = flags;
++
++ /* moving these here fixes a race on the s390 that ben found.
++ * basically, the request was sent to the server, the server receives
++ * it, the server processes, the server sends a reply, the client
++ * receives the reply, and the client tries to process the reply before
++ * this thread could mark it as actuallypending.
++ * */
++ lck->actuallypending = TRUE;
++ atomic_inc (&lt->locks_pending);
++ add_lock_to_send_req_queue (lt, lck, sr_lock);
++
++ lt->lops++;
++#ifdef USE_SYNC_LOCKING
++ wait_for_completion (&lck->actsleep);
++#endif
++
++#ifdef USE_SYNC_LOCKING
++ return lck->result;
++#else
++ return LM_OUT_ASYNC;
++#endif
++}
++
++/**
++ * gulm_unlock -
++ * @lock:
++ * @cur_state:
++ *
++ *
++ * Returns: int
++ */
++unsigned int
++gulm_unlock (lm_lock_t * lock, unsigned int cur_state)
++{
++ int e;
++ e = gulm_lock (lock, cur_state, LM_ST_UNLOCKED, 0);
++ return e;
++}
++
++/**
++ * gulm_cancel -
++ * @lock:
++ *
++ */
++void
++gulm_cancel (lm_lock_t * lock)
++{
++ gulm_lock_t *lck;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++
++ /* verify vars. */
++ lck = (gulm_lock_t *) lock;
++ if (verify_gulm_lock_t (lck) != 0) {
++ return;
++ }
++ lt = &gulm_cm.ltpx;
++ fs = lck->fs;
++
++ if (lck->actuallypending) {
++ if (lck->in_to_be_sent) {
++ /* this should pull the req out of the send queue and have it
++ * return with a cancel code without going to the server.
++ */
++ cancel_pending_sender (lck);
++ } else {
++ add_lock_to_send_req_queue (lt, lck, sr_cancel);
++ }
++ } else {
++ log_msg_lck (lck, "Cancel called with no pending request.\n");
++ }
++
++}
++
++/**
++ * gulm_hold_lvb -
++ * @lock:
++ * @lvbp:
++ *
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++int
++gulm_hold_lvb (lm_lock_t * lock, char **lvbp)
++{
++ gulm_lock_t *lck;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++ int err = -1;
++
++ /* verify vars. */
++ lck = (gulm_lock_t *) lock;
++ if (verify_gulm_lock_t (lck) != 0) {
++ return -EINVAL;
++ }
++ lt = &gulm_cm.ltpx;
++ fs = lck->fs;
++
++ /* what were these for? */
++ GULM_ASSERT (lck->magic_one == 0xAAAAAAAA,
++ log_msg_lck (lck, "Bad gulm_lock magic.\n"););
++ GULM_ASSERT (lt->magic_one == 0xAAAAAAAA,
++ log_msg_lck (lck, "Bad lock_table magic.\n"););
++
++ lvb_log_msg_lk (lck->key, "Entering gulm_hold_lvb\n");
++
++ GULM_ASSERT (lck->lvb == NULL,
++ log_msg_lck (lck,
++ "fsid=%s: Lvb data wasn't null! must be held "
++ "already.\n", fs->fs_name);
++ );
++
++ GULM_ASSERT (lck->actuallypending == FALSE, dump_gulm_lock_t (lck););
++
++ lck->lvb = kmalloc (fs->lvb_size, GFP_KERNEL);
++ if (lck->lvb == NULL) {
++ err = -ENOMEM;
++ goto fail;
++ }
++ memset (lck->lvb, 0, fs->lvb_size);
++
++ lck->req_type = glck_action;
++ lck->action = lg_lock_act_HoldLVB;
++ lck->result = 0;
++ lck->actuallypending = TRUE;
++ add_lock_to_send_req_queue (lt, lck, sr_act);
++
++ wait_for_completion (&lck->actsleep);
++
++ if (lck->result != lg_err_Ok) {
++ log_err ("fsid=%s: Got error %d on hold lvb request.\n",
++ fs->fs_name, lck->result);
++ kfree (lck->lvb);
++ lck->lvb = NULL;
++ goto fail;
++ }
++
++ lt->locks_lvbs++;
++
++ *lvbp = lck->lvb;
++
++ lvb_log_msg_lk (lck->key, "fsid=%s: Exiting gulm_hold_lvb\n",
++ fs->fs_name);
++ return 0;
++ fail:
++ if (err != 0)
++ log_msg (lgm_Always,
++ "fsid=%s: Exiting gulm_hold_lvb with errors (%d)\n",
++ fs->fs_name, err);
++ return err;
++}
++
++/**
++ * gulm_unhold_lvb -
++ * @lock:
++ * @lvb:
++ *
++ *
++ * Returns: void
++ */
++void
++gulm_unhold_lvb (lm_lock_t * lock, char *lvb)
++{
++ gulm_lock_t *lck = NULL;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++
++ /* verify vars. */
++ lck = (gulm_lock_t *) lock;
++ if (verify_gulm_lock_t (lck) != 0) {
++ return;
++ }
++ lt = &gulm_cm.ltpx;
++ fs = lck->fs;
++
++ GULM_ASSERT (lck->actuallypending == FALSE, dump_gulm_lock_t (lck););
++
++ if (lck->lvb != lvb) {
++ log_err ("fsid=%s: AH! LVB pointer missmatch! %p != %p\n",
++ fs->fs_name, lck->lvb, lvb);
++ goto exit;
++ }
++
++ lvb_log_msg_lk (lck->key, "Entering gulm_unhold_lvb\n");
++
++ lck->req_type = glck_action;
++ lck->action = lg_lock_act_UnHoldLVB;
++ lck->result = 0;
++ lck->actuallypending = TRUE;
++ add_lock_to_send_req_queue (lt, lck, sr_act);
++
++ wait_for_completion (&lck->actsleep);
++
++ /* XXX ummm, is it sane to not free the memory if the command fails?
++ * gfs will still think that the lvb was dropped successfully....
++ * (it assumes it is always successful)
++ * Maybe I should retry the drop request then?
++ */
++ if (lck->result != lg_err_Ok) {
++ log_err ("fsid=%s: Got error %d on unhold LVB request.\n",
++ lck->fs->fs_name, lck->result);
++ } else {
++ if (lck->lvb != NULL)
++ kfree (lck->lvb);
++ lck->lvb = NULL;
++ lt->locks_lvbs--;
++ }
++ exit:
++ lvb_log_msg ("Exiting gulm_unhold_lvb\n");
++}
++
++/**
++ * gulm_sync_lvb -
++ * @lock:
++ * @lvb:
++ *
++ * umm, is this even used anymore? yes.
++ *
++ * Returns: void
++ */
++void
++gulm_sync_lvb (lm_lock_t * lock, char *lvb)
++{
++ gulm_lock_t *lck = NULL;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++
++ /* verify vars. */
++ lck = (gulm_lock_t *) lock;
++ if (verify_gulm_lock_t (lck) != 0) {
++ return;
++ }
++ lt = &gulm_cm.ltpx;
++ fs = lck->fs;
++
++ GULM_ASSERT (lck->actuallypending == FALSE, dump_gulm_lock_t (lck););
++
++ /* this check is also in the server, so it isn't really needed here. */
++ if (lck->last_suc_state != LM_ST_EXCLUSIVE) {
++ log_err ("sync_lvb: You must hold the lock Exclusive first.\n");
++ goto exit; /*cannot do anything */
++ }
++ if (lck->lvb == NULL) {
++ log_err ("sync_lvb: You forgot to call hold lvb first.\n");
++ goto exit;
++ }
++ if (lck->lvb != lvb) {
++ log_err ("fsid=%s: AH! LVB pointer missmatch! %p != %p\n",
++ fs->fs_name, lck->lvb, lvb);
++ goto exit;
++ }
++
++ lvb_log_msg_lk (lck->key, "Entering gulm_sync_lvb\n");
++
++ lck->req_type = glck_action;
++ lck->action = lg_lock_act_SyncLVB;
++ lck->result = 0;
++ lck->actuallypending = TRUE;
++ add_lock_to_send_req_queue (lt, lck, sr_act);
++
++ wait_for_completion (&lck->actsleep);
++
++ /* XXX? retry if I get an error? */
++ if (lck->result != lg_err_Ok) {
++ log_err_lck (lck,
++ "fsid=%s: Got error %d:%s on Sync LVB request.\n",
++ fs->fs_name, lck->result,
++ gio_Err_to_str (lck->result));
++ }
++ exit:
++ lvb_log_msg ("Exiting gulm_sync_lvb\n");
++}
++
++/*****************************************************************************/
++static int
++gulm_plock_get (lm_lockspace_t * lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t * start, uint64_t * end, int *exclusive,
++ unsigned long *rowner)
++{
++ return -ENOSYS;
++}
++
++static int
++gulm_plock (lm_lockspace_t * lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ int wait, int exclusive, uint64_t start, uint64_t end)
++{
++ return -ENOSYS;
++}
++
++static int
++gulm_punlock (lm_lockspace_t * lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t start, uint64_t end)
++{
++ return -ENOSYS;
++}
++
++/****************************************************************************/
++/****************************************************************************/
++/****************************************************************************/
++/* should move the firstmounter lock stuff into its own file perhaps? */
++/**
++ * get_special_lock -
++ * @fs: <> filesystem we're getting special lock for
++ *
++ * Returns: gulm_lock_t
++ */
++STATIC gulm_lock_t *
++get_special_lock (gulm_fs_t * fs)
++{
++ int err, len;
++ gulm_lock_t *lck = NULL;
++ uint8_t key[GIO_KEY_SIZE];
++
++ /* pack lockname */
++ memset (key, 0, GIO_KEY_SIZE);
++ /* The F at the beginning doesn't clash with the G that prefixes every fs
++ * lock.
++ */
++ memcpy (key, "FirstMount", 10);
++ len = strlen (fs->fs_name);
++ strncpy (&key[10], fs->fs_name, GIO_KEY_SIZE - 21);
++ len = MIN (len, GIO_KEY_SIZE - 21);
++ len += 11;
++
++ err = internal_gulm_get_lock (fs, key, len, &lck);
++
++ /* return pointer */
++ return lck;
++}
++
++/**
++ * do_lock_time_out -
++ * @d:
++ *
++ * after timeout, set cancel request on the handler queue. (since we cannot
++ * call it from within the timer code.)
++ *
++ */
++static void
++do_lock_time_out (unsigned long d)
++{
++ gulm_lock_t *lck = (gulm_lock_t *) d;
++ qu_function_call (&lck->fs->cq, gulm_cancel, lck);
++}
++
++/**
++ * get_mount_lock -
++ * @fs:
++ * @first:
++ *
++ * Get the Firstmount lock.
++ * We try to grab it Exl. If we get that, then we are the first client
++ * mounting this fs. Otherwise we grab it shared to show that there are
++ * clients using this fs.
++ *
++ * Returns: int
++ */
++int
++get_mount_lock (gulm_fs_t * fs, int *first)
++{
++ int err;
++ struct timer_list locktimeout;
++ gulm_lock_t *lck = NULL;
++ /*
++ * first we need to get the lock into the hash.
++ * then we can try to get it Exl with try and noexp.
++ * if the try fails, grab it shared.
++ */
++
++ lck = get_special_lock (fs); /* there is only a mount lock. */
++ if (lck == NULL) {
++ err = -ENOMEM;
++ goto fail;
++ }
++
++ fs->mountlock = lck;
++ try_it_again:
++ *first = FALSE; /* assume we're not first */
++
++ err = gulm_lock (lck, LM_ST_UNLOCKED, LM_ST_EXCLUSIVE,
++ LM_FLAG_TRY | LM_FLAG_NOEXP);
++#ifndef USE_SYNC_LOCKING
++ wait_for_completion (&fs->sleep);
++#endif
++
++ if ((lck->result & LM_OUT_ST_MASK) == LM_ST_EXCLUSIVE) {
++ /* we got the lock, we're the first mounter. */
++ *first = TRUE;
++ log_msg (lgm_locking, "fsid=%s: Got mount lock Exclusive.\n",
++ fs->fs_name);
++ return 0;
++ } else if ((lck->result & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
++ log_msg (lgm_locking,
++ "fsid=%s: Didn't get mount lock Exl, someone else "
++ "was first, trying for shared.\n", fs->fs_name);
++
++ /* the try failed, pick it up shared. */
++ /* There was a case (bug #220) where we could hang here.
++ *
++ * To handle this, we put up a timer for a couple of
++ * minutes. If it trips, it cancels our shared
++ * request, which we then see, so we go back and try the
++ * EXL again. If the Firstmounter is fine and is just
++ * taking a damn long time to do its work, this just ends
++ * back here, no worse for the wear.
++ *
++ * Another way to do this, is to wait for a killed message
++ * for the master. When we get that, && we're pending
++ * shared here, send the gulm_cancel for the mounter lock.
++ * (too bad we are not in the fs list yet at this point.
++ * (well, maybe that *isn't* a bad thing))
++ */
++ init_timer (&locktimeout);
++ locktimeout.function = do_lock_time_out;
++ locktimeout.data = (unsigned long) lck;
++ mod_timer (&locktimeout, jiffies + (120 * HZ));
++ err = gulm_lock (lck, LM_ST_UNLOCKED, LM_ST_SHARED, 0);
++#ifndef USE_SYNC_LOCKING
++ wait_for_completion (&fs->sleep);
++#endif
++ del_timer (&locktimeout);
++
++ if ((lck->result & LM_OUT_ST_MASK) == LM_ST_SHARED) {
++ /* kewl we got it. */
++ log_msg (lgm_locking,
++ "fsid=%s: Got mount lock shared.\n",
++ fs->fs_name);
++ return 0;
++ }
++
++ log_msg (lgm_locking,
++ "fsid=%s: Shared req timed out, trying Exl again.\n",
++ fs->fs_name);
++ goto try_it_again;
++ }
++ fail:
++ log_err ("Exit get_mount_lock err=%d\n", err);
++ return err;
++}
++
++/**
++ * downgrade_mount_lock -
++ * @fs:
++ *
++ * drop the Firstmount lock down to shared. This lets others mount.
++ *
++ * Returns: int
++ */
++int
++downgrade_mount_lock (gulm_fs_t * fs)
++{
++ int err;
++ gulm_lock_t *lck = (gulm_lock_t *) fs->mountlock;
++ /* we were first, so we have it exl.
++ * shift it to shared so others may mount.
++ */
++ err = gulm_lock (lck, LM_ST_EXCLUSIVE, LM_ST_SHARED, LM_FLAG_NOEXP);
++#ifndef USE_SYNC_LOCKING
++ wait_for_completion (&fs->sleep);
++#endif
++
++ if ((lck->result & LM_OUT_ST_MASK) != LM_ST_SHARED) {
++ log_err
++ ("fsid=%s: Couldn't downgrade mount lock to shared!!!!!\n",
++ fs->fs_name);
++ }
++ return 0;
++}
++
++/**
++ * drop_mount_lock - drop our hold on the firstmount lock.
++ * @fs: <> the filesystem pointer.
++ *
++ * Returns: int
++ */
++int
++drop_mount_lock (gulm_fs_t * fs)
++{
++ int err;
++ gulm_lock_t *lck = (gulm_lock_t *) fs->mountlock;
++
++ if (fs->mountlock == NULL) {
++ log_err ("fsid=%s: There's no Mount lock!!!!!\n", fs->fs_name);
++ return -1;
++ }
++ err = gulm_unlock (lck, LM_ST_SHARED);
++#ifndef USE_SYNC_LOCKING
++ wait_for_completion (&fs->sleep);
++#endif
++
++ if (lck->result != LM_ST_UNLOCKED)
++ log_err ("fsid=%s: Couldn't unlock mount lock!!!!!!\n",
++ fs->fs_name);
++ gulm_put_lock (fs->mountlock);
++ fs->mountlock = NULL;
++ return 0;
++}
++
++/*****************************************************************************/
++struct lm_lockops gulm_ops = {
++ lm_proto_name:PROTO_NAME,
++ lm_mount:gulm_mount,
++ lm_others_may_mount:gulm_others_may_mount,
++ lm_unmount:gulm_unmount,
++ lm_get_lock:gulm_get_lock,
++ lm_put_lock:gulm_put_lock,
++ lm_lock:gulm_lock,
++ lm_unlock:gulm_unlock,
++ lm_cancel:gulm_cancel,
++ lm_hold_lvb:gulm_hold_lvb,
++ lm_unhold_lvb:gulm_unhold_lvb,
++ lm_sync_lvb:gulm_sync_lvb,
++ lm_plock_get:gulm_plock_get,
++ lm_plock:gulm_plock,
++ lm_punlock:gulm_punlock,
++ lm_recovery_done:gulm_recovery_done,
++ lm_owner:THIS_MODULE,
++};
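++
++/* Sketch of how the filesystem side drives this table (illustration;
++ * the calling convention is assumed, not defined in this file): GFS
++ * picks the module whose lm_proto_name matches the mount, then calls
++ * through the pointers, roughly:
++ *
++ * gulm_ops.lm_get_lock (lockspace, &name, &lock);
++ * gulm_ops.lm_lock (lock, LM_ST_UNLOCKED, LM_ST_SHARED, 0);
++ * gulm_ops.lm_unlock (lock, LM_ST_SHARED);
++ * gulm_ops.lm_put_lock (lock);
++ */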
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_prints.h linux-patched/fs/gfs_locking/lock_gulm/gulm_prints.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_prints.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_prints.h 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,45 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __gulm_prints_h__
++#define __gulm_prints_h__
++#include "gulm_log_msg_bits.h"
++
++#define PROTO_NAME "lock_gulm"
++
++#ifdef GULM_ASSERT
++#undef GULM_ASSERT
++#endif
++#define GULM_ASSERT(x, do) \
++{ \
++ if (!(x)) \
++ { \
++ printk("\n"PROTO_NAME": Assertion failed on line %d of file %s\n" \
++ PROTO_NAME": assertion: \"%s\"\n", \
++ __LINE__, __FILE__, #x ); \
++ {do} \
++ panic("\n"PROTO_NAME": Record message above and reboot.\n"); \
++ } \
++}
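++
++/* Usage note (illustrative): the second argument is a statement block
++ * run before the panic, handy for dumping state, e.g.
++ *
++ * GULM_ASSERT (lck->actuallypending == FALSE, dump_gulm_lock_t (lck););
++ */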
++
++#define log_msg(v, fmt, args...) if(((v)&gulm_cm.verbosity)==(v)||(v)==lgm_Always) {\
++ printk(PROTO_NAME ": " fmt, ## args); \
++}
++#define log_err(fmt, args...) {\
++ printk(KERN_ERR PROTO_NAME ": ERROR " fmt, ## args); \
++}
++
++#define log_nop(fmt, args...)
++#define TICK printk("TICK==>" PROTO_NAME ": [%s:%d] pid:%ld\n",__FILE__,__LINE__,osi_pid())
++
++#endif /*__gulm_prints_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_procinfo.c linux-patched/fs/gfs_locking/lock_gulm/gulm_procinfo.c
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_procinfo.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_procinfo.c 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,165 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++#include <linux/kernel.h>
++#include <linux/proc_fs.h>
++#include "util.h"
++
++extern gulm_cm_t gulm_cm;
++
++struct proc_dir_entry *gulm_proc_dir;
++struct proc_dir_entry *gulm_fs_proc_dir;
++
++/* the read operating function. */
++int
++gulm_fs_proc_read (char *buf, char **start, off_t off, int count, int *eof,
++ void *data)
++{
++ gulm_fs_t *fs = (gulm_fs_t *) data;
++ count = 0; /* ignore how much it wants */
++
++ count += sprintf (buf + count, "Filesystem: %s\nJID: %d\n"
++ "handler_queue_cur: %d\n"
++ "handler_queue_max: %d\n",
++ fs->fs_name, fs->fsJID,
++ fs->cq.task_count, fs->cq.task_max);
++
++ *eof = TRUE;
++ if (off >= count)
++ return 0;
++ *start = buf + off;
++ return (count - off);
++}
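++
++/* Note on the read handlers here (illustrative): the whole report is
++ * rebuilt on every call; *eof is set so the kernel stops after one
++ * pass, and *start/off implement the usual partial-read contract for
++ * single-page proc files.
++ */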
++
++/* read the stuff for all */
++int
++gulm_core_proc_read (char *buf, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ count = 0; /* ignore how much it wants */
++
++ count = sprintf (buf,
++ "cluster id: %s\n"
++ "my name: %s\n", gulm_cm.clusterID, gulm_cm.myName);
++
++ *eof = TRUE;
++ if (off >= count)
++ return 0;
++ *start = buf + off;
++ return (count - off);
++}
++
++int
++gulm_lt_proc_read (char *buf, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ lock_table_t *lt = (lock_table_t *) data;
++ count = 0; /* ignore how much it wants */
++
++ count += sprintf (buf + count, "\n"
++ "lock counts:\n"
++ " total: %d\n"
++ " unl: %d\n"
++ " exl: %d\n"
++ " shd: %d\n"
++ " dfr: %d\n"
++ "pending: %d\n"
++ " lvbs: %d\n"
++ " lops: %d\n\n",
++ lt->locks_total,
++ lt->locks_unl,
++ lt->locks_exl,
++ lt->locks_shd,
++ lt->locks_dfr,
++ atomic_read (&lt->locks_pending),
++ lt->locks_lvbs, lt->lops);
++
++ *eof = TRUE;
++ if (off >= count)
++ return 0;
++ *start = buf + off;
++ return (count - off);
++}
++
++/* add entry to our proc folder
++ * call this on mount.
++ * */
++int
++add_to_proc (gulm_fs_t * fs)
++{
++ if (!(create_proc_read_entry (fs->fs_name, S_IFREG | S_IRUGO,
++ gulm_fs_proc_dir, gulm_fs_proc_read,
++ (void *) fs))) {
++ log_err ("couldn't register proc entry for %s\n", fs->fs_name);
++ return -EINVAL;
++ }
++ return 0;
++}
++
++/* get rid of it
++ * call this on umount.
++ * */
++void
++remove_from_proc (gulm_fs_t * fs)
++{
++ remove_proc_entry (fs->fs_name, gulm_fs_proc_dir);
++}
++
++/* create our own root dir.
++ * call this on module init.
++ * */
++int
++init_proc_dir (void)
++{
++ if ((gulm_proc_dir = proc_mkdir ("gulm", &proc_root)) == NULL) {
++ log_err ("cannot create the gulm directory in /proc\n");
++ return -EINVAL;
++ }
++ if (!(create_proc_read_entry ("core", S_IFREG | S_IRUGO, gulm_proc_dir,
++ gulm_core_proc_read, NULL))) {
++ log_err ("couldn't register proc entry for core\n");
++ remove_proc_entry ("gulm", &proc_root);
++ return -EINVAL;
++ }
++ if ((gulm_fs_proc_dir =
++ proc_mkdir ("filesystems", gulm_proc_dir)) == NULL) {
++ log_err
++ ("cannot create the filesystems directory in /proc/gulm\n");
++ remove_proc_entry ("core", gulm_proc_dir);
++ remove_proc_entry ("gulm", &proc_root);
++ return -EINVAL;
++ }
++ if (!(create_proc_read_entry ("lockspace", S_IFREG | S_IRUGO,
++ gulm_proc_dir, gulm_lt_proc_read,
++ (void *) &gulm_cm.ltpx))) {
++ remove_proc_entry ("filesystems", gulm_proc_dir);
++ remove_proc_entry ("core", gulm_proc_dir);
++ remove_proc_entry ("gulm", &proc_root);
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++/* destroy it
++ * close module
++ * */
++void
++remove_proc_dir (void)
++{
++ remove_proc_entry ("lockspace", gulm_proc_dir);
++ remove_proc_entry ("filesystems", gulm_proc_dir);
++ remove_proc_entry ("core", gulm_proc_dir);
++ remove_proc_entry ("gulm", &proc_root);
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_procinfo.h linux-patched/fs/gfs_locking/lock_gulm/gulm_procinfo.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_procinfo.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_procinfo.h 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __procinfo_h__
++#define __procinfo_h__
++int add_to_proc (gulm_fs_t * fs);
++void remove_from_proc (gulm_fs_t * fs);
++void remove_locktables_from_proc (void);
++void add_locktables_to_proc (void);
++int init_proc_dir (void);
++void remove_proc_dir (void);
++#endif /*__procinfo_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/handler.c linux-patched/fs/gfs_locking/lock_gulm/handler.c
+--- linux-orig/fs/gfs_locking/lock_gulm/handler.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/handler.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,343 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/smp_lock.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include "handler.h"
++
++/* things about myself
++ * mostly just for verbosity here.
++ * */
++extern gulm_cm_t gulm_cm;
++
++/* the task struct */
++typedef struct runtask_s {
++ struct list_head rt_list;
++
++ gulm_fn fn;
++ lm_callback_t cb;
++ lm_fsdata_t *fsdata;
++ int type;
++ uint64_t lmnum;
++ unsigned int lmtype;
++ int result;
++
++} runtask_t;
++/* ooo crufty. */
++#define LM_CB_GULM_FN 169
++#if LM_CB_GULM_FN == LM_CB_NEED_E || \
++ LM_CB_GULM_FN == LM_CB_NEED_D || \
++ LM_CB_GULM_FN == LM_CB_NEED_S || \
++ LM_CB_GULM_FN == LM_CB_NEED_RECOVERY || \
++ LM_CB_GULM_FN == LM_CB_DROPLOCKS || \
++ LM_CB_GULM_FN == LM_CB_ASYNC
++#error "LM_CB_GULM_FN collision with other LM_CB_*"
++#endif
++
++static __inline__ int
++queue_empty (callback_qu_t * cq)
++{
++ int ret;
++ spin_lock (&cq->list_lock);
++ ret = list_empty (&cq->run_tasks);
++ spin_unlock (&cq->list_lock);
++ return ret;
++}
++
++/**
++ * handler -
++ * @d:
++ *
++ *
++ * Returns: int
++ */
++int
++handler (void *d)
++{
++ callback_qu_t *cq = (callback_qu_t *) d;
++ runtask_t *rt;
++ struct list_head *tmp;
++ struct lm_lockname lockname;
++ struct lm_async_cb acb;
++
++ daemonize ("gulm_Cb_Handler");
++ atomic_inc (&cq->num_threads);
++ complete (&cq->startup);
++
++ while (cq->running) {
++ do {
++ DECLARE_WAITQUEUE (__wait_chan, current);
++ current->state = TASK_INTERRUPTIBLE;
++ add_wait_queue (&cq->waiter, &__wait_chan);
++ if (queue_empty (cq))
++ schedule ();
++ remove_wait_queue (&cq->waiter, &__wait_chan);
++ current->state = TASK_RUNNING;
++ } while (0);
++
++ if (!cq->running)
++ break;
++ /* remove item from list */
++ spin_lock (&cq->list_lock);
++ if (list_empty (&cq->run_tasks)) {
++ spin_unlock (&cq->list_lock);
++ continue; /* nothing here. move on */
++ }
++ /* take items off the end of the list, since we add them to the
++ * beginning.
++ */
++ tmp = (&cq->run_tasks)->prev;
++ list_del (tmp);
++ cq->task_count--;
++ spin_unlock (&cq->list_lock);
++
++ rt = list_entry (tmp, runtask_t, rt_list);
++
++ if (rt->type == LM_CB_ASYNC) {
++ acb.lc_name.ln_number = rt->lmnum;
++ acb.lc_name.ln_type = rt->lmtype;
++ acb.lc_ret = rt->result;
++ rt->cb (rt->fsdata, rt->type, &acb);
++ } else if (rt->type == LM_CB_GULM_FN) {
++ rt->fn (rt->fsdata);
++ } else {
++ lockname.ln_number = rt->lmnum;
++ lockname.ln_type = rt->lmtype;
++ rt->cb (rt->fsdata, rt->type, &lockname);
++ }
++
++ kfree (rt);
++
++ } /*while(running) */
++
++ atomic_dec (&cq->num_threads);
++ complete (&cq->startup);
++ return 0;
++}
++
++/**
++ * display_handler_queue -
++ * @cq:
++ *
++ * remember, items are added to the head, and removed from the tail.
++ * So the last item listed, is the next item to be handled.
++ *
++ */
++void
++display_handler_queue (callback_qu_t * cq)
++{
++ struct list_head *lltmp;
++ runtask_t *rt;
++ int i = 0;
++ log_msg (lgm_Always, "Dumping Handler queue with %d items, max %d\n",
++ cq->task_count, cq->task_max);
++ spin_lock (&cq->list_lock);
++ list_for_each (lltmp, &cq->run_tasks) {
++ rt = list_entry (lltmp, runtask_t, rt_list);
++ if (rt->type == LM_CB_ASYNC) {
++ log_msg (lgm_Always,
++ "%4d ASYNC (%" PRIu64 ", %u) result:%#x\n",
++ i, rt->lmnum, rt->lmtype, rt->result);
++ } else if (rt->type == LM_CB_GULM_FN) {
++ log_msg (lgm_Always, "%4d GULM FN func:%p data:%p\n",
++ i, rt->fn, rt->fsdata);
++ } else { /* callback. */
++ log_msg (lgm_Always,
++ "%4d CALLBACK req:%u (%" PRIu64 ", %u)\n", i,
++ rt->type, rt->lmnum, rt->lmtype);
++ }
++ i++;
++ }
++ spin_unlock (&cq->list_lock);
++}
++
++/**
++ * alloc_runtask -
++ * Returns: runtask_t
++ */
++runtask_t *
++alloc_runtask (void)
++{
++ runtask_t *rt;
++ rt = kmalloc (sizeof (runtask_t), GFP_KERNEL);
++ return rt;
++}
++
++/**
++ * qu_function_call -
++ * @cq:
++ * @fn:
++ * @data:
++ *
++ * Generic function execing on the handler thread. Mostly so I can add
++ * single things quick without having to build all the details into the
++ * handler queues.
++ *
++ * Returns: int
++ */
++int
++qu_function_call (callback_qu_t * cq, gulm_fn fn, void *data)
++{
++ runtask_t *rt;
++ rt = alloc_runtask ();
++ if (rt == NULL)
++ return -ENOMEM;
++ rt->cb = NULL;
++ rt->fn = fn;
++ rt->fsdata = data;
++ rt->type = LM_CB_GULM_FN;
++ rt->lmtype = 0;
++ rt->lmnum = 0;
++ rt->result = 0;
++ INIT_LIST_HEAD (&rt->rt_list);
++ spin_lock (&cq->list_lock);
++ list_add (&rt->rt_list, &cq->run_tasks);
++ cq->task_count++;
++ if (cq->task_count > cq->task_max)
++ cq->task_max = cq->task_count;
++ spin_unlock (&cq->list_lock);
++ wake_up (&cq->waiter);
++ return 0;
++}
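++
++/* Usage sketch (illustration): defer work to a handler thread instead
++ * of running it in the current context, as the lock timeout code does
++ * when it cannot call gulm_cancel() from the timer directly:
++ *
++ * qu_function_call (&lck->fs->cq, gulm_cancel, lck);
++ */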
++
++/**
++ * qu_async_rpl -
++ * @cq:
++ * @cb:
++ * @fsdata:
++ * @lockname:
++ * @result:
++ *
++ *
++ * Returns: int
++ */
++int
++qu_async_rpl (callback_qu_t * cq, lm_callback_t cb, lm_fsdata_t * fsdata,
++ struct lm_lockname *lockname, int result)
++{
++ runtask_t *rt;
++ rt = alloc_runtask ();
++ if (rt == NULL)
++ return -ENOMEM;
++ rt->cb = cb;
++ rt->fsdata = fsdata;
++ rt->type = LM_CB_ASYNC;
++ rt->lmtype = lockname->ln_type;
++ rt->lmnum = lockname->ln_number;
++ rt->result = result;
++ INIT_LIST_HEAD (&rt->rt_list);
++ spin_lock (&cq->list_lock);
++ list_add (&rt->rt_list, &cq->run_tasks);
++ cq->task_count++;
++ if (cq->task_count > cq->task_max)
++ cq->task_max = cq->task_count;
++ spin_unlock (&cq->list_lock);
++ wake_up (&cq->waiter);
++ return 0;
++}
++
++/**
++ * qu_drop_req -
++ *
++ * Returns: <0:Error; =0:Ok
++ */
++int
++qu_drop_req (callback_qu_t * cq, lm_callback_t cb, lm_fsdata_t * fsdata,
++ int type, uint8_t lmtype, uint64_t lmnum)
++{
++ runtask_t *rt;
++ rt = alloc_runtask ();
++ if (rt == NULL)
++ return -ENOMEM;
++ rt->cb = cb;
++ rt->fsdata = fsdata;
++ rt->type = type;
++ rt->lmtype = lmtype;
++ rt->lmnum = lmnum;
++ rt->result = 0;
++ INIT_LIST_HEAD (&rt->rt_list);
++ spin_lock (&cq->list_lock);
++ list_add (&rt->rt_list, &cq->run_tasks);
++ cq->task_count++;
++ if (cq->task_count > cq->task_max)
++ cq->task_max = cq->task_count;
++ spin_unlock (&cq->list_lock);
++ wake_up (&cq->waiter);
++ return 0;
++}
++
++/**
++ * stop_callback_qu - stop the handler thread
++ */
++void
++stop_callback_qu (callback_qu_t * cq)
++{
++ struct list_head *lltmp, *tmp;
++ runtask_t *rt;
++
++ if (cq->running) {
++ cq->running = FALSE;
++ /* make sure all thread stop.
++ * */
++ while (atomic_read (&cq->num_threads) > 0) {
++ wake_up (&cq->waiter);
++ wait_for_completion (&cq->startup);
++ }
++ /* clear out any left overs. */
++ list_for_each_safe (tmp, lltmp, &cq->run_tasks) {
++ rt = list_entry (tmp, runtask_t, rt_list);
++ list_del (tmp);
++ kfree (rt);
++ }
++ }
++}
++
++/**
++ * start_callback_qu -
++ *
++ * Returns: <0:Error, >=0:Ok
++ */
++int
++start_callback_qu (callback_qu_t * cq, int cnt)
++{
++ int err;
++ INIT_LIST_HEAD (&cq->run_tasks);
++ spin_lock_init (&cq->list_lock);
++ init_completion (&cq->startup);
++ init_waitqueue_head (&cq->waiter);
++ atomic_set (&cq->num_threads, 0);
++ cq->running = TRUE;
++ cq->task_count = 0;
++ cq->task_max = 0;
++ if (cnt <= 0)
++ cnt = 2;
++ for (; cnt > 0; cnt--) {
++ err = kernel_thread (handler, cq, 0); /* XXX linux part */
++ if (err < 0) {
++ stop_callback_qu (cq);
++ /* calling stop here might not behave correctly in all error
++ * cases.
++ */
++ return err;
++ }
++ wait_for_completion (&cq->startup);
++ }
++ return 0;
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/handler.h linux-patched/fs/gfs_locking/lock_gulm/handler.h
+--- linux-orig/fs/gfs_locking/lock_gulm/handler.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/handler.h 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,42 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __handler_h__
++#define __handler_h__
++#include <linux/lm_interface.h>
++
++struct callback_qu_s {
++ struct completion startup;
++ int running;
++ int task_count;
++ int task_max;
++ struct list_head run_tasks;
++ spinlock_t list_lock;
++ wait_queue_head_t waiter;
++ atomic_t num_threads;
++};
++typedef struct callback_qu_s callback_qu_t;
++
++/* kind of an excessive overloading */
++typedef void (*gulm_fn) (void *);
++int qu_function_call (callback_qu_t * cq, gulm_fn fn, void *data);
++
++int qu_async_rpl (callback_qu_t * cq, lm_callback_t cb, lm_fsdata_t * fsdata,
++ struct lm_lockname *lockname, int result);
++int qu_drop_req (callback_qu_t * cq, lm_callback_t cb, lm_fsdata_t * fsdata,
++ int type, uint8_t lmtype, uint64_t lmnum);
++int start_callback_qu (callback_qu_t * cq, int cnt);
++void stop_callback_qu (callback_qu_t * cq);
++void display_handler_queue (callback_qu_t * cq);
++
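++/* A rough usage sketch of this queue (illustrative only; my_cb,
++ * my_fsdata and lockname are hypothetical stand-ins, not part of
++ * this patch):
++ *
++ *	callback_qu_t cq;
++ *	if (start_callback_qu (&cq, 2) == 0) {
++ *		qu_async_rpl (&cq, my_cb, my_fsdata, &lockname, 0);
++ *		stop_callback_qu (&cq);
++ *	}
++ */
++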
++#endif /*__handler_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/lg_core.c linux-patched/fs/gfs_locking/lock_gulm/lg_core.c
+--- linux-orig/fs/gfs_locking/lock_gulm/lg_core.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/lg_core.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,736 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* All of the core related functions for services are here. */
++
++#include "lg_priv.h"
++
++/**
++ * lg_core_selector - get the socket used to talk to core
++ * @lgp: the gulm interface handle
++ *
++ *
++ * Returns: xdr_socket
++ */
++xdr_socket
++lg_core_selector (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL || lg->first_magic != LGMAGIC
++ || lg->last_magic != LGMAGIC)
++#ifdef __KERNEL__
++ return NULL;
++#else
++ return -EINVAL;
++#endif
++
++ return lg->core_fd;
++}
++
++/**
++ * lg_core_handle_messages - receive and dispatch one message from core
++ * @lgp: the gulm interface handle
++ * @ccbp: callbacks to dispatch into
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_handle_messages (gulm_interface_p lgp, lg_core_callbacks_t * ccbp,
++ void *misc)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_dec_t *dec;
++ int err = 0;
++ uint64_t x_gen;
++ uint32_t x_code, x_error, x_rank;
++ struct in6_addr x_ip;
++ uint8_t x_state, x_mode;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EBADR;
++
++ down (&lg->core_recver);
++	if (lg->in_core_hm) {
++		up (&lg->core_recver);
++		return -EDEADLK;
++	}
++ lg->in_core_hm = TRUE;
++ up (&lg->core_recver);
++
++ dec = lg->core_dec;
++
++ err = xdr_dec_uint32 (dec, &x_code);
++ if (err != 0)
++ goto exit;
++
++ if (gulm_core_login_rpl == x_code) {
++ do {
++ if ((err = xdr_dec_uint64 (dec, &x_gen)) < 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_error)) < 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_rank)) < 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) < 0)
++ break;
++ } while (0);
++ if (err != 0)
++ goto exit;
++ if (ccbp->login_reply == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->login_reply (misc, x_gen, x_error, x_rank, x_state);
++ goto exit;
++ } else if (gulm_core_logout_rpl == x_code) {
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ goto exit;
++ if (ccbp->logout_reply != NULL) {
++ err = ccbp->logout_reply (misc);
++ }
++
++ xdr_close (&lg->core_fd);
++ xdr_enc_release (lg->core_enc);
++ lg->core_enc = NULL;
++ xdr_dec_release (lg->core_dec);
++ lg->core_dec = NULL;
++
++ goto exit;
++ } else if (gulm_core_mbr_lstrpl == x_code) {
++ if (ccbp->nodelist != NULL) {
++ err = ccbp->nodelist (misc, lglcb_start, NULL, 0, 0);
++ if (err != 0)
++ goto exit;
++ }
++ do {
++ if ((err = xdr_dec_list_start (dec)) != 0)
++ break;
++ while (xdr_dec_list_stop (dec) != 0) {
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfba,
++ &lg->cfba_len)) != 0)
++ break;
++ if ((err = xdr_dec_ipv6 (dec, &x_ip)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_mode)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_mode)) != 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_rank)) != 0)
++ break;
++ if ((err = xdr_dec_uint64 (dec, &x_gen)) != 0)
++ break;
++ if ((err = xdr_dec_uint64 (dec, &x_gen)) != 0)
++ break;
++ if ((err = xdr_dec_uint64 (dec, &x_gen)) != 0)
++ break;
++
++ if (ccbp->nodelist != NULL) {
++ err =
++ ccbp->nodelist (misc, lglcb_item,
++ lg->cfba, &x_ip,
++ x_state);
++ if (err != 0)
++ goto exit;
++ }
++
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (ccbp->nodelist == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->nodelist (misc, lglcb_stop, NULL, 0, 0);
++ goto exit;
++ } else if (gulm_core_state_chgs == x_code) {
++ do {
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ if (x_state == gio_Mbr_ama_Slave) {
++ if ((err = xdr_dec_ipv6 (dec, &x_ip)) != 0)
++ break;
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfba,
++ &lg->cfba_len)) != 0)
++ break;
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (ccbp->statechange == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->statechange (misc, x_state, &x_ip, lg->cfba);
++ goto exit;
++ } else if (gulm_core_mbr_updt == x_code) {
++ do {
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfba,
++ &lg->cfba_len)) != 0)
++ break;
++ if ((err = xdr_dec_ipv6 (dec, &x_ip)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (ccbp->nodechange == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->nodechange (misc, lg->cfba, &x_ip, x_state);
++ goto exit;
++ } else if (gulm_core_res_list == x_code) {
++ if (ccbp->service_list != NULL) {
++ if ((err =
++ ccbp->service_list (misc, lglcb_start, NULL)) != 0)
++ goto exit;
++ }
++ do {
++ if ((err = xdr_dec_list_start (dec)) != 0)
++ break;
++ while (xdr_dec_list_stop (dec)) {
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfba,
++ &lg->cfba_len)) != 0)
++ break;
++ if (ccbp->service_list != NULL) {
++ if ((err =
++ ccbp->service_list (misc,
++ lglcb_item,
++ lg->cfba)) !=
++ 0) {
++ goto exit;
++ }
++ }
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (ccbp->service_list == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->service_list (misc, lglcb_stop, NULL);
++ goto exit;
++ } else if (gulm_info_stats_rpl == x_code) {
++ if (ccbp->status != NULL) {
++ if ((err =
++ ccbp->status (misc, lglcb_start, NULL, NULL)) != 0)
++ goto exit;
++ }
++ do {
++ if ((err = xdr_dec_list_start (dec)) != 0)
++ break;
++ while (xdr_dec_list_stop (dec) != 0) {
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfba,
++ &lg->cfba_len)) != 0)
++ break;
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfbb,
++ &lg->cfbb_len)) != 0)
++ break;
++ if (ccbp->status != NULL) {
++ if ((err =
++ ccbp->status (misc, lglcb_item,
++ lg->cfba,
++ lg->cfbb)) != 0) {
++ goto exit;
++ }
++ }
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (ccbp->status == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->status (misc, lglcb_stop, NULL, NULL);
++ goto exit;
++ } else if (gulm_err_reply == x_code) {
++ if ((err = xdr_dec_uint32 (dec, &x_code)) != 0)
++ goto exit;
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ goto exit;
++ if (ccbp->error == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->error (misc, x_error);
++ goto exit;
++ } else {
++ /* unknown code. what to do? */
++ err = -EPROTO;
++ goto exit;
++ }
++
++ exit:
++ lg->in_core_hm = FALSE;
++ return err;
++}
++
++/**
++ * lg_core_login -
++ * @lgp:
++ * @important:
++ *
++ * On any error, things are closed and released to the state of things
++ * before you called login.
++ *
++ * Returns: int
++ */
++int
++lg_core_login (gulm_interface_p lgp, int important)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct sockaddr_in6 adr;
++ int err;
++ xdr_socket cfd;
++ xdr_enc_t *enc;
++ xdr_dec_t *dec;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ adr.sin6_family = AF_INET6;
++ adr.sin6_addr = in6addr_loopback;
++ adr.sin6_port = htons (lg->core_port);
++
++ if ((err = xdr_open (&cfd)) < 0) {
++ return err;
++ }
++
++ if ((err = xdr_connect (&adr, cfd)) < 0) {
++ xdr_close (&cfd);
++ return err;
++ }
++
++ enc = xdr_enc_init (cfd, 128);
++ if (enc == NULL) {
++ xdr_close (&cfd);
++ return -ENOMEM;
++ }
++
++ dec = xdr_dec_init (cfd, 128);
++	if (dec == NULL) {
++ xdr_enc_release (enc);
++ xdr_close (&cfd);
++ return -ENOMEM;
++ }
++
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_reslgn_req)) < 0)
++ break;
++ if ((err = xdr_enc_uint32 (enc, GIO_WIREPROT_VERS)) < 0)
++ break;
++ if ((err = xdr_enc_string (enc, lg->clusterID)) < 0)
++ break;
++ if ((err = xdr_enc_string (enc, lg->service_name)) < 0)
++ break;
++ if ((err =
++ xdr_enc_uint32 (enc,
++ important ? gulm_svc_opt_important : 0)) !=
++ 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) < 0)
++ break;
++ } while (0);
++ if (err != 0) {
++ xdr_dec_release (dec);
++ xdr_enc_release (enc);
++ xdr_close (&cfd);
++ return err;
++ }
++
++ down (&lg->core_sender);
++ lg->core_fd = cfd;
++ lg->core_enc = enc;
++ lg->core_dec = dec;
++ up (&lg->core_sender);
++
++ return 0;
++}
++
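++/* Typical caller flow, as a sketch (my_core_cbs and the running flag
++ * are hypothetical stand-ins, not defined in this patch):
++ *
++ *	gulm_interface_p gi;
++ *	if (lg_initialize (&gi, "mycluster", "myservice") == 0 &&
++ *	    lg_core_login (gi, 0) == 0)
++ *		while (running)
++ *			lg_core_handle_messages (gi, &my_core_cbs, NULL);
++ */
++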
++/**
++ * lg_core_logout -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_logout (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_logout_req)) != 0)
++ break;
++ if ((err = xdr_enc_string (enc, lg->service_name)) != 0)
++ break;
++ if ((err = xdr_enc_uint8 (enc, gio_Mbr_ama_Resource)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_nodeinfo -
++ * @lgp:
++ * @nodename:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_nodeinfo (gulm_interface_p lgp, char *nodename)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ if (nodename == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_mbr_req)) != 0)
++ break;
++ if ((err = xdr_enc_string (enc, nodename)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_nodelist -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_nodelist (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_mbr_lstreq)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_servicelist -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_servicelist (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_res_req)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_corestate -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_corestate (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_state_req)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_shutdown -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_shutdown (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_shutdown)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_forceexpire -
++ * @lgp:
++ * @node_name:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_forceexpire (gulm_interface_p lgp, char *nodename)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ if (nodename == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_mbr_force)) != 0)
++ break;
++ if ((err = xdr_enc_string (enc, nodename)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_forcepending -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_forcepending (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_forcepend)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_status -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_status (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_info_stats_req)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/lg_lock.c linux-patched/fs/gfs_locking/lock_gulm/lg_lock.c
+--- linux-orig/fs/gfs_locking/lock_gulm/lg_lock.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/lg_lock.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,680 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* All of the lock related functions are here. */
++#include "lg_priv.h"
++
++/**
++ * lg_lock_selector - get the socket used to talk to the lock proxy
++ * @lgp: the gulm interface handle
++ *
++ *
++ * Returns: xdr_socket
++ */
++xdr_socket
++lg_lock_selector (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL || lg->first_magic != LGMAGIC
++ || lg->last_magic != LGMAGIC)
++#ifdef __KERNEL__
++ return NULL;
++#else
++ return -EINVAL;
++#endif
++
++ return lg->lock_fd;
++}
++
++/**
++ * lg_lock_handle_messages - receive and dispatch one lockspace message
++ * @lgp: the gulm interface handle
++ * @cbp: callbacks to dispatch into
++ *
++ * Returns: int
++ */
++int
++lg_lock_handle_messages (gulm_interface_p lgp, lg_lockspace_callbacks_t * cbp,
++ void *misc)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_dec_t *dec;
++ int err = 0;
++ uint32_t x_code, x_error, x_flags;
++ uint16_t x_keylen, x_lvblen = 0;
++ uint8_t x_state;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++	if (lg->lock_enc == NULL || lg->lock_dec == NULL)
++		return -EBADR;
++
++ down (&lg->lock_recver);
++	if (lg->in_lock_hm) {
++		up (&lg->lock_recver);
++		return -EDEADLK;
++	}
++ lg->in_lock_hm = TRUE;
++ up (&lg->lock_recver);
++
++ dec = lg->lock_dec;
++
++ err = xdr_dec_uint32 (dec, &x_code);
++ if (err != 0)
++ goto exit;
++
++ if (gulm_lock_login_rpl == x_code) {
++ do {
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ } while (0);
++ if (err != 0)
++ goto exit;
++ if (cbp->login_reply == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = cbp->login_reply (misc, x_error, x_state);
++ goto exit;
++ } else if (gulm_lock_logout_rpl == x_code) {
++ if (cbp->logout_reply != NULL) {
++ err = cbp->logout_reply (misc);
++ }
++
++ xdr_close (&lg->lock_fd);
++ xdr_enc_release (lg->lock_enc);
++ lg->lock_enc = NULL;
++ xdr_dec_release (lg->lock_dec);
++ lg->lock_dec = NULL;
++
++ goto exit;
++ } else if (gulm_lock_state_rpl == x_code) {
++ do {
++ if ((err =
++ xdr_dec_raw_ag (dec, (void **) &lg->lfba,
++ &lg->lfba_len, &x_keylen)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_flags)) != 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ break;
++ if (x_flags & gio_lck_fg_hasLVB) {
++ if ((err =
++ xdr_dec_raw_ag (dec, (void **) &lg->lfbb,
++ &lg->lfbb_len,
++ &x_lvblen)) != 0)
++ break;
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (x_keylen <= 4) {
++ err = -EPROTO; /* or something */
++ goto exit;
++ }
++ if (cbp->lock_state == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = cbp->lock_state (misc, &lg->lfba[4], x_keylen - 4,
++ x_state, x_flags, x_error,
++ lg->lfbb, x_lvblen);
++ goto exit;
++ } else if (gulm_lock_action_rpl == x_code) {
++ do {
++ if ((err =
++ xdr_dec_raw_ag (dec, (void **) &lg->lfba,
++ &lg->lfba_len, &x_keylen)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ break;
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (x_keylen <= 4) {
++ err = -EPROTO; /* or something */
++ goto exit;
++ }
++ if (cbp->lock_action == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err =
++ cbp->lock_action (misc, &lg->lfba[4], x_keylen - 4, x_state,
++ x_error);
++ goto exit;
++ } else if (gulm_lock_cb_state == x_code) {
++ do {
++ if ((err =
++ xdr_dec_raw_ag (dec, (void **) &lg->lfba,
++ &lg->lfba_len, &x_keylen)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (cbp->drop_lock_req == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err =
++ cbp->drop_lock_req (misc, &lg->lfba[4], x_keylen - 4,
++ x_state);
++ goto exit;
++ } else if (gulm_lock_cb_dropall == x_code) {
++ if (cbp->drop_all == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = cbp->drop_all (misc);
++ goto exit;
++ } else if (gulm_info_stats_rpl == x_code) {
++ if (cbp->status != NULL) {
++ if ((err =
++ cbp->status (misc, lglcb_start, NULL, NULL)) != 0)
++ goto exit;
++ }
++ do {
++ if ((err = xdr_dec_list_start (dec)) != 0)
++ break;
++ while (xdr_dec_list_stop (dec) != 0) {
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->lfba,
++ &lg->lfba_len)) != 0)
++ break;
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->lfbb,
++ &lg->lfbb_len)) != 0)
++ break;
++ if (cbp->status != NULL) {
++ if ((err =
++ cbp->status (misc, lglcb_item,
++ lg->lfba,
++ lg->lfbb)) != 0) {
++ break;
++ }
++ }
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (cbp->status == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = cbp->status (misc, lglcb_stop, NULL, NULL);
++ goto exit;
++ } else if (gulm_err_reply == x_code) {
++ do {
++ if ((err = xdr_dec_uint32 (dec, &x_code)) != 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ break;
++ } while (0);
++ if (err != 0)
++ goto exit;
++ if (cbp->error == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = cbp->error (misc, x_error);
++ goto exit;
++ } else {
++ err = -EPROTO;
++ goto exit;
++ }
++
++ exit:
++ lg->in_lock_hm = FALSE;
++ return err;
++}
++
++/**
++ * lg_lock_login - connect to the lock proxy and select a lockspace
++ * @lgp: the gulm interface handle
++ * @lockspace: four byte lockspace identifier
++ *
++ *
++ * Returns: int
++ */
++int
++lg_lock_login (gulm_interface_p lgp, uint8_t lockspace[4])
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct sockaddr_in6 adr;
++ int err;
++ xdr_socket cfd;
++ xdr_enc_t *enc;
++ xdr_dec_t *dec;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ adr.sin6_family = AF_INET6;
++ adr.sin6_addr = in6addr_loopback;
++ adr.sin6_port = htons (lg->lock_port);
++
++ if ((err = xdr_open (&cfd)) < 0) {
++ return err;
++ }
++
++ if ((err = xdr_connect (&adr, cfd)) < 0) {
++ xdr_close (&cfd);
++ return err;
++ }
++
++ enc = xdr_enc_init (cfd, 512);
++ if (enc == NULL) {
++ xdr_close (&cfd);
++ return -ENOMEM;
++ }
++
++ dec = xdr_dec_init (cfd, 512);
++	if (dec == NULL) {
++ xdr_enc_release (enc);
++ xdr_close (&cfd);
++ return -ENOMEM;
++ }
++
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_login_req)) < 0)
++ break;
++ if ((err = xdr_enc_uint32 (enc, GIO_WIREPROT_VERS)) < 0)
++ break;
++ if ((err = xdr_enc_string (enc, lg->service_name)) < 0)
++ break;
++ if ((err = xdr_enc_uint8 (enc, gio_lck_st_Client)) < 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) < 0)
++ break;
++
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_sel_lckspc)) < 0)
++ break;
++ if ((err = xdr_enc_raw (enc, lockspace, 4)) < 0)
++ break;
++ /* don't flush here.
++ * dumb programmer stunt. This way, the lockspace selection won't
++ * happen until the next thing the user of this lib sends. Which
++ * means it will be after we have received the login reply.
++ *
++ * Is there really a good reason not to flush here?
++ */
++ } while (0);
++ if (err != 0) {
++ xdr_dec_release (dec);
++ xdr_enc_release (enc);
++ xdr_close (&cfd);
++ return err;
++ }
++
++ down (&lg->lock_sender);
++ lg->lock_fd = cfd;
++ lg->lock_enc = enc;
++ lg->lock_dec = dec;
++
++ memcpy (lg->lockspace, lockspace, 4);
++ up (&lg->lock_sender);
++
++ return 0;
++}
++
++/**
++ * lg_lock_logout -
++ * @lgp: the gulm interface handle
++ *
++ *
++ * Returns: int
++ */
++int
++lg_lock_logout (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->lock_enc;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_logout_req)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
++
++/**
++ * lg_lock_state_req -
++ * @lgp:
++ * @key:
++ * @keylen:
++ * @state:
++ * @flags:
++ * @LVB:
++ * @LVBlen:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_lock_state_req (gulm_interface_p lgp, uint8_t * key, uint16_t keylen,
++ uint8_t state, uint32_t flags, uint8_t * LVB,
++ uint16_t LVBlen)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct iovec iov[2];
++ xdr_enc_t *enc;
++ uint32_t iflgs = 0;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ if (state != lg_lock_state_Unlock &&
++ state != lg_lock_state_Exclusive &&
++ state != lg_lock_state_Deferred && state != lg_lock_state_Shared)
++ return -EINVAL;
++
++ /* make sure only the accepted flags get passed through. */
++ if ((flags & lg_lock_flag_DoCB) == lg_lock_flag_DoCB)
++ iflgs |= lg_lock_flag_DoCB;
++ if ((flags & lg_lock_flag_Try) == lg_lock_flag_Try)
++ iflgs |= lg_lock_flag_Try;
++ if ((flags & lg_lock_flag_Any) == lg_lock_flag_Any)
++ iflgs |= lg_lock_flag_Any;
++ if ((flags & lg_lock_flag_IgnoreExp) == lg_lock_flag_IgnoreExp)
++ iflgs |= lg_lock_flag_IgnoreExp;
++ if ((flags & lg_lock_flag_Piority) == lg_lock_flag_Piority)
++ iflgs |= lg_lock_flag_Piority;
++
++ enc = lg->lock_enc;
++
++ if (LVB != NULL && LVBlen > 0)
++ iflgs |= gio_lck_fg_hasLVB;
++
++ iov[0].iov_base = lg->lockspace;
++ iov[0].iov_len = 4;
++ iov[1].iov_base = key;
++ iov[1].iov_len = keylen;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_state_req)) != 0)
++ break;
++ if ((err = xdr_enc_raw_iov (enc, 2, iov)) != 0)
++ break;
++ if ((err = xdr_enc_uint8 (enc, state)) != 0)
++ break;
++ if ((err = xdr_enc_uint32 (enc, iflgs)) != 0)
++ break;
++ if (iflgs & gio_lck_fg_hasLVB)
++ if ((err = xdr_enc_raw (enc, LVB, LVBlen)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
++
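++/* An example request, as a sketch ("mykey" stands in for a real lock
++ * key; gi is an already logged-in gulm_interface_p):
++ *
++ *	lg_lock_state_req (gi, (uint8_t *) "mykey", 5,
++ *			   lg_lock_state_Exclusive, lg_lock_flag_Try,
++ *			   NULL, 0);
++ *
++ * The grant or failure arrives asynchronously through the lock_state
++ * callback handed to lg_lock_handle_messages().
++ */
++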
++/**
++ * lg_lock_cancel_req -
++ * @lgp:
++ * @key:
++ * @keylen:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_lock_cancel_req (gulm_interface_p lgp, uint8_t * key, uint16_t keylen)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct iovec iov[2];
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->lock_enc;
++
++ iov[0].iov_base = lg->lockspace;
++ iov[0].iov_len = 4;
++ iov[1].iov_base = key;
++ iov[1].iov_len = keylen;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_action_req)) != 0)
++ break;
++ if ((err = xdr_enc_raw_iov (enc, 2, iov)) != 0)
++ break;
++ if ((err = xdr_enc_uint8 (enc, gio_lck_st_Cancel)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
++
++/**
++ * lg_lock_action_req -
++ * @lgp:
++ * @key:
++ * @keylen:
++ * @action:
++ * @LVB:
++ * @LVBlen:
++ *
++ * XXX
++ * I wonder if I should actually break this into three seperate calls for
++ * the lvb stuff. Does it really matter?
++ *
++ * Returns: int
++ */
++int
++lg_lock_action_req (gulm_interface_p lgp, uint8_t * key, uint16_t keylen,
++ uint8_t action, uint8_t * LVB, uint16_t LVBlen)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct iovec iov[2];
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ if (action != lg_lock_act_HoldLVB &&
++ action != lg_lock_act_UnHoldLVB && action != lg_lock_act_SyncLVB)
++ return -EINVAL;
++
++ enc = lg->lock_enc;
++
++ iov[0].iov_base = lg->lockspace;
++ iov[0].iov_len = 4;
++ iov[1].iov_base = key;
++ iov[1].iov_len = keylen;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_action_req)) != 0)
++ break;
++ if ((err = xdr_enc_raw_iov (enc, 2, iov)) != 0)
++ break;
++ if ((err = xdr_enc_uint8 (enc, action)) != 0)
++ break;
++ if (action == gio_lck_st_SyncLVB)
++ if ((err = xdr_enc_raw (enc, LVB, LVBlen)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
++
++/**
++ * lg_lock_drop_exp -
++ * @lgp: the gulm interface handle
++ * @holder:
++ * @keymask:
++ * @kmlen:
++ *
++ * holder is the node name of the expired holder that you want to clear.
++ * Only locks matching the keymask will be looked at (most of the time
++ * you will just set the mask to a run of 0xff bytes to match all).
++ * The keymask lets you subdivide your lockspace into smaller separate
++ * parts. (For example, there is one gfs lockspace, but each filesystem
++ * gets its own subpart of that larger space.)
++ *
++ * If holder is NULL, all expired holders in your lockspace will get
++ * dropped.
++ *
++ * Returns: int
++ */
++int
++lg_lock_drop_exp (gulm_interface_p lgp, uint8_t * holder, uint8_t * key,
++ uint16_t keylen)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct iovec iov[2];
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->lock_enc;
++
++ iov[0].iov_base = lg->lockspace;
++ iov[0].iov_len = 4;
++ iov[1].iov_base = key;
++ iov[1].iov_len = (key != NULL) ? keylen : 0;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_drop_exp)) != 0)
++ break;
++ if ((err = xdr_enc_string (enc, holder)) != 0)
++ break;
++ if ((err = xdr_enc_raw_iov (enc, 2, iov)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
++
++/**
++ * lg_lock_status -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_lock_status (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->lock_enc;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_info_stats_req)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/lg_main.c linux-patched/fs/gfs_locking/lock_gulm/lg_main.c
+--- linux-orig/fs/gfs_locking/lock_gulm/lg_main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/lg_main.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,216 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* This is where all of the library specific functions exist.
++ * Not many, but keeps things clean.
++ */
++
++#include "lg_priv.h"
++#include "gulm.h"
++extern gulm_cm_t gulm_cm;
++
++/**
++ * lg_initialize - allocate and set up an interface structure
++ * @ret: receives the new gulm_interface_p
++ * @cluster_name:
++ * @service_name:
++ *
++ * If an error is returned, the value at @ret is left untouched.
++ *
++ * Returns: int
++ */
++int
++lg_initialize (gulm_interface_p * ret, char *cluster_name, char *service_name)
++{
++ gulm_interface_t *lg;
++ int err, len;
++
++ lg = kmalloc (sizeof (gulm_interface_t), GFP_KERNEL);
++ if (lg == NULL)
++ return -ENOMEM;
++
++ memset (lg, 0, sizeof (gulm_interface_t));
++ lg->first_magic = LGMAGIC;
++ lg->last_magic = LGMAGIC;
++
++ if (cluster_name == NULL)
++ cluster_name = "cluster";
++ len = strlen (cluster_name) + 1;
++ lg->clusterID = kmalloc (len, GFP_KERNEL);
++ if (lg->clusterID == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++ memcpy (lg->clusterID, cluster_name, len);
++
++ len = strlen (service_name) + 1;
++ lg->service_name = kmalloc (len, GFP_KERNEL);
++ if (lg->service_name == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++ memcpy (lg->service_name, service_name, len);
++
++ /* set up flutter bufs. */
++ lg->cfba_len = 64;
++ lg->cfba = kmalloc (lg->cfba_len, GFP_KERNEL);
++ if (lg->cfba == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++
++ lg->cfbb_len = 64;
++ lg->cfbb = kmalloc (lg->cfbb_len, GFP_KERNEL);
++ if (lg->cfbb == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++
++ lg->lfba_len = 128;
++ lg->lfba = kmalloc (lg->lfba_len, GFP_KERNEL);
++ if (lg->lfba == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++
++ lg->lfbb_len = 128;
++ lg->lfbb = kmalloc (lg->lfbb_len, GFP_KERNEL);
++ if (lg->lfbb == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++
++ /* setup mutexes */
++ init_MUTEX (&lg->core_sender);
++ init_MUTEX (&lg->core_recver);
++ init_MUTEX (&lg->lock_sender);
++ init_MUTEX (&lg->lock_recver);
++
++ lg->core_port = 40040;
++ lg->lock_port = 40042;
++
++ *ret = lg;
++ return 0;
++ fail_nomem:
++ if (lg->clusterID != NULL)
++ kfree (lg->clusterID);
++ if (lg->service_name != NULL)
++ kfree (lg->service_name);
++ if (lg->cfba != NULL)
++ kfree (lg->cfba);
++ if (lg->cfbb != NULL)
++ kfree (lg->cfbb);
++ if (lg->lfba != NULL)
++ kfree (lg->lfba);
++ if (lg->lfbb != NULL)
++ kfree (lg->lfbb);
++ kfree (lg);
++ return err;
++}
++
++/**
++ * lg_release -
++ * @lg:
++ *
++ */
++void
++lg_release (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ if (lgp == NULL)
++ return;
++ /* make sure it is a gulm_interface_p. */
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return;
++
++ if (lg->service_name != NULL)
++ kfree (lg->service_name);
++ if (lg->clusterID != NULL)
++ kfree (lg->clusterID);
++
++ /* wonder if I should send a logout packet? */
++ if (lg->core_enc != NULL)
++ xdr_enc_release (lg->core_enc);
++ if (lg->core_dec != NULL)
++ xdr_dec_release (lg->core_dec);
++ xdr_close (&lg->core_fd);
++
++ if (lg->lock_enc != NULL)
++ xdr_enc_release (lg->lock_enc);
++ if (lg->lock_dec != NULL)
++ xdr_dec_release (lg->lock_dec);
++ xdr_close (&lg->lock_fd);
++
++ if (lg->cfba != NULL)
++ kfree (lg->cfba);
++ if (lg->cfbb != NULL)
++ kfree (lg->cfbb);
++ if (lg->lfba != NULL)
++ kfree (lg->lfba);
++ if (lg->lfbb != NULL)
++ kfree (lg->lfbb);
++
++ kfree (lg);
++}
++
++/**
++ * lg_set_core_port -
++ * @lgp:
++ * @new:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_set_core_port (gulm_interface_p lgp, uint16_t new)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ if (lgp == NULL)
++ return -EINVAL;
++ /* make sure it is a gulm_interface_p. */
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ lg->core_port = new;
++ return 0;
++}
++
++/**
++ * lg_set_lock_port -
++ * @lgp:
++ * @new:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_set_lock_port (gulm_interface_p lgp, uint16_t new)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ if (lgp == NULL)
++ return -EINVAL;
++ /* make sure it is a gulm_interface_p. */
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ lg->lock_port = new;
++
++ return 0;
++}
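++
++/* Note: the port values are only read when the matching login call
++ * connects, so any overrides must come before the login (sketch):
++ *
++ *	lg_set_core_port (gi, 40040);
++ *	lg_core_login (gi, 0);
++ */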
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/lg_priv.h linux-patched/fs/gfs_locking/lock_gulm/lg_priv.h
+--- linux-orig/fs/gfs_locking/lock_gulm/lg_priv.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/lg_priv.h 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,86 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __lg_priv_h__
++#define __lg_priv_h__
++/* private details that we don't want to give the users of this lib access
++ * to go here.
++ */
++
++#ifdef __linux__
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++#endif /*__linux__*/
++
++#include "xdr.h"
++#include "gio_wiretypes.h"
++#include "libgulm.h"
++
++#define LGMAGIC (0x474d4354)
++
++struct gulm_interface_s {
++	/* since we've masked this to a void* for the users, it is a nice
++	 * safety net to put a little magic in here so we know things stay
++	 * good. */
++ uint32_t first_magic;
++
++ /* WHAT IS YOUR NAME?!? */
++ char *service_name;
++
++ char *clusterID;
++
++ uint16_t core_port;
++ xdr_socket core_fd;
++ xdr_enc_t *core_enc;
++ xdr_dec_t *core_dec;
++ struct semaphore core_sender;
++ struct semaphore core_recver;
++ int in_core_hm;
++
++ uint16_t lock_port;
++ xdr_socket lock_fd;
++ xdr_enc_t *lock_enc;
++ xdr_dec_t *lock_dec;
++ struct semaphore lock_sender;
++ struct semaphore lock_recver;
++ int in_lock_hm;
++ uint8_t lockspace[4];
++
++	/* in the message receiver funcs, we read data into these buffers
++	 * and pass them to the callback function. This way we avoid doing
++	 * mallocs and frees on every callback.
++	 */
++ uint16_t cfba_len;
++ uint8_t *cfba;
++ uint16_t cfbb_len;
++ uint8_t *cfbb;
++ uint16_t lfba_len;
++ uint8_t *lfba;
++ uint16_t lfbb_len;
++ uint8_t *lfbb;
++
++ uint32_t last_magic;
++};
++typedef struct gulm_interface_s gulm_interface_t;
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#endif /*__lg_priv_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/libgulm.h linux-patched/fs/gfs_locking/lock_gulm/libgulm.h
+--- linux-orig/fs/gfs_locking/lock_gulm/libgulm.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/libgulm.h 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,200 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __libgulm_h__
++#define __libgulm_h__
++
++/* A bit messy, but we need this to be rather seamless in both kernel
++ * and userspace, and this seems the easiest way to do it.
++ */
++
++#ifdef __linux__
++#include <linux/in6.h>
++typedef struct socket *lg_socket;
++#endif /*__linux__*/
++
++typedef void *gulm_interface_p;
++
++/* mallocs the interface structure.
++ */
++int lg_initialize (gulm_interface_p *, char *cluster_name, char *service_name);
++/* frees struct.
++ */
++void lg_release (gulm_interface_p);
++
++/* Determines where we are within an itemlist callback */
++typedef enum { lglcb_start, lglcb_item, lglcb_stop } lglcb_t;
++
++/****** Core specifics ******/
++
++/* leaving a callback pointer as NULL, will cause that message type to
++ * be ignored. */
++typedef struct lg_core_callbacks_s {
++ int (*login_reply) (void *misc, uint64_t gen, uint32_t error,
++ uint32_t rank, uint8_t corestate);
++ int (*logout_reply) (void *misc);
++ int (*nodelist) (void *misc, lglcb_t type, char *name,
++ struct in6_addr * ip, uint8_t state);
++ int (*statechange) (void *misc, uint8_t corestate,
++ struct in6_addr * masterip, char *mastername);
++ int (*nodechange) (void *misc, char *nodename,
++ struct in6_addr * nodeip, uint8_t nodestate);
++ int (*service_list) (void *misc, lglcb_t type, char *service);
++ int (*status) (void *misc, lglcb_t type, char *key, char *value);
++ int (*error) (void *misc, uint32_t err);
++} lg_core_callbacks_t;
++
++/* this will trigger a callback from gulm_core_callbacks_t
++ * handles one message! Either stick this inside of a thread,
++ * or in a poll()/select() loop using the function below.
++ * This will block until there is a message sent from core.
++ */
++int lg_core_handle_messages (gulm_interface_p, lg_core_callbacks_t *,
++ void *misc);
++
++/* this returns the filedescriptor that the library is using to
++ * communicate with the core. This is only for using in a poll()
++ * or select() call to avoid having the gulm_core_handle_messages()
++ * call block.
++ */
++lg_socket lg_core_selector (gulm_interface_p);
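++
++/* The poll()-style loop described above, sketched for userspace where
++ * lg_socket is a plain file descriptor (gi and cbs are assumed to be
++ * set up already):
++ *
++ *	struct pollfd pfd = { .fd = lg_core_selector (gi), .events = POLLIN };
++ *	while (poll (&pfd, 1, -1) > 0)
++ *		lg_core_handle_messages (gi, &cbs, NULL);
++ */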
++
++/* Queue requests. */
++int lg_core_login (gulm_interface_p, int important);
++int lg_core_logout (gulm_interface_p);
++int lg_core_nodeinfo (gulm_interface_p, char *nodename);
++int lg_core_nodelist (gulm_interface_p);
++int lg_core_servicelist (gulm_interface_p);
++int lg_core_corestate (gulm_interface_p);
++
++/* for completeness mostly. */
++int lg_core_shutdown (gulm_interface_p);
++int lg_core_forceexpire (gulm_interface_p, char *node_name);
++int lg_core_forcepending (gulm_interface_p);
++
++int lg_core_status (gulm_interface_p);
++
++/* Node states
++ * First three are actual states, as well as changes. Last is only a node
++ * change message.
++ * */
++#define lg_core_Logged_in (0x05)
++#define lg_core_Logged_out (0x06)
++#define lg_core_Expired (0x07)
++#define lg_core_Fenced (0x08)
++/* Core states */
++#define lg_core_Slave (0x01)
++#define lg_core_Master (0x02)
++#define lg_core_Pending (0x03)
++#define lg_core_Arbitrating (0x04)
++#define lg_core_Client (0x06)
++
++/****** lock space specifics *****/
++/* note that this library masks out the lock table seperation.
++ */
++
++typedef struct lg_lockspace_callbacks_s {
++ int (*login_reply) (void *misc, uint32_t error, uint8_t which);
++ int (*logout_reply) (void *misc);
++ int (*lock_state) (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t state, uint32_t flags, uint32_t error,
++ uint8_t * LVB, uint16_t LVBlen);
++ int (*lock_action) (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t action, uint32_t error);
++ int (*cancel_reply) (void *misc, uint8_t * key, uint16_t keylen,
++ uint32_t error);
++ int (*drop_lock_req) (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t state);
++ int (*drop_all) (void *misc);
++ int (*status) (void *misc, lglcb_t type, char *key, char *value);
++ int (*error) (void *misc, uint32_t err);
++} lg_lockspace_callbacks_t;
++
++/* Like the core handle messages function, but for the lockspace.
++ * Handles one message, blocks.
++ */
++
++int lg_lock_handle_messages (gulm_interface_p, lg_lockspace_callbacks_t *,
++ void *misc);
++
++/* this returns the filedescriptor that the library is using to
++ * communicate with the ltpx. This is only for using in a poll()
++ * or select() call to avoid having the gulm_lock_handle_messages()
++ * call block.
++ */
++lg_socket lg_lock_selector (gulm_interface_p);
++
++/* Lockspace request calls */
++int lg_lock_login (gulm_interface_p, uint8_t lockspace[4]);
++int lg_lock_logout (gulm_interface_p);
++int lg_lock_state_req (gulm_interface_p, uint8_t * key, uint16_t keylen,
++ uint8_t state, uint32_t flags, uint8_t * LVB,
++ uint16_t LVBlen);
++int lg_lock_cancel_req (gulm_interface_p, uint8_t * key, uint16_t keylen);
++int lg_lock_action_req (gulm_interface_p, uint8_t * key,
++ uint16_t keylen, uint8_t action,
++ uint8_t * LVB, uint16_t LVBlen);
++int lg_lock_drop_exp (gulm_interface_p, uint8_t * holder,
++ uint8_t * keymask, uint16_t kmlen);
++int lg_lock_status (gulm_interface_p);
++
++/* state requests */
++#define lg_lock_state_Unlock (0x00)
++#define lg_lock_state_Exclusive (0x01)
++#define lg_lock_state_Deferred (0x02)
++#define lg_lock_state_Shared (0x03)
++
++/* actions */
++#define lg_lock_act_HoldLVB (0x0b)
++#define lg_lock_act_UnHoldLVB (0x0c)
++#define lg_lock_act_SyncLVB (0x0d)
++
++/* flags */
++#define lg_lock_flag_DoCB (0x00000001)
++#define lg_lock_flag_Try (0x00000002)
++#define lg_lock_flag_Any (0x00000004)
++#define lg_lock_flag_IgnoreExp (0x00000008)
++#define lg_lock_flag_Cachable (0x00000020)
++#define lg_lock_flag_Piority (0x00000040)
++
++/* These are the possible values that can be in the error fields. */
++#define lg_err_Ok (0)
++#define lg_err_BadLogin (1001)
++#define lg_err_BadCluster (1003)
++#define lg_err_BadConfig (1004)
++#define lg_err_BadGeneration (1005)
++#define lg_err_BadWireProto (1019)
++
++#define lg_err_NotAllowed (1006)
++#define lg_err_Unknown_Cs (1007)
++#define lg_err_BadStateChg (1008)
++#define lg_err_MemoryIssues (1009)
++
++#define lg_err_TryFailed (1011)
++#define lg_err_AlreadyPend (1013)
++#define lg_err_Canceled (1015)
++
++#define lg_err_NoSuchFS (1016)
++#define lg_err_NoSuchJID (1017)
++#define lg_err_NoSuchName (1018)
++
++#endif /*__libgulm_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/linux_gulm_main.c linux-patched/fs/gfs_locking/lock_gulm/linux_gulm_main.c
+--- linux-orig/fs/gfs_locking/lock_gulm/linux_gulm_main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/linux_gulm_main.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,109 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#define EXPORT_SYMTAB
++#define WANT_DEBUG_NAMES
++#define WANT_GMALLOC_NAMES
++#define EXTERN
++#include "gulm.h"
++
++#include <linux/init.h>
++
++#include "util.h"
++#include "gulm_procinfo.h"
++
++MODULE_DESCRIPTION ("Grand Unified Locking Module " GULM_RELEASE_NAME);
++MODULE_AUTHOR ("Red Hat, Inc.");
++MODULE_LICENSE ("GPL");
++
++extern gulm_cm_t gulm_cm;
++
++/**
++ * init_gulm - Initialize the gulm module
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++int __init
++init_gulm (void)
++{
++ int error;
++
++ memset (&gulm_cm, 0, sizeof (gulm_cm_t));
++ gulm_cm.loaded = FALSE;
++ gulm_cm.hookup = NULL;
++
++ /* register with the lm layers. */
++ error = lm_register_proto (&gulm_ops);
++ if (error)
++ goto fail;
++
++ error = init_proc_dir ();
++ if (error != 0) {
++ goto fail_lm;
++ }
++
++ init_gulm_fs ();
++
++ printk ("Gulm %s (built %s %s) installed\n",
++ GULM_RELEASE_NAME, __DATE__, __TIME__);
++
++ return 0;
++
++ fail_lm:
++ lm_unregister_proto (&gulm_ops);
++
++ fail:
++ return error;
++}
++
++/**
++ * exit_gulm - cleanup the gulm module
++ *
++ */
++
++void __exit
++exit_gulm (void)
++{
++ remove_proc_dir ();
++ lm_unregister_proto (&gulm_ops);
++}
++
++module_init (init_gulm);
++module_exit (exit_gulm);
++
++/* the libgulm.h interface. */
++EXPORT_SYMBOL (lg_initialize);
++EXPORT_SYMBOL (lg_release);
++
++EXPORT_SYMBOL (lg_core_handle_messages);
++EXPORT_SYMBOL (lg_core_selector);
++EXPORT_SYMBOL (lg_core_login);
++EXPORT_SYMBOL (lg_core_logout);
++EXPORT_SYMBOL (lg_core_nodeinfo);
++EXPORT_SYMBOL (lg_core_nodelist);
++EXPORT_SYMBOL (lg_core_servicelist);
++EXPORT_SYMBOL (lg_core_corestate);
++EXPORT_SYMBOL (lg_core_shutdown);
++EXPORT_SYMBOL (lg_core_forceexpire);
++EXPORT_SYMBOL (lg_core_forcepending);
++EXPORT_SYMBOL (lg_core_status);
++
++EXPORT_SYMBOL (lg_lock_handle_messages);
++EXPORT_SYMBOL (lg_lock_selector);
++EXPORT_SYMBOL (lg_lock_login);
++EXPORT_SYMBOL (lg_lock_logout);
++EXPORT_SYMBOL (lg_lock_state_req);
++EXPORT_SYMBOL (lg_lock_cancel_req);
++EXPORT_SYMBOL (lg_lock_action_req);
++EXPORT_SYMBOL (lg_lock_drop_exp);
++EXPORT_SYMBOL (lg_lock_status);
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/load_info.c linux-patched/fs/gfs_locking/lock_gulm/load_info.c
+--- linux-orig/fs/gfs_locking/lock_gulm/load_info.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/load_info.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,105 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include <linux/utsname.h> /* for extern system_utsname */
++
++#include "util.h"
++#include "utils_verb_flags.h"
++
++gulm_cm_t gulm_cm;
++
++/**
++ * init_ltpx -
++ */
++int
++init_ltpx (void)
++{
++ int j;
++ lock_table_t *lt = &gulm_cm.ltpx;
++
++	INIT_LIST_HEAD (&lt->to_be_sent);
++	spin_lock_init (&lt->queue_sender);
++	init_waitqueue_head (&lt->send_wchan);
++	lt->magic_one = 0xAAAAAAAA;
++	init_MUTEX (&lt->sender);
++	init_completion (&lt->startup);
++	atomic_set (&lt->locks_pending, 0);
++ lt->hashbuckets = 8191;
++ lt->hshlk = kmalloc (sizeof (spinlock_t) * lt->hashbuckets, GFP_KERNEL);
++ if (lt->hshlk == NULL)
++ return -ENOMEM;
++ lt->lkhsh =
++ kmalloc (sizeof (struct list_head) * lt->hashbuckets, GFP_KERNEL);
++ if (lt->lkhsh == NULL) {
++ kfree (lt->hshlk);
++ return -ENOMEM;
++ }
++ for (j = 0; j < lt->hashbuckets; j++) {
++		spin_lock_init (&lt->hshlk[j]);
++		INIT_LIST_HEAD (&lt->lkhsh[j]);
++ }
++ return 0;
++}
++
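++/* Lock lookups are expected to pick a bucket roughly like this (a
++ * sketch; the real lookup code lives elsewhere in the module):
++ *
++ *	bkt = hash_lock_key (key, keylen) % lt->hashbuckets;
++ *	spin_lock (&lt->hshlk[bkt]);
++ *	... walk lt->lkhsh[bkt] ...
++ *	spin_unlock (&lt->hshlk[bkt]);
++ */
++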
++/**
++ * load_info -
++ * @hostdata: < optionally override the name of this node.
++ *
++ * Returns: int
++ */
++int
++load_info (char *hostdata)
++{
++ int err = 0;
++
++ if (gulm_cm.loaded)
++ goto exit;
++
++ gulm_cm.verbosity = 0;
++ if (hostdata != NULL && strlen (hostdata) > 0) {
++ strncpy (gulm_cm.myName, hostdata, 64);
++ } else {
++ strncpy (gulm_cm.myName, system_utsname.nodename, 64);
++ }
++ gulm_cm.myName[63] = '\0';
++
++	/* breaking away from ccs; just hardcoding defaults here.
++	 * No one really used these anyway, and if people want them badly,
++	 * we'll find another way to set them (modprobe options, for example).
++	 */
++ gulm_cm.handler_threads = 2;
++ set_verbosity ("Default", &gulm_cm.verbosity);
++
++ init_ltpx ();
++
++ gulm_cm.loaded = TRUE;
++ exit:
++ return err;
++}
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/load_info.h linux-patched/fs/gfs_locking/lock_gulm/load_info.h
+--- linux-orig/fs/gfs_locking/lock_gulm/load_info.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/load_info.h 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,17 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __load_info_h__
++#define __load_info_h__
++int load_info (char *);
++#endif /*__load_info_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/util.c linux-patched/fs/gfs_locking/lock_gulm/util.c
+--- linux-orig/fs/gfs_locking/lock_gulm/util.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/util.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,113 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/kernel.h>
++#include <linux/types.h>
++#include <linux/string.h>
++#include "utils_crc.h"
++
++/**
++ * atoi - parse a decimal number from the start of a string
++ *
++ * @c: string of ASCII digits
++ *
++ */
++
++int
++atoi (char *c)
++{
++ int x = 0;
++
++ while ('0' <= *c && *c <= '9') {
++ x = x * 10 + (*c - '0');
++ c++;
++ }
++
++ return (x);
++}
++
++/**
++ * inet_aton - parse a dotted-quad IPv4 string into a uint32
++ *
++ * @ascii: < the dotted-quad string to parse
++ * @ip: > the parsed address, in host byte order
++ *
++ * Returns: 0 on success, -1 on a malformed address
++ */
++
++int
++inet_aton (char *ascii, uint32_t * ip)
++{
++ uint32_t value;
++ int x;
++
++ *ip = 0;
++
++ for (x = 0; x < 4; x++) {
++ value = atoi (ascii);
++ if (value > 255)
++ return (-1);
++
++ *ip = (*ip << 8) | value;
++
++ if (x != 3) {
++ for (; *ascii != '.' && *ascii != '\0'; ascii++) {
++ if (*ascii < '0' || *ascii > '9') {
++ /* not a number. stop */
++ return -1;
++ }
++ }
++ if (*ascii == '\0')
++ return (-1);
++
++ ascii++;
++ }
++ }
++
++ return (0);
++}
++
++/**
++ * inet_ntoa - format a uint32 IPv4 address as a dotted-quad string
++ *
++ * @ip: < the address, in host byte order
++ * @buf: > buffer for the string; needs at least 16 bytes
++ *
++ */
++void
++inet_ntoa (uint32_t ip, char *buf)
++{
++ int i;
++ char *p;
++
++ p = buf;
++
++ for (i = 3; i >= 0; i--) {
++ p += sprintf (p, "%d", (ip >> (8 * i)) & 0xFF);
++ if (i > 0)
++ *(p++) = '.';
++ }
++
++}
++
++/* public functions */
++#define hash_init_val 0x6d696b65
++
++uint32_t __inline__
++hash_lock_key (uint8_t * in, uint8_t len)
++{ /* the other hash function was too variable */
++ return crc32 (in, len, hash_init_val);
++}
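++
++/* A minimal usage sketch for the helpers above; the address and key are
++ * made up, and inet_ntoa() needs a buffer of at least 16 bytes:
++ *
++ * uint32_t ip, h;
++ * char buf[16];
++ *
++ * if (inet_aton ("10.0.0.1", &ip) == 0)
++ * inet_ntoa (ip, buf); // buf now holds "10.0.0.1"
++ * h = hash_lock_key ("mykey", 5);
++ */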
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/util.h linux-patched/fs/gfs_locking/lock_gulm/util.h
+--- linux-orig/fs/gfs_locking/lock_gulm/util.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/util.h 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __UTIL_DOT_H__
++#define __UTIL_DOT_H__
++
++int atoi (char *c);
++int inet_aton (char *ascii, uint32_t * ip);
++void inet_ntoa (uint32_t ip, char *buf);
++void dump_buffer (void *buf, int len);
++
++uint32_t __inline__ hash_lock_key (uint8_t * in, uint8_t len);
++uint8_t __inline__ fourtoone (uint32_t);
++
++__inline__ int testbit (uint16_t bit, uint8_t * set);
++__inline__ void setbit (uint16_t bit, uint8_t * set);
++__inline__ void clearbit (uint16_t bit, uint8_t * set);
++
++#endif /* __UTIL_DOT_H__ */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_crc.c linux-patched/fs/gfs_locking/lock_gulm/utils_crc.c
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_crc.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_crc.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,92 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/types.h>
++
++static const uint32_t crc_32_tab[] = {
++ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
++ 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
++ 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
++ 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
++ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
++ 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
++ 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
++ 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
++ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
++ 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
++ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
++ 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
++ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
++ 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
++ 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
++ 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
++ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
++ 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
++ 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
++ 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
++ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
++ 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
++ 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
++ 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
++ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
++ 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
++ 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
++ 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
++ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
++ 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
++ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
++ 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
++ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
++ 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
++ 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
++ 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
++ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
++ 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
++ 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
++ 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
++ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
++ 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
++ 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
++};
++
++/**
++ * crc32 - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ *
++ * completely copied from GFS/src/fs.c
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * The hash function is a 32-bit CRC of the data. The algorithm uses
++ * the crc_32_tab table above.
++ *
++ * This may not be the fastest hash function, but it does a fair bit better
++ * at providing uniform results than the others I've looked at. That's
++ * really important for efficient directories.
++ *
++ * Returns: the hash
++ */
++
++uint32_t
++crc32 (const char *data, int len, uint32_t init)
++{
++ uint32_t hash = init;
++
++ for (; len--; data++)
++ hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
++
++ hash = ~hash;
++
++ return hash;
++}
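++
++/* A minimal sketch of how lock_gulm drives this, matching hash_lock_key()
++ * in util.c, which seeds the CRC with a fixed init value:
++ *
++ * uint32_t h = crc32 (keybuf, keylen, 0x6d696b65);
++ */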
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_crc.h linux-patched/fs/gfs_locking/lock_gulm/utils_crc.h
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_crc.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_crc.h 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,17 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __utils_crc_h__
++#define __utils_crc_h__
++uint32_t crc32 (const char *data, int len, uint32_t init);
++#endif /*__utils_crc_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_tostr.c linux-patched/fs/gfs_locking/lock_gulm/utils_tostr.c
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_tostr.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_tostr.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,207 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gio_wiretypes.h"
++
++char *
++gio_Err_to_str (int x)
++{
++ char *t = "Unknown GULM Err";
++ switch (x) {
++ case gio_Err_Ok:
++ t = "Ok";
++ break;
++
++ case gio_Err_BadLogin:
++ t = "Bad Login";
++ break;
++ case gio_Err_BadCluster:
++ t = "Bad Cluster ID";
++ break;
++ case gio_Err_BadConfig:
++ t = "Incompatible configurations";
++ break;
++ case gio_Err_BadGeneration:
++ t = "Bad Generation ID";
++ break;
++ case gio_Err_BadWireProto:
++ t = "Bad Wire Protocol Version";
++ break;
++
++ case gio_Err_NotAllowed:
++ t = "Not Allowed";
++ break;
++ case gio_Err_Unknown_Cs:
++ t = "Uknown Client";
++ break;
++ case gio_Err_BadStateChg:
++ t = "Bad State Change";
++ break;
++ case gio_Err_MemoryIssues:
++ t = "Memory Problems";
++ break;
++
++ case gio_Err_PushQu:
++ t = "Push Queue";
++ break;
++ case gio_Err_TryFailed:
++ t = "Try Failed";
++ break;
++ case gio_Err_AlreadyPend:
++ t = "Request Already Pending";
++ break;
++ case gio_Err_Canceled:
++ t = "Request Canceled";
++ break;
++
++ case gio_Err_NoSuchFS:
++ t = "No Such Filesystem";
++ break;
++ case gio_Err_NoSuchJID:
++ t = "No Such JID";
++ break;
++ case gio_Err_NoSuchName:
++ t = "No Such Node";
++ break;
++ }
++ return t;
++}
++
++char *
++gio_mbrupdate_to_str (int x)
++{
++ char *t = "Unknown Membership Update";
++ switch (x) {
++ case gio_Mbr_Logged_in:
++ t = "Logged in";
++ break;
++ case gio_Mbr_Logged_out:
++ t = "Logged out";
++ break;
++ case gio_Mbr_Expired:
++ t = "Expired";
++ break;
++ case gio_Mbr_Killed:
++ t = "Fenced";
++ break;
++ case gio_Mbr_OM_lgin:
++ t = "Was Logged in";
++ break;
++ }
++ return t;
++}
++
++char *
++gio_I_am_to_str (int x)
++{
++ switch (x) {
++ case gio_Mbr_ama_Slave:
++ return "Slave";
++ break;
++ case gio_Mbr_ama_Pending:
++ return "Pending";
++ break;
++ case gio_Mbr_ama_Arbitrating:
++ return "Arbitrating";
++ break;
++ case gio_Mbr_ama_Master:
++ return "Master";
++ break;
++ case gio_Mbr_ama_Resource:
++ return "Service";
++ break;
++ case gio_Mbr_ama_Client:
++ return "Client";
++ break;
++ default:
++ return "Unknown I_am state";
++ break;
++ }
++}
++
++char *
++gio_license_states (int x)
++{
++ switch (x) {
++ case 0:
++ return "valid";
++ break;
++ case 1:
++ return "expired";
++ break;
++ case 2:
++ return "invalid";
++ break;
++ default:
++ return "unknown";
++ break;
++ }
++}
++
++char *
++gio_opcodes (int x)
++{
++ switch (x) {
++#define CP(x) case (x): return #x ; break
++ CP (gulm_err_reply);
++
++ CP (gulm_core_login_req);
++ CP (gulm_core_login_rpl);
++ CP (gulm_core_logout_req);
++ CP (gulm_core_logout_rpl);
++ CP (gulm_core_reslgn_req);
++ CP (gulm_core_beat_req);
++ CP (gulm_core_beat_rpl);
++ CP (gulm_core_mbr_req);
++ CP (gulm_core_mbr_updt);
++ CP (gulm_core_mbr_lstreq);
++ CP (gulm_core_mbr_lstrpl);
++ CP (gulm_core_mbr_force);
++ CP (gulm_core_res_req);
++ CP (gulm_core_res_list);
++ CP (gulm_core_state_req);
++ CP (gulm_core_state_chgs);
++ CP (gulm_core_shutdown);
++ CP (gulm_core_forcepend);
++
++ CP (gulm_info_stats_req);
++ CP (gulm_info_stats_rpl);
++ CP (gulm_info_set_verbosity);
++ CP (gulm_socket_close);
++ CP (gulm_info_slave_list_req);
++ CP (gulm_info_slave_list_rpl);
++
++ CP (gulm_lock_login_req);
++ CP (gulm_lock_login_rpl);
++ CP (gulm_lock_logout_req);
++ CP (gulm_lock_logout_rpl);
++ CP (gulm_lock_state_req);
++ CP (gulm_lock_state_rpl);
++ CP (gulm_lock_state_updt);
++ CP (gulm_lock_action_req);
++ CP (gulm_lock_action_rpl);
++ CP (gulm_lock_action_updt);
++ CP (gulm_lock_update_rpl);
++ CP (gulm_lock_cb_state);
++ CP (gulm_lock_cb_dropall);
++ CP (gulm_lock_drop_exp);
++ CP (gulm_lock_dump_req);
++ CP (gulm_lock_dump_rpl);
++ CP (gulm_lock_rerunqueues);
++
++#undef CP
++ default:
++ return "Unknown Op Code";
++ break;
++ }
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_tostr.h linux-patched/fs/gfs_locking/lock_gulm/utils_tostr.h
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_tostr.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_tostr.h 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __utils_tostr_h__
++#define __utils_tostr_h__
++char *gio_Err_to_str (int x);
++char *gio_mbrupdate_to_str (int x);
++char *gio_mbrama_to_str (int x);
++char *gio_I_am_to_str (int x);
++char *gio_license_states (int x);
++char *gio_opcodes (int x);
++#endif /*__utils_tostr_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_verb_flags.c linux-patched/fs/gfs_locking/lock_gulm/utils_verb_flags.c
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_verb_flags.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_verb_flags.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,271 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifdef __linux__
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++#endif /*__linux__*/
++
++#include "gulm_log_msg_bits.h"
++
++static __inline__ int
++strncasecmp (const char *s1, const char *s2, size_t l)
++{
++ char c1 = '\0', c2 = '\0';
++
++ while (*s1 && *s2 && l-- > 0) {
++ c1 = *s1++;
++ c2 = *s2++;
++
++ if (c1 >= 'A' && c1 <= 'Z')
++ c1 += 'a' - 'A';
++
++ if (c2 >= 'A' && c2 <= 'Z')
++ c2 += 'a' - 'A';
++
++ if (c1 != c2)
++ break;
++ }
++ return (c1 - c2);
++}
++
++static int bit_array[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
++
++#define BITCOUNT(x) (bit_array[(x) & 0x000F] + \
++ bit_array[((x) >> 4) & 0x000F] + \
++ bit_array[((x) >> 8) & 0x000F] + \
++ bit_array[((x) >> 12) & 0x000F] + \
++ bit_array[((x) >> 16) & 0x000F] + \
++ bit_array[((x) >> 20) & 0x000F] + \
++ bit_array[((x) >> 24) & 0x000F] + \
++ bit_array[((x) >> 28) & 0x000F])
++
++struct {
++ char *name;
++ uint32_t val;
++} verbose_flags[] = {
++ {
++ "Network", lgm_Network,}, {
++ "Network2", lgm_Network2,}, {
++ "Network3", lgm_Network3,}, {
++ "Fencing", lgm_Stomith,}, {
++ "Heartbeat", lgm_Heartbeat,}, {
++ "Locking", lgm_locking,}, {
++ "Forking", lgm_Forking,}, {
++ "JIDMap", lgm_JIDMap,}, {
++ "JIDUpdates", lgm_JIDUpdates,}, {
++ "Subscribers", lgm_Subscribers,}, {
++ "LockUpdates", lgm_LockUpdates,}, {
++ "LoginLoops", lgm_LoginLoops,}, {
++ "ServerState", lgm_ServerState,}, {
++ "Default", lgm_Network | lgm_Stomith | lgm_Forking,},
++/* Since I really don't want people really doing *all* flags with all,
++ * there is AlmostAll, which users really get, and ReallyAll, which is all
++ * bits on.
++ * This is mostly due to Network3, which dumps messages on nearly
++ * every packet. (should actually be every packet.)
++ * Also drop the slave updates, since that is on every packet as well.
++ */
++ {
++ "All",
++ (lgm_ReallyAll &
++ ~(lgm_Network3 | lgm_JIDUpdates |
++ lgm_LockUpdates)),}, {
++ "AlmostAll",
++ lgm_ReallyAll & ~(lgm_Network3 | lgm_JIDUpdates |
++ lgm_LockUpdates),}, {
++ "ReallyAll", lgm_ReallyAll,}
++};
++
++static int
++add_string (char *name, size_t * cur, char *str, size_t slen)
++{
++ size_t nl;
++
++ nl = strlen (name);
++ if (*cur + nl > slen) {
++ memcpy (str + *cur, "...", 3);
++ *cur += 3;
++ str[*cur] = '\0';
++ return -1;
++ }
++ memcpy (str + *cur, name, nl);
++ *cur += nl;
++ str[*cur] = ',';
++ *cur += 1;
++
++ return 0;
++}
++
++/**
++ * get_verbosity_string - render verbosity bits as a list of flag names
++ * @str: > buffer to fill with a comma-separated list of flag names
++ * @slen: < size of that buffer
++ * @verb: < the verbosity bits to describe
++ *
++ * Returns: 0 on success, -1 if the names did not all fit in @str
++ */
++int
++get_verbosity_string (char *str, size_t slen, uint32_t verb)
++{
++ int i, vlen = sizeof (verbose_flags) / sizeof (verbose_flags[0]);
++ size_t cur = 0;
++ int combo_match = -1, error = 0;
++
++ memset (str, 0, slen);
++ slen -= 4; /* leave room for dots and null */
++
++ if (verb == 0) {
++ error = add_string ("Quiet", &cur, str, slen);
++ goto end;
++ }
++
++ /* Combo verb flag phase */
++ for (i = 0; i < vlen; i++) {
++ if (BITCOUNT (verbose_flags[i].val) > 1) {
++ /* check to see if this flag matches exclusively */
++ if ((verbose_flags[i].val ^ verb) == 0) {
++ error =
++ add_string (verbose_flags[i].name, &cur,
++ str, slen);
++ goto end;
++ }
++
++ if ((verbose_flags[i].val & verb) ==
++ verbose_flags[i].val) {
++ if (combo_match < 0) {
++ combo_match = i;
++ } else {
++ /* Compare this combo with the one in combo_match */
++ if (BITCOUNT (verbose_flags[i].val) >
++ BITCOUNT (verbose_flags
++ [combo_match].val)) {
++ combo_match = i;
++ }
++ }
++
++ }
++ }
++ }
++ /* Add the best combo to the string */
++ if (combo_match > -1) {
++ if (add_string
++ (verbose_flags[combo_match].name, &cur, str, slen) == -1) {
++ error = -1;
++ goto end;
++ }
++ }
++
++ /* Single verb flag phase */
++ for (i = 0; i < vlen; i++) {
++ if (BITCOUNT (verbose_flags[i].val) == 1) {
++ if (combo_match > -1) {
++ if ((verbose_flags[combo_match].
++ val & verbose_flags[i].val) ==
++ verbose_flags[i].val) {
++ continue;
++ }
++ }
++
++ if ((verbose_flags[i].val & verb) ==
++ verbose_flags[i].val) {
++ if (add_string
++ (verbose_flags[i].name, &cur, str,
++ slen) == -1) {
++ error = -1;
++ goto end;
++ }
++ }
++ }
++ }
++ end:
++ /* Clear trailing ',' */
++ if (cur > 0 && str[cur - 1] == ',') {
++ str[cur - 1] = '\0';
++ }
++ return error;
++}
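++
++/* A short sketch of what the matching above produces, assuming a 64-byte
++ * output buffer:
++ *
++ * get_verbosity_string (buf, 64, 0); // "Quiet"
++ * get_verbosity_string (buf, 64,
++ * lgm_Network | lgm_Stomith | lgm_Forking); // "Default"
++ * get_verbosity_string (buf, 64,
++ * lgm_Network | lgm_locking); // "Network,Locking"
++ */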
++
++/**
++ * set_verbosity - toggle verbosity bits according to the `rules' in @str
++ * @str: < a list of flag names separated by ',', ' ', or '|'
++ * @verb: <> the verbosity bits to modify
++ *
++ * Each flag name may be prefixed with '+' or '-';
++ * no prefix is the same as a '+' prefix.
++ * '+' sets bits.
++ * '-' unsets bits.
++ * The special name 'clear' unsets all bits.
++ */
++void
++set_verbosity (char *str, uint32_t * verb)
++{
++ char *token, *next;
++ int i, wl, tl, len = sizeof (verbose_flags) / sizeof (verbose_flags[0]);
++
++ if (str == NULL)
++ return;
++
++ wl = strlen (str);
++ if (wl == 0)
++ return;
++ for (token = str, tl = 0; tl < wl &&
++ token[tl] != ',' &&
++ token[tl] != ' ' && token[tl] != '|' && token[tl] != '\0'; tl++) ;
++ next = token + tl + 1;
++
++ for (;;) {
++ if (token[0] == '-') {
++ token++;
++ for (i = 0; i < len; i++) {
++ if (strncasecmp
++ (token, verbose_flags[i].name, tl) == 0) {
++ (*verb) &= ~(verbose_flags[i].val);
++ }
++ }
++ } else if (token[0] == '+') {
++ token++;
++ for (i = 0; i < len; i++) {
++ if (strncasecmp
++ (token, verbose_flags[i].name, tl) == 0) {
++ (*verb) |= verbose_flags[i].val;
++ }
++ }
++ } else {
++ if (strncasecmp (token, "clear", tl) == 0) {
++ (*verb) = 0;
++ } else {
++ for (i = 0; i < len; i++) {
++ if (strncasecmp
++ (token, verbose_flags[i].name,
++ tl) == 0) {
++ (*verb) |= verbose_flags[i].val;
++ }
++ }
++ }
++ }
++
++ if (next >= str + wl)
++ return;
++ for (token = next, tl = 0;
++ tl < wl &&
++ token[tl] != ',' &&
++ token[tl] != ' ' &&
++ token[tl] != '|' && token[tl] != '\0'; tl++) ;
++ next = token + tl + 1;
++
++ }
++}
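++
++/* A small sketch of the rule syntax accepted above, starting from zero:
++ *
++ * uint32_t v = 0;
++ * set_verbosity ("Default", &v); // Network | Fencing | Forking
++ * set_verbosity ("+Locking,-Fencing", &v); // add Locking, drop Fencing
++ * set_verbosity ("clear", &v); // back to Quiet (0)
++ */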
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_verb_flags.h linux-patched/fs/gfs_locking/lock_gulm/utils_verb_flags.h
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_verb_flags.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_verb_flags.h 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,18 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __utils_verb_flags_h__
++#define __utils_verb_flags_h__
++int get_verbosity_string (char *str, size_t slen, uint32_t verb);
++void set_verbosity (char *str, uint32_t * verb);
++#endif /*__utils_verb_flags_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/xdr.h linux-patched/fs/gfs_locking/lock_gulm/xdr.h
+--- linux-orig/fs/gfs_locking/lock_gulm/xdr.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/xdr.h 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,98 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __gulm_xdr_h__
++#define __gulm_xdr_h__
++typedef struct xdr_enc_s xdr_enc_t;
++typedef struct xdr_dec_s xdr_dec_t;
++
++/* Sockets in kernel space are done a bit differently than sockets in
++ * userspace, but we need to have them appear to be the same.
++ */
++#ifdef __KERNEL__
++
++#ifdef __linux__
++#include <linux/net.h>
++#include <linux/in.h>
++#include <linux/in6.h>
++#include <linux/socket.h>
++#include <net/sock.h>
++
++typedef struct socket *xdr_socket;
++#endif /*__linux__*/
++#else /*__KERNEL__*/
++#include <sys/types.h>
++#include <sys/uio.h>
++#include <sys/socket.h>
++#include <netinet/in.h>
++#include <netinet/tcp.h>
++#include <unistd.h>
++#include <errno.h>
++typedef int xdr_socket;
++#endif /*__KERNEL__*/
++
++/* start things up */
++int xdr_open (xdr_socket * sk);
++int xdr_connect (struct sockaddr_in6 *adr, xdr_socket sk);
++void xdr_close (xdr_socket * sk);
++
++/* deep, basic io */
++#ifdef __KERNEL__
++#ifdef __linux__
++size_t xdr_send (struct socket *sock, void *buf, size_t size);
++size_t xdr_recv (struct socket *sock, void *buf, size_t size);
++#endif /*__linux__*/
++#else /*__KERNEL__*/
++ssize_t xdr_recv (int fd, void *buf, size_t len);
++ssize_t xdr_send (int fd, void *buf, size_t len);
++#endif /*__KERNEL__*/
++
++xdr_enc_t *xdr_enc_init (xdr_socket sk, int buffer_size);
++xdr_dec_t *xdr_dec_init (xdr_socket sk, int buffer_size);
++int xdr_enc_flush (xdr_enc_t * xdr);
++int xdr_enc_release (xdr_enc_t * xdr); /* calls xdr_enc_flush() */
++void xdr_enc_force_release (xdr_enc_t * xdr); /* doesn't call xdr_enc_flush() */
++void xdr_dec_release (xdr_dec_t * xdr);
++/* xdr_enc_force_release() is for when you get an error sending and you
++ * want to free that stuff up right away. If you use the regular release
++ * for enc, it will fail if it cannot send data over the file descriptor.
++ */
++
++/* encoders add to a stream */
++int __inline__ xdr_enc_uint64 (xdr_enc_t * xdr, uint64_t i);
++int __inline__ xdr_enc_uint32 (xdr_enc_t * xdr, uint32_t i);
++int __inline__ xdr_enc_uint16 (xdr_enc_t * xdr, uint16_t i);
++int __inline__ xdr_enc_uint8 (xdr_enc_t * xdr, uint8_t i);
++int __inline__ xdr_enc_ipv6 (xdr_enc_t * enc, struct in6_addr *ip);
++int xdr_enc_raw (xdr_enc_t * xdr, void *pointer, uint16_t len);
++int xdr_enc_raw_iov (xdr_enc_t * xdr, int count, struct iovec *iov);
++int xdr_enc_string (xdr_enc_t * xdr, uint8_t * s);
++int xdr_enc_list_start (xdr_enc_t * xdr);
++int xdr_enc_list_stop (xdr_enc_t * xdr);
++
++/* decoders remove from stream */
++int xdr_dec_uint64 (xdr_dec_t * xdr, uint64_t * i);
++int xdr_dec_uint32 (xdr_dec_t * xdr, uint32_t * i);
++int xdr_dec_uint16 (xdr_dec_t * xdr, uint16_t * i);
++int xdr_dec_uint8 (xdr_dec_t * xdr, uint8_t * i);
++int xdr_dec_ipv6 (xdr_dec_t * xdr, struct in6_addr *ip);
++int xdr_dec_raw (xdr_dec_t * xdr, void *p, uint16_t * l); /* no malloc */
++int xdr_dec_raw_m (xdr_dec_t * xdr, void **p, uint16_t * l); /* mallocs p */
++int xdr_dec_raw_ag (xdr_dec_t * xdr, void **p, uint16_t * bl, uint16_t * rl);
++int xdr_dec_string (xdr_dec_t * xdr, uint8_t ** strp); /* mallocs s */
++int xdr_dec_string_nm (xdr_dec_t * xdr, uint8_t * strp, size_t l); /* no malloc */
++int xdr_dec_string_ag (xdr_dec_t * xdr, uint8_t ** s, uint16_t * bl);
++int xdr_dec_list_start (xdr_dec_t * xdr);
++int xdr_dec_list_stop (xdr_dec_t * xdr);
++
++#endif /*__gulm_xdr_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/xdr_base.c linux-patched/fs/gfs_locking/lock_gulm/xdr_base.c
+--- linux-orig/fs/gfs_locking/lock_gulm/xdr_base.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/xdr_base.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,904 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * This is a bit of an abstraction layer to get this working in both kernel
++ * and userspace.
++ */
++#define TRUE (1)
++#define FALSE (0)
++#define MIN(a,b) (((a)<(b))?(a):(b))
++
++#ifdef __linux__
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++#endif /*__linux__*/
++
++#include "xdr.h"
++
++/**
++ * xdr_realloc - a realloc for kernel space.
++ * @a: < pointer to realloc
++ * @nl: < desired new size
++ * @ol: < current old size
++ *
++ * Not as good as the real realloc, since it always moves memory, but good
++ * enough for the little use it gets here.
++ *
++ * XXX this is broken.
++ *
++ * Returns: void*
++ */
++static void *
++xdr_realloc (void *a, size_t nl, size_t ol)
++{
++ if (nl == ol) {
++ return a;
++ } else if (nl == 0) {
++ kfree (a);
++ return NULL;
++ } else if (a == NULL && nl > 0) {
++ return kmalloc (nl, GFP_KERNEL);
++ } else {
++ void *tmp;
++ tmp = kmalloc (nl, GFP_KERNEL);
++ if (tmp == NULL)
++ return NULL;
++ memcpy (tmp, a, MIN (nl, ol));
++ kfree (a);
++ return tmp;
++ }
++}
++
++typedef enum { xdr_enc, xdr_dec } xdr_type;
++
++/* encoders have this sorta non-blocking, growing buffering stunt.
++ * makes them a bit different from the decoders now.
++ */
++struct xdr_enc_s {
++ size_t default_buf_size;
++ xdr_socket fd;
++ xdr_type type;
++ size_t length;
++ size_t curloc;
++ uint8_t *stream;
++};
++
++/* decoders only pull a single item off of the socket at a time.
++ * so this is all they need.
++ */
++struct xdr_dec_s {
++ size_t length; /* total byte length of the stream */
++ size_t curloc; /* current byte offset from start */
++ uint8_t *stream; /* start of the encoded stream. */
++ xdr_socket fd;
++ xdr_type type;
++};
++
++/* the types of data we support. */
++
++#define XDR_NULL 0x00 /* NOT A VALID TAG!!! used in dec code. */
++#define XDR_LIST_START 0x01
++#define XDR_LIST_STOP 0x02
++/* list is a variable-length device. It is a start tag, some number of
++ * xdr_enc_* items, then a stop tag. Its main purpose is to provide a
++ * method of encasing data.
++ */
++#define XDR_STRING 0x04
++/* string tag is followed by a uint16 which is the byte length */
++#define XDR_RAW 0x05
++/* raw tag is followed by a uint16 which is the byte length
++ * if 65535 bytes isn't enough, split your data and put multiples of these
++ * back to back. (idea of xdr is to avoid this twit.)
++ * */
++
++/* note: if the sizes of these should ever vary, I'm screwed. Should
++ * consider changing this all to bit shifts and array accesses to be more
++ * concrete. later.
++ */
++#define XDR_UINT64 0x06
++#define XDR_UINT32 0x07
++#define XDR_UINT16 0x08
++#define XDR_UINT8 0x09
++/* should add signed ints */
++
++#define XDR_IPv6 0x0a /* 16 bytes, IPv6 address */
++
++/* any other base types?
++ */
++
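++/* As a worked example of the stream layout, encoding the uint32
++ * 0x01020304 followed by the two-byte string "ok" yields:
++ *
++ * 07 01 02 03 04 04 00 02 6f 6b
++ * tag be32 value tag be16 len "ok"
++ */
++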
++#define XDR_DEFAULT_BUFFER_SIZE 4096
++/*****************************************************************************/
++
++/**
++ * xdr_enc_init -
++ * @fd:
++ * @buffer_size:
++ *
++ *
++ * Returns: xdr_enc_t*
++ */
++xdr_enc_t *
++xdr_enc_init (xdr_socket fd, int buffer_size)
++{
++ xdr_enc_t *xdr;
++
++ if (buffer_size <= 0)
++ buffer_size = XDR_DEFAULT_BUFFER_SIZE;
++
++ xdr = kmalloc (sizeof (xdr_enc_t), GFP_KERNEL);
++ if (xdr == NULL)
++ return NULL;
++ xdr->stream = kmalloc (buffer_size, GFP_KERNEL);
++ if (xdr->stream == NULL) {
++ kfree (xdr);
++ return NULL;
++ }
++ xdr->fd = fd;
++ xdr->type = xdr_enc;
++ xdr->default_buf_size = buffer_size;
++ xdr->length = buffer_size;
++ xdr->curloc = 0;
++
++ return xdr;
++}
++
++/**
++ * xdr_dec_init -
++ * @fd:
++ * @buffer_size:
++ *
++ *
++ * Returns: xdr_dec_t*
++ */
++xdr_dec_t *
++xdr_dec_init (xdr_socket fd, int buffer_size)
++{
++ xdr_dec_t *xdr;
++
++ if (buffer_size <= 0)
++ buffer_size = XDR_DEFAULT_BUFFER_SIZE;
++
++ xdr = kmalloc (sizeof (xdr_dec_t), GFP_KERNEL);
++ if (xdr == NULL)
++ return NULL;
++ xdr->length = buffer_size;
++ xdr->curloc = 0;
++ xdr->stream = kmalloc (buffer_size, GFP_KERNEL);
++ xdr->fd = fd;
++ xdr->type = xdr_dec;
++ if (xdr->stream == NULL) {
++ kfree (xdr);
++ return NULL;
++ }
++ *(xdr->stream) = XDR_NULL; /* so the first dec_call will call get_next */
++ return xdr;
++}
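++
++/* A minimal round-trip sketch, assuming an already-connected xdr_socket
++ * `sk'; encoders buffer until xdr_enc_flush() pushes the bytes out:
++ *
++ * xdr_enc_t *enc = xdr_enc_init (sk, 0); // 0 => 4096-byte default
++ * xdr_enc_uint32 (enc, 42);
++ * xdr_enc_string (enc, "hello");
++ * xdr_enc_flush (enc);
++ *
++ * xdr_dec_t *dec = xdr_dec_init (sk, 0);
++ * uint32_t i;
++ * uint8_t *s;
++ * xdr_dec_uint32 (dec, &i); // i == 42
++ * xdr_dec_string (dec, &s); // mallocs s = "hello"
++ */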
++
++/*****************************************************************************/
++/**
++ * xdr_enc_flush -
++ * @xdr:
++ *
++ * Returns: int
++ */
++int
++xdr_enc_flush (xdr_enc_t * xdr)
++{
++ int err;
++ if (xdr == NULL)
++ return -EINVAL;
++ if (xdr->type != xdr_enc)
++ return -EINVAL;
++ if (xdr->curloc == 0)
++ return 0;
++
++ err = xdr_send (xdr->fd, xdr->stream, xdr->curloc);
++ if (err < 0)
++ return err;
++ if (err == 0)
++ return -EPROTO; /* why? */
++ xdr->curloc = 0;
++
++ return 0;
++}
++
++/**
++ * xdr_release -
++ * @xdr:
++ *
++ * Free the memory, losing whatever may be there.
++ */
++void
++xdr_dec_release (xdr_dec_t * xdr)
++{
++ if (xdr == NULL)
++ return;
++ kfree (xdr->stream);
++ kfree (xdr);
++}
++
++/**
++ * xdr_enc_force_release -
++ * @xdr:
++ *
++ * Free the memory, losing whatever may be there.
++ */
++void
++xdr_enc_force_release (xdr_enc_t * xdr)
++{
++ if (xdr == NULL)
++ return;
++ if (xdr->stream != NULL)
++ kfree (xdr->stream);
++ kfree (xdr);
++}
++
++/**
++ * xdr_enc_release -
++ * @xdr:
++ *
++ * Free things up, trying to send any possible leftover data first.
++ *
++ * Returns: int
++ */
++int
++xdr_enc_release (xdr_enc_t * xdr)
++{
++ int e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if ((e = xdr_enc_flush (xdr)) != 0)
++ return e;
++ xdr_enc_force_release (xdr);
++ return 0;
++}
++
++/*****************************************************************************/
++/**
++ * grow_stream -
++ * @xdr:
++ * @len:
++ *
++ * Each single encoded item needs to fit within the buffer, so we make
++ * sure the buffer is big enough.
++ *
++ * If the buffer is big enough but just doesn't have room left, we first
++ * send the data already in the buffer, emptying it.
++ *
++ * Returns: int
++ */
++static int
++grow_stream (xdr_enc_t * enc, size_t len)
++{
++ int err;
++ uint8_t *c;
++
++ /* buffer must be big enough for one type entry. */
++ if (len > enc->length) {
++ c = xdr_realloc (enc->stream, len, enc->length);
++ if (c == NULL)
++ return -ENOMEM;
++ enc->stream = c;
++ enc->length = len;
++ }
++
++ /* if there isn't room on the end of this chunk,
++ * try sending what we've got.
++ */
++ if (enc->curloc + len > enc->length) {
++ err = xdr_enc_flush (enc);
++ if (err != 0) {
++ /* error, better pass this up. */
++ return err;
++ }
++ }
++
++ return 0;
++}
++
++/**
++ * append_bytes -
++ * @xdr:
++ * @xdr_type:
++ * @bytes:
++ * @len:
++ *
++ *
++ * Returns: int
++ */
++static int
++append_bytes (xdr_enc_t * xdr, uint8_t xdr_type, void *bytes, size_t len)
++{
++ int e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if (xdr->type != xdr_enc)
++ return -EINVAL;
++
++ /* len + 1; need the one byte for the type code. */
++ if ((e = grow_stream (xdr, len + 1)) != 0)
++ return e;
++ *(xdr->stream + xdr->curloc) = xdr_type;
++ xdr->curloc += 1;
++ memcpy ((xdr->stream + xdr->curloc), bytes, len);
++ xdr->curloc += len;
++
++ return 0;
++}
++
++int __inline__
++xdr_enc_uint64 (xdr_enc_t * xdr, uint64_t i)
++{
++ uint64_t b = cpu_to_be64 (i);
++ return append_bytes (xdr, XDR_UINT64, &b, sizeof (uint64_t));
++}
++
++int __inline__
++xdr_enc_uint32 (xdr_enc_t * xdr, uint32_t i)
++{
++ uint32_t b = cpu_to_be32 (i);
++ return append_bytes (xdr, XDR_UINT32, &b, sizeof (uint32_t));
++}
++
++int __inline__
++xdr_enc_uint16 (xdr_enc_t * xdr, uint16_t i)
++{
++ uint16_t b = cpu_to_be16 (i);
++ return append_bytes (xdr, XDR_UINT16, &b, sizeof (uint16_t));
++}
++
++int __inline__
++xdr_enc_uint8 (xdr_enc_t * xdr, uint8_t i)
++{
++ return append_bytes (xdr, XDR_UINT8, &i, sizeof (uint8_t));
++}
++
++int __inline__
++xdr_enc_ipv6 (xdr_enc_t * xdr, struct in6_addr *ip)
++{ /* bytes should already be in the right order. */
++ return append_bytes (xdr, XDR_IPv6, ip->s6_addr, 16);
++}
++
++int
++xdr_enc_raw (xdr_enc_t * xdr, void *p, uint16_t len)
++{
++ int e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if ((e = grow_stream (xdr, len + 3)) != 0)
++ return e;
++ *(xdr->stream + xdr->curloc) = XDR_RAW;
++ xdr->curloc += 1;
++ *((uint16_t *) (xdr->stream + xdr->curloc)) = cpu_to_be16 (len);
++ xdr->curloc += 2;
++ memcpy ((xdr->stream + xdr->curloc), p, len);
++ xdr->curloc += len;
++ return 0;
++}
++
++int
++xdr_enc_raw_iov (xdr_enc_t * xdr, int count, struct iovec *iov)
++{
++ size_t total = 0;
++ int i, err;
++ if (xdr == NULL || count < 1 || iov == NULL)
++ return -EINVAL;
++ for (i = 0; i < count; i++)
++ total += iov[i].iov_len;
++ /* make sure it fits in a uint16_t */
++ if (total > 0xffff)
++ return -EFBIG;
++ /* grow to fit */
++ if ((err = grow_stream (xdr, total + 3)) != 0)
++ return err;
++ /* copy in header and size */
++ *(xdr->stream + xdr->curloc) = XDR_RAW;
++ xdr->curloc += 1;
++ *((uint16_t *) (xdr->stream + xdr->curloc)) = cpu_to_be16 (total);
++ xdr->curloc += 2;
++ /* copy in all iovbufs */
++ for (i = 0; i < count; i++) {
++ if (iov[i].iov_base == NULL)
++ continue;
++ memcpy ((xdr->stream + xdr->curloc), iov[i].iov_base,
++ iov[i].iov_len);
++ xdr->curloc += iov[i].iov_len;
++ }
++ return 0;
++}
++
++int
++xdr_enc_string (xdr_enc_t * xdr, uint8_t * s)
++{
++ int len, e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if (s == NULL)
++ len = 0;
++ else
++ len = strlen (s);
++ if ((e = grow_stream (xdr, len + 3)) != 0)
++ return e;
++ *(xdr->stream + xdr->curloc) = XDR_STRING;
++ xdr->curloc += 1;
++ *((uint16_t *) (xdr->stream + xdr->curloc)) = cpu_to_be16 (len);
++ xdr->curloc += 2;
++ if (len > 0) {
++ memcpy ((xdr->stream + xdr->curloc), s, len);
++ xdr->curloc += len;
++ }
++ return 0;
++}
++
++int
++xdr_enc_list_start (xdr_enc_t * xdr)
++{
++ int e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if ((e = grow_stream (xdr, 1)) != 0)
++ return e;
++ *(xdr->stream + xdr->curloc) = XDR_LIST_START;
++ xdr->curloc += 1;
++ return 0;
++}
++
++int
++xdr_enc_list_stop (xdr_enc_t * xdr)
++{
++ int e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if ((e = grow_stream (xdr, 1)) != 0)
++ return e;
++ *(xdr->stream + xdr->curloc) = XDR_LIST_STOP;
++ xdr->curloc += 1;
++ return 0;
++}
++
++/*****************************************************************************/
++
++/**
++ * get_next -
++ * @xdr:
++ *
++ * get whatever may be next, and put it into the buffer.
++ *
++ * Returns: int
++ */
++static int
++get_next (xdr_dec_t * xdr)
++{
++ int err;
++ uint16_t len;
++ if ((err = xdr_recv (xdr->fd, xdr->stream, 1)) < 0)
++ return err;
++ if (err == 0)
++ return -EPROTO;
++ xdr->curloc = 1;
++ if (*(xdr->stream) == XDR_UINT64) {
++ len = sizeof (uint64_t);
++ } else if (*(xdr->stream) == XDR_UINT32) {
++ len = sizeof (uint32_t);
++ } else if (*(xdr->stream) == XDR_UINT16) {
++ len = sizeof (uint16_t);
++ } else if (*(xdr->stream) == XDR_UINT8) {
++ len = sizeof (uint8_t);
++ } else if (*(xdr->stream) == XDR_IPv6) {
++ len = 16;
++ } else if (*(xdr->stream) == XDR_STRING) {
++ if ((err = xdr_recv (xdr->fd, (xdr->stream + 1), 2)) < 0)
++ return err;
++ if (err == 0)
++ return -EPROTO;
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++ } else if (*(xdr->stream) == XDR_RAW) {
++ if ((err = xdr_recv (xdr->fd, (xdr->stream + 1), 2)) < 0)
++ return err;
++ if (err == 0)
++ return -EPROTO;
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++ } else if (*(xdr->stream) == XDR_LIST_START) {
++ xdr->curloc = 0;
++ return 0;
++ } else if (*(xdr->stream) == XDR_LIST_STOP) {
++ xdr->curloc = 0;
++ return 0;
++ } else {
++ return -1;
++ }
++
++ /* grow buffer if need be. */
++ if (xdr->curloc + len > xdr->length) {
++ uint8_t *c;
++ c = xdr_realloc (xdr->stream, xdr->curloc + len, xdr->length);
++ if (c == NULL)
++ return -ENOMEM;
++ xdr->stream = c;
++ xdr->length = xdr->curloc + len;
++ }
++
++ if (len > 0) {
++ if ((err =
++ xdr_recv (xdr->fd, (xdr->stream + xdr->curloc), len)) < 0)
++ return err;
++ if (err == 0)
++ return -EPROTO;
++ }
++ xdr->curloc = 0;
++ return 0;
++}
++
++int
++xdr_dec_uint64 (xdr_dec_t * xdr, uint64_t * i)
++{
++ int err;
++ if (xdr == NULL || i == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_UINT64)
++ return -ENOMSG;
++ *i = be64_to_cpu (*((uint64_t *) (xdr->stream + 1)));
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_uint32 (xdr_dec_t * xdr, uint32_t * i)
++{
++ int err;
++ if (xdr == NULL || i == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_UINT32)
++ return -ENOMSG;
++ *i = be32_to_cpu (*((uint32_t *) (xdr->stream + 1)));
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_uint16 (xdr_dec_t * xdr, uint16_t * i)
++{
++ int err;
++ if (xdr == NULL || i == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_UINT16)
++ return -ENOMSG;
++ *i = be16_to_cpu (*((uint16_t *) (xdr->stream + 1)));
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_uint8 (xdr_dec_t * xdr, uint8_t * i)
++{
++ int err;
++ if (xdr == NULL || i == NULL)
++ return -EINVAL;
++
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_UINT8)
++ return -ENOMSG;
++ *i = *((uint8_t *) (xdr->stream + 1));
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_ipv6 (xdr_dec_t * xdr, struct in6_addr *ip)
++{
++ int err;
++ if (xdr == NULL || ip == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_IPv6)
++ return -ENOMSG;
++ memcpy (ip, xdr->stream + 1, 16);
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++/* mallocing version */
++int
++xdr_dec_raw_m (xdr_dec_t * xdr, void **p, uint16_t * l)
++{
++ int len;
++ void *str;
++ int err;
++
++ if (xdr == NULL || p == NULL || l == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_RAW)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ str = kmalloc (len, GFP_KERNEL);
++ if (str == NULL)
++ return -ENOMEM;
++ memcpy (str, (xdr->stream + xdr->curloc), len);
++ xdr->curloc += len;
++
++ *p = str;
++ *l = len;
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++/* non-mallocing version */
++int
++xdr_dec_raw (xdr_dec_t * xdr, void *p, uint16_t * l)
++{
++ int len;
++ int err;
++
++ if (xdr == NULL || p == NULL || l == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_RAW)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ if (len > *l)
++ return -1;
++
++ memcpy (p, (xdr->stream + xdr->curloc), len);
++ xdr->curloc += len;
++
++ *l = len;
++
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++/**
++ * xdr_dec_raw_ag - auto-growing version
++ * @xdr:
++ * @p: <> pointer to buffer
++ * @bl: <> size of the buffer
++ * @rl: > size of data read from stream
++ *
++ * This form of xdr_dec_raw will increase the size of a pre-malloced buffer
++ * to fit the data it is reading. It is kind of a merger of the
++ * non-mallocing and mallocing versions.
++ *
++ * Returns: int
++ */
++int
++xdr_dec_raw_ag (xdr_dec_t * xdr, void **p, uint16_t * bl, uint16_t * rl)
++{
++ int len;
++ int err;
++
++ if (xdr == NULL || p == NULL || bl == NULL || rl == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_RAW)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ if (len > *bl) { /* grow p */
++ void *temp;
++ temp = xdr_realloc (*p, len, *bl);
++ if (temp == NULL)
++ return -ENOMEM;
++ *bl = len;
++ *p = temp;
++ }
++
++ memcpy (*p, (xdr->stream + xdr->curloc), len);
++ xdr->curloc += len;
++
++ *rl = len;
++
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
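++
++/* A short sketch of the auto-growing pattern, assuming the buffer is
++ * reused across many decodes and kfree'd when the caller is done:
++ *
++ * void *buf = NULL;
++ * uint16_t buflen = 0, readlen;
++ * while (more_messages) // hypothetical loop condition
++ * xdr_dec_raw_ag (xdr, &buf, &buflen, &readlen);
++ */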
++
++/* mallocing version */
++int
++xdr_dec_string (xdr_dec_t * xdr, uint8_t ** strp)
++{
++ int len;
++ char *str;
++ int err;
++ if (xdr == NULL || strp == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_STRING)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ if (len > 0) {
++ str = kmalloc (len + 1, GFP_KERNEL);
++ if (str == NULL)
++ return -ENOMEM;
++ str[len] = '\0';
++ memcpy (str, (xdr->stream + xdr->curloc), len);
++ xdr->curloc += len;
++
++ *strp = str;
++ } else {
++ *strp = NULL;
++ }
++
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++/* non-mallocing version */
++int
++xdr_dec_string_nm (xdr_dec_t * xdr, uint8_t * string, size_t l)
++{
++ int len;
++ int err;
++ if (xdr == NULL || string == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_STRING)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ if (len > 0) {
++ memcpy (string, (xdr->stream + xdr->curloc), MIN (len, l));
++ if (l > len) {
++ string[len] = '\0';
++ }
++ string[l - 1] = '\0';
++ } else {
++ string[0] = '\0';
++ }
++
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_string_ag (xdr_dec_t * xdr, uint8_t ** s, uint16_t * bl)
++{
++ int len;
++ int err;
++ if (xdr == NULL || s == NULL || bl == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_STRING)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ if (len == 0) { /* empty string */
++ **s = '\0';
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++ }
++
++ if (len >= *bl) { /* grow s */
++ void *temp;
++ temp = xdr_realloc (*s, len + 1, *bl);
++ if (temp == NULL)
++ return -ENOMEM;
++ *bl = len + 1;
++ *s = temp;
++ }
++
++ memcpy (*s, (xdr->stream + xdr->curloc), len);
++ (*s)[len] = '\0';
++
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_list_start (xdr_dec_t * xdr)
++{
++ int err;
++ if (xdr == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_LIST_START)
++ return -ENOMSG;
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_list_stop (xdr_dec_t * xdr)
++{
++ int err;
++ if (xdr == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_LIST_STOP)
++ return -ENOMSG;
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/xdr_io.c linux-patched/fs/gfs_locking/lock_gulm/xdr_io.c
+--- linux-orig/fs/gfs_locking/lock_gulm/xdr_io.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/xdr_io.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,169 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * does the lowest level of reads and writes.
++ * In kernel and/or userspace.
++ */
++
++#include "xdr.h"
++
++#ifdef __KERNEL__
++#ifdef __linux__
++#include <linux/net.h>
++#include <linux/in.h>
++#include <linux/socket.h>
++#include <net/sock.h>
++#include "asm/uaccess.h"
++
++/**
++ * do_tfer - transfers data over a socket
++ * @sock: < socket
++ * @iov: <> iovec of buffers
++ * @n: < how many iovecs
++ * @size: < total data size to send/recv
++ * @dir: < nonzero to send, zero to recv
++ *
++ * Returns: <0: Error
++ * >=0: Bytes transferred
++ */
++static int
++do_tfer (struct socket *sock, struct iovec *iov, int n, int size, int dir)
++{
++ unsigned long flags;
++ sigset_t oldset;
++ struct msghdr m;
++ mm_segment_t fs;
++ int rv, moved = 0;
++
++ fs = get_fs ();
++ set_fs (get_ds ());
++
++ /* XXX do I still want the signal stuff? */
++ spin_lock_irqsave (&current->sighand->siglock, flags);
++ oldset = current->blocked;
++ siginitsetinv (&current->blocked,
++ sigmask (SIGKILL) | sigmask (SIGTERM));
++ recalc_sigpending ();
++ spin_unlock_irqrestore (&current->sighand->siglock, flags);
++
++ memset (&m, 0, sizeof (struct msghdr));
++ for (;;) {
++ m.msg_iov = iov;
++ m.msg_iovlen = n;
++ m.msg_flags = MSG_NOSIGNAL;
++
++ if (dir)
++ rv = sock_sendmsg (sock, &m, size - moved);
++ else
++ rv = sock_recvmsg (sock, &m, size - moved, 0);
++
++ if (rv <= 0)
++ goto out_err;
++ moved += rv;
++
++ if (moved >= size)
++ break;
++
++ /* adjust iov's for next transfer */
++ while (iov->iov_len == 0) {
++ iov++;
++ n--;
++ }
++
++ }
++ rv = moved;
++ out_err:
++ spin_lock_irqsave (&current->sighand->siglock, flags);
++ current->blocked = oldset;
++ recalc_sigpending ();
++ spin_unlock_irqrestore (&current->sighand->siglock, flags);
++
++ set_fs (fs);
++
++ return rv;
++}
++
++size_t
++xdr_send (struct socket * sock, void *buf, size_t size)
++{
++ struct iovec iov;
++ int res;
++
++ iov.iov_base = buf;
++ iov.iov_len = size;
++
++ res = do_tfer (sock, &iov, 1, size, 1);
++
++ return res;
++}
++
++size_t
++xdr_recv (struct socket * sock, void *buf, size_t size)
++{
++ struct iovec iov;
++ int res;
++
++ iov.iov_base = buf;
++ iov.iov_len = size;
++
++ res = do_tfer (sock, &iov, 1, size, 0);
++
++ return res;
++}
++
++#endif /*__linux__*/
++#else /*__KERNEL__*/
++
++#include <errno.h>
++#include <sys/types.h>
++#include <sys/socket.h>
++
++ssize_t
++xdr_recv (int fd, void *buf, size_t len)
++{
++ ssize_t cnt = 0;
++ size_t ttl = 0;
++ while (len > 0) {
++ cnt = recv (fd, buf, len, 0);
++ if (cnt == 0)
++ return 0;
++ if (cnt < 0)
++ return -errno;
++ len -= cnt;
++ buf += cnt;
++ ttl += cnt;
++ }
++ return ttl;
++}
++
++ssize_t
++xdr_send (int fd, void *buf, size_t len)
++{
++ ssize_t cnt = 0;
++ size_t ttl = 0;
++ while (len > 0) {
++ cnt = send (fd, buf, len, 0);
++ if (cnt == 0)
++ return 0;
++ if (cnt < 0)
++ return -errno;
++ len -= cnt;
++ buf += cnt;
++ ttl += cnt;
++ }
++ return ttl;
++}
++
++#endif /*__KERNEL__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/xdr_socket.c linux-patched/fs/gfs_locking/lock_gulm/xdr_socket.c
+--- linux-orig/fs/gfs_locking/lock_gulm/xdr_socket.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/xdr_socket.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,82 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * This file opens and closes a socket.
++ * In kernel and/or userspace.
++ */
++
++#include "xdr.h"
++
++#ifdef __KERNEL__
++#ifdef __linux__
++
++int
++xdr_open (xdr_socket * xsk)
++{
++ return sock_create (AF_INET6, SOCK_STREAM, 0, xsk);
++}
++
++int
++xdr_connect (struct sockaddr_in6 *adr, xdr_socket xsk)
++{
++ return xsk->ops->connect (xsk,
++ (struct sockaddr *) adr,
++ sizeof (struct sockaddr_in6), 0);
++}
++
++void
++xdr_close (xdr_socket * xsk)
++{
++ if (*xsk == NULL)
++ return;
++ sock_release (*xsk);
++ *xsk = NULL;
++}
++
++#endif /*__linux__*/
++#else /*__KERNEL__*/
++
++int
++xdr_open (xdr_socket * xsk)
++{
++ int sk;
++ sk = socket (AF_INET6, SOCK_STREAM, 0);
++ if (sk < 0)
++ return -errno;
++ *xsk = sk;
++ return 0;
++}
++
++int
++xdr_connect (struct sockaddr_in6 *adr, xdr_socket xsk)
++{
++ int err;
++ err =
++ connect (xsk, (struct sockaddr *) adr,
++ sizeof (struct sockaddr_in6));
++ if (err < 0)
++ return -errno;
++ return 0;
++}
++
++void
++xdr_close (xdr_socket * xsk)
++{
++ if (*xsk < 0)
++ return;
++ close (*xsk);
++ *xsk = -1;
++}
++
++#endif /*__KERNEL__*/
+diff -urN linux-orig/fs/gfs_locking/lock_harness/main.c linux-patched/fs/gfs_locking/lock_harness/main.c
+--- linux-orig/fs/gfs_locking/lock_harness/main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_harness/main.c 2004-06-16 12:03:10.006671787 -0500
+@@ -0,0 +1,226 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/wait.h>
++#include <linux/sched.h>
++#include <linux/kmod.h>
++#include <linux/lm_interface.h>
++
++#define RELEASE_NAME "<CVS>"
++
++struct lmh_wrapper {
++ struct list_head lw_list;
++ struct lm_lockops *lw_ops;
++};
++
++static struct semaphore lmh_lock;
++static struct list_head lmh_list;
++
++/**
++ * lm_register_proto - Register a low-level locking protocol
++ * @proto: the protocol definition
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++lm_register_proto(struct lm_lockops *proto)
++{
++ struct list_head *tmp, *head;
++ struct lmh_wrapper *lw;
++
++ down(&lmh_lock);
++
++ for (head = &lmh_list, tmp = head->next; tmp != head; tmp = tmp->next) {
++ lw = list_entry(tmp, struct lmh_wrapper, lw_list);
++
++ if (strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name) == 0) {
++ up(&lmh_lock);
++ printk("lock_harness: protocol %s already exists\n",
++ proto->lm_proto_name);
++ return -EEXIST;
++ }
++ }
++
++ lw = kmalloc(sizeof (struct lmh_wrapper), GFP_KERNEL);
++ if (!lw) {
++ up(&lmh_lock);
++ return -ENOMEM;
++ }
++ memset(lw, 0, sizeof (struct lmh_wrapper));
++
++ lw->lw_ops = proto;
++ list_add(&lw->lw_list, &lmh_list);
++
++ up(&lmh_lock);
++
++ return 0;
++}
++
++/**
++ * lm_unregister_proto - Unregister a low-level locking protocol
++ * @proto: the protocol definition
++ *
++ */
++
++void
++lm_unregister_proto(struct lm_lockops *proto)
++{
++ struct list_head *tmp, *head;
++ struct lmh_wrapper *lw = NULL;
++
++ down(&lmh_lock);
++
++ for (head = &lmh_list, tmp = head->next; tmp != head; tmp = tmp->next) {
++ lw = list_entry(tmp, struct lmh_wrapper, lw_list);
++
++ if (strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name) == 0) {
++ list_del(&lw->lw_list);
++ up(&lmh_lock);
++ kfree(lw);
++ return;
++ }
++ }
++
++ up(&lmh_lock);
++
++ printk("lock_harness: can't unregister lock protocol %s\n",
++ proto->lm_proto_name);
++}
++
++/**
++ * lm_mount - Mount a lock protocol
++ * @proto_name: the name of the protocol
++ * @table_name: the name of the lock space
++ * @host_data: data specific to this host
++ * @cb: the callback to the code using the lock module
++ * @fsdata: data to pass back with the callback
++ * @min_lvb_size: the minimum LVB size that the caller can deal with
++ * @lockstruct: a structure returned describing the mount
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++lm_mount(char *proto_name, char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t *fsdata,
++ unsigned int min_lvb_size, struct lm_lockstruct *lockstruct)
++{
++ struct list_head *tmp;
++ struct lmh_wrapper *lw = NULL;
++ int try = 0;
++ int error;
++
++ retry:
++ down(&lmh_lock);
++
++ for (tmp = lmh_list.next; tmp != &lmh_list; tmp = tmp->next) {
++ lw = list_entry(tmp, struct lmh_wrapper, lw_list);
++
++ if (strcmp(lw->lw_ops->lm_proto_name, proto_name) == 0)
++ break;
++ else
++ lw = NULL;
++ }
++
++ if (!lw) {
++ if (!try && capable(CAP_SYS_MODULE)) {
++ try = 1;
++ up(&lmh_lock);
++ request_module(proto_name);
++ goto retry;
++ }
++ printk("lock_harness: can't find protocol %s\n", proto_name);
++ error = -ENOENT;
++ goto out;
++ }
++
++ if (!try_module_get(lw->lw_ops->lm_owner)) {
++ try = 0;
++ up(&lmh_lock);
++ /* the module is presumably being unloaded; wait a
++    second and retry the lookup */
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ schedule_timeout(HZ);
++ goto retry;
++ }
++
++ error = lw->lw_ops->lm_mount(table_name, host_data,
++ cb, fsdata, min_lvb_size, lockstruct);
++ if (error)
++ module_put(lw->lw_ops->lm_owner);
++
++ out:
++ up(&lmh_lock);
++
++ return error;
++}
++
++/**
++ * lm_unmount - unmount a lock module
++ * @lockstruct: the lockstruct passed into mount
++ *
++ */
++
++void
++lm_unmount(struct lm_lockstruct *lockstruct)
++{
++ down(&lmh_lock);
++ lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
++ if (lockstruct->ls_ops->lm_owner)
++ module_put(lockstruct->ls_ops->lm_owner);
++ up(&lmh_lock);
++}
++
++/**
++ * init_lmh - Initialize the lock module harness
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int __init
++init_lmh(void)
++{
++ init_MUTEX(&lmh_lock);
++ INIT_LIST_HEAD(&lmh_list);
++
++ printk("Lock_Harness %s (built %s %s) installed\n",
++ RELEASE_NAME, __DATE__, __TIME__);
++
++ return 0;
++}
++
++/**
++ * exit_lmh - cleanup the Lock Module Harness
++ *
++ */
++
++void __exit
++exit_lmh(void)
++{
++}
++
++module_init(init_lmh);
++module_exit(exit_lmh);
++
++MODULE_DESCRIPTION("GFS Lock Module Harness " RELEASE_NAME);
++MODULE_AUTHOR("Red Hat, Inc.");
++MODULE_LICENSE("GPL");
++
++EXPORT_SYMBOL_GPL(lm_register_proto);
++EXPORT_SYMBOL_GPL(lm_unregister_proto);
++EXPORT_SYMBOL_GPL(lm_mount);
++EXPORT_SYMBOL_GPL(lm_unmount);
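+
+(Not part of the patch: a sketch of how a filesystem might drive the harness
+above, based on the lm_interface.h definitions that follow. The callback body,
+the table name "mycluster:myfs", and the LVB size of 32 are hypothetical.)
+
+	#include <linux/lm_interface.h>
+
+	static void example_cb(lm_fsdata_t *fsdata, unsigned int type, void *data)
+	{
+		/* react here to LM_CB_NEED_*, LM_CB_NEED_RECOVERY,
+		   LM_CB_DROPLOCKS, and LM_CB_ASYNC requests from the
+		   lock module */
+	}
+
+	static int example_mount_locking(struct lm_lockstruct *ls)
+	{
+		int error;
+
+		/* loads and binds the named protocol module if necessary */
+		error = lm_mount("lock_nolock", "mycluster:myfs", "",
+				 example_cb, NULL, 32, ls);
+		if (error)
+			return error;
+
+		/* lock operations now go through ls->ls_ops against
+		   ls->ls_lockspace; when finished: lm_unmount(ls) */
+		return 0;
+	}
+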
+diff -urN linux-orig/include/linux/lm_interface.h linux-patched/include/linux/lm_interface.h
+--- linux-orig/include/linux/lm_interface.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/include/linux/lm_interface.h 2004-06-16 12:03:10.005672019 -0500
+@@ -0,0 +1,193 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++
++ Sooner or later, I need to put all the documentation back into this file.
++ In the meantime, here are some notes.
++
++ - The lock module is now responsible for STOMITHing an expired
++ client before calling the callback with type LM_CB_NEED_RECOVERY.
++
++ - If the mount() operation returns first == TRUE, GFS will check all the
++ journals. GFS itself can't/shouldn't stomith the machines, so the lock module
++ needs to make sure that there are no zombie machines on any of the
++ journals. (I.e., this should probably happen on the first mount of the lock
++ space, while all mounts by other machines are blocked.) GFS will call
++ others_may_mount() when the filesystem is in a consistent state.
++
++ - GFS can issue multiple simultaneous get_lock()s for the same lockname.
++ The lock module needs to deal with this, either by 1) building a hash table
++ to look up the structures and keeping a reference count so there is only
++ one lm_lock_t for a given lockname, or 2) just dealing with multiple
++ lm_lock_t structures for a given lockname. (A sketch of approach 1 follows
++ this header.)
++
++*/
++
++#ifndef __LM_INTERFACE_DOT_H__
++#define __LM_INTERFACE_DOT_H__
++
++typedef void lm_lockspace_t;
++typedef void lm_lock_t;
++typedef void lm_fsdata_t;
++typedef void (*lm_callback_t) (lm_fsdata_t *fsdata, unsigned int type,
++ void *data);
++
++/* Flags for the struct lm_lockstruct->ls_flags field */
++
++#define LM_LSFLAG_LOCAL (0x00000001)
++#define LM_LSFLAG_ASYNC (0x00000002)
++
++/* Lock types */
++
++#define LM_TYPE_RESERVED (0x00)
++#define LM_TYPE_NONDISK (0x01)
++#define LM_TYPE_INODE (0x02)
++#define LM_TYPE_RGRP (0x03)
++#define LM_TYPE_META (0x04)
++#define LM_TYPE_IOPEN (0x05)
++#define LM_TYPE_FLOCK (0x06)
++#define LM_TYPE_PLOCK (0x07)
++#define LM_TYPE_QUOTA (0x08)
++
++/* States passed to lock() */
++
++#define LM_ST_UNLOCKED (0)
++#define LM_ST_EXCLUSIVE (1)
++#define LM_ST_DEFERRED (2)
++#define LM_ST_SHARED (3)
++
++/* Flags passed to lock() */
++
++#define LM_FLAG_TRY (0x00000001)
++#define LM_FLAG_TRY_1CB (0x00000002)
++#define LM_FLAG_NOEXP (0x00000004)
++#define LM_FLAG_ANY (0x00000008)
++#define LM_FLAG_PRIORITY (0x00000010)
++
++/* Flags returned by lock() */
++
++#define LM_OUT_ST_MASK (0x00000003)
++#define LM_OUT_CACHEABLE (0x00000004)
++#define LM_OUT_CANCELED (0x00000008)
++#define LM_OUT_NEED_E (0x00000010)
++#define LM_OUT_NEED_D (0x00000020)
++#define LM_OUT_NEED_S (0x00000040)
++#define LM_OUT_ASYNC (0x00000080)
++#define LM_OUT_LVB_INVALID (0x00000100)
++
++/* Callback types */
++
++#define LM_CB_NEED_E (257)
++#define LM_CB_NEED_D (258)
++#define LM_CB_NEED_S (259)
++#define LM_CB_NEED_RECOVERY (260)
++#define LM_CB_DROPLOCKS (261)
++#define LM_CB_ASYNC (262)
++
++/* Reset_exp messages */
++
++#define LM_RD_GAVEUP (308)
++#define LM_RD_SUCCESS (309)
++
++struct lm_lockname {
++ uint64_t ln_number;
++ unsigned int ln_type;
++};
++
++#define lm_name_equal(name1, name2) \
++(((name1)->ln_number == (name2)->ln_number) && \
++ ((name1)->ln_type == (name2)->ln_type))
++
++struct lm_async_cb {
++ struct lm_lockname lc_name;
++ int lc_ret;
++};
++
++struct lm_lockstruct;
++
++struct lm_lockops {
++ char lm_proto_name[256];
++
++ /* Mount/Unmount */
++
++ int (*lm_mount) (char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t *fsdata,
++ unsigned int min_lvb_size,
++ struct lm_lockstruct *lockstruct);
++ void (*lm_others_may_mount) (lm_lockspace_t *lockspace);
++ void (*lm_unmount) (lm_lockspace_t *lockspace);
++
++ /* Lock oriented operations */
++
++ int (*lm_get_lock) (lm_lockspace_t *lockspace,
++ struct lm_lockname *name, lm_lock_t **lockp);
++ void (*lm_put_lock) (lm_lock_t *lock);
++
++ unsigned int (*lm_lock) (lm_lock_t *lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags);
++ unsigned int (*lm_unlock) (lm_lock_t *lock, unsigned int cur_state);
++
++ void (*lm_cancel) (lm_lock_t *lock);
++
++ int (*lm_hold_lvb) (lm_lock_t *lock, char **lvbp);
++ void (*lm_unhold_lvb) (lm_lock_t *lock, char *lvb);
++ void (*lm_sync_lvb) (lm_lock_t *lock, char *lvb);
++
++ /* Posix Lock oriented operations */
++
++ int (*lm_plock_get) (lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t *start, uint64_t *end, int *exclusive,
++ unsigned long *rowner);
++
++ int (*lm_plock) (lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ int wait, int exclusive, uint64_t start,
++ uint64_t end);
++
++ int (*lm_punlock) (lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t start, uint64_t end);
++
++ /* Client oriented operations */
++
++ void (*lm_recovery_done) (lm_lockspace_t *lockspace, unsigned int jid,
++ unsigned int message);
++
++ struct module *lm_owner;
++};
++
++struct lm_lockstruct {
++ unsigned int ls_jid;
++ unsigned int ls_first;
++ unsigned int ls_lvb_size;
++ lm_lockspace_t *ls_lockspace;
++ struct lm_lockops *ls_ops;
++ int ls_flags;
++};
++
++/* Bottom interface */
++
++int lm_register_proto(struct lm_lockops *proto);
++void lm_unregister_proto(struct lm_lockops *proto);
++
++/* Top interface */
++
++int lm_mount(char *proto_name,
++ char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t *fsdata,
++ unsigned int min_lvb_size, struct lm_lockstruct *lockstruct);
++void lm_unmount(struct lm_lockstruct *lockstruct);
++
++#endif /* __LM_INTERFACE_DOT_H__ */
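+
+(Not part of the patch: a sketch of approach 1 from the notes above -- one
+lm_lock_t per lockname via a hash table plus reference count. All ex_* names
+are hypothetical, the table is a fixed 256 buckets, and the lookup/insert race
+flagged in the comment is left unhandled for brevity.)
+
+	struct ex_lock {
+		struct list_head el_list;
+		struct lm_lockname el_name;
+		unsigned int el_count;		/* get_lock() references */
+		/* protocol-specific state would follow */
+	};
+
+	static struct list_head ex_hash[256];	/* INIT_LIST_HEAD() each at init */
+	static spinlock_t ex_hash_lock = SPIN_LOCK_UNLOCKED;
+
+	static unsigned int ex_hash_fn(struct lm_lockname *name)
+	{
+		return ((unsigned int)name->ln_number ^ name->ln_type) & 0xFF;
+	}
+
+	static int ex_get_lock(lm_lockspace_t *lockspace,
+			       struct lm_lockname *name, lm_lock_t **lockp)
+	{
+		struct list_head *tmp, *head = &ex_hash[ex_hash_fn(name)];
+		struct ex_lock *el;
+
+		spin_lock(&ex_hash_lock);
+		for (tmp = head->next; tmp != head; tmp = tmp->next) {
+			el = list_entry(tmp, struct ex_lock, el_list);
+			if (lm_name_equal(&el->el_name, name)) {
+				el->el_count++;
+				spin_unlock(&ex_hash_lock);
+				*lockp = (lm_lock_t *)el;
+				return 0;
+			}
+		}
+		spin_unlock(&ex_hash_lock);
+
+		el = kmalloc(sizeof(struct ex_lock), GFP_KERNEL);
+		if (!el)
+			return -ENOMEM;
+		memset(el, 0, sizeof(struct ex_lock));
+		el->el_name = *name;
+		el->el_count = 1;
+
+		spin_lock(&ex_hash_lock);
+		list_add(&el->el_list, head);	/* a real module would re-check
+						   for a racing insert here */
+		spin_unlock(&ex_hash_lock);
+
+		*lockp = (lm_lock_t *)el;
+		return 0;
+	}
+
+	static void ex_put_lock(lm_lock_t *lock)
+	{
+		struct ex_lock *el = (struct ex_lock *)lock;
+
+		spin_lock(&ex_hash_lock);
+		if (!--el->el_count) {
+			list_del(&el->el_list);
+			spin_unlock(&ex_hash_lock);
+			kfree(el);
+			return;
+		}
+		spin_unlock(&ex_hash_lock);
+	}
+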
+diff -urN linux-orig/fs/gfs_locking/lock_nolock/main.c linux-patched/fs/gfs_locking/lock_nolock/main.c
+--- linux-orig/fs/gfs_locking/lock_nolock/main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_nolock/main.c 2004-06-16 12:03:13.918762838 -0500
+@@ -0,0 +1,350 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/lm_interface.h>
++
++#define RELEASE_NAME "<CVS>"
++
++struct nolock_lockspace {
++ unsigned int nl_lvb_size;
++};
++
++struct lm_lockops nolock_ops;
++
++/**
++ * nolock_mount - mount a nolock lockspace
++ * @table_name: the name of the space to mount
++ * @host_data: host specific data
++ * @cb: the callback to the code using the lock module
++ * @fsdata: data to pass back with the callback
++ * @min_lvb_size: the minimum LVB size that the caller can deal with
++ * @lockstruct: the structure to fill in describing the mount
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++nolock_mount(char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t *fsdata,
++ unsigned int min_lvb_size, struct lm_lockstruct *lockstruct)
++{
++ char *c;
++ unsigned int jid;
++ struct nolock_lockspace *nl;
++
++ /* If there is a "jid=" in the hostdata, use that jid.
++ Otherwise, use zero. */
++
++ c = strstr(host_data, "jid=");
++ if (!c)
++ jid = 0;
++ else {
++ c += 4;
++ sscanf(c, "%u", &jid);
++ }
++
++ nl = kmalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
++ if (!nl)
++ return -ENOMEM;
++
++ memset(nl, 0, sizeof(struct nolock_lockspace));
++ nl->nl_lvb_size = min_lvb_size;
++
++ lockstruct->ls_jid = jid;
++ lockstruct->ls_first = 1;
++ lockstruct->ls_lvb_size = min_lvb_size;
++ lockstruct->ls_lockspace = (lm_lockspace_t *)nl;
++ lockstruct->ls_ops = &nolock_ops;
++ lockstruct->ls_flags = LM_LSFLAG_LOCAL | LM_LSFLAG_ASYNC;
++
++ return 0;
++}
++
++/**
++ * nolock_others_may_mount - allow other nodes to mount the lock space
++ * @lockspace: the lockspace
++ *
++ */
++
++static void
++nolock_others_may_mount(lm_lockspace_t *lockspace)
++{
++}
++
++/**
++ * nolock_unmount - unmount a lock space
++ * @lockspace: the lockspace to unmount
++ *
++ */
++
++static void
++nolock_unmount(lm_lockspace_t *lockspace)
++{
++ struct nolock_lockspace *nl = (struct nolock_lockspace *)lockspace;
++ kfree(nl);
++}
++
++/**
++ * nolock_get_lock - get an lm_lock_t given a description of the lock
++ * @lockspace: the lockspace the lock lives in
++ * @name: the name of the lock
++ * @lockp: return the lm_lock_t here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++nolock_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ lm_lock_t **lockp)
++{
++ /* no per-lock state is needed, so hand back the lockspace itself */
++ *lockp = (lm_lock_t *)lockspace;
++ return 0;
++}
++
++/**
++ * nolock_put_lock - get rid of a lock structure
++ * @lock: the lock to throw away
++ *
++ */
++
++static void
++nolock_put_lock(lm_lock_t *lock)
++{
++}
++
++/**
++ * nolock_lock - acquire a lock
++ * @lock: the lock to manipulate
++ * @cur_state: the current state
++ * @req_state: the requested state
++ * @flags: modifier flags
++ *
++ * Returns: A bitmap of LM_OUT_*
++ */
++
++static unsigned int
++nolock_lock(lm_lock_t *lock, unsigned int cur_state, unsigned int req_state,
++ unsigned int flags)
++{
++ return req_state | LM_OUT_CACHEABLE;
++}
++
++/**
++ * nolock_unlock - unlock a lock
++ * @lock: the lock to manipulate
++ * @cur_state: the current state
++ *
++ * Returns: 0
++ */
++
++static unsigned int
++nolock_unlock(lm_lock_t *lock, unsigned int cur_state)
++{
++ return 0;
++}
++
++/**
++ * nolock_cancel - cancel a request on a lock
++ * @lock: the lock to cancel request for
++ *
++ */
++
++static void
++nolock_cancel(lm_lock_t *lock)
++{
++}
++
++/**
++ * nolock_hold_lvb - hold on to a lock value block
++ * @lock: the lock the LVB is associated with
++ * @lvbp: return the LVB buffer here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++nolock_hold_lvb(lm_lock_t *lock, char **lvbp)
++{
++ /* the "lock" is really the lockspace; see nolock_get_lock() */
++ struct nolock_lockspace *nl = (struct nolock_lockspace *)lock;
++ int error = 0;
++
++ *lvbp = kmalloc(nl->nl_lvb_size, GFP_KERNEL);
++ if (*lvbp)
++ memset(*lvbp, 0, nl->nl_lvb_size);
++ else
++ error = -ENOMEM;
++
++ return error;
++}
++
++/**
++ * nolock_unhold_lvb - release a LVB
++ * @lock: the lock the LVB is associated with
++ * @lvb: the lock value block
++ *
++ */
++
++static void
++nolock_unhold_lvb(lm_lock_t *lock, char *lvb)
++{
++ kfree(lvb);
++}
++
++/**
++ * nolock_sync_lvb - sync out the value of an LVB
++ * @lock: the lock the LVB is associated with
++ * @lvb: the lock value block
++ *
++ */
++
++static void
++nolock_sync_lvb(lm_lock_t *lock, char *lvb)
++{
++}
++
++/**
++ * nolock_plock_get - get the details of a posix lock (not supported here)
++ * @lockspace: the lockspace
++ * @name: the name of the lock
++ * @owner: the owner of the posix lock
++ * @start: the start of the byte range
++ * @end: the end of the byte range
++ * @exclusive: whether the lock is exclusive
++ * @rowner: the owner of a conflicting lock
++ *
++ */
++
++static int
++nolock_plock_get(lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t *start, uint64_t *end, int *exclusive,
++ unsigned long *rowner)
++{
++ return -ENOSYS;
++}
++
++/**
++ * nolock_plock - acquire a posix lock (not supported here)
++ * @lockspace: the lockspace
++ * @name: the name of the lock
++ * @owner: the owner of the posix lock
++ * @wait: whether to wait for the lock
++ * @exclusive: whether the lock is exclusive
++ * @start: the start of the byte range
++ * @end: the end of the byte range
++ *
++ */
++
++static int
++nolock_plock(lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ int wait, int exclusive, uint64_t start,
++ uint64_t end)
++{
++ return -ENOSYS;
++}
++
++/**
++ * nolock_punlock - release a posix lock (not supported here)
++ * @lockspace: the lockspace
++ * @name: the name of the lock
++ * @owner: the owner of the posix lock
++ * @start: the start of the byte range
++ * @end: the end of the byte range
++ *
++ */
++
++static int
++nolock_punlock(lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t start, uint64_t end)
++{
++ return -ENOSYS;
++}
++
++/**
++ * nolock_recovery_done - reset the expired locks for a given jid
++ * @lockspace: the lockspace
++ * @jid: the jid
++ * @message: LM_RD_GAVEUP or LM_RD_SUCCESS
++ *
++ */
++
++static void
++nolock_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
++ unsigned int message)
++{
++}
++
++struct lm_lockops nolock_ops = {
++ .lm_proto_name = "lock_nolock",
++ .lm_mount = nolock_mount,
++ .lm_others_may_mount = nolock_others_may_mount,
++ .lm_unmount = nolock_unmount,
++ .lm_get_lock = nolock_get_lock,
++ .lm_put_lock = nolock_put_lock,
++ .lm_lock = nolock_lock,
++ .lm_unlock = nolock_unlock,
++ .lm_cancel = nolock_cancel,
++ .lm_hold_lvb = nolock_hold_lvb,
++ .lm_unhold_lvb = nolock_unhold_lvb,
++ .lm_sync_lvb = nolock_sync_lvb,
++ .lm_plock_get = nolock_plock_get,
++ .lm_plock = nolock_plock,
++ .lm_punlock = nolock_punlock,
++ .lm_recovery_done = nolock_recovery_done,
++ .lm_owner = THIS_MODULE,
++};
++
++/**
++ * init_nolock - Initialize the nolock module
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int __init
++init_nolock(void)
++{
++ int error;
++
++ error = lm_register_proto(&nolock_ops);
++ if (error) {
++ printk("lock_nolock: can't register protocol: %d\n", error);
++ return error;
++ }
++
++ printk("Lock_Nolock %s (built %s %s) installed\n",
++ RELEASE_NAME, __DATE__, __TIME__);
++
++ return 0;
++}
++
++/**
++ * exit_nolock - cleanup the nolock module
++ *
++ */
++
++void __exit
++exit_nolock(void)
++{
++ lm_unregister_proto(&nolock_ops);
++}
++
++module_init(init_nolock);
++module_exit(exit_nolock);
++
++MODULE_DESCRIPTION("GFS Nolock Locking Module " RELEASE_NAME);
++MODULE_AUTHOR("Red Hat, Inc.");
++MODULE_LICENSE("GPL");
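+
+(Not part of the patch: a sketch of how a caller might decode the bitmap that
+lm_lock() returns, per the LM_OUT_* definitions in lm_interface.h. The name
+example_try_shared is hypothetical; with lock_nolock the call always returns
+immediately with req_state | LM_OUT_CACHEABLE.)
+
+	static int example_try_shared(struct lm_lockstruct *ls, lm_lock_t *lock)
+	{
+		unsigned int ret, new_state;
+
+		ret = ls->ls_ops->lm_lock(lock, LM_ST_UNLOCKED,
+					  LM_ST_SHARED, LM_FLAG_TRY);
+
+		if (ret & LM_OUT_ASYNC)
+			return 0;	/* result arrives via LM_CB_ASYNC */
+
+		new_state = ret & LM_OUT_ST_MASK;	/* resulting hold state */
+		if (new_state != LM_ST_SHARED)
+			return -EAGAIN;	/* LM_FLAG_TRY request didn't succeed */
+
+		if (ret & LM_OUT_LVB_INVALID) {
+			/* the lock value block contents can't be trusted */
+		}
+		return 0;
+	}
+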