--- /dev/null
+# Add CMAN to build system
+diff -urN -p linux-2.6.7/Makefile linux/Makefile
+--- linux-2.6.7/Makefile 2004-06-16 13:19:37.000000000 +0800
++++ linux/Makefile 2004-06-17 14:55:06.000000000 +0800
+@@ -418,7 +418,7 @@ all: vmlinux
+
+ # Objects we will link into vmlinux / subdirs we need to visit
+ init-y := init/
+-drivers-y := drivers/ sound/
++drivers-y := drivers/ sound/ cluster/
+ net-y := net/
+ libs-y := lib/
+ core-y := usr/
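+# Adding cluster/ to drivers-y makes kbuild descend into the new directory
+# and link its objects into vmlinux; the per-arch "source cluster/Kconfig"
+# hunks below make the new menu visible to each architecture's config.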
+diff -urN -p linux-2.6.7/arch/alpha/Kconfig linux/arch/alpha/Kconfig
+--- linux-2.6.7/arch/alpha/Kconfig 2004-06-16 13:19:44.000000000 +0800
++++ linux/arch/alpha/Kconfig 2004-06-17 14:55:06.000000000 +0800
+@@ -698,3 +698,4 @@ source "crypto/Kconfig"
+
+ source "lib/Kconfig"
+
++source "cluster/Kconfig"
+diff -urN -p linux-2.6.7/arch/i386/Kconfig linux/arch/i386/Kconfig
+--- linux-2.6.7/arch/i386/Kconfig 2004-06-16 13:18:59.000000000 +0800
++++ linux/arch/i386/Kconfig 2004-06-17 14:55:06.000000000 +0800
+@@ -1315,6 +1315,8 @@ source "crypto/Kconfig"
+
+ source "lib/Kconfig"
+
++source "cluster/Kconfig"
++
+ config X86_SMP
+ bool
+ depends on SMP && !X86_VOYAGER
+diff -urN -p linux-2.6.7/arch/parisc/Kconfig linux/arch/parisc/Kconfig
+--- linux-2.6.7/arch/parisc/Kconfig 2004-06-16 13:19:36.000000000 +0800
++++ linux/arch/parisc/Kconfig 2004-06-17 14:55:06.000000000 +0800
+@@ -229,3 +229,4 @@ source "crypto/Kconfig"
+
+ source "lib/Kconfig"
+
++source "cluster/Kconfig"
+diff -urN -p linux-2.6.7/arch/sparc64/Kconfig linux/arch/sparc64/Kconfig
+--- linux-2.6.7/arch/sparc64/Kconfig 2004-06-16 13:19:52.000000000 +0800
++++ linux/arch/sparc64/Kconfig 2004-06-17 14:55:06.000000000 +0800
+@@ -713,3 +713,4 @@ source "crypto/Kconfig"
+
+ source "lib/Kconfig"
+
++source "cluster/Kconfig"
+diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig
+--- linux-2.6.7/cluster/Kconfig 1970-01-01 07:30:00.000000000 +0730
++++ linux/cluster/Kconfig 2004-06-17 14:55:06.000000000 +0800
+@@ -0,0 +1,13 @@
++menu "Cluster Support"
++
++config CLUSTER
++ tristate "Cluster support"
++ ---help---
++ Enable clustering support. This is not the high-performance clustering
++ made famous by Beowulf. It is a high-availability cluster often using
++ shared storage.
++ The cluster manager is the heart(beat) of the cluster system. It is
++ needed by all the other components. It provides membership services
++ for those other subsystems.
++
++endmenu
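+# CLUSTER is a tristate, so CMAN can be built into the kernel or as a
+# module. An illustrative .config fragment for the modular build:
+#
+#   CONFIG_CLUSTER=m
+#
+# With CONFIG_CLUSTER=y the same code is linked straight into vmlinux.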
+diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile
+--- linux-2.6.7/cluster/Makefile 1970-01-01 07:30:00.000000000 +0730
++++ linux/cluster/Makefile 2004-06-17 14:55:06.000000000 +0800
+@@ -0,0 +1,3 @@
++obj-y := nocluster.o
++
++obj-$(CONFIG_CLUSTER) += cman/
+diff -urN -p linux-2.6.7/cluster/cman/Makefile linux/cluster/cman/Makefile
+--- linux-2.6.7/cluster/cman/Makefile 1970-01-01 07:30:00.000000000 +0730
++++ linux/cluster/cman/Makefile 2004-06-17 14:55:06.000000000 +0800
+@@ -0,0 +1,6 @@
++cman-objs := cnxman.o config.o membership.o proc.o\
++ sm_barrier.o sm_control.o sm_daemon.o sm_joinleave.o\
++ sm_membership.o sm_message.o sm_misc.o sm_recover.o sm_services.o \
++ sm_user.o
++
++obj-$(CONFIG_CLUSTER) := cman.o
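+# This uses the standard kbuild composite-object idiom: the files listed in
+# cman-objs are compiled and linked into a single object, cman.o (cman.ko
+# when CONFIG_CLUSTER=m). A hypothetical extra source file would be added
+# with:
+#
+#   cman-objs += newfile.o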
+diff -urN -p linux-2.6.7/cluster/nocluster.c linux/cluster/nocluster.c
+--- linux-2.6.7/cluster/nocluster.c 1970-01-01 07:30:00.000000000 +0730
++++ linux/cluster/nocluster.c 2004-06-17 14:55:06.000000000 +0800
+@@ -0,0 +1,20 @@
++/*
++ * cluster/nocluster.c
++ *
++ * Copied from net/nonet.c.
++ * Dummy functions to allow us to configure cluster support entirely
++ * out of the kernel.
++ *
++ * Distributed under the terms of the GNU GPL version 2.
++ * Copyright (c) Matthew Wilcox 2003
++ */
++
++#include <linux/module.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++
++void __init nocluster_init(void)
++{
++}
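+# nocluster.o is built unconditionally (obj-y), so cluster/ always produces
+# a built-in object even with CONFIG_CLUSTER=n, mirroring the net/nonet.c
+# trick it was copied from; nocluster_init() is deliberately an empty stub.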
+diff -urN linux-orig/cluster/cman/cnxman-private.h linux-patched/cluster/cman/cnxman-private.h
+--- linux-orig/cluster/cman/cnxman-private.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/cnxman-private.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,427 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __CNXMAN_PRIVATE_H
++#define __CNXMAN_PRIVATE_H
++
++/* Version triplet */
++#define CNXMAN_MAJOR_VERSION 2
++#define CNXMAN_MINOR_VERSION 0
++#define CNXMAN_PATCH_VERSION 1
++
++#define MAX_RETRIES 3 /* Maximum number of send retries */
++#define CAP_CLUSTER CAP_SYS_ADMIN /* Capability needed to manage the
++ * cluster */
++#ifdef __KERNEL__
++
++/* How we announce ourselves in console messages */
++#define CMAN_NAME "CMAN"
++
++/* One of these per AF_CLUSTER socket */
++struct cluster_sock {
++ /* WARNING: sk has to be the first member */
++ struct sock sk;
++
++ unsigned char port; /* Bound port or zero */
++ int (*kernel_callback) (char *, int, char *, int, unsigned int);
++ void *service_data;
++};
++
++#define cluster_sk(__sk) ((struct cluster_sock *)__sk)
++
++/* We have one of these for each socket we use for communications */
++struct cl_comms_socket {
++ struct socket *sock;
++ int broadcast; /* This is a broadcast socket */
++ int recv_only; /* This is the unicast receive end of a
++ * multicast socket */
++ struct sockaddr_in6 saddr; /* Socket address, contains the sockaddr for
++ * the remote end(s) */
++ int addr_len; /* Length of above */
++ int number; /* Internal socket number, used to cycle around
++ * sockets in case of network errors */
++ struct file *file; /* file pointer for user-passed in sockets */
++
++ wait_queue_t wait;
++
++ /* The socket list */
++ struct list_head list;
++
++ /* On here when it has something to say */
++ struct list_head active_list;
++ unsigned long active;
++};
++
++/* A client socket. We keep a list of these so we can notify clients of cluster
++ * events */
++struct cl_client_socket {
++ struct socket *sock;
++ struct list_head list;
++};
++
++/* This structure is tacked onto the start of a cluster message packet for our
++ * own nefarious purposes. */
++struct cl_protheader {
++ unsigned char port;
++ unsigned char flags;
++ unsigned short cluster; /* Our cluster number, little-endian */
++ unsigned short seq; /* Packet sequence number, little-endian */
++ int srcid; /* Node ID of the sender */
++ int tgtid; /* Node ID of the target or 0 for multicast
++ * messages */
++};
++
++/* A cluster internal protocol message - port number 0 */
++struct cl_protmsg {
++ struct cl_protheader header;
++ unsigned char cmd;
++};
++
++/* A Cluster ACK message */
++struct cl_ackmsg {
++ struct cl_protheader header;
++ unsigned char cmd; /* Always CLUSTER_CMD_ACK */
++	unsigned char remport;	/* Remote port number the original message was
++ * for */
++ unsigned char aflags; /* ACK flags 0=OK, 1=No listener */
++ unsigned char pad;
++ unsigned short seq; /* Sequence number we are acking */
++};
++
++/* A Cluster LISTENREQ/LISTENRESP message */
++struct cl_listenmsg {
++ unsigned char cmd; /* CLUSTER_CMD_LISTENRESP/REQ */
++ unsigned char target_port; /* Port to probe */
++ unsigned char listening; /* Always 0 for LISTENREQ */
++ unsigned char pad;
++ unsigned short tag; /* PID of remote waiting process */
++};
++
++/* A Cluster PORTCLOSED message */
++struct cl_closemsg {
++ unsigned char cmd; /* CLUSTER_CMD_PORTCLOSED */
++ unsigned char port;
++};
++
++/* Structure of a newly dead node, passed from cnxman to kmembershipd */
++struct cl_new_dead_node {
++ struct list_head list;
++ struct cluster_node *node;
++};
++
++/* Subcommands for BARRIER message */
++#define BARRIER_REGISTER 1
++#define BARRIER_CHANGE 2
++#define BARRIER_WAIT 4
++#define BARRIER_COMPLETE 5
++
++/* A Cluster BARRIER message */
++struct cl_barriermsg {
++ unsigned char cmd; /* CLUSTER_CMD_BARRIER */
++ unsigned char subcmd; /* BARRIER sub command */
++ unsigned short pad;
++ unsigned int flags;
++ unsigned int nodes;
++ char name[MAX_BARRIER_NAME_LEN];
++};
++
++/* Membership services messages, the cl_protheader is added transparently */
++struct cl_mem_hello_msg {
++ unsigned char cmd;
++ unsigned char flags;
++ unsigned short members; /* Number of nodes in the cluster,
++ * little-endian */
++ unsigned int generation; /* Current cluster generation number */
++};
++
++struct cl_mem_endtrans_msg {
++ unsigned char cmd;
++ unsigned char pad1;
++ unsigned short pad2;
++ unsigned int quorum;
++ unsigned int total_votes;
++ unsigned int generation; /* Current cluster generation number */
++ unsigned int new_node_id; /* If reason is a new node joining */
++};
++
++/* ACK types for JOINACK message */
++#define JOINACK_TYPE_OK 1 /* You can join */
++#define JOINACK_TYPE_NAK 2 /* You can NOT join */
++#define JOINACK_TYPE_WAIT 3 /* Wait a bit longer - cluster is in transition
++ * already */
++
++struct cl_mem_joinack_msg {
++ unsigned char cmd;
++ unsigned char acktype;
++};
++
++/* This is used by JOINREQ message */
++struct cl_mem_join_msg {
++ unsigned char cmd;
++ unsigned char votes;
++ unsigned short num_addr; /* Number of addresses for this node */
++ unsigned int expected_votes;
++ unsigned int members; /* Number of nodes in the cluster,
++ * little-endian */
++ unsigned int major_version; /* Not backwards compatible */
++ unsigned int minor_version; /* Backwards compatible */
++ unsigned int patch_version; /* Backwards/forwards compatible */
++ unsigned int config_version;
++ unsigned int addr_len; /* length of node addresses */
++ char clustername[16];
++ /* Followed by <num_addr> addresses of `address_length` bytes and a
++ * NUL-terminated node name */
++};
++
++/* State transition start reasons: */
++#define TRANS_NEWNODE 1 /* A new node is joining the cluster */
++#define TRANS_REMNODE 2 /* A node has left the cluster */
++#define TRANS_ANOTHERREMNODE 3 /* A node left the cluster while we were in
++ * transition */
++#define TRANS_NEWMASTER 4 /* We have had an election and I am the new
++ * master */
++#define TRANS_CHECK 5 /* A consistency check was called for */
++#define TRANS_RESTART 6 /* Transition restarted because of a previous
++ * timeout */
++#define TRANS_DEADMASTER 7 /* The master died during transition and I have
++ * taken over */
++
++/* This is used to start a state transition */
++struct cl_mem_starttrans_msg {
++ unsigned char cmd;
++ unsigned char reason; /* Why a start transition was started - see
++ * above */
++ unsigned char flags;
++ unsigned char votes;
++ unsigned int expected_votes;
++ unsigned int generation; /* Incremented for each STARTTRANS sent
++ */
++ int nodeid; /* Node to be removed */
++ unsigned short num_addrs;
++ /* If reason == TRANS_NEWNODE: Followed by <num_addr> addresses of
++ * `address_length` bytes and a NUL-terminated node name */
++};
++
++struct cl_mem_startack_msg {
++ unsigned char cmd;
++ unsigned char reason;
++ unsigned short pad;
++ unsigned int generation;
++ unsigned int node_id; /* node_id we think new node should have */
++ unsigned int highest_node_id; /* highest node_id on this system */
++};
++
++/* Reconfigure a cluster parameter */
++struct cl_mem_reconfig_msg {
++ unsigned char cmd;
++ unsigned char param;
++ unsigned short pad;
++ unsigned int value;
++};
++
++/* Structure containing information about an outstanding listen request */
++struct cl_waiting_listen_request {
++ wait_queue_head_t waitq;
++ int result;
++ int waiting;
++ unsigned short tag;
++ int nodeid;
++ struct list_head list;
++};
++
++/* Messages from membership services */
++#define CLUSTER_MEM_JOINCONF 1
++#define CLUSTER_MEM_JOINREQ 2
++#define CLUSTER_MEM_LEAVE 3
++#define CLUSTER_MEM_HELLO 4
++#define CLUSTER_MEM_KILL 5
++#define CLUSTER_MEM_JOINACK 6
++#define CLUSTER_MEM_ENDTRANS 7
++#define CLUSTER_MEM_RECONFIG 8
++#define CLUSTER_MEM_MASTERVIEW 9
++#define CLUSTER_MEM_STARTTRANS 10
++#define CLUSTER_MEM_JOINREJ 11
++#define CLUSTER_MEM_VIEWACK 12
++#define CLUSTER_MEM_STARTACK 13
++#define CLUSTER_MEM_TRANSITION 14
++#define CLUSTER_MEM_NEWCLUSTER 15
++#define CLUSTER_MEM_CONFACK 16
++#define CLUSTER_MEM_NOMINATE 17
++
++/* Parameters for RECONFIG command */
++#define RECONFIG_PARAM_EXPECTED_VOTES 1
++#define RECONFIG_PARAM_NODE_VOTES 2
++#define RECONFIG_PARAM_CONFIG_VERSION 3
++
++/* Data associated with an outgoing socket */
++struct cl_socket {
++ struct file *file; /* The real file */
++ struct socket *socket; /* The real sock */
++ struct cl_multicast_sock multicast_info;
++ int num_nodes; /* On this link */
++ int retransmit_count;
++};
++
++/* There's one of these for each node in the cluster */
++struct cluster_node {
++ struct list_head list;
++ char *name; /* Node/host name of node */
++ struct list_head addr_list;
++ int us; /* This node is us */
++ unsigned int node_id; /* Unique node ID */
++ nodestate_t state;
++ unsigned short last_seq_recv;
++ unsigned short last_seq_acked;
++ unsigned short last_seq_sent;
++ unsigned int votes;
++ unsigned int expected_votes;
++ unsigned int leave_reason;
++ unsigned int incarnation; /* Incremented each time a node joins
++ * the cluster */
++ unsigned long last_hello; /* Jiffies */
++};
++
++/* This is how we keep a list of user processes that are listening for cluster
++ * membership events */
++struct notify_struct {
++ struct list_head list;
++ pid_t pid;
++ int signal;
++};
++
++/* This is how we keep a list of kernel callbacks that are registered for
++ * cluster membership events */
++struct kernel_notify_struct {
++ struct list_head list;
++ void (*callback) (kcl_callback_reason, long arg);
++};
++
++/* A message waiting to be sent */
++struct queued_message {
++ struct list_head list;
++
++ struct socket *socket;
++ struct sockaddr_cl addr;
++ int addr_len;
++ int msg_len;
++ unsigned char port;
++ unsigned int flags;
++ char msg_buffer[MAX_CLUSTER_MESSAGE];
++};
++
++/* A barrier */
++struct cl_barrier {
++ struct list_head list;
++
++ char name[MAX_BARRIER_NAME_LEN];
++ unsigned int flags;
++ enum { BARRIER_STATE_WAITING, BARRIER_STATE_INACTIVE,
++ BARRIER_STATE_COMPLETE } state;
++ unsigned int expected_nodes;
++ unsigned int registered_nodes;
++ atomic_t got_nodes;
++ atomic_t completed_nodes;
++ unsigned int inuse;
++ unsigned int waitsent;
++ unsigned int phase; /* Completion phase */
++ unsigned int endreason; /* Reason we were woken, usually 0 */
++ unsigned long timeout; /* In seconds */
++
++ void (*callback) (char *name, int status);
++ wait_queue_head_t waitq;
++ struct semaphore lock; /* To synch with cnxman messages */
++ spinlock_t phase2_spinlock; /* Need to synchronise with timer
++ * interrupts */
++ struct timer_list timer;
++};
++
++/* Cluster protocol commands sent to port 0 */
++#define CLUSTER_CMD_ACK 1
++#define CLUSTER_CMD_LISTENREQ 2
++#define CLUSTER_CMD_LISTENRESP 3
++#define CLUSTER_CMD_PORTCLOSED 4
++#define CLUSTER_CMD_BARRIER 5
++
++extern struct cluster_node *find_node_by_addr(unsigned char *addr,
++ int addr_len);
++extern struct cluster_node *find_node_by_nodeid(unsigned int id);
++extern struct cluster_node *find_node_by_name(char *name);
++extern void set_quorate(int);
++extern void notify_kernel_listeners(kcl_callback_reason reason, long arg);
++extern void notify_listeners(void);
++extern void free_nodeid_array(void);
++extern int send_reconfigure(int param, unsigned int value);
++extern int calculate_quorum(int, int, int *);
++extern void recalculate_quorum(int);
++extern int send_leave(unsigned char);
++extern int get_quorum(void);
++extern void set_votes(int, int);
++extern void kcl_wait_for_all_acks(void);
++extern char *membership_state(char *, int);
++extern void a_node_just_died(struct cluster_node *node);
++extern void check_barrier_returns(void);
++extern int in_transition(void);
++extern void get_local_addresses(struct cluster_node *node);
++extern int add_node_address(struct cluster_node *node, unsigned char *addr, int len);
++extern void create_proc_entries(void);
++extern void cleanup_proc_entries(void);
++extern unsigned int get_highest_nodeid(void);
++extern int allocate_nodeid_array(void);
++extern void queue_oob_skb(struct socket *sock, int cmd);
++extern int new_temp_nodeid(char *addr, int addrlen);
++extern int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen);
++extern void remove_temp_nodeid(int nodeid);
++extern inline char *print_addr(unsigned char *addr, int len, char *buf)
++{
++ int i;
++ int ptr = 0;
++
++ for (i = 0; i < len; i++)
++ ptr += sprintf(buf + ptr, "%02x ", addr[i]);
++
++ return buf;
++}
++
++#define MAX_ADDR_PRINTED_LEN (address_length*3 + 1)
++
++/* Debug enabling macros. Sorry about the C++ comments but they're easier to
++ * get rid of than C ones... */
++
++// #define DEBUG_MEMB
++// #define DEBUG_COMMS
++// #define DEBUG_BARRIER
++
++/* Debug macros */
++#ifdef DEBUG_COMMS
++#define P_COMMS(fmt, args...) printk(KERN_DEBUG "cman comms: " fmt, ## args)
++#else
++#define P_COMMS(fmt, args...)
++#endif
++
++#ifdef DEBUG_BARRIER
++#define P_BARRIER(fmt, args...) printk(KERN_DEBUG "cman barrier: " fmt, ## args)
++#else
++#define P_BARRIER(fmt, args...)
++#endif
++
++#ifdef DEBUG_MEMB
++#define P_MEMB(fmt, args...) printk(KERN_DEBUG "cman memb: " fmt, ## args)
++#define C_MEMB(fmt, args...) printk(fmt, ## args)
++#else
++#define P_MEMB(fmt, args...)
++#define C_MEMB(fmt, args...)
++#endif
++
++#endif /* __KERNEL__ */
++
++#endif
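+# The P_COMMS/P_MEMB/P_BARRIER macros above compile away to nothing unless
+# the matching DEBUG_* define is uncommented. For example, with DEBUG_COMMS
+# defined,
+#
+#   P_COMMS("got %d bytes from %s\n", len, node->name);
+#
+# expands to printk(KERN_DEBUG "cman comms: got %d bytes from %s\n", ...);
+# the "args..." / "## args" form also allows calls with no arguments, such
+# as P_COMMS("closing down\n").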
+diff -urN linux-orig/cluster/cman/cnxman.c linux-patched/cluster/cman/cnxman.c
+--- linux-orig/cluster/cman/cnxman.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/cnxman.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,4080 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#define EXPORT_SYMTAB
++#include <linux/init.h>
++#include <linux/socket.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/file.h>
++#include <linux/utsname.h>
++#include <net/sock.h>
++#include <linux/proc_fs.h>
++#include <linux/poll.h>
++#include <linux/module.h>
++#include <linux/list.h>
++#include <cluster/cnxman.h>
++#include <cluster/service.h>
++
++#include "cnxman-private.h"
++#include "sm_control.h"
++#include "sm_user.h"
++#include "config.h"
++
++#define CMAN_RELEASE_NAME "<CVS>"
++
++static int __cl_setsockopt(struct socket *sock, int level, int optname,
++ char *optval, int optlen, int flags);
++static int __cl_getsockopt(struct socket *sock, int level, int optname,
++ char *optval, int *optlen, int flags);
++static void send_to_userport(struct cl_comms_socket *csock, char *data, int len,
++ char *addr, int addrlen);
++static int cl_sendack(struct cl_comms_socket *sock, unsigned short seq,
++ int addr_len, char *addr, unsigned char remport,
++ unsigned char flag);
++static void send_listen_request(int nodeid, unsigned char port);
++static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
++ unsigned char port, unsigned short tag);
++static void resend_last_message(void);
++static void start_ack_timer(void);
++static int send_queued_message(struct queued_message *qmsg);
++static void send_port_close_oob(unsigned char port);
++static void post_close_oob(unsigned char port, int nodeid);
++static void process_barrier_msg(struct cl_barriermsg *msg,
++ struct cluster_node *node);
++static struct cl_barrier *find_barrier(char *name);
++static void node_shutdown(void);
++static void node_cleanup(void);
++static int send_or_queue_message(void *buf, int len, struct sockaddr_cl *caddr,
++ unsigned char port);
++static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur);
++static void check_for_unacked_nodes(void);
++static void free_cluster_sockets(void);
++static uint16_t generate_cluster_id(char *name);
++
++static int is_valid_temp_nodeid(int nodeid);
++
++extern int start_membership_services(pid_t);
++extern int kcl_leave_cluster(int remove);
++extern int send_kill(int nodeid);
++
++static struct proto_ops cl_proto_ops;
++static struct sock *master_sock;
++static kmem_cache_t *cluster_sk_cachep;
++
++/* Pointer to the pseudo node that maintains quorum in a two-node system */
++struct cluster_node *quorum_device = NULL;
++
++/* Array of "ports" allocated. This is just a list of pointers to the sock that
++ * has this port bound. Speed is a major issue here so 1-2K of allocated
++ * storage is worth sacrificing. Port 0 is reserved for protocol messages */
++static struct sock *port_array[256];
++static struct semaphore port_array_lock;
++
++/* Our cluster name & number */
++unsigned short cluster_id;
++char cluster_name[MAX_CLUSTER_NAME_LEN+1];
++
++/* Two-node mode: causes cluster to remain quorate if one of two nodes fails.
++ * No more than two nodes are permitted to join the cluster. */
++unsigned short two_node;
++
++/* Cluster configuration version that must be the same among members. */
++unsigned int config_version;
++
++/* Reference counting for cluster applications */
++atomic_t use_count;
++
++/* Length of sockaddr address for our comms protocol */
++unsigned int address_length;
++
++/* Message sending */
++static unsigned short cur_seq; /* Last message sent */
++static unsigned int ack_count; /* Number of acks received for message
++ * 'cur_seq' */
++static unsigned int acks_expected; /* Number of acks we expect to receive */
++static struct semaphore send_lock;
++static struct timer_list ack_timer;
++
++/* Saved packet information in case we need to resend it */
++static char saved_msg_buffer[MAX_CLUSTER_MESSAGE];
++static int saved_msg_len;
++static int retry_count;
++
++/* Task variables */
++static pid_t kcluster_pid;
++static pid_t membership_pid;
++extern int quit_threads;
++
++wait_queue_head_t cnxman_waitq;
++
++/* Variables owned by membership services */
++extern int cluster_members;
++extern struct list_head cluster_members_list;
++extern struct semaphore cluster_members_lock;
++extern int we_are_a_cluster_member;
++extern int cluster_is_quorate;
++extern struct cluster_node *us;
++extern struct list_head new_dead_node_list;
++extern struct semaphore new_dead_node_lock;
++extern char nodename[];
++
++/* A list of processes listening for membership events */
++static struct list_head event_listener_list;
++static struct semaphore event_listener_lock;
++
++/* A list of kernel callbacks listening for membership events */
++static struct list_head kernel_listener_list;
++static struct semaphore kernel_listener_lock;
++
++/* A list of sockets we are listening on (and can transmit on...later) */
++static struct list_head socket_list;
++
++/* A list of all open cluster client sockets */
++static struct list_head client_socket_list;
++static struct semaphore client_socket_lock;
++
++/* A list of all current barriers */
++static struct list_head barrier_list;
++static struct semaphore barrier_list_lock;
++
++/* When a socket is ready for reading it goes on this queue */
++static spinlock_t active_socket_lock;
++static struct list_head active_socket_list;
++
++/* If the cnxman process is running and available for work */
++atomic_t cnxman_running;
++
++/* Flags set by timers etc. for the main loop to detect and act upon */
++static unsigned long mainloop_flags;
++
++#define ACK_TIMEOUT 1
++#define RESEND_NEEDED 2
++
++/* A queue of messages waiting to be sent. If kcl_sendmsg is called outside of
++ * process context then the messages get put in here */
++static struct list_head messages_list;
++static struct semaphore messages_list_lock;
++
++static struct semaphore start_thread_sem;
++
++/* List of outstanding ISLISTENING requests */
++static struct list_head listenreq_list;
++static struct semaphore listenreq_lock;
++
++/* Any sending processes wait on this queue if necessary (e.g. inquorate,
++ * waiting for an ACK) */
++static DECLARE_WAIT_QUEUE_HEAD(socket_waitq);
++
++/* Wait for thread to exit properly */
++struct completion cluster_thread_comp;
++struct completion member_thread_comp;
++
++/* The resend delay to use, in seconds. We increase this geometrically each
++ * time a send is delayed (see start_short_timer) */
++static int resend_delay = 1;
++
++/* Highest numbered interface and the current default */
++static int num_interfaces = 0;
++static struct cl_comms_socket *current_interface = NULL;
++
++struct temp_node
++{
++ int nodeid;
++ char addr[sizeof(struct sockaddr_in6)];
++ int addrlen;
++ struct list_head list;
++};
++static struct list_head tempnode_list;
++static struct semaphore tempnode_lock;
++
++/* Wake up any processes that are waiting to send. This is usually called when
++ * all the ACKs have been gathered up or when a node has left the cluster
++ * unexpectedly and we reckon there are no more acks to collect */
++static void unjam(void)
++{
++ wake_up_interruptible(&socket_waitq);
++ wake_up_interruptible(&cnxman_waitq);
++}
++
++/* Used by the data_ready routine to locate a connection given the socket */
++static inline struct cl_comms_socket *find_comms_by_sock(struct sock *sk)
++{
++ struct list_head *conlist;
++
++ list_for_each(conlist, &socket_list) {
++ struct cl_comms_socket *clsock =
++ list_entry(conlist, struct cl_comms_socket, list);
++ if (clsock->sock->sk == sk) {
++ return clsock;
++ }
++ }
++ return NULL;
++}
++
++/* Data available on socket */
++static void cnxman_data_ready(struct sock *sk, int count_unused)
++{
++ struct cl_comms_socket *clsock = find_comms_by_sock(sk);
++
++ if (clsock == NULL) /* ASSERT ?? */
++ return;
++
++ /* If we're already on the list then don't do it again */
++ if (test_and_set_bit(1, &clsock->active))
++ return;
++
++ spin_lock_irq(&active_socket_lock);
++ list_add(&clsock->active_list, &active_socket_list);
++ spin_unlock_irq(&active_socket_lock);
++
++ wake_up_interruptible(&cnxman_waitq);
++}
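++
++/* Receive-path summary: cnxman_data_ready() is called when data arrives on
++ * one of the comms sockets, so it does no real work itself - it queues the
++ * socket on active_socket_list and wakes cluster_kthread(), which calls
++ * receive_message()/sock_recvmsg() in process context. The
++ * test_and_set_bit() on ->active keeps a socket from being queued twice
++ * before the kthread has drained it. */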
++
++static int receive_message(struct cl_comms_socket *csock, char *iobuf)
++{
++ struct msghdr msg;
++ struct iovec iov;
++ struct sockaddr_in6 sin;
++ int len;
++ mm_segment_t fs;
++
++ memset(&sin, 0, sizeof (sin));
++
++ msg.msg_control = NULL;
++ msg.msg_controllen = 0;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = &iov;
++ msg.msg_name = &sin;
++ msg.msg_namelen = sizeof (sin);
++ msg.msg_flags = 0;
++
++ iov.iov_len = MAX_CLUSTER_MESSAGE;
++ iov.iov_base = iobuf;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ len = sock_recvmsg(csock->sock, &msg, MAX_CLUSTER_MESSAGE, MSG_DONTWAIT);
++ set_fs(fs);
++
++ if (len > 0) {
++ if (len > MAX_CLUSTER_MESSAGE) {
++ printk(KERN_CRIT CMAN_NAME
++ ": %d byte message far too big\n", len);
++ return 0;
++ }
++ send_to_userport(csock, iobuf, len, msg.msg_name, msg.msg_namelen);
++ }
++ else {
++ if (len != -EAGAIN)
++ printk(KERN_CRIT CMAN_NAME ": recvmsg failed: %d\n",
++ len);
++ }
++ return len;
++}
++
++static int cluster_kthread(void *unused)
++{
++ int len;
++ char *iobuf;
++ struct list_head *socklist;
++ struct cl_comms_socket *csock;
++ wait_queue_t cnxman_waitq_head;
++ sigset_t tmpsig;
++
++ daemonize("cman_comms");
++
++ /* Block everything but SIGKILL/SIGSTOP/SIGTERM */
++	siginitsetinv(&tmpsig, sigmask(SIGKILL) | sigmask(SIGSTOP) |
++		      sigmask(SIGTERM));
++	sigprocmask(SIG_SETMASK, &tmpsig, NULL);
++
++ /* This is the waitq we can wake the process up with */
++ init_waitqueue_head(&cnxman_waitq);
++ init_waitqueue_entry(&cnxman_waitq_head, current);
++ add_wait_queue(&cnxman_waitq, &cnxman_waitq_head);
++
++ set_user_nice(current, -6);
++
++ /* Allow the sockets to start receiving */
++ list_for_each(socklist, &socket_list) {
++ csock = list_entry(socklist, struct cl_comms_socket, list);
++
++ clear_bit(1, &csock->active);
++ }
++
++ iobuf = kmalloc(MAX_CLUSTER_MESSAGE, GFP_KERNEL);
++ if (!iobuf) {
++ printk(KERN_CRIT CMAN_NAME
++ ": Cannot allocate receive buffer for cluster comms\n");
++ return -1;
++ }
++
++ complete(&cluster_thread_comp);
++
++ for (;;) {
++ struct list_head *temp;
++
++ /* Wait for activity on any of the sockets */
++ set_task_state(current, TASK_INTERRUPTIBLE);
++
++ if (list_empty(&active_socket_list))
++ schedule();
++ set_task_state(current, TASK_RUNNING);
++
++ if (quit_threads)
++ break;
++
++ if (test_and_clear_bit(ACK_TIMEOUT, &mainloop_flags)) {
++ check_for_unacked_nodes();
++ }
++
++ /* Now receive any messages waiting for us */
++ spin_lock_irq(&active_socket_lock);
++ list_for_each_safe(socklist, temp, &active_socket_list) {
++ csock =
++ list_entry(socklist, struct cl_comms_socket,
++ active_list);
++
++ list_del(&csock->active_list);
++ clear_bit(1, &csock->active);
++
++ spin_unlock_irq(&active_socket_lock);
++
++ do {
++ len = receive_message(csock, iobuf);
++ }
++ while (len > 0);
++
++ spin_lock_irq(&active_socket_lock);
++
++ if (len == 0)
++ break; /* EOF on socket */
++ }
++ spin_unlock_irq(&active_socket_lock);
++
++ /* Resend any unacked messages */
++ if (test_and_clear_bit(RESEND_NEEDED, &mainloop_flags)
++ && acks_expected) {
++ resend_last_message();
++ }
++
++ /* Send any queued messages */
++ if (acks_expected == 0) {
++ struct list_head *temp;
++ struct list_head *msglist;
++
++ down(&messages_list_lock);
++ list_for_each_safe(msglist, temp, &messages_list) {
++ struct queued_message *qmsg =
++ list_entry(msglist, struct queued_message,
++ list);
++ int status = send_queued_message(qmsg);
++
++ if (status >= 0) {
++				/* Succeeded, remove it from the queue */
++ list_del(&qmsg->list);
++ kfree(qmsg);
++ }
++ /* Did it fail horribly ?? */
++ if (status < 0 && status != -EAGAIN) {
++ printk(KERN_INFO CMAN_NAME
++ ": send_queued_message failed, error %d\n",
++ status);
++ list_del(&qmsg->list);
++ kfree(qmsg);
++ }
++ break; /* Only send one message at a time */
++ }
++ up(&messages_list_lock);
++ }
++
++ if (signal_pending(current))
++ break;
++ }
++ P_COMMS("closing down\n");
++
++ if (we_are_a_cluster_member)
++ send_leave(us->leave_reason);
++
++ kfree(iobuf);
++ quit_threads = 1; /* force other thread to die too */
++ node_shutdown();
++
++ if (timer_pending(&ack_timer))
++ del_timer(&ack_timer);
++
++ /* Wait for membership thread to die */
++ wait_for_completion(&member_thread_comp);
++
++ node_cleanup();
++
++ complete(&cluster_thread_comp);
++ return 0;
++}
++
++void notify_kernel_listeners(kcl_callback_reason reason, long arg)
++{
++ struct kernel_notify_struct *knotify;
++ struct list_head *proclist;
++
++ down(&kernel_listener_lock);
++ list_for_each(proclist, &kernel_listener_list) {
++ knotify =
++ list_entry(proclist, struct kernel_notify_struct, list);
++ knotify->callback(reason, arg);
++ }
++ up(&kernel_listener_lock);
++}
++
++static void check_for_unacked_nodes(void)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++
++ clear_bit(RESEND_NEEDED, &mainloop_flags);
++ retry_count = 0;
++
++ P_COMMS("Retry count exceeded -- looking for dead node\n");
++
++ /* Node did not ACK a message after <n> tries, remove it from the
++ * cluster */
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ P_COMMS
++ ("checking node %s: last_acked = %d, last_seq_sent = %d\n",
++ node->name, node->last_seq_acked, node->last_seq_sent);
++ if (node->state != NODESTATE_DEAD
++ && node->last_seq_acked != node->last_seq_sent && !node->us) {
++ printk(KERN_WARNING CMAN_NAME
++ ": node %s is not responding - removing from the cluster\n",
++ node->name);
++
++ /* Start a state transition */
++ a_node_just_died(node);
++ }
++ }
++ up(&cluster_members_lock);
++ acks_expected = ack_count = 0;
++ unjam();
++ return;
++}
++
++static void ack_timer_fn(unsigned long arg)
++{
++ P_COMMS("%ld: ack_timer fired, retries=%d\n", jiffies, retry_count);
++
++ /* Too many retries ? */
++ if (++retry_count > MAX_RETRIES) {
++ set_bit(ACK_TIMEOUT, &mainloop_flags);
++ wake_up_interruptible(&cnxman_waitq);
++ }
++ else {
++ /* Resend last message */
++ set_bit(RESEND_NEEDED, &mainloop_flags);
++ wake_up_interruptible(&cnxman_waitq);
++ }
++}
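++
++/* Reliable-send summary: a message that expects ACKs sets acks_expected
++ * and arms ack_timer for one second via start_ack_timer(). Each new ACK
++ * (duplicates from multipathed hosts are filtered on last_seq_acked)
++ * bumps ack_count; once they have all arrived the timer is cancelled and
++ * blocked senders are released with unjam(). If instead the timer fires
++ * more than MAX_RETRIES times, check_for_unacked_nodes() declares the
++ * silent node dead and starts a state transition. */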
++
++/* Called to resend a packet if sock_sendmsg was busy */
++static void short_timer_fn(unsigned long arg)
++{
++ P_COMMS("short_timer fired\n");
++
++ /* Resend last message */
++ resend_delay <<= 1;
++ set_bit(RESEND_NEEDED, &mainloop_flags);
++ wake_up_interruptible(&cnxman_waitq);
++}
++
++static void start_ack_timer(void)
++{
++ ack_timer.function = ack_timer_fn;
++ ack_timer.data = 0L;
++ mod_timer(&ack_timer, jiffies + HZ);
++}
++
++static void start_short_timer(void)
++{
++ ack_timer.function = short_timer_fn;
++ ack_timer.data = 0L;
++ mod_timer(&ack_timer, jiffies + (resend_delay * HZ));
++}
++
++
++static struct cl_waiting_listen_request *find_listen_request(unsigned short tag)
++{
++ struct list_head *llist;
++ struct cl_waiting_listen_request *listener;
++
++ down(&listenreq_lock);
++ list_for_each(llist, &listenreq_list) {
++ listener =
++ list_entry(llist, struct cl_waiting_listen_request, list);
++ if (listener->tag == tag) {
++ up(&listenreq_lock);
++ return listener;
++ }
++ }
++ up(&listenreq_lock);
++ return NULL;
++}
++
++static void process_cnxman_message(struct cl_comms_socket *csock, char *data,
++ int len, char *addr, int addrlen,
++ struct cluster_node *rem_node)
++{
++ struct cl_protmsg *msg = (struct cl_protmsg *) data;
++ struct cl_protheader *header = (struct cl_protheader *) data;
++ struct cl_ackmsg *ackmsg;
++ struct cl_listenmsg *listenmsg;
++ struct cl_closemsg *closemsg;
++ struct cl_barriermsg *barriermsg;
++ struct cl_waiting_listen_request *listen_request;
++
++ P_COMMS("Message on port 0 is %d\n", msg->cmd);
++ switch (msg->cmd) {
++ case CLUSTER_CMD_ACK:
++ ackmsg = (struct cl_ackmsg *) data;
++
++		if (ackmsg->aflags & 1) {
++			if (net_ratelimit())
++				printk(KERN_INFO CMAN_NAME
++				       ": WARNING no listener for port %d on node %s\n",
++				       ackmsg->remport,
++				       rem_node ? rem_node->name : "Unknown");
++		}
++ P_COMMS("Got ACK from %s. seq=%d (cur=%d)\n",
++ rem_node ? rem_node->name : "Unknown",
++ le16_to_cpu(ackmsg->seq), cur_seq);
++
++ if (rem_node && rem_node->state != NODESTATE_DEAD) {
++ /* This copes with duplicate acks from a multipathed
++ * host */
++ if (rem_node->last_seq_acked !=
++ le16_to_cpu(ackmsg->seq)) {
++ rem_node->last_seq_acked =
++ le16_to_cpu(ackmsg->seq);
++
++ /* Got em all */
++ if (++ack_count >= acks_expected) {
++
++ /* Cancel the timer */
++ del_timer(&ack_timer);
++ acks_expected = 0;
++ unjam();
++ }
++ }
++ }
++ else {
++ if (cluster_members) {
++#ifdef DEBUG_COMMS
++ char buf[MAX_ADDR_PRINTED_LEN];
++
++ printk(KERN_INFO CMAN_NAME
++ ": got ack from unknown or dead node: %s\n",
++ print_addr(addr, addrlen, buf));
++#endif
++ }
++ }
++ break;
++
++ /* Return 1 if we have a listener on this port, 0 if not */
++ case CLUSTER_CMD_LISTENREQ:
++ listenmsg =
++ (struct cl_listenmsg *) (data +
++ sizeof (struct cl_protheader));
++ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
++ send_listen_response(csock, le32_to_cpu(header->srcid),
++ listenmsg->target_port, listenmsg->tag);
++ break;
++
++ case CLUSTER_CMD_LISTENRESP:
++ /* Wake up process waiting for listen response */
++ listenmsg =
++ (struct cl_listenmsg *) (data +
++ sizeof (struct cl_protheader));
++ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
++ listen_request = find_listen_request(listenmsg->tag);
++ if (listen_request) {
++ listen_request->result = listenmsg->listening;
++ listen_request->waiting = 0;
++ wake_up_interruptible(&listen_request->waitq);
++ }
++ break;
++
++ case CLUSTER_CMD_PORTCLOSED:
++ closemsg =
++ (struct cl_closemsg *) (data +
++ sizeof (struct cl_protheader));
++ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
++ post_close_oob(closemsg->port, le32_to_cpu(header->srcid));
++ break;
++
++ case CLUSTER_CMD_BARRIER:
++ barriermsg =
++ (struct cl_barriermsg *) (data +
++ sizeof (struct cl_protheader));
++ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
++ process_barrier_msg(barriermsg, rem_node);
++ break;
++
++ default:
++ printk(KERN_ERR CMAN_NAME
++ ": Unknown protocol message %d received\n", msg->cmd);
++ break;
++
++ }
++ return;
++}
++
++static void send_to_userport(struct cl_comms_socket *csock, char *data, int len,
++ char *addr, int addrlen)
++{
++ int err;
++ struct cl_protheader *header = (struct cl_protheader *) data;
++ struct cluster_node *rem_node =
++ find_node_by_nodeid(le32_to_cpu(header->srcid));
++ struct sk_buff *skb = NULL;
++
++ P_COMMS
++ ("seen message, from %d for %d, sequence num = %d, rem_node=%p, state=%d\n",
++ le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
++ le16_to_cpu(header->seq), rem_node,
++ rem_node ? rem_node->state : -1);
++
++ /* If the remote end is being coy about its node ID then look it up by
++ * address */
++ if (!rem_node && header->srcid == 0) {
++ rem_node = find_node_by_addr(addr, addrlen);
++ }
++
++ /* If this node is an ex-member then treat it as unknown */
++ if (rem_node && rem_node->state != NODESTATE_MEMBER
++ && rem_node->state != NODESTATE_JOINING)
++ rem_node = NULL;
++
++ /* Ignore messages not for our cluster */
++ if (le16_to_cpu(header->cluster) != cluster_id) {
++ P_COMMS("Dumping message - wrong cluster ID (us=%d, msg=%d)\n",
++ cluster_id, header->cluster);
++ goto userport_finish;
++ }
++
++ /* If the message is from us then just dump it */
++ if (rem_node && rem_node->us)
++ goto userport_finish;
++
++ /* If we can't find the nodeid then check for our own messages the hard
++ * way - this only happens during joining */
++ if (!rem_node) {
++ struct list_head *socklist;
++ struct cl_comms_socket *clsock;
++
++ list_for_each(socklist, &socket_list) {
++ clsock =
++ list_entry(socklist, struct cl_comms_socket, list);
++
++ if (clsock->recv_only) {
++
++ if (memcmp(addr, &clsock->saddr, address_length) == 0) {
++ goto userport_finish;
++ }
++ }
++ }
++
++ }
++
++ /* Ignore messages not for us */
++ if (le32_to_cpu(header->tgtid) > 0 && us
++ && le32_to_cpu(header->tgtid) != us->node_id) {
++ goto userport_finish;
++ }
++
++ P_COMMS("got message, from %d for %d, sequence num = %d\n",
++ le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
++ le16_to_cpu(header->seq));
++
++ /* Have we received this message before ? If so just ignore it, it's a
++ * resend for someone else's benefit */
++ if (!(header->flags & (MSG_NOACK >> 16)) &&
++ rem_node && le16_to_cpu(header->seq) == rem_node->last_seq_recv) {
++ P_COMMS
++ ("Discarding message - Already seen this sequence number %d\n",
++ rem_node->last_seq_recv);
++ /* Still need to ACK it though, in case it was the ACK that got
++ * lost */
++ cl_sendack(csock, header->seq, addrlen, addr, header->port, 0);
++ goto userport_finish;
++ }
++
++ /* If it's a new node then assign it a temporary node ID */
++ if (!rem_node)
++ header->srcid = cpu_to_le32(new_temp_nodeid(addr, addrlen));
++
++ P_COMMS("Got message: flags = %x, port = %d, we_are_a_member = %d\n",
++ header->flags, header->port, we_are_a_cluster_member);
++
++
++ /* If we are not part of the cluster then ignore multicast messages
++ * that need an ACK as we will confuse the sender who is only expecting
++ * ACKS from bona fide members */
++ if (header->flags & (MSG_MULTICAST >> 16) &&
++ !(header->flags & (MSG_NOACK >> 16)) && !we_are_a_cluster_member) {
++ P_COMMS
++ ("Discarding message - multicast and we are not a cluster member. port=%d flags=%x\n",
++ header->port, header->flags);
++ goto userport_finish;
++ }
++
++ /* Save the sequence number of this message so we can ignore duplicates
++ * (above) */
++ if (!(header->flags & (MSG_NOACK >> 16)) && rem_node) {
++ P_COMMS("Saving seq %d for node %s\n", le16_to_cpu(header->seq),
++ rem_node->name);
++ rem_node->last_seq_recv = le16_to_cpu(header->seq);
++ }
++
++ /* Is it a protocol message? */
++ if (header->port == 0) {
++ process_cnxman_message(csock, data, len, addr, addrlen,
++ rem_node);
++ goto userport_finish;
++ }
++
++ /* Skip past the header to the data */
++ data += sizeof (struct cl_protheader);
++ len -= sizeof (struct cl_protheader);
++
++ /* Get the port number and look for a listener */
++ down(&port_array_lock);
++ if (port_array[header->port]) {
++ int native_srcid;
++ struct cluster_sock *c = cluster_sk(port_array[header->port]);
++
++ /* ACK it */
++ if (!(header->flags & (MSG_NOACK >> 16)))
++ cl_sendack(csock, header->seq, addrlen, addr,
++ header->port, 0);
++
++ /* Call a callback if there is one */
++ if (c->kernel_callback) {
++ up(&port_array_lock);
++ c->kernel_callback(data, len, addr, addrlen,
++ le32_to_cpu(header->srcid));
++ goto userport_finish;
++ }
++
++ /* Otherwise put it into an SKB and pass it onto the recvmsg
++ * mechanism */
++ skb = alloc_skb(len, GFP_KERNEL);
++ if (!skb) {
++ up(&port_array_lock);
++ printk(KERN_INFO CMAN_NAME
++ ": Failed to allocate skb\n");
++ return;
++ }
++
++ skb_put(skb, len);
++ memcpy(skb->data, data, len);
++
++ /* Put the nodeid into cb so we can pass it to the clients */
++ skb->cb[0] = 0; /* Clear flags */
++ native_srcid = le32_to_cpu(header->srcid);
++ memcpy(skb->cb + 1, &native_srcid, sizeof(int));
++
++ if ((err =
++ sock_queue_rcv_skb(port_array[header->port], skb)) < 0) {
++
++ printk(KERN_INFO CMAN_NAME
++ ": Error queueing request to port %d: %d\n",
++ header->port, err);
++ kfree_skb(skb);
++
++ /* If the port was MEMBERSHIP then we have to die */
++ if (header->port == CLUSTER_PORT_MEMBERSHIP) {
++ up(&port_array_lock);
++ send_leave(CLUSTER_LEAVEFLAG_PANIC);
++ panic("membership stopped responding");
++ }
++ }
++ up(&port_array_lock);
++
++ }
++ else {
++ /* ACK it, but set the flag bit so remote end knows no-one
++ * caught it */
++ if (!(header->flags & (MSG_NOACK >> 16)))
++ cl_sendack(csock, header->seq, addrlen, addr,
++ header->port, 1);
++
++ /* Nobody listening, drop it */
++ up(&port_array_lock);
++ }
++
++ userport_finish:
++ return;
++}
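++
++/* Dispatch summary: port 0 traffic is consumed in-kernel by
++ * process_cnxman_message(); anything else is matched against port_array[]
++ * and either handed to the bound socket's kernel_callback or queued as an
++ * skb for recvmsg(), with the sender's nodeid converted from little-endian
++ * and stashed in skb->cb so clients can see who sent it. */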
++
++static struct sock *cl_alloc_sock(struct socket *sock, int gfp)
++{
++ struct sock *sk;
++ struct cluster_sock *c;
++
++ if ((sk =
++ sk_alloc(AF_CLUSTER, gfp, sizeof (struct cluster_sock),
++ cluster_sk_cachep)) == NULL)
++ goto no_sock;
++
++ if (sock) {
++ sock->ops = &cl_proto_ops;
++ }
++ sock_init_data(sock, sk);
++
++ sk->sk_destruct = NULL;
++ sk->sk_no_check = 1;
++ sk->sk_family = PF_CLUSTER;
++ sk->sk_allocation = gfp;
++
++ c = cluster_sk(sk);
++ c->port = 0;
++ c->service_data = NULL;
++
++ return sk;
++ no_sock:
++ return NULL;
++}
++
++static int cl_release(struct socket *sock)
++{
++ struct sock *sk = sock->sk;
++ struct cl_client_socket *csock;
++ struct list_head *socklist;
++ struct list_head *tmp;
++
++ down(&client_socket_lock);
++ if (sk) {
++ /* Remove port allocations if it's a bound socket */
++ struct cluster_sock *c = cluster_sk(sk);
++
++ down(&port_array_lock);
++ if (c->port) {
++ port_array[c->port] = NULL;
++ }
++ up(&port_array_lock);
++
++ /* Tell other nodes in the cluster that this listener is going
++ * away */
++ if (atomic_read(&cnxman_running) && c->port)
++ send_port_close_oob(c->port);
++
++ if (c->service_data)
++ sm_sock_release(sock);
++
++ /* Master socket released ? */
++ if (sk->sk_protocol == CLPROTO_MASTER) {
++ master_sock = NULL;
++
++ /* If this socket is being freed and cnxman is not
++ * started then free all the comms sockets as either
++ * the userland "join" process has crashed or the
++ * join failed.
++ */
++ if (!atomic_read(&cnxman_running)) {
++ quit_threads = 1;
++ free_cluster_sockets();
++ }
++ }
++
++ sock_orphan(sk);
++ sock_hold(sk);
++ lock_sock(sk);
++ release_sock(sk);
++ sock_put(sk);
++ sock_put(sk);
++ sock->sk = NULL;
++ }
++
++ /* Remove it from the list of clients */
++ list_for_each_safe(socklist, tmp, &client_socket_list) {
++ csock = list_entry(socklist, struct cl_client_socket, list);
++
++ if (csock->sock == sock) {
++ list_del(&csock->list);
++ kfree(csock);
++ break;
++ }
++ }
++ up(&client_socket_lock);
++
++ return 0;
++}
++
++static int cl_create(struct socket *sock, int protocol)
++{
++ struct sock *sk;
++
++ /* All are datagrams */
++ if (sock->type != SOCK_DGRAM)
++ return -ESOCKTNOSUPPORT;
++
++ if (protocol == CLPROTO_MASTER && !capable(CAP_CLUSTER))
++ return -EPERM;
++
++ /* Can only have one master socket */
++ if (master_sock && protocol == CLPROTO_MASTER)
++ return -EBUSY;
++
++ /* cnxman not running and a client was requested */
++ if (!atomic_read(&cnxman_running) && protocol != CLPROTO_MASTER)
++ return -ENETDOWN;
++
++ if ((sk = cl_alloc_sock(sock, GFP_KERNEL)) == NULL)
++ return -ENOBUFS;
++
++ sk->sk_protocol = protocol;
++
++ if (protocol == CLPROTO_MASTER)
++ master_sock = sk;
++
++ /* Add client sockets to the list */
++ if (protocol == CLPROTO_CLIENT) {
++ struct cl_client_socket *clsock =
++ kmalloc(sizeof (struct cl_client_socket), GFP_KERNEL);
++ if (!clsock) {
++ cl_release(sock);
++ return -ENOMEM;
++ }
++ clsock->sock = sock;
++ down(&client_socket_lock);
++ list_add(&clsock->list, &client_socket_list);
++ up(&client_socket_lock);
++ }
++
++ return 0;
++}
++
++static int cl_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
++{
++ struct sock *sk = sock->sk;
++ struct sockaddr_cl *saddr = (struct sockaddr_cl *) uaddr;
++ struct cluster_sock *c = cluster_sk(sk);
++
++ if (!capable(CAP_NET_BIND_SERVICE))
++ return -EPERM;
++
++ if (sk->sk_zapped == 0)
++ return -EINVAL;
++
++ if (addr_len != sizeof (struct sockaddr_cl))
++ return -EINVAL;
++
++ if (saddr->scl_family != AF_CLUSTER)
++ return -EINVAL;
++
++ if (saddr->scl_port == 0)
++ return -EINVAL; /* Port 0 is reserved for protocol messages */
++
++ down(&port_array_lock);
++
++ if (port_array[saddr->scl_port]) {
++ up(&port_array_lock);
++ return -EADDRINUSE;
++ }
++
++ port_array[saddr->scl_port] = sk;
++
++ up(&port_array_lock);
++
++ c->port = saddr->scl_port;
++ sk->sk_zapped = 0;
++
++ /* If we are not a cluster member yet then make the client wait until
++ * we are, this allows nodes to start cluster clients at the same time
++ * as cluster services but they will wait until membership is achieved.
++ * This looks odd in bind() (open would seem more obvious) but we need
++ * to know which port number is being used so that things like
++ * membership services don't get blocked
++ */
++
++ if (saddr->scl_port > HIGH_PROTECTED_PORT)
++ while (!we_are_a_cluster_member || !cluster_is_quorate
++ || in_transition()) {
++ DECLARE_WAITQUEUE(wq, current);
++ struct task_struct *tsk = current;
++
++ set_task_state(tsk, TASK_INTERRUPTIBLE);
++ add_wait_queue(&socket_waitq, &wq);
++
++ if (!we_are_a_cluster_member || !cluster_is_quorate
++ || in_transition())
++ schedule();
++
++ set_task_state(tsk, TASK_RUNNING);
++ remove_wait_queue(&socket_waitq, &wq);
++
++ /* We were woken up because the cluster is going down,
++ * ...and we never got a chance to do any work! (sob) */
++ if (atomic_read(&cnxman_running) == 0 || quit_threads) {
++ return -ENOTCONN;
++ }
++ }
++
++ return 0;
++}
++
++static int cl_getname(struct socket *sock, struct sockaddr *uaddr,
++ int *uaddr_len, int peer)
++{
++ struct sockaddr_cl *sa = (struct sockaddr_cl *) uaddr;
++ struct sock *sk = sock->sk;
++ struct cluster_sock *c = cluster_sk(sk);
++
++ *uaddr_len = sizeof (struct sockaddr_cl);
++
++ lock_sock(sk);
++
++ sa->scl_port = c->port;
++ sa->scl_flags = 0;
++ sa->scl_family = AF_CLUSTER;
++
++ release_sock(sk);
++
++ return 0;
++}
++
++static unsigned int cl_poll(struct file *file, struct socket *sock,
++ poll_table * wait)
++{
++ return datagram_poll(file, sock, wait);
++}
++
++/* Copy internal node format to userland format */
++void copy_to_usernode(struct cluster_node *node,
++ struct cl_cluster_node *unode)
++{
++ strcpy(unode->name, node->name);
++ unode->size = sizeof (struct cl_cluster_node);
++ unode->votes = node->votes;
++ unode->state = node->state;
++ unode->us = node->us;
++ unode->node_id = node->node_id;
++ unode->leave_reason = node->leave_reason;
++ unode->incarnation = node->incarnation;
++}
++
++/* ioctl processing functions */
++
++static int do_ioctl_set_version(unsigned long arg)
++{
++ struct cl_version version, *u_version;
++
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++ if (arg == 0)
++ return -EINVAL;
++
++ u_version = (struct cl_version *) arg;
++
++ if (copy_from_user(&version, u_version, sizeof(struct cl_version)))
++ return -EFAULT;
++
++ if (version.major != CNXMAN_MAJOR_VERSION ||
++ version.minor != CNXMAN_MINOR_VERSION ||
++ version.patch != CNXMAN_PATCH_VERSION)
++ return -EINVAL;
++
++ if (config_version == version.config)
++ return 0;
++
++ config_version = version.config;
++ send_reconfigure(RECONFIG_PARAM_CONFIG_VERSION, config_version);
++ return 0;
++}
++
++static int do_ioctl_get_members(unsigned long arg)
++{
++ struct cluster_node *node;
++ /* Kernel copies */
++ struct cl_cluster_node user_format_node;
++ struct cl_cluster_nodelist user_format_nodelist;
++ /* User space array ptr */
++ struct cl_cluster_node *user_node;
++ struct list_head *nodelist;
++ int num_nodes = 0;
++
++ if (arg == 0)
++ return cluster_members;
++
++ if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
++ return -EFAULT;
++
++ down(&cluster_members_lock);
++
++ if (user_format_nodelist.max_members < cluster_members) {
++ up(&cluster_members_lock);
++ return -E2BIG;
++ }
++
++ user_node = user_format_nodelist.nodes;
++
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++ if (node->state == NODESTATE_MEMBER) {
++ copy_to_usernode(node, &user_format_node);
++ if (copy_to_user(user_node, &user_format_node,
++ sizeof (struct cl_cluster_node))) {
++ up(&cluster_members_lock);
++ return -EFAULT;
++ }
++ user_node++;
++ num_nodes++;
++ }
++ }
++ up(&cluster_members_lock);
++
++ return num_nodes;
++}
++
++static int do_ioctl_get_all_members(unsigned long arg)
++{
++ struct cluster_node *node;
++ /* Kernel copies */
++ struct cl_cluster_node user_format_node;
++ struct cl_cluster_nodelist user_format_nodelist;
++ /* User space array ptr*/
++ struct cl_cluster_node *user_node;
++ struct list_head *nodelist;
++ int num_nodes = 0;
++
++ if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
++ return -EFAULT;
++
++ down(&cluster_members_lock);
++
++ user_node = user_format_nodelist.nodes;
++
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++		if (arg) {
++			/* Check space first so we never write past the end
++			 * of the user's array */
++			if (--user_format_nodelist.max_members < 0) {
++				num_nodes = -E2BIG;
++				goto err_exit;
++			}
++
++			copy_to_usernode(node, &user_format_node);
++
++			if (copy_to_user(user_node, &user_format_node,
++					 sizeof (struct cl_cluster_node))) {
++				up(&cluster_members_lock);
++				return -EFAULT;
++			}
++			user_node++;
++		}
++ num_nodes++;
++ }
++ err_exit:
++ up(&cluster_members_lock);
++
++ return num_nodes;
++}
++
++static int do_ioctl_get_node(unsigned long arg)
++{
++ struct cluster_node *node;
++ struct cl_cluster_node k_node, *u_node;
++
++ u_node = (struct cl_cluster_node *) arg;
++
++ if (copy_from_user(&k_node, u_node, sizeof(struct cl_cluster_node)))
++ return -EFAULT;
++
++ if (k_node.node_id)
++ node = find_node_by_nodeid(k_node.node_id);
++ else
++ node = find_node_by_name(k_node.name);
++
++ if (!node)
++ return -ENOENT;
++
++ copy_to_usernode(node, &k_node);
++
++ if (copy_to_user(u_node, &k_node, sizeof(struct cl_cluster_node)))
++ return -EFAULT;
++
++ return 0;
++}
++
++static int do_ioctl_set_expected(unsigned long arg)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ unsigned int total_votes;
++ unsigned int newquorum;
++
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++ if (arg == 0)
++ return -EINVAL;
++
++ newquorum = calculate_quorum(1, arg, &total_votes);
++
++ if (newquorum < total_votes / 2
++ || newquorum > total_votes) {
++ return -EINVAL;
++ }
++
++ /* Now do it */
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++ if (node->state == NODESTATE_MEMBER
++ && node->expected_votes > arg) {
++ node->expected_votes = arg;
++ }
++ }
++ up(&cluster_members_lock);
++
++ recalculate_quorum(1);
++
++ send_reconfigure(RECONFIG_PARAM_EXPECTED_VOTES, arg);
++ sm_member_update(cluster_is_quorate);
++
++ return 0;
++}
++
++static int do_ioctl_kill_node(unsigned long arg)
++{
++ struct cluster_node *node;
++
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++
++ if ((node = find_node_by_nodeid(arg)) == NULL)
++ return -EINVAL;
++
++ /* Can't kill us */
++ if (node->us)
++ return -EINVAL;
++
++ if (node->state != NODESTATE_MEMBER)
++ return -EINVAL;
++
++ /* Just in case it is alive, send a KILL message */
++ send_kill(arg);
++
++ node->leave_reason = CLUSTER_LEAVEFLAG_KILLED;
++ a_node_just_died(node);
++
++ return 0;
++}
++
++static int do_ioctl_barrier(unsigned long arg)
++{
++ struct cl_barrier_info info;
++
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ if (copy_from_user(&info, (void *)arg, sizeof(info)) != 0)
++ return -EFAULT;
++
++ switch (info.cmd) {
++ case BARRIER_IOCTL_REGISTER:
++ return kcl_barrier_register(info.name,
++ info.flags,
++ info.arg);
++ case BARRIER_IOCTL_CHANGE:
++ return kcl_barrier_setattr(info.name,
++ info.flags,
++ info.arg);
++ case BARRIER_IOCTL_WAIT:
++ return kcl_barrier_wait(info.name);
++ case BARRIER_IOCTL_DELETE:
++ return kcl_barrier_delete(info.name);
++ default:
++ return -EINVAL;
++ }
++}
++
++static int do_ioctl_islistening(unsigned long arg)
++{
++ DECLARE_WAITQUEUE(wq, current);
++ struct cl_listen_request rq;
++ struct cluster_node *rem_node;
++ int nodeid;
++ int result;
++ struct cl_waiting_listen_request *listen_request;
++
++ if (!arg)
++ return -EINVAL;
++
++ if (copy_from_user(&rq, (void *) arg, sizeof (rq)) != 0)
++ return -EFAULT;
++
++ nodeid = rq.nodeid;
++
++ rem_node = find_node_by_nodeid(nodeid);
++
++ /* Node not in the cluster */
++ if (!rem_node)
++ return -ENOENT;
++
++ if (rem_node->state != NODESTATE_MEMBER)
++ return -ENOTCONN;
++
++ /* If the request is for us then just look in the ports
++ * array */
++ if (nodeid == us->node_id)
++ return (port_array[rq.port] != 0) ? 1 : 0;
++
++ /* For a remote node we need to send a request out */
++
++ /* If we are in transition then wait until we are not */
++ while (in_transition()) {
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ add_wait_queue(&socket_waitq, &wq);
++
++ if (in_transition())
++ schedule();
++
++ set_task_state(current, TASK_RUNNING);
++ remove_wait_queue(&socket_waitq, &wq);
++
++ if (signal_pending(current))
++ return -EINTR;
++ }
++
++ /* Were we shut down before it completed ? */
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++
++ listen_request =
++ kmalloc(sizeof (struct cl_waiting_listen_request),
++ GFP_KERNEL);
++ if (!listen_request)
++ return -ENOMEM;
++
++ /* Build the request */
++ listen_request->waiting = 1;
++ listen_request->result = 0;
++ listen_request->tag = current->pid;
++ listen_request->nodeid = nodeid;
++ init_waitqueue_head(&listen_request->waitq);
++
++ down(&listenreq_lock);
++ list_add(&listen_request->list, &listenreq_list);
++ up(&listenreq_lock);
++
++ /* Now wait for the response to come back */
++ send_listen_request(rq.nodeid, rq.port);
++
++ while (listen_request->waiting) {
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ add_wait_queue(&listen_request->waitq, &wq);
++
++ if (listen_request->waiting)
++ schedule();
++
++ set_task_state(current, TASK_RUNNING);
++ remove_wait_queue(&listen_request->waitq, &wq);
++
++ if (signal_pending(current)) {
++ list_del(&listen_request->list);
++ kfree(listen_request);
++ return -ERESTARTSYS;
++ }
++ }
++ result = listen_request->result;
++ list_del(&listen_request->list);
++ kfree(listen_request);
++ return result;
++}
++
++static int do_ioctl_set_votes(unsigned long arg)
++{
++ unsigned int total_votes;
++ unsigned int newquorum;
++ int saved_votes;
++
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ /* Check votes is valid */
++ saved_votes = us->votes;
++ us->votes = arg;
++
++ newquorum = calculate_quorum(1, 0, &total_votes);
++
++ if (newquorum < total_votes / 2 || newquorum > total_votes) {
++ us->votes = saved_votes;
++ return -EINVAL;
++ }
++
++ recalculate_quorum(1);
++
++ send_reconfigure(RECONFIG_PARAM_NODE_VOTES, arg);
++
++ return 0;
++}
++
++static int cl_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
++{
++ int err = -EOPNOTSUPP;
++ struct list_head *proclist;
++ struct list_head *tmp;
++ struct notify_struct *notify;
++ struct cl_version cnxman_version;
++
++ switch (cmd) {
++ /* Process requests notification of cluster events */
++ case SIOCCLUSTER_NOTIFY:
++ notify = kmalloc(sizeof (struct notify_struct), GFP_KERNEL);
++ if (!notify)
++ return -ENOMEM;
++ notify->pid = current->pid;
++ notify->signal = arg;
++ down(&event_listener_lock);
++		list_add(&notify->list, &event_listener_list);
++ up(&event_listener_lock);
++ err = 0;
++ break;
++
++	/* Process is no longer interested in cluster events */
++	case SIOCCLUSTER_REMOVENOTIFY:
++		err = -EINVAL;
++
++ down(&event_listener_lock);
++ list_for_each_safe(proclist, tmp, &event_listener_list) {
++ notify =
++ list_entry(proclist, struct notify_struct, list);
++ if (notify->pid == current->pid) {
++				list_del(&notify->list);
++ kfree(notify);
++ err = 0;
++ }
++ }
++ up(&event_listener_lock);
++ break;
++
++ /* Return the cnxman version number */
++ case SIOCCLUSTER_GET_VERSION:
++ if (!arg)
++ return -EINVAL;
++ err = 0;
++ cnxman_version.major = CNXMAN_MAJOR_VERSION;
++ cnxman_version.minor = CNXMAN_MINOR_VERSION;
++ cnxman_version.patch = CNXMAN_PATCH_VERSION;
++ if (copy_to_user((void *) arg, &cnxman_version,
++ sizeof (struct cl_version))) {
++ return -EFAULT;
++ }
++ break;
++
++ /* Set the cnxman config version number */
++ case SIOCCLUSTER_SET_VERSION:
++ err = do_ioctl_set_version(arg);
++ break;
++
++ /* Return the active membership list */
++ case SIOCCLUSTER_GETMEMBERS:
++ err = do_ioctl_get_members(arg);
++ break;
++
++	/* Return the full membership list, including dead nodes */
++ case SIOCCLUSTER_GETALLMEMBERS:
++ err = do_ioctl_get_all_members(arg);
++ break;
++
++ case SIOCCLUSTER_GETNODE:
++ err = do_ioctl_get_node(arg);
++ break;
++
++ case SIOCCLUSTER_ISQUORATE:
++ return cluster_is_quorate;
++
++ case SIOCCLUSTER_ISACTIVE:
++ return atomic_read(&cnxman_running);
++
++ case SIOCCLUSTER_SETEXPECTED_VOTES:
++ err = do_ioctl_set_expected(arg);
++ break;
++
++ /* Change the number of votes for this node */
++ case SIOCCLUSTER_SET_VOTES:
++ err = do_ioctl_set_votes(arg);
++ break;
++
++ /* Return 1 if the specified node is listening on a given port */
++ case SIOCCLUSTER_ISLISTENING:
++ err = do_ioctl_islistening(arg);
++ break;
++
++ /* Forcibly kill a node */
++ case SIOCCLUSTER_KILLNODE:
++ err = do_ioctl_kill_node(arg);
++ break;
++
++ case SIOCCLUSTER_GET_JOINCOUNT:
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++ else
++ return atomic_read(&use_count);
++
++ /* ioctl interface to the barrier system */
++ case SIOCCLUSTER_BARRIER:
++ err = do_ioctl_barrier(arg);
++ break;
++
++ default:
++ err = sm_ioctl(sock, cmd, arg);
++ }
++ return err;
++}
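++
++/*
++ * Illustrative userspace sketch (not part of this patch): ask cnxman to
++ * raise SIGUSR1 in this process on cluster events, then cancel the
++ * notification.  The fd is hypothetical; the semantics follow the
++ * SIOCCLUSTER_NOTIFY/REMOVENOTIFY cases above.
++ *
++ *	ioctl(cluster_fd, SIOCCLUSTER_NOTIFY, SIGUSR1);
++ *	...
++ *	ioctl(cluster_fd, SIOCCLUSTER_REMOVENOTIFY, 0);
++ */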
++
++static int cl_shutdown(struct socket *sock, int how)
++{
++ struct sock *sk = sock->sk;
++ int err = -ENOTCONN;
++
++ lock_sock(sk);
++
++ if (sock->state == SS_UNCONNECTED)
++ goto out;
++
++ err = 0;
++ if (sock->state == SS_DISCONNECTING)
++ goto out;
++
++ err = -EINVAL;
++
++ if (how != SHUTDOWN_MASK)
++ goto out;
++
++ sk->sk_shutdown = how;
++ err = 0;
++
++ out:
++ release_sock(sk);
++
++ return err;
++}
++
++static int cl_setsockopt(struct socket *sock, int level, int optname,
++ char *optval, int optlen)
++{
++ struct sock *sk = sock->sk;
++ int err;
++
++ if (sk != master_sock)
++ return -EPERM;
++
++ lock_sock(sk);
++ err = __cl_setsockopt(sock, level, optname, optval, optlen, 0);
++ release_sock(sk);
++
++ return err;
++}
++
++static int add_clsock(int broadcast, int number, struct socket *sock,
++ struct file *file)
++{
++ struct cl_comms_socket *newsock =
++ kmalloc(sizeof (struct cl_comms_socket), GFP_KERNEL);
++ if (!newsock)
++ return -ENOMEM;
++
++ memset(newsock, 0, sizeof (*newsock));
++ newsock->number = number;
++ newsock->sock = sock;
++ if (broadcast) {
++ newsock->broadcast = 1;
++ newsock->recv_only = 0;
++ }
++ else {
++ newsock->broadcast = 0;
++ newsock->recv_only = 1;
++ }
++
++ newsock->file = file;
++ newsock->addr_len = sizeof(struct sockaddr_in6);
++
++ /* Mark it active until cnxman thread is running and ready to process
++ * messages */
++ set_bit(1, &newsock->active);
++
++ /* Find out what it's bound to */
++ newsock->sock->ops->getname(newsock->sock,
++ (struct sockaddr *)&newsock->saddr,
++ &newsock->addr_len, 0);
++
++ num_interfaces = max(num_interfaces, newsock->number);
++ if (!current_interface && newsock->broadcast)
++ current_interface = newsock;
++
++ /* Hook data_ready */
++ newsock->sock->sk->sk_data_ready = cnxman_data_ready;
++
++ /* Make an attempt to keep them in order */
++ list_add_tail(&newsock->list, &socket_list);
++
++ address_length = newsock->addr_len;
++ return 0;
++}
++
++static int __cl_setsockopt(struct socket *sock, int level, int optname,
++ char *optval, int optlen, int flags)
++{
++ struct file *file;
++ struct cl_join_cluster_info join_info;
++ int error;
++ int leave_flags;
++ struct cl_multicast_sock multicast_info;
++
++ if (optlen && !optval)
++ return -EINVAL;
++
++ switch (optname) {
++ case CLU_SET_MULTICAST:
++ case CLU_SET_RCVONLY:
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ if (optlen != sizeof (struct cl_multicast_sock))
++ return -EINVAL;
++
++ if (atomic_read(&cnxman_running))
++ return -EINVAL;
++
++ error = -EBADF;
++
++ if (copy_from_user(&multicast_info, optval, optlen))
++ return -EFAULT;
++
++ file = fget(multicast_info.fd);
++ if (file) {
++ struct inode *inode = file->f_dentry->d_inode;
++
++ error =
++ add_clsock(optname == CLU_SET_MULTICAST,
++ multicast_info.number, SOCKET_I(inode),
++ file);
++ if (error)
++ fput(file);
++ }
++ return error;
++
++ case CLU_SET_NODENAME:
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ if (atomic_read(&cnxman_running))
++ return -EINVAL;
++
++ if (optlen > MAX_CLUSTER_MEMBER_NAME_LEN)
++ return -EINVAL;
++
++ if (copy_from_user(nodename, optval, optlen))
++ return -EFAULT;
++ break;
++
++ case CLU_JOIN_CLUSTER:
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ if (atomic_read(&cnxman_running))
++ return -EALREADY;
++
++ if (optlen != sizeof (struct cl_join_cluster_info))
++ return -EINVAL;
++
++ if (copy_from_user(&join_info, optval, optlen))
++ return -EFAULT;
++
++ if (strlen(join_info.cluster_name) > MAX_CLUSTER_NAME_LEN)
++ return -EINVAL;
++
++ if (list_empty(&socket_list))
++ return -ENOTCONN;
++
++ set_votes(join_info.votes, join_info.expected_votes);
++ cluster_id = generate_cluster_id(join_info.cluster_name);
++ strncpy(cluster_name, join_info.cluster_name, MAX_CLUSTER_NAME_LEN);
++ two_node = join_info.two_node;
++ config_version = join_info.config_version;
++
++ quit_threads = 0;
++ acks_expected = 0;
++ init_completion(&cluster_thread_comp);
++ init_completion(&member_thread_comp);
++ if (allocate_nodeid_array())
++ return -ENOMEM;
++
++ kcluster_pid = kernel_thread(cluster_kthread, NULL, 0);
++ if (kcluster_pid < 0)
++ return kcluster_pid;
++
++ wait_for_completion(&cluster_thread_comp);
++ init_completion(&cluster_thread_comp);
++
++ atomic_set(&cnxman_running, 1);
++
++ /* Make sure we have a node name */
++ if (nodename[0] == '\0')
++ strcpy(nodename, system_utsname.nodename);
++
++ membership_pid = start_membership_services(kcluster_pid);
++ if (membership_pid < 0) {
++ quit_threads = 1;
++ wait_for_completion(&cluster_thread_comp);
++ init_completion(&member_thread_comp);
++ return membership_pid;
++ }
++
++ sm_start();
++ break;
++
++ case CLU_LEAVE_CLUSTER:
++ if (!capable(CAP_CLUSTER))
++ return -EPERM;
++
++ if (optlen != sizeof (int))
++ return -EINVAL;
++
++ if (copy_from_user(&leave_flags, optval, optlen))
++ return -EFAULT;
++
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++
++ if (in_transition())
++ return -EBUSY;
++
++ /* Ignore the use count if FORCE is set */
++ if (!(leave_flags & CLUSTER_LEAVEFLAG_FORCE)) {
++ if (atomic_read(&use_count))
++ return -ENOTCONN;
++ }
++
++ us->leave_reason = leave_flags;
++ quit_threads = 1;
++ wake_up_interruptible(&cnxman_waitq);
++
++ wait_for_completion(&cluster_thread_comp);
++ break;
++
++ default:
++ return -ENOPROTOOPT;
++ }
++
++ return 0;
++}
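++
++/*
++ * The expected calling sequence for the options above, as a hedged
++ * userspace sketch (fds, names and vote counts are hypothetical; the
++ * level argument is unused by cl_setsockopt() so 0 is passed): hand a
++ * bound UDP socket to cnxman, name the node, then join.
++ *
++ *	struct cl_multicast_sock mcast = { .fd = udp_fd, .number = 1 };
++ *	struct cl_join_cluster_info ji;
++ *
++ *	setsockopt(cluster_fd, 0, CLU_SET_MULTICAST, &mcast, sizeof(mcast));
++ *	setsockopt(cluster_fd, 0, CLU_SET_NODENAME, "node1", 6);
++ *
++ *	memset(&ji, 0, sizeof(ji));
++ *	strcpy(ji.cluster_name, "testcluster");
++ *	ji.votes = 1;
++ *	ji.expected_votes = 3;
++ *	setsockopt(cluster_fd, 0, CLU_JOIN_CLUSTER, &ji, sizeof(ji));
++ */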
++
++static int cl_getsockopt(struct socket *sock, int level, int optname,
++ char *optval, int *optlen)
++{
++ struct sock *sk = sock->sk;
++ int err;
++
++ lock_sock(sk);
++ err = __cl_getsockopt(sock, level, optname, optval, optlen, 0);
++ release_sock(sk);
++
++ return err;
++}
++
++static int __cl_getsockopt(struct socket *sock, int level, int optname,
++ char *optval, int *optlen, int flags)
++{
++
++ switch (optname) {
++ default:
++ return -ENOPROTOOPT;
++ }
++
++ return 0;
++}
++
++/* We'll be giving out reward points next... */
++/* Send the packet and save a copy in case someone loses theirs. Should be
++ * protected by the send mutexphore */
++static int __send_and_save(struct cl_comms_socket *csock, struct msghdr *msg,
++ int size, int needack)
++{
++ mm_segment_t fs;
++ int result;
++ struct iovec save_vectors[msg->msg_iovlen];
++
++ /* Save a copy of the IO vectors as send_msg mucks around with them and
++ * we may want to send the same stuff out more than once (for different
++ * interfaces)
++ */
++ memcpy(save_vectors, msg->msg_iov,
++ sizeof (struct iovec) * msg->msg_iovlen);
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ result = sock_sendmsg(csock->sock, msg, size);
++
++ set_fs(fs);
++
++ if (result >= 0 && acks_expected && needack) {
++
++ /* Start retransmit timer if it didn't go */
++ if (result == 0) {
++ start_short_timer();
++ }
++ else {
++ resend_delay = 1;
++ }
++ }
++
++ /* Restore IOVs */
++ memcpy(msg->msg_iov, save_vectors,
++ sizeof (struct iovec) * msg->msg_iovlen);
++
++ return result;
++}
++
++static void resend_last_message(void)
++{
++ struct msghdr msg;
++ struct iovec vec[1];
++ mm_segment_t fs;
++ int result;
++
++ P_COMMS("%ld resending last message: %d bytes: port=%d, cmd=%d\n",
++ jiffies, saved_msg_len, saved_msg_buffer[0],
++ saved_msg_buffer[6]);
++
++ /* Assume there is something wrong with the last interface */
++ current_interface = get_next_interface(current_interface);
++ if (num_interfaces > 1)
++ printk(KERN_WARNING CMAN_NAME ": Now using interface %d\n",
++ current_interface->number);
++
++ vec[0].iov_base = saved_msg_buffer;
++ vec[0].iov_len = saved_msg_len;
++
++ memset(&msg, 0, sizeof (msg));
++	msg.msg_name = &current_interface->saddr;
++ msg.msg_namelen = current_interface->addr_len;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = vec;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ result = sock_sendmsg(current_interface->sock, &msg, saved_msg_len);
++
++ set_fs(fs);
++
++ if (result < 0)
++ printk(KERN_ERR CMAN_NAME ": resend failed: %d\n", result);
++
++ /* Try indefinitely to send this, the backlog must die down eventually
++ * !? */
++ if (result == 0)
++ start_short_timer();
++
++ /* Send succeeded, continue waiting for ACKS */
++ if (result > 0)
++ start_ack_timer();
++
++}
++
++static int cl_recvmsg(struct kiocb *iocb, struct socket *sock,
++ struct msghdr *msg, size_t size, int flags)
++{
++ struct sock *sk = sock->sk;
++ struct sockaddr_cl *sin = (struct sockaddr_cl *) msg->msg_name;
++ struct cluster_sock *c = cluster_sk(sk);
++ struct sk_buff *skb;
++ int copied, err = 0;
++ int isoob = 0;
++
++ /* Socket was notified of shutdown, remove any pending skbs and return
++ * EOF */
++ if (!atomic_read(&cnxman_running)) {
++ while ((skb = skb_recv_datagram(sk, flags, MSG_DONTWAIT, &err)))
++ skb_free_datagram(sk, skb);
++ return 0; /* cnxman has left the building */
++ }
++
++ /* Generic datagram code does most of the work. If the user is not
++ * interested in OOB messages then ignore them */
++ do {
++ skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
++ if (!skb)
++ goto out;
++
++ /* Is it OOB */
++ if (skb->cb[0] & 0x80)
++ isoob = 1;
++ else
++ isoob = 0;
++
++ /* If it is and the user doesn't want it, then throw it away. */
++ if (isoob && !(flags & MSG_OOB)) {
++ skb_free_datagram(sk, skb);
++
++			/* If we peeked at an OOB message but the user doesn't
++			   want it then we need to discard it or we'll loop
++			   forever */
++ if (flags & MSG_PEEK) {
++ skb = skb_recv_datagram(sk, flags & ~MSG_PEEK,
++ MSG_DONTWAIT, &err);
++ if (skb)
++ skb_free_datagram(sk, skb);
++ }
++ }
++ }
++ while (isoob && !(flags & MSG_OOB));
++
++ copied = skb->len;
++ if (copied > size) {
++ copied = size;
++ msg->msg_flags |= MSG_TRUNC;
++ }
++ err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
++
++ if (err)
++ goto out_free;
++
++ if (msg->msg_name && msg->msg_namelen) {
++ memset(msg->msg_name, 0, msg->msg_namelen);
++
++ if (msg->msg_namelen >= sizeof (struct sockaddr_cl)) {
++
++ /* Nodeid is in native byte order - anything else is just
++ * perverse */
++ memcpy(&sin->scl_nodeid, skb->cb + 1, sizeof(int));
++ }
++ msg->msg_namelen = sizeof (struct sockaddr_cl);
++ sin->scl_port = c->port;
++ }
++
++ /* Top bit set in cb[0] means this is an OOB message */
++ if (skb->cb[0] & 0x80) {
++ msg->msg_flags |= MSG_OOB;
++ }
++
++ sock_recv_timestamp(msg, sk, skb);
++
++ err = copied;
++
++ out_free:
++ skb_free_datagram(sk, skb);
++
++ out:
++ return err;
++}
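++
++/*
++ * Receive-side sketch (illustrative, not part of this patch): a reader
++ * distinguishes OOB notifications from data by checking MSG_OOB in the
++ * returned msg_flags, and finds the sender in the sockaddr_cl that
++ * cl_recvmsg() writes back.
++ *
++ *	struct sockaddr_cl saddr;
++ *	struct msghdr msg;
++ *
++ *	msg.msg_name = &saddr;
++ *	msg.msg_namelen = sizeof(saddr);
++ *	... set up msg_iov ...
++ *	len = recvmsg(cluster_fd, &msg, 0);
++ *	if (msg.msg_flags & MSG_OOB)
++ *		handle_oob_event(saddr.scl_nodeid);	 (hypothetical)
++ */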
++
++/* Send a message out on all interfaces */
++static int send_to_all_ints(int nodeid, struct msghdr *our_msg, int size, int flags)
++{
++ struct sockaddr_in6 daddr;
++ struct cl_comms_socket *clsock;
++ int result = 0;
++
++ our_msg->msg_name = &daddr;
++
++ list_for_each_entry(clsock, &socket_list, list) {
++
++ /* Don't send out a recv-only socket */
++ if (!clsock->recv_only) {
++
++ /* For temporary node IDs send to the node's real IP address */
++ if (nodeid < 0) {
++ get_addr_from_temp_nodeid(nodeid, (char *)&daddr, &our_msg->msg_namelen);
++ }
++ else {
++ memcpy(&daddr, &clsock->saddr, clsock->addr_len);
++ our_msg->msg_namelen = clsock->addr_len;
++ }
++
++ result = __send_and_save(clsock, our_msg,
++ size + sizeof (struct cl_protheader),
++ !(flags & MSG_NOACK));
++ }
++ }
++ return result;
++}
++
++
++/* Internal common send message routine */
++static int __sendmsg(struct socket *sock, struct msghdr *msg, int size,
++ unsigned char port)
++{
++ int result = 0, i;
++ int flags = msg->msg_flags;
++ struct msghdr our_msg;
++ struct sockaddr_cl *caddr = msg->msg_name;
++ struct cl_protheader header;
++ struct iovec vectors[msg->msg_iovlen + 1];
++ int nodeid = 0;
++
++ if (size > MAX_CLUSTER_MESSAGE)
++ return -EINVAL;
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++
++ if (caddr)
++ nodeid = caddr->scl_nodeid;
++
++ /* Check that the node id (if present) is valid */
++ if (msg->msg_namelen && (!find_node_by_nodeid(nodeid) &&
++ !is_valid_temp_nodeid(nodeid))) {
++ return -ENOTCONN;
++ }
++
++ /* We can only have one send outstanding at a time so we might as well
++ * lock the whole send mechanism */
++ down(&send_lock);
++
++ while ((port > HIGH_PROTECTED_PORT
++ && (!cluster_is_quorate || in_transition()))
++ || (acks_expected > 0 && !(msg->msg_flags & MSG_NOACK))) {
++
++ DECLARE_WAITQUEUE(wq, current);
++ struct task_struct *tsk = current;
++
++ if (flags & MSG_DONTWAIT) {
++ up(&send_lock);
++ return -EAGAIN;
++ }
++
++ if (current->pid == kcluster_pid) {
++ P_COMMS
++ ("Tried to make kclusterd wait, port=%d, acks_count=%d, expected=%d\n",
++ port, ack_count, acks_expected);
++ up(&send_lock);
++ return -EAGAIN;
++ }
++
++ P_COMMS("%s process waiting. acks=%d, expected=%d\n", tsk->comm,
++ ack_count, acks_expected);
++
++ set_task_state(tsk, TASK_INTERRUPTIBLE);
++ add_wait_queue(&socket_waitq, &wq);
++
++ if ((port > HIGH_PROTECTED_PORT
++ && (!cluster_is_quorate || in_transition()))
++ || (acks_expected > 0)) {
++
++ up(&send_lock);
++ schedule();
++ down(&send_lock);
++ }
++
++ /* Going down */
++ if (quit_threads) {
++ up(&send_lock);
++ return -ENOTCONN;
++ }
++
++ set_task_state(tsk, TASK_RUNNING);
++ remove_wait_queue(&socket_waitq, &wq);
++
++ if (signal_pending(current)) {
++ up(&send_lock);
++ return -ERESTARTSYS;
++ }
++
++ /* Were we shut down in the meantime ? */
++ if (!atomic_read(&cnxman_running)) {
++ up(&send_lock);
++ return -ENOTCONN;
++ }
++
++ }
++
++ memset(&our_msg, 0, sizeof (our_msg));
++
++ /* Build the header */
++ header.port = port;
++ header.flags = msg->msg_flags >> 16;
++ header.cluster = cpu_to_le16(cluster_id);
++ header.srcid = us ? cpu_to_le32(us->node_id) : 0;
++ header.tgtid = caddr ? cpu_to_le32(nodeid) : 0;
++
++ ++cur_seq;
++ header.seq = cpu_to_le16(cur_seq);
++
++ /* Set the MULTICAST flag on messages with no particular destination */
++ if (!msg->msg_namelen) {
++ header.flags |= MSG_MULTICAST >> 16;
++ header.tgtid = 0;
++ }
++
++ /* Copy the existing iovecs into our array and add the header on at the
++ * beginning */
++ vectors[0].iov_base = &header;
++ vectors[0].iov_len = sizeof (header);
++ for (i = 0; i < msg->msg_iovlen; i++) {
++ vectors[i + 1] = msg->msg_iov[i];
++ }
++
++ our_msg.msg_iovlen = msg->msg_iovlen + 1;
++ our_msg.msg_iov = vectors;
++
++ /* Work out how many ACKS are wanted - *don't* reset acks_expected to
++ * zero if no acks are required as an ACK-needed message may still be
++ * outstanding */
++ if (!(msg->msg_flags & MSG_NOACK)) {
++ if (msg->msg_namelen)
++ acks_expected = 1; /* Unicast */
++ else
++ acks_expected = max(cluster_members - 1, 0);
++
++ }
++
++ P_COMMS
++ ("Sending message - tgt=%d port %d required %d acks, seq=%d, flags=%x\n",
++ nodeid, header.port,
++ (msg->msg_flags & MSG_NOACK) ? 0 : acks_expected,
++ le16_to_cpu(header.seq), header.flags);
++
++ /* Don't include temp nodeids in the message itself */
++ if (header.tgtid < 0)
++ header.tgtid = 0;
++
++ /* For non-member sends we use all the interfaces */
++ if ((nodeid < 0) || (flags & MSG_ALLINT)) {
++
++ result = send_to_all_ints(nodeid, &our_msg, size, msg->msg_flags);
++ }
++ else {
++ /* Send to only the current socket - resends will use the
++ * others if necessary */
++		our_msg.msg_name = &current_interface->saddr;
++ our_msg.msg_namelen = current_interface->addr_len;
++
++ result =
++ __send_and_save(current_interface, &our_msg,
++ size + sizeof (header),
++ !(msg->msg_flags & MSG_NOACK));
++ }
++
++	/* Make a note in each node's structure that it has been sent a message
++	 * so we can see which ones went astray */
++ if (!(flags & MSG_NOACK) && nodeid >= 0) {
++ if (msg->msg_namelen) {
++ struct cluster_node *node;
++
++ node = find_node_by_nodeid(le32_to_cpu(header.tgtid));
++ if (node)
++ node->last_seq_sent = cur_seq;
++ }
++ else {
++ struct cluster_node *node;
++ struct list_head *nodelist;
++
++ list_for_each(nodelist, &cluster_members_list) {
++ node =
++ list_entry(nodelist, struct cluster_node,
++ list);
++ if (node->state == NODESTATE_MEMBER) {
++ node->last_seq_sent = cur_seq;
++ }
++ }
++ }
++ }
++
++ /* Save a copy of the message if we're expecting an ACK */
++ if (!(flags & MSG_NOACK) && acks_expected) {
++ mm_segment_t fs;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ memcpy_fromiovec(saved_msg_buffer, our_msg.msg_iov,
++ size + sizeof (header));
++ set_fs(fs);
++
++ saved_msg_len = size + sizeof (header);
++ retry_count = ack_count = 0;
++ clear_bit(RESEND_NEEDED, &mainloop_flags);
++
++ start_ack_timer();
++ }
++
++ up(&send_lock);
++ return result;
++}
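++
++/*
++ * To summarise __sendmsg(): every datagram on the wire is a cl_protheader
++ * followed by the caller's data.  The header carries the destination port,
++ * the 16-bit cluster id, little-endian source/target node ids and a
++ * sequence number; a message with no explicit destination is flagged
++ * MSG_MULTICAST and goes to all members.  Unless MSG_NOACK is set we
++ * expect one ACK for a unicast, or cluster_members - 1 for a multicast,
++ * and keep a copy in saved_msg_buffer so resend_last_message() can
++ * retransmit on another interface if the ACKs don't arrive.
++ */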
++
++static int queue_message(void *buf, int len, struct sockaddr_cl *caddr,
++ unsigned char port, int flags)
++{
++ struct queued_message *qmsg;
++
++ qmsg = kmalloc(sizeof (struct queued_message),
++ (in_atomic()
++ || irqs_disabled())? GFP_ATOMIC : GFP_KERNEL);
++ if (qmsg == NULL)
++ return -1;
++
++ memcpy(qmsg->msg_buffer, buf, len);
++ qmsg->msg_len = len;
++ if (caddr) {
++ memcpy(&qmsg->addr, caddr, sizeof (struct sockaddr_cl));
++ qmsg->addr_len = sizeof (struct sockaddr_cl);
++ }
++ else {
++ qmsg->addr_len = 0;
++ }
++ qmsg->flags = flags;
++ qmsg->port = port;
++ qmsg->socket = NULL;
++
++ down(&messages_list_lock);
++ list_add_tail(&qmsg->list, &messages_list);
++ up(&messages_list_lock);
++
++ wake_up_interruptible(&cnxman_waitq);
++
++ return 0;
++}
++
++static int cl_sendmsg(struct kiocb *iocb, struct socket *sock,
++ struct msghdr *msg, size_t size)
++{
++ struct cluster_sock *c = cluster_sk(sock->sk);
++ char *buffer;
++ int status;
++ int saved_iovlen;
++ uint8_t port;
++ struct iovec iov;
++ struct iovec *saved_iov;
++ struct sockaddr_cl *caddr = msg->msg_name;
++
++ if (sock->sk->sk_protocol == CLPROTO_MASTER)
++ return -EOPNOTSUPP;
++
++ port = c->port;
++
++ /* Only capable users can override the port number */
++ if (caddr && capable(CAP_CLUSTER) && caddr->scl_port)
++ port = caddr->scl_port;
++
++ if (port == 0)
++ return -EDESTADDRREQ;
++
++ /* Hmmm. On machines with segmented user/kernel space (sparc64, hppa &
++ * m68k AFAICT) we can't mix user and kernel space addresses in the
++ * IOV. This stymies __sendmsg a little as it tries to add a header to
++ * what could possibly be a userspace iov. So, here (where all the
++ * userspace sends come) we copy it to a kernel space buffer first. If
++ * performance is a big problem here then I might #ifdef it for the
++ * affected architectures but for now I think it will probably be OK */
++ buffer = kmalloc(size, GFP_KERNEL);
++ if (!buffer)
++ return -ENOMEM;
++
++ memcpy_fromiovec(buffer, msg->msg_iov, size);
++ iov.iov_len = size;
++ iov.iov_base = buffer;
++
++ saved_iov = msg->msg_iov;
++ saved_iovlen = msg->msg_iovlen;
++ msg->msg_iov = &iov;
++ msg->msg_iovlen = 1;
++
++ status = __sendmsg(sock, msg, size, port);
++ msg->msg_iov = saved_iov;
++ msg->msg_iovlen = saved_iovlen;
++
++ kfree(buffer);
++
++ return status;
++}
++
++/* Kernel call to sendmsg */
++int kcl_sendmsg(struct socket *sock, void *buf, int size,
++ struct sockaddr_cl *caddr, int addr_len, unsigned int flags)
++{
++ struct iovec iovecs[1];
++ struct msghdr msg;
++ struct cluster_sock *c = cluster_sk(sock->sk);
++ unsigned char port;
++
++ if (size > MAX_CLUSTER_MESSAGE)
++ return -EINVAL;
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++
++ port = c->port;
++ if (caddr && caddr->scl_port)
++ port = caddr->scl_port;
++
++ if (port == 0)
++ return -EDESTADDRREQ;
++
++ /* If we have no process context then queue it up for kclusterd to
++ * send. */
++ if (in_interrupt() || flags & MSG_QUEUE) {
++ return queue_message(buf, size, caddr, port,
++ flags & ~MSG_QUEUE);
++ }
++
++ iovecs[0].iov_base = buf;
++ iovecs[0].iov_len = size;
++
++ memset(&msg, 0, sizeof (msg));
++ msg.msg_name = caddr;
++ msg.msg_namelen = addr_len;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = iovecs;
++ msg.msg_flags = flags;
++
++ return __sendmsg(sock, &msg, size, port);
++}
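++
++/*
++ * Minimal in-kernel caller, as a hedged sketch (the port number, buffer
++ * and target are hypothetical).  Addressing follows the same sockaddr_cl
++ * convention used by send_listen_request() below.
++ *
++ *	struct sockaddr_cl caddr;
++ *
++ *	memset(&caddr, 0, sizeof(caddr));
++ *	caddr.scl_family = AF_CLUSTER;
++ *	caddr.scl_port = 12;
++ *	caddr.scl_nodeid = target_nodeid;
++ *	error = kcl_sendmsg(sock, buf, buflen, &caddr, sizeof(caddr), 0);
++ */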
++
++static int send_queued_message(struct queued_message *qmsg)
++{
++ struct iovec iovecs[1];
++ struct msghdr msg;
++
++ /* Don't send blocked messages */
++ if (qmsg->port > HIGH_PROTECTED_PORT
++ && (!cluster_is_quorate || in_transition()))
++ return -EAGAIN;
++
++ iovecs[0].iov_base = qmsg->msg_buffer;
++ iovecs[0].iov_len = qmsg->msg_len;
++
++ memset(&msg, 0, sizeof (msg));
++ msg.msg_name = qmsg->addr_len ? &qmsg->addr : NULL;
++ msg.msg_namelen = qmsg->addr_len;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = iovecs;
++ msg.msg_flags = qmsg->flags;
++
++ return __sendmsg(qmsg->socket, &msg, qmsg->msg_len, qmsg->port);
++}
++
++int kcl_register_read_callback(struct socket *sock,
++ int (*routine) (char *, int, char *, int,
++ unsigned int))
++{
++ struct cluster_sock *c = cluster_sk(sock->sk);
++
++ c->kernel_callback = routine;
++
++ return 0;
++}
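++
++/*
++ * Usage sketch for the hook above.  The argument meanings are an
++ * assumption inferred from the signature (data, data length, sender
++ * address, address length, sender node id):
++ *
++ *	static int my_data_ready(char *buf, int len, char *addr,
++ *				 int addrlen, unsigned int nodeid)
++ *	{
++ *		(consume the message in kernel context)
++ *		return 0;
++ *	}
++ *
++ *	kcl_register_read_callback(sock, my_data_ready);
++ */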
++
++/* Used where we are in kclusterd context and we can't allow the task to wait
++ * as we are also responsible for processing the ACKs that do the wake-up. Try
++ * to send the message immediately and queue it if that's not possible */
++static int send_or_queue_message(void *buf, int len, struct sockaddr_cl *caddr,
++ unsigned char port)
++{
++ struct iovec iovecs[1];
++ struct msghdr msg;
++
++ int status;
++
++ /* Don't send blocked messages */
++ if (port > HIGH_PROTECTED_PORT
++ && (!cluster_is_quorate || in_transition())) {
++ return queue_message(buf, len, caddr, port, 0);
++ }
++
++ iovecs[0].iov_base = buf;
++ iovecs[0].iov_len = len;
++
++ memset(&msg, 0, sizeof (msg));
++ msg.msg_name = caddr;
++ msg.msg_namelen = caddr ? sizeof (struct sockaddr_cl) : 0;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = iovecs;
++ msg.msg_flags = MSG_DONTWAIT;
++
++ status = __sendmsg(NULL, &msg, len, port);
++
++ /* Did it work ? */
++ if (status > 0) {
++ return 0;
++ }
++
++ /* Failure other than EAGAIN is fatal */
++ if (status != -EAGAIN) {
++ return status;
++ }
++
++ return queue_message(buf, len, caddr, port, 0);
++}
++
++/* Send a listen request to a node */
++static void send_listen_request(int nodeid, unsigned char port)
++{
++ struct cl_listenmsg listenmsg;
++ struct sockaddr_cl caddr;
++
++ memset(&caddr, 0, sizeof (caddr));
++
++ /* Build the header */
++ listenmsg.cmd = CLUSTER_CMD_LISTENREQ;
++ listenmsg.target_port = port;
++ listenmsg.listening = 0;
++ listenmsg.tag = current->pid;
++
++ caddr.scl_family = AF_CLUSTER;
++ caddr.scl_port = 0;
++ caddr.scl_nodeid = nodeid;
++
++ send_or_queue_message(&listenmsg, sizeof(listenmsg), &caddr, 0);
++ return;
++}
++
++/* Reply to a listen request: tell the asking node whether we have a listener
++ * (1) or not (0) on the requested port */
++static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
++ unsigned char port, unsigned short tag)
++{
++ struct cl_listenmsg listenmsg;
++ struct sockaddr_cl caddr;
++ int status;
++
++ memset(&caddr, 0, sizeof (caddr));
++
++ /* Build the message */
++ listenmsg.cmd = CLUSTER_CMD_LISTENRESP;
++ listenmsg.target_port = port;
++ listenmsg.tag = tag;
++ listenmsg.listening = (port_array[port] != 0) ? 1 : 0;
++
++ caddr.scl_family = AF_CLUSTER;
++ caddr.scl_port = 0;
++ caddr.scl_nodeid = nodeid;
++
++ status = send_or_queue_message(&listenmsg,
++ sizeof (listenmsg),
++ &caddr, 0);
++
++ return;
++}
++
++/* Send an ACK */
++static int cl_sendack(struct cl_comms_socket *csock, unsigned short seq,
++ int addr_len, char *addr, unsigned char remport,
++ unsigned char flag)
++{
++ mm_segment_t fs;
++ struct iovec vec;
++ struct cl_ackmsg ackmsg;
++ struct msghdr msg;
++ struct sockaddr_in6 daddr;
++ int result;
++
++#ifdef DEBUG_COMMS
++ char buf[MAX_ADDR_PRINTED_LEN];
++
++ P_COMMS("Sending ACK to %s, seq=%d\n",
++ print_addr(addr, address_length, buf), le16_to_cpu(seq));
++#endif
++
++ if (addr) {
++ memcpy(&daddr, addr, addr_len);
++ }
++ else {
++ memcpy(&daddr, &csock->saddr, csock->addr_len);
++ addr_len = csock->addr_len;
++ }
++
++ /* Build the header */
++ ackmsg.header.port = 0; /* Protocol port */
++ ackmsg.header.seq = 0;
++ ackmsg.header.flags = MSG_NOACK >> 16;
++ ackmsg.header.cluster = cpu_to_le16(cluster_id);
++ ackmsg.header.srcid = us ? cpu_to_le32(us->node_id) : 0;
++ ackmsg.header.tgtid = 0; /* ACKS are unicast so we don't bother
++ * to look this up */
++ ackmsg.cmd = CLUSTER_CMD_ACK;
++ ackmsg.remport = remport;
++ ackmsg.aflags = flag;
++ ackmsg.seq = seq; /* Already in LE order */
++ vec.iov_base = &ackmsg;
++ vec.iov_len = sizeof (ackmsg);
++
++ memset(&msg, 0, sizeof (msg));
++ msg.msg_name = &daddr;
++ msg.msg_namelen = addr_len;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = &vec;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ result = sock_sendmsg(csock->sock, &msg, sizeof (ackmsg));
++
++ set_fs(fs);
++
++ if (result < 0)
++ printk(KERN_CRIT CMAN_NAME ": error sending ACK: %d\n", result);
++
++ return result;
++
++}
++
++/* Wait for all ACKS to be gathered */
++void kcl_wait_for_all_acks(void)
++{
++ while (ack_count < acks_expected) {
++
++ DECLARE_WAITQUEUE(wq, current);
++ struct task_struct *tsk = current;
++
++ set_task_state(tsk, TASK_INTERRUPTIBLE);
++ add_wait_queue(&socket_waitq, &wq);
++
++ if (ack_count < acks_expected) {
++ schedule();
++ }
++
++ set_task_state(tsk, TASK_RUNNING);
++ remove_wait_queue(&socket_waitq, &wq);
++ }
++}
++
++/* Send a closedown OOB message to all cluster nodes - this tells them that a
++ * port listener has gone away */
++static void send_port_close_oob(unsigned char port)
++{
++ struct cl_closemsg closemsg;
++
++ /* Build the header */
++ closemsg.cmd = CLUSTER_CMD_PORTCLOSED;
++ closemsg.port = port;
++
++ send_or_queue_message(&closemsg, sizeof (closemsg), NULL, 0);
++ return;
++}
++
++/* A remote port has been closed - post an OOB message to the local listener
++ * on that port (if there is one) */
++static void post_close_oob(unsigned char port, int nodeid)
++{
++ struct cl_portclosed_oob *oobmsg;
++ struct sk_buff *skb;
++ struct sock *sock = port_array[port];
++
++ if (!sock) {
++ return; /* No-one listening */
++ }
++
++ skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
++ if (!skb)
++ return;
++
++ skb_put(skb, sizeof (*oobmsg));
++ oobmsg = (struct cl_portclosed_oob *) skb->data;
++ oobmsg->port = port;
++ oobmsg->cmd = CLUSTER_OOB_MSG_PORTCLOSED;
++ skb->cb[0] = 0x80;
++ memcpy(skb->cb + 1, &nodeid, sizeof(int));
++
++ sock_queue_rcv_skb(sock, skb);
++
++}
++
++/* Leave the cluster */
++static void node_shutdown(void)
++{
++ struct cl_barrier *barrier;
++ struct list_head *blist;
++ struct list_head *temp;
++ struct list_head *socklist;
++ struct cl_client_socket *csock;
++ struct sk_buff *null_skb;
++
++ printk(KERN_INFO CMAN_NAME ": we are leaving the cluster\n");
++
++ atomic_set(&cnxman_running, 0);
++ unjam();
++
++ /* Notify kernel listeners first */
++ notify_kernel_listeners(LEAVING, 0);
++
++ /* Notify client sockets */
++ down(&client_socket_lock);
++ list_for_each_safe(socklist, temp, &client_socket_list) {
++ csock = list_entry(socklist, struct cl_client_socket, list);
++
++ null_skb = alloc_skb(0, GFP_KERNEL);
++ if (null_skb)
++ sock_queue_rcv_skb(csock->sock->sk, null_skb);
++ list_del(&csock->list);
++ kfree(csock);
++ }
++ up(&client_socket_lock);
++ we_are_a_cluster_member = 0;
++
++ sm_stop(1);
++
++ /* Wake up any processes waiting for barriers */
++ down(&barrier_list_lock);
++ list_for_each(blist, &barrier_list) {
++ barrier = list_entry(blist, struct cl_barrier, list);
++
++ /* Cancel any timers */
++ if (timer_pending(&barrier->timer))
++ del_timer(&barrier->timer);
++
++ /* Force it to be auto-delete so it discards itself */
++ if (barrier->state == BARRIER_STATE_WAITING) {
++ barrier->flags |= BARRIER_ATTR_AUTODELETE;
++ wake_up_interruptible(&barrier->waitq);
++ }
++ else {
++ if (barrier->callback) {
++ barrier->callback(barrier->name, -ENOTCONN);
++ barrier->callback = NULL;
++ }
++ }
++ }
++ up(&barrier_list_lock);
++
++ /* Wake up any processes waiting for ISLISTENING requests */
++ down(&listenreq_lock);
++ list_for_each(blist, &listenreq_list) {
++ struct cl_waiting_listen_request *lrequest =
++ list_entry(blist, struct cl_waiting_listen_request, list);
++
++ if (lrequest->waiting)
++ wake_up_interruptible(&lrequest->waitq);
++ }
++ up(&listenreq_lock);
++}
++
++static void free_cluster_sockets(void)
++{
++ struct list_head *socklist;
++ struct cl_comms_socket *sock;
++ struct list_head *temp;
++
++ list_for_each_safe(socklist, temp, &socket_list) {
++ sock = list_entry(socklist, struct cl_comms_socket, list);
++
++ list_del(&sock->list);
++ fput(sock->file);
++ kfree(sock);
++ }
++ num_interfaces = 0;
++ current_interface = NULL;
++}
++
++/* Tidy up after all the rest of the cluster bits have shut down */
++static void node_cleanup(void)
++{
++ struct list_head *nodelist;
++ struct list_head *proclist;
++ struct list_head *temp;
++ struct list_head *socklist;
++ struct list_head *blist;
++ struct cl_comms_socket *sock;
++ struct kernel_notify_struct *knotify;
++
++ /* Free list of kernel listeners */
++ list_for_each_safe(proclist, temp, &kernel_listener_list) {
++ knotify =
++ list_entry(proclist, struct kernel_notify_struct, list);
++ list_del(&knotify->list);
++ kfree(knotify);
++ }
++
++ /* Mark the sockets as busy so they don't get added to the active
++ * sockets list in the next few lines of code before we free them */
++ list_for_each_safe(socklist, temp, &socket_list) {
++ sock = list_entry(socklist, struct cl_comms_socket, list);
++
++ set_bit(1, &sock->active);
++ }
++
++ /* Tidy the active sockets list */
++ list_for_each_safe(socklist, temp, &active_socket_list) {
++ sock =
++ list_entry(socklist, struct cl_comms_socket, active_list);
++ list_del(&sock->active_list);
++ }
++
++ /* Free the memory allocated to cluster nodes */
++ free_nodeid_array();
++ down(&cluster_members_lock);
++ us = NULL;
++ list_for_each_safe(nodelist, temp, &cluster_members_list) {
++
++ struct list_head *addrlist;
++ struct list_head *addrtemp;
++ struct cluster_node *node;
++ struct cluster_node_addr *nodeaddr;
++
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ list_for_each_safe(addrlist, addrtemp, &node->addr_list) {
++ nodeaddr =
++ list_entry(addrlist, struct cluster_node_addr,
++ list);
++
++ list_del(&nodeaddr->list);
++ kfree(nodeaddr);
++ }
++ list_del(&node->list);
++ kfree(node->name);
++ kfree(node);
++ }
++ cluster_members = 0;
++ up(&cluster_members_lock);
++
++ /* Free the memory allocated to the outgoing sockets */
++ free_cluster_sockets();
++
++ /* Make sure that all the barriers are deleted */
++ down(&barrier_list_lock);
++ list_for_each_safe(blist, temp, &barrier_list) {
++ struct cl_barrier *barrier =
++ list_entry(blist, struct cl_barrier, list);
++
++ list_del(&barrier->list);
++ kfree(barrier);
++ }
++ up(&barrier_list_lock);
++
++ kcluster_pid = 0;
++ clear_bit(RESEND_NEEDED, &mainloop_flags);
++ acks_expected = 0;
++}
++
++/* If "cluster_is_quorate" is 0 then all activity apart from protected ports is
++ * blocked. */
++void set_quorate(int total_votes)
++{
++ int quorate;
++
++ if (get_quorum() > total_votes) {
++ quorate = 0;
++ }
++ else {
++ quorate = 1;
++ }
++
++ /* Hide messages during startup state transition */
++ if (we_are_a_cluster_member) {
++ if (cluster_is_quorate && !quorate)
++ printk(KERN_CRIT CMAN_NAME
++ ": quorum lost, blocking activity\n");
++ if (!cluster_is_quorate && quorate)
++ printk(KERN_CRIT CMAN_NAME
++ ": quorum regained, resuming activity\n");
++ }
++ cluster_is_quorate = quorate;
++
++ /* Wake up any sleeping processes */
++ if (cluster_is_quorate) {
++ unjam();
++ }
++
++}
++
++void queue_oob_skb(struct socket *sock, int cmd)
++{
++ struct sk_buff *skb;
++ struct cl_portclosed_oob *oobmsg;
++
++ skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
++ if (!skb)
++ return;
++
++ skb_put(skb, sizeof (*oobmsg));
++ oobmsg = (struct cl_portclosed_oob *) skb->data;
++ oobmsg->port = 0;
++ oobmsg->cmd = cmd;
++
++ /* There is no remote node associated with this so
++ clear out the field to avoid any accidents */
++ memset(skb->cb, 0, sizeof(int));
++ skb->cb[0] = 0x80;
++
++ sock_queue_rcv_skb(sock->sk, skb);
++}
++
++/* Notify interested parties that the cluster configuration has changed */
++void notify_listeners(void)
++{
++ struct notify_struct *notify;
++ struct list_head *proclist;
++ struct list_head *socklist;
++ struct list_head *temp;
++
++ /* Do kernel listeners first */
++ notify_kernel_listeners(CLUSTER_RECONFIG, 0);
++
++ /* Now we deign to tell userspace */
++ down(&event_listener_lock);
++ list_for_each_safe(proclist, temp, &event_listener_list) {
++ notify = list_entry(proclist, struct notify_struct, list);
++
++ /* If the kill fails then remove the process from the list */
++ if (kill_proc(notify->pid, notify->signal, 0) == -ESRCH) {
++			list_del(&notify->list);
++ kfree(notify);
++ }
++ }
++ up(&event_listener_lock);
++
++ /* Tell userspace processes which want OOB messages */
++ down(&client_socket_lock);
++ list_for_each(socklist, &client_socket_list) {
++ struct cl_client_socket *csock;
++ csock = list_entry(socklist, struct cl_client_socket, list);
++ queue_oob_skb(csock->sock, CLUSTER_OOB_MSG_STATECHANGE);
++ }
++ up(&client_socket_lock);
++}
++
++/* This fills in the list of all addresses for the local node */
++void get_local_addresses(struct cluster_node *node)
++{
++ struct list_head *socklist;
++ struct cl_comms_socket *sock;
++
++ list_for_each(socklist, &socket_list) {
++ sock = list_entry(socklist, struct cl_comms_socket, list);
++
++ if (sock->recv_only) {
++ add_node_address(node, (char *) &sock->saddr, address_length);
++ }
++ }
++}
++
++
++static uint16_t generate_cluster_id(char *name)
++{
++	int i;
++	int len = strlen(name);
++	int value = 0;
++
++	for (i = 0; i < len; i++) {
++		value <<= 1;
++		value += name[i];
++	}
++ return value & 0xFFFF;
++}
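++
++/*
++ * Worked example: for the name "ab" the loop computes
++ * ((0 << 1) + 'a') = 97, then (97 << 1) + 'b' = 292, so the cluster id is
++ * 292.  This shift-and-add hash is deliberately cheap: long names lose
++ * their early bits to the shifts and distinct names can collide in the
++ * low 16 bits, so matching ids are a hint, not proof of the same name.
++ */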
++
++/* Return the next comms socket we can use. */
++static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur)
++{
++ int next;
++ struct list_head *socklist;
++
++ /* Fast path for single interface systems */
++ if (num_interfaces <= 1)
++ return cur;
++
++ /* Next number */
++ next = cur->number + 1;
++ if (next > num_interfaces)
++ next = 1;
++
++ /* Find the socket with this number, I could optimise this by starting
++ * at the current i/f but most systems are going to have a small number
++ * of them anyway */
++ list_for_each(socklist, &socket_list) {
++ struct cl_comms_socket *sock;
++ sock = list_entry(socklist, struct cl_comms_socket, list);
++
++ if (!sock->recv_only && sock->number == next)
++ return sock;
++ }
++
++ BUG();
++ return NULL;
++}
++
++/* MUST be called with the barrier list lock held */
++static struct cl_barrier *find_barrier(char *name)
++{
++ struct list_head *blist;
++ struct cl_barrier *bar;
++
++ list_for_each(blist, &barrier_list) {
++ bar = list_entry(blist, struct cl_barrier, list);
++
++ if (strcmp(name, bar->name) == 0)
++ return bar;
++ }
++ return NULL;
++}
++
++/* Do the stuff we need to do when the barrier has completed phase 1 */
++static void check_barrier_complete_phase1(struct cl_barrier *barrier)
++{
++ if (atomic_read(&barrier->got_nodes) == ((barrier->expected_nodes != 0)
++ ? barrier->expected_nodes :
++ cluster_members)) {
++
++ struct cl_barriermsg bmsg;
++
++ atomic_inc(&barrier->completed_nodes); /* We have completed */
++ barrier->phase = 2; /* Wait for complete phase II */
++
++ /* Send completion message, remember: we are in cnxman context
++ * and must not block */
++ bmsg.cmd = CLUSTER_CMD_BARRIER;
++ bmsg.subcmd = BARRIER_COMPLETE;
++ bmsg.flags = 0;
++ strcpy(bmsg.name, barrier->name);
++
++ P_BARRIER("Sending COMPLETE for %s\n", barrier->name);
++ queue_message((char *) &bmsg, sizeof (bmsg), NULL, 0, 0);
++ }
++}
++
++/* Do the stuff we need to do when the barrier has been reached */
++/* Return 1 if we deleted the barrier */
++static int check_barrier_complete_phase2(struct cl_barrier *barrier, int status)
++{
++ spin_lock_irq(&barrier->phase2_spinlock);
++
++ if (barrier->state != BARRIER_STATE_COMPLETE &&
++ (status == -ETIMEDOUT ||
++ atomic_read(&barrier->completed_nodes) ==
++ ((barrier->expected_nodes != 0)
++ ? barrier->expected_nodes : cluster_members))) {
++
++ if (status == 0 && barrier->timeout)
++ del_timer(&barrier->timer);
++ barrier->endreason = status;
++
++ /* Wake up listener */
++ if (barrier->state == BARRIER_STATE_WAITING) {
++ wake_up_interruptible(&barrier->waitq);
++ }
++ else {
++ /* Additional tasks we have to do if the user was not
++ * waiting... */
++ /* Call the callback */
++ if (barrier->callback) {
++ barrier->callback(barrier->name, 0);
++ barrier->callback = NULL;
++ }
++ /* Remove it if it's AUTO-DELETE */
++ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
++ list_del(&barrier->list);
++ spin_unlock_irq(&barrier->phase2_spinlock);
++ kfree(barrier);
++ return 1;
++ }
++ }
++ barrier->state = BARRIER_STATE_COMPLETE;
++ }
++ spin_unlock_irq(&barrier->phase2_spinlock);
++ return 0;
++}
++
++/* Called if a barrier timeout happens */
++static void barrier_timer_fn(unsigned long arg)
++{
++ struct cl_barrier *barrier = (struct cl_barrier *) arg;
++
++	/* Ignore any further messages, they are too late. */
++ barrier->phase = 0;
++
++ /* and cause it to timeout */
++ check_barrier_complete_phase2(barrier, -ETIMEDOUT);
++}
++
++/* Process BARRIER messages from other nodes */
++static void process_barrier_msg(struct cl_barriermsg *msg,
++ struct cluster_node *node)
++{
++ struct cl_barrier *barrier;
++
++ down(&barrier_list_lock);
++ barrier = find_barrier(msg->name);
++ up(&barrier_list_lock);
++
++	/* Ignore other people's messages; in_transition() is needed here so
++	 * that joining nodes will see their barrier messages before
++	 * we_are_a_cluster_member is set */
++ if (!we_are_a_cluster_member && !in_transition())
++ return;
++ if (!barrier)
++ return;
++
++ P_BARRIER("Got %d for %s, from node %s\n", msg->subcmd, msg->name,
++ node ? node->name : "unknown");
++
++ switch (msg->subcmd) {
++ case BARRIER_WAIT:
++ down(&barrier->lock);
++ if (barrier->phase == 0)
++ barrier->phase = 1;
++
++ if (barrier->phase == 1) {
++ atomic_inc(&barrier->got_nodes);
++ check_barrier_complete_phase1(barrier);
++ }
++ else {
++ printk(KERN_WARNING CMAN_NAME
++ ": got WAIT barrier not in phase 1 %s (%d)\n",
++ msg->name, barrier->phase);
++
++ }
++ up(&barrier->lock);
++ break;
++
++ case BARRIER_COMPLETE:
++ down(&barrier->lock);
++ atomic_inc(&barrier->completed_nodes);
++
++ /* First node to get all the WAIT messages sends COMPLETE, so
++ * we all complete */
++ if (barrier->phase == 1) {
++ atomic_set(&barrier->got_nodes,
++ barrier->expected_nodes);
++ check_barrier_complete_phase1(barrier);
++ }
++
++ if (barrier->phase == 2) {
++ /* If it was deleted (ret==1) then no need to unlock
++ * the mutex */
++ if (check_barrier_complete_phase2(barrier, 0) == 1)
++ return;
++ }
++ up(&barrier->lock);
++ break;
++ }
++}
++
++/* In-kernel membership API */
++int kcl_add_callback(void (*callback) (kcl_callback_reason, long arg))
++{
++ struct kernel_notify_struct *notify;
++
++ notify = kmalloc(sizeof (struct kernel_notify_struct), GFP_KERNEL);
++ if (!notify)
++ return -ENOMEM;
++ notify->callback = callback;
++
++ down(&kernel_listener_lock);
++	list_add(&notify->list, &kernel_listener_list);
++ up(&kernel_listener_lock);
++
++ return 0;
++}
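++
++/*
++ * Sketch of a subsystem registering for membership events (the callback
++ * body and helper are hypothetical; LEAVING and CLUSTER_RECONFIG are the
++ * reasons passed to notify_kernel_listeners() elsewhere in this file):
++ *
++ *	static void my_member_event(kcl_callback_reason reason, long arg)
++ *	{
++ *		if (reason == CLUSTER_RECONFIG)
++ *			recheck_members();	 (hypothetical helper)
++ *	}
++ *
++ *	kcl_add_callback(my_member_event);
++ */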
++
++int kcl_remove_callback(void (*callback) (kcl_callback_reason, long arg))
++{
++ struct list_head *calllist;
++ struct list_head *temp;
++ struct kernel_notify_struct *notify;
++
++ down(&kernel_listener_lock);
++ list_for_each_safe(calllist, temp, &kernel_listener_list) {
++ notify = list_entry(calllist, struct kernel_notify_struct, list);
++ if (notify->callback == callback){
++			list_del(&notify->list);
++ kfree(notify);
++ up(&kernel_listener_lock);
++ return 0;
++ }
++ }
++ up(&kernel_listener_lock);
++ return -EINVAL;
++}
++
++/* Return quorate status */
++int kcl_is_quorate(void)
++{
++ return cluster_is_quorate;
++}
++
++/* Return the address list for a node */
++struct list_head *kcl_get_node_addresses(int nodeid)
++{
++ struct cluster_node *node = find_node_by_nodeid(nodeid);
++
++ if (node)
++ return &node->addr_list;
++ else
++ return NULL;
++}
++
++static void copy_to_kclnode(struct cluster_node *node,
++ struct kcl_cluster_node *knode)
++{
++ strcpy(knode->name, node->name);
++ knode->size = sizeof (struct kcl_cluster_node);
++ knode->votes = node->votes;
++ knode->state = node->state;
++ knode->node_id = node->node_id;
++ knode->us = node->us;
++ knode->leave_reason = node->leave_reason;
++ knode->incarnation = node->incarnation;
++}
++
++/* Return the info for a node given its address. If addr is NULL then return
++ * OUR info */
++int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
++ struct kcl_cluster_node *n)
++{
++ struct cluster_node *node;
++
++ /* They want us */
++ if (addr == NULL) {
++ node = us;
++ }
++ else {
++ node = find_node_by_addr(addr, addr_len);
++ if (!node)
++ return -1;
++ }
++
++ /* Copy to user's buffer */
++ copy_to_kclnode(node, n);
++ return 0;
++}
++
++int kcl_get_node_by_name(unsigned char *name, struct kcl_cluster_node *n)
++{
++ struct cluster_node *node;
++
++ /* They want us */
++ if (name == NULL) {
++ node = us;
++ if (node == NULL)
++ return -1;
++ }
++ else {
++ node = find_node_by_name(name);
++ if (!node)
++ return -1;
++ }
++
++ /* Copy to user's buffer */
++ copy_to_kclnode(node, n);
++ return 0;
++}
++
++/* As above but by node id. MUCH faster */
++int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n)
++{
++ struct cluster_node *node;
++
++ /* They want us */
++ if (nodeid == 0) {
++ node = us;
++ if (node == NULL)
++ return -1;
++ }
++ else {
++ node = find_node_by_nodeid(nodeid);
++ if (!node)
++ return -1;
++ }
++
++ /* Copy to user's buffer */
++ copy_to_kclnode(node, n);
++ return 0;
++}
++
++/* Return a list of all cluster members ever */
++int kcl_get_all_members(struct list_head *list)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ struct kcl_cluster_node *newnode;
++ int num_nodes = 0;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ if (list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++ newnode =
++ kmalloc(sizeof (struct kcl_cluster_node),
++ GFP_KERNEL);
++ if (newnode) {
++ copy_to_kclnode(node, newnode);
++ list_add(&newnode->list, list);
++ num_nodes++;
++ }
++ }
++ else {
++ num_nodes++;
++ }
++ }
++ up(&cluster_members_lock);
++
++ return num_nodes;
++}
++
++/* Return a list of cluster members */
++int kcl_get_members(struct list_head *list)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ struct kcl_cluster_node *newnode;
++ int num_nodes = 0;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER) {
++ if (list) {
++ newnode =
++ kmalloc(sizeof (struct kcl_cluster_node),
++ GFP_KERNEL);
++ if (newnode) {
++ copy_to_kclnode(node, newnode);
++ list_add(&newnode->list, list);
++ num_nodes++;
++ }
++ }
++ else {
++ num_nodes++;
++ }
++ }
++ }
++ up(&cluster_members_lock);
++
++ return num_nodes;
++}
++
++/* Copy the current members' nodeids into the supplied buffer */
++int kcl_get_member_ids(uint32_t *idbuf, int size)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ int num_nodes = 0;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER) {
++ if (idbuf && size) {
++ idbuf[num_nodes] = node->node_id;
++ num_nodes++;
++ size--;
++ }
++ else {
++ num_nodes++;
++ }
++ }
++ }
++ up(&cluster_members_lock);
++
++ return num_nodes;
++}
++
++/* Barrier API */
++int kcl_barrier_register(char *name, unsigned int flags, unsigned int nodes)
++{
++ struct cl_barrier *barrier;
++
++ /* We are not joined to a cluster */
++ if (!we_are_a_cluster_member)
++ return -ENOTCONN;
++
++ /* Must have a valid name */
++ if (name == NULL || strlen(name) > MAX_BARRIER_NAME_LEN - 1)
++ return -EINVAL;
++
++ /* We don't do this yet */
++ if (flags & BARRIER_ATTR_MULTISTEP)
++ return -ENOTSUPP;
++
++ down(&barrier_list_lock);
++
++	/* See if it already exists */
++	if ((barrier = find_barrier(name))) {
++		/* Drop the lock exactly once on every path out */
++		up(&barrier_list_lock);
++		if (nodes != barrier->expected_nodes) {
++			printk(KERN_WARNING CMAN_NAME
++			       ": Barrier registration failed for '%s', expected nodes=%d, requested=%d\n",
++			       name, barrier->expected_nodes, nodes);
++			return -EINVAL;
++		}
++		return 0;
++	}
++
++ /* Build a new struct and add it to the list */
++ barrier = kmalloc(sizeof (struct cl_barrier), GFP_KERNEL);
++ if (barrier == NULL) {
++ up(&barrier_list_lock);
++ return -ENOMEM;
++ }
++ memset(barrier, 0, sizeof (*barrier));
++
++ strcpy(barrier->name, name);
++ barrier->flags = flags;
++ barrier->expected_nodes = nodes;
++ atomic_set(&barrier->got_nodes, 0);
++ atomic_set(&barrier->completed_nodes, 0);
++ barrier->endreason = 0;
++ barrier->registered_nodes = 1;
++ spin_lock_init(&barrier->phase2_spinlock);
++ barrier->state = BARRIER_STATE_INACTIVE;
++ init_MUTEX(&barrier->lock);
++
++ list_add(&barrier->list, &barrier_list);
++ up(&barrier_list_lock);
++
++ return 0;
++}
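++
++/*
++ * Typical in-kernel barrier usage, as a hedged sketch (the barrier name
++ * and node count are hypothetical): register, optionally arm a timeout in
++ * seconds, then wait for every expected node to arrive.
++ *
++ *	error = kcl_barrier_register("EXSVC.startup", 0, 3);
++ *	if (!error)
++ *		kcl_barrier_setattr("EXSVC.startup",
++ *				    BARRIER_SETATTR_TIMEOUT, 30);
++ *	error = kcl_barrier_wait("EXSVC.startup");
++ */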
++
++static int barrier_setattr_enabled(struct cl_barrier *barrier,
++ unsigned int attr, unsigned long arg)
++{
++ int status;
++
++ /* Can't disable a barrier */
++ if (!arg) {
++ up(&barrier->lock);
++ return -EINVAL;
++ }
++
++ /* We need to send WAIT now because the user may not
++ * actually call kcl_barrier_wait() */
++ if (!barrier->waitsent) {
++ struct cl_barriermsg bmsg;
++
++ /* Send it to the rest of the cluster */
++ bmsg.cmd = CLUSTER_CMD_BARRIER;
++ bmsg.subcmd = BARRIER_WAIT;
++ strcpy(bmsg.name, barrier->name);
++
++ barrier->waitsent = 1;
++ barrier->phase = 1;
++
++ atomic_inc(&barrier->got_nodes);
++
++ /* Start the timer if one was wanted */
++ if (barrier->timeout) {
++ init_timer(&barrier->timer);
++ barrier->timer.function = barrier_timer_fn;
++ barrier->timer.data = (long) barrier;
++ mod_timer(&barrier->timer, jiffies + (barrier->timeout * HZ));
++ }
++
++ /* Barrier WAIT and COMPLETE messages are
++ * always queued - that way they always get
++ * sent out in the right order. If we don't do
++ * this then one can get sent out in the
++ * context of the user process and the other in
++ * cnxman and COMPLETE may /just/ slide in
++ * before WAIT if its in the queue
++ */
++		P_BARRIER("Sending WAIT for %s\n", barrier->name);
++ status = queue_message(&bmsg, sizeof (bmsg), NULL, 0, 0);
++ if (status < 0) {
++ up(&barrier->lock);
++ return status;
++ }
++
++ /* It might have been reached now */
++ if (barrier
++ && barrier->state != BARRIER_STATE_COMPLETE
++ && barrier->phase == 1)
++ check_barrier_complete_phase1(barrier);
++ }
++ if (barrier && barrier->state == BARRIER_STATE_COMPLETE) {
++ up(&barrier->lock);
++ return barrier->endreason;
++ }
++ up(&barrier->lock);
++	return 0;		/* Nothing to propagate */
++}
++
++int kcl_barrier_setattr(char *name, unsigned int attr, unsigned long arg)
++{
++ struct cl_barrier *barrier;
++
++ /* See if it already exists */
++ down(&barrier_list_lock);
++ if (!(barrier = find_barrier(name))) {
++ up(&barrier_list_lock);
++ return -ENOENT;
++ }
++ up(&barrier_list_lock);
++
++ down(&barrier->lock);
++ if (barrier->state == BARRIER_STATE_COMPLETE) {
++ up(&barrier->lock);
++ return 0;
++ }
++
++ switch (attr) {
++ case BARRIER_SETATTR_AUTODELETE:
++ if (arg)
++ barrier->flags |= BARRIER_ATTR_AUTODELETE;
++ else
++ barrier->flags &= ~BARRIER_ATTR_AUTODELETE;
++ up(&barrier->lock);
++ return 0;
++
++ case BARRIER_SETATTR_TIMEOUT:
++		/* Can only change the timeout of an inactive barrier */
++ if (barrier->state == BARRIER_STATE_WAITING
++ || barrier->waitsent) {
++ up(&barrier->lock);
++ return -EINVAL;
++ }
++ barrier->timeout = arg;
++ up(&barrier->lock);
++ return 0;
++
++ case BARRIER_SETATTR_MULTISTEP:
++ up(&barrier->lock);
++ return -ENOTSUPP;
++
++ case BARRIER_SETATTR_ENABLED:
++ return barrier_setattr_enabled(barrier, attr, arg);
++
++ case BARRIER_SETATTR_NODES:
++ /* Can only change the expected node count of an inactive
++ * barrier */
++		if (barrier->state == BARRIER_STATE_WAITING
++		    || barrier->waitsent) {
++			up(&barrier->lock);
++			return -EINVAL;
++		}
++		barrier->expected_nodes = arg;
++		break;
++
++	case BARRIER_SETATTR_CALLBACK:
++		if (barrier->state == BARRIER_STATE_WAITING
++		    || barrier->waitsent) {
++			up(&barrier->lock);
++			return -EINVAL;
++		}
++		barrier->callback = (void (*)(char *, int)) arg;
++		up(&barrier->lock);
++		return 0;	/* Don't propagate this to other nodes */
++ }
++
++ up(&barrier->lock);
++ return 0;
++}
++
++int kcl_barrier_delete(char *name)
++{
++ struct cl_barrier *barrier;
++
++ down(&barrier_list_lock);
++ /* See if it exists */
++ if (!(barrier = find_barrier(name))) {
++ up(&barrier_list_lock);
++ return -ENOENT;
++ }
++
++ /* Delete it */
++ list_del(&barrier->list);
++ kfree(barrier);
++
++ up(&barrier_list_lock);
++
++ return 0;
++}
++
++int kcl_barrier_cancel(char *name)
++{
++ struct cl_barrier *barrier;
++
++ /* See if it exists */
++ down(&barrier_list_lock);
++ if (!(barrier = find_barrier(name))) {
++ up(&barrier_list_lock);
++ return -ENOENT;
++ }
++ down(&barrier->lock);
++
++ barrier->endreason = -ENOTCONN;
++
++ if (barrier->callback) {
++ barrier->callback(barrier->name, -ECONNRESET);
++ barrier->callback = NULL;
++ }
++
++ if (barrier->timeout)
++ del_timer(&barrier->timer);
++
++ /* Remove it if it's AUTO-DELETE */
++ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
++ list_del(&barrier->list);
++ up(&barrier->lock);
++ kfree(barrier);
++ up(&barrier_list_lock);
++ return 0;
++ }
++
++ if (barrier->state == BARRIER_STATE_WAITING)
++ wake_up_interruptible(&barrier->waitq);
++
++ up(&barrier->lock);
++ up(&barrier_list_lock);
++ return 0;
++}
++
++int kcl_barrier_wait(char *name)
++{
++ struct cl_barrier *barrier;
++ int ret;
++
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++
++ /* Enable it */
++ kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, 1L);
++
++ down(&barrier_list_lock);
++
++ /* See if it still exists - enable may have deleted it! */
++ if (!(barrier = find_barrier(name))) {
++ up(&barrier_list_lock);
++ return -ENOENT;
++ }
++
++ down(&barrier->lock);
++
++ up(&barrier_list_lock);
++
++ /* If it has already completed then return the status */
++ if (barrier->state == BARRIER_STATE_COMPLETE) {
++ up(&barrier->lock);
++ return barrier->endreason;
++ }
++
++ barrier->state = BARRIER_STATE_WAITING;
++
++ /* Have we all reached the barrier? */
++ while (atomic_read(&barrier->completed_nodes) !=
++ ((barrier->expected_nodes == 0)
++ ? cluster_members : barrier->expected_nodes)
++ && barrier->endreason == 0) {
++
++ wait_queue_t wq;
++
++ init_waitqueue_entry(&wq, current);
++ init_waitqueue_head(&barrier->waitq);
++
++ /* Wait for em all */
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ add_wait_queue(&barrier->waitq, &wq);
++
++ if (atomic_read(&barrier->completed_nodes) !=
++ ((barrier->expected_nodes ==
++ 0) ? cluster_members : barrier->expected_nodes)
++ && barrier->endreason == 0) {
++ up(&barrier->lock);
++ schedule();
++ down(&barrier->lock);
++ }
++
++ remove_wait_queue(&barrier->waitq, &wq);
++ set_task_state(current, TASK_RUNNING);
++
++ if (signal_pending(current)) {
++ barrier->endreason = -EINTR;
++ break;
++ }
++ }
++ barrier->state = BARRIER_STATE_INACTIVE;
++
++ if (barrier->timeout)
++ del_timer(&barrier->timer);
++
++ /* Barrier has been reached on all nodes, call the callback */
++ if (barrier->callback) {
++ barrier->callback(barrier->name, barrier->endreason);
++ barrier->callback = NULL;
++ }
++
++ atomic_set(&barrier->got_nodes, 0);
++
++ /* Return the reason we were woken */
++ ret = barrier->endreason;
++
++ /* Remove it if it's AUTO-DELETE */
++ if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
++ down(&barrier_list_lock);
++ list_del(&barrier->list);
++ up(&barrier_list_lock);
++ up(&barrier->lock);
++ kfree(barrier);
++ }
++ else {
++ up(&barrier->lock);
++ }
++
++ /* We were woken up because the node left the cluster ? */
++ if (!atomic_read(&cnxman_running))
++ ret = -ENOTCONN;
++
++ return ret;
++}
++
++/* This is called from membership services when a node has left the cluster -
++ * we signal all waiting barriers with -ESRCH so they know to do something
++ * else. If the expected number of nodes is 0 then we compare the new number
++ * of nodes in the cluster with the number registered at the barrier and
++ * return 0 (success) when they match */
++void check_barrier_returns(void)
++{
++ struct list_head *blist;
++ struct list_head *llist;
++ struct cl_barrier *barrier;
++ int status = 0;
++
++ down(&barrier_list_lock);
++ list_for_each(blist, &barrier_list) {
++ barrier = list_entry(blist, struct cl_barrier, list);
++
++ if (barrier->waitsent) {
++ int wakeit = 0;
++
++ /* Check for a dynamic member barrier */
++ if (barrier->expected_nodes == 0) {
++ if (barrier->registered_nodes ==
++ cluster_members) {
++ status = 0;
++ wakeit = 1;
++ }
++ }
++ else {
++ status = -ESRCH;
++ wakeit = 1;
++ }
++
++ /* Do we need to tell the barrier? */
++ if (wakeit) {
++ if (barrier->state == BARRIER_STATE_WAITING) {
++ barrier->endreason = status;
++ wake_up_interruptible(&barrier->waitq);
++ }
++ else {
++ if (barrier->callback) {
++ barrier->callback(barrier->name,
++ status);
++ }
++ }
++ }
++ }
++ }
++ up(&barrier_list_lock);
++
++	/* Part 2: check for outstanding listen requests for dead nodes and
++	 * cancel them */
++ down(&listenreq_lock);
++ list_for_each(llist, &listenreq_list) {
++ struct cl_waiting_listen_request *lrequest =
++ list_entry(llist, struct cl_waiting_listen_request, list);
++ struct cluster_node *node =
++ find_node_by_nodeid(lrequest->nodeid);
++
++ if (node && node->state != NODESTATE_MEMBER) {
++ lrequest->result = -ENOTCONN;
++ lrequest->waiting = 0;
++ wake_up_interruptible(&lrequest->waitq);
++ }
++ }
++ up(&listenreq_lock);
++}
++
++int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen)
++{
++ struct temp_node *tn;
++ int err = 1; /* true */
++#ifdef DEBUG_COMMS
++ char buf[MAX_ADDR_PRINTED_LEN];
++#endif
++
++ down(&tempnode_lock);
++
++ list_for_each_entry(tn, &tempnode_list, list) {
++ if (tn->nodeid == nodeid) {
++ memcpy(addr, tn->addr, tn->addrlen);
++ *addrlen = tn->addrlen;
++ P_COMMS("get_temp_nodeid. id %d:\n: %s\n",
++ tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
++
++ goto out;
++ }
++ }
++ err = 0;
++
++ out:
++ up(&tempnode_lock);
++ return err;
++}
++
++/* Create a new temporary node ID. This list will only ever be very small
++ (usually only 1 item) but I can't take the risk that someone won't try to
++ boot 128 nodes all at exactly the same time. */
++int new_temp_nodeid(char *addr, int addrlen)
++{
++ struct temp_node *tn;
++ int err = -1;
++ int try_nodeid = 0;
++#ifdef DEBUG_COMMS
++ char buf[MAX_ADDR_PRINTED_LEN];
++#endif
++
++ P_COMMS("new_temp_nodeid needed for\n: %s\n",
++ print_addr(addr, addrlen, buf));
++
++ down(&tempnode_lock);
++
++ /* First see if we already know about this node */
++ list_for_each_entry(tn, &tempnode_list, list) {
++
++ P_COMMS("new_temp_nodeid list. id %d:\n: %s\n",
++ tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
++
++ /* We're already in here... */
++ if (tn->addrlen == addrlen &&
++ memcmp(tn->addr, addr, addrlen) == 0) {
++ P_COMMS("reused temp node ID %d\n", tn->nodeid);
++ err = tn->nodeid;
++ goto out;
++ }
++ }
++
++ /* Nope, OK, invent a suitable number */
++ retry:
++ try_nodeid -= 1;
++ list_for_each_entry(tn, &tempnode_list, list) {
++
++ if (tn->nodeid == try_nodeid)
++ goto retry;
++ }
++
++ tn = kmalloc(sizeof(struct temp_node), GFP_KERNEL);
++ if (!tn)
++ goto out;
++
++ memcpy(tn->addr, addr, addrlen);
++ tn->addrlen = addrlen;
++ tn->nodeid = try_nodeid;
++ list_add_tail(&tn->list, &tempnode_list);
++ err = try_nodeid;
++ P_COMMS("new temp nodeid = %d\n", try_nodeid);
++ out:
++ up(&tempnode_lock);
++ return err;
++}
++
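++/* An illustrative sketch of the temporary node ID lifecycle implied by the
++ * functions above: a joining node gets a negative temporary ID so it can
++ * never clash with a real (positive) node ID, the ID maps back to the node's
++ * address while the join is in progress, and it is discarded once the node
++ * becomes a full member. The buffer size here is invented for the example. */
++#if 0
++static void example_temp_nodeid_lifecycle(char *addr, int addrlen)
++{
++ char stored_addr[64]; /* assumed large enough for one address */
++ int stored_len;
++ int tmp_id;
++
++ /* Invent (or re-use) a negative ID for this address */
++ tmp_id = new_temp_nodeid(addr, addrlen);
++
++ /* While the join is in progress the ID maps back to the address */
++ if (get_addr_from_temp_nodeid(tmp_id, stored_addr, &stored_len)) {
++ /* ... talk to the joining node using stored_addr ... */
++ }
++
++ /* Once the node is a full member the temporary ID is dropped */
++ remove_temp_nodeid(tmp_id);
++}
++#endif
++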
++static int is_valid_temp_nodeid(int nodeid)
++{
++ struct temp_node *tn;
++ int err = 1; /* true */
++
++ down(&tempnode_lock);
++
++ list_for_each_entry(tn, &tempnode_list, list) {
++ if (tn->nodeid == nodeid)
++ goto out;
++ }
++ err = 0;
++
++ out:
++ P_COMMS("is_valid_temp_nodeid. %d = %d\n", nodeid, err);
++ up(&tempnode_lock);
++ return err;
++}
++
++/* TODO: This needs to clean the list more fully of nodes that are now
++ full members but whose transition we did not master */
++void remove_temp_nodeid(int nodeid)
++{
++ struct temp_node *tn;
++ struct temp_node *tmp;
++
++ down(&tempnode_lock);
++
++ list_for_each_entry_safe(tn, tmp, &tempnode_list, list) {
++ if (nodeid == tn->nodeid) {
++ list_del(&tn->list);
++ kfree(tn);
++ up(&tempnode_lock);
++ return;
++ }
++ }
++
++ up(&tempnode_lock);
++}
++
++/* Quorum device functions */
++int kcl_register_quorum_device(char *name, int votes)
++{
++ if (quorum_device)
++ return -EBUSY;
++
++ if (find_node_by_name(name))
++ return -EINVAL;
++
++ quorum_device = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
++ if (!quorum_device)
++ return -ENOMEM;
++ memset(quorum_device, 0, sizeof (struct cluster_node));
++
++ quorum_device->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
++ if (!quorum_device->name) {
++ kfree(quorum_device);
++ quorum_device = NULL;
++ return -ENOMEM;
++ }
++
++ strcpy(quorum_device->name, name);
++ quorum_device->votes = votes;
++ quorum_device->state = NODESTATE_DEAD;
++
++ /* Keep this list valid so it doesn't confuse other code */
++ INIT_LIST_HEAD(&quorum_device->addr_list);
++
++ return 0;
++}
++
++int kcl_unregister_quorum_device(void)
++{
++ if (!quorum_device)
++ return -EINVAL;
++ if (quorum_device->state == NODESTATE_MEMBER)
++ return -EINVAL;
++
++ /* Don't leak the node struct we allocated at registration time */
++ kfree(quorum_device->name);
++ kfree(quorum_device);
++ quorum_device = NULL;
++
++ return 0;
++}
++
++int kcl_quorum_device_available(int yesno)
++{
++ if (!quorum_device)
++ return -EINVAL;
++
++ if (yesno) {
++ quorum_device->last_hello = jiffies;
++ if (quorum_device->state == NODESTATE_DEAD) {
++ quorum_device->state = NODESTATE_MEMBER;
++ recalculate_quorum(0);
++ }
++ }
++ else {
++ if (quorum_device->state == NODESTATE_MEMBER) {
++ quorum_device->state = NODESTATE_DEAD;
++ recalculate_quorum(0);
++ }
++ }
++
++ return 0;
++}
++
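++/* An illustrative sketch of how a quorum-device driver might use the API
++ * above: register a pseudo-node worth some votes, then report it available
++ * or unavailable as its polling succeeds or fails. The device name, vote
++ * count and poll function are invented for the example. */
++#if 0
++static int example_qdisk_init(void)
++{
++ /* One extra vote counts towards quorum while the device is alive */
++ return kcl_register_quorum_device("example-qdisk", 1);
++}
++
++static int example_qdisk_poll(void)
++{
++ int alive = example_read_heartbeat_sector(); /* hypothetical */
++
++ /* Tell cnxman whether our votes should count towards quorum */
++ return kcl_quorum_device_available(alive);
++}
++#endif
++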
++/* APIs for cluster ref counting. */
++int kcl_addref_cluster()
++{
++ int ret = -ENOTCONN;
++
++ if (!atomic_read(&cnxman_running))
++ goto addref_ret;
++
++ if (try_module_get(THIS_MODULE)) {
++ atomic_inc(&use_count);
++ ret = 0;
++ }
++
++ addref_ret:
++ return ret;
++}
++
++int kcl_releaseref_cluster()
++{
++ if (!atomic_read(&cnxman_running))
++ return -ENOTCONN;
++ atomic_dec(&use_count);
++ module_put(THIS_MODULE);
++ return 0;
++}
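++
++/* An illustrative sketch of the reference counting contract above: a
++ * subsystem takes a reference for as long as it depends on cluster
++ * services, and errors out if the cluster is not running. */
++#if 0
++static int example_use_cluster(void)
++{
++ int ret = kcl_addref_cluster();
++ if (ret)
++ return ret; /* cluster not running */
++
++ /* ... use cluster services (kcl_sendmsg, barriers, ...) ... */
++
++ kcl_releaseref_cluster();
++ return 0;
++}
++#endif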
++
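++/* Note for callers: the name returned in *cname is allocated with kmalloc()
++ * and must be freed with kfree() when no longer needed. */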
++int kcl_cluster_name(char **cname)
++{
++ char *name;
++
++ name = kmalloc(strlen(cluster_name) + 1, GFP_KERNEL);
++ if (!name)
++ return -ENOMEM;
++
++ strcpy(name, cluster_name);
++ *cname = name;
++ return 0;
++}
++
++int kcl_get_current_interface(void)
++{
++ return current_interface->number;
++}
++
++/* Socket registration stuff */
++static struct net_proto_family cl_family_ops = {
++ .family = AF_CLUSTER,
++ .create = cl_create
++};
++
++static struct proto_ops cl_proto_ops = {
++ .family = AF_CLUSTER,
++
++ .release = cl_release,
++ .bind = cl_bind,
++ .connect = sock_no_connect,
++ .socketpair = sock_no_socketpair,
++ .accept = sock_no_accept,
++ .getname = cl_getname,
++ .poll = cl_poll,
++ .ioctl = cl_ioctl,
++ .listen = sock_no_listen,
++ .shutdown = cl_shutdown,
++ .setsockopt = cl_setsockopt,
++ .getsockopt = cl_getsockopt,
++ .sendmsg = cl_sendmsg,
++ .recvmsg = cl_recvmsg,
++ .mmap = sock_no_mmap,
++ .sendpage = sock_no_sendpage,
++};
++
++#ifdef MODULE
++MODULE_DESCRIPTION("Cluster Connection and Service Manager");
++MODULE_AUTHOR("Red Hat, Inc");
++MODULE_LICENSE("GPL");
++#endif
++
++static int __init cluster_init(void)
++{
++ printk("CMAN %s (built %s %s) installed\n",
++ CMAN_RELEASE_NAME, __DATE__, __TIME__);
++
++ /* Allocate our sock slab cache */
++ cluster_sk_cachep = kmem_cache_create("cluster_sock",
++ sizeof (struct cluster_sock), 0,
++ SLAB_HWCACHE_ALIGN, NULL, NULL);
++ if (!cluster_sk_cachep) {
++ printk(KERN_CRIT
++ "cluster_init: Cannot create cluster_sock SLAB cache\n");
++ return -ENOMEM;
++ }
++
++ if (sock_register(&cl_family_ops)) {
++ printk(KERN_INFO "Unable to register cluster socket type\n");
++ kmem_cache_destroy(cluster_sk_cachep);
++ return -1;
++ }
++
++#ifdef CONFIG_PROC_FS
++ create_proc_entries();
++#endif
++
++ init_MUTEX(&start_thread_sem);
++ init_MUTEX(&send_lock);
++ init_MUTEX(&barrier_list_lock);
++ init_MUTEX(&cluster_members_lock);
++ init_MUTEX(&port_array_lock);
++ init_MUTEX(&messages_list_lock);
++ init_MUTEX(&listenreq_lock);
++ init_MUTEX(&client_socket_lock);
++ init_MUTEX(&new_dead_node_lock);
++ init_MUTEX(&event_listener_lock);
++ init_MUTEX(&kernel_listener_lock);
++ init_MUTEX(&tempnode_lock);
++ spin_lock_init(&active_socket_lock);
++ init_timer(&ack_timer);
++
++ INIT_LIST_HEAD(&event_listener_list);
++ INIT_LIST_HEAD(&kernel_listener_list);
++ INIT_LIST_HEAD(&socket_list);
++ INIT_LIST_HEAD(&client_socket_list);
++ INIT_LIST_HEAD(&active_socket_list);
++ INIT_LIST_HEAD(&barrier_list);
++ INIT_LIST_HEAD(&messages_list);
++ INIT_LIST_HEAD(&listenreq_list);
++ INIT_LIST_HEAD(&cluster_members_list);
++ INIT_LIST_HEAD(&new_dead_node_list);
++ INIT_LIST_HEAD(&tempnode_list);
++
++ atomic_set(&cnxman_running, 0);
++
++ sm_init();
++
++ return 0;
++}
++
++static void __exit cluster_exit(void)
++{
++#ifdef CONFIG_PROC_FS
++ cleanup_proc_entries();
++#endif
++
++ sock_unregister(AF_CLUSTER);
++ kmem_cache_destroy(cluster_sk_cachep);
++}
++
++module_init(cluster_init);
++module_exit(cluster_exit);
++
++EXPORT_SYMBOL(kcl_sendmsg);
++EXPORT_SYMBOL(kcl_register_read_callback);
++EXPORT_SYMBOL(kcl_add_callback);
++EXPORT_SYMBOL(kcl_remove_callback);
++EXPORT_SYMBOL(kcl_get_members);
++EXPORT_SYMBOL(kcl_get_member_ids);
++EXPORT_SYMBOL(kcl_get_all_members);
++EXPORT_SYMBOL(kcl_is_quorate);
++EXPORT_SYMBOL(kcl_get_node_by_addr);
++EXPORT_SYMBOL(kcl_get_node_by_name);
++EXPORT_SYMBOL(kcl_get_node_by_nodeid);
++EXPORT_SYMBOL(kcl_get_node_addresses);
++EXPORT_SYMBOL(kcl_addref_cluster);
++EXPORT_SYMBOL(kcl_releaseref_cluster);
++EXPORT_SYMBOL(kcl_cluster_name);
++
++EXPORT_SYMBOL(kcl_barrier_register);
++EXPORT_SYMBOL(kcl_barrier_setattr);
++EXPORT_SYMBOL(kcl_barrier_delete);
++EXPORT_SYMBOL(kcl_barrier_wait);
++EXPORT_SYMBOL(kcl_barrier_cancel);
++
++EXPORT_SYMBOL(kcl_register_quorum_device);
++EXPORT_SYMBOL(kcl_unregister_quorum_device);
++EXPORT_SYMBOL(kcl_quorum_device_available);
++
++EXPORT_SYMBOL(kcl_register_service);
++EXPORT_SYMBOL(kcl_unregister_service);
++EXPORT_SYMBOL(kcl_join_service);
++EXPORT_SYMBOL(kcl_leave_service);
++EXPORT_SYMBOL(kcl_global_service_id);
++EXPORT_SYMBOL(kcl_start_done);
++EXPORT_SYMBOL(kcl_get_services);
++EXPORT_SYMBOL(kcl_get_current_interface);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -urN linux-orig/cluster/cman/config.c linux-patched/cluster/cman/config.c
+--- linux-orig/cluster/cman/config.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/config.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,46 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "config.h"
++
++/* Config file defaults */
++
++#define DEFAULT_JOIN_WAIT_TIME 11 /* Time to wait while sending JOINREQ
++ * messages. Should be at least twice
++ * the HELLO timer */
++#define DEFAULT_JOIN_TIMEOUT 30 /* How long we wait after getting a
++ * JOINACK before giving up on the
++ * join */
++#define DEFAULT_HELLO_TIMER 5 /* Period between HELLO messages */
++#define DEFAULT_DEADNODE_TIMER 21 /* If we don't get a message from a
++ * node in this period kill it */
++#define DEFAULT_TRANSITION_TIMER 15 /* Maximum time a state transition
++ * should take */
++#define DEFAULT_JOINCONF_TIMER 5 /* Time allowed to a node to respond to
++ * a JOINCONF message */
++#define DEFAULT_MAX_NODES 128 /* Max allowed nodes */
++#define DEFAULT_TRANSITION_RESTARTS 10 /* Maximum number of transition
++ * restarts before we die */
++#define DEFAULT_SM_DEBUG_SIZE 256 /* Size in bytes of SM debug buffer */
++
++struct config_info cman_config = {
++ .joinwait_timeout = DEFAULT_JOIN_WAIT_TIME,
++ .joinconf_timeout = DEFAULT_JOINCONF_TIMER,
++ .join_timeout = DEFAULT_JOIN_TIMEOUT,
++ .hello_timer = DEFAULT_HELLO_TIMER,
++ .deadnode_timeout = DEFAULT_DEADNODE_TIMER,
++ .transition_timeout = DEFAULT_TRANSITION_TIMER,
++ .transition_restarts = DEFAULT_TRANSITION_RESTARTS,
++ .max_nodes = DEFAULT_MAX_NODES,
++ .sm_debug_size = DEFAULT_SM_DEBUG_SIZE,
++};
+diff -urN linux-orig/cluster/cman/config.h linux-patched/cluster/cman/config.h
+--- linux-orig/cluster/cman/config.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/config.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,31 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __CONFIG_DOT_H__
++#define __CONFIG_DOT_H__
++
++struct config_info {
++ int joinwait_timeout;
++ int joinconf_timeout;
++ int join_timeout;
++ int hello_timer;
++ int deadnode_timeout;
++ int transition_timeout;
++ int transition_restarts;
++ int max_nodes;
++ int sm_debug_size;
++};
++
++extern struct config_info cman_config;
++
++#endif /* __CONFIG_DOT_H__ */
+diff -urN linux-orig/cluster/cman/kjoin.c linux-patched/cluster/cman/kjoin.c
+--- linux-orig/cluster/cman/kjoin.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/kjoin.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,238 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/socket.h>
++#include <net/sock.h>
++#include <linux/list.h>
++#include <cluster/cnxman.h>
++#include <linux/in.h>
++
++#include "cnxman-private.h"
++
++static struct socket *mcast_sock;
++static struct socket *recv_sock;
++static struct socket *cluster_sock;
++
++extern short cluster_id;
++extern int join_count;
++extern struct semaphore join_count_lock;
++extern atomic_t cnxman_running;
++
++int kcl_join_cluster(struct cl_join_cluster_info *join_info)
++{
++ int result;
++ int one = 1, error;
++ unsigned int ipaddr = join_info->ipaddr, brdaddr = join_info->brdaddr;
++ unsigned short port = join_info->port;
++ mm_segment_t fs;
++ struct sockaddr_in saddr;
++ struct kcl_multicast_sock mcast_info;
++
++ down(&join_count_lock);
++ if (atomic_read(&cnxman_running))
++ {
++ error = 0;
++ if (join_info->cluster_id == cluster_id)
++ join_count++;
++ else
++ error = -EINVAL;
++ up(&join_count_lock);
++ return error;
++ }
++ up(&join_count_lock);
++
++ result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &mcast_sock);
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME ": Can't create Multicast socket\n");
++ return result;
++ }
++
++ result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &recv_sock);
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME ": Can't create Receive socket\n");
++ sock_release(mcast_sock);
++ return result;
++ }
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ if ((error = sock_setsockopt(mcast_sock, SOL_SOCKET, SO_BROADCAST,
++ (void *) &one, sizeof (int))))
++ {
++ set_fs(fs);
++ printk("Error %d Setting master socket to SO_BROADCAST\n",
++ error);
++ sock_release(mcast_sock);
++ return -1;
++ }
++ set_fs(fs);
++
++ /* Bind the multicast socket */
++ saddr.sin_family = AF_INET;
++ saddr.sin_port = htons(port);
++ saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
++ result =
++ mcast_sock->ops->bind(mcast_sock, (struct sockaddr *) &saddr,
++ sizeof (saddr));
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME ": Can't bind multicast socket\n");
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ return result;
++ }
++
++ /* Bind the receive socket to our IP address */
++ saddr.sin_family = AF_INET;
++ saddr.sin_port = htons(port);
++ saddr.sin_addr.s_addr = cpu_to_be32(ipaddr);
++ result =
++ recv_sock->ops->bind(recv_sock, (struct sockaddr *) &saddr,
++ sizeof (saddr));
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME ": Can't bind receive socket\n");
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ return result;
++ }
++
++ /* Create the cluster master socket */
++ result =
++ sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER, &cluster_sock);
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME
++ ": Can't create cluster master socket\n");
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ return result;
++ }
++
++ /* This is the broadcast transmit address */
++ saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
++
++ /* Pass the multicast socket to kernel space */
++ mcast_info.sock = mcast_sock;
++ mcast_info.number = 1;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ if ((error = cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
++ KCL_SET_MULTICAST,
++ (void *) &mcast_info,
++ sizeof (mcast_info))))
++ {
++ set_fs(fs);
++ printk(CMAN_NAME
++ ": Unable to pass multicast socket to cnxman, %d\n",
++ error);
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ sock_release(cluster_sock);
++ return -1;
++ }
++
++ mcast_info.sock = recv_sock;
++ if ((error =
++ cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
++ KCL_SET_RCVONLY,
++ (void *) &mcast_info,
++ sizeof (mcast_info))))
++ {
++ set_fs(fs);
++ printk(CMAN_NAME
++ ": Unable to pass receive socket to cnxman, %d\n",
++ error);
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ sock_release(cluster_sock);
++ return -1;
++ }
++
++ /* This setsockopt expects usermode variables */
++
++ if (cluster_sock->ops->
++ setsockopt(cluster_sock, CLPROTO_MASTER, CLU_JOIN_CLUSTER,
++ (void *) join_info,
++ sizeof (struct cl_join_cluster_info)))
++
++ {
++ set_fs(fs);
++ printk(CMAN_NAME ": Unable to join cluster\n");
++ sock_release(mcast_sock);
++ sock_release(recv_sock);
++ sock_release(cluster_sock);
++ return -1;
++ }
++ set_fs(fs);
++
++ return 0;
++}
++
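++/* An illustrative sketch of an in-kernel caller joining the cluster via the
++ * function above. The addresses, port and cluster ID are invented for the
++ * example; only fields referenced by kcl_join_cluster() are shown, and the
++ * full struct is defined in cnxman.h. */
++#if 0
++static int example_join(void)
++{
++ struct cl_join_cluster_info join_info;
++
++ memset(&join_info, 0, sizeof (join_info));
++ join_info.ipaddr = 0xc0a80001; /* 192.168.0.1, host byte order */
++ join_info.brdaddr = 0xc0a800ff; /* 192.168.0.255 */
++ join_info.port = 6809; /* hypothetical port number */
++ join_info.cluster_id = 1;
++
++ return kcl_join_cluster(&join_info);
++}
++#endif
++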
++int kcl_leave_cluster(int remove)
++{
++ mm_segment_t fs;
++ int rem = remove;
++ int ret = 0;
++ struct socket *shutdown_sock = cluster_sock;
++
++ cluster_sock = NULL;
++
++ if (!shutdown_sock)
++ {
++ /* Create the cluster master socket */
++ int result =
++ sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER,
++ &shutdown_sock);
++ if (result < 0)
++ {
++ printk(KERN_ERR CMAN_NAME
++ ": Can't create cluster master socket\n");
++ return result;
++ }
++ }
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ if ((ret =
++ shutdown_sock->ops->setsockopt(shutdown_sock, CLPROTO_MASTER,
++ CLU_LEAVE_CLUSTER, (void *) &rem,
++ sizeof (int))))
++ {
++ printk(KERN_ERR CMAN_NAME ": Unable to leave cluster, %d\n",
++ ret);
++ }
++ set_fs(fs);
++
++ sock_release(shutdown_sock);
++
++ return ret;
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -urN linux-orig/cluster/cman/membership.c linux-patched/cluster/cman/membership.c
+--- linux-orig/cluster/cman/membership.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/membership.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,3069 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/socket.h>
++#include <net/sock.h>
++#include <linux/slab.h>
++#include <linux/spinlock.h>
++#include <linux/vmalloc.h>
++#include <asm/uaccess.h>
++#include <linux/list.h>
++#include <cluster/cnxman.h>
++
++#include "cnxman-private.h"
++#include "config.h"
++#include "sm_control.h"
++
++#ifndef TRUE
++#define TRUE 1
++#endif
++
++/* Barrier name for membership transitions. %d is the cluster generation number
++ */
++#define MEMBERSHIP_BARRIER_NAME "TRANSITION.%d"
++
++/* Variables also used by connection manager */
++struct list_head cluster_members_list;
++struct semaphore cluster_members_lock;
++int cluster_members; /* Number of ACTIVE members, not a count of
++ * nodes in the list */
++int we_are_a_cluster_member = 0;
++int cluster_is_quorate;
++int quit_threads = 0;
++struct task_struct *membership_task;
++struct cluster_node *us;
++
++static struct task_struct *hello_task;
++static struct semaphore hello_task_lock;
++
++/* Variables that belong to the connection manager */
++extern wait_queue_head_t cnxman_waitq;
++extern struct completion member_thread_comp;
++extern struct cluster_node *quorum_device;
++extern unsigned short two_node;
++extern char cluster_name[];
++extern unsigned int config_version;
++extern unsigned int address_length;
++
++static struct socket *mem_socket;
++static pid_t kcluster_pid;
++
++static char iobuf[MAX_CLUSTER_MESSAGE];
++static char scratchbuf[MAX_CLUSTER_MESSAGE + 100];
++
++/* Our node name, usually system_utsname.nodename, but can be overridden */
++char nodename[MAX_CLUSTER_MEMBER_NAME_LEN + 1];
++
++static spinlock_t members_by_nodeid_lock;
++static int sizeof_members_array = 0; /* Can dynamically increase (vmalloc
++ * permitting) */
++static struct cluster_node **members_by_nodeid;
++
++#define MEMBER_INCREMENT_SIZE 10
++
++static int votes = 1; /* Votes this node has */
++static int expected_votes = 1; /* Total expected votes in the cluster */
++static unsigned int quorum; /* Quorum, fewer votes than this and we stop
++ * work */
++static int leavereason; /* Saved for the duration of a state transition */
++static int transitionreason; /* Reason this transition was initiated */
++static unsigned int highest_nodeid; /* Highest node ID known to the cluster */
++static struct timer_list transition_timer; /* Kicks in if the transition
++ * doesn't complete in a
++ * reasonable time */
++static struct timer_list hello_timer; /* Timer to send HELLOs on */
++static unsigned long join_time; /* The time that we got our JOIN-ACK */
++static unsigned long start_time; /* The time that we were started */
++static int joinconf_count; /* Number of JOINCONF messages we have sent to
++ * a new node */
++static unsigned long wake_flags;/* Reason we were woken */
++
++/* Flags in above */
++#define WAKE_FLAG_DEADNODE 1
++#define WAKE_FLAG_TRANSTIMER 2
++
++/* The time the transition finished */
++static unsigned long transition_end_time;
++
++/* A list of nodes that cnxman tells us are dead. I hope this never has more
++ * than one element in it but I can't take that chance. Only non-static so it
++ * can be initialised at module load. */
++struct list_head new_dead_node_list;
++struct semaphore new_dead_node_lock;
++
++static int do_membership_packet(struct msghdr *msg, int len);
++static int do_process_joinreq(struct msghdr *msg, int len);
++static int do_process_joinack(struct msghdr *msg, int len);
++static int do_process_joinconf(struct msghdr *msg, int len);
++static int do_process_leave(struct msghdr *msg, int len);
++static int do_process_hello(struct msghdr *msg, int len);
++static int do_process_kill(struct msghdr *msg, int len);
++static int do_process_reconfig(struct msghdr *msg, int len);
++static int do_process_starttrans(struct msghdr *msg, int len);
++static int do_process_masterview(struct msghdr *msg, int len);
++static int do_process_endtrans(struct msghdr *msg, int len);
++static int do_process_viewack(struct msghdr *msg, int len);
++static int do_process_startack(struct msghdr *msg, int len);
++static int do_process_newcluster(struct msghdr *msg, int len);
++static int do_process_nominate(struct msghdr *msg, int len);
++static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
++ unsigned int flags);
++static int send_joinreq(struct sockaddr_cl *addr, int addr_len);
++static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id);
++static int send_hello(void);
++static int send_master_hello(void);
++static int send_newcluster(void);
++static int end_transition(void);
++static int dispatch_messages(struct socket *mem_socket);
++static void check_for_dead_nodes(void);
++static void confirm_joiner(void);
++static void reset_hello_time(void);
++static int add_us(void);
++static int send_joinconf(void);
++static int init_membership_services(void);
++static int elect_master(struct cluster_node **);
++static void trans_timer_expired(unsigned long arg);
++static void hello_timer_expired(unsigned long arg);
++static void join_or_form_cluster(void);
++static int do_timer_wakeup(void);
++static int start_transition(unsigned char reason, struct cluster_node *node);
++int send_leave(unsigned char);
++int send_reconfigure(int, unsigned int);
++
++#ifdef DEBUG_MEMB
++static char *msgname(int msg);
++static int debug_sendmsg(struct socket *sock, void *buf, int size,
++ struct sockaddr_cl *caddr, int addr_len,
++ unsigned int flags)
++{
++ P_MEMB("%ld: sending %s, len=%d\n", jiffies, msgname(((char *) buf)[0]),
++ size);
++ return kcl_sendmsg(sock, buf, size, caddr, addr_len, flags);
++}
++
++#define kcl_sendmsg debug_sendmsg
++#endif
++
++/* State of the node */
++static enum { STARTING, JOINING, JOINWAIT, JOINACK, TRANSITION,
++ TRANSITION_COMPLETE, MEMBER, REJECTED, LEFT_CLUSTER, MASTER
++} node_state = STARTING;
++
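++/* Roughly, a joining node moves through these states (see the message
++ * handlers below): STARTING until another cluster is heard from; JOINWAIT
++ * and JOINING while the JOINREQ is sent; JOINACK once the master
++ * acknowledges it; TRANSITION and TRANSITION_COMPLETE while the membership
++ * transition runs; and finally MEMBER. A node that hears nothing at all
++ * forms its own cluster and goes straight to MEMBER. */
++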
++/* Sub-state when we are MASTER */
++static enum { MASTER_START, MASTER_COLLECT, MASTER_CONFIRM,
++ MASTER_COMPLETE } master_state;
++
++/* Number of responses collected while we are the master controlling a
++ * state transition */
++static int responses_collected;
++static int responses_expected;
++
++/* Current cluster generation number */
++static int cluster_generation = 1;
++
++/* When another node initiates a transition, store its pointer here so we can
++ * check for other nodes trying to spoof us */
++static struct cluster_node *master_node = NULL;
++
++/* The node that is currently trying to join us */
++static struct cluster_node *joining_node = NULL;
++static int joining_temp_nodeid = 0;
++
++/* Last time a HELLO message was sent */
++unsigned long last_hello = 0;
++
++/* When we got our JOINWAIT or NEWCLUSTER */
++unsigned long joinwait_time = 0;
++
++/* Number of times a transition has restarted when we were master */
++int transition_restarts = 0;
++
++/* Variables used by the master to collect cluster status during a transition */
++static int agreeing_nodes = 0;
++static int dissenting_nodes = 0;
++static uint8_t *node_opinion = NULL;
++#define OPINION_AGREE 1
++#define OPINION_DISAGREE 2
++
++/* Set node id of a node, also add it to the members array and expand the array
++ * if necessary */
++static inline void set_nodeid(struct cluster_node *node, int nodeid)
++{
++ if (!nodeid)
++ return;
++
++ node->node_id = nodeid;
++ if (nodeid >= sizeof_members_array) {
++ /* The array only has sizeof_members_array entries, so
++ * storing index nodeid needs new_size > nodeid */
++ int new_size = sizeof_members_array + MEMBER_INCREMENT_SIZE;
++ struct cluster_node **new_array;
++
++ if (nodeid >= new_size)
++ new_size = nodeid + MEMBER_INCREMENT_SIZE;
++
++ new_array = vmalloc(new_size * sizeof (struct cluster_node *));
++ if (new_array) {
++ spin_lock(&members_by_nodeid_lock);
++ memcpy(new_array, members_by_nodeid,
++ sizeof_members_array *
++ sizeof (struct cluster_node *));
++ memset(&new_array[sizeof_members_array], 0,
++ (new_size - sizeof_members_array) *
++ sizeof (struct cluster_node *));
++ vfree(members_by_nodeid);
++ members_by_nodeid = new_array;
++ sizeof_members_array = new_size;
++ spin_unlock(&members_by_nodeid_lock);
++ }
++ else {
++ panic("No memory for more nodes");
++ }
++ }
++ notify_kernel_listeners(NEWNODE, (long) nodeid);
++
++ spin_lock(&members_by_nodeid_lock);
++ members_by_nodeid[nodeid] = node;
++ spin_unlock(&members_by_nodeid_lock);
++}
++
++static int hello_kthread(void *unused)
++{
++ struct task_struct *tsk = current;
++ sigset_t tmpsig;
++
++ daemonize("cman_hbeat");
++
++ /* Allow only SIGKILL/SIGSTOP/SIGTERM to reach this thread; note
++ * that sigmask() is needed to turn signal numbers into mask bits */
++ siginitsetinv(&tmpsig, sigmask(SIGKILL) | sigmask(SIGSTOP) |
++ sigmask(SIGTERM));
++ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
++
++ down(&hello_task_lock);
++ hello_task = tsk;
++ up(&hello_task_lock);
++
++ set_user_nice(current, -6);
++
++ while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
++ send_hello();
++
++ /* Scan the nodes list for dead nodes */
++ if (node_state == MEMBER)
++ check_for_dead_nodes();
++
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ schedule();
++ set_task_state(current, TASK_RUNNING);
++ }
++ down(&hello_task_lock);
++ hello_task = NULL;
++ up(&hello_task_lock);
++ P_MEMB("heartbeat closing down\n");
++ return 0;
++}
++
++/* This is the membership "daemon". A client of cnxman (but symbiotic with it)
++ * that keeps track of and controls cluster membership. */
++static int membership_kthread(void *unused)
++{
++ struct task_struct *tsk = current;
++ struct socket *tmp_socket;
++ sigset_t tmpsig;
++
++ daemonize("cman_memb");
++
++ /* Allow only SIGKILL/SIGSTOP/SIGTERM to reach this thread */
++ siginitsetinv(&tmpsig, sigmask(SIGKILL) | sigmask(SIGSTOP) |
++ sigmask(SIGTERM));
++ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
++
++ membership_task = tsk;
++ set_user_nice(current, -5);
++
++ /* Open the socket */
++ if (init_membership_services())
++ return -1;
++
++ add_us();
++ joining_node = us;
++
++ init_timer(&hello_timer);
++ hello_timer.function = hello_timer_expired;
++ hello_timer.data = 0L;
++
++ /* Do joining stuff */
++ join_or_form_cluster();
++
++ transition_end_time = jiffies;
++
++ /* Main loop */
++ while (node_state != REJECTED && node_state != LEFT_CLUSTER) {
++
++ DECLARE_WAITQUEUE(wait, current);
++
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ add_wait_queue(mem_socket->sk->sk_sleep, &wait);
++
++ if (!skb_peek(&mem_socket->sk->sk_receive_queue) &&
++ wake_flags == 0) {
++ if (node_state == JOINACK ||
++ node_state == JOINWAIT)
++ schedule_timeout(HZ);
++ else
++ schedule();
++ }
++
++ set_task_state(current, TASK_RUNNING);
++ remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
++
++ /* Are we being shut down? */
++ if (node_state == LEFT_CLUSTER || quit_threads ||
++ signal_pending(current))
++ break;
++
++ /* Were we woken by a dead node passed down from cnxman ? */
++ if (test_and_clear_bit(WAKE_FLAG_DEADNODE, &wake_flags)) {
++ struct list_head *nodelist, *tmp;
++ struct cl_new_dead_node *deadnode;
++
++ down(&new_dead_node_lock);
++ list_for_each_safe(nodelist, tmp, &new_dead_node_list) {
++ deadnode =
++ list_entry(nodelist,
++ struct cl_new_dead_node, list);
++
++ if (deadnode->node->state == NODESTATE_MEMBER)
++ a_node_just_died(deadnode->node);
++ list_del(&deadnode->list);
++ kfree(deadnode);
++ }
++ up(&new_dead_node_lock);
++ }
++
++ /* Process received messages. If dispatch_message() returns an
++ * error then we shut down */
++ if (skb_peek(&mem_socket->sk->sk_receive_queue)) {
++ if (dispatch_messages(mem_socket) < 0)
++ goto leave_cluster;
++
++ }
++
++ /* Were we woken by the transition timer firing ? */
++ if (test_and_clear_bit(WAKE_FLAG_TRANSTIMER, &wake_flags)) {
++ switch (do_timer_wakeup()) {
++ case -1:
++ continue;
++ case 0:
++ break;
++ case +1:
++ goto leave_cluster;
++ }
++ }
++
++ /* Got a JOINACK but no JOIN-CONF, start waiting for HELLO
++ * messages again */
++ if (node_state == JOINACK
++ && time_after(jiffies,
++ join_time + cman_config.join_timeout * HZ)) {
++ P_MEMB
++ ("Waited a long time for a join-conf, going back to JOINWAIT state\n");
++ node_state = JOINWAIT;
++ joinwait_time = jiffies;
++ }
++
++ /* Have we been in joinwait for too long... */
++ if (node_state == JOINWAIT
++ && time_after(jiffies, joinwait_time +
++ cman_config.join_timeout * HZ)) {
++ printk(CMAN_NAME
++ ": Been in JOINWAIT for too long - giving up\n");
++ goto leave_cluster;
++ }
++ }
++
++ leave_cluster:
++
++ /* Wake up the heartbeat thread so it can exit */
++ down(&hello_task_lock);
++ if (hello_task)
++ wake_up_process(hello_task);
++ up(&hello_task_lock);
++
++ if (timer_pending(&hello_timer))
++ del_timer(&hello_timer);
++
++ if (timer_pending(&transition_timer))
++ del_timer(&transition_timer);
++
++ node_state = LEFT_CLUSTER;
++ P_MEMB("closing down\n");
++ quit_threads = 1; /* force other thread to exit too */
++
++ /* Close the socket, NULL the pointer first so it doesn't get used
++ * by send_leave()
++ */
++ tmp_socket = mem_socket;
++ mem_socket = NULL;
++ sock_release(tmp_socket);
++ highest_nodeid = 0;
++ complete(&member_thread_comp);
++ return 0;
++}
++
++/* Things to do in the main thread when the transition timer has woken us.
++ * Usually this happens when a transition is taking too long and we need to
++ * take remedial action.
++ *
++ * returns: -1 continue; 0 carry on processing; +1 leave cluster */
++static int do_timer_wakeup()
++{
++ P_MEMB("Timer wakeup - checking for dead master node %ld\n", jiffies);
++
++ /* Resend JOINCONF if it got lost on the wire */
++ if (node_state == MASTER && master_state == MASTER_CONFIRM) {
++ mod_timer(&transition_timer,
++ jiffies + cman_config.joinconf_timeout * HZ);
++ if (++joinconf_count < MAX_RETRIES) {
++ P_MEMB("Resending JOINCONF\n");
++ send_joinconf();
++ }
++ else {
++ P_MEMB("JOINCONF not acked, cancelling transition\n");
++ end_transition();
++ }
++ return -1;
++ }
++
++ /* A joining node probably died */
++ if (cluster_members == 1) {
++ end_transition();
++ return -1;
++ }
++
++ /* See if the master is still there */
++ if (node_state == TRANSITION || node_state == TRANSITION_COMPLETE) {
++
++ /* If we are in transition and master_node is NULL then we are
++ * waiting for ENDTRANS after JOIN-CONF */
++ if (!master_node) {
++ /* Hmmm. The master died after sending JOINCONF; we'll
++ * have to die as we are in mid-transition */
++ printk(KERN_INFO CMAN_NAME
++ ": Master died after JOINCONF, we must leave the cluster\n");
++ quit_threads = 1;
++ return +1;
++ }
++
++ /* No messages from the master - see if it's still there */
++ if (master_node->state == NODESTATE_MEMBER) {
++ send_master_hello();
++ mod_timer(&transition_timer,
++ jiffies +
++ cman_config.transition_timeout * HZ);
++ }
++
++ /* If the master is dead then elect a new one */
++ if (master_node->state == NODESTATE_DEAD) {
++
++ struct cluster_node *node;
++
++ P_MEMB("Master node is dead...Election!\n");
++ if (elect_master(&node)) {
++
++ /* We are master now, all kneel */
++ start_transition(TRANS_DEADMASTER, master_node);
++ }
++ else {
++ /* Leave the job to someone on more pay */
++ master_node = node;
++ mod_timer(&transition_timer,
++ jiffies +
++ cman_config.transition_timeout * HZ);
++ }
++ }
++ }
++
++ /* If we are the master node then restart the transition */
++ if (node_state == MASTER) {
++ start_transition(TRANS_RESTART, us);
++ }
++
++ return 0;
++}
++
++static void form_cluster(void)
++{
++ printk(KERN_INFO CMAN_NAME ": forming a new cluster\n");
++ node_state = MEMBER;
++ we_are_a_cluster_member = TRUE;
++ us->node_id = 1;
++ us->state = NODESTATE_MEMBER;
++ set_nodeid(us, 1);
++ recalculate_quorum(0);
++ sm_member_update(cluster_is_quorate);
++ send_hello();
++ kernel_thread(hello_kthread, NULL, 0);
++ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
++}
++
++/* This does the initial JOIN part of the membership process. Actually most of
++ * it is done in the message processing routines but this is the main loop that
++ * controls it. The side-effect of this routine is "node_state" which tells the
++ * real main loop (in the kernel thread routine) what to do next */
++static void join_or_form_cluster()
++{
++ printk(KERN_INFO CMAN_NAME
++ ": Waiting to join or form a Linux-cluster\n");
++ join_time = 0;
++ start_time = jiffies;
++ joinwait_time = jiffies;
++ last_hello = 0;
++ send_newcluster();
++
++ /* Listen for a reply */
++ do {
++ DECLARE_WAITQUEUE(wait, current);
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ add_wait_queue(mem_socket->sk->sk_sleep, &wait);
++
++ if (!skb_peek(&mem_socket->sk->sk_receive_queue))
++ schedule_timeout((cman_config.joinwait_timeout * HZ) /
++ 5);
++
++ set_task_state(current, TASK_RUNNING);
++ remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
++
++ while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
++ dispatch_messages(mem_socket);
++ }
++ if (quit_threads)
++ node_state = LEFT_CLUSTER;
++
++ }
++ while (time_before(jiffies, start_time + cman_config.joinwait_timeout * HZ) &&
++ node_state == STARTING);
++
++ /* If we didn't hear any HELLO messages then form a new cluster */
++ if (node_state == STARTING) {
++ form_cluster();
++ }
++ else
++ last_hello = jiffies;
++
++}
++
++int start_membership_services(pid_t cluster_pid)
++{
++ kcluster_pid = cluster_pid;
++
++ init_timer(&transition_timer);
++ transition_timer.function = trans_timer_expired;
++ transition_timer.data = 0L;
++
++ /* Start the thread */
++ return kernel_thread(membership_kthread, NULL, 0);
++}
++
++static int init_membership_services()
++{
++ int result;
++ struct sockaddr_cl saddr;
++ struct socket *sock;
++
++ init_MUTEX(&hello_task_lock);
++ /* Create a socket to communicate with */
++ result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
++ if (result < 0) {
++ printk(KERN_ERR CMAN_NAME
++ ": Can't create cluster socket for membership services\n");
++ return result;
++ }
++ mem_socket = sock;
++
++ /* Bind to our port */
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++ result =
++ sock->ops->bind(sock, (struct sockaddr *) &saddr, sizeof (saddr));
++ if (result < 0) {
++ printk(KERN_ERR CMAN_NAME
++ ": Can't bind to cluster membership services port\n");
++ sock_release(sock);
++ return result;
++ }
++
++ node_state = STARTING;
++ return 0;
++}
++
++static int send_joinconf()
++{
++ struct sockaddr_cl saddr;
++ int status;
++
++ if (joining_temp_nodeid == 0) {
++ BUG();
++ }
++
++ master_state = MASTER_CONFIRM;
++ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_nodeid = joining_temp_nodeid;
++ status = send_cluster_view(CLUSTER_MEM_JOINCONF, &saddr,
++ MSG_NOACK);
++
++ if (status < 0) {
++ printk("Error %d sending JOINCONF, aborting transition\n", status);
++ end_transition();
++ }
++ return status;
++}
++
++static int send_joinreq(struct sockaddr_cl *addr, int addr_len)
++{
++ char *msgbuf = scratchbuf;
++ struct list_head *addrlist;
++ int ptr = sizeof (struct cl_mem_join_msg);
++ unsigned short num_addr = 0;
++ struct cluster_node_addr *nodeaddr;
++ struct cl_mem_join_msg *msg = (struct cl_mem_join_msg *) msgbuf;
++
++ msg->cmd = CLUSTER_MEM_JOINREQ;
++ msg->votes = votes;
++ msg->expected_votes = cpu_to_le32(expected_votes);
++ msg->major_version = cpu_to_le32(CNXMAN_MAJOR_VERSION);
++ msg->minor_version = cpu_to_le32(CNXMAN_MINOR_VERSION);
++ msg->patch_version = cpu_to_le32(CNXMAN_PATCH_VERSION);
++ msg->config_version = cpu_to_le32(config_version);
++ msg->addr_len = cpu_to_le32(address_length);
++ strcpy(msg->clustername, cluster_name);
++
++ /* Add our addresses */
++ list_for_each(addrlist, &us->addr_list) {
++ nodeaddr = list_entry(addrlist, struct cluster_node_addr, list);
++
++ memcpy(msgbuf + ptr, nodeaddr->addr, address_length);
++ ptr += address_length;
++ num_addr++;
++ }
++ msg->num_addr = cpu_to_le16(num_addr);
++
++ /* And our name */
++ strcpy(msgbuf + ptr, nodename);
++ ptr += strlen(nodename) + 1;
++
++ return kcl_sendmsg(mem_socket, msgbuf, ptr,
++ addr, addr_len, MSG_NOACK);
++}
++
++static int send_startack(struct sockaddr_cl *addr, int addr_len, int node_id)
++{
++ struct cl_mem_startack_msg msg;
++
++ msg.cmd = CLUSTER_MEM_STARTACK;
++ msg.generation = cpu_to_le32(cluster_generation);
++ msg.node_id = cpu_to_le32(node_id);
++ msg.highest_node_id = cpu_to_le32(get_highest_nodeid());
++
++ return kcl_sendmsg(mem_socket, &msg, sizeof (msg), addr, addr_len, 0);
++}
++
++static int send_newcluster()
++{
++ char buf[1];
++
++ buf[0] = CLUSTER_MEM_NEWCLUSTER;
++
++ return kcl_sendmsg(mem_socket, buf, 1, NULL, 0,
++ MSG_NOACK);
++}
++
++static int send_hello()
++{
++ struct cl_mem_hello_msg hello_msg;
++ int status;
++
++ hello_msg.cmd = CLUSTER_MEM_HELLO;
++ hello_msg.members = cpu_to_le16(cluster_members);
++ hello_msg.flags = 0;
++ hello_msg.generation = cpu_to_le32(cluster_generation);
++
++ status =
++ kcl_sendmsg(mem_socket, &hello_msg, sizeof (hello_msg), NULL, 0,
++ MSG_NOACK | MSG_ALLINT);
++
++ last_hello = jiffies;
++
++ return status;
++}
++
++/* This is a special HELLO message that requires an ACK. Clients in transition
++ * send these to the master to check it is still alive; if it does not ACK
++ * then cnxman will signal it dead and we can restart the transition */
++static int send_master_hello()
++{
++ struct cl_mem_hello_msg hello_msg;
++ int status;
++ struct sockaddr_cl saddr;
++
++ hello_msg.cmd = CLUSTER_MEM_HELLO;
++ hello_msg.members = cpu_to_le16(cluster_members);
++ hello_msg.flags = 1;
++ hello_msg.generation = cpu_to_le32(cluster_generation);
++
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++ saddr.scl_nodeid = master_node->node_id;
++ status =
++ kcl_sendmsg(mem_socket, &hello_msg, sizeof (hello_msg),
++ &saddr, sizeof (saddr), 0);
++
++ last_hello = jiffies;
++
++ return status;
++}
++
++/* Called when the transition timer has expired, meaning we sent a transition
++ * message that was not ACKed */
++static void trans_timer_expired(unsigned long arg)
++{
++ P_MEMB("Transition timer fired %ld\n", jiffies);
++
++ set_bit(WAKE_FLAG_TRANSTIMER, &wake_flags);
++ wake_up_process(membership_task);
++}
++
++static void hello_timer_expired(unsigned long arg)
++{
++ P_MEMB("Hello timer fired %ld\n", jiffies);
++
++ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
++
++ if (node_state >= TRANSITION) {
++ wake_up_process(hello_task);
++ }
++}
++
++static int wait_for_completion_barrier(void)
++{
++ int status;
++ char barriername[MAX_BARRIER_NAME_LEN];
++
++ sprintf(barriername, MEMBERSHIP_BARRIER_NAME, cluster_generation);
++
++ /* Make sure we all complete together */
++ P_MEMB("Waiting for completion barrier: %d members\n", cluster_members);
++ if ((status =
++ kcl_barrier_register(barriername, 0, cluster_members)) < 0) {
++ printk(CMAN_NAME ": Error registering barrier: %d\n", status);
++ return -1;
++ }
++ kcl_barrier_setattr(barriername, BARRIER_SETATTR_TIMEOUT,
++ cman_config.transition_timeout);
++ status = kcl_barrier_wait(barriername);
++ kcl_barrier_delete(barriername);
++
++ P_MEMB("Completion barrier reached : status = %d\n", status);
++ return status;
++}
++
++/* Called at the end of a state transition when we are the master */
++static int end_transition()
++{
++ struct cl_mem_endtrans_msg msg;
++ int total_votes;
++ int status;
++
++ /* Cancel the timer */
++ del_timer(&transition_timer);
++
++ confirm_joiner();
++
++ quorum = calculate_quorum(leavereason, 0, &total_votes);
++
++ msg.cmd = CLUSTER_MEM_ENDTRANS;
++ msg.quorum = cpu_to_le32(quorum);
++ msg.generation = cpu_to_le32(++cluster_generation);
++ msg.total_votes = cpu_to_le32(total_votes);
++ if (joining_node && transitionreason == TRANS_NEWNODE) {
++ msg.new_node_id = cpu_to_le32(joining_node->node_id);
++ }
++ else {
++ msg.new_node_id = 0;
++ }
++ status = kcl_sendmsg(mem_socket, &msg, sizeof (msg), NULL, 0, 0);
++
++ /* When that's all settled down, do the transition completion barrier */
++ kcl_wait_for_all_acks();
++
++ if (wait_for_completion_barrier() != 0) {
++ P_MEMB("Barrier timed out - restart\n");
++ start_transition(TRANS_RESTART, us);
++ return 0;
++ }
++
++ set_quorate(total_votes);
++
++ notify_listeners();
++ reset_hello_time();
++
++ /* Tell any waiting barriers that we had a transition */
++ check_barrier_returns();
++
++ leavereason = 0;
++ node_state = MEMBER;
++ transition_end_time = jiffies;
++
++ sm_member_update(cluster_is_quorate);
++
++ return 0;
++}
++
++int send_reconfigure(int param, unsigned int value)
++{
++ char msgbuf[66];
++ struct cl_mem_reconfig_msg *msg =
++ (struct cl_mem_reconfig_msg *) &msgbuf;
++
++ if (param == RECONFIG_PARAM_EXPECTED_VOTES && expected_votes > value)
++ expected_votes = value;
++
++ msg->cmd = CLUSTER_MEM_RECONFIG;
++ msg->param = param;
++ msg->value = cpu_to_le32(value);
++
++ return kcl_sendmsg(mem_socket, msgbuf, sizeof (*msg), NULL, 0, 0);
++}
++
++static int send_joinack(char *addr, int addr_len, unsigned char acktype)
++{
++ struct cl_mem_joinack_msg msg;
++
++ msg.cmd = CLUSTER_MEM_JOINACK;
++ msg.acktype = acktype;
++
++ return kcl_sendmsg(mem_socket, &msg, sizeof (msg),
++ (struct sockaddr_cl *)addr, addr_len, MSG_NOACK);
++}
++
++/* Only send a leave message to one node in the cluster so that it can master
++ * the state transition, otherwise we get a "thundering herd" of potential
++ * masters fighting it out */
++int send_leave(unsigned char flags)
++{
++ unsigned char msg[2];
++ struct sockaddr_cl saddr;
++ struct cluster_node *node = NULL;
++ int status;
++
++ if (!mem_socket)
++ return 0;
++
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++
++ /* If we are in transition then use the current master */
++ if (node_state == TRANSITION) {
++ node = master_node;
++ }
++ if (!node) {
++ /* If we are the master or not in transition then pick a node
++ * almost at random */
++ struct list_head *nodelist;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER && !node->us)
++ break;
++ }
++ up(&cluster_members_lock);
++ }
++
++ /* Tell the chosen node, unless we are the only member of the
++ * cluster, in which case there is no-one to tell */
++ if (node && !node->us) {
++ saddr.scl_nodeid = node->node_id;
++
++ P_MEMB("Sending LEAVE to %s\n", node->name);
++ msg[0] = CLUSTER_MEM_LEAVE;
++ msg[1] = flags;
++ status =
++ kcl_sendmsg(mem_socket, msg, 2,
++ &saddr, sizeof (saddr),
++ MSG_NOACK);
++
++ if (status < 0)
++ return status;
++ }
++
++ /* And exit */
++ node_state = LEFT_CLUSTER;
++ wake_up_process(membership_task);
++ return 0;
++}
++
++int send_kill(int nodeid)
++{
++ char killmsg;
++ struct sockaddr_cl saddr;
++
++ killmsg = CLUSTER_MEM_KILL;
++
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++ saddr.scl_nodeid = nodeid;
++ return kcl_sendmsg(mem_socket, &killmsg, 1, &saddr,
++ sizeof (struct sockaddr_cl), MSG_NOACK);
++}
++
++/* Process a message */
++static int do_membership_packet(struct msghdr *msg, int len)
++{
++ int result = -1;
++ unsigned char *buf = msg->msg_iov->iov_base;
++ struct sockaddr_cl *saddr = msg->msg_name;
++ struct cluster_node *node;
++
++ node = find_node_by_nodeid(saddr->scl_nodeid);
++
++ P_MEMB("got membership message : %s, from (%d) %s, len = %d\n",
++ msgname(*buf), saddr->scl_nodeid, node ? node->name : "unknown", len);
++
++ switch (*buf) {
++ case CLUSTER_MEM_JOINREQ:
++ result = do_process_joinreq(msg, len);
++ break;
++
++ case CLUSTER_MEM_LEAVE:
++ if (we_are_a_cluster_member)
++ result = do_process_leave(msg, len);
++ break;
++
++ case CLUSTER_MEM_HELLO:
++ result = do_process_hello(msg, len);
++ break;
++
++ case CLUSTER_MEM_KILL:
++ if (we_are_a_cluster_member)
++ result = do_process_kill(msg, len);
++ break;
++
++ case CLUSTER_MEM_JOINCONF:
++ if (node_state == JOINACK) {
++ do_process_joinconf(msg, len);
++ }
++ break;
++
++ case CLUSTER_MEM_CONFACK:
++ if (node_state == MASTER && master_state == MASTER_CONFIRM) {
++ end_transition();
++ }
++ break;
++
++ case CLUSTER_MEM_MASTERVIEW:
++ if (node_state == TRANSITION)
++ do_process_masterview(msg, len);
++ break;
++
++ case CLUSTER_MEM_JOINACK:
++ if (node_state == JOINING || node_state == JOINWAIT) {
++ do_process_joinack(msg, len);
++ }
++ break;
++ case CLUSTER_MEM_RECONFIG:
++ if (we_are_a_cluster_member) {
++ do_process_reconfig(msg, len);
++ }
++ break;
++
++ case CLUSTER_MEM_STARTTRANS:
++ result = do_process_starttrans(msg, len);
++ break;
++
++ case CLUSTER_MEM_ENDTRANS:
++ result = do_process_endtrans(msg, len);
++ break;
++
++ case CLUSTER_MEM_VIEWACK:
++ result = do_process_viewack(msg, len);
++ break;
++
++ case CLUSTER_MEM_STARTACK:
++ if (node_state == MASTER)
++ result = do_process_startack(msg, len);
++ break;
++
++ case CLUSTER_MEM_NEWCLUSTER:
++ result = do_process_newcluster(msg, len);
++ break;
++
++ case CLUSTER_MEM_NOMINATE:
++ if (node_state != MASTER)
++ result = do_process_nominate(msg, len);
++ break;
++
++ default:
++ printk(KERN_ERR CMAN_NAME
++ ": Unknown membership services message %d received\n",
++ *buf);
++ break;
++
++ }
++ return result;
++}
++
++/* Returns -ve to reject membership of the cluster, 0 to accept membership,
++ * +ve to ignore the request (node is already joining) */
++static int check_duplicate_node(char *name, struct msghdr *msg, int len)
++{
++ struct cluster_node *node;
++ struct sockaddr_cl *saddr = (struct sockaddr_cl *)msg->msg_name;
++ char addr[address_length];
++ int addrlen;
++
++ if (strlen(name) >= MAX_CLUSTER_MEMBER_NAME_LEN)
++ return -3;
++
++ /* See if we already have a cluster member with that name... */
++ node = find_node_by_name(name);
++ if (node && node->state != NODESTATE_DEAD) {
++
++ if ((node->state == NODESTATE_JOINING ||
++ node->state == NODESTATE_REMOTEMEMBER))
++ return +1;
++
++ printk(KERN_WARNING CMAN_NAME
++ ": Rejecting cluster membership application from %s - already have a node with that name\n",
++ name);
++ return -1;
++
++ }
++
++ /* Need to check the node's address too */
++ if (get_addr_from_temp_nodeid(saddr->scl_nodeid, addr, &addrlen) &&
++ (node = find_node_by_addr(addr, addrlen)) &&
++ node->state != NODESTATE_DEAD) {
++
++ if ((node->state == NODESTATE_JOINING ||
++ node->state == NODESTATE_REMOTEMEMBER))
++ return +1;
++
++ printk(KERN_WARNING CMAN_NAME
++ ": Rejecting cluster membership application from %s - already have a node with that address\n",
++ name);
++ return -1;
++ }
++ return 0;
++}
++
++/* Start the state transition */
++static int start_transition(unsigned char reason, struct cluster_node *node)
++{
++ char *startbuf = scratchbuf;
++ struct cl_mem_starttrans_msg *msg =
++ (struct cl_mem_starttrans_msg *) startbuf;
++
++ P_MEMB("Start transition - reason = %d\n", reason);
++
++ /* If this is a restart then zero the counters */
++ if (reason == TRANS_RESTART) {
++ agreeing_nodes = 0;
++ dissenting_nodes = 0;
++ if (node_opinion) {
++ kfree(node_opinion);
++ node_opinion = NULL;
++ }
++ responses_collected = 0;
++ }
++
++ /* If we have timed out too many times then just die */
++ if (reason == TRANS_RESTART
++ && ++transition_restarts > cman_config.transition_restarts) {
++ printk(KERN_WARNING CMAN_NAME
++ ": too many transition restarts - will die\n");
++ send_leave(CLUSTER_LEAVEFLAG_INCONSISTENT);
++ node_state = LEFT_CLUSTER;
++ quit_threads = 1;
++ wake_up_process(membership_task);
++ wake_up_interruptible(&cnxman_waitq);
++ return 0;
++ }
++ if (reason != TRANS_RESTART)
++ transition_restarts = 0;
++
++ /* Only keep the original state transition reason in the global
++ * variable. */
++ if (reason != TRANS_ANOTHERREMNODE && reason != TRANS_NEWMASTER &&
++ reason != TRANS_RESTART && reason != TRANS_DEADMASTER)
++ transitionreason = reason;
++
++ /* Save the info of the requesting node */
++ if (reason == TRANS_NEWNODE)
++ joining_node = node;
++
++ node_state = MASTER;
++ master_state = MASTER_START;
++ responses_collected = 0;
++ responses_expected = cluster_members - 1;
++
++ /* If we are on our own then just do it */
++ if (responses_expected == 0) {
++ P_MEMB("We are on our own...lonely here\n");
++ responses_collected--;
++ do_process_startack(NULL, 0);
++ }
++ else {
++ int ptr = sizeof (struct cl_mem_starttrans_msg);
++ struct list_head *addrlist;
++ unsigned short num_addrs = 0;
++ int flags = 0;
++
++ /* Send the STARTTRANS message */
++ msg->cmd = CLUSTER_MEM_STARTTRANS;
++ msg->reason = reason;
++ msg->votes = node->votes;
++ msg->expected_votes = cpu_to_le32(node->expected_votes);
++ msg->generation = cpu_to_le32(++cluster_generation);
++ msg->nodeid = cpu_to_le32(node->node_id);
++
++ if (reason == TRANS_NEWNODE) {
++ /* Add the addresses */
++ list_for_each(addrlist, &node->addr_list) {
++ struct cluster_node_addr *nodeaddr =
++ list_entry(addrlist,
++ struct cluster_node_addr, list);
++
++ memcpy(startbuf + ptr, nodeaddr->addr,
++ address_length);
++ ptr += address_length;
++ num_addrs++;
++ }
++
++ /* And the name */
++ strcpy(startbuf + ptr, node->name);
++ ptr += strlen(node->name) + 1;
++ }
++
++ /* If another node died then we must queue the STARTTRANS
++ * messages so that membershipd can carry on processing the
++ * other replies */
++ if (reason == TRANS_ANOTHERREMNODE)
++ flags |= MSG_QUEUE;
++
++ msg->num_addrs = cpu_to_le16(num_addrs);
++ kcl_sendmsg(mem_socket, msg, ptr, NULL, 0, flags);
++ }
++ /* Set a timer in case we don't get 'em all back */
++ mod_timer(&transition_timer,
++ jiffies + cman_config.transition_timeout * HZ);
++ return 0;
++}
++
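++/* Roughly, the exchange a master drives from start_transition() above is:
++ * STARTTRANS out to all members and a STARTACK back from each; the master
++ * then circulates its view of the cluster (MASTERVIEW, answered by
++ * VIEWACK); for a new node, JOINCONF goes to the joiner and CONFACK comes
++ * back; finally ENDTRANS goes to everyone, followed by the completion
++ * barrier in end_transition(). */
++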
++/* A node has died - decide what to do */
++void a_node_just_died(struct cluster_node *node)
++{
++ /* If we are not in the context of kmembershipd then stick it on the
++ * list and wake it */
++ if (current != membership_task) {
++ struct cl_new_dead_node *newnode =
++ kmalloc(sizeof (struct cl_new_dead_node), GFP_KERNEL);
++ if (!newnode)
++ return;
++ newnode->node = node;
++ down(&new_dead_node_lock);
++ list_add_tail(&newnode->list, &new_dead_node_list);
++ set_bit(WAKE_FLAG_DEADNODE, &wake_flags);
++ up(&new_dead_node_lock);
++ wake_up_process(membership_task);
++ P_MEMB("Passing dead node %s onto kmembershipd\n", node->name);
++ return;
++ }
++
++ /* Remove it */
++ down(&cluster_members_lock);
++ if (node->state == NODESTATE_MEMBER)
++ cluster_members--;
++ node->state = NODESTATE_DEAD;
++ up(&cluster_members_lock);
++
++ /* Notify listeners */
++ notify_kernel_listeners(DIED, (long) node->node_id);
++
++ /* If we are in normal operation then become master and initiate a
++ * state-transition */
++ if (node_state == MEMBER) {
++ start_transition(TRANS_REMNODE, node);
++ return;
++ }
++
++ /* If we are a slave in transition then see if it's the master that has
++ * failed. If not then ignore it. If it /is/ the master then elect a
++ * new one */
++ if (node_state == TRANSITION) {
++ if (master_node == node) {
++ if (elect_master(&node)) {
++ del_timer(&transition_timer);
++ node_state = MASTER;
++
++ start_transition(TRANS_DEADMASTER, master_node);
++ }
++ else {
++ /* Someone else can be in charge - phew! */
++ }
++ }
++ return;
++ }
++
++ /* If we are the master then we need to start the transition all over
++ * again */
++ if (node_state == MASTER) {
++ /* Cancel timer */
++ del_timer(&transition_timer);
++
++ /* Restart the transition */
++ start_transition(TRANS_ANOTHERREMNODE, node);
++ transition_restarts = 0;
++ return;
++ }
++}
++
++/*
++ * Build up and send a set of messages consisting of the whole cluster view.
++ * The first byte is the command (cmd as passed in), the second is a flag byte:
++ * bit 0 is set in the first message, bit 1 in the last (NOTE both may be set
++ * if this is the only message sent). The rest is a set of packed node entries,
++ * which are NOT split over packets. */
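++/* For reference, each packed node entry built below is laid out as: 1 byte
++ * of name length, the name bytes, a little-endian 16-bit address count, the
++ * addresses themselves (address_length bytes each), 1 byte of votes, then a
++ * little-endian 32-bit expected_votes and a little-endian 32-bit node_id. */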
++static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
++ unsigned int flags)
++{
++ int ptr = 2;
++ int len;
++ int status = 0;
++ int last_node_start = 2;
++ unsigned char first_packet_flag = 1;
++ struct list_head *nodelist;
++ struct list_head *temp;
++ struct cluster_node *node;
++ char *message = scratchbuf;
++
++ message[0] = cmd;
++
++ down(&cluster_members_lock);
++ list_for_each_safe(nodelist, temp, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER) {
++ unsigned int evotes;
++ unsigned int node_id;
++ unsigned short num_addrs = 0;
++ unsigned short num_addrs_le;
++ struct list_head *addrlist;
++
++ last_node_start = ptr;
++
++ message[ptr++] = len = strlen(node->name);
++ strcpy(&message[ptr], node->name);
++ ptr += len;
++
++ /* Count the number of addresses this node has */
++ list_for_each(addrlist, &node->addr_list) {
++ num_addrs++;
++ }
++
++ num_addrs_le = cpu_to_le16(num_addrs);
++ memcpy(&message[ptr], &num_addrs_le, sizeof (short));
++ ptr += sizeof (short);
++
++ /* Pack em in */
++ list_for_each(addrlist, &node->addr_list) {
++
++ struct cluster_node_addr *nodeaddr =
++ list_entry(addrlist,
++ struct cluster_node_addr, list);
++
++ memcpy(&message[ptr], nodeaddr->addr,
++ address_length);
++ ptr += address_length;
++ }
++
++ message[ptr++] = node->votes;
++
++ evotes = cpu_to_le32(node->expected_votes);
++ memcpy(&message[ptr], &evotes, sizeof (int));
++ ptr += sizeof (int);
++
++ node_id = cpu_to_le32(node->node_id);
++ memcpy(&message[ptr], &node_id, sizeof (int));
++ ptr += sizeof (int);
++
++ /* If the block is full then send it */
++ if (ptr > MAX_CLUSTER_MESSAGE) {
++ message[1] = first_packet_flag;
++
++ up(&cluster_members_lock);
++ status =
++ kcl_sendmsg(mem_socket, message,
++ last_node_start, saddr,
++ saddr ? sizeof (struct sockaddr_cl) : 0,
++ flags);
++
++ if (status < 0)
++ goto send_fail;
++
++ down(&cluster_members_lock);
++
++ first_packet_flag = 0;
++ /* Copy the overflow back to the start of the
++ * buffer for the next send */
++ memcpy(&message[2], &message[last_node_start],
++ ptr - last_node_start);
++ ptr = ptr - last_node_start + 2;
++ }
++ }
++ }
++
++ up(&cluster_members_lock);
++
++ message[1] = first_packet_flag | 2; /* The last may also be first */
++ status = kcl_sendmsg(mem_socket, message, ptr,
++ saddr, saddr ? sizeof (struct sockaddr_cl) : 0,
++ flags);
++ send_fail:
++
++ return status;
++}
++
++/* Make the JOINING node into a MEMBER */
++static void confirm_joiner(void)
++{
++ if (joining_node && joining_node->state == NODESTATE_JOINING) {
++ down(&cluster_members_lock);
++ joining_node->state = NODESTATE_MEMBER;
++ cluster_members++;
++ up(&cluster_members_lock);
++ }
++ remove_temp_nodeid(joining_temp_nodeid);
++ joining_temp_nodeid = 0;
++}
++
++/* Reset HELLO timers for all nodes. We do this after a state transition as we
++ * have had HELLOs disabled during the transition; if we don't do this the
++ * nodes will go on an uncontrolled culling-spree afterwards */
++static void reset_hello_time(void)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER) {
++ node->last_hello = jiffies;
++ }
++
++ }
++ up(&cluster_members_lock);
++}
++
++/* Calculate the new quorum and return the value. Do *not* set it in here, as
++ * cnxman calls this to check whether a new expected_votes value is valid. It
++ * (optionally) returns the total number of votes in the cluster */
++int calculate_quorum(int allow_decrease, int max_expected, int *ret_total_votes)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ unsigned int total_votes = 0;
++ unsigned int highest_expected = 0;
++ unsigned int newquorum, q1, q2;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state == NODESTATE_MEMBER) {
++ highest_expected =
++ max(highest_expected, node->expected_votes);
++ total_votes += node->votes;
++ }
++ }
++ up(&cluster_members_lock);
++ if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
++ total_votes += quorum_device->votes;
++
++ if (max_expected > 0)
++ highest_expected = max_expected;
++
++ /* This quorum calculation is taken from the OpenVMS Cluster Systems
++ * manual, but, then, you guessed that, didn't you? */
++ q1 = (highest_expected + 2) / 2;
++ q2 = (total_votes + 2) / 2;
++ newquorum = max(q1, q2);
++
++ /* Normally quorum never decreases but the system administrator can
++ * force it down by setting expected votes to a maximum value */
++ if (!allow_decrease)
++ newquorum = max(quorum, newquorum);
++
++ /* The special two_node mode allows each of the two nodes to retain
++ * quorum if the other fails. Only one of the two should live past
++ * fencing (as both nodes try to fence each other in split-brain.) */
++ if (two_node)
++ newquorum = 1;
++
++ if (ret_total_votes)
++ *ret_total_votes = total_votes;
++ return newquorum;
++}
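++
++/* A worked example (illustrative numbers, not from the original source): with
++ * five members each holding one vote and expected_votes of 5 everywhere,
++ * q1 = (5 + 2) / 2 = 3 and q2 = (5 + 2) / 2 = 3, so quorum is 3 and the
++ * cluster stays quorate after losing up to two nodes. */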
++
++/* Recalculate cluster quorum, set quorate and notify changes */
++void recalculate_quorum(int allow_decrease)
++{
++ int total_votes;
++
++ quorum = calculate_quorum(allow_decrease, 0, &total_votes);
++ set_quorate(total_votes);
++ notify_listeners();
++}
++
++/* Add new node address to an existing node */
++int add_node_address(struct cluster_node *node, unsigned char *addr, int len)
++{
++ struct cluster_node_addr *newaddr;
++
++ newaddr = kmalloc(sizeof (struct cluster_node_addr), GFP_KERNEL);
++ if (!newaddr)
++ return -1;
++
++ memcpy(newaddr->addr, addr, len);
++ newaddr->addr_len = len;
++ list_add_tail(&newaddr->list, &node->addr_list);
++
++ return 0;
++}
++
++static struct cluster_node *add_new_node(char *name, unsigned char votes,
++ unsigned int expected_votes,
++ int node_id, int state)
++{
++ struct cluster_node *newnode;
++
++ /* Look for a dead node with this name */
++ newnode = find_node_by_name(name);
++
++ /* Is it already joining? */
++ if (newnode && newnode->state == NODESTATE_JOINING)
++ return NULL;
++
++ /* Update existing information */
++ if (newnode && newnode->state == NODESTATE_DEAD) {
++ newnode->last_hello = jiffies;
++ newnode->votes = votes;
++ newnode->expected_votes = expected_votes;
++ newnode->state = state;
++ newnode->us = 0;
++ newnode->leave_reason = 0;
++ newnode->last_seq_recv = 0;
++ newnode->last_seq_acked = 0;
++ newnode->last_seq_sent = 0;
++ newnode->incarnation++;
++ /* Don't overwrite the node ID */
++
++ if (state == NODESTATE_MEMBER) {
++ down(&cluster_members_lock);
++ cluster_members++;
++ up(&cluster_members_lock);
++ }
++
++ printk(KERN_INFO CMAN_NAME ": node %s rejoining\n", name);
++ return newnode;
++ }
++
++ newnode = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
++ if (!newnode)
++ goto alloc_err;
++
++ memset(newnode, 0, sizeof (struct cluster_node));
++ newnode->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
++ if (!newnode->name)
++ goto alloc_err1;
++
++ strcpy(newnode->name, name);
++ newnode->last_hello = jiffies;
++ newnode->votes = votes;
++ newnode->expected_votes = expected_votes;
++ newnode->state = state;
++ newnode->node_id = node_id;
++ newnode->us = 0;
++ newnode->leave_reason = 0;
++ newnode->last_seq_recv = 0;
++ newnode->last_seq_acked = 0;
++ newnode->last_seq_sent = 0;
++ newnode->incarnation = 0;
++ INIT_LIST_HEAD(&newnode->addr_list);
++ set_nodeid(newnode, node_id);
++
++ /* Add the new node to the list */
++ down(&cluster_members_lock);
++ list_add(&newnode->list, &cluster_members_list);
++ if (state == NODESTATE_MEMBER)
++ cluster_members++;
++ up(&cluster_members_lock);
++
++ printk(KERN_INFO CMAN_NAME ": got node %s\n", name);
++ return newnode;
++
++ alloc_err1:
++ kfree(newnode);
++ alloc_err:
++ send_leave(CLUSTER_LEAVEFLAG_PANIC);
++
++ printk(KERN_CRIT CMAN_NAME
++ ": Cannot allocate memory for new cluster node %s\n", name);
++
++ panic("cluster memory allocation failed");
++
++ return NULL;
++}
++
++/* Remove node from a STARTTRANS message */
++static struct cluster_node *remove_node(int nodeid)
++{
++ struct cluster_node *node = find_node_by_nodeid(nodeid);
++
++ if (node && node->state == NODESTATE_MEMBER) {
++ P_MEMB("starttrans removes node %s\n", node->name);
++ down(&cluster_members_lock);
++ node->state = NODESTATE_DEAD;
++ cluster_members--;
++ up(&cluster_members_lock);
++
++ notify_kernel_listeners(DIED, (long) nodeid);
++
++ /* If this node is us then go quietly */
++ if (node->us) {
++ printk(KERN_INFO CMAN_NAME
++ ": killed by STARTTRANS or NOMINATE\n");
++ quit_threads = 1;
++ wake_up_process(membership_task);
++ wake_up_interruptible(&cnxman_waitq);
++ }
++ }
++ return node;
++}
++
++/* Add a node from a STARTTRANS or NOMINATE message */
++static void add_node_from_starttrans(struct msghdr *msg, int len)
++{
++ /* Add the new node but don't fill in the ID until the master has
++ * confirmed it */
++ struct cl_mem_starttrans_msg *startmsg =
++ (struct cl_mem_starttrans_msg *) msg->msg_iov->iov_base;
++ char *msgbuf = (char *) msg->msg_iov->iov_base;
++ int ptr = sizeof (struct cl_mem_starttrans_msg);
++ char *name =
++ msgbuf + ptr + le16_to_cpu(startmsg->num_addrs) * address_length;
++ int i;
++
++ joining_node = add_new_node(name, startmsg->votes,
++ le32_to_cpu(startmsg->expected_votes),
++ 0, NODESTATE_JOINING);
++
++ /* add_new_node returns NULL if the node already exists */
++ if (!joining_node)
++ joining_node = find_node_by_name(name);
++
++ /* Add the node's addresses */
++ if (list_empty(&joining_node->addr_list)) {
++ for (i = 0; i < le16_to_cpu(startmsg->num_addrs); i++) {
++ add_node_address(joining_node, msgbuf + ptr, address_length);
++ ptr += address_length;
++ }
++ }
++}
++
++/* We have been nominated as master for a transition */
++static int do_process_nominate(struct msghdr *msg, int len)
++{
++ struct cl_mem_starttrans_msg *startmsg =
++ (struct cl_mem_starttrans_msg *)msg->msg_iov->iov_base;
++ struct cluster_node *node = NULL;
++ char *nodeaddr = msg->msg_iov->iov_base + sizeof(struct cl_mem_starttrans_msg);
++
++ P_MEMB("nominate reason is %d\n", startmsg->reason);
++
++ if (startmsg->reason == TRANS_REMNODE) {
++ node = remove_node(le32_to_cpu(startmsg->nodeid));
++ }
++
++ if (startmsg->reason == TRANS_NEWNODE) {
++ add_node_from_starttrans(msg, len);
++ node = joining_node;
++ /* Make sure we have a temp nodeid for the new node */
++ joining_temp_nodeid = new_temp_nodeid(nodeaddr,
++ address_length);
++ }
++
++ /* This should be a TRANS_CHECK but start_transition needs some node
++ * info */
++ if (node == NULL)
++ node = us;
++ start_transition(startmsg->reason, node);
++ return 0;
++}
++
++/* Got a STARTACK response from a node */
++static int do_process_startack(struct msghdr *msg, int len)
++{
++ if (node_state != MASTER && master_state != MASTER_START) {
++ P_MEMB("Got StartACK when not in MASTER_STARTING substate\n");
++ return 0;
++ }
++
++ /* msg is NULL if we are called directly from start_transition */
++ if (msg) {
++ struct cl_mem_startack_msg *ackmsg = msg->msg_iov->iov_base;
++
++ /* Ignore any messages with old generation numbers in them */
++ if (le32_to_cpu(ackmsg->generation) != cluster_generation) {
++ P_MEMB("Got old generation START-ACK msg - ignoring\n");
++ return 0;
++ }
++ }
++
++ /* If the node_id is non-zero then use it. */
++ if (transitionreason == TRANS_NEWNODE && joining_node && msg) {
++ struct cl_mem_startack_msg *ackmsg = msg->msg_iov->iov_base;
++
++ if (ackmsg->node_id) {
++ set_nodeid(joining_node, le32_to_cpu(ackmsg->node_id));
++ }
++ highest_nodeid =
++ max(highest_nodeid, le32_to_cpu(ackmsg->highest_node_id));
++ P_MEMB("Node id = %d, highest node id = %d\n",
++ le32_to_cpu(ackmsg->node_id),
++ le32_to_cpu(ackmsg->highest_node_id));
++ }
++
++ /* If we have all the responses in then move to the next stage */
++ if (++responses_collected == responses_expected) {
++
++ /* If the new node has no node_id (i.e. nobody in the cluster has
++ * heard of it before) then assign it a new one */
++ if (transitionreason == TRANS_NEWNODE && joining_node) {
++ highest_nodeid =
++ max(highest_nodeid, get_highest_nodeid());
++ if (joining_node->node_id == 0) {
++ set_nodeid(joining_node, ++highest_nodeid);
++ }
++ P_MEMB("nodeIDs: new node: %d, highest: %d\n",
++ joining_node->node_id, highest_nodeid);
++ }
++
++ /* Behave a little differently if we are on our own */
++ if (cluster_members == 1) {
++ if (transitionreason == TRANS_NEWNODE) {
++ /* If the cluster is just us then confirm at
++ * once */
++ joinconf_count = 0;
++ mod_timer(&transition_timer,
++ jiffies +
++ cman_config.joinconf_timeout * HZ);
++ send_joinconf();
++ return 0;
++ }
++ else { /* Node leaving the cluster */
++ recalculate_quorum(leavereason);
++ leavereason = 0;
++ node_state = MEMBER;
++ }
++ }
++ else {
++ master_state = MASTER_COLLECT;
++ responses_collected = 0;
++ responses_expected = cluster_members - 1;
++ P_MEMB("Sending MASTERVIEW: expecting %d responses\n",
++ responses_expected);
++
++ send_cluster_view(CLUSTER_MEM_MASTERVIEW, NULL, 0);
++
++ /* Set a timer in case we don't get 'em all back */
++ mod_timer(&transition_timer,
++ jiffies +
++ cman_config.transition_timeout * HZ);
++ }
++ }
++ return 0;
++}
++
++/* Got a VIEWACK response from a node */
++static int do_process_viewack(struct msghdr *msg, int len)
++{
++ char *reply = msg->msg_iov->iov_base;
++ struct sockaddr_cl *saddr = msg->msg_name;
++
++ if (master_state != MASTER_COLLECT) {
++ printk(KERN_INFO CMAN_NAME
++ ": got VIEWACK while not in state transition\n");
++ return 0;
++ }
++
++ if (node_opinion == NULL) {
++ node_opinion =
++ kmalloc((1 + highest_nodeid) * sizeof (uint8_t), GFP_KERNEL);
++ if (!node_opinion) {
++ panic(": malloc agree/dissent failed\n");
++ }
++ memset(node_opinion, 0, (1 + highest_nodeid) * sizeof (uint8_t));
++ }
++
++ /* Keep a list of agreeing and dissenting nodes */
++ if (reply[1] == 1) {
++ /* ACK - remote node agrees with me */
++ P_MEMB("Node agrees\n");
++ node_opinion[saddr->scl_nodeid] = OPINION_AGREE;
++ agreeing_nodes++;
++ }
++ else {
++ /* Remote node disagrees */
++ P_MEMB("Node disagrees\n");
++ node_opinion[saddr->scl_nodeid] = OPINION_DISAGREE;
++ dissenting_nodes++;
++ }
++
++ P_MEMB("got %d responses, expected %d\n", responses_collected + 1,
++ responses_expected);
++
++ /* Are all the results in yet ? */
++ if (++responses_collected == responses_expected) {
++ del_timer(&transition_timer);
++
++ P_MEMB("The results are in: %d agree, %d dissent\n",
++ agreeing_nodes, dissenting_nodes);
++
++ if (agreeing_nodes > dissenting_nodes) {
++ /* Kill dissenting nodes */
++ int i;
++
++ /* node_opinion[] is indexed by node ID, which can
++ * exceed the number of responses collected */
++ for (i = 1; i <= highest_nodeid; i++) {
++ if (node_opinion[i] == OPINION_DISAGREE)
++ send_kill(i);
++ }
++ }
++ else {
++ /* We must leave the cluster as we are in a minority,
++ * the rest of them can fight it out amongst
++ * themselves. */
++ send_leave(CLUSTER_LEAVEFLAG_INCONSISTENT);
++
++ agreeing_nodes = 0;
++ dissenting_nodes = 0;
++ kfree(node_opinion);
++ node_opinion = NULL;
++ node_state = LEFT_CLUSTER;
++ quit_threads = 1;
++ wake_up_process(membership_task);
++ wake_up_interruptible(&cnxman_waitq);
++ return -1;
++ }
++
++ /* Reset counters */
++ agreeing_nodes = 0;
++ dissenting_nodes = 0;
++ kfree(node_opinion);
++ node_opinion = NULL;
++
++ /* Confirm new node */
++ if (transitionreason == TRANS_NEWNODE) {
++ mod_timer(&transition_timer,
++ jiffies + cman_config.joinconf_timeout * HZ);
++ joinconf_count = 0;
++ send_joinconf();
++ return 0;
++ }
++
++ master_state = MASTER_COMPLETE;
++
++ end_transition();
++ }
++
++ return 0;
++}
++
++/* Got an ENDTRANS message */
++static int do_process_endtrans(struct msghdr *msg, int len)
++{
++ struct cl_mem_endtrans_msg *endmsg =
++ (struct cl_mem_endtrans_msg *) msg->msg_iov->iov_base;
++ struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
++
++ /* Someone else's state transition */
++ if (node_state != TRANSITION && node_state != JOINACK)
++ return 0;
++
++ /* Check we got it from the MASTER node */
++ if (master_node && master_node->node_id != saddr->scl_nodeid) {
++ printk(KERN_INFO CMAN_NAME
++ ": Got ENDTRANS from a node not the master: master: %d, sender: %d\n",
++ master_node->node_id, saddr->scl_nodeid);
++ return 0;
++ }
++
++ del_timer(&transition_timer);
++
++ /* Set node ID on new node */
++ if (endmsg->new_node_id) {
++ set_nodeid(joining_node, le32_to_cpu(endmsg->new_node_id));
++ P_MEMB("new node %s has ID %d\n", joining_node->name,
++ joining_node->node_id);
++ }
++
++ node_state = TRANSITION_COMPLETE;
++
++ /* Need to set this here or the barrier code will reject us if we've
++ * just joined */
++ we_are_a_cluster_member = TRUE;
++
++ confirm_joiner();
++ cluster_generation = le32_to_cpu(endmsg->generation);
++
++ if (wait_for_completion_barrier() != 0) {
++ P_MEMB("Barrier timed out - restart\n");
++ node_state = TRANSITION;
++ mod_timer(&transition_timer,
++ jiffies + cman_config.transition_timeout * HZ);
++ return 0;
++ }
++
++ quorum = le32_to_cpu(endmsg->quorum);
++ set_quorate(le32_to_cpu(endmsg->total_votes));
++
++ /* Tell any waiting barriers that we had a transition */
++ check_barrier_returns();
++
++ /* Clear the master node */
++ master_node = NULL;
++
++ node_state = MEMBER;
++
++ /* Notify other listeners that transition has completed */
++ notify_listeners();
++ reset_hello_time();
++ transition_end_time = jiffies;
++
++ sm_member_update(cluster_is_quorate);
++ return 0;
++}
++
++/* Turn a STARTTRANS message into NOMINATE and send it to the new master */
++static int send_nominate(struct cl_mem_starttrans_msg *startmsg, int msglen,
++ int nodeid)
++{
++ struct sockaddr_cl maddr;
++
++ maddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
++ maddr.scl_family = AF_CLUSTER;
++ maddr.scl_nodeid = nodeid;
++
++ startmsg->cmd = CLUSTER_MEM_NOMINATE;
++ return kcl_sendmsg(mem_socket, startmsg, msglen,
++ &maddr, sizeof (maddr), 0);
++}
++
++/* Got a STARTTRANS message */
++static int do_process_starttrans(struct msghdr *msg, int len)
++{
++ struct cl_mem_starttrans_msg *startmsg =
++ (struct cl_mem_starttrans_msg *) msg->msg_iov->iov_base;
++ struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
++ struct cluster_node *node;
++ unsigned int newgen = le32_to_cpu(startmsg->generation);
++
++ /* Got a WHAT from WHOM? */
++ node = find_node_by_nodeid(saddr->scl_nodeid);
++ if (!node || node->state != NODESTATE_MEMBER)
++ return 0;
++
++ /* Someone else's state transition */
++ if (node_state != MEMBER &&
++ node_state != TRANSITION && node_state != MASTER)
++ return 0;
++
++ /* Ignore old generation STARTTRANS messages */
++ if ((newgen < cluster_generation) ||
++ (newgen == 0xFFFFFFFF && cluster_generation == 0)) {
++ P_MEMB("Ignoring STARTTRANS with old generation number\n");
++ return 0;
++ }
++
++ P_MEMB("Got starttrans: newgen = %d, oldgen = %d, reason = %d\n",
++ newgen, cluster_generation, startmsg->reason);
++
++ /* Up the generation number */
++ cluster_generation = newgen;
++
++ /* If we are also a master then decide between us */
++ if (node_state == MASTER) {
++
++ /* See if we really want the responsibility of being master */
++ if (elect_master(&node)) {
++
++ /* I reluctantly accept this position of responsibility
++ */
++ P_MEMB("I elected myself master\n");
++
++ /* start_transition will re-establish this */
++ del_timer(&transition_timer);
++
++ start_transition(TRANS_NEWMASTER, node);
++ return 0;
++ }
++ else {
++ /* Back down */
++ P_MEMB("Backing down from MASTER status\n");
++ master_node = node;
++ node_state = MEMBER;
++
++ /* If we were bringing a new node into the cluster then
++ * we will have to abandon that now and tell the new
++ * node to try again later */
++ if (transitionreason == TRANS_NEWNODE && joining_node) {
++ struct cluster_node_addr *first_addr =
++ (struct cluster_node_addr *) joining_node->
++ addr_list.next;
++
++ P_MEMB("Postponing membership of node %s\n",
++ joining_node->name);
++ send_joinack(first_addr->addr, address_length,
++ JOINACK_TYPE_WAIT);
++
++ /* Not dead, just sleeping */
++ joining_node->state = NODESTATE_DEAD;
++ joining_node = NULL;
++ }
++
++ /* If the new master is not the node we just got the
++ * STARTTRANS from (it cannot be us here, since we lost
++ * the election) then make sure it knows it has to be
++ * master */
++ if (saddr->scl_nodeid != node->node_id) {
++ send_nominate(startmsg, len, node->node_id);
++ return 0;
++ }
++
++ /* Fall through into MEMBER code below if we are
++ * obeying the STARTTRANS we just received */
++ }
++ }
++
++ /* Do non-MASTER STARTTRANS bits */
++ if (node_state == MEMBER) {
++ int ptr = sizeof (struct cl_mem_starttrans_msg);
++ int node_id = 0;
++
++ P_MEMB("Normal transition start\n");
++
++ /* If the master is adding a new node and we know its node ID
++ * then ACK with it. */
++ if (startmsg->reason == TRANS_NEWNODE) {
++ struct cluster_node *node =
++ find_node_by_addr((char *) startmsg + ptr,
++ address_length);
++ if (node)
++ node_id = node->node_id;
++ }
++
++ /* Save the master info */
++ master_node = find_node_by_nodeid(saddr->scl_nodeid);
++ node_state = TRANSITION;
++
++ if (startmsg->reason == TRANS_NEWNODE) {
++ add_node_from_starttrans(msg, len);
++ }
++
++ if (startmsg->reason == TRANS_REMNODE ||
++ startmsg->reason == TRANS_ANOTHERREMNODE) {
++ remove_node(le32_to_cpu(startmsg->nodeid));
++ }
++
++ send_startack(saddr, msg->msg_namelen,
++ node_id);
++
++ /* Establish timer in case the master dies */
++ mod_timer(&transition_timer,
++ jiffies + cman_config.transition_timeout * HZ);
++
++ return 0;
++ }
++
++ /* We are in transition but this may be a restart */
++ if (node_state == TRANSITION) {
++
++ master_node = find_node_by_nodeid(saddr->scl_nodeid);
++ send_startack(saddr, msg->msg_namelen, 0);
++
++ /* Is it a new joining node? This happens if a master is
++ * usurped */
++ if (startmsg->reason == TRANS_NEWNODE) {
++ struct cluster_node *oldjoin = joining_node;
++
++ add_node_from_starttrans(msg, len);
++
++ /* If this is a different node joining than the one we
++ * were previously joining (probably because the master is
++ * a nominated one) then mark our "old" joiner as DEAD.
++ * The original master will already have told the node
++ * to go back into JOINWAIT state */
++ if (oldjoin && oldjoin != joining_node
++ && oldjoin->state == NODESTATE_JOINING)
++ oldjoin->state = NODESTATE_DEAD;
++ }
++
++ /* Is it a new master node? */
++ if (startmsg->reason == TRANS_NEWMASTER ||
++ startmsg->reason == TRANS_DEADMASTER) {
++ P_MEMB("starttrans %s, node=%d\n",
++ startmsg->reason ==
++ TRANS_NEWMASTER ? "NEWMASTER" : "DEADMASTER",
++ le32_to_cpu(startmsg->nodeid));
++
++ /* If the old master has died then remove it */
++ node =
++ find_node_by_nodeid(le32_to_cpu(startmsg->nodeid));
++
++ if (startmsg->reason == TRANS_DEADMASTER &&
++ node && node->state == NODESTATE_MEMBER) {
++ down(&cluster_members_lock);
++ node->state = NODESTATE_DEAD;
++ cluster_members--;
++ up(&cluster_members_lock);
++ }
++
++ /* Store new master */
++ master_node = find_node_by_nodeid(saddr->scl_nodeid);
++ }
++
++ /* Another node has died (or been killed) */
++ if (startmsg->reason == TRANS_ANOTHERREMNODE) {
++ /* Remove new dead node */
++ node =
++ find_node_by_nodeid(le32_to_cpu(startmsg->nodeid));
++ if (node && node->state == NODESTATE_MEMBER) {
++ down(&cluster_members_lock);
++ node->state = NODESTATE_DEAD;
++ cluster_members--;
++ up(&cluster_members_lock);
++ }
++ }
++ /* Restart the timer */
++ del_timer(&transition_timer);
++ mod_timer(&transition_timer,
++ jiffies + cman_config.transition_timeout * HZ);
++ }
++
++ return 0;
++}
++
++/* Change a cluster parameter */
++static int do_process_reconfig(struct msghdr *msg, int len)
++{
++ struct cl_mem_reconfig_msg *confmsg;
++ struct sockaddr_cl *saddr = msg->msg_name;
++ struct cluster_node *node;
++ unsigned int val;
++
++ if (len < sizeof(struct cl_mem_reconfig_msg))
++ return -1;
++
++ confmsg = (struct cl_mem_reconfig_msg *) msg->msg_iov->iov_base;
++ val = le32_to_cpu(confmsg->value);
++
++ switch (confmsg->param) {
++
++ case RECONFIG_PARAM_EXPECTED_VOTES:
++ /* Clamp the expected_votes of any node that is higher
++ * than the new value */
++ if (val > 0) {
++ struct cluster_node *node;
++
++ down(&cluster_members_lock);
++ list_for_each_entry(node, &cluster_members_list, list) {
++ if (node->state == NODESTATE_MEMBER &&
++ node->expected_votes > val) {
++ node->expected_votes = val;
++ }
++ }
++ up(&cluster_members_lock);
++ if (expected_votes > val)
++ expected_votes = val;
++ }
++ recalculate_quorum(1); /* Allow decrease */
++ sm_member_update(cluster_is_quorate);
++ break;
++
++ case RECONFIG_PARAM_NODE_VOTES:
++ node = find_node_by_nodeid(saddr->scl_nodeid);
++ if (node)
++ node->votes = val;
++ recalculate_quorum(1); /* Allow decrease */
++ sm_member_update(cluster_is_quorate);
++ break;
++
++ case RECONFIG_PARAM_CONFIG_VERSION:
++ config_version = val;
++ break;
++
++ default:
++ printk(KERN_INFO CMAN_NAME
++ ": got unknown parameter in reconfigure message. %d\n",
++ confmsg->param);
++ break;
++ }
++ return 0;
++}
++
++/* Response from master node */
++static int do_process_joinack(struct msghdr *msg, int len)
++{
++ struct cl_mem_joinack_msg *ackmsg = msg->msg_iov->iov_base;
++
++ join_time = jiffies;
++ if (ackmsg->acktype == JOINACK_TYPE_OK) {
++ node_state = JOINACK;
++ }
++
++ if (ackmsg->acktype == JOINACK_TYPE_NAK) {
++ printk(KERN_WARNING CMAN_NAME
++ ": Cluster membership rejected\n");
++ P_MEMB("Got JOINACK NACK\n");
++ node_state = REJECTED;
++ }
++
++ if (ackmsg->acktype == JOINACK_TYPE_WAIT) {
++ P_MEMB("Got JOINACK WAIT\n");
++ node_state = JOINWAIT;
++ joinwait_time = jiffies;
++ }
++
++ return 0;
++}
++
++/* Request to join the cluster. This makes us the master for this state
++ * transition */
++static int do_process_joinreq(struct msghdr *msg, int len)
++{
++ int status;
++ static unsigned long last_joinreq = 0;
++ static char last_name[MAX_CLUSTER_MEMBER_NAME_LEN];
++ struct cl_mem_join_msg *joinmsg = msg->msg_iov->iov_base;
++ struct cluster_node *node;
++
++ /* If we are in a state transition then tell the new node to wait a bit
++ * longer */
++ if (node_state != MEMBER) {
++ if (node_state == MASTER || node_state == TRANSITION) {
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_WAIT);
++ }
++ return 0;
++ }
++
++ /* Check version number */
++ if (le32_to_cpu(joinmsg->major_version) == CNXMAN_MAJOR_VERSION) {
++ char *ptr = (char *) joinmsg;
++ char *name;
++
++ /* Sanity-check the num_addrs field otherwise we could oops */
++ if (le16_to_cpu(joinmsg->num_addr) * address_length > len) {
++ printk(KERN_WARNING CMAN_NAME
++ ": num_addr in JOIN-REQ message is rubbish: %d\n",
++ le16_to_cpu(joinmsg->num_addr));
++ return 0;
++ }
++
++ /* Check the cluster name matches */
++ if (strcmp(cluster_name, joinmsg->clustername)) {
++ printk(KERN_WARNING CMAN_NAME
++ ": attempt to join with cluster name '%s' refused\n",
++ joinmsg->clustername);
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ ptr += sizeof (*joinmsg);
++ name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
++
++ /* Check we are not exceeding the maximum number of nodes */
++ if (cluster_members > cman_config.max_nodes) {
++ printk(KERN_WARNING CMAN_NAME
++ ": Join request from %s rejected, exceeds maximum number of nodes\n",
++ name);
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ /* Check that we don't exceed the two_node limit */
++ if (two_node && cluster_members == 2) {
++ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
++ "rejected, exceeds two node limit\n", name);
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ if (le16_to_cpu(joinmsg->config_version) != config_version) {
++ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
++ "rejected, config version local %u remote %u\n",
++ name, config_version,
++ le16_to_cpu(joinmsg->config_version));
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ /* If these don't match then I don't know how the message
++ arrived! However, I can't take the chance */
++ if (le32_to_cpu(joinmsg->addr_len) != address_length) {
++ printk(KERN_WARNING CMAN_NAME ": Join request from %s "
++ "rejected, address length local: %u remote %u\n",
++ name, address_length,
++ le32_to_cpu(joinmsg->addr_len));
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ /* Duplicate checking: Because joining messages do not have
++ * sequence numbers we may get as many JOINREQ messages as we
++ * have interfaces. This bit of code here just checks for
++ * JOINREQ messages that come in from the same node in a small
++ * period of time and removes the duplicates */
++ if (time_before(jiffies, last_joinreq + 10 * HZ)
++ && strcmp(name, last_name) == 0) {
++ return 0;
++ }
++
++ /* Do we already know about this node? */
++ status = check_duplicate_node(name, msg, len);
++
++ if (status < 0) {
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ /* OK, you can be in my gang */
++ if (status == 0) {
++ int i;
++ struct sockaddr_cl *addr = msg->msg_name;
++
++ last_joinreq = jiffies;
++ strcpy(last_name, name);
++
++ node =
++ add_new_node(name, joinmsg->votes,
++ le32_to_cpu(joinmsg->expected_votes),
++ 0, NODESTATE_JOINING);
++
++ /* Add the node's addresses */
++ if (list_empty(&node->addr_list)) {
++ for (i = 0; i < le16_to_cpu(joinmsg->num_addr);
++ i++) {
++ add_node_address(node, ptr, address_length);
++ ptr += address_length;
++ }
++ }
++
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_OK);
++ joining_node = node;
++ joining_temp_nodeid = addr->scl_nodeid;
++
++ /* Start the state transition */
++ start_transition(TRANS_NEWNODE, node);
++ }
++ }
++ else {
++ /* Version number mismatch, don't use any part of the message
++ * other than the version numbers as things may have moved */
++ char buf[MAX_ADDR_PRINTED_LEN];
++
++ printk(KERN_INFO CMAN_NAME
++ ": Got join message from node running incompatible software. (us: %d.%d.%d, them: %d.%d.%d) addr: %s\n",
++ CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
++ CNXMAN_PATCH_VERSION,
++ le32_to_cpu(joinmsg->major_version),
++ le32_to_cpu(joinmsg->minor_version),
++ le32_to_cpu(joinmsg->patch_version),
++ print_addr(msg->msg_name, msg->msg_namelen, buf));
++
++ send_joinack(msg->msg_name, msg->msg_namelen,
++ JOINACK_TYPE_NAK);
++ return 0;
++ }
++
++ return 0;
++}
++
++/* A simple function to invent a small number based
++ on the node name */
++static int node_hash(void)
++{
++ int i;
++ int value = 0;
++
++ for (i=0; i<strlen(nodename); i++) {
++ value += nodename[i];
++ }
++ return value & 0xF;
++}
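++
++/* For example (hypothetical node name, arithmetic only): "node1" sums to
++ * 110+111+100+101+49 = 471, and 471 & 0xF = 7, so that node would back
++ * down for 7 seconds. */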
++
++/* A new node has stated its intent to form a new cluster. We may have
++ * something to say about that... */
++static int do_process_newcluster(struct msghdr *msg, int len)
++{
++ /* If we are also in STARTING state then back down for a random period
++ * of time */
++ if (node_state == STARTING) {
++ P_MEMB("got NEWCLUSTER, backing down for %d seconds\n", node_hash());
++ start_time = jiffies + node_hash() * HZ;
++ }
++
++ return 0;
++}
++
++/* Called for each node by the node-message unpacker. Returns -1 if there is a
++ * mismatch and the caller will stop processing */
++static int check_node(struct cluster_node *newnode, char *addrs,
++ unsigned short num_addr)
++{
++ struct cluster_node *node = find_node_by_name(newnode->name);
++
++ P_MEMB("check_node: %s", newnode->name);
++
++ if (!node) {
++ C_MEMB(" - not found\n");
++ return -1;
++ }
++
++ if (node->votes != newnode->votes ||
++ node->node_id != newnode->node_id ||
++ node->state != NODESTATE_MEMBER) {
++ C_MEMB(" - wrong info: votes=%d(exp: %d) id=%d(exp: %d) state = %d\n",
++ node->votes, newnode->votes, node->node_id,
++ newnode->node_id, node->state);
++ return -1;
++ }
++ C_MEMB(" - OK\n");
++ return 0;
++}
++
++/* Called for each new node found in a JOINCONF message. Create a new node
++ * entry */
++static int add_node(struct cluster_node *node, char *addrs,
++ unsigned short num_addr)
++{
++ P_MEMB("add_node: %s, v:%d, e:%d, i:%d\n", node->name, node->votes,
++ node->expected_votes, node->node_id);
++
++ if (!find_node_by_name(node->name)) {
++ struct cluster_node *newnode;
++ int i;
++
++ if ((newnode =
++ add_new_node(node->name, node->votes, node->expected_votes,
++ node->node_id, NODESTATE_MEMBER)) == NULL) {
++ P_MEMB("Error adding node\n");
++ return -1;
++ }
++ if (list_empty(&newnode->addr_list)) {
++ for (i = 0; i < num_addr; i++) {
++ add_node_address(newnode,
++ addrs + i * address_length, address_length);
++ }
++ }
++ return 0;
++ }
++ else {
++ P_MEMB("Already got node with name %s\n", node->name);
++ return -1;
++ }
++}
++
++/* Call a specified routine for each node unpacked from the message. Return
++ * either the number of nodes found or -1 for an error */
++static int unpack_nodes(unsigned char *buf, int len,
++ int (*routine) (struct cluster_node *, char *,
++ unsigned short))
++{
++ int ptr = 0;
++ int num_nodes = 0;
++ char nodename[MAX_CLUSTER_MEMBER_NAME_LEN];
++ struct cluster_node node;
++
++ node.name = nodename;
++
++ while (ptr < len) {
++ int namelen = buf[ptr++];
++ unsigned int evotes;
++ unsigned int node_id;
++ unsigned short num_addr;
++ unsigned char *addrs;
++
++ memcpy(nodename, &buf[ptr], namelen);
++ nodename[namelen] = '\0';
++ ptr += namelen;
++
++ memcpy(&num_addr, &buf[ptr], sizeof (short));
++ num_addr = le16_to_cpu(num_addr);
++ ptr += sizeof (short);
++
++ /* Just make a note of the addrs "array" */
++ addrs = &buf[ptr];
++ ptr += num_addr * address_length;
++
++ node.votes = buf[ptr++];
++
++ memcpy(&evotes, &buf[ptr], sizeof (int));
++ node.expected_votes = le32_to_cpu(evotes);
++ ptr += sizeof (int);
++
++ memcpy(&node_id, &buf[ptr], sizeof (int));
++ node.node_id = le32_to_cpu(node_id);
++ ptr += sizeof (int);
++
++ /* Call the callback routine */
++ if (routine(&node, addrs, num_addr) < 0)
++ return -1;
++ num_nodes++;
++ }
++ return num_nodes;
++}
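++
++/* Note: unpack_nodes() expects exactly the entry layout packed by
++ * send_cluster_view() above; since entries are never split across packets,
++ * ptr always lands on an entry boundary when the loop terminates. */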
++
++/* Got join confirmation from a master node. This message contains a list of
++ * cluster nodes which we unpack and build into our cluster nodes list. When we
++ * have the last message we can go into TRANSITION state */
++static int do_process_joinconf(struct msghdr *msg, int len)
++{
++ char *message = msg->msg_iov->iov_base;
++
++ if (unpack_nodes(message + 2, len - 2, add_node) < 0) {
++ printk(KERN_ERR CMAN_NAME
++ ": Error processing joinconf message - giving up on cluster join\n");
++ send_leave(CLUSTER_LEAVEFLAG_PANIC);
++ return -1;
++ }
++
++ /* Last message in the list? */
++ if (message[1] & 2) {
++ char ackmsg;
++ struct sockaddr_cl *addr = msg->msg_name;
++
++ us->state = NODESTATE_MEMBER;
++ node_state = TRANSITION;
++ we_are_a_cluster_member = TRUE;
++
++ ackmsg = CLUSTER_MEM_CONFACK;
++ kcl_sendmsg(mem_socket, &ackmsg, 1, addr,
++ sizeof (struct sockaddr_cl),
++ MSG_NOACK);
++ kernel_thread(hello_kthread, NULL, 0);
++ mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
++ }
++ return 0;
++}
++
++/* Got the master's view of the cluster - compare it with ours and tell it the
++ * result */
++static int do_process_masterview(struct msghdr *msg, int len)
++{
++ char reply[2] = { CLUSTER_MEM_VIEWACK, 0 };
++ char *message = msg->msg_iov->iov_base;
++ static int num_nodes;
++
++ /* Someone else's state transition */
++ if (node_state != MEMBER &&
++ node_state != TRANSITION && node_state != MASTER)
++ return 0;
++
++ /* First message, zero the counter */
++ if (message[1] & 1)
++ num_nodes = 0;
++
++ num_nodes +=
++ unpack_nodes(msg->msg_iov->iov_base + 2, len - 2, check_node);
++
++ /* Last message, check the count and reply */
++ if (message[1] & 2) {
++ if (num_nodes == cluster_members) {
++ /* Send ACK */
++ reply[1] = 1;
++ }
++ else {
++ P_MEMB("Got %d nodes in MASTERVIEW message, we think there should be %d\n",
++ num_nodes, cluster_members);
++ /* Send NAK */
++ reply[1] = 0;
++ }
++ kcl_sendmsg(mem_socket, reply, 2, msg->msg_name,
++ msg->msg_namelen, 0);
++ }
++ return 0;
++}
++
++static int do_process_leave(struct msghdr *msg, int len)
++{
++ struct cluster_node *node;
++ struct sockaddr_cl *saddr = msg->msg_name;
++ unsigned char *leavemsg = (unsigned char *) msg->msg_iov->iov_base;
++
++ if ((node = find_node_by_nodeid(saddr->scl_nodeid))) {
++ unsigned char reason = leavemsg[1];
++
++ if (node->state != NODESTATE_DEAD) {
++ printk(KERN_INFO CMAN_NAME
++ ": Node %s is leaving the cluster, reason %d\n",
++ node->name, reason);
++
++ node->leave_reason = reason;
++ }
++ leavereason = (reason == CLUSTER_LEAVEFLAG_REMOVED ? 1 : 0);
++
++ a_node_just_died(node);
++
++ /* If it was the master node, then we have been nominated as
++ * the successor */
++ if (node == master_node) {
++ start_transition(TRANS_DEADMASTER, master_node);
++ }
++
++ }
++ return 0;
++}
++
++static int do_process_hello(struct msghdr *msg, int len)
++{
++ struct cluster_node *node;
++ struct cl_mem_hello_msg *hellomsg =
++ (struct cl_mem_hello_msg *) msg->msg_iov->iov_base;
++ struct sockaddr_cl *saddr = msg->msg_name;
++
++ /* We are starting up. Send a join message to the node whose HELLO we
++ * just received */
++ if (node_state == STARTING || node_state == JOINWAIT) {
++ struct sockaddr_cl *addr = msg->msg_name;
++
++ printk(KERN_INFO CMAN_NAME ": sending membership request\n");
++
++ send_joinreq(addr, msg->msg_namelen);
++ join_time = jiffies;
++ node_state = JOINING;
++ return 0;
++ }
++
++ /* Only process HELLOs if we are not in transition */
++ if (node_state == MEMBER) {
++ if (len < sizeof (struct cl_mem_hello_msg)) {
++ printk(KERN_ERR CMAN_NAME
++ ": short hello message from node %d\n",
++ saddr->scl_nodeid);
++ return -1;
++ }
++
++ node = find_node_by_nodeid(saddr->scl_nodeid);
++ if (node && node->state != NODESTATE_DEAD) {
++
++ /* Check the cluster generation in the HELLO message.
++ * NOTE: this may be different if the message crossed
++ * on the wire with an END-TRANS so we allow a period
++ * of grace in which this is allowable */
++ if (cluster_generation !=
++ le32_to_cpu(hellomsg->generation)
++ && node_state == MEMBER
++ && time_after(jiffies,
++ cman_config.hello_timer * HZ +
++ transition_end_time)) {
++ char killmsg;
++
++ printk(KERN_INFO CMAN_NAME
++ ": bad generation number %d in HELLO message, expected %d\n",
++ le32_to_cpu(hellomsg->generation),
++ cluster_generation);
++
++ notify_kernel_listeners(DIED,
++ (long) node->node_id);
++
++ killmsg = CLUSTER_MEM_KILL;
++ kcl_sendmsg(mem_socket, &killmsg, 1,
++ saddr, sizeof (struct sockaddr_cl),
++ MSG_NOACK);
++ return 0;
++ }
++
++ if (cluster_members != le16_to_cpu(hellomsg->members)
++ && node_state == MEMBER) {
++ printk(KERN_INFO CMAN_NAME
++ ": nmembers in HELLO message does not match our view\n");
++ start_transition(TRANS_CHECK, node);
++ return 0;
++ }
++ /* The message is OK - save the time */
++ node->last_hello = jiffies;
++
++ }
++ else {
++ struct sockaddr_cl *addr = msg->msg_name;
++
++ /* This node is a danger to our valid cluster */
++ if (cluster_is_quorate) {
++ char killmsg;
++
++ killmsg = CLUSTER_MEM_KILL;
++ kcl_sendmsg(mem_socket, &killmsg, 1, addr,
++ sizeof (struct sockaddr_cl),
++ MSG_NOACK);
++ }
++
++ }
++ }
++
++ return 0;
++
++}
++
++static int do_process_kill(struct msghdr *msg, int len)
++{
++ struct sockaddr_cl *saddr = msg->msg_name;
++ struct cluster_node *node;
++
++ node = find_node_by_nodeid(saddr->scl_nodeid);
++ if (node && node->state == NODESTATE_MEMBER) {
++
++ printk(KERN_INFO CMAN_NAME
++ ": Being told to leave the cluster by node %d\n",
++ saddr->scl_nodeid);
++
++ node_state = LEFT_CLUSTER;
++ quit_threads = 1;
++ wake_up_process(membership_task);
++ wake_up_interruptible(&cnxman_waitq);
++ }
++ else {
++ P_MEMB("Asked to leave the cluster by a non-member. What a nerve!\n");
++ }
++ return 0;
++}
++
++/* Some cluster membership utility functions */
++struct cluster_node *find_node_by_name(char *name)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (strcmp(node->name, name) == 0) {
++ up(&cluster_members_lock);
++ return node;
++ }
++ }
++ up(&cluster_members_lock);
++ return NULL;
++}
++
++/* Try to avoid using this as it's slow and holds the members lock */
++struct cluster_node *find_node_by_addr(unsigned char *addr, int addr_len)
++{
++ struct list_head *nodelist;
++ struct list_head *addrlist;
++ struct cluster_node *node;
++ struct cluster_node_addr *nodeaddr;
++
++ down(&cluster_members_lock);
++
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ list_for_each(addrlist, &node->addr_list) {
++ nodeaddr =
++ list_entry(addrlist, struct cluster_node_addr,
++ list);
++
++ if (memcmp(nodeaddr->addr, addr, address_length) == 0) {
++ up(&cluster_members_lock);
++ return node;
++ }
++ }
++ }
++
++ up(&cluster_members_lock);
++ return NULL;
++}
++
++/* This is the quick way to find a node */
++struct cluster_node *find_node_by_nodeid(unsigned int id)
++{
++ struct cluster_node *node;
++
++ if (id >= sizeof_members_array)
++ return NULL;
++
++ spin_lock(&members_by_nodeid_lock);
++ node = members_by_nodeid[id];
++ spin_unlock(&members_by_nodeid_lock);
++ return node;
++}
++
++static int dispatch_messages(struct socket *mem_socket)
++{
++ int err = 0;
++
++ while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
++ struct msghdr msg;
++ struct iovec iov;
++ struct sockaddr_cl sin;
++ int len;
++ mm_segment_t fs;
++
++ memset(&sin, 0, sizeof (sin));
++
++ msg.msg_control = NULL;
++ msg.msg_controllen = 0;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = &iov;
++ msg.msg_name = &sin;
++ msg.msg_namelen = sizeof (sin);
++ msg.msg_flags = 0;
++
++ iov.iov_len = MAX_CLUSTER_MESSAGE;
++ iov.iov_base = iobuf;
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ len =
++ sock_recvmsg(mem_socket, &msg, MAX_CLUSTER_MESSAGE,
++ MSG_DONTWAIT);
++ set_fs(fs);
++ if (len > 0) {
++ iov.iov_base = iobuf; /* Reinstate pointer */
++ msg.msg_name = &sin;
++ do_membership_packet(&msg, len);
++ }
++ else {
++ if (len == -EAGAIN)
++ err = 0;
++ else
++ err = -1;
++ break;
++ }
++ }
++ return err;
++}
++
++/* Scan the nodes list for dead nodes */
++static void check_for_dead_nodes(void)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->state != NODESTATE_DEAD &&
++ time_after(jiffies,
++ node->last_hello +
++ cman_config.deadnode_timeout * HZ) && !node->us) {
++
++ up(&cluster_members_lock);
++
++ printk(KERN_WARNING CMAN_NAME
++ ": no HELLO from %s, removing from the cluster\n",
++ node->name);
++
++ P_MEMB("last hello was %ld, current time is %ld\n",
++ node->last_hello, jiffies);
++
++ node->leave_reason = CLUSTER_LEAVEFLAG_DEAD;
++ leavereason = 0;
++
++ /* This is unlikely to work but it's worth a try! */
++ send_kill(node->node_id);
++
++ /* Start state transition */
++ a_node_just_died(node);
++ return;
++ }
++ }
++ up(&cluster_members_lock);
++
++ /* Also check for a dead quorum device */
++ if (quorum_device) {
++ if (quorum_device->state == NODESTATE_MEMBER &&
++ time_after(jiffies,
++ quorum_device->last_hello +
++ cman_config.deadnode_timeout * HZ)) {
++ quorum_device->state = NODESTATE_DEAD;
++ printk(KERN_WARNING CMAN_NAME
++ ": Quorum device %s timed out\n",
++ quorum_device->name);
++ recalculate_quorum(0);
++ }
++ }
++
++ return;
++}
++
++/* add "us" as a node in the cluster */
++static int add_us(void)
++{
++ struct cluster_node *newnode =
++ kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
++
++ if (!newnode) {
++ /* Oh shit, we have to commit hara kiri here for the greater
++ * good of the cluster */
++ send_leave(CLUSTER_LEAVEFLAG_PANIC);
++
++ printk(KERN_CRIT CMAN_NAME
++ ": Cannot allocate memory for our node structure\n");
++ panic("Must die");
++
++ return -1;
++ }
++
++ memset(newnode, 0, sizeof (struct cluster_node));
++ newnode->name = kmalloc(strlen(nodename) + 1, GFP_KERNEL);
++ if (!newnode->name) {
++ send_leave(CLUSTER_LEAVEFLAG_PANIC);
++
++ printk(KERN_CRIT CMAN_NAME
++ ": Cannot allocate memory for node name\n");
++ kfree(newnode);
++
++ panic("Must die");
++
++ return -1;
++ }
++
++ strcpy(newnode->name, nodename);
++ newnode->last_hello = jiffies;
++ newnode->votes = votes;
++ newnode->expected_votes = expected_votes;
++ newnode->state = NODESTATE_JOINING;
++ newnode->node_id = 0; /* Will get filled in by ENDTRANS message */
++ newnode->us = 1;
++ newnode->leave_reason = 0;
++ INIT_LIST_HEAD(&newnode->addr_list);
++ get_local_addresses(newnode); /* Get from cnxman socket info */
++
++ /* Add the new node to the list */
++ down(&cluster_members_lock);
++ list_add(&newnode->list, &cluster_members_list);
++ cluster_members++;
++ up(&cluster_members_lock);
++ us = newnode;
++
++ return 0;
++}
++
++/* Return the highest known node_id */
++unsigned int get_highest_nodeid(void)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node = NULL;
++ unsigned int highest = 0;
++
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++
++ if (node->node_id > highest)
++ highest = node->node_id;
++ }
++ up(&cluster_members_lock);
++
++ return highest;
++}
++
++/* Elect a new master if there is a clash. Returns 1 if we are the new master;
++ * the master's struct is also returned. This, rather primitively, uses
++ * the lowest node ID */
++static int elect_master(struct cluster_node **master_node)
++{
++ int i;
++
++ for (i = 1; i < sizeof_members_array; i++) {
++ if (members_by_nodeid[i]
++ && members_by_nodeid[i]->state == NODESTATE_MEMBER) {
++ *master_node = members_by_nodeid[i];
++ P_MEMB("Elected master is %s\n", (*master_node)->name);
++ return (*master_node)->us;
++ }
++ }
++ BUG();
++ return 0;
++}
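++
++/* Illustrative example (hypothetical IDs): with members at node IDs 2, 5 and
++ * 7, the node with ID 2 is elected; elect_master() returns 1 only on that
++ * node itself. */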
++
++/* Called by node_cleanup in cnxman when we have left the cluster */
++void free_nodeid_array(void)
++{
++ vfree(members_by_nodeid);
++ members_by_nodeid = NULL;
++ sizeof_members_array = 0;
++}
++
++int allocate_nodeid_array(void)
++{
++ /* Allocate space for the nodeid lookup array */
++ if (!members_by_nodeid) {
++ spin_lock_init(&members_by_nodeid_lock);
++ members_by_nodeid =
++ vmalloc(cman_config.max_nodes *
++ sizeof (struct cluster_node *));
++ }
++
++ if (!members_by_nodeid) {
++ printk(KERN_WARNING
++ "Unable to allocate members array for %d members\n",
++ cman_config.max_nodes);
++ return -ENOMEM;
++ }
++ memset(members_by_nodeid, 0,
++ cman_config.max_nodes * sizeof (struct cluster_node *));
++ sizeof_members_array = cman_config.max_nodes;
++
++ return 0;
++}
++
++/* Set the votes & expected_votes variables */
++void set_votes(int v, int e)
++{
++ votes = v;
++ expected_votes = e;
++}
++
++int get_quorum(void)
++{
++ return quorum;
++}
++
++/* Called by cnxman to see if activity should be blocked because we are in a
++ * state transition */
++int in_transition(void)
++{
++ return node_state == TRANSITION ||
++ node_state == TRANSITION_COMPLETE || node_state == MASTER;
++}
++
++/* Return the current membership state as a string for the main line to put
++ * into /proc. I really should be using snprintf rather than sprintf but it's
++ * not exported... */
++char *membership_state(char *buf, int buflen)
++{
++ switch (node_state) {
++ case STARTING:
++ strncpy(buf, "Starting", buflen);
++ break;
++ case JOINING:
++ strncpy(buf, "Joining", buflen);
++ break;
++ case JOINWAIT:
++ strncpy(buf, "Join-Wait", buflen);
++ break;
++ case JOINACK:
++ strncpy(buf, "Join-Ack", buflen);
++ break;
++ case TRANSITION:
++ sprintf(buf, "State-Transition: Master is %s",
++ master_node ? master_node->name : "Unknown");
++ break;
++ case MEMBER:
++ strncpy(buf, "Cluster-Member", buflen);
++ break;
++ case REJECTED:
++ strncpy(buf, "Rejected", buflen);
++ break;
++ case LEFT_CLUSTER:
++ strncpy(buf, "Left-Cluster", buflen);
++ break;
++ case TRANSITION_COMPLETE:
++ strncpy(buf, "Transition-Complete", buflen);
++ break;
++ case MASTER:
++ strncpy(buf, "Transition-Master", buflen);
++ break;
++ default:
++ sprintf(buf, "Unknown: code=%d", node_state);
++ break;
++ }
++
++ return buf;
++}
++
++#ifdef DEBUG_MEMB
++static char *msgname(int msg)
++{
++ switch (msg) {
++ case CLUSTER_MEM_JOINCONF:
++ return "JOINCONF";
++ case CLUSTER_MEM_JOINREQ:
++ return "JOINREQ";
++ case CLUSTER_MEM_LEAVE:
++ return "LEAVE";
++ case CLUSTER_MEM_HELLO:
++ return "HELLO";
++ case CLUSTER_MEM_KILL:
++ return "KILL";
++ case CLUSTER_MEM_JOINACK:
++ return "JOINACK";
++ case CLUSTER_MEM_ENDTRANS:
++ return "ENDTRANS";
++ case CLUSTER_MEM_RECONFIG:
++ return "RECONFIG";
++ case CLUSTER_MEM_MASTERVIEW:
++ return "MASTERVIEW";
++ case CLUSTER_MEM_STARTTRANS:
++ return "STARTTRANS";
++ case CLUSTER_MEM_JOINREJ:
++ return "JOINREJ";
++ case CLUSTER_MEM_VIEWACK:
++ return "VIEWACK";
++ case CLUSTER_MEM_STARTACK:
++ return "STARTACK";
++ case CLUSTER_MEM_NEWCLUSTER:
++ return "NEWCLUSTER";
++ case CLUSTER_MEM_CONFACK:
++ return "CONFACK";
++ case CLUSTER_MEM_NOMINATE:
++ return "NOMINATE";
++
++ default:
++ return "??UNKNOWN??";
++ }
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -urN linux-orig/cluster/cman/proc.c linux-patched/cluster/cman/proc.c
+--- linux-orig/cluster/cman/proc.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/proc.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,364 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/init.h>
++#include <linux/socket.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/file.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <linux/list.h>
++#include <linux/in.h>
++#include <net/sock.h>
++#include <cluster/cnxman.h>
++#include <cluster/service.h>
++
++#include "cnxman-private.h"
++#include "config.h"
++
++extern int cluster_members;
++extern struct list_head cluster_members_list;
++extern struct semaphore cluster_members_lock;
++extern struct cluster_node *quorum_device;
++extern int we_are_a_cluster_member;
++extern int cluster_is_quorate;
++extern unsigned short cluster_id;
++extern atomic_t use_count;
++extern unsigned int address_length;
++extern unsigned int config_version;
++extern char cluster_name[];
++extern struct cluster_node *us;
++static struct seq_operations cluster_info_op;
++
++int sm_procdata(char *b, char **start, off_t offset, int length);
++int sm_debug_info(char *b, char **start, off_t offset, int length);
++
++/* /proc interface to the configuration struct */
++static struct config_proc_info {
++ char *name;
++ int *value;
++} config_proc[] = {
++ {
++ .name = "joinwait_timeout",
++ .value = &cman_config.joinwait_timeout,
++ },
++ {
++ .name = "joinconf_timeout",
++ .value = &cman_config.joinconf_timeout,
++ },
++ {
++ .name = "join_timeout",
++ .value = &cman_config.join_timeout,
++ },
++ {
++ .name = "hello_timer",
++ .value = &cman_config.hello_timer,
++ },
++ {
++ .name = "deadnode_timeout",
++ .value = &cman_config.deadnode_timeout,
++ },
++ {
++ .name = "transition_timeout",
++ .value = &cman_config.transition_timeout,
++ },
++ {
++ .name = "transition_restarts",
++ .value = &cman_config.transition_restarts,
++ },
++ {
++ .name = "max_nodes",
++ .value = &cman_config.max_nodes,
++ },
++ {
++ .name = "sm_debug_size",
++ .value = &cman_config.sm_debug_size,
++ },
++};
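++
++/* Each of these tunables appears as /proc/cluster/config/cman/<name> and
++ * accepts a decimal write, e.g. (illustrative value):
++ *
++ * echo 20 > /proc/cluster/config/cman/deadnode_timeout
++ */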
++
++
++static int proc_cluster_status(char *b, char **start, off_t offset, int length)
++{
++ struct list_head *nodelist;
++ struct cluster_node *node;
++ struct cluster_node_addr *node_addr;
++ unsigned int total_votes = 0;
++ unsigned int max_expected = 0;
++ int c = 0;
++ char node_buf[MAX_CLUSTER_MEMBER_NAME_LEN];
++
++ if (!we_are_a_cluster_member) {
++ c += sprintf(b+c, "Not a cluster member. State: %s\n",
++ membership_state(node_buf,
++ sizeof (node_buf)));
++ return c;
++ }
++
++ /* Total the votes */
++ down(&cluster_members_lock);
++ list_for_each(nodelist, &cluster_members_list) {
++ node = list_entry(nodelist, struct cluster_node, list);
++ if (node->state == NODESTATE_MEMBER) {
++ total_votes += node->votes;
++ max_expected =
++ max(max_expected, node->expected_votes);
++ }
++ }
++ up(&cluster_members_lock);
++
++ if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
++ total_votes += quorum_device->votes;
++
++ c += sprintf(b+c,
++ "Version: %d.%d.%d\nConfig version: %d\nCluster name: %s\nCluster ID: %d\nMembership state: %s\n",
++ CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
++ CNXMAN_PATCH_VERSION,
++ config_version,
++ cluster_name, cluster_id,
++ membership_state(node_buf, sizeof (node_buf)));
++ c += sprintf(b+c,
++ "Nodes: %d\nExpected_votes: %d\nTotal_votes: %d\nQuorum: %d %s\n",
++ cluster_members, max_expected, total_votes,
++ get_quorum(),
++ cluster_is_quorate ? " " : "Activity blocked");
++ c += sprintf(b+c, "Active subsystems: %d\n",
++ atomic_read(&use_count));
++
++
++ c += sprintf(b+c, "Node addresses: ");
++ list_for_each_entry(node_addr, &us->addr_list, list) {
++ struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)node_addr->addr;
++ if (saddr->sin6_family == AF_INET6) {
++ c += sprintf(b+c, "%x:%x:%x:%x:%x:%x:%x:%x ",
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[0]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[1]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[2]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[3]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[4]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[5]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[6]),
++ be16_to_cpu(saddr->sin6_addr.s6_addr16[7]));
++ }
++ else {
++ struct sockaddr_in *saddr4 = (struct sockaddr_in *)saddr;
++ uint8_t *addr = (uint8_t *)&saddr4->sin_addr;
++ c+= sprintf(b+c, "%u.%u.%u.%u ",
++ addr[0], addr[1], addr[2], addr[3]);
++ }
++ }
++ c += sprintf(b+c, "\n\n");
++ return c;
++}
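++
++/* Illustrative /proc/cluster/status output (field order from the sprintf
++ * calls above; all values hypothetical):
++ *
++ * Version: 2.0.1
++ * Config version: 1
++ * Cluster name: example
++ * Cluster ID: 1
++ * Membership state: Cluster-Member
++ * Nodes: 3
++ * Expected_votes: 3
++ * Total_votes: 3
++ * Quorum: 2
++ * Active subsystems: 1
++ * Node addresses: 10.0.0.1
++ */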
++
++
++/* Allocate one of these for /proc/cluster/nodes so we can keep a track of where
++ * we are */
++struct cluster_seq_info {
++ int nodeid;
++ int highest_nodeid;
++};
++
++static int cluster_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &cluster_info_op);
++}
++
++static void *cluster_seq_start(struct seq_file *m, loff_t * pos)
++{
++ struct cluster_seq_info *csi =
++ kmalloc(sizeof (struct cluster_seq_info), GFP_KERNEL);
++
++ if (!csi)
++ return NULL;
++
++ /* Keep highest_nodeid here so we don't need to keep traversing the
++ * list to find it */
++ csi->nodeid = *pos;
++ csi->highest_nodeid = get_highest_nodeid();
++
++ /* Print the header */
++ if (*pos == 0)
++ seq_printf(m,
++ "Node Votes Exp Sts Name\n");
++ return csi;
++}
++
++static void *cluster_seq_next(struct seq_file *m, void *p, loff_t * pos)
++{
++ struct cluster_seq_info *csi = p;
++
++ *pos = ++csi->nodeid;
++ if (csi->nodeid > csi->highest_nodeid)
++ return NULL;
++
++ return csi;
++}
++
++static int cluster_seq_show(struct seq_file *m, void *p)
++{
++ char state = '?';
++ struct cluster_node *node;
++ struct cluster_seq_info *csi = p;
++
++ /*
++ * If we have "0" here then display the quorum device if
++ * there is one.
++ */
++ if (csi->nodeid == 0)
++ node = quorum_device;
++ else
++ node = find_node_by_nodeid(csi->nodeid);
++
++ if (!node)
++ return 0;
++
++ /* Make state printable */
++ switch (node->state) {
++ case NODESTATE_MEMBER:
++ state = 'M';
++ break;
++ case NODESTATE_JOINING:
++ state = 'J';
++ break;
++ case NODESTATE_REMOTEMEMBER:
++ state = 'R';
++ break;
++ case NODESTATE_DEAD:
++ state = 'X';
++ break;
++ }
++ seq_printf(m, " %3d %3d %3d %c %s\n",
++ node->node_id,
++ node->votes,
++ node->expected_votes,
++ state,
++ node->name);
++
++ return 0;
++}
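++
++/* Illustrative /proc/cluster/nodes output (hypothetical nodes, format from
++ * the seq_printf calls above):
++ *
++ * Node Votes Exp Sts Name
++ * 1 1 3 M nodea
++ * 2 1 3 M nodeb
++ */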
++
++static void cluster_seq_stop(struct seq_file *m, void *p)
++{
++ kfree(p);
++}
++
++static struct seq_operations cluster_info_op = {
++ .start = cluster_seq_start,
++ .next = cluster_seq_next,
++ .stop = cluster_seq_stop,
++ .show = cluster_seq_show
++};
++
++static struct file_operations cluster_fops = {
++ .open = cluster_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
++};
++
++static int cman_config_read_proc(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct config_proc_info *cinfo = data;
++
++ return snprintf(page, count, "%d\n", *cinfo->value);
++}
++
++static int cman_config_write_proc(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ struct config_proc_info *cinfo = data;
++ int value;
++ char *end;
++ char buf[32];
++
++ /* The buffer comes from userspace, so copy it in before parsing */
++ if (count > sizeof(buf) - 1)
++ count = sizeof(buf) - 1;
++ if (copy_from_user(buf, buffer, count))
++ return -EFAULT;
++ buf[count] = '\0';
++
++ /* Only update the value if at least one digit was parsed */
++ value = simple_strtoul(buf, &end, 10);
++ if (end != buf)
++ *cinfo->value = value;
++ return count;
++}
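++
++/*
++ * These handlers back the integer tunables under /proc/cluster/config/cman/,
++ * so each value can be inspected or changed from userspace, e.g. (the entry
++ * name here is illustrative):
++ *
++ *   cat /proc/cluster/config/cman/max_nodes
++ *   echo 16 > /proc/cluster/config/cman/max_nodes
++ */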
++
++/* Base of the config directory for cman */
++static struct proc_dir_entry *proc_cman_config;
++void create_proc_entries(void)
++{
++ struct proc_dir_entry *procentry;
++ struct proc_dir_entry *proc_cluster;
++ int i;
++
++ proc_cluster = proc_mkdir("cluster", 0);
++ if (!proc_cluster)
++ return;
++ proc_cluster->owner = THIS_MODULE;
++
++ /* Config dir filled in by us and others */
++ if (!proc_mkdir("cluster/config", 0))
++ return;
++
++ /* Don't much care if this fails, it's hardly vital */
++ procentry = create_proc_entry("cluster/nodes", S_IRUGO, NULL);
++ if (procentry)
++ procentry->proc_fops = &cluster_fops;
++
++ procentry = create_proc_entry("cluster/status", S_IRUGO, NULL);
++ if (procentry)
++ procentry->get_info = proc_cluster_status;
++
++ procentry = create_proc_entry("cluster/services", S_IRUGO, NULL);
++ if (procentry)
++ procentry->get_info = sm_procdata;
++
++ /* Config entries */
++ proc_cman_config = proc_mkdir("cluster/config/cman", 0);
++ if (!proc_cman_config)
++ return;
++
++ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
++ procentry = create_proc_entry(config_proc[i].name, 0660,
++ proc_cman_config);
++ if (procentry) {
++ procentry->data = &config_proc[i];
++ procentry->write_proc = cman_config_write_proc;
++ procentry->read_proc = cman_config_read_proc;
++ }
++ }
++
++ procentry = create_proc_entry("cluster/sm_debug", S_IRUGO, NULL);
++ if (procentry)
++ procentry->get_info = sm_debug_info;
++}
++
++void cleanup_proc_entries(void)
++{
++ int i, config_count;
++
++ remove_proc_entry("cluster/sm_debug", NULL);
++
++ config_count = sizeof(config_proc) / sizeof(struct config_proc_info);
++
++ if (proc_cman_config) {
++ for (i=0; i<config_count; i++)
++ remove_proc_entry(config_proc[i].name, proc_cman_config);
++ }
++ remove_proc_entry("cluster/config/cman", NULL);
++ remove_proc_entry("cluster/config", NULL);
++
++ remove_proc_entry("cluster/nodes", NULL);
++ remove_proc_entry("cluster/status", NULL);
++ remove_proc_entry("cluster/services", NULL);
++ remove_proc_entry("cluster/config", NULL);
++ remove_proc_entry("cluster", NULL);
++}
+diff -urN linux-orig/cluster/cman/sm.h linux-patched/cluster/cman/sm.h
+--- linux-orig/cluster/cman/sm.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,108 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_DOT_H__
++#define __SM_DOT_H__
++
++/*
++ * This is the main header file to be included in each Service Manager source
++ * file.
++ */
++
++#include <linux/list.h>
++#include <linux/socket.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/file.h>
++#include <net/sock.h>
++
++#include <cluster/cnxman.h>
++#include <cluster/service.h>
++
++#define SG_LEVELS (4)
++
++#include "sm_internal.h"
++#include "sm_barrier.h"
++#include "sm_control.h"
++#include "sm_daemon.h"
++#include "sm_joinleave.h"
++#include "sm_membership.h"
++#include "sm_message.h"
++#include "sm_misc.h"
++#include "sm_recover.h"
++#include "sm_services.h"
++
++extern struct list_head sm_sg[SG_LEVELS];
++extern struct semaphore sm_sglock;
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#define SM_ASSERT(x, do) \
++{ \
++ if (!(x)) \
++ { \
++ printk("\nSM: Assertion failed on line %d of file %s\n" \
++ "SM: assertion: \"%s\"\n" \
++ "SM: time = %lu\n", \
++ __LINE__, __FILE__, #x, jiffies); \
++ {do} \
++ printk("\n"); \
++ panic("SM: Record message above and reboot.\n"); \
++ } \
++}
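++
++/*
++ * The second argument is a statement run just before the panic, typically
++ * used to dump extra context, as in sm_barrier.c:
++ *
++ *   SM_ASSERT(p, printk("name=\"%s\" status=%d\n", name, status););
++ */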
++
++#define SM_RETRY(do_this, until_this) \
++for (;;) \
++{ \
++ do { do_this; } while (0); \
++ if (until_this) \
++ break; \
++ printk("SM: out of memory: %s, %u\n", __FILE__, __LINE__); \
++ schedule();\
++}
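++
++/*
++ * SM_RETRY re-runs a statement until the condition is true, e.g. to retry
++ * a small allocation that must not fail (from sm_barrier.c):
++ *
++ *   SM_RETRY(be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC), be);
++ */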
++
++
++#define log_print(fmt, args...) printk("SM: "fmt"\n", ##args)
++
++#define log_error(sg, fmt, args...) \
++ printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
++
++
++#define SM_DEBUG_LOG
++
++#ifdef SM_DEBUG_CONSOLE
++#define log_debug(sg, fmt, args...) \
++ printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
++#endif
++
++#ifdef SM_DEBUG_LOG
++#define log_debug(sg, fmt, args...) sm_debug_log(sg, fmt, ##args)
++#endif
++
++#ifdef SM_DEBUG_ALL
++#define log_debug(sg, fmt, args...) \
++do \
++{ \
++ printk("SM: %08x "fmt"\n", (sg)->global_id, ##args); \
++ sm_debug_log(sg, fmt, ##args); \
++} \
++while (0)
++#endif
++
++#endif /* __SM_DOT_H__ */
+diff -urN linux-orig/cluster/cman/sm_barrier.c linux-patched/cluster/cman/sm_barrier.c
+--- linux-orig/cluster/cman/sm_barrier.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_barrier.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,232 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++static struct list_head barriers;
++static spinlock_t barriers_lock;
++
++struct bc_entry {
++ struct list_head list;
++ uint32_t gid;
++ int status;
++ char type;
++};
++typedef struct bc_entry bc_entry_t;
++
++void init_barriers(void)
++{
++ INIT_LIST_HEAD(&barriers);
++ spin_lock_init(&barriers_lock);
++}
++
++static int atoi(char *c)
++{
++ int x = 0;
++
++ while ('0' <= *c && *c <= '9') {
++ x = x * 10 + (*c - '0');
++ c++;
++ }
++ return x;
++}
++
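++/*
++ * Barrier names have the form "sm.<global_id>.<nodeid>.<event_id>.<count>"
++ * (see startdone_barrier_new in sm_joinleave.c), so the SG's global id can
++ * be recovered from the digits following the "sm." prefix.
++ */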
++static void add_barrier_callback(char *name, int status, int type)
++{
++ char *p;
++ uint32_t gid;
++ bc_entry_t *be;
++
++ /* an ESRCH callback just means there was a cnxman transition */
++ if (status == -ESRCH)
++ return;
++
++ /* extract global id of SG from barrier name */
++ p = strstr(name, "sm.");
++
++ SM_ASSERT(p, printk("name=\"%s\" status=%d\n", name, status););
++
++ p += strlen("sm.");
++ gid = atoi(p);
++
++ SM_RETRY(be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC), be);
++
++ be->gid = gid;
++ be->status = status;
++ be->type = type;
++
++ spin_lock(&barriers_lock);
++ list_add_tail(&be->list, &barriers);
++ spin_unlock(&barriers_lock);
++
++ wake_serviced(DO_BARRIERS);
++}
++
++static void callback_recovery_barrier(char *name, int status)
++{
++ add_barrier_callback(name, status, SM_BARRIER_RECOVERY);
++}
++
++static void callback_startdone_barrier_new(char *name, int status)
++{
++ add_barrier_callback(name, status, SM_BARRIER_STARTDONE_NEW);
++}
++
++static void callback_startdone_barrier(char *name, int status)
++{
++ add_barrier_callback(name, status, SM_BARRIER_STARTDONE);
++}
++
++int sm_barrier(char *name, int count, int type)
++{
++ int error;
++ unsigned long fn = 0;
++
++ switch (type) {
++ case SM_BARRIER_STARTDONE:
++ fn = (unsigned long) callback_startdone_barrier;
++ break;
++ case SM_BARRIER_STARTDONE_NEW:
++ fn = (unsigned long) callback_startdone_barrier_new;
++ break;
++ case SM_BARRIER_RECOVERY:
++ fn = (unsigned long) callback_recovery_barrier;
++ break;
++ }
++
++ error = kcl_barrier_register(name, 0, count);
++ if (error) {
++ log_print("barrier register error %d", error);
++ goto fail;
++ }
++
++ error = kcl_barrier_setattr(name, BARRIER_SETATTR_AUTODELETE, TRUE);
++ if (error) {
++ log_print("barrier setattr autodel error %d", error);
++ goto fail_bar;
++ }
++
++ error = kcl_barrier_setattr(name, BARRIER_SETATTR_CALLBACK, fn);
++ if (error) {
++ log_print("barrier setattr cb error %d", error);
++ goto fail_bar;
++ }
++
++ error = kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, TRUE);
++ if (error) {
++ log_print("barrier setattr enabled error %d", error);
++ goto fail_bar;
++ }
++
++ return 0;
++
++ fail_bar:
++ kcl_barrier_delete(name);
++ fail:
++ return error;
++}
++
++void process_startdone_barrier_new(sm_group_t *sg, int status)
++{
++ sm_sevent_t *sev = sg->sevent;
++
++ if (!test_and_clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags)) {
++ log_debug(sev->se_sg, "ignore barrier cb status %d", status);
++ return;
++ }
++
++ sev->se_barrier_status = status;
++ sev->se_state = SEST_BARRIER_DONE;
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++}
++
++void process_startdone_barrier(sm_group_t *sg, int status)
++{
++ sm_uevent_t *uev = &sg->uevent;
++
++ if (!test_and_clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags)) {
++ log_debug(sg, "ignore barrier cb status %d", status);
++ return;
++ }
++
++ uev->ue_barrier_status = status;
++ uev->ue_state = UEST_BARRIER_DONE;
++ set_bit(UEFL_CHECK, &uev->ue_flags);
++ wake_serviced(DO_MEMBERSHIP);
++}
++
++void process_recovery_barrier(sm_group_t *sg, int status)
++{
++ if (status) {
++ log_error(sg, "process_recovery_barrier status=%d", status);
++ return;
++ }
++
++ if (sg->state != SGST_RECOVER ||
++ sg->recover_state != RECOVER_BARRIERWAIT) {
++ log_error(sg, "process_recovery_barrier state %d recover %d",
++ sg->state, sg->recover_state);
++ return;
++ }
++
++ if (!sg->recover_stop)
++ sg->recover_state = RECOVER_STOP;
++ else
++ sg->recover_state = RECOVER_BARRIERDONE;
++
++ wake_serviced(DO_RECOVERIES);
++}
++
++void process_barriers(void)
++{
++ sm_group_t *sg;
++ bc_entry_t *be;
++
++ while (1) {
++ be = NULL;
++
++ spin_lock(&barriers_lock);
++ if (!list_empty(&barriers)) {
++ be = list_entry(barriers.next, bc_entry_t, list);
++ list_del(&be->list);
++ }
++ spin_unlock(&barriers_lock);
++
++ if (!be)
++ break;
++
++ sg = sm_global_id_to_sg(be->gid);
++ if (!sg) {
++ log_print("process_barriers: no sg %08x", be->gid);
++ kfree(be);
++ continue;
++ }
++
++ switch (be->type) {
++ case SM_BARRIER_STARTDONE_NEW:
++ process_startdone_barrier_new(sg, be->status);
++ break;
++
++ case SM_BARRIER_STARTDONE:
++ process_startdone_barrier(sg, be->status);
++ break;
++
++ case SM_BARRIER_RECOVERY:
++ process_recovery_barrier(sg, be->status);
++ break;
++ }
++
++ kfree(be);
++ schedule();
++ }
++}
+diff -urN linux-orig/cluster/cman/sm_barrier.h linux-patched/cluster/cman/sm_barrier.h
+--- linux-orig/cluster/cman/sm_barrier.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_barrier.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_BARRIER_DOT_H__
++#define __SM_BARRIER_DOT_H__
++
++#define SM_BARRIER_STARTDONE (0)
++#define SM_BARRIER_STARTDONE_NEW (1)
++#define SM_BARRIER_RECOVERY (2)
++#define SM_BARRIER_RESET (3)
++
++void init_barriers(void);
++void process_barriers(void);
++int sm_barrier(char *name, int count, int type);
++void process_startdone_barrier(sm_group_t *sg, int status);
++void process_startdone_barrier_new(sm_group_t *sg, int status);
++void process_recovery_barrier(sm_group_t *sg, int status);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_control.c linux-patched/cluster/cman/sm_control.c
+--- linux-orig/cluster/cman/sm_control.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_control.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,156 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++#include "config.h"
++
++struct socket * sm_socket;
++uint32_t * sm_new_nodeids;
++uint32_t sm_our_nodeid;
++int sm_quorum, sm_quorum_next;
++struct list_head sm_members;
++int sm_member_count;
++
++
++/*
++ * Context: cnxman
++ * Called by cnxman when it has a new member list.
++ */
++
++void sm_member_update(int quorate)
++{
++ sm_quorum_next = quorate;
++ wake_serviced(DO_START_RECOVERY);
++}
++
++/*
++ * Context: cnxman
++ * Called when module is loaded.
++ */
++
++void sm_init(void)
++{
++ sm_socket = NULL;
++ sm_new_nodeids = NULL;
++ sm_quorum = 0;
++ sm_quorum_next = 0;
++ sm_our_nodeid = 0;
++ INIT_LIST_HEAD(&sm_members);
++ sm_member_count = 0;
++
++ init_services();
++ init_messages();
++ init_barriers();
++ init_serviced();
++ init_recovery();
++ init_joinleave();
++ init_sm_misc();
++}
++
++/*
++ * Context: cnxman
++ * Called at beginning of cluster join procedure.
++ */
++
++void sm_start(void)
++{
++ struct sockaddr_cl saddr;
++ struct socket *sock;
++ int result;
++
++ /* Create a communication channel among service managers */
++
++ result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
++ if (result < 0) {
++ log_print("can't create socket %d", result);
++ goto fail;
++ }
++
++ sm_socket = sock;
++
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_SERVICES;
++
++ result = sock->ops->bind(sock, (struct sockaddr *) &saddr,
++ sizeof(saddr));
++ if (result < 0) {
++ log_print("can't bind socket %d", result);
++ goto fail_release;
++ }
++
++ result = kcl_register_read_callback(sm_socket, sm_cluster_message);
++ if (result < 0) {
++ log_print("can't register read callback %d", result);
++ goto fail_release;
++ }
++
++ sm_new_nodeids = (uint32_t *) kmalloc(cman_config.max_nodes *
++ sizeof(uint32_t),
++ GFP_KERNEL);
++ if (!sm_new_nodeids) {
++ log_print("can't allocate nodeid array");
++ goto fail_release;
++ }
++
++ start_serviced();
++
++ /* cnxman should call sm_member_update() once we've joined - then we
++ * can get our first list of members and our own nodeid */
++
++ return;
++
++ fail_release:
++ sock_release(sm_socket);
++ sm_socket = NULL;
++
++ fail:
++ return;
++}
++
++/*
++ * Context: cnxman
++ * Called before cnxman leaves the cluster. If this returns an error to cman,
++ * cman should not leave the cluster but return EBUSY.
++ * If force is set we go away anyway; cman knows best in this case.
++ */
++
++int sm_stop(int force)
++{
++ struct list_head *head;
++ sm_group_t *sg;
++ sm_node_t *node;
++ int i, busy = FALSE, error = -EBUSY;
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ if (!list_empty(&sm_sg[i])) {
++ sg = list_entry(sm_sg[i].next, sm_group_t, list);
++ log_error(sg, "sm_stop: SG still joined");
++ busy = TRUE;
++ }
++ }
++
++ if (!busy || force) {
++ stop_serviced();
++
++ if (sm_socket)
++ sock_release(sm_socket);
++
++ head = &sm_members;
++ while (!list_empty(head)) {
++ node = list_entry(head->next, sm_node_t, list);
++ list_del(&node->list);
++ sm_member_count--;
++ kfree(node);
++ }
++
++ kfree(sm_new_nodeids);
++ sm_init();
++ error = 0;
++ }
++ return error;
++}
+diff -urN linux-orig/cluster/cman/sm_control.h linux-patched/cluster/cman/sm_control.h
+--- linux-orig/cluster/cman/sm_control.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_control.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_CONTROL_DOT_H__
++#define __SM_CONTROL_DOT_H__
++
++void sm_init(void);
++void sm_start(void);
++int sm_stop(int force);
++void sm_member_update(int quorate);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_daemon.c linux-patched/cluster/cman/sm_daemon.c
+--- linux-orig/cluster/cman/sm_daemon.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_daemon.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,120 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++static unsigned long daemon_flags;
++static struct task_struct * daemon_task;
++static struct completion daemon_done;
++static wait_queue_head_t daemon_wait;
++extern int sm_quorum;
++
++void init_serviced(void)
++{
++ daemon_flags = 0;
++ daemon_task = NULL;
++ init_completion(&daemon_done);
++ init_waitqueue_head(&daemon_wait);
++}
++
++void wake_serviced(int do_flag)
++{
++ set_bit(do_flag, &daemon_flags);
++ wake_up(&daemon_wait);
++}
++
++static inline int got_work(void)
++{
++ int rv = 0;
++
++ rv = (test_bit(DO_START_RECOVERY, &daemon_flags) ||
++ test_bit(DO_MESSAGES, &daemon_flags) ||
++ test_bit(DO_BARRIERS, &daemon_flags) ||
++ test_bit(DO_CALLBACKS, &daemon_flags));
++
++ if (sm_quorum && !rv)
++ rv = (test_bit(DO_JOINLEAVE, &daemon_flags) ||
++ test_bit(DO_RECOVERIES, &daemon_flags) ||
++ test_bit(DO_MEMBERSHIP, &daemon_flags));
++ return rv;
++}
++
++static int serviced(void *arg)
++{
++ DECLARE_WAITQUEUE(wait, current);
++
++ daemonize("cman_serviced");
++ daemon_task = current;
++ set_bit(DO_RUN, &daemon_flags);
++ complete(&daemon_done);
++
++ for (;;) {
++ if (test_and_clear_bit(DO_START_RECOVERY, &daemon_flags))
++ process_nodechange();
++
++ if (test_and_clear_bit(DO_MESSAGES, &daemon_flags))
++ process_messages();
++
++ if (test_and_clear_bit(DO_BARRIERS, &daemon_flags))
++ process_barriers();
++
++ if (test_and_clear_bit(DO_CALLBACKS, &daemon_flags))
++ process_callbacks();
++
++ if (sm_quorum) {
++ if (test_and_clear_bit(DO_RECOVERIES, &daemon_flags))
++ process_recoveries();
++
++ if (test_and_clear_bit(DO_JOINLEAVE, &daemon_flags))
++ process_joinleave();
++
++ if (test_and_clear_bit(DO_MEMBERSHIP, &daemon_flags))
++ process_membership();
++ }
++
++ if (!test_bit(DO_RUN, &daemon_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ add_wait_queue(&daemon_wait, &wait);
++ if (!got_work() && test_bit(DO_RUN, &daemon_flags))
++ schedule();
++ remove_wait_queue(&daemon_wait, &wait);
++ current->state = TASK_RUNNING;
++ }
++
++ complete(&daemon_done);
++ return 0;
++}
++
++int start_serviced(void)
++{
++ int error;
++
++ error = kernel_thread(serviced, NULL, 0);
++ if (error < 0)
++ goto out;
++
++ error = 0;
++ wait_for_completion(&daemon_done);
++
++ out:
++ return error;
++}
++
++void stop_serviced(void)
++{
++ clear_bit(DO_RUN, &daemon_flags);
++ wake_up(&daemon_wait);
++ wait_for_completion(&daemon_done);
++}
+diff -urN linux-orig/cluster/cman/sm_daemon.h linux-patched/cluster/cman/sm_daemon.h
+--- linux-orig/cluster/cman/sm_daemon.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_daemon.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,32 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_DAEMON_DOT_H__
++#define __SM_DAEMON_DOT_H__
++
++#define DO_RUN (0)
++#define DO_START_RECOVERY (1)
++#define DO_MESSAGES (2)
++#define DO_BARRIERS (3)
++#define DO_CALLBACKS (4)
++#define DO_JOINLEAVE (5)
++#define DO_RECOVERIES (6)
++#define DO_MEMBERSHIP (7)
++#define DO_RESET (8)
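++
++/*
++ * Each flag wakes the serviced daemon to run the corresponding handler,
++ * e.g. DO_BARRIERS -> process_barriers(), DO_JOINLEAVE ->
++ * process_joinleave() (see serviced() in sm_daemon.c).
++ */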
++
++void init_serviced(void);
++void wake_serviced(int do_flag);
++void stop_serviced(void);
++int start_serviced(void);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_internal.h linux-patched/cluster/cman/sm_internal.h
+--- linux-orig/cluster/cman/sm_internal.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_internal.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,230 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_INTERNAL_DOT_H__
++#define __SM_INTERNAL_DOT_H__
++
++/*
++ * Any header files needed by this file should be included before it in sm.h.
++ * This file should only be included by sm.h.
++ */
++
++struct sm_group;
++struct sm_sevent;
++struct sm_uevent;
++struct sm_node;
++struct sm_msg;
++
++typedef struct sm_group sm_group_t;
++typedef struct sm_sevent sm_sevent_t;
++typedef struct sm_uevent sm_uevent_t;
++typedef struct sm_node sm_node_t;
++typedef struct sm_msg sm_msg_t;
++
++
++/*
++ * Number of seconds to wait before trying again to join or leave an SG
++ */
++#define RETRY_DELAY (2)
++
++
++/*
++ * Service Event - what a node uses to join or leave an sg
++ */
++
++/* SE Flags */
++#define SEFL_CHECK (0)
++#define SEFL_ALLOW_JOIN (1)
++#define SEFL_ALLOW_JSTOP (2)
++#define SEFL_ALLOW_LEAVE (3)
++#define SEFL_ALLOW_LSTOP (4)
++#define SEFL_ALLOW_STARTDONE (5)
++#define SEFL_ALLOW_BARRIER (6)
++#define SEFL_DELAY (7)
++#define SEFL_LEAVE (8)
++#define SEFL_CANCEL (9)
++
++/* SE States */
++#define SEST_JOIN_BEGIN (1)
++#define SEST_JOIN_ACKWAIT (2)
++#define SEST_JOIN_ACKED (3)
++#define SEST_JSTOP_ACKWAIT (4)
++#define SEST_JSTOP_ACKED (5)
++#define SEST_JSTART_SERVICEWAIT (6)
++#define SEST_JSTART_SERVICEDONE (7)
++#define SEST_BARRIER_WAIT (8)
++#define SEST_BARRIER_DONE (9)
++#define SEST_LEAVE_BEGIN (10)
++#define SEST_LEAVE_ACKWAIT (11)
++#define SEST_LEAVE_ACKED (12)
++#define SEST_LSTOP_ACKWAIT (13)
++#define SEST_LSTOP_ACKED (14)
++#define SEST_LSTART_WAITREMOTE (15)
++#define SEST_LSTART_REMOTEDONE (16)
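++
++/*
++ * A successful join walks these states roughly in order:
++ *
++ *   JOIN_BEGIN -> JOIN_ACKWAIT -> JOIN_ACKED -> JSTOP_ACKWAIT ->
++ *   JSTOP_ACKED -> JSTART_SERVICEWAIT -> JSTART_SERVICEDONE ->
++ *   BARRIER_WAIT -> BARRIER_DONE
++ *
++ * (driven by process_join_sevent in sm_joinleave.c).
++ */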
++
++struct sm_sevent {
++ struct list_head se_list;
++ unsigned int se_id;
++ sm_group_t * se_sg;
++ unsigned long se_flags;
++ unsigned int se_state;
++
++ int se_node_count;
++ int se_memb_count;
++ int se_reply_count;
++
++ uint32_t * se_node_ids;
++ char * se_node_status;
++ int se_len_ids; /* length of node_ids */
++ int se_len_status; /* length of node_status */
++
++ int se_barrier_status;
++ struct timer_list se_restart_timer;
++};
++
++/*
++ * Update Event - what an sg member uses to respond to an sevent
++ */
++
++/* UE Flags */
++#define UEFL_ALLOW_STARTDONE (0)
++#define UEFL_ALLOW_BARRIER (1)
++#define UEFL_CANCEL (2)
++#define UEFL_LEAVE (3)
++#define UEFL_CHECK (4)
++
++/* UE States */
++#define UEST_JSTOP (1)
++#define UEST_JSTART_WAITCMD (2)
++#define UEST_JSTART (3)
++#define UEST_JSTART_SERVICEWAIT (4)
++#define UEST_JSTART_SERVICEDONE (5)
++#define UEST_BARRIER_WAIT (6)
++#define UEST_BARRIER_DONE (7)
++#define UEST_LSTOP (8)
++#define UEST_LSTART_WAITCMD (9)
++#define UEST_LSTART (10)
++#define UEST_LSTART_SERVICEWAIT (11)
++#define UEST_LSTART_SERVICEDONE (12)
++
++struct sm_uevent {
++ unsigned int ue_state;
++ unsigned long ue_flags;
++ uint32_t ue_id;
++ uint32_t ue_nodeid;
++ int ue_num_nodes;
++ int ue_barrier_status;
++ uint16_t ue_remote_seid;
++};
++
++/*
++ * Service Group
++ */
++
++#define RECOVER_NONE (0)
++#define RECOVER_STOP (1)
++#define RECOVER_START (2)
++#define RECOVER_STARTDONE (3)
++#define RECOVER_BARRIERWAIT (4)
++#define RECOVER_BARRIERDONE (5)
++
++/* SG Flags */
++#define SGFL_SEVENT (1)
++#define SGFL_UEVENT (2)
++#define SGFL_NEED_RECOVERY (3)
++
++/* SG States */
++#define SGST_NONE (0)
++#define SGST_JOIN (1)
++#define SGST_RUN (2)
++#define SGST_RECOVER (3)
++#define SGST_UEVENT (4)
++
++struct sm_group {
++ struct list_head list; /* list of sg's */
++ uint16_t level;
++ uint32_t local_id;
++ uint32_t global_id;
++ unsigned long flags;
++ int state;
++ int refcount; /* references from reg/unreg */
++ void * service_data; /* data from the service */
++ struct kcl_service_ops *ops; /* ops from the service */
++ struct completion event_comp;
++
++ struct list_head memb; /* Membership List for RC */
++ int memb_count; /* number of nodes in memb */
++ struct list_head joining; /* nodes joining the sg */
++ sm_sevent_t * sevent;
++ sm_uevent_t uevent;
++
++ int recover_state;
++ int recover_stop;
++ struct list_head recover_list; /* recovery event list */
++ void * recover_data;
++ char recover_barrier[MAX_BARRIER_NAME_LEN];
++
++ int namelen;
++ char name[1]; /* must be last field */
++};
++
++/*
++ * Service Message
++ */
++
++/* SMSG Type */
++#define SMSG_JOIN_REQ (1)
++#define SMSG_JOIN_REP (2)
++#define SMSG_JSTOP_REQ (3)
++#define SMSG_JSTOP_REP (4)
++#define SMSG_JSTART_CMD (5)
++#define SMSG_LEAVE_REQ (6)
++#define SMSG_LEAVE_REP (7)
++#define SMSG_LSTOP_REQ (8)
++#define SMSG_LSTOP_REP (9)
++#define SMSG_LSTART_CMD (10)
++#define SMSG_LSTART_DONE (11)
++#define SMSG_RECOVER (12)
++
++/* SMSG Status */
++#define STATUS_POS (1)
++#define STATUS_NEG (2)
++#define STATUS_WAIT (3)
++
++struct sm_msg {
++ uint8_t ms_type;
++ uint8_t ms_status;
++ uint16_t ms_sevent_id;
++ uint32_t ms_global_sgid;
++ uint32_t ms_global_lastid;
++ uint16_t ms_sglevel;
++ uint16_t ms_length;
++ /* buf of ms_length bytes follows */
++};
++
++/*
++ * Node structure
++ */
++
++#define SNFL_NEED_RECOVERY (0)
++#define SNFL_CLUSTER_MEMBER (1)
++#define SNFL_LEAVING (2)
++
++struct sm_node {
++ struct list_head list;
++ uint32_t id; /* node id from cnxman */
++ unsigned long flags;
++ int incarnation; /* node incarnation number */
++};
++
++#endif /* __SM_INTERNAL_DOT_H__ */
+diff -urN linux-orig/cluster/cman/sm_joinleave.c linux-patched/cluster/cman/sm_joinleave.c
+--- linux-orig/cluster/cman/sm_joinleave.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_joinleave.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,1286 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++/*
++ * Routines used by nodes that are joining or leaving a SG. These "sevent"
++ * routines initiate membership changes to a SG. Existing SG members respond
++ * using the "uevent" membership update routines.
++ */
++
++extern uint32_t sm_our_nodeid;
++extern struct list_head sm_members;
++static struct list_head new_event;
++static spinlock_t new_event_lock;
++static struct list_head joinleave_events;
++
++void init_joinleave(void)
++{
++ INIT_LIST_HEAD(&new_event);
++ spin_lock_init(&new_event_lock);
++ INIT_LIST_HEAD(&joinleave_events);
++}
++
++void new_joinleave(sm_sevent_t *sev)
++{
++ spin_lock(&new_event_lock);
++ list_add_tail(&sev->se_list, &new_event);
++ spin_unlock(&new_event_lock);
++ wake_serviced(DO_JOINLEAVE);
++}
++
++sm_sevent_t *find_sevent(unsigned int id)
++{
++ sm_sevent_t *sev;
++
++ list_for_each_entry(sev, &joinleave_events, se_list) {
++ if (sev->se_id == id)
++ return sev;
++ }
++ return NULL;
++}
++
++static void release_sevent(sm_sevent_t *sev)
++{
++ if (sev->se_len_ids) {
++ kfree(sev->se_node_ids);
++ sev->se_node_ids = NULL;
++ }
++
++ if (sev->se_len_status) {
++ kfree(sev->se_node_status);
++ sev->se_node_status = NULL;
++ }
++
++ sev->se_node_count = 0;
++ sev->se_memb_count = 0;
++ sev->se_reply_count = 0;
++}
++
++static int init_sevent(sm_sevent_t *sev)
++{
++ sm_node_t *node;
++ int len1, len2, count, cluster_members = 0;
++
++ /* clear state from any previous attempt */
++ release_sevent(sev);
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
++ cluster_members++;
++ }
++
++ sev->se_node_count = cluster_members;
++ sev->se_memb_count = sev->se_sg->memb_count;
++
++ /*
++ * When joining, we need a node array the size of the entire cluster
++ * member list because we get responses from all nodes. When leaving,
++ * we only get responses from SG members, so the node array need only
++ * be that large.
++ */
++
++ if (sev->se_state < SEST_LEAVE_BEGIN)
++ count = sev->se_node_count;
++ else
++ count = sev->se_memb_count;
++
++ len1 = count * sizeof(uint32_t);
++ sev->se_len_ids = len1;
++
++ sev->se_node_ids = (uint32_t *) kmalloc(len1, GFP_KERNEL);
++ if (!sev->se_node_ids)
++ goto fail;
++
++ len2 = count * sizeof (char);
++ sev->se_len_status = len2;
++
++ sev->se_node_status = (char *) kmalloc(len2, GFP_KERNEL);
++ if (!sev->se_node_status)
++ goto fail_free;
++
++ memset(sev->se_node_status, 0, len2);
++ memset(sev->se_node_ids, 0, len1);
++
++ return 0;
++
++ fail_free:
++ kfree(sev->se_node_ids);
++ sev->se_node_ids = NULL;
++ sev->se_len_ids = 0;
++
++ fail:
++ return -ENOMEM;
++}
++
++/* Context: timer */
++
++static void sev_restart(unsigned long data)
++{
++ sm_sevent_t *sev = (sm_sevent_t *) data;
++
++ clear_bit(SEFL_DELAY, &sev->se_flags);
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++}
++
++static void schedule_sev_restart(sm_sevent_t *sev)
++{
++ init_timer(&sev->se_restart_timer);
++ sev->se_restart_timer.function = sev_restart;
++ sev->se_restart_timer.data = (long) sev;
++ mod_timer(&sev->se_restart_timer, jiffies + (RETRY_DELAY * HZ));
++}
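++
++/*
++ * A join or leave cancelled because another node is mid-join/leave is
++ * retried this way RETRY_DELAY (2) seconds later.
++ */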
++
++void free_sg_memb(sm_group_t *sg)
++{
++ sm_node_t *node;
++
++ while (!list_empty(&sg->memb)) {
++ node = list_entry(sg->memb.next, sm_node_t, list);
++ list_del(&node->list);
++ kfree(node);
++ }
++ sg->memb_count = 0;
++}
++
++/*
++ * 1. First step in joining a SG - send a message to all nodes in the cluster
++ * asking to join the named SG. If any nodes are members they will reply with
++ * a POS, or a WAIT (wait means try again, only one node can join at a time).
++ * If no one knows about this SG, they all send NEG replies which means we form
++ * the SG with just ourself as a member.
++ */
++
++static int send_join_notice(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ sm_node_t *node;
++ char *msg;
++ int i = 0, error, namelen, len = 0;
++
++ /*
++ * Create node array from member list in which to collect responses.
++ */
++
++ error = init_sevent(sev);
++ if (error)
++ goto out;
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
++ sev->se_node_ids[i++] = node->id;
++ }
++
++ /*
++ * Create and send a join request message.
++ *
++ * Other nodes then run process_join_request and reply to us; we
++ * collect the responses in process_reply and check them in
++ * check_join_notice.
++ */
++
++ namelen = sg->namelen;
++ msg = create_smsg(sg, SMSG_JOIN_REQ, namelen, &len, sev);
++ memcpy(msg + sizeof(sm_msg_t), sg->name, namelen);
++
++ error = send_broadcast_message_sev(msg, len, sev);
++
++ out:
++ return error;
++}
++
++/*
++ * 2. Second step in joining a SG - after we collect all replies to our join
++ * request, we look at them. If anyone told us to wait, we'll wait a while, go
++ * back and start at step 1 again.
++ */
++
++static int check_join_notice(sm_sevent_t *sev)
++{
++ int pos = 0, wait = 0, neg = 0, restart = 0, i, error = 0;
++
++ for (i = 0; i < sev->se_node_count; i++) {
++ switch (sev->se_node_status[i]) {
++ case STATUS_POS:
++ /* this node is in the SG and will be in new proposed
++ * memb list */
++ pos++;
++ break;
++
++ case STATUS_WAIT:
++ /* this node is in the SG but something else is
++ * happening with it at the moment. */
++ wait++;
++ break;
++
++ case STATUS_NEG:
++ /* this node has no record of the SG we're interested
++ * in */
++ neg++;
++
++ if (sev->se_node_ids[i] == sm_our_nodeid)
++ sev->se_node_status[i] = STATUS_POS;
++ break;
++
++ default:
++ /* we didn't get a valid response from this node,
++ * restart the entire sev. */
++ restart++;
++ break;
++ }
++ }
++
++ if (pos && !wait && !restart) {
++ /* all current members of this sg pos'ed our entry */
++ } else if (!pos && !wait && !restart && neg) {
++ /* we're the first in the cluster to join this sg */
++ sev->se_sg->global_id = sm_new_global_id(sev->se_sg->level);
++ } else
++ error = -1;
++
++ return error;
++}
++
++/*
++ * 3. Third step in joining the SG - tell the nodes that are already members
++ * to "stop" the service. We stop them so that everyone can restart with the
++ * new member (us!) added.
++ */
++
++static int send_join_stop(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ sm_node_t *node;
++ char *msg;
++ uint32_t be_count;
++ int i, len = 0, error = 0;
++
++ /*
++ * Form the SG memb list with us in it.
++ */
++
++ for (i = 0; i < sev->se_node_count; i++) {
++ if (sev->se_node_status[i] != STATUS_POS)
++ continue;
++
++ node = sm_new_node(sev->se_node_ids[i]);
++ if (!node)
++ goto fail;
++
++ list_add_tail(&node->list, &sg->memb);
++ sg->memb_count++;
++ }
++
++ /*
++ * Re-init the node vector in which to collect responses again.
++ */
++
++ sev->se_memb_count = sg->memb_count;
++
++ memset(sev->se_node_status, 0, sev->se_len_status);
++ memset(sev->se_node_ids, 0, sev->se_len_ids);
++ i = 0;
++
++ list_for_each_entry(node, &sg->memb, list)
++ sev->se_node_ids[i++] = node->id;
++
++ /*
++ * Create and send a stop message.
++ *
++ * Other nodes then run process_stop_request and process_join_stop and
++ * reply to us. They stop the sg we're trying to join if they agree.
++ * We collect responses in process_reply and check them in
++ * check_join_stop.
++ */
++
++ msg = create_smsg(sg, SMSG_JSTOP_REQ, sizeof(uint32_t), &len, sev);
++ be_count = cpu_to_be32(sg->memb_count);
++ memcpy(msg + sizeof(sm_msg_t), &be_count, sizeof(uint32_t));
++
++ error = send_members_message_sev(sg, msg, len, sev);
++ if (error < 0)
++ goto fail;
++
++ return 0;
++
++ fail:
++ free_sg_memb(sg);
++ return error;
++}
++
++/*
++ * 4. Fourth step in joining the SG - after we collect replies to our stop
++ * request, we look at them. Everyone sending POS agrees with us joining and
++ * has stopped their SG. If some nodes sent NEG, something is wrong and we
++ * don't have a good way to address that yet since some nodes may have sent
++ * POS.
++ *
++ * FIXME: even nodes replying with NEG should stop their SG so we can send an
++ * abort and have everyone at the same place to start from again.
++ */
++
++static int check_join_stop(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ int i, pos = 0, neg = 0;
++
++ for (i = 0; i < sev->se_memb_count; i++) {
++ switch (sev->se_node_status[i]) {
++ case STATUS_POS:
++ pos++;
++ break;
++
++ case STATUS_NEG:
++ log_error(sg, "check_join_stop: neg from nodeid %u "
++ "(%d, %d, %u)", sev->se_node_ids[i],
++ pos, neg, sev->se_memb_count);
++ neg++;
++ break;
++
++ default:
++ log_error(sg, "check_join_stop: unknown status=%u "
++ "nodeid=%u", sev->se_node_status[i],
++ sev->se_node_ids[i]);
++ neg++;
++ break;
++ }
++ }
++
++ if (pos == sg->memb_count)
++ return 0;
++
++ free_sg_memb(sg);
++ return -1;
++}
++
++/*
++ * 5. Fifth step in joining the SG - everyone has stopped their service and we
++ * all now start the service with us, the new member, added to the SG member
++ * list. We send start to our own service here and send a message to the other
++ * members that they should also start their service.
++ */
++
++static int send_join_start(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ sm_node_t *node;
++ uint32_t *memb;
++ char *msg;
++ int error, count = 0, len = 0;
++
++ /*
++ * Create a start message and send it.
++ */
++
++ msg = create_smsg(sg, SMSG_JSTART_CMD, 0, &len, sev);
++
++ error = send_members_message(sg, msg, len);
++ if (error < 0)
++ goto fail;
++
++ /*
++ * Start the service ourself. The chunk of memory with the member ids
++ * must be freed by the service when it is done with it.
++ */
++
++ SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
++ memb);
++
++ list_for_each_entry(node, &sg->memb, list)
++ memb[count++] = node->id;
++
++ set_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
++
++ sg->ops->start(sg->service_data, memb, count, sev->se_id,
++ SERVICE_NODE_JOIN);
++ return 0;
++
++ fail:
++ free_sg_memb(sg);
++ return error;
++}
++
++/*
++ * 6. Sixth step in joining the SG - once the service has completed its start,
++ * it does a kcl_start_done() to signal us that it's done. That gets us here
++ * and we do a barrier with all other members which join the barrier when their
++ * service is done starting.
++ */
++
++static int startdone_barrier_new(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ char bname[MAX_BARRIER_NAME_LEN];
++ int error;
++
++ memset(bname, 0, MAX_BARRIER_NAME_LEN);
++ sev->se_barrier_status = -1;
++
++ set_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
++
++ /* If we're the only member, skip the barrier */
++ if (sg->memb_count == 1) {
++ process_startdone_barrier_new(sg, 0);
++ return 0;
++ }
++
++ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
++ sg->global_id, sm_our_nodeid, sev->se_id, sg->memb_count);
++
++ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE_NEW);
++ if (error)
++ goto fail;
++
++ return 0;
++
++ fail:
++ clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
++ sg->ops->stop(sg->service_data);
++ free_sg_memb(sg);
++ return error;
++}
++
++/*
++ * 7. Seventh step in joining the SG - check that the barrier we joined with
++ * all other members returned with a successful status.
++ */
++
++static int check_startdone_barrier_new(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ int error = sev->se_barrier_status;
++
++ if (error) {
++ sg->ops->stop(sg->service_data);
++ free_sg_memb(sg);
++ }
++ return error;
++}
++
++/*
++ * 8. Eighth step in joining the SG - send the service a "finish" indicating
++ * that all members have successfully started the service.
++ */
++
++static void do_finish_new(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ sg->state = SGST_RUN;
++ sg->sevent = NULL;
++ clear_bit(SGFL_SEVENT, &sg->flags);
++
++ sg->ops->finish(sg->service_data, sev->se_id);
++}
++
++/*
++ * 9. Ninth step in joining the SG - it's done so get rid of the sevent stuff
++ * and tell the process which initiated the join that it's done.
++ */
++
++static void sevent_done(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ list_del(&sev->se_list);
++ release_sevent(sev);
++ kfree(sev);
++ complete(&sg->event_comp);
++}
++
++/*
++ * Move through the steps of a join. Summary:
++ *
++ * 1. Send a join notice to all cluster members.
++ * 2. Collect and check replies to the join notice.
++ * 3. Send a stop message to all SG members.
++ * 4. Collect and check replies to the stop message.
++ * 5. Send a start message to all SG members and start service ourself.
++ * 6. Use barrier to wait for all nodes to complete the start.
++ * 7. Check that all SG members joined the barrier.
++ * 8. Send finish to the service indicating that all nodes started it.
++ * 9. Clean up sevent and signal completion to the process that started the join
++ */
++
++static void process_join_sevent(sm_sevent_t *sev)
++{
++ int error = 0;
++
++ /*
++ * We may cancel the current join attempt if another node is also
++ * attempting to join or leave. (Only a single node can join or leave
++ * at once.) If cancelled, our join attempt will be restarted later.
++ */
++
++ if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
++ error = -1;
++ goto cancel;
++ }
++
++ log_debug(sev->se_sg, "sevent state %u", sev->se_state);
++
++ switch (sev->se_state) {
++
++ /*
++ * An sevent is created in kcl_join_service with a state of
++ * JOIN_BEGIN.
++ */
++
++ case SEST_JOIN_BEGIN:
++ sev->se_state = SEST_JOIN_ACKWAIT;
++ error = send_join_notice(sev);
++ break;
++
++ /*
++ * se_state is changed from JOIN_ACKWAIT to JOIN_ACKED in
++ * process_reply (when all the replies have been received)
++ */
++
++ case SEST_JOIN_ACKED:
++ error = check_join_notice(sev);
++ if (error)
++ break;
++
++ sev->se_state = SEST_JSTOP_ACKWAIT;
++ error = send_join_stop(sev);
++ break;
++
++ /*
++ * se_state is changed from JSTOP_ACKWAIT to JSTOP_ACKED in
++ * process_reply (when all the replies have been received)
++ */
++
++ case SEST_JSTOP_ACKED:
++ error = check_join_stop(sev);
++ if (error)
++ break;
++
++ sev->se_state = SEST_JSTART_SERVICEWAIT;
++ error = send_join_start(sev);
++ break;
++
++ /*
++ * se_state is changed from JSTART_SERVICEWAIT to
++ * JSTART_SERVICEDONE in kcl_start_done
++ */
++
++ case SEST_JSTART_SERVICEDONE:
++ sev->se_state = SEST_BARRIER_WAIT;
++ error = startdone_barrier_new(sev);
++ break;
++
++ /*
++ * se_state is changed from BARRIER_WAIT to BARRIER_DONE in
++ * process_startdone_barrier_new
++ */
++
++ case SEST_BARRIER_DONE:
++ error = check_startdone_barrier_new(sev);
++ if (error)
++ break;
++
++ do_finish_new(sev);
++ sevent_done(sev);
++ break;
++
++ default:
++ log_error(sev->se_sg, "no join processing for state %u",
++ sev->se_state);
++ }
++
++ cancel:
++ if (error) {
++ /* restart the sevent from the beginning */
++ sev->se_state = SEST_JOIN_BEGIN;
++ sev->se_sg->global_id = 0;
++ set_bit(SEFL_DELAY, &sev->se_flags);
++ schedule_sev_restart(sev);
++ }
++}
++
++/*
++ * 1. First step in leaving an SG - send a message to other SG members asking
++ * to leave the SG. Nodes that don't have another active sevent or uevent for
++ * this SG will return POS.
++ */
++
++static int send_leave_notice(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ sm_node_t *node;
++ char *msg;
++ int i = 0, error = -1, len = 0;
++
++ /*
++ * Create a node array from member list in which to collect responses.
++ */
++
++ error = init_sevent(sev);
++ if (error)
++ goto out;
++
++ list_for_each_entry(node, &sg->memb, list)
++ sev->se_node_ids[i++] = node->id;
++
++ /*
++ * Create and send a leave request message.
++ */
++
++ msg = create_smsg(sg, SMSG_LEAVE_REQ, 0, &len, sev);
++
++ error = send_members_message_sev(sg, msg, len, sev);
++
++ out:
++ return error;
++}
++
++/*
++ * 2. Second step in leaving an SG - after we collect all replies to our leave
++ * request, we look at them. If anyone replied with WAIT, we abort our attempt
++ * at leaving and try again in a bit.
++ */
++
++static int check_leave_notice(sm_sevent_t *sev)
++{
++ int pos = 0, wait = 0, neg = 0, restart = 0, i;
++
++ for (i = 0; i < sev->se_memb_count; i++) {
++ switch (sev->se_node_status[i]) {
++ case STATUS_POS:
++ pos++;
++ break;
++
++ case STATUS_WAIT:
++ wait++;
++ break;
++
++ case STATUS_NEG:
++ neg++;
++ break;
++
++ default:
++ /* we didn't get a valid response from this node,
++ * restart the entire sev. */
++ restart++;
++ break;
++ }
++ }
++
++ /* all members approve */
++ if (pos && !wait && !restart)
++ return 0;
++
++ return -1;
++}
++
++/*
++ * 3. Third step in leaving the SG - tell the member nodes to "stop" the SG.
++ * They must be stopped in order to restart without us as a member.
++ */
++
++static int send_leave_stop(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ char *msg;
++ int error, len = 0;
++
++ /*
++ * Re-init the status vector in which to collect responses.
++ */
++
++ memset(sev->se_node_status, 0, sev->se_len_status);
++
++ /*
++ * Create and send a stop message.
++ */
++
++ msg = create_smsg(sg, SMSG_LSTOP_REQ, 0, &len, sev);
++
++ error = send_members_message_sev(sg, msg, len, sev);
++ if (error < 0)
++ goto out;
++
++ /*
++ * we and all others stop the SG now
++ */
++
++ sg->ops->stop(sg->service_data);
++
++ out:
++ return error;
++}
++
++/*
++ * 4. Fourth step in leaving the SG - check the replies to our stop request.
++ * Same problem with getting different replies as check_join_stop.
++ */
++
++static int check_leave_stop(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ int i, pos = 0, neg = 0;
++
++ for (i = 0; i < sev->se_memb_count; i++) {
++ switch (sev->se_node_status[i]) {
++ case STATUS_POS:
++ pos++;
++ break;
++
++ case STATUS_NEG:
++ log_error(sg, "check_leave_stop: fail from nodeid %u "
++ "(%d, %d, %u)", sev->se_node_ids[i],
++ pos, neg, sev->se_memb_count);
++ neg++;
++ break;
++
++ default:
++ log_error(sg, "check_leave_stop: status %u nodeid %u",
++ sev->se_node_status[i], sev->se_node_ids[i]);
++ neg++;
++ break;
++ }
++ }
++
++ if (pos == sg->memb_count)
++ return 0;
++
++ return -1;
++}
++
++/*
++ * 5. Fifth step in leaving the SG - tell the other SG members to restart the
++ * service without us. We, of course, don't start our own stopped service. If
++ * we're the last SG member and leaving, we jump right to the next step.
++ */
++
++static int send_leave_start(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ char *msg;
++ int error = 0, len = 0;
++
++ if (sg->memb_count == 1) {
++ sev->se_state = SEST_LSTART_REMOTEDONE;
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ } else {
++ msg = create_smsg(sg, SMSG_LSTART_CMD, 0, &len, sev);
++ error = send_members_message(sg, msg, len);
++ }
++ return error;
++}
++
++/*
++ * Move through the steps of a leave. Summary:
++ *
++ * 1. Send a leave notice to all SG members.
++ * 2. Collect and check replies to the leave notice.
++ * 3. Send a stop message to all SG members and stop our own SG.
++ * 4. Collect and check replies to the stop message.
++ * 5. Send a start message to SG members.
++ * 6. Clean up sevent and signal completion to the process that
++ * started the leave.
++ */
++
++static void process_leave_sevent(sm_sevent_t *sev)
++{
++ int error = 0;
++
++ /*
++ * We may cancel the current leave attempt if another node is also
++ * attempting to join or leave. (Only a single node can join or leave
++ * at once.) Our leave attempt will be restarted after being
++ * cancelled.
++ */
++
++ if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
++ error = 1;
++ goto cancel;
++ }
++
++ if (test_bit(SGFL_UEVENT, &sev->se_sg->flags)) {
++ error = 2;
++ goto cancel;
++ }
++
++ if (!list_empty(&sev->se_sg->joining)) {
++ error = 3;
++ goto cancel;
++ }
++
++ log_debug(sev->se_sg, "sevent state %u", sev->se_state);
++
++ switch (sev->se_state) {
++
++ /*
++ * An sevent is created in kcl_leave_service with a state of
++ * LEAVE_BEGIN.
++ */
++
++ case SEST_LEAVE_BEGIN:
++ sev->se_state = SEST_LEAVE_ACKWAIT;
++ error = send_leave_notice(sev);
++ break;
++
++ /*
++ * se_state is changed from LEAVE_ACKWAIT to LEAVE_ACKED in
++ * process_reply (when all the replies have been received)
++ */
++
++ case SEST_LEAVE_ACKED:
++ error = check_leave_notice(sev);
++ if (error)
++ break;
++
++ sev->se_state = SEST_LSTOP_ACKWAIT;
++ error = send_leave_stop(sev);
++ break;
++
++ /*
++ * se_state is changed from LSTOP_ACKWAIT to LSTOP_ACKED in
++ * process_reply
++ */
++
++ case SEST_LSTOP_ACKED:
++ error = check_leave_stop(sev);
++ if (error)
++ break;
++
++ sev->se_state = SEST_LSTART_WAITREMOTE;
++ error = send_leave_start(sev);
++ break;
++
++ /*
++ * se_state is changed from LSTART_WAITREMOTE to
++ * LSTART_REMOTEDONE in process_leave_done
++ */
++
++ case SEST_LSTART_REMOTEDONE:
++ sevent_done(sev);
++ break;
++
++ default:
++ log_error(sev->se_sg, "process_leave_sevent state=%u\n",
++ sev->se_state);
++ }
++
++ cancel:
++ if (error) {
++ /* restart the sevent from the beginning */
++ sev->se_state = SEST_LEAVE_BEGIN;
++ set_bit(SEFL_DELAY, &sev->se_flags);
++ schedule_sev_restart(sev);
++ }
++}
++
++/*
++ * Sevent backout code. Take appropriate steps when a recovery occurs while
++ * we're in the midst of an sevent. The recovery may or may not affect the
++ * sevent. If it does, it usually means cancelling the sevent and restarting
++ * it from the beginning once the recovery processing is done.
++ */
++
++/*
++ * If any of the nodes that replied with OK is dead, we give up on the current
++ * join attempt and restart. Otherwise, this sevent can continue.
++ */
++
++static int backout_join_acked(sm_sevent_t *sev)
++{
++ sm_node_t *node;
++ int i;
++
++ for (i = 0; i < sev->se_node_count; i++) {
++ if (sev->se_node_status[i] != STATUS_POS)
++ continue;
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags) &&
++ (node->id == sev->se_node_ids[i]))
++ return TRUE;
++ }
++ }
++ return FALSE;
++}
++
++/*
++ * In this state our sg member list exists and mark_affected_sgs() will have
++ * set NEED_RECOVERY if any of the nodes in the sg we're joining is dead. We
++ * restart the join process if this is the case, otherwise this sevent can
++ * continue.
++ */
++
++static int backout_jstop_ackwait(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ clear_bit(SEFL_ALLOW_JSTOP, &sev->se_flags);
++ free_sg_memb(sg);
++ return TRUE;
++}
++
++/*
++ * Same as previous.
++ */
++
++static int backout_jstop_acked(sm_sevent_t *sev)
++{
++ return backout_jstop_ackwait(sev);
++}
++
++/*
++ * If NEED_RECOVERY is set a member of the sg we're joining died while we were
++ * starting our service. The recovery process will restart the service on all
++ * the prior sg members (not including those that died or us). We will
++ * reattempt our join which should be accepted once the nodes are done with
++ * recovery.
++ */
++
++static int backout_jstart_servicewait(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
++ sg->ops->stop(sg->service_data);
++ free_sg_memb(sg);
++ return TRUE;
++}
++
++/*
++ * Same as previous.
++ */
++
++static int backout_jstart_servicedone(sm_sevent_t *sev)
++{
++ return backout_jstart_servicewait(sev);
++}
++
++/*
++ * If NEED_RECOVERY is set a member of the sg we're joining died while we were
++ * waiting on the "all done" barrier. Stop our service that we just started
++ * and cancel the barrier. The recovery process will restart the service on
++ * all the prior sg members (not including those that died or us). We will
++ * reattempt our join which should be accepted once the nodes are done with
++ * recovery.
++ */
++
++static int backout_barrier_wait(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++ char bname[MAX_BARRIER_NAME_LEN];
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
++
++ sg->ops->stop(sg->service_data);
++
++ memset(bname, 0, MAX_BARRIER_NAME_LEN);
++ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
++ sg->global_id, sm_our_nodeid, sev->se_id,
++ sg->memb_count);
++ kcl_barrier_cancel(bname);
++
++ free_sg_memb(sg);
++ return TRUE;
++}
++
++/*
++ * If NEED_RECOVERY is set, a member of the sg we just joined has failed. The
++ * recovery began after the barrier callback. If the result in the callback is
++ * "success" then we are joined, this sevent is finished and we'll process the
++ * sg within the forthcoming recovery with the other members.
++ *
++ * We rely upon cnxman to guarantee that once all nodes have joined a barrier,
++ * all nodes will receive the corresponding barrier callback *before any*
++ * receive an sm_member_update() due to one of those nodes failing just after
++ * joining the barrier. If some nodes receive the sm_member_update() before
++ * the barrier callback and others receive the barrier callback before the
++ * sm_member_update() then they will disagree as to whether the node joining/
++ * leaving is in/out of the sg.
++ */
++
++static int backout_barrier_done(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ if (!sev->se_barrier_status) {
++ do_finish_new(sev);
++ sevent_done(sev);
++ return FALSE;
++ } else {
++ sg->ops->stop(sg->service_data);
++ free_sg_memb(sg);
++ return TRUE;
++ }
++}
++
++/*
++ * We've done nothing yet, just restart when recovery is done (if sg is flagged
++ * with recovery.)
++ */
++
++static int backout_leave_begin(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ return TRUE;
++}
++
++/*
++ * Ignore any replies to our leave notice and restart when recovery is done (if
++ * sg is flagged with recovery.)
++ */
++
++static int backout_leave_ackwait(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ clear_bit(SEFL_ALLOW_LEAVE, &sev->se_flags);
++
++ return TRUE;
++}
++
++/*
++ * Same as previous.
++ */
++
++static int backout_leave_acked(sm_sevent_t *sev)
++{
++ return backout_leave_ackwait(sev);
++}
++
++/*
++ * Ignore any stop replies. All the members will be stopped anyway to do the
++ * recovery. Let that happen and restart our leave when done.
++ */
++
++static int backout_lstop_ackwait(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ clear_bit(SEFL_ALLOW_LSTOP, &sev->se_flags);
++
++ return TRUE;
++}
++
++/*
++ * Same as previous.
++ */
++
++static int backout_lstop_acked(sm_sevent_t *sev)
++{
++ return backout_lstop_ackwait(sev);
++}
++
++/*
++ * All members will be stopped due to recovery and restarted by recovery
++ * processing. That includes us, we have to retry the leave once the recovery
++ * is done.
++ */
++
++static int backout_lstart_waitremote(sm_sevent_t *sev)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ return FALSE;
++
++ return TRUE;
++}
++
++/*
++ * Reset an sevent to its beginning so it can be restarted. This is necessary
++ * when recovery affects an SG while we're trying to join or leave (ie. a node
++ * in the SG fails).
++ */
++
++void backout_sevents(void)
++{
++ sm_sevent_t *sev, *safe;
++ int delay;
++
++ list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
++
++ delay = FALSE;
++
++ log_debug(sev->se_sg, "backout sevent state %u", sev->se_state);
++
++ switch (sev->se_state) {
++
++ /* backout after kcl_join_service and before
++ * send_join_notice */
++ case SEST_JOIN_BEGIN:
++ break;
++
++ /* backout after send_join_notice and before final
++ * process_reply */
++ case SEST_JOIN_ACKWAIT:
++ clear_bit(SEFL_ALLOW_JOIN, &sev->se_flags);
++ sev->se_state = SEST_JOIN_BEGIN;
++ schedule_sev_restart(sev);
++ break;
++
++ /* backout after final process_reply and before
++ * check_join_notice */
++ case SEST_JOIN_ACKED:
++ delay = backout_join_acked(sev);
++ break;
++
++ /* backout after send_join_stop and before final
++ * process_reply */
++ case SEST_JSTOP_ACKWAIT:
++ delay = backout_jstop_ackwait(sev);
++ break;
++
++ /* backout after final process_reply and before
++ * check_join_stop */
++ case SEST_JSTOP_ACKED:
++ delay = backout_jstop_acked(sev);
++ break;
++
++ /* backout after send_join_start and before
++ * kcl_start_done */
++ case SEST_JSTART_SERVICEWAIT:
++ delay = backout_jstart_servicewait(sev);
++ break;
++
++ /* backout after kcl_start_done and before
++ * startdone_barrier_new */
++ case SEST_JSTART_SERVICEDONE:
++ delay = backout_jstart_servicedone(sev);
++ break;
++
++ /* backout after startdone_barrier_new and before
++ * callback_startdone_barrier_new */
++ case SEST_BARRIER_WAIT:
++ delay = backout_barrier_wait(sev);
++ break;
++
++ /* backout after callback_startdone_barrier_new and
++ * before check_startdone_barrier_new */
++ case SEST_BARRIER_DONE:
++ delay = backout_barrier_done(sev);
++ break;
++
++ /* backout after kcl_leave_service and before
++ * send_leave_notice */
++ case SEST_LEAVE_BEGIN:
++ delay = backout_leave_begin(sev);
++ break;
++
++ /* backout after send_leave_notice and before final
++ * process_reply */
++ case SEST_LEAVE_ACKWAIT:
++ delay = backout_leave_ackwait(sev);
++ break;
++
++ /* backout after final process_reply and before
++ * check_leave_notice */
++ case SEST_LEAVE_ACKED:
++ delay = backout_leave_acked(sev);
++ break;
++
++ /* backout after send_leave_stop and before final
++ * process_reply */
++ case SEST_LSTOP_ACKWAIT:
++ delay = backout_lstop_ackwait(sev);
++ break;
++
++ /* backout after final process_reply and before
++ * check_leave_stop */
++ case SEST_LSTOP_ACKED:
++ delay = backout_lstop_acked(sev);
++ break;
++
++ /* backout after send_leave_start and before
++ * process_lstart_done */
++ case SEST_LSTART_WAITREMOTE:
++ delay = backout_lstart_waitremote(sev);
++ break;
++
++ /* backout after process_lstart_done and before
++ * process_leave_sevent */
++ case SEST_LSTART_REMOTEDONE:
++ sevent_done(sev);
++ delay = FALSE;
++ break;
++
++ default:
++ log_error(sev->se_sg, "backout_sevents: bad state %d",
++ sev->se_state);
++ }
++
++ if (delay) {
++ set_bit(SEFL_DELAY, &sev->se_flags);
++
++ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
++ sev->se_state = SEST_LEAVE_BEGIN;
++ /* The DELAY flag will be cleared once recovery
++ * is done allowing the leave to be retried. */
++ } else {
++ sev->se_state = SEST_JOIN_BEGIN;
++ /* restart timer function will clear DELAY */
++ schedule_sev_restart(sev);
++ }
++ }
++ }
++}
++
++void process_joinleave(void)
++{
++ sm_sevent_t *sev = NULL, *safe;
++
++ spin_lock(&new_event_lock);
++ if (!list_empty(&new_event)) {
++ sev = list_entry(new_event.next, sm_sevent_t, se_list);
++ list_del(&sev->se_list);
++ list_add_tail(&sev->se_list, &joinleave_events);
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ }
++ spin_unlock(&new_event_lock);
++
++ list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
++ if (!test_and_clear_bit(SEFL_CHECK, &sev->se_flags))
++ continue;
++
++ if (test_bit(SEFL_DELAY, &sev->se_flags))
++ continue;
++
++ if (sev->se_state < SEST_LEAVE_BEGIN)
++ process_join_sevent(sev);
++ else
++ process_leave_sevent(sev);
++ }
++}
+diff -urN linux-orig/cluster/cman/sm_joinleave.h linux-patched/cluster/cman/sm_joinleave.h
+--- linux-orig/cluster/cman/sm_joinleave.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_joinleave.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,23 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_JOINLEAVE_DOT_H__
++#define __SM_JOINLEAVE_DOT_H__
++
++void init_joinleave(void);
++void new_joinleave(sm_sevent_t *sev);
++void process_joinleave(void);
++void backout_sevents(void);
++sm_sevent_t *find_sevent(unsigned int id);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_membership.c linux-patched/cluster/cman/sm_membership.c
+--- linux-orig/cluster/cman/sm_membership.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_membership.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,696 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++extern struct list_head sm_members;
++
++/*
++ * Routines for SG members to handle other nodes joining or leaving the SG.
++ * These "uevent" membership update routines are the response to an "sevent" on
++ * a joining/leaving node.
++ */
++
++static void del_memb_node(sm_group_t *sg, uint32_t nodeid)
++{
++ sm_node_t *node;
++
++ list_for_each_entry(node, &sg->memb, list) {
++ if (node->id != nodeid)
++ continue;
++ list_del(&node->list);
++ kfree(node);
++ sg->memb_count--;
++ log_debug(sg, "del node %u count %d", nodeid, sg->memb_count);
++ break;
++ }
++}
++
++static void add_memb_node(sm_group_t *sg, sm_node_t *node)
++{
++ list_add_tail(&node->list, &sg->memb);
++ sg->memb_count++;
++ log_debug(sg, "add node %u count %d", node->id, sg->memb_count);
++}
++
++/*
++ * Join 1. The receive end of send_join_stop() from a node requesting to join
++ * the SG. We stop the service so it can be restarted with the new node.
++ */
++
++static int process_join_stop(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ sm_node_t *node;
++ sm_msg_t reply;
++ int error;
++
++ if (uev->ue_num_nodes != sg->memb_count + 1) {
++ log_error(sg, "process_join_stop: bad num nodes %u %u",
++ uev->ue_num_nodes, sg->memb_count);
++ return -1;
++ }
++
++ sm_set_event_id(&uev->ue_id);
++
++ node = sm_find_joiner(sg, uev->ue_nodeid);
++ SM_ASSERT(node,);
++
++ sg->state = SGST_UEVENT;
++ sg->ops->stop(sg->service_data);
++
++ memset(&reply, 0, sizeof(reply));
++ reply.ms_type = SMSG_JSTOP_REP;
++ reply.ms_status = STATUS_POS;
++ reply.ms_sevent_id = uev->ue_remote_seid;
++ smsg_bswap_out(&reply);
++
++ error = send_nodeid_message((char *) &reply, sizeof(reply),
++ uev->ue_nodeid);
++ if (error < 0)
++ return error;
++ return 0;
++}
++
++/*
++ * Join 2. The receive end of send_join_start() from a node joining the SG.
++ * We are re-starting the service with the new member added.
++ */
++
++static int process_join_start(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ sm_node_t *node;
++ uint32_t *memb;
++ int count = 0;
++
++ /* this memory is passed to the service which must free it */
++ SM_RETRY(memb =
++ kmalloc((sg->memb_count + 1) * sizeof(uint32_t), GFP_KERNEL),
++ memb);
++
++ /* transfer joining node from joining list to member list */
++ node = sm_find_joiner(sg, uev->ue_nodeid);
++ SM_ASSERT(node, printk("nodeid=%u\n", uev->ue_nodeid););
++ list_del(&node->list);
++ add_memb_node(sg, node);
++
++ /* the new member list for the service */
++ list_for_each_entry(node, &sg->memb, list)
++ memb[count++] = node->id;
++
++ set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
++
++ sg->ops->start(sg->service_data, memb, count, uev->ue_id,
++ SERVICE_NODE_JOIN);
++ return 0;
++}
++
++/*
++ * Join 3. When done starting their local service, every previous SG member
++ * calls startdone_barrier() and the new/joining member calls
++ * startdone_barrier_new(). The barrier returns when everyone has started
++ * their service and joined the barrier.
++ */
++
++static int startdone_barrier(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ char bname[MAX_BARRIER_NAME_LEN];
++ int error;
++
++ memset(bname, 0, MAX_BARRIER_NAME_LEN);
++ uev->ue_barrier_status = -1;
++
++ set_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
++
++ /* If we're the only member, skip the barrier */
++ if (sg->memb_count == 1) {
++ process_startdone_barrier(sg, 0);
++ return 0;
++ }
++
++ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
++ sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
++ sg->memb_count);
++
++ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE);
++
++ return error;
++}
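++
++/*
++ * The barrier name is built from the same four values on every node: the
++ * sg's global id, the joining/leaving node's id and its sevent id (here
++ * ue_nodeid and ue_remote_seid; sm_our_nodeid and se_id on that node
++ * itself), and the member count, so all members compute the same name,
++ * e.g. "sm.42.3.7.4" (values illustrative).
++ */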
++
++/*
++ * Join 4. Check that the "all started" barrier returned a successful status.
++ * The newly joined member calls check_startdone_barrier_new().
++ */
++
++static int check_startdone_barrier(sm_group_t *sg)
++{
++ int error = sg->uevent.ue_barrier_status;
++ return error;
++}
++
++/*
++ * Join 5. Send the service a "finish" indicating that all members have
++ * successfully started. The newly joined member calls do_finish_new().
++ */
++
++static void do_finish(sm_group_t *sg)
++{
++ sg->state = SGST_RUN;
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ sg->ops->finish(sg->service_data, sg->uevent.ue_id);
++}
++
++/*
++ * Join 6. The uevent is done. If this was a uevent for a node leaving the
++ * SG, then send a final message to the departed node signalling that the
++ * remaining nodes have restarted since it left.
++ */
++
++static void uevent_done(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ sm_msg_t reply;
++
++ if (test_bit(UEFL_LEAVE, &uev->ue_flags)) {
++ memset(&reply, 0, sizeof(reply));
++ reply.ms_type = SMSG_LSTART_DONE;
++ reply.ms_status = STATUS_POS;
++ reply.ms_sevent_id = uev->ue_remote_seid;
++ smsg_bswap_out(&reply);
++ send_nodeid_message((char *) &reply, sizeof(reply),
++ uev->ue_nodeid);
++ }
++ memset(&sg->uevent, 0, sizeof(sm_uevent_t));
++}
++
++/*
++ * Leave 1. The receive end of send_leave_stop() from a node leaving the SG.
++ */
++
++static int process_leave_stop(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ sm_msg_t reply;
++ int error;
++
++ sm_set_event_id(&uev->ue_id);
++
++ sg->state = SGST_UEVENT;
++ sg->ops->stop(sg->service_data);
++
++ memset(&reply, 0, sizeof(reply));
++ reply.ms_type = SMSG_LSTOP_REP;
++ reply.ms_status = STATUS_POS;
++ reply.ms_sevent_id = uev->ue_remote_seid;
++ smsg_bswap_out(&reply);
++
++ error = send_nodeid_message((char *) &reply, sizeof(reply),
++ uev->ue_nodeid);
++ if (error < 0)
++ return error;
++ return 0;
++}
++
++/*
++ * Leave 2. The receive end of send_leave_start() from a node leaving the SG.
++ * We are re-starting the service (naturally, without the node that left).
++ */
++
++static int process_leave_start(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ sm_node_t *node;
++ uint32_t *memb;
++ int count = 0;
++
++ SM_ASSERT(sg->memb_count > 1,
++ printk("memb_count=%u\n", sg->memb_count););
++
++ /* this memory is passed to the service which must free it */
++ SM_RETRY(memb =
++ kmalloc((sg->memb_count - 1) * sizeof(uint32_t), GFP_KERNEL),
++ memb);
++
++ /* remove departed member from sg member list */
++ del_memb_node(sg, uev->ue_nodeid);
++
++ /* build member list to pass to service */
++ list_for_each_entry(node, &sg->memb, list)
++ memb[count++] = node->id;
++
++ /* allow us to accept the start_done callback for this start */
++ set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
++
++ sg->ops->start(sg->service_data, memb, count, uev->ue_id,
++ SERVICE_NODE_LEAVE);
++ return 0;
++}
++
++/*
++ * Move through the steps of another node joining or leaving the SG.
++ */
++
++static void process_one_uevent(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ int error = 0;
++
++ log_debug(sg, "uevent state %u node %u", uev->ue_state, uev->ue_nodeid);
++
++ switch (uev->ue_state) {
++
++ /*
++ * a uevent is initialized with state JSTOP in
++ * process_stop_request
++ */
++
++ case UEST_JSTOP:
++ uev->ue_state = UEST_JSTART_WAITCMD;
++ error = process_join_stop(sg);
++ break;
++
++ /*
++ * ue_state is changed from JSTART_WAITCMD to JSTART in
++ * process_start_request
++ */
++
++ case UEST_JSTART:
++ uev->ue_state = UEST_JSTART_SERVICEWAIT;
++ error = process_join_start(sg);
++ break;
++
++ /*
++ * ue_state is changed from JSTART_SERVICEWAIT to
++ * JSTART_SERVICEDONE in kcl_start_done
++ */
++
++ case UEST_JSTART_SERVICEDONE:
++ uev->ue_state = UEST_BARRIER_WAIT;
++ error = startdone_barrier(sg);
++ break;
++
++ /*
++ * ue_state is changed from BARRIER_WAIT to BARRIER_DONE in
++ * process_startdone_barrier
++ */
++
++ case UEST_BARRIER_DONE:
++ error = check_startdone_barrier(sg);
++ if (error)
++ break;
++
++ do_finish(sg);
++ uevent_done(sg);
++ break;
++
++ /*
++ * a uevent is initialized with state LSTOP in
++ * process_stop_request
++ */
++
++ case UEST_LSTOP:
++ uev->ue_state = UEST_LSTART_WAITCMD;
++ error = process_leave_stop(sg);
++ break;
++
++ /*
++ * a uevent is changed from LSTART_WAITCMD to LSTART in
++ * process_start_request
++ */
++
++ case UEST_LSTART:
++ uev->ue_state = UEST_LSTART_SERVICEWAIT;
++ error = process_leave_start(sg);
++ break;
++
++ /*
++ * a uevent is changed from LSTART_SERVICEWAIT to
++ * LSTART_SERVICEDONE in kcl_start_done
++ */
++
++ case UEST_LSTART_SERVICEDONE:
++ uev->ue_state = UEST_BARRIER_WAIT;
++ error = startdone_barrier(sg);
++ break;
++
++ default:
++ error = -1;
++ }
++
++ /* If we encounter an error during these routines, we do nothing,
++ expecting that a node failure related to this sg will cause a
++ recovery event to arrive and call cancel_one_uevent(). */
++
++ if (error)
++ log_error(sg, "process_one_uevent error %d state %u",
++ error, uev->ue_state);
++}
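++
++/*
++ * Summary of the uevent states for a join, as driven above together with
++ * process_stop_request() and process_start_request() in sm_message.c:
++ *
++ *   JSTOP -> JSTART_WAITCMD -> JSTART -> JSTART_SERVICEWAIT
++ *         -> JSTART_SERVICEDONE -> BARRIER_WAIT -> BARRIER_DONE
++ *
++ * The leave path is identical with LSTOP/LSTART in place of JSTOP/JSTART.
++ */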
++
++static sm_node_t *failed_memb(sm_group_t *sg, int *count)
++{
++ sm_node_t *node, *sm_node, *failed_uev_node = NULL;
++
++ list_for_each_entry(node, &sg->memb, list) {
++
++ sm_node = sm_find_member(node->id);
++ SM_ASSERT(sm_node, );
++
++ if (test_bit(SNFL_NEED_RECOVERY, &sm_node->flags)) {
++ (*count)++;
++ if (node->id == sg->uevent.ue_nodeid)
++ failed_uev_node = sm_node;
++ }
++ }
++ return failed_uev_node;
++}
++
++static void send_recover_msg(sm_group_t *sg)
++{
++ char *msg;
++ int len = 0;
++ msg = create_smsg(sg, SMSG_RECOVER, 0, &len, NULL);
++ send_members_message(sg, msg, len);
++}
++
++static void cancel_barrier(sm_group_t *sg)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ char bname[MAX_BARRIER_NAME_LEN];
++
++ clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
++
++ memset(bname, 0, MAX_BARRIER_NAME_LEN);
++ snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
++ sg->global_id, uev->ue_nodeid, uev->ue_remote_seid,
++ sg->memb_count);
++ kcl_barrier_cancel(bname);
++}
++
++static void cancel_one_uevent(sm_group_t *sg, int *effected)
++{
++ sm_uevent_t *uev = &sg->uevent;
++ int failed_count;
++ sm_node_t *node, *failed_joiner, *failed_leaver;
++
++ log_debug(sg, "cancel uevent state %u node %u", uev->ue_state,
++ uev->ue_nodeid);
++
++ switch (uev->ue_state) {
++
++ case UEST_JSTOP:
++ case UEST_JSTART_WAITCMD:
++ case UEST_JSTART:
++
++ sg->ops->stop(sg->service_data);
++
++ failed_count = 0;
++ failed_joiner = failed_memb(sg, &failed_count);
++ SM_ASSERT(!failed_joiner, );
++
++ node = sm_find_member(uev->ue_nodeid);
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ failed_joiner = node;
++
++ if (!failed_count) {
++ /* only joining node failed */
++ SM_ASSERT(failed_joiner, );
++ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
++ (*effected)++;
++ /* some nodes may not have gotten a JSTOP message
++ in which case this will tell them to begin
++ recovery for this sg. */
++ send_recover_msg(sg);
++
++ } else {
++ /* a member node failed (and possibly joining node, it
++ doesn't matter) */
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++
++ case UEST_JSTART_SERVICEWAIT:
++ case UEST_JSTART_SERVICEDONE:
++
++ clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
++ sg->ops->stop(sg->service_data);
++
++ failed_count = 0;
++ failed_joiner = failed_memb(sg, &failed_count);
++ SM_ASSERT(failed_count, );
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++
++ if (failed_count == 1 && failed_joiner) {
++ /* only joining node failed */
++
++ } else if (failed_count && failed_joiner) {
++ /* joining node and another member failed */
++
++ } else {
++ /* other member failed, joining node still alive */
++ SM_ASSERT(!failed_joiner, );
++ del_memb_node(sg, uev->ue_nodeid);
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++
++ case UEST_LSTOP:
++ case UEST_LSTART_WAITCMD:
++ case UEST_LSTART:
++
++ sg->ops->stop(sg->service_data);
++
++ failed_count = 0;
++ failed_leaver = failed_memb(sg, &failed_count);
++ SM_ASSERT(failed_count, );
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++
++ if (failed_count == 1 && failed_leaver) {
++ /* only leaving node failed */
++
++ } else if (failed_count && failed_leaver) {
++ /* leaving node and another member failed */
++
++ } else {
++ /* other member failed, leaving node still alive */
++ SM_ASSERT(!failed_leaver, );
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++
++ case UEST_LSTART_SERVICEWAIT:
++ case UEST_LSTART_SERVICEDONE:
++
++ clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
++ sg->ops->stop(sg->service_data);
++
++ failed_count = 0;
++ failed_leaver = failed_memb(sg, &failed_count);
++ SM_ASSERT(!failed_leaver, );
++
++ node = sm_find_member(uev->ue_nodeid);
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ failed_leaver = node;
++
++ if (!failed_count) {
++ /* only leaving node failed */
++ SM_ASSERT(failed_leaver, );
++ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
++ (*effected)++;
++
++ } else if (failed_count && failed_leaver) {
++ /* leaving node and another member failed */
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++
++ } else {
++ /* other member failed, leaving node still alive */
++ SM_ASSERT(failed_count, );
++ SM_ASSERT(!failed_leaver, );
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ node = sm_new_node(sg->uevent.ue_nodeid);
++ add_memb_node(sg, node);
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++
++ case UEST_BARRIER_WAIT:
++
++ if (test_bit(UEFL_LEAVE, &uev->ue_flags))
++ goto barrier_wait_leave;
++
++ sg->ops->stop(sg->service_data);
++ cancel_barrier(sg);
++
++ barrier_wait_join:
++
++ failed_count = 0;
++ failed_joiner = failed_memb(sg, &failed_count);
++ SM_ASSERT(failed_count, );
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++
++ if (failed_count == 1 && failed_joiner) {
++ /* only joining node failed */
++
++ } else if (failed_count && failed_joiner) {
++ /* joining node and another member failed */
++
++ } else {
++ /* other member failed, joining node still alive */
++ SM_ASSERT(!failed_joiner, );
++ del_memb_node(sg, uev->ue_nodeid);
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++ barrier_wait_leave:
++
++ failed_count = 0;
++ failed_leaver = failed_memb(sg, &failed_count);
++ SM_ASSERT(!failed_leaver, );
++
++ node = sm_find_member(uev->ue_nodeid);
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ failed_leaver = node;
++
++ if (!failed_count) {
++ /* only leaving node failed */
++ SM_ASSERT(failed_leaver, );
++ SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ set_bit(SGFL_NEED_RECOVERY, &sg->flags);
++ (*effected)++;
++
++ } else if (failed_count && failed_leaver) {
++ /* leaving node and another member failed */
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++
++ } else {
++ /* other member failed, leaving node still alive */
++ SM_ASSERT(failed_count, );
++ SM_ASSERT(!failed_leaver, );
++ SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
++ node = sm_new_node(sg->uevent.ue_nodeid);
++ add_memb_node(sg, node);
++ }
++
++ clear_bit(SGFL_UEVENT, &sg->flags);
++ memset(uev, 0, sizeof(sm_uevent_t));
++ break;
++
++
++ case UEST_BARRIER_DONE:
++
++ if (!uev->ue_barrier_status) {
++ do_finish(sg);
++ uevent_done(sg);
++ break;
++ }
++
++ if (test_bit(UEFL_LEAVE, &uev->ue_flags))
++ goto barrier_wait_leave;
++ else
++ goto barrier_wait_join;
++
++
++ default:
++ log_error(sg, "cancel_one_uevent: state %d", uev->ue_state);
++ }
++}
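++
++/*
++ * Each case above separates the same three situations: only the
++ * joining/leaving node died, that node and an existing member died, or
++ * only an existing member died. When just the joining/leaving node died,
++ * the sg has no failed member of its own and was never flagged for
++ * recovery, so it is flagged here and *effected is bumped so the caller
++ * schedules recovery for it.
++ */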
++
++void cancel_uevents(int *effected)
++{
++ sm_group_t *sg;
++ sm_node_t *node, *sgnode;
++ int i;
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ continue;
++
++ /*
++ * Clear this dead node from the "interested in joining" list
++ * of any SG. The node is added to this list before the uevent
++ * begins.
++ */
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &sm_sg[i], list) {
++ sgnode = sm_find_joiner(sg, node->id);
++ if (sgnode) {
++ log_debug(sg, "clear joining node %u",
++ sgnode->id);
++ list_del(&sgnode->list);
++ kfree(sgnode);
++ }
++ }
++ }
++ }
++
++ /* Adjust any uevents in sgs affected by the failed node(s) */
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &sm_sg[i], list) {
++ if (!test_bit(SGFL_UEVENT, &sg->flags))
++ continue;
++
++ /* We may have some cancelling to do if this sg is
++ flagged as having a failed member, or if a joining
++ or leaving node has died. */
++
++ if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ cancel_one_uevent(sg, effected);
++ else if (sg->uevent.ue_nodeid) {
++ node = sm_find_member(sg->uevent.ue_nodeid);
++ SM_ASSERT(node, );
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ cancel_one_uevent(sg, effected);
++ }
++ }
++ }
++}
++
++void process_membership(void)
++{
++ sm_group_t *sg;
++ int i;
++
++ down(&sm_sglock);
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &sm_sg[i], list) {
++ if (!test_bit(SGFL_UEVENT, &sg->flags))
++ continue;
++
++ if (!test_and_clear_bit(UEFL_CHECK,
++ &sg->uevent.ue_flags))
++ continue;
++
++ process_one_uevent(sg);
++ }
++ }
++ up(&sm_sglock);
++}
+diff -urN linux-orig/cluster/cman/sm_membership.h linux-patched/cluster/cman/sm_membership.h
+--- linux-orig/cluster/cman/sm_membership.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_membership.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_MEMBERSHIP_DOT_H__
++#define __SM_MEMBERSHIP_DOT_H__
++
++void process_membership(void);
++void cancel_uevents(int *effected);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_message.c linux-patched/cluster/cman/sm_message.c
+--- linux-orig/cluster/cman/sm_message.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_message.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,867 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++#define SMSG_BUF_SIZE (sizeof(sm_msg_t) + MAX_SERVICE_NAME_LEN + 1)
++
++extern struct socket * sm_socket;
++extern uint32_t sm_our_nodeid;
++static uint32_t global_last_id;
++static struct list_head messages;
++static spinlock_t message_lock;
++static char smsg_buf[SMSG_BUF_SIZE];
++
++int send_nodeid_message(char *msg, int len, uint32_t nodeid);
++
++struct rq_entry {
++ struct list_head list;
++ char *msg;
++ int len;
++ uint32_t nodeid;
++};
++typedef struct rq_entry rq_entry_t;
++
++void init_messages(void)
++{
++ global_last_id = 1;
++ INIT_LIST_HEAD(&messages);
++ spin_lock_init(&message_lock);
++}
++
++uint32_t sm_new_global_id(int level)
++{
++ uint32_t id = global_last_id++;
++ uint8_t l = (uint8_t) level;
++
++ if (level > 255)
++ return 0;
++
++ if (id > 0x00FFFFFF)
++ return 0;
++
++ id |= (l << 24);
++ return id;
++}
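++
++/*
++ * Illustrative example: with global_last_id at 5, a level 2 SG gets
++ * id 5 | (2 << 24) = 0x02000005. The level rides in the top byte and
++ * sm_id_to_level() in sm_misc.c recovers it by shifting back down.
++ */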
++
++static void smsg_copy_in(char *msg, sm_msg_t *smsg)
++{
++ sm_msg_t *in = (sm_msg_t *) msg;
++
++ smsg->ms_type = in->ms_type;
++ smsg->ms_status = in->ms_status;
++ smsg->ms_sevent_id = le16_to_cpu(in->ms_sevent_id);
++ smsg->ms_global_sgid = le32_to_cpu(in->ms_global_sgid);
++ smsg->ms_global_lastid = le32_to_cpu(in->ms_global_lastid);
++ smsg->ms_sglevel = le16_to_cpu(in->ms_sglevel);
++ smsg->ms_length = le16_to_cpu(in->ms_length);
++}
++
++/* swapping bytes in place is an easy source of errors - be careful not to
++ * access the fields after calling this */
++
++void smsg_bswap_out(sm_msg_t *smsg)
++{
++ smsg->ms_sevent_id = cpu_to_le16(smsg->ms_sevent_id);
++ smsg->ms_global_sgid = cpu_to_le32(smsg->ms_global_sgid);
++ smsg->ms_global_lastid = cpu_to_le32(smsg->ms_global_lastid);
++ smsg->ms_sglevel = cpu_to_le16(smsg->ms_sglevel);
++ smsg->ms_length = cpu_to_le16(smsg->ms_length);
++}
++
++char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
++ sm_sevent_t *sev)
++{
++ char *msg;
++ sm_msg_t *smsg;
++ int fulllen = sizeof(sm_msg_t) + datalen;
++
++ msg = smsg_buf;
++ memset(smsg_buf, 0, SMSG_BUF_SIZE);
++ SM_ASSERT(fulllen <= SMSG_BUF_SIZE,);
++
++ smsg = (sm_msg_t *) msg;
++ smsg->ms_type = type;
++ smsg->ms_global_sgid = sg->global_id;
++ smsg->ms_sglevel = sg->level;
++ smsg->ms_length = datalen;
++ smsg->ms_sevent_id = sev ? sev->se_id : 0;
++
++ smsg_bswap_out(smsg);
++ *msglen = fulllen;
++ return msg;
++}
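++
++/*
++ * Note that create_smsg() returns the shared static smsg_buf, so each
++ * message must be sent before the next one is built. This is only safe
++ * because message building and sending happen from the single sm thread
++ * (see the "always called from sm context" notes below).
++ */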
++
++static unsigned int msgtype_to_flag(int type)
++{
++ unsigned int flag;
++
++ switch (type) {
++ case SMSG_JOIN_REP:
++ case SMSG_JOIN_REQ:
++ flag = SEFL_ALLOW_JOIN;
++ break;
++
++ case SMSG_JSTOP_REP:
++ case SMSG_JSTOP_REQ:
++ flag = SEFL_ALLOW_JSTOP;
++ break;
++
++ case SMSG_LEAVE_REP:
++ case SMSG_LEAVE_REQ:
++ flag = SEFL_ALLOW_LEAVE;
++ break;
++
++ case SMSG_LSTOP_REP:
++ case SMSG_LSTOP_REQ:
++ flag = SEFL_ALLOW_LSTOP;
++ break;
++
++ default:
++ SM_ASSERT(0, printk("msgtype_to_flag bad type %d\n", type););
++ }
++ return flag;
++}
++
++static int test_allowed_msgtype(sm_sevent_t * sev, int type)
++{
++ unsigned int flag = msgtype_to_flag(type);
++
++ return test_bit(flag, &sev->se_flags);
++}
++
++static void clear_allowed_msgtype(sm_sevent_t * sev, int type)
++{
++ unsigned int flag = msgtype_to_flag(type);
++
++ clear_bit(flag, &sev->se_flags);
++}
++
++static void set_allowed_msgtype(sm_sevent_t * sev, int type)
++{
++ unsigned int flag = msgtype_to_flag(type);
++
++ set_bit(flag, &sev->se_flags);
++}
++
++static int save_global_id(sm_sevent_t * sev, sm_msg_t * smsg)
++{
++ sm_group_t *sg = sev->se_sg;
++
++ if (!smsg->ms_global_sgid) {
++ log_error(sg, "save_global_id: zero sg id");
++ return -1;
++ }
++
++ if (!sg->global_id)
++ sg->global_id = smsg->ms_global_sgid;
++
++ if (sg->global_id != smsg->ms_global_sgid) {
++ log_error(sg, "save_global_id: id %x", smsg->ms_global_sgid);
++ return -1;
++ }
++ return 0;
++}
++
++static void save_lastid(sm_msg_t * smsg)
++{
++ uint32_t gid = smsg->ms_global_lastid & 0x00FFFFFF;
++
++ /*
++ * Keep track of the highest SG id which has been used
++ * in the cluster in case we need to choose a new SG id.
++ */
++
++ if (gid > global_last_id)
++ global_last_id = gid;
++}
++
++static int next_sev_state(int msg_type, int cur_state)
++{
++ int next = 0;
++
++ switch (msg_type) {
++ case SMSG_JOIN_REP:
++ SM_ASSERT(cur_state == SEST_JOIN_ACKWAIT,);
++ next = SEST_JOIN_ACKED;
++ break;
++
++ case SMSG_JSTOP_REP:
++ SM_ASSERT(cur_state == SEST_JSTOP_ACKWAIT,);
++ next = SEST_JSTOP_ACKED;
++ break;
++
++ case SMSG_LEAVE_REP:
++ SM_ASSERT(cur_state == SEST_LEAVE_ACKWAIT,);
++ next = SEST_LEAVE_ACKED;
++ break;
++
++ case SMSG_LSTOP_REP:
++ SM_ASSERT(cur_state == SEST_LSTOP_ACKWAIT,);
++ next = SEST_LSTOP_ACKED;
++ break;
++ }
++ return next;
++}
++
++/*
++ * Functions in sevent.c send messages to other nodes and then expect replies.
++ * This function collects the replies for the sevent messages and moves the
++ * sevent to the next stage when all the expected replies have been received.
++ */
++
++static void process_reply(sm_msg_t * smsg, uint32_t nodeid)
++{
++ sm_sevent_t *sev;
++ int i, expected, type = smsg->ms_type;
++
++ /*
++ * Find the relevant sevent.
++ */
++
++ sev = find_sevent(smsg->ms_sevent_id);
++ if (!sev) {
++ log_print("process_reply invalid id=%u nodeid=%u",
++ smsg->ms_sevent_id, nodeid);
++ goto out;
++ }
++
++ /*
++ * Check if this message type is what this sevent is waiting for.
++ */
++
++ if (!test_allowed_msgtype(sev, type)) {
++ log_debug(sev->se_sg, "process_reply ignored type=%u nodeid=%u "
++ "id=%u", type, nodeid, sev->se_id);
++ goto out;
++ }
++
++ expected =
++ (type == SMSG_JOIN_REP) ? sev->se_node_count : sev->se_memb_count;
++
++ SM_ASSERT(expected * sizeof(uint32_t) <= sev->se_len_ids,
++ printk("type=%d expected=%d len_ids=%d node_count=%d "
++ "memb_count=%d\n", type, expected, sev->se_len_ids,
++ sev->se_node_count, sev->se_memb_count););
++
++ SM_ASSERT(expected * sizeof(char) <= sev->se_len_status,
++ printk("type=%d expected=%d len_status=%d node_count=%d "
++ "memb_count=%d\n", type, expected, sev->se_len_status,
++ sev->se_node_count, sev->se_memb_count););
++
++ for (i = 0; i < expected; i++) {
++ if (sev->se_node_ids[i] == nodeid) {
++ /*
++ * Save the status from the replying node
++ */
++
++ if (!sev->se_node_status[i])
++ sev->se_node_status[i] = smsg->ms_status;
++ else {
++ log_error(sev->se_sg, "process_reply duplicate "
++ "id=%u nodeid=%u %u/%u",
++ sev->se_id, nodeid,
++ sev->se_node_status[i],
++ smsg->ms_status);
++ goto out;
++ }
++
++ if (type == SMSG_JOIN_REP) {
++ save_lastid(smsg);
++
++ if (smsg->ms_status == STATUS_POS)
++ save_global_id(sev, smsg);
++ }
++
++ /*
++ * Signal sm if we have all replies
++ */
++
++ if (++sev->se_reply_count == expected) {
++ clear_allowed_msgtype(sev, type);
++ sev->se_state = next_sev_state(type,
++ sev->se_state);
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ }
++
++ break;
++ }
++ }
++
++ out:
++ return;
++}
++
++/*
++ * A node wants to join an SG and has run send_join_notice. If we know nothing
++ * about the SG, then we're not a member and have no objection - send back
++ * STATUS_NEG. If we're a member of the SG, then send back STATUS_POS (go
++ * ahead and join) if there's
++ * no sevent or uevent of higher priority in progress (only a single join or
++ * leave is permitted for the SG at once). If there happens to be a higher
++ * priority sevent/uevent in progress, send back STATUS_WAIT to defer the
++ * requested join for a bit.
++ */
++
++static void process_join_request(sm_msg_t *smsg, uint32_t nodeid, char *name)
++{
++ sm_group_t *sg = NULL;
++ sm_sevent_t *sev = NULL;
++ sm_node_t *node;
++ int found = FALSE;
++ int level = smsg->ms_sglevel;
++ sm_msg_t reply;
++
++ memset(&reply, 0, sizeof(reply));
++
++ down(&sm_sglock);
++
++ if (nodeid == sm_our_nodeid)
++ goto next;
++
++ /*
++ * search SG list for an SG with given name/len
++ */
++
++ list_for_each_entry(sg, &sm_sg[level], list) {
++ if ((sg->namelen != smsg->ms_length) ||
++ memcmp(sg->name, name, sg->namelen))
++ continue;
++ found = TRUE;
++ break;
++ }
++
++ /*
++ * build reply message
++ */
++
++ next:
++
++ if (!found) {
++ reply.ms_type = SMSG_JOIN_REP;
++ reply.ms_status = STATUS_NEG;
++ reply.ms_global_lastid = global_last_id;
++ reply.ms_sevent_id = smsg->ms_sevent_id;
++ } else {
++ reply.ms_type = SMSG_JOIN_REP;
++ reply.ms_status = STATUS_POS;
++ reply.ms_sevent_id = smsg->ms_sevent_id;
++ reply.ms_global_sgid = sg->global_id;
++ reply.ms_global_lastid = global_last_id;
++
++ /*
++ * The node trying to join should wait and try again until
++ * we're done with recovery.
++ */
++
++ if (sg->state == SGST_RECOVER) {
++ reply.ms_status = STATUS_WAIT;
++ goto send;
++ }
++
++ /*
++ * An sevent node trying to join may have gotten as far as
++ * creating a uevent with us and then backed out. That node
++ * will retry joining from the beginning, so we should not
++ * turn it away. If we're handling a uevent for another node,
++ * tell the joining node to wait.
++ */
++
++ if (test_bit(SGFL_UEVENT, &sg->flags)) {
++ if (sg->uevent.ue_nodeid != nodeid)
++ reply.ms_status = STATUS_WAIT;
++ goto send;
++ }
++
++ /*
++ * We're trying to join or leave the SG at the moment.
++ */
++
++ if (test_bit(SGFL_SEVENT, &sg->flags)) {
++ sev = sg->sevent;
++
++ /*
++ * We're trying to leave. Make the join wait until
++ * we've left if we're beyond LEAVE_ACKWAIT.
++ */
++
++ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
++ if (sev->se_state > SEST_LEAVE_ACKED)
++ reply.ms_status = STATUS_WAIT;
++ else {
++ reply.ms_status = STATUS_POS;
++ clear_bit(SEFL_ALLOW_LEAVE,
++ &sev->se_flags);
++ set_bit(SEFL_CANCEL, &sev->se_flags);
++ }
++ }
++
++ /*
++ * We're trying to join. Make the other join wait
++ * until we're joined if we're beyond JOIN_ACKWAIT or
++ * if we have a lower id. (Send NEG to allow the other
++ * node to go ahead because we're not in the SG.)
++ */
++
++ else {
++ if (sev->se_state > SEST_JOIN_ACKED)
++ reply.ms_status = STATUS_WAIT;
++ else if (sm_our_nodeid < nodeid)
++ reply.ms_status = STATUS_WAIT;
++ else {
++ reply.ms_status = STATUS_NEG;
++ clear_bit(SEFL_ALLOW_JOIN,
++ &sev->se_flags);
++ set_bit(SEFL_CANCEL, &sev->se_flags);
++ }
++ }
++
++ if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ }
++ goto send;
++ }
++
++ /* no recovery, uevent or sevent in progress; stick with STATUS_POS */
++ }
++
++ send:
++
++ if (reply.ms_status == STATUS_POS) {
++ node = sm_find_joiner(sg, nodeid);
++ if (!node) {
++ node = sm_new_node(nodeid);
++ list_add_tail(&node->list, &sg->joining);
++ }
++ }
++
++ up(&sm_sglock);
++ smsg_bswap_out(&reply);
++ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
++}
++
++/*
++ * Another node wants us to stop a service so it can join or leave the SG. We
++ * do this by saving the request info in a uevent and having the sm thread do
++ * the processing and then replying.
++ */
++
++static void process_stop_request(sm_msg_t * smsg, uint32_t nodeid,
++ uint32_t * msgbuf)
++{
++ sm_group_t *sg;
++ sm_uevent_t *uev;
++ sm_msg_t reply;
++ int type = smsg->ms_type;
++
++ if (nodeid == sm_our_nodeid)
++ goto agree;
++
++ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
++ if (!sg) {
++ log_print("process_stop_request: unknown sg id %x",
++ smsg->ms_global_sgid);
++ return;
++ }
++
++ /*
++ * We shouldn't get here with uevent already set.
++ */
++
++ if (test_and_set_bit(SGFL_UEVENT, &sg->flags)) {
++ log_error(sg, "process_stop_request: uevent already set");
++ return;
++ }
++
++ uev = &sg->uevent;
++ uev->ue_nodeid = nodeid;
++ uev->ue_remote_seid = smsg->ms_sevent_id;
++ uev->ue_state = (type == SMSG_JSTOP_REQ) ? UEST_JSTOP : UEST_LSTOP;
++
++ if (type == SMSG_JSTOP_REQ)
++ uev->ue_num_nodes = be32_to_cpu(*msgbuf);
++ else
++ set_bit(UEFL_LEAVE, &uev->ue_flags);
++
++ /*
++ * Do process_join_stop() or process_leave_stop().
++ */
++
++ set_bit(UEFL_CHECK, &uev->ue_flags);
++ wake_serviced(DO_MEMBERSHIP);
++ return;
++
++ agree:
++ memset(&reply, 0, sizeof(reply));
++ reply.ms_status = STATUS_POS;
++ reply.ms_type =
++ (type == SMSG_JSTOP_REQ) ? SMSG_JSTOP_REP : SMSG_LSTOP_REP;
++ reply.ms_sevent_id = smsg->ms_sevent_id;
++ smsg_bswap_out(&reply);
++ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
++}
++
++static void process_start_request(sm_msg_t * smsg, uint32_t nodeid)
++{
++ sm_group_t *sg;
++ sm_uevent_t *uev;
++ int type = smsg->ms_type;
++
++ if (nodeid == sm_our_nodeid)
++ return;
++
++ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
++ if (!sg) {
++ log_print("process_start_request: unknown sg id %x",
++ smsg->ms_global_sgid);
++ return;
++ }
++
++ if (!test_bit(SGFL_UEVENT, &sg->flags)) {
++ log_error(sg, "process_start_request: no uevent");
++ return;
++ }
++
++ uev = &sg->uevent;
++
++ if (type == SMSG_JSTART_CMD)
++ uev->ue_state = UEST_JSTART;
++ else
++ uev->ue_state = UEST_LSTART;
++
++ set_bit(UEFL_CHECK, &uev->ue_flags);
++ wake_serviced(DO_MEMBERSHIP);
++}
++
++static void process_leave_request(sm_msg_t * smsg, uint32_t nodeid)
++{
++ sm_group_t *sg;
++ sm_node_t *node;
++ sm_msg_t reply;
++ sm_sevent_t *sev;
++ int found = FALSE;
++
++ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
++ if (sg) {
++ if (nodeid == sm_our_nodeid)
++ found = TRUE;
++ else {
++ list_for_each_entry(node, &sg->memb, list) {
++ if (node->id != nodeid)
++ continue;
++ set_bit(SNFL_LEAVING, &node->flags);
++ found = TRUE;
++ break;
++ }
++ }
++ }
++
++ memset(&reply, 0, sizeof(reply));
++
++ if (!found) {
++ reply.ms_type = SMSG_LEAVE_REP;
++ reply.ms_status = STATUS_NEG;
++ reply.ms_sevent_id = smsg->ms_sevent_id;
++ } else {
++ reply.ms_type = SMSG_LEAVE_REP;
++ reply.ms_status = STATUS_POS;
++ reply.ms_sevent_id = smsg->ms_sevent_id;
++
++ if (sg->state == SGST_RECOVER)
++ reply.ms_status = STATUS_WAIT;
++
++ else if (test_bit(SGFL_SEVENT, &sg->flags) &&
++ nodeid != sm_our_nodeid) {
++ sev = sg->sevent;
++
++ /*
++ * We're trying to join or leave at the moment. If
++ * we're past JOIN/LEAVE_ACKWAIT, we make the requestor
++ * wait. Otherwise, if joining we'll cancel to let the
++ * leave happen first, or if we're leaving allow the
++ * lower nodeid to leave first.
++ */
++
++ if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
++ if (sev->se_state > SEST_LEAVE_ACKWAIT)
++ reply.ms_status = STATUS_WAIT;
++ else if (sm_our_nodeid < nodeid)
++ reply.ms_status = STATUS_WAIT;
++ else {
++ reply.ms_status = STATUS_POS;
++ clear_bit(SEFL_ALLOW_LEAVE,
++ &sev->se_flags);
++ set_bit(SEFL_CANCEL, &sev->se_flags);
++ }
++ } else {
++ if (sev->se_state > SEST_JOIN_ACKWAIT)
++ reply.ms_status = STATUS_WAIT;
++ else {
++ reply.ms_status = STATUS_NEG;
++ clear_bit(SEFL_ALLOW_JOIN,
++ &sev->se_flags);
++ set_bit(SEFL_CANCEL, &sev->se_flags);
++ }
++ }
++
++ if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ }
++ }
++
++ else if (test_bit(SGFL_UEVENT, &sg->flags)) {
++ if (sg->uevent.ue_nodeid != nodeid)
++ reply.ms_status = STATUS_WAIT;
++ }
++
++ }
++
++ smsg_bswap_out(&reply);
++ send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
++}
++
++/*
++ * Each remaining node will send us a done message. We quit when we get the
++ * first. The subsequent done messages for the finished sevent get here and
++ * are ignored.
++ */
++
++static void process_lstart_done(sm_msg_t *smsg, uint32_t nodeid)
++{
++ sm_sevent_t *sev;
++
++ sev = find_sevent(smsg->ms_sevent_id);
++ if (!sev)
++ return;
++
++ if (sev->se_state != SEST_LSTART_WAITREMOTE)
++ return;
++
++ sev->se_state = SEST_LSTART_REMOTEDONE;
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++}
++
++/*
++ * This function and everything it calls always runs in sm context.
++ */
++
++static void process_message(char *msg, uint32_t nodeid)
++{
++ sm_msg_t smsg;
++
++ smsg_copy_in(msg, &smsg);
++
++ switch (smsg.ms_type) {
++ case SMSG_JOIN_REQ:
++ process_join_request(&smsg, nodeid, msg + sizeof(sm_msg_t));
++ break;
++
++ case SMSG_JSTOP_REQ:
++ process_stop_request(&smsg, nodeid,
++ (uint32_t *) (msg + sizeof(sm_msg_t)));
++ break;
++
++ case SMSG_LEAVE_REQ:
++ process_leave_request(&smsg, nodeid);
++ break;
++
++ case SMSG_LSTOP_REQ:
++ process_stop_request(&smsg, nodeid, NULL);
++ break;
++
++ case SMSG_JSTART_CMD:
++ case SMSG_LSTART_CMD:
++ process_start_request(&smsg, nodeid);
++ break;
++
++ case SMSG_LSTART_DONE:
++ process_lstart_done(&smsg, nodeid);
++ break;
++
++ case SMSG_JOIN_REP:
++ case SMSG_JSTOP_REP:
++ case SMSG_LEAVE_REP:
++ case SMSG_LSTOP_REP:
++ process_reply(&smsg, nodeid);
++ break;
++
++ case SMSG_RECOVER:
++ process_recover_msg(&smsg, nodeid);
++ break;
++
++ default:
++ log_print("process_message: unknown type %u nodeid %u",
++ smsg.ms_type, nodeid);
++ }
++}
++
++/*
++ * Always called from sm context.
++ */
++
++void process_messages(void)
++{
++ rq_entry_t *re;
++
++ while (1) {
++ re = NULL;
++
++ spin_lock(&message_lock);
++ if (!list_empty(&messages)) {
++ re = list_entry(messages.next, rq_entry_t, list);
++ list_del(&re->list);
++ }
++ spin_unlock(&message_lock);
++
++ if (!re)
++ break;
++ process_message(re->msg, re->nodeid);
++ kfree(re->msg);
++ kfree(re);
++ schedule();
++ }
++}
++
++/*
++ * Context: cnxman and sm
++ */
++
++static int add_to_recvqueue(char *msg, int len, uint32_t nodeid)
++{
++ rq_entry_t *re;
++
++ SM_RETRY(re = (rq_entry_t *) kmalloc(sizeof(rq_entry_t), GFP_KERNEL),
++ re);
++ SM_RETRY(re->msg = (char *) kmalloc(len, GFP_KERNEL), re->msg);
++
++ memcpy(re->msg, msg, len);
++ re->len = len;
++ re->nodeid = nodeid;
++
++ spin_lock(&message_lock);
++ list_add_tail(&re->list, &messages);
++ spin_unlock(&message_lock);
++
++ wake_serviced(DO_MESSAGES);
++ return 0;
++}
++
++/*
++ * Context: cnxman
++ * Called by cnxman when a service manager message arrives.
++ */
++
++int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
++ unsigned int node_id)
++{
++ struct kcl_cluster_node kclnode;
++ uint32_t nodeid = 0;
++ int error = 0;
++
++ if (!node_id) {
++ error = kcl_get_node_by_addr(addr, addr_len, &kclnode);
++ if (error)
++ return error;
++ nodeid = kclnode.node_id;
++ } else
++ nodeid = node_id;
++
++ return add_to_recvqueue(msg, len, nodeid);
++}
++
++/*
++ * These send routines are used by sm and are always called from sm context.
++ */
++
++int send_nodeid_message(char *msg, int len, uint32_t nodeid)
++{
++ int error = 0;
++ struct sockaddr_cl saddr;
++
++ if (nodeid == sm_our_nodeid) {
++ add_to_recvqueue(msg, len, nodeid);
++ goto out;
++ }
++
++ saddr.scl_family = AF_CLUSTER;
++ saddr.scl_port = CLUSTER_PORT_SERVICES;
++ saddr.scl_nodeid = nodeid;
++ error = kcl_sendmsg(sm_socket, msg, len, &saddr,
++ sizeof(saddr), 0);
++ if (error > 0)
++ error = 0;
++
++ if (error)
++ log_print("send_nodeid_message error %d to %u", error, nodeid);
++ out:
++ return error;
++}
++
++int send_broadcast_message(char *msg, int len)
++{
++ int error;
++
++ error = kcl_sendmsg(sm_socket, msg, len, NULL, 0, 0);
++ if (error > 0)
++ error = 0;
++
++ add_to_recvqueue(msg, len, sm_our_nodeid);
++
++ if (error)
++ log_print("send_broadcast_message error %d", error);
++
++ return error;
++}
++
++int send_members_message(sm_group_t *sg, char *msg, int len)
++{
++ sm_node_t *node;
++ int error = 0;
++
++ list_for_each_entry(node, &sg->memb, list) {
++ error = send_nodeid_message(msg, len, node->id);
++ if (error < 0)
++ break;
++ }
++ return error;
++}
++
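++/*
++ * The _sev variants below arm the sevent to accept replies of the matching
++ * message type (checked by process_reply() via test_allowed_msgtype) and
++ * zero the reply count before sending, so a reply arriving immediately
++ * after the send is not dropped.
++ */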
++int send_members_message_sev(sm_group_t *sg, char *msg, int len,
++ sm_sevent_t * sev)
++{
++ int error;
++ sm_msg_t *smsg = (sm_msg_t *) msg;
++
++ set_allowed_msgtype(sev, smsg->ms_type);
++ sev->se_reply_count = 0;
++
++ error = send_members_message(sg, msg, len);
++ if (error < 0)
++ clear_allowed_msgtype(sev, smsg->ms_type);
++
++ return error;
++}
++
++int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev)
++{
++ int error;
++ sm_msg_t *smsg = (sm_msg_t *) msg;
++
++ set_allowed_msgtype(sev, smsg->ms_type);
++ sev->se_reply_count = 0;
++
++ error = send_broadcast_message(msg, len);
++ if (error < 0)
++ clear_allowed_msgtype(sev, smsg->ms_type);
++
++ return error;
++}
+diff -urN linux-orig/cluster/cman/sm_message.h linux-patched/cluster/cman/sm_message.h
+--- linux-orig/cluster/cman/sm_message.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_message.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,34 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_MESSAGE_DOT_H__
++#define __SM_MESSAGE_DOT_H__
++
++void init_messages(void);
++uint32_t sm_new_global_id(int level);
++void smsg_bswap_out(sm_msg_t * smsg);
++char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
++ sm_sevent_t *sev);
++void process_messages(void);
++int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
++ unsigned int node_id);
++int send_nodeid_message(char *msg, int len, uint32_t nodeid);
++int send_broadcast_message(char *msg, int len);
++int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev);
++int send_members_message(sm_group_t *sg, char *msg, int len);
++int send_members_message_sev(sm_group_t *sg, char *msg, int len,
++ sm_sevent_t * sev);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_misc.c linux-patched/cluster/cman/sm_misc.c
+--- linux-orig/cluster/cman/sm_misc.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_misc.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,369 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++#include "config.h"
++
++#define MAX_DEBUG_MSG_LEN (40)
++
++extern struct list_head sm_members;
++static uint32_t local_ids;
++static uint32_t event_id;
++static spinlock_t event_id_lock;
++static char * debug_buf;
++static unsigned int debug_size;
++static unsigned int debug_point;
++static int debug_wrap;
++static spinlock_t debug_lock;
++
++
++void init_sm_misc(void)
++{
++ local_ids = 1;
++ event_id = 1;
++ spin_lock_init(&event_id_lock);
++ debug_buf = NULL;
++ debug_size = 0;
++ debug_point = 0;
++ debug_wrap = 0;
++ spin_lock_init(&debug_lock);
++
++ sm_debug_setup(cman_config.sm_debug_size);
++}
++
++sm_node_t *sm_new_node(uint32_t nodeid)
++{
++ struct kcl_cluster_node kclnode;
++ sm_node_t *node;
++ int error;
++
++ error = kcl_get_node_by_nodeid(nodeid, &kclnode);
++ SM_ASSERT(!error,);
++
++ SM_RETRY(node = (sm_node_t *) kmalloc(sizeof(sm_node_t), GFP_KERNEL),
++ node);
++
++ memset(node, 0, sizeof(sm_node_t));
++ node->id = nodeid;
++ node->incarnation = kclnode.incarnation;
++ return node;
++}
++
++sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid)
++{
++ sm_node_t *node;
++
++ list_for_each_entry(node, &sg->joining, list) {
++ if (node->id == nodeid)
++ return node;
++ }
++ return NULL;
++}
++
++sm_node_t *sm_find_member(uint32_t nodeid)
++{
++ sm_node_t *node;
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (node->id == nodeid)
++ return node;
++ }
++ return NULL;
++}
++
++uint32_t sm_new_local_id(int level)
++{
++ uint32_t id = local_ids++;
++ uint8_t l = (uint8_t) level;
++
++ if (level > 0xFF)
++ return 0;
++
++ if (id > 0x00FFFFFF)
++ return 0;
++
++ id |= (l << 24);
++ return id;
++}
++
++int sm_id_to_level(uint32_t id)
++{
++ uint8_t l = (id & 0xFF000000) >> 24;
++
++ return (int) l;
++}
++
++void sm_set_event_id(int *id)
++{
++ spin_lock(&event_id_lock);
++ *id = event_id++;
++ spin_unlock(&event_id_lock);
++}
++
++sm_group_t *sm_local_id_to_sg(int id)
++{
++ sm_group_t *sg;
++ int level = sm_id_to_level(id);
++ int found = FALSE;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sm_sg[level], list) {
++ if (sg->local_id == id) {
++ found = TRUE;
++ break;
++ }
++ }
++ up(&sm_sglock);
++ if (!found)
++ sg = NULL;
++ return sg;
++}
++
++sm_group_t *sm_global_id_to_sg(int id)
++{
++ sm_group_t *sg;
++ int level = sm_id_to_level(id);
++ int found = FALSE;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sm_sg[level], list) {
++ if (sg->global_id == id) {
++ found = TRUE;
++ break;
++ }
++ }
++ up(&sm_sglock);
++ if (!found)
++ sg = NULL;
++ return sg;
++}
++
++void sm_debug_log(sm_group_t *sg, const char *fmt, ...)
++{
++ va_list va;
++ int i, n, size, len;
++ char buf[MAX_DEBUG_MSG_LEN+1];
++
++ spin_lock(&debug_lock);
++
++ if (!debug_buf)
++ goto out;
++
++ size = MAX_DEBUG_MSG_LEN;
++ memset(buf, 0, size+1);
++
++ n = snprintf(buf, size, "%08x ", sg->global_id);
++ size -= n;
++
++ va_start(va, fmt);
++ vsnprintf(buf+n, size, fmt, va);
++ va_end(va);
++
++ len = strlen(buf);
++ if (len > MAX_DEBUG_MSG_LEN-1)
++ len = MAX_DEBUG_MSG_LEN-1;
++ buf[len] = '\n';
++ buf[len+1] = '\0';
++
++ for (i = 0; i < strlen(buf); i++) {
++ debug_buf[debug_point++] = buf[i];
++
++ if (debug_point == debug_size) {
++ debug_point = 0;
++ debug_wrap = 1;
++ }
++ }
++ out:
++ spin_unlock(&debug_lock);
++}
++
++void sm_debug_setup(int size)
++{
++ char *b;
++
++ /* clamp before allocating so debug_size matches the allocation */
++ if (size > PAGE_SIZE)
++ size = PAGE_SIZE;
++
++ b = kmalloc(size, GFP_KERNEL);
++ if (!b)
++ return;
++
++ spin_lock(&debug_lock);
++ if (debug_buf)
++ kfree(debug_buf);
++
++ debug_size = size;
++ debug_point = 0;
++ debug_wrap = 0;
++ debug_buf = b;
++ memset(debug_buf, 0, debug_size);
++ spin_unlock(&debug_lock);
++}
++
++#ifdef CONFIG_PROC_FS
++
++int sm_debug_info(char *b, char **start, off_t offset, int length)
++{
++ int i, n = 0;
++
++ spin_lock(&debug_lock);
++
++ if (debug_wrap) {
++ for (i = debug_point; i < debug_size; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++ }
++ for (i = 0; i < debug_point; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++
++ spin_unlock(&debug_lock);
++
++ return n;
++}
++
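++/*
++ * Illustrative output (values made up):
++ *
++ * Service          Name             GID LID State     Code
++ * Fence Domain:    "default"          1   1 run       -
++ * [1 2 3]
++ */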
++int sm_procdata(char *b, char **start, off_t offset, int length)
++{
++ sm_group_t *sg;
++ sm_node_t *node;
++ int n = 0, level, i;
++
++ n += sprintf(b + n, "\n");
++
++ /*
++ * Header
++ */
++
++ n += sprintf(b + n,
++ "Service Name GID LID State Code\n");
++
++ down(&sm_sglock);
++
++ for (level = 0; level < SG_LEVELS; level++) {
++ list_for_each_entry(sg, &sm_sg[level], list) {
++
++ /*
++ * Cluster Service
++ */
++
++ switch (level) {
++ case SERVICE_LEVEL_FENCE:
++ n += sprintf(b + n, "Fence Domain: ");
++ break;
++ case SERVICE_LEVEL_GDLM:
++ n += sprintf(b + n, "DLM Lock Space: ");
++ break;
++ case SERVICE_LEVEL_GFS:
++ n += sprintf(b + n, "GFS Mount Group: ");
++ break;
++ case SERVICE_LEVEL_USER:
++ n += sprintf(b + n, "User: ");
++ break;
++ }
++
++ /*
++ * Name
++ */
++
++ n += sprintf(b + n, "\"");
++ for (i = 0; i < sg->namelen; i++)
++ n += sprintf(b + n, "%c", sg->name[i]);
++ n += sprintf(b + n, "\"");
++
++ for (; i < MAX_SERVICE_NAME_LEN-1; i++)
++ n += sprintf(b + n, " ");
++
++ /*
++ * GID LID (sans level from top byte)
++ */
++
++ n += sprintf(b + n, "%3u %3u ",
++ (sg->global_id & 0x00FFFFFF),
++ (sg->local_id & 0x00FFFFFF));
++
++ /*
++ * State
++ */
++
++ switch (sg->state) {
++ case SGST_NONE:
++ n += sprintf(b + n, "none ");
++ break;
++ case SGST_JOIN:
++ n += sprintf(b + n, "join ");
++ break;
++ case SGST_RUN:
++ n += sprintf(b + n, "run ");
++ break;
++ case SGST_RECOVER:
++ n += sprintf(b + n, "recover %u ",
++ sg->recover_state);
++ break;
++ case SGST_UEVENT:
++ n += sprintf(b + n, "update ");
++ break;
++ }
++
++ /*
++ * Code
++ */
++
++ if (test_bit(SGFL_SEVENT, &sg->flags))
++ n += sprintf(b + n, "S");
++ if (test_bit(SGFL_UEVENT, &sg->flags))
++ n += sprintf(b + n, "U");
++ if (test_bit(SGFL_NEED_RECOVERY, &sg->flags))
++ n += sprintf(b + n, "N");
++
++ n += sprintf(b + n, "-");
++
++ if (test_bit(SGFL_SEVENT, &sg->flags)
++ && sg->sevent) {
++ n += sprintf(b + n, "%u,%lx,%u",
++ sg->sevent->se_state,
++ sg->sevent->se_flags,
++ sg->sevent->se_reply_count);
++ }
++
++ if (test_bit(SGFL_UEVENT, &sg->flags)) {
++ n += sprintf(b + n, "%u,%lx,%u",
++ sg->uevent.ue_state,
++ sg->uevent.ue_flags,
++ sg->uevent.ue_nodeid);
++ }
++
++ n += sprintf(b + n, "\n");
++
++ /*
++ * node list
++ */
++
++ i = 0;
++
++ n += sprintf(b + n, "[");
++
++ list_for_each_entry(node, &sg->memb, list) {
++ if (i && !(i % 24))
++ n += sprintf(b + n, "\n");
++
++ if (i)
++ n += sprintf(b + n, " ");
++
++ n += sprintf(b + n, "%u", node->id);
++ i++;
++ }
++
++ n += sprintf(b + n, "]\n\n");
++ }
++ }
++
++ up(&sm_sglock);
++
++ return n;
++}
++#endif
+diff -urN linux-orig/cluster/cman/sm_misc.h linux-patched/cluster/cman/sm_misc.h
+--- linux-orig/cluster/cman/sm_misc.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_misc.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_MISC_DOT_H__
++#define __SM_MISC_DOT_H__
++
++void init_sm_misc(void);
++sm_node_t *sm_new_node(uint32_t nodeid);
++sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid);
++sm_node_t *sm_find_member(uint32_t nodeid);
++uint32_t sm_new_local_id(int level);
++int sm_id_to_level(uint32_t id);
++void sm_set_event_id(int *id);
++sm_group_t *sm_local_id_to_sg(int id);
++sm_group_t *sm_global_id_to_sg(int id);
++void sm_debug_log(sm_group_t *sg, const char *fmt, ...);
++void sm_debug_setup(int size);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_recover.c linux-patched/cluster/cman/sm_recover.c
+--- linux-orig/cluster/cman/sm_recover.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_recover.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,522 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++#include "config.h"
++
++/*
++ * A collection of sg's which need to be recovered due to a failed member.
++ * These sg's are recovered in order of level. An sg subject to cascading
++ * failures is moved from one of these structs to a newer one.
++ */
++
++struct recover {
++ struct list_head list; /* list of current re's */
++ struct list_head sgs[SG_LEVELS]; /* lists of sg's by level */
++ int event_id; /* event id */
++ int cur_level;
++};
++typedef struct recover recover_t;
++
++
++extern uint32_t * sm_new_nodeids;
++extern int sm_quorum, sm_quorum_next;
++extern uint32_t sm_our_nodeid;
++extern struct list_head sm_members;
++extern int sm_member_count;
++static struct list_head recoveries;
++
++
++void init_recovery(void)
++{
++ INIT_LIST_HEAD(&recoveries);
++}
++
++/*
++ * This is the first thing called when a change is announced in cluster
++ * membership. Nodes are marked as being a CLUSTER_MEMBER or not. SM adds
++ * nodes that it has not seen before to its sm_members list. Nodes which were
++ * alive but are now gone are marked as "need recovery".
++ *
++ * The "need recovery" status of nodes is propagated to the node's SG's in
++ * mark_effected_sgs. The effected SG's are themselves marked as needing
++ * recovery and in new_recovery the dead nodes are removed from the SG's
++ * individual member lists. The "need recovery" status of nodes is cleared in
++ * adjust_members_done().
++ */
++
++static int adjust_members(void)
++{
++ sm_node_t *node;
++ struct kcl_cluster_node knode;
++ int i, error, num_nodes, sub = 0, add = 0, found;
++
++ /*
++ * Get list of current members from cnxman
++ */
++
++ memset(sm_new_nodeids, 0, cman_config.max_nodes * sizeof(uint32_t));
++ num_nodes = kcl_get_member_ids(sm_new_nodeids, cman_config.max_nodes);
++
++ /*
++ * Determine who's gone
++ */
++
++ list_for_each_entry(node, &sm_members, list) {
++ found = FALSE;
++ for (i = 0; i < num_nodes; i++) {
++ if (node->id == sm_new_nodeids[i]) {
++ found = TRUE;
++ sm_new_nodeids[i] = 0;
++ break;
++ }
++ }
++
++ if (found) {
++ error = kcl_get_node_by_nodeid(node->id, &knode);
++ SM_ASSERT(!error, printk("error=%d\n", error););
++
++ if (!test_bit(SNFL_CLUSTER_MEMBER, &node->flags)) {
++ /* former member is back */
++ set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
++ node->incarnation = knode.incarnation;
++ add++;
++ } else {
++ /* current member is still alive - if the
++ * incarnation number is different it died and
++ * returned between checks */
++ if (node->incarnation != knode.incarnation) {
++ set_bit(SNFL_NEED_RECOVERY,
++ &node->flags);
++ node->incarnation = knode.incarnation;
++ sub++;
++ }
++ }
++ } else {
++ /* current member has died */
++ if (test_and_clear_bit(SNFL_CLUSTER_MEMBER,
++ &node->flags)) {
++ set_bit(SNFL_NEED_RECOVERY, &node->flags);
++ sub++;
++ }
++ }
++ }
++
++ /*
++ * Look for new nodes
++ */
++
++ for (i = 0; i < num_nodes; i++) {
++ if (sm_new_nodeids[i]) {
++ node = sm_new_node(sm_new_nodeids[i]);
++ set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
++ add++;
++ list_add_tail(&node->list, &sm_members);
++ sm_member_count++;
++ }
++ }
++
++ /*
++ * Get our own nodeid
++ */
++
++ if (!sm_our_nodeid) {
++ list_for_each_entry(node, &sm_members, list) {
++ error = kcl_get_node_by_nodeid(node->id, &knode);
++ SM_ASSERT(!error, printk("error=%d\n", error););
++
++ if (knode.us) {
++ sm_our_nodeid = knode.node_id;
++ break;
++ }
++ }
++ }
++
++ return sub;
++}
++
++/*
++ * Given some number of dead nodes, flag the SG's the dead nodes were part of.
++ * This requires nested loops because each node structure does not keep a
++ * list of the SG's it belongs to.
++ */
++
++static int mark_effected_sgs(void)
++{
++ sm_group_t *sg;
++ sm_node_t *node, *sgnode;
++ uint32_t dead_id;
++ int i, effected = 0;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(node, &sm_members, list) {
++ if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
++ continue;
++
++ dead_id = node->id;
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &sm_sg[i], list) {
++ /* check if dead node is among sg's members */
++ list_for_each_entry(sgnode, &sg->memb, list) {
++ if (sgnode->id == dead_id) {
++ set_bit(SGFL_NEED_RECOVERY,
++ &sg->flags);
++ effected++;
++ break;
++ }
++ }
++ }
++ }
++ }
++ up(&sm_sglock);
++
++ return effected;
++}
++
++static recover_t *alloc_recover(void)
++{
++ recover_t *rev;
++ int i;
++
++ SM_RETRY(rev = kmalloc(sizeof(recover_t), GFP_KERNEL), rev);
++
++ memset(rev, 0, sizeof(recover_t));
++
++ sm_set_event_id(&rev->event_id);
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ INIT_LIST_HEAD(&rev->sgs[i]);
++ }
++
++ return rev;
++}
++
++/*
++ * An in-progress revent restart of an SG has been interrupted by another
++ * node failure in the SG. Cancel any outstanding barrier; the SG will be
++ * moved to the new revent and restarted as part of that.
++ */
++
++static void cancel_prev_recovery(sm_group_t *sg)
++{
++ int error;
++
++ if (sg->recover_state == RECOVER_BARRIERWAIT) {
++ error = kcl_barrier_cancel(sg->recover_barrier);
++ if (error)
++ log_error(sg, "cancel_prev_recovery: error %d", error);
++ }
++}
++
++static void pre_recover_sg(sm_group_t *sg, recover_t *rev)
++{
++ if (sg->state == SGST_RECOVER) {
++ cancel_prev_recovery(sg);
++ list_del(&sg->recover_list);
++ }
++
++ sg->ops->stop(sg->service_data);
++ sg->state = SGST_RECOVER;
++ sg->recover_state = RECOVER_NONE;
++ sg->recover_data = rev;
++ list_add(&sg->recover_list, &rev->sgs[sg->level]);
++}
++
++/*
++ * When adjust_members finds that some nodes are dead and mark_effected_sgs
++ * finds that some SG's are affected by departed nodes, this is called to
++ * collect together the SG's which need to be recovered. A revent (recovery
++ * event) is the group of affected SG's.
++ */
++
++static int new_recovery(void)
++{
++ sm_group_t *sg;
++ recover_t *rev;
++ sm_node_t *node, *sgnode, *safe;
++ int i;
++
++ rev = alloc_recover();
++ list_add_tail(&rev->list, &recoveries);
++
++ down(&sm_sglock);
++
++ /*
++ * Stop effected SG's and add them to the rev
++ */
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &sm_sg[i], list) {
++ if (test_and_clear_bit(SGFL_NEED_RECOVERY, &sg->flags)){
++ if (sg->state == SGST_JOIN)
++ continue;
++ pre_recover_sg(sg, rev);
++ }
++ }
++ }
++
++ /*
++ * For an SG needing recovery, remove dead nodes from sg->memb list
++ */
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ list_for_each_entry(sg, &rev->sgs[i], recover_list) {
++
++ /* Remove dead members from SG's member list */
++ list_for_each_entry_safe(sgnode, safe, &sg->memb, list){
++
++ node = sm_find_member(sgnode->id);
++ SM_ASSERT(node, printk("id %u\n", sgnode->id););
++
++ if (test_bit(SNFL_NEED_RECOVERY, &node->flags)){
++ list_del(&sgnode->list);
++ kfree(sgnode);
++ sg->memb_count--;
++ log_debug(sg, "remove node %u count %d",
++ sgnode->id, sg->memb_count);
++ }
++ }
++ }
++ }
++
++ up(&sm_sglock);
++ rev->cur_level = 0;
++ return 0;
++}
++
++/*
++ * The NEED_RECOVERY bit on MML nodes is set in adjust_members() and is used in
++ * mark_effected_sgs() and new_recovery(). After that, we're done using the bit
++ * and we clear it here.
++ */
++
++static void adjust_members_done(void)
++{
++ sm_node_t *node;
++
++ list_for_each_entry(node, &sm_members, list)
++ clear_bit(SNFL_NEED_RECOVERY, &node->flags);
++}
++
++/*
++ * Start the service of the given SG. The service must be given an array of
++ * nodeids specifying the new sg membership. The service is responsible for
++ * freeing this memory when done with it.
++ */
++
++static void start_sg(sm_group_t *sg, uint32_t event_id)
++{
++ sm_node_t *node;
++ uint32_t *memb;
++ int count = 0;
++
++ SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
++ memb);
++
++ list_for_each_entry(node, &sg->memb, list)
++ memb[count++] = node->id;
++
++ sg->ops->start(sg->service_data, memb, count, event_id,
++ SERVICE_NODE_FAILED);
++}
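++
++/*
++ * For illustration only: a service's start callback might consume the
++ * memb array built above along these lines. This is a sketch; "struct
++ * example" and its fields are invented, and only the ownership rule
++ * stated above is assumed.
++ *
++ *	static int example_start(void *servicedata, uint32_t *nodeids,
++ *				 int count, int event_id, int type)
++ *	{
++ *		struct example *ex = servicedata;
++ *
++ *		kfree(ex->nodeids);
++ *		ex->nodeids = nodeids;
++ *		ex->node_count = count;
++ *		ex->start_event = event_id;
++ *		return 0;
++ *	}
++ *
++ * The callback saves the array (freeing any previous one) and returns
++ * quickly; kcl_start_done() is called later from the service's own
++ * context once the restart work is complete, as sm_user.c does.
++ */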
++
++static void recovery_barrier(sm_group_t *sg)
++{
++ char bname[MAX_BARRIER_NAME_LEN];
++ int error, len;
++
++ memset(bname, 0, MAX_BARRIER_NAME_LEN);
++
++ /* bypass the barrier if we're the only member */
++ if (sg->memb_count == 1) {
++ process_recovery_barrier(sg, 0);
++ return;
++ }
++
++ len = snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.RECOV.%u",
++ sg->global_id, sg->recover_stop, sg->memb_count);
++
++ /* We save this barrier name so we can cancel it if needed. */
++ memset(sg->recover_barrier, 0, MAX_BARRIER_NAME_LEN);
++ memcpy(sg->recover_barrier, bname, len);
++
++ error = sm_barrier(bname, sg->memb_count, SM_BARRIER_RECOVERY);
++ if (error)
++ log_error(sg, "recovery_barrier error %d: %s", error, bname);
++}
++
++static void recover_sg(sm_group_t *sg, int event_id)
++{
++ log_debug(sg, "recover state %d", sg->recover_state);
++
++ switch (sg->recover_state) {
++
++ case RECOVER_NONE:
++ /* must wait for recovery to stop sg on all nodes */
++ sg->recover_state = RECOVER_BARRIERWAIT;
++ sg->recover_stop = 0;
++ recovery_barrier(sg);
++ break;
++
++ case RECOVER_BARRIERWAIT:
++ break;
++
++ case RECOVER_STOP:
++ /* barrier callback sets state STOP */
++ sg->recover_stop = 1;
++ sg->recover_state = RECOVER_START;
++ start_sg(sg, event_id);
++ break;
++
++ case RECOVER_START:
++ break;
++
++ case RECOVER_STARTDONE:
++ /* service callback sets state STARTDONE */
++ sg->recover_state = RECOVER_BARRIERWAIT;
++ recovery_barrier(sg);
++ break;
++
++ case RECOVER_BARRIERDONE:
++ /* barrier callback sets state BARRIERDONE */
++ sg->ops->finish(sg->service_data, event_id);
++ list_del(&sg->recover_list);
++ sg->recover_state = RECOVER_NONE;
++ sg->state = SGST_RUN;
++
++ /* Continue a previous, interrupted attempt to leave the sg */
++ if (sg->sevent) {
++ clear_bit(SEFL_DELAY, &sg->sevent->se_flags);
++ set_bit(SEFL_CHECK, &sg->sevent->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ }
++ break;
++
++ default:
++ log_error(sg, "invalid recover_state %u", sg->recover_state);
++ }
++}
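++
++/*
++ * Taken together, the cases above (with the barrier and service callbacks
++ * that set STOP, STARTDONE and BARRIERDONE) drive a recovering sg through:
++ *
++ *	NONE -> BARRIERWAIT -> STOP -> START -> STARTDONE
++ *	     -> BARRIERWAIT -> BARRIERDONE -> (back to SGST_RUN)
++ *
++ * The first barrier ensures every node has stopped the sg before any node
++ * restarts it; the second ensures every node has finished restarting
++ * before finish() is called.
++ */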
++
++static void recover_level(recover_t *rev, int level)
++{
++ sm_group_t *sg, *safe;
++
++ list_for_each_entry_safe(sg, safe, &rev->sgs[level], recover_list)
++ recover_sg(sg, rev->event_id);
++}
++
++static void recover_levels(recover_t *rev)
++{
++ for (;;) {
++ recover_level(rev, rev->cur_level);
++
++ if (list_empty(&rev->sgs[rev->cur_level])) {
++ if (rev->cur_level == SG_LEVELS - 1) {
++ list_del(&rev->list);
++ kfree(rev);
++ return;
++ }
++ rev->cur_level++;
++ continue;
++ }
++ break;
++ }
++}
++
++/*
++ * Called by SM thread when the cluster is quorate. It restarts
++ * SG's that were stopped in new_recovery() due to a member death.
++ * It waits for all SG's at level N to complete restart before
++ * restarting SG's at level N+1.
++ */
++
++void process_recoveries(void)
++{
++ recover_t *rev, *safe;
++
++ down(&sm_sglock);
++ list_for_each_entry_safe(rev, safe, &recoveries, list)
++ recover_levels(rev);
++ up(&sm_sglock);
++}
++
++/*
++ * The cnxman membership has changed. Check if there's still quorum and
++ * whether any nodes have died. If nodes have died, initiate recovery on any
++ * SG's they were in. This begins immediately if the cluster remains quorate;
++ * if not, recovery waits until the cluster regains quorum.
++ */
++
++void process_nodechange(void)
++{
++ int gone, effected;
++
++ if ((sm_quorum = sm_quorum_next))
++ wake_serviced(DO_RUN);
++
++ gone = adjust_members();
++ if (gone > 0) {
++ effected = mark_effected_sgs();
++
++ backout_sevents();
++ cancel_uevents(&effected);
++
++ if (effected > 0) {
++ new_recovery();
++ wake_serviced(DO_RECOVERIES);
++ }
++ }
++ adjust_members_done();
++}
++
++int check_recovery(sm_group_t *sg, int event_id)
++{
++ if (sg->state == SGST_RECOVER) {
++ recover_t *rev = (recover_t *) sg->recover_data;
++ if (rev && rev->event_id == event_id)
++ return 1;
++ }
++ return 0;
++}
++
++void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid)
++{
++ sm_group_t *sg;
++ recover_t *rev;
++
++ sg = sm_global_id_to_sg(smsg->ms_global_sgid);
++ if (!sg) {
++ log_print("process_recover_msg: unknown sg id %x",
++ smsg->ms_global_sgid);
++ return;
++ }
++
++ /* we already know about the recovery and can ignore the msg */
++ if (sg->state == SGST_RECOVER)
++ return;
++
++ if (test_bit(SGFL_UEVENT, &sg->flags)) {
++ /* we know about the uevent and will initiate recovery on our
++ own, so we can ignore this msg */
++ log_debug(sg, "process_recover_msg: ignore from %u", nodeid);
++ return;
++ }
++
++ log_debug(sg, "recovery initiated by msg from %u", nodeid);
++ rev = alloc_recover();
++ list_add_tail(&rev->list, &recoveries);
++ pre_recover_sg(sg, rev);
++ wake_serviced(DO_RECOVERIES);
++}
+diff -urN linux-orig/cluster/cman/sm_recover.h linux-patched/cluster/cman/sm_recover.h
+--- linux-orig/cluster/cman/sm_recover.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_recover.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,23 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_RECOVER_DOT_H__
++#define __SM_RECOVER_DOT_H__
++
++void init_recovery(void);
++void process_recoveries(void);
++void process_nodechange(void);
++int check_recovery(sm_group_t *sg, int event_id);
++void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_services.c linux-patched/cluster/cman/sm_services.c
+--- linux-orig/cluster/cman/sm_services.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_services.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,418 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++
++static struct list_head callbacks;
++static spinlock_t callback_lock;
++static struct list_head sg_registered[SG_LEVELS];
++
++/*
++ * These are the functions services use to register with, join, leave and
++ * unregister from the SM, and to receive callbacks from it.
++ */
++
++struct sc_entry {
++ struct list_head list;
++ uint32_t local_id;
++ int event_id;
++};
++typedef struct sc_entry sc_entry_t;
++
++void init_services(void)
++{
++ int i;
++
++ INIT_LIST_HEAD(&callbacks);
++ spin_lock_init(&callback_lock);
++
++ for (i = 0; i < SG_LEVELS; i++) {
++ INIT_LIST_HEAD(&sm_sg[i]);
++ INIT_LIST_HEAD(&sg_registered[i]);
++ }
++ init_MUTEX(&sm_sglock);
++}
++
++/* Context: service */
++
++int kcl_register_service(char *name, int namelen, int level,
++ struct kcl_service_ops *ops, int unique,
++ void *servicedata, uint32_t *service_id)
++{
++ sm_group_t *sg;
++ int found = FALSE;
++ int error = -EINVAL;
++
++ if (level > SG_LEVELS - 1)
++ goto fail;
++
++ if (namelen > MAX_SERVICE_NAME_LEN)
++ goto fail;
++
++ error = kcl_addref_cluster();
++ if (error)
++ goto fail;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sm_sg[level], list) {
++ if ((sg->namelen == namelen) &&
++ (!strncmp(sg->name, name, namelen))) {
++ found = TRUE;
++ goto next;
++ }
++ }
++
++ list_for_each_entry(sg, &sg_registered[level], list) {
++ if ((sg->namelen == namelen) &&
++ (!strncmp(sg->name, name, namelen))) {
++ found = TRUE;
++ goto next;
++ }
++ }
++
++ next:
++
++ if (found && unique) {
++ error = -EEXIST;
++ goto fail_unlock;
++ }
++
++ if (found) {
++ sg->refcount++;
++ goto out;
++ }
++
++ sg = (sm_group_t *) kmalloc(sizeof(sm_group_t) + namelen, GFP_KERNEL);
++ if (!sg) {
++ error = -ENOMEM;
++ goto fail_unlock;
++ }
++ memset(sg, 0, sizeof(sm_group_t) + namelen);
++
++ sg->refcount = 1;
++ sg->service_data = servicedata;
++ sg->ops = ops;
++ sg->level = level;
++ sg->namelen = namelen;
++ memcpy(sg->name, name, namelen);
++ sg->local_id = sm_new_local_id(level);
++ sg->state = SGST_NONE;
++ INIT_LIST_HEAD(&sg->memb);
++ INIT_LIST_HEAD(&sg->joining);
++ init_completion(&sg->event_comp);
++
++ list_add_tail(&sg->list, &sg_registered[level]);
++
++ out:
++ *service_id = sg->local_id;
++ up(&sm_sglock);
++ return 0;
++
++ fail_unlock:
++ up(&sm_sglock);
++ kcl_releaseref_cluster();
++ fail:
++ return error;
++}
++
++/* Context: service */
++
++void kcl_unregister_service(uint32_t local_id)
++{
++ sm_group_t *sg;
++ int level = sm_id_to_level(local_id);
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sg_registered[level], list) {
++ if (sg->local_id == local_id) {
++ SM_ASSERT(sg->refcount,);
++ sg->refcount--;
++
++ if (!sg->refcount) {
++ list_del(&sg->list);
++ kfree(sg);
++ }
++ kcl_releaseref_cluster();
++ break;
++ }
++ }
++ up(&sm_sglock);
++}
++
++/* Context: service */
++
++int kcl_join_service(uint32_t local_id)
++{
++ sm_group_t *sg;
++ sm_sevent_t *sev;
++ int level = sm_id_to_level(local_id);
++ int error, found = FALSE;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sg_registered[level], list) {
++ if (sg->local_id == local_id) {
++ found = TRUE;
++ break;
++ }
++ }
++
++ if (!found) {
++ up(&sm_sglock);
++ error = -ENOENT;
++ goto out;
++ }
++
++ if (sg->state != SGST_NONE) {
++ up(&sm_sglock);
++ error = -EINVAL;
++ goto out;
++ }
++
++ sg->state = SGST_JOIN;
++ set_bit(SGFL_SEVENT, &sg->flags);
++ list_del(&sg->list);
++ list_add_tail(&sg->list, &sm_sg[sg->level]);
++
++ up(&sm_sglock);
++
++ /*
++ * The join is a service event which will be processed asynchronously.
++ */
++
++ sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
++ if (!sev) {
++ error = -ENOMEM;
++ goto out;
++ }
++
++ memset(sev, 0, sizeof (sm_sevent_t));
++ sev->se_state = SEST_JOIN_BEGIN;
++ sev->se_sg = sg;
++ sg->sevent = sev;
++ sm_set_event_id(&sev->se_id);
++
++ new_joinleave(sev);
++ wait_for_completion(&sg->event_comp);
++ error = 0;
++
++ out:
++ return error;
++}
++
++/* Context: service */
++
++int kcl_leave_service(uint32_t local_id)
++{
++ sm_group_t *sg = NULL;
++ sm_sevent_t *sev;
++ int error;
++
++ error = -ENOENT;
++ sg = sm_local_id_to_sg(local_id);
++ if (!sg)
++ goto out;
++
++ /* sg was never joined */
++ error = -EINVAL;
++ if (sg->state == SGST_NONE)
++ goto out;
++
++ /* may still be joining */
++ error = -EBUSY;
++ if (test_and_set_bit(SGFL_SEVENT, &sg->flags))
++ goto out;
++
++ error = -ENOMEM;
++ sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
++ if (!sev)
++ goto out;
++
++ memset(sev, 0, sizeof (sm_sevent_t));
++ sev->se_state = SEST_LEAVE_BEGIN;
++ set_bit(SEFL_LEAVE, &sev->se_flags);
++ sev->se_sg = sg;
++ sg->sevent = sev;
++ sm_set_event_id(&sev->se_id);
++
++ new_joinleave(sev);
++ wait_for_completion(&sg->event_comp);
++ error = 0;
++
++ down(&sm_sglock);
++ list_del(&sg->list);
++ list_add_tail(&sg->list, &sg_registered[sg->level]);
++ up(&sm_sglock);
++
++ out:
++ return error;
++}
++
++static void process_callback(uint32_t local_id, int event_id)
++{
++ sm_group_t *sg;
++ sm_sevent_t *sev;
++ sm_uevent_t *uev;
++
++ sg = sm_local_id_to_sg(local_id);
++ if (!sg)
++ return;
++
++ if (sg->state == SGST_RECOVER) {
++ if (!check_recovery(sg, event_id)) {
++ log_error(sg, "process_callback invalid recover "
++ "event id %d", event_id);
++ return;
++ }
++
++ if (sg->recover_state == RECOVER_START)
++ sg->recover_state = RECOVER_STARTDONE;
++ else
++ log_error(sg, "process_callback recover state %u",
++ sg->recover_state);
++ wake_serviced(DO_RECOVERIES);
++ }
++
++ else if (test_bit(SGFL_SEVENT, &sg->flags) && sg->sevent &&
++ (sg->sevent->se_id == event_id)) {
++ sev = sg->sevent;
++
++ if (test_and_clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags) &&
++ (sev->se_state == SEST_JSTART_SERVICEWAIT))
++ sev->se_state = SEST_JSTART_SERVICEDONE;
++
++ set_bit(SEFL_CHECK, &sev->se_flags);
++ wake_serviced(DO_JOINLEAVE);
++ }
++
++ else if (test_bit(SGFL_UEVENT, &sg->flags) &&
++ (sg->uevent.ue_id == event_id)) {
++ uev = &sg->uevent;
++
++ if (test_and_clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags)) {
++ if (uev->ue_state == UEST_JSTART_SERVICEWAIT)
++ uev->ue_state = UEST_JSTART_SERVICEDONE;
++ else if (uev->ue_state == UEST_LSTART_SERVICEWAIT)
++ uev->ue_state = UEST_LSTART_SERVICEDONE;
++ }
++ set_bit(UEFL_CHECK, &uev->ue_flags);
++ wake_serviced(DO_MEMBERSHIP);
++ }
++
++ else
++ log_error(sg, "ignoring service callback id=%x event=%u",
++ local_id, event_id);
++}
++
++void process_callbacks(void)
++{
++ sc_entry_t *se;
++
++ while (1) {
++ se = NULL;
++
++ spin_lock(&callback_lock);
++ if (!list_empty(&callbacks)) {
++ se = list_entry(callbacks.next, sc_entry_t, list);
++ list_del(&se->list);
++ }
++ spin_unlock(&callback_lock);
++
++ if (!se)
++ break;
++ process_callback(se->local_id, se->event_id);
++ kfree(se);
++ schedule();
++ }
++}
++
++/* Context: service */
++
++void kcl_start_done(uint32_t local_id, int event_id)
++{
++ sc_entry_t *se;
++
++ SM_RETRY(se = kmalloc(sizeof(sc_entry_t), GFP_KERNEL), se);
++
++ se->local_id = local_id;
++ se->event_id = event_id;
++
++ spin_lock(&callback_lock);
++ list_add_tail(&se->list, &callbacks);
++ spin_unlock(&callback_lock);
++
++ wake_serviced(DO_CALLBACKS);
++}
++
++/* Context: service */
++
++void kcl_global_service_id(uint32_t local_id, uint32_t *global_id)
++{
++ sm_group_t *sg = sm_local_id_to_sg(local_id);
++
++ if (!sg)
++ log_print("kcl_global_service_id: can't find %x", local_id);
++ else
++ *global_id = sg->global_id;
++}
++
++static void copy_to_service(sm_group_t *sg, struct kcl_service *s)
++{
++ s->level = sg->level;
++ s->local_id = sg->local_id;
++ s->global_id = sg->global_id;
++ s->node_count = sg->memb_count;
++ strcpy(s->name, sg->name);
++}
++
++int kcl_get_services(struct list_head *head, int level)
++{
++ sm_group_t *sg;
++ struct kcl_service *s;
++ int error = -ENOMEM, count = 0;
++
++ down(&sm_sglock);
++
++ list_for_each_entry(sg, &sg_registered[level], list) {
++ if (head) {
++ s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
++ if (!s)
++ goto out;
++ copy_to_service(sg, s);
++ list_add(&s->list, head);
++ }
++ count++;
++ }
++
++ list_for_each_entry(sg, &sm_sg[level], list) {
++ if (head) {
++ s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
++ if (!s)
++ goto out;
++ copy_to_service(sg, s);
++ list_add(&s->list, head);
++ }
++ count++;
++ }
++
++ error = count;
++ out:
++ up(&sm_sglock);
++ return error;
++}
++
++/* These global variables are declared extern in sm.h. */
++struct list_head sm_sg[SG_LEVELS];
++struct semaphore sm_sglock;
+diff -urN linux-orig/cluster/cman/sm_services.h linux-patched/cluster/cman/sm_services.h
+--- linux-orig/cluster/cman/sm_services.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_services.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_SERVICES_DOT_H__
++#define __SM_SERVICES_DOT_H__
++
++void init_services(void);
++void process_callbacks(void);
++
++#endif
+diff -urN linux-orig/cluster/cman/sm_user.c linux-patched/cluster/cman/sm_user.c
+--- linux-orig/cluster/cman/sm_user.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_user.c 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,563 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "sm.h"
++#include "cnxman-private.h"
++
++void copy_to_usernode(struct cluster_node *node, struct cl_cluster_node *unode);
++
++#define UST_REGISTER 1
++#define UST_UNREGISTER 2
++#define UST_JOIN 3
++#define UST_LEAVE 4
++#define UST_JOINED 5
++
++struct event {
++ struct list_head list;
++ service_event_t type;
++ service_start_t start_type;
++ unsigned int event_id;
++ unsigned int last_stop;
++ unsigned int last_start;
++ unsigned int last_finish;
++ unsigned int node_count;
++ uint32_t * nodeids;
++};
++typedef struct event event_t;
++
++struct user_service {
++ uint32_t local_id;
++ pid_t pid;
++ int signal;
++ struct socket * sock;
++ uint8_t state;
++ uint8_t async;
++ struct semaphore lock;
++ struct list_head events;
++ spinlock_t event_lock;
++ unsigned int last_stop;
++ unsigned int last_start;
++ unsigned int last_finish;
++ unsigned int need_startdone;
++ unsigned int node_count;
++ uint32_t * nodeids;
++ int name_len;
++ char name[MAX_SERVICE_NAME_LEN];
++};
++typedef struct user_service user_service_t;
++
++
++static void add_event(user_service_t *us, event_t *ev)
++{
++ spin_lock(&us->event_lock);
++ list_add_tail(&ev->list, &us->events);
++
++ switch(ev->type) {
++ case SERVICE_EVENT_STOP:
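++ /* a stop event carries no id of its own; record the id of
++ the start it stops */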
++ us->last_stop = us->last_start;
++ break;
++ case SERVICE_EVENT_START:
++ us->last_start = ev->event_id;
++ break;
++ case SERVICE_EVENT_FINISH:
++ us->last_finish = ev->event_id;
++ break;
++ case SERVICE_EVENT_LEAVEDONE:
++ break;
++ }
++ spin_unlock(&us->event_lock);
++}
++
++static event_t *get_event(user_service_t *us)
++{
++ event_t *ev = NULL;
++
++ spin_lock(&us->event_lock);
++ if (!list_empty(&us->events)) {
++ ev = list_entry(us->events.next, event_t, list);
++ ev->last_stop = us->last_stop;
++ ev->last_start = us->last_start;
++ ev->last_finish = us->last_finish;
++ }
++ spin_unlock(&us->event_lock);
++ return ev;
++}
++
++static void del_event(user_service_t *us, event_t *ev)
++{
++ spin_lock(&us->event_lock);
++ list_del(&ev->list);
++ spin_unlock(&us->event_lock);
++}
++
++static event_t *alloc_event(void)
++{
++ event_t *ev;
++ SM_RETRY(ev = (event_t *) kmalloc(sizeof(event_t), GFP_KERNEL), ev);
++ memset(ev, 0, sizeof(event_t));
++ return ev;
++}
++
++/* us->lock must be held before calling */
++static void user_notify(user_service_t *us)
++{
++ if (us->sock)
++ queue_oob_skb(us->sock, CLUSTER_OOB_MSG_SERVICEEVENT);
++ if (us->pid && us->signal)
++ kill_proc(us->pid, us->signal, 0);
++}
++
++static service_start_t start_type(int type)
++{
++ switch (type) {
++ case SERVICE_NODE_FAILED:
++ return SERVICE_START_FAILED;
++ case SERVICE_NODE_JOIN:
++ return SERVICE_START_JOIN;
++ case SERVICE_NODE_LEAVE:
++ return SERVICE_START_LEAVE;
++ }
++ return 0;
++}
++
++static int user_stop(void *servicedata)
++{
++ user_service_t *us = (user_service_t *) servicedata;
++ event_t *ev;
++
++ down(&us->lock);
++ if (!us->sock)
++ goto out;
++
++ ev = alloc_event();
++ ev->type = SERVICE_EVENT_STOP;
++
++ add_event(us, ev);
++ user_notify(us);
++ out:
++ up(&us->lock);
++ return 0;
++}
++
++static int user_start(void *servicedata, uint32_t *nodeids, int count,
++ int event_id, int type)
++{
++ user_service_t *us = (user_service_t *) servicedata;
++ event_t *ev;
++
++ down(&us->lock);
++ if (!us->sock) {
++ kcl_start_done(us->local_id, event_id);
++ goto out;
++ }
++
++ us->need_startdone = event_id;
++
++ ev = alloc_event();
++ ev->type = SERVICE_EVENT_START;
++ ev->node_count = count;
++ ev->start_type = start_type(type);
++ ev->event_id = event_id;
++ ev->nodeids = nodeids;
++
++ add_event(us, ev);
++ user_notify(us);
++ out:
++ up(&us->lock);
++ return 0;
++}
++
++static void user_finish(void *servicedata, int event_id)
++{
++ user_service_t *us = (user_service_t *) servicedata;
++ event_t *ev;
++
++ down(&us->lock);
++ if (!us->sock)
++ goto out;
++
++ ev = alloc_event();
++ ev->type = SERVICE_EVENT_FINISH;
++ ev->event_id = event_id;
++
++ add_event(us, ev);
++ user_notify(us);
++ out:
++ up(&us->lock);
++}
++
++struct kcl_service_ops user_service_ops = {
++ .stop = user_stop,
++ .start = user_start,
++ .finish = user_finish
++};
++
++static int user_register(char *name, user_service_t **us_data)
++{
++ user_service_t *us;
++ int len = strlen(name);
++ int error;
++
++ if (len > MAX_SERVICE_NAME_LEN - 1)
++ return -ENAMETOOLONG;
++ if (!len)
++ return -EINVAL;
++
++ us = kmalloc(sizeof(user_service_t), GFP_KERNEL);
++ if (!us)
++ return -ENOMEM;
++ memset(us, 0, sizeof(user_service_t));
++ us->nodeids = NULL;
++ INIT_LIST_HEAD(&us->events);
++ spin_lock_init(&us->event_lock);
++ init_MUTEX(&us->lock);
++ us->name_len = len;
++ memcpy(us->name, name, len);
++
++ error = kcl_register_service(name, len, SERVICE_LEVEL_USER,
++ &user_service_ops, TRUE, (void *) us,
++ &us->local_id);
++ if (error) {
++ kfree(us);
++ us = NULL;
++ }
++ *us_data = us;
++ return error;
++}
++
++static void user_unregister(user_service_t *us)
++{
++ event_t *ev;
++
++ kcl_unregister_service(us->local_id);
++
++ if (us->nodeids)
++ kfree(us->nodeids);
++
++ while ((ev = get_event(us))) {
++ del_event(us, ev);
++ if (ev->nodeids)
++ kfree(ev->nodeids);
++ kfree(ev);
++ }
++}
++
++static int user_join_async(void *arg)
++{
++ user_service_t *us = arg;
++ int user_gone = 0;
++
++ daemonize("cman_userjoin");
++
++ kcl_join_service(us->local_id);
++
++ down(&us->lock);
++ us->state = UST_JOINED;
++ us->async = 0;
++ if (!us->sock) {
++ if (us->need_startdone)
++ kcl_start_done(us->local_id, us->need_startdone);
++ user_gone = 1;
++ }
++ up(&us->lock);
++
++ if (user_gone) {
++ kcl_leave_service(us->local_id);
++ user_unregister(us);
++ kfree(us);
++ }
++ return 0;
++}
++
++static int user_leave_async(void *arg)
++{
++ user_service_t *us = arg;
++
++ daemonize("cman_userleave");
++
++ kcl_leave_service(us->local_id);
++
++ down(&us->lock);
++ us->async = 0;
++ if (!us->sock) {
++ user_unregister(us);
++ kfree(us);
++ } else {
++ event_t *ev = alloc_event();
++ ev->type = SERVICE_EVENT_LEAVEDONE;
++ add_event(us, ev);
++ user_notify(us);
++ up(&us->lock);
++ }
++
++ return 0;
++}
++
++static int user_join(user_service_t *us, int wait)
++{
++ int error = 0;
++
++ if (wait) {
++ error = kcl_join_service(us->local_id);
++ us->state = UST_JOINED;
++ }
++ else {
++ us->async = 1;
++ kernel_thread(user_join_async, us, 0);
++ }
++
++ return error;
++}
++
++static void user_leave(user_service_t *us, int wait)
++{
++ if (wait)
++ kcl_leave_service(us->local_id);
++ else {
++ us->async = 1;
++ kernel_thread(user_leave_async, us, 0);
++ }
++}
++
++static int user_start_done(user_service_t *us, unsigned int event_id)
++{
++ if (!us->need_startdone)
++ return -EINVAL;
++ if (us->need_startdone == event_id)
++ us->need_startdone = 0;
++ kcl_start_done(us->local_id, event_id);
++ return 0;
++}
++
++static void user_set_signal(user_service_t *us, int signal)
++{
++ us->pid = current->pid;
++ us->signal = signal;
++}
++
++static int user_get_event(user_service_t *us,
++ struct cl_service_event *user_event)
++{
++ event_t *ev;
++ struct cl_service_event event;
++
++ ev = get_event(us);
++ if (!ev)
++ return 0;
++
++ event.type = ev->type;
++ event.start_type = ev->start_type;
++ event.event_id = ev->event_id;
++ event.last_stop = ev->last_stop;
++ event.last_start = ev->last_start;
++ event.last_finish = ev->last_finish;
++ event.node_count = ev->node_count;
++
++ if (copy_to_user(user_event, &event, sizeof(struct cl_service_event)))
++ return -EFAULT;
++
++ del_event(us, ev);
++
++ if (ev->type == SERVICE_EVENT_START) {
++ if (us->nodeids)
++ kfree(us->nodeids);
++ us->nodeids = ev->nodeids;
++ us->node_count = ev->node_count;
++ }
++
++ kfree(ev);
++ return 1;
++}
++
++static int user_get_members(user_service_t *us,
++ struct cl_cluster_nodelist *u_nodelist)
++{
++ struct cl_cluster_nodelist user_nodelist;
++ struct cl_cluster_node user_node, *u_node;
++ struct cluster_node *node;
++ unsigned int i;
++ int num_nodes = 0;
++
++ if (!u_nodelist)
++ return us->node_count;
++
++ if (copy_from_user(&user_nodelist, (void __user *) u_nodelist,
++ sizeof(struct cl_cluster_nodelist)))
++ return -EFAULT;
++
++ if (user_nodelist.max_members < us->node_count)
++ return -E2BIG;
++
++ u_node = user_nodelist.nodes;
++
++ for (i = 0; i < us->node_count; i++) {
++ node = find_node_by_nodeid(us->nodeids[i]);
++ if (!node)
++ continue;
++
++ copy_to_usernode(node, &user_node);
++ if (copy_to_user(u_node, &user_node,
++ sizeof(struct cl_cluster_node)))
++ return -EFAULT;
++
++ u_node++;
++ num_nodes++;
++ }
++ return num_nodes;
++}
++
++static int user_global_id(user_service_t *us, uint32_t *id)
++{
++ uint32_t gid = 0;
++
++ if (us->state != UST_JOINED)
++ return -EINVAL;
++
++ kcl_global_service_id(us->local_id, &gid);
++
++ if (copy_to_user(id, &gid, sizeof(uint32_t)))
++ return -EFAULT;
++ return 0;
++}
++
++static int user_set_level(user_service_t *us, int level)
++{
++ int prev_id = us->local_id;
++ int error;
++
++ if (us->state != UST_REGISTER)
++ return -EINVAL;
++
++ error = kcl_register_service(us->name, us->name_len, level,
++ &user_service_ops, TRUE, (void *) us,
++ &us->local_id);
++ if (error)
++ return error;
++
++ kcl_unregister_service(prev_id);
++ return 0;
++}
++
++int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
++{
++ struct cluster_sock *c = cluster_sk(sock->sk);
++ user_service_t *us = c->service_data;
++ int error = 0;
++
++ if (!us && cmd != SIOCCLUSTER_SERVICE_REGISTER)
++ return -EINVAL;
++
++ switch (cmd) {
++ case SIOCCLUSTER_SERVICE_REGISTER:
++ error = user_register((char *) arg, &us);
++ if (!error) {
++ us->state = UST_REGISTER;
++ us->sock = sock;
++ c->service_data = us;
++ }
++ break;
++
++ case SIOCCLUSTER_SERVICE_UNREGISTER:
++ down(&us->lock);
++ us->state = UST_UNREGISTER;
++ user_unregister(us);
++ up(&us->lock);
++ break;
++
++ case SIOCCLUSTER_SERVICE_JOIN:
++ us->state = UST_JOIN;
++ user_join(us, 0);
++ break;
++
++ case SIOCCLUSTER_SERVICE_LEAVE:
++ down(&us->lock);
++ if (us->state != UST_JOINED) {
++ error = -EBUSY;
++ up(&us->lock);
++ } else {
++ us->state = UST_LEAVE;
++ up(&us->lock);
++ user_leave(us, 0);
++ }
++ break;
++
++ case SIOCCLUSTER_SERVICE_SETSIGNAL:
++ user_set_signal(us, (int) arg);
++ break;
++
++ case SIOCCLUSTER_SERVICE_STARTDONE:
++ error = user_start_done(us, (unsigned int) arg);
++ break;
++
++ case SIOCCLUSTER_SERVICE_GETEVENT:
++ error = user_get_event(us, (struct cl_service_event *) arg);
++ break;
++
++ case SIOCCLUSTER_SERVICE_GETMEMBERS:
++ error = user_get_members(us, (struct cl_cluster_nodelist *)arg);
++ break;
++
++ case SIOCCLUSTER_SERVICE_GLOBALID:
++ error = user_global_id(us, (uint32_t *) arg);
++ break;
++
++ case SIOCCLUSTER_SERVICE_SETLEVEL:
++ error = user_set_level(us, (int) arg);
++ break;
++
++ default:
++ error = -EINVAL;
++ }
++
++ return error;
++}
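++
++/*
++ * For illustration, the userland sequence these ioctls are designed for
++ * looks roughly like this. This is a sketch: the exact socket-type
++ * arguments and all error handling are assumptions, not part of this
++ * file.
++ *
++ *	int fd = socket(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT);
++ *
++ *	ioctl(fd, SIOCCLUSTER_SERVICE_REGISTER, "example");
++ *	ioctl(fd, SIOCCLUSTER_SERVICE_JOIN, 0);
++ *
++ *	for (;;) {
++ *		struct cl_service_event ev;
++ *
++ *		if (ioctl(fd, SIOCCLUSTER_SERVICE_GETEVENT, &ev) <= 0)
++ *			continue;
++ *		if (ev.type == SERVICE_EVENT_START)
++ *			ioctl(fd, SIOCCLUSTER_SERVICE_STARTDONE, ev.event_id);
++ *	}
++ *
++ * GETEVENT returns 0 when no event is queued, so a real client would wait
++ * for the CLUSTER_OOB_MSG_SERVICEEVENT notification or a signal rather
++ * than spin.
++ */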
++
++void sm_sock_release(struct socket *sock)
++{
++ struct cluster_sock *c = cluster_sk(sock->sk);
++ user_service_t *us = c->service_data;
++ int state;
++
++ if (!us)
++ return;
++
++ down(&us->lock);
++ us->sock = NULL;
++ c->service_data = NULL;
++
++ if (us->need_startdone)
++ kcl_start_done(us->local_id, us->need_startdone);
++
++ if (us->async) {
++ /* async thread will clean up before exiting */
++ up(&us->lock);
++ return;
++ }
++ state = us->state;
++ up(&us->lock);
++
++ switch (state) {
++ case UST_JOIN:
++ break;
++ case UST_JOINED:
++ user_leave(us, 1);
++ /* fall through */
++ case UST_LEAVE:
++ case UST_REGISTER:
++ user_unregister(us);
++ /* fall through */
++ case UST_UNREGISTER:
++ kfree(us);
++ break;
++ }
++}
+diff -urN linux-orig/cluster/cman/sm_user.h linux-patched/cluster/cman/sm_user.h
+--- linux-orig/cluster/cman/sm_user.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/cman/sm_user.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,21 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SM_USER_DOT_H__
++#define __SM_USER_DOT_H__
++
++int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
++void sm_sock_release(struct socket *sock);
++void sm_sock_bind(struct socket *sock);
++
++#endif
+diff -urN linux-orig/include/cluster/cnxman-socket.h linux-patched/include/cluster/cnxman-socket.h
+--- linux-orig/include/cluster/cnxman-socket.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/include/cluster/cnxman-socket.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,226 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* CMAN socket interface header,
++ may be included by user or kernel code */
++
++#ifndef __CNXMAN_SOCKET_H
++#define __CNXMAN_SOCKET_H
++
++/* Just made these up but the address family must be less than 32 (NPROTO) */
++#define AF_CLUSTER 31
++#define PF_CLUSTER AF_CLUSTER
++
++/* Protocol(socket) types */
++#define CLPROTO_MASTER 2
++#define CLPROTO_CLIENT 3
++
++/* Setsockopt -- maybe should be ioctls?? */
++#define CLU_SET_MULTICAST 100
++#define CLU_JOIN_CLUSTER 101
++#define CLU_LEAVE_CLUSTER 102
++#define CLU_SET_RCVONLY 103
++#define CLU_SET_UNICAST 104
++#define KCL_SET_MULTICAST 105
++#define KCL_SET_RCVONLY 106
++#define KCL_SET_UNICAST 107
++#define KCL_SET_NODENAME 108
++#define CLU_SET_NODENAME 109
++
++/* ioctls -- should register these properly */
++#define SIOCCLUSTER_NOTIFY _IOW('x', 0x01, int)
++#define SIOCCLUSTER_REMOVENOTIFY _IO( 'x', 0x02)
++#define SIOCCLUSTER_GETMEMBERS _IOR('x', 0x03, struct cl_cluster_nodelist)
++#define SIOCCLUSTER_SETEXPECTED_VOTES _IOW('x', 0x04, int)
++#define SIOCCLUSTER_ISQUORATE _IO( 'x', 0x05)
++#define SIOCCLUSTER_ISLISTENING _IOW('x', 0x06, struct cl_listen_request)
++#define SIOCCLUSTER_GETALLMEMBERS _IOR('x', 0x07, struct cl_cluster_nodelist)
++#define SIOCCLUSTER_SET_VOTES _IOW('x', 0x08, int)
++#define SIOCCLUSTER_GET_VERSION _IOR('x', 0x09, struct cl_version)
++#define SIOCCLUSTER_SET_VERSION _IOW('x', 0x0a, struct cl_version)
++#define SIOCCLUSTER_ISACTIVE _IO( 'x', 0x0b)
++#define SIOCCLUSTER_KILLNODE _IOW('x', 0x0c, int)
++#define SIOCCLUSTER_GET_JOINCOUNT _IO( 'x', 0x0d)
++#define SIOCCLUSTER_SERVICE_REGISTER _IOW('x', 0x0e, char)
++#define SIOCCLUSTER_SERVICE_UNREGISTER _IO('x', 0x0f)
++#define SIOCCLUSTER_SERVICE_JOIN _IO( 'x', 0x10)
++#define SIOCCLUSTER_SERVICE_LEAVE _IO( 'x', 0x20)
++#define SIOCCLUSTER_SERVICE_SETSIGNAL _IOW('x', 0x30, int)
++#define SIOCCLUSTER_SERVICE_STARTDONE _IOW('x', 0x40, unsigned int)
++#define SIOCCLUSTER_SERVICE_GETEVENT _IOR('x', 0x50, struct cl_service_event)
++#define SIOCCLUSTER_SERVICE_GETMEMBERS _IOR('x', 0x60, struct cl_cluster_nodelist)
++#define SIOCCLUSTER_SERVICE_GLOBALID _IOR('x', 0x70, uint32_t)
++#define SIOCCLUSTER_SERVICE_SETLEVEL _IOR('x', 0x80, int)
++#define SIOCCLUSTER_GETNODE _IOWR('x', 0x90, struct cl_cluster_node)
++#define SIOCCLUSTER_BARRIER _IOW('x', 0x0a0, struct cl_barrier_info)
++
++/* Maximum size of a cluster message */
++#define MAX_CLUSTER_MESSAGE 1500
++#define MAX_CLUSTER_MEMBER_NAME_LEN 255
++#define MAX_BARRIER_NAME_LEN 33
++#define MAX_SA_ADDR_LEN 12
++#define MAX_CLUSTER_NAME_LEN 16
++
++/* Well-known cluster port numbers */
++#define CLUSTER_PORT_MEMBERSHIP 1 /* Mustn't block during cluster
++ * transitions! */
++#define CLUSTER_PORT_SERVICES 2
++#define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */
++#define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */
++#define CLUSTER_PORT_SLM 12 /* LVM SLM (simple lock manager) */
++
++/* Port numbers above this will be blocked when the cluster is inquorate or in
++ * transition */
++#define HIGH_PROTECTED_PORT 9
++
++/* Reasons for leaving the cluster */
++#define CLUSTER_LEAVEFLAG_DOWN 0 /* Normal shutdown */
++#define CLUSTER_LEAVEFLAG_KILLED 1
++#define CLUSTER_LEAVEFLAG_PANIC 2
++#define CLUSTER_LEAVEFLAG_REMOVED 3 /* This one can reduce quorum */
++#define CLUSTER_LEAVEFLAG_REJECTED 4 /* Not allowed into the cluster in the
++ * first place */
++#define CLUSTER_LEAVEFLAG_INCONSISTENT 5 /* Our view of the cluster is
++ * in a minority */
++#define CLUSTER_LEAVEFLAG_DEAD 6 /* Discovered to be dead */
++#define CLUSTER_LEAVEFLAG_FORCE 0x10 /* Forced by command-line */
++
++/* OOB messages sent to a local socket */
++#define CLUSTER_OOB_MSG_PORTCLOSED 1
++#define CLUSTER_OOB_MSG_STATECHANGE 2
++#define CLUSTER_OOB_MSG_SERVICEEVENT 3
++
++/* Sendmsg flags, these are above the normal sendmsg flags so they don't
++ * interfere */
++#define MSG_NOACK 0x010000 /* Don't need an ACK for this message */
++#define MSG_QUEUE 0x020000 /* Queue the message for sending later */
++#define MSG_MULTICAST 0x080000 /* Message was sent to all nodes in the cluster
++ */
++#define MSG_ALLINT 0x100000 /* Send out of all interfaces */
++
++typedef enum { NODESTATE_REMOTEMEMBER, NODESTATE_JOINING, NODESTATE_MEMBER,
++ NODESTATE_DEAD } nodestate_t;
++
++
++struct sockaddr_cl {
++ unsigned short scl_family;
++ unsigned char scl_flags;
++ unsigned char scl_port;
++ int scl_nodeid;
++};
++
++/* This is how we pass the multicast socket into kernel space. addr is the
++ * multicast address to use in the address family of the socket (eg for UDP it
++ * might be a broadcast or multicast address such as 255.255.255.255) */
++struct cl_multicast_sock {
++ int fd; /* FD of master socket to do multicast on */
++ int number; /* Socket number, to match up recvonly & bcast
++ * sockets */
++};
++
++/* Cluster configuration info passed when we join the cluster */
++struct cl_join_cluster_info {
++ unsigned char votes;
++ unsigned int expected_votes;
++ unsigned int two_node;
++ unsigned int config_version;
++
++ char cluster_name[17];
++};
++
++
++/* This is the structure, per node, returned from the membership ioctl */
++struct cl_cluster_node {
++ unsigned int size;
++ unsigned int node_id;
++ unsigned int us;
++ unsigned int leave_reason;
++ unsigned int incarnation;
++ nodestate_t state;
++ char name[MAX_CLUSTER_MEMBER_NAME_LEN];
++ unsigned char votes;
++};
++
++/* The struct passed to the membership ioctls */
++struct cl_cluster_nodelist {
++ uint32_t max_members;
++ struct cl_cluster_node *nodes;
++};
++
++/* Structure passed to SIOCCLUSTER_ISLISTENING */
++struct cl_listen_request {
++ unsigned char port;
++ int nodeid;
++};
++
++/* A Cluster PORTCLOSED message - received by a local user as an OOB message */
++struct cl_portclosed_oob {
++ unsigned char cmd; /* CLUSTER_OOB_MSG_PORTCLOSED */
++ unsigned char port;
++};
++
++/* Get all version numbers or set the config version */
++struct cl_version {
++ unsigned int major;
++ unsigned int minor;
++ unsigned int patch;
++ unsigned int config;
++};
++
++/* structure passed to barrier ioctls */
++struct cl_barrier_info {
++ char cmd;
++ char name[MAX_BARRIER_NAME_LEN];
++ unsigned int flags;
++ unsigned long arg;
++};
++
++typedef enum { SERVICE_EVENT_STOP, SERVICE_EVENT_START, SERVICE_EVENT_FINISH,
++ SERVICE_EVENT_LEAVEDONE } service_event_t;
++
++typedef enum { SERVICE_START_FAILED, SERVICE_START_JOIN, SERVICE_START_LEAVE }
++ service_start_t;
++
++struct cl_service_event {
++ service_event_t type;
++ service_start_t start_type;
++ unsigned int event_id;
++ unsigned int last_stop;
++ unsigned int last_start;
++ unsigned int last_finish;
++ unsigned int node_count;
++};
++
++
++/* Commands to the barrier ioctl */
++#define BARRIER_IOCTL_REGISTER 1
++#define BARRIER_IOCTL_CHANGE 2
++#define BARRIER_IOCTL_DELETE 3
++#define BARRIER_IOCTL_WAIT 4
++
++/* Attributes of a barrier - bitmask */
++#define BARRIER_ATTR_AUTODELETE 1
++#define BARRIER_ATTR_MULTISTEP 2
++#define BARRIER_ATTR_MANUAL 4
++#define BARRIER_ATTR_ENABLED 8
++#define BARRIER_ATTR_CALLBACK 16
++
++/* Attribute setting commands */
++#define BARRIER_SETATTR_AUTODELETE 1
++#define BARRIER_SETATTR_MULTISTEP 2
++#define BARRIER_SETATTR_ENABLED 3
++#define BARRIER_SETATTR_NODES 4
++#define BARRIER_SETATTR_CALLBACK 5
++#define BARRIER_SETATTR_TIMEOUT 6
++
++#endif
+diff -urN linux-orig/include/cluster/cnxman.h linux-patched/include/cluster/cnxman.h
+--- linux-orig/include/cluster/cnxman.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/include/cluster/cnxman.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,87 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __CNXMAN_H
++#define __CNXMAN_H
++
++#include "linux/in6.h"
++#include "cluster/cnxman-socket.h"
++
++/* In-kernel API */
++
++/* This is the structure, per node, returned from the membership request */
++struct kcl_cluster_node {
++ unsigned int size;
++ unsigned int node_id;
++ unsigned int us;
++ unsigned int leave_reason;
++ unsigned int incarnation;
++ nodestate_t state;
++ struct list_head list;
++ char name[MAX_CLUSTER_MEMBER_NAME_LEN];
++ unsigned char votes;
++};
++
++struct cluster_node_addr {
++ struct list_head list;
++ unsigned char addr[sizeof(struct sockaddr_in6)];/* A large sockaddr */
++ int addr_len;
++};
++
++
++/* Reasons for a kernel membership callback */
++typedef enum { CLUSTER_RECONFIG, DIED, LEAVING, NEWNODE } kcl_callback_reason;
++
++/* Kernel version of above, the void *sock is a struct socket */
++struct kcl_multicast_sock {
++ void *sock;
++ int number; /* Socket number, to match up recvonly & bcast
++ * sockets */
++};
++
++extern int kcl_sendmsg(struct socket *sock, void *buf, int size,
++ struct sockaddr_cl *caddr, int addr_len,
++ unsigned int flags);
++extern int kcl_register_read_callback(struct socket *sock,
++ int (*routine) (char *, int, char *, int,
++ unsigned int));
++extern int kcl_add_callback(void (*callback) (kcl_callback_reason, long));
++extern int kcl_remove_callback(void (*callback) (kcl_callback_reason, long));
++extern int kcl_get_members(struct list_head *list);
++extern int kcl_get_member_ids(uint32_t * idbuf, int size);
++extern int kcl_get_all_members(struct list_head *list);
++extern int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
++ struct kcl_cluster_node *n);
++extern int kcl_get_node_by_name(unsigned char *name,
++ struct kcl_cluster_node *n);
++extern int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n);
++extern int kcl_is_quorate(void);
++extern int kcl_addref_cluster(void);
++extern int kcl_releaseref_cluster(void);
++extern int kcl_cluster_name(char **cname);
++extern int kcl_get_current_interface(void);
++extern struct list_head *kcl_get_node_addresses(int nodeid);
++
++extern int kcl_barrier_register(char *name, unsigned int flags,
++ unsigned int nodes);
++extern int kcl_barrier_setattr(char *name, unsigned int attr,
++ unsigned long arg);
++extern int kcl_barrier_delete(char *name);
++extern int kcl_barrier_wait(char *name);
++extern int kcl_barrier_cancel(char *name);
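++
++/*
++ * For illustration, a simple synchronous rendezvous of two nodes using the
++ * barrier calls above might look like this (a sketch; the flag value and
++ * the blocking behaviour of kcl_barrier_wait() are assumptions):
++ *
++ *	kcl_barrier_register("example-barrier", 0, 2);
++ *	kcl_barrier_wait("example-barrier");
++ *	kcl_barrier_delete("example-barrier");
++ */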
++
++extern int kcl_register_quorum_device(char *name, int votes);
++extern int kcl_unregister_quorum_device(void);
++extern int kcl_quorum_device_available(int yesno);
++
++#endif
+diff -urN linux-orig/include/cluster/service.h linux-patched/include/cluster/service.h
+--- linux-orig/include/cluster/service.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/include/cluster/service.h 2004-06-25 10:15:13.000000000 +0800
+@@ -0,0 +1,102 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SERVICE_DOT_H__
++#define __SERVICE_DOT_H__
++
++/*
++ * Interface between service manager and services
++ */
++
++/*
++ * Service levels are started in order from lowest, so level 0 is started on
++ * all nodes before level 1 is started.
++ */
++
++#define SERVICE_LEVEL_FENCE (0)
++#define SERVICE_LEVEL_GDLM (1)
++#define SERVICE_LEVEL_GFS (2)
++#define SERVICE_LEVEL_USER (3)
++
++#define MAX_SERVICE_NAME_LEN (33)
++
++/*
++ * The type of start a service receives. The start (and preceding stop) may be
++ * due to a node joining or leaving the SG or due to a node having failed.
++ */
++
++#define SERVICE_NODE_FAILED (1)
++#define SERVICE_NODE_JOIN (2)
++#define SERVICE_NODE_LEAVE (3)
++
++
++struct kcl_service {
++ struct list_head list;
++ uint16_t level;
++ uint32_t local_id;
++ uint32_t global_id;
++ int node_count;
++ char name[MAX_SERVICE_NAME_LEN];
++};
++
++int kcl_get_services(struct list_head *list, int level);
++
++
++/*
++ * These routines which run in CMAN context must return quickly and cannot
++ * block.
++ */
++
++struct kcl_service_ops {
++ int (*stop) (void *servicedata);
++ int (*start) (void *servicedata, uint32_t *nodeids, int count,
++ int event_id, int type);
++ void (*finish) (void *servicedata, int event_id);
++};
++
++/*
++ * Register will cause CMAN to create a Service Group (SG) for the named
++ * instance of the service. A local ID is returned which is used to join,
++ * leave and unregister the service.
++ */
++
++int kcl_register_service(char *name, int namelen, int level,
++ struct kcl_service_ops *ops, int unique,
++ void *servicedata, uint32_t *local_id);
++
++void kcl_unregister_service(uint32_t local_id);
++
++/*
++ * Once a service is joined it will be managed by CMAN and receive start, stop,
++ * and finish calls. After leave is called the service is no longer managed by
++ * CMAN. The first start for a service may arrive before kcl_join_service()
++ * returns.
++ */
++
++int kcl_join_service(uint32_t local_id);
++int kcl_leave_service(uint32_t local_id);
++
++/*
++ * After a service is started, it can ask for its cluster-wide unique ID.
++ */
++
++void kcl_global_service_id(uint32_t local_id, uint32_t * global_id);
++
++/*
++ * Called by a service when it's done with a start(). Cannot be called from
++ * the start function.
++ */
++
++void kcl_start_done(uint32_t local_id, int event_id);
++
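++/*
++ * For illustration, the lifecycle implied by the calls above, as a sketch
++ * ("example", example_ops and data are invented; error handling elided):
++ *
++ *	uint32_t id;
++ *
++ *	kcl_register_service("example", 7, SERVICE_LEVEL_USER,
++ *			     &example_ops, TRUE, data, &id);
++ *	kcl_join_service(id);
++ *	...
++ *	kcl_leave_service(id);
++ *	kcl_unregister_service(id);
++ *
++ * Between join and leave the stop/start/finish ops arrive; the first
++ * start may arrive before kcl_join_service() returns.
++ */
++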
++#endif
--- /dev/null
+# Add DLM to the build system
+diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig
+--- linux-2.6.7/cluster/Kconfig 2004-06-17 15:00:36.000000000 +0800
++++ linux/cluster/Kconfig 2004-06-17 15:00:57.000000000 +0800
+@@ -10,4 +10,22 @@ config CLUSTER
+ needed by all the other components. It provides membership services
+ for those other subsystems.
+
++config CLUSTER_DLM
++ tristate "Distributed Lock Manager"
++ depends on CLUSTER
++ ---help---
++ A fully distributed lock manager, providing cluster-wide locking services
++ and protected lock namespaces for kernel and userland applications.
++
++config CLUSTER_DLM_PROCLOCKS
++ boolean "/proc/locks support for DLM"
++ depends on CLUSTER_DLM
++ depends on PROC_FS
++ ---help---
++ If this option is enabled, a file will appear at /proc/cluster/dlm_locks.
++ Write the name of a lockspace known to the DLM into this "file", then
++ read back a list of all the resources and locks in that lockspace that
++ are known to the local node. Note that because the DLM is distributed,
++ this may not be the full lock picture.
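++
++ For example, assuming a lockspace named "myls" exists:
++
++ echo myls > /proc/cluster/dlm_locks
++ cat /proc/cluster/dlm_locks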
++
+ endmenu
+diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile
+--- linux-2.6.7/cluster/Makefile 2004-06-17 15:00:36.000000000 +0800
++++ linux/cluster/Makefile 2004-06-17 15:00:57.000000000 +0800
+@@ -1,3 +1,4 @@
+ obj-y := nocluster.o
+
+ obj-$(CONFIG_CLUSTER) += cman/
++obj-$(CONFIG_CLUSTER_DLM) += dlm/
+diff -urN -p linux-2.6.7/cluster/dlm/Makefile linux/cluster/dlm/Makefile
+--- linux-2.6.7/cluster/dlm/Makefile 1970-01-01 07:30:00.000000000 +0730
++++ linux/cluster/dlm/Makefile 2004-06-17 15:00:57.000000000 +0800
+@@ -0,0 +1,23 @@
++dlm-objs := ast.o \
++ config.o \
++ device.o \
++ dir.o \
++ lkb.o \
++ locking.o \
++ lockqueue.o \
++ lockspace.o \
++ lowcomms.o \
++ main.o \
++ memory.o \
++ midcomms.o \
++ nodes.o \
++ proc.o \
++ queries.o \
++ rebuild.o \
++ reccomms.o \
++ recover.o \
++ recoverd.o \
++ rsb.o \
++ util.o
++
++obj-$(CONFIG_CLUSTER_DLM) += dlm.o
+diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
+--- linux-orig/cluster/dlm/ast.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/ast.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,581 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * This delivers ASTs and checks for dead remote requests and deadlocks.
++ */
++
++#include <linux/timer.h>
++
++#include "dlm_internal.h"
++#include "rsb.h"
++#include "lockqueue.h"
++#include "dir.h"
++#include "locking.h"
++#include "lkb.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "ast.h"
++#include "nodes.h"
++#include "config.h"
++
++/* Wake up flags for astd */
++#define GDLMD_WAKE_ASTS 1
++#define GDLMD_WAKE_TIMER 2
++
++static struct list_head _deadlockqueue;
++static struct semaphore _deadlockqueue_lock;
++static struct list_head _lockqueue;
++static struct semaphore _lockqueue_lock;
++static struct timer_list _lockqueue_timer;
++static struct list_head _ast_queue;
++static struct semaphore _ast_queue_lock;
++static wait_queue_head_t _astd_waitchan;
++static atomic_t _astd_running;
++static long _astd_pid;
++static unsigned long _astd_wakeflags;
++static struct completion _astd_done;
++
++void add_to_lockqueue(gd_lkb_t *lkb)
++{
++ /* Time stamp the entry so we know if it's been waiting too long */
++ lkb->lkb_lockqueue_time = jiffies;
++
++ down(&_lockqueue_lock);
++ list_add(&lkb->lkb_lockqueue, &_lockqueue);
++ up(&_lockqueue_lock);
++}
++
++void remove_from_lockqueue(gd_lkb_t *lkb)
++{
++ down(&_lockqueue_lock);
++ list_del(&lkb->lkb_lockqueue);
++ up(&_lockqueue_lock);
++}
++
++void add_to_deadlockqueue(gd_lkb_t *lkb)
++{
++ if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
++ return;
++ lkb->lkb_duetime = jiffies;
++ down(&_deadlockqueue_lock);
++ list_add(&lkb->lkb_deadlockq, &_deadlockqueue);
++ up(&_deadlockqueue_lock);
++}
++
++void remove_from_deadlockqueue(gd_lkb_t *lkb)
++{
++ if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
++ return;
++
++ down(&_deadlockqueue_lock);
++ list_del(&lkb->lkb_deadlockq);
++ up(&_deadlockqueue_lock);
++
++ /* Invalidate the due time */
++ memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime));
++}
++
++void remove_from_astqueue(gd_lkb_t *lkb)
++{
++ down(&_ast_queue_lock);
++ if (lkb->lkb_asts_to_deliver)
++ list_del(&lkb->lkb_astqueue);
++ lkb->lkb_asts_to_deliver = 0;
++ up(&_ast_queue_lock);
++}
++
++/*
++ * Actually deliver an AST to a user. The caller MUST hold the ast queue lock
++ * and we unlock it for the duration of the user call, otherwise things can
++ * deadlock.
++ */
++
++static void deliver_ast(gd_lkb_t *lkb, gd_ast_type_t astt)
++{
++ void (*cast) (long param) = lkb->lkb_astaddr;
++ void (*bast) (long param, int mode) = lkb->lkb_bastaddr;
++
++ up(&_ast_queue_lock);
++
++ if (cast && (astt == GDLM_QUEUE_COMPAST))
++ cast(lkb->lkb_astparam);
++
++ else if (bast && (astt == GDLM_QUEUE_BLKAST)
++ && (lkb->lkb_status == GDLM_LKSTS_GRANTED))
++ bast(lkb->lkb_astparam, (int) lkb->lkb_bastmode);
++
++ /*
++ * Remove LKB if requested. It is up to the caller to remove the LKB
++ * from any resource queue it may be on.
++ *
++ * NOTE: we check lkb_asts_to_deliver here in case an ast for us was
++ * queued during the AST delivery itself (eg a user called dlm_unlock
++ * in the AST routine!)
++ */
++
++ if (lkb->lkb_flags & GDLM_LKFLG_DELAST && astt == GDLM_QUEUE_COMPAST &&
++ lkb->lkb_asts_to_deliver == 0) {
++ gd_res_t *rsb = lkb->lkb_resource;
++ struct rw_semaphore *in_recovery = &rsb->res_ls->ls_in_recovery;
++
++ down_read(in_recovery);
++ release_lkb(rsb->res_ls, lkb);
++ release_rsb(rsb);
++ up_read(in_recovery);
++ }
++
++ /* This queue can get very big so we schedule here to give the rest of
++ * the cluster a chance to do some work. */
++ schedule();
++
++ down(&_ast_queue_lock);
++}
++
++/*
++ * Queue an AST for delivery; this only deals with
++ * kernel ASTs, the usermode API piggybacks on top of this.
++ *
++ * This can be called in either the user or DLM context.
++ * ASTs are queued EVEN IF we are already running in gdlm_astd
++ * context as we don't know what other locks are held (eg we could
++ * be called from a lock operation that was itself called from
++ * another AST!)
++ * If the AST is to be queued remotely then a message is sent to
++ * the target system via midcomms.
++ */
++
++void queue_ast(gd_lkb_t *lkb, gd_ast_type_t astt, uint8_t rqmode)
++{
++ struct gd_remlockrequest req;
++
++ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
++ /*
++ * Send a message to have an ast queued remotely. Note: we do
++ * not send remote completion asts, they are handled as part of
++ * remote lock granting.
++ */
++
++ if (astt == GDLM_QUEUE_BLKAST) {
++ req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST;
++ req.rr_header.rh_length = sizeof(req);
++ req.rr_header.rh_flags = 0;
++ req.rr_header.rh_lkid = lkb->lkb_id;
++ req.rr_header.rh_lockspace =
++ lkb->lkb_resource->res_ls->ls_global_id;
++ req.rr_status = lkb->lkb_retstatus;
++ req.rr_remlkid = lkb->lkb_remid;
++ req.rr_rqmode = rqmode;
++
++ midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
++ lkb->lkb_resource->res_ls->ls_allocation);
++
++ } else if (lkb->lkb_retstatus == -EDEADLOCK) {
++ /*
++ * We only queue remote Completion ASTs here for error
++ * completions that happen out of band.
++ * DEADLOCK is one such.
++ */
++
++ req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST;
++ req.rr_header.rh_length = sizeof(req);
++ req.rr_header.rh_flags = 0;
++ req.rr_header.rh_lkid = lkb->lkb_id;
++ req.rr_header.rh_lockspace =
++ lkb->lkb_resource->res_ls->ls_global_id;
++ req.rr_status = lkb->lkb_retstatus;
++ req.rr_remlkid = lkb->lkb_remid;
++ req.rr_rqmode = rqmode;
++
++ midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
++ lkb->lkb_resource->res_ls->ls_allocation);
++ }
++ } else {
++ /*
++ * Prepare info which will be returned in ast/bast.
++ */
++
++ if (astt == GDLM_QUEUE_BLKAST) {
++ lkb->lkb_bastmode = rqmode;
++ } else {
++ lkb->lkb_lksb->sb_status = lkb->lkb_retstatus;
++
++ if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
++ lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED;
++ else
++ lkb->lkb_lksb->sb_flags = 0;
++ }
++
++ /*
++ * Queue ast/bast or deliver directly. astd can deliver ASTs
++ * during deadlock detection or lock timeouts.
++ */
++
++ down(&_ast_queue_lock);
++
++ if (!lkb->lkb_asts_to_deliver)
++ list_add_tail(&lkb->lkb_astqueue, &_ast_queue);
++ lkb->lkb_asts_to_deliver |= astt;
++
++ up(&_ast_queue_lock);
++
++ /* It is the responsibility of the caller to call wake_astd()
++ * once it has finished any other locking operations that
++ * queue ASTs, so that they are all delivered together */
++ }
++}
++
++/*
++ * Process any LKBs on the AST queue. They were queued by queue_ast().
++ */
++
++static void process_asts(void)
++{
++ gd_lkb_t *lkb;
++ uint32_t to_deliver;
++
++ down(&_ast_queue_lock);
++
++ while (!list_empty(&_ast_queue)) {
++ lkb = list_entry(_ast_queue.next, gd_lkb_t, lkb_astqueue);
++ /* Entries can be removed or re-queued as soon as
++ * _ast_queue_lock is released, so always pop the head. */
++
++ to_deliver = lkb->lkb_asts_to_deliver;
++ lkb->lkb_asts_to_deliver = 0;
++ list_del(&lkb->lkb_astqueue);
++
++ if ((to_deliver & GDLM_QUEUE_COMPAST))
++ deliver_ast(lkb, GDLM_QUEUE_COMPAST);
++
++ if ((to_deliver & GDLM_QUEUE_BLKAST))
++ deliver_ast(lkb, GDLM_QUEUE_BLKAST);
++ }
++ up(&_ast_queue_lock);
++}
++
++void lockqueue_lkb_mark(gd_ls_t *ls)
++{
++ gd_lkb_t *lkb, *safe;
++ int count = 0;
++
++ log_all(ls, "mark waiting requests");
++
++ down(&_lockqueue_lock);
++
++ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
++
++ if (lkb->lkb_resource->res_ls != ls)
++ continue;
++
++ /*
++ * These lkb's are new and the master is being looked up. Mark
++ * the lkb request to be resent. Even if the destination node
++ * for the request is still living and has our request, it will
++ * purge all resdir requests in purge_requestqueue. If there's
++ * a reply to the LOOKUP request in our requestqueue (the reply
++ * arrived after ls_stop), it is invalid and will be discarded
++ * in purge_requestqueue, too.
++ */
++
++ if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
++ GDLM_ASSERT(lkb->lkb_nodeid == -1,
++ log_error(ls, "nodeid=%d\n",
++ lkb->lkb_nodeid););
++
++ lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
++ count++;
++ continue;
++ }
++
++ /*
++ * These lkb's have an outstanding request to a bygone node.
++ * The request will be redirected to the new master node in
++ * resend_cluster_requests(). Don't mark the request for
++ * resending if there's a reply for it saved in the
++ * requestqueue.
++ */
++
++ if (in_nodes_gone(ls, lkb->lkb_nodeid) &&
++ !reply_in_requestqueue(ls, lkb->lkb_id)) {
++
++ lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
++
++ /*
++ * Don't rebuild this lkb on a new rsb in
++ * rebuild_rsbs_send().
++ */
++
++ if (lkb->lkb_lockqueue_state ==
++ GDLM_LQSTATE_WAIT_CONDGRANT) {
++ GDLM_ASSERT(lkb->lkb_status ==
++ GDLM_LKSTS_WAITING, );
++ lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
++ }
++
++ /*
++ * This flag indicates to the new master that his lkb
++ * is in the midst of a convert request and should be
++ * placed on the granted queue rather than the convert
++ * queue. We will resend this convert request to the
++ * new master.
++ */
++
++ else if (lkb->lkb_lockqueue_state ==
++ GDLM_LQSTATE_WAIT_CONVERT) {
++ GDLM_ASSERT(lkb->lkb_status ==
++ GDLM_LKSTS_CONVERT, );
++ lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT;
++ }
++
++ count++;
++ }
++ }
++ up(&_lockqueue_lock);
++
++ log_all(ls, "marked %d requests", count);
++}
++
++int resend_cluster_requests(gd_ls_t *ls)
++{
++ gd_lkb_t *lkb, *safe;
++ int error = 0, state, count = 0;
++
++ log_all(ls, "resend marked requests");
++
++ down(&_lockqueue_lock);
++
++ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
++
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
++ log_debug(ls, "resend_cluster_requests: aborted");
++ error = -EINTR;
++ break;
++ }
++
++ if (lkb->lkb_resource->res_ls != ls)
++ continue;
++
++ log_debug(ls, "resend_cluster_requests id=%x nodeid=%d "
++ "lqstate=%u flags=%x", lkb->lkb_id, lkb->lkb_nodeid,
++ lkb->lkb_lockqueue_state, lkb->lkb_flags);
++
++ /*
++ * Resend/process the lockqueue lkb's (in-progress requests)
++ * that were flagged at the start of recovery in
++ * lockqueue_lkb_mark().
++ */
++
++ if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) {
++ lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND;
++ lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD;
++ lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
++
++ if (lkb->lkb_nodeid == -1) {
++ /*
++ * Send lookup to new resdir node.
++ */
++ lkb->lkb_lockqueue_time = jiffies;
++ send_cluster_request(lkb,
++ lkb->lkb_lockqueue_state);
++ }
++
++ else if (lkb->lkb_nodeid != 0) {
++ /*
++ * There's a new RSB master (that's not us.)
++ */
++ lkb->lkb_lockqueue_time = jiffies;
++ send_cluster_request(lkb,
++ lkb->lkb_lockqueue_state);
++ }
++
++ else {
++ /*
++ * We are the new RSB master for this lkb
++ * request.
++ */
++ state = lkb->lkb_lockqueue_state;
++ lkb->lkb_lockqueue_state = 0;
++ /* list_del equals remove_from_lockqueue() */
++ list_del(&lkb->lkb_lockqueue);
++ process_remastered_lkb(lkb, state);
++ }
++
++ count++;
++ }
++ }
++ up(&_lockqueue_lock);
++
++ log_all(ls, "resent %d requests", count);
++ return error;
++}
++
++/*
++ * Process any LKBs on the lock queue. This just
++ * checks whether entries have been on the queue
++ * too long and fails those requests with -ETIMEDOUT.
++ */
++
++static void process_lockqueue(void)
++{
++ gd_lkb_t *lkb, *safe;
++ gd_ls_t *ls;
++ int count = 0;
++
++ down(&_lockqueue_lock);
++
++ list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
++ ls = lkb->lkb_resource->res_ls;
++
++ if (test_bit(LSFL_NOTIMERS, &ls->ls_flags))
++ continue;
++
++ /* Don't time out locks that are in transition */
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
++ continue;
++
++ if (check_timeout(lkb->lkb_lockqueue_time,
++ dlm_config.lock_timeout)) {
++ count++;
++ list_del(&lkb->lkb_lockqueue);
++ up(&_lockqueue_lock);
++ cancel_lockop(lkb, -ETIMEDOUT);
++ down(&_lockqueue_lock);
++ }
++ }
++ up(&_lockqueue_lock);
++
++ if (count)
++ wake_astd();
++
++ if (atomic_read(&_astd_running))
++ mod_timer(&_lockqueue_timer,
++ jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
++}
++
++/* Look for deadlocks */
++static void process_deadlockqueue(void)
++{
++ gd_lkb_t *lkb, *safe;
++
++ down(&_deadlockqueue_lock);
++
++ list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) {
++ gd_lkb_t *kill_lkb;
++
++ /* Only look at "due" locks */
++ if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime))
++ break;
++
++ /* Don't look at locks that are in transition */
++ if (!test_bit(LSFL_LS_RUN,
++ &lkb->lkb_resource->res_ls->ls_flags))
++ continue;
++
++ up(&_deadlockqueue_lock);
++
++ /* Lock has hit due time, check for conversion deadlock */
++ kill_lkb = conversion_deadlock_check(lkb);
++ if (kill_lkb)
++ cancel_conversion(kill_lkb, -EDEADLOCK);
++
++ down(&_deadlockqueue_lock);
++ }
++ up(&_deadlockqueue_lock);
++}
++
++static __inline__ int no_asts(void)
++{
++ int ret;
++
++ down(&_ast_queue_lock);
++ ret = list_empty(&_ast_queue);
++ up(&_ast_queue_lock);
++ return ret;
++}
++
++static void lockqueue_timer_fn(unsigned long arg)
++{
++ set_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags);
++ wake_up(&_astd_waitchan);
++}
++
++/*
++ * DLM daemon which delivers asts.
++ */
++
++static int dlm_astd(void *data)
++{
++ daemonize("dlm_astd");
++
++ INIT_LIST_HEAD(&_lockqueue);
++ init_MUTEX(&_lockqueue_lock);
++ INIT_LIST_HEAD(&_deadlockqueue);
++ init_MUTEX(&_deadlockqueue_lock);
++ INIT_LIST_HEAD(&_ast_queue);
++ init_MUTEX(&_ast_queue_lock);
++ init_waitqueue_head(&_astd_waitchan);
++ complete(&_astd_done);
++
++ /*
++ * Set a timer to check the lockqueue for dead locks (and deadlocks).
++ */
++
++ init_timer(&_lockqueue_timer);
++ _lockqueue_timer.function = lockqueue_timer_fn;
++ _lockqueue_timer.data = 0;
++ mod_timer(&_lockqueue_timer,
++ jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
++
++ while (atomic_read(&_astd_running)) {
++ wchan_cond_sleep_intr(_astd_waitchan, no_asts());
++
++ if (test_and_clear_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags))
++ process_asts();
++
++ if (test_and_clear_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags)) {
++ process_lockqueue();
++ if (dlm_config.deadlocktime)
++ process_deadlockqueue();
++ }
++ }
++
++ if (timer_pending(&_lockqueue_timer))
++ del_timer(&_lockqueue_timer);
++
++ complete(&_astd_done);
++
++ return 0;
++}
++
++void wake_astd(void)
++{
++ set_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags);
++ wake_up(&_astd_waitchan);
++}
++
++int astd_start(void)
++{
++ init_completion(&_astd_done);
++ atomic_set(&_astd_running, 1);
++ _astd_pid = kernel_thread(dlm_astd, NULL, 0);
++ wait_for_completion(&_astd_done);
++ return 0;
++}
++
++void astd_stop(void)
++{
++ atomic_set(&_astd_running, 0);
++ wake_astd();
++ wait_for_completion(&_astd_done);
++}
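+# Note (illustrative, not part of the patch): astd_start() and
+# astd_stop() bracket the AST daemon's lifetime, presumably from the
+# module init/exit code elsewhere in this patch series. A sketch of
+# the expected pairing, with queue_ast()/wake_astd() in between:
+#
+#    astd_start();       /* blocks until dlm_astd has initialised */
+#    ...
+#    queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
+#    wake_astd();        /* kick delivery once queueing is done */
+#    ...
+#    astd_stop();        /* wakes the daemon and waits for it to exit */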
+diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h
+--- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/ast.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __AST_DOT_H__
++#define __AST_DOT_H__
++
++void lockqueue_lkb_mark(gd_ls_t * ls);
++int resend_cluster_requests(gd_ls_t * ls);
++void add_to_lockqueue(gd_lkb_t * lkb);
++void remove_from_lockqueue(gd_lkb_t * lkb);
++void add_to_deadlockqueue(gd_lkb_t * lkb);
++void remove_from_deadlockqueue(gd_lkb_t * lkb);
++void remove_from_astqueue(gd_lkb_t * lkb);
++void queue_ast(gd_lkb_t * lkb, gd_ast_type_t astt, uint8_t rqmode);
++void wake_astd(void);
++int astd_start(void);
++void astd_stop(void);
++
++#endif /* __AST_DOT_H__ */
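+# Example (a minimal sketch, not part of the patch): the machinery in
+# ast.c is driven by in-kernel dlm_lock() callers. The argument order
+# below follows the dlm_lock() call made from device.c later in this
+# patch; the callback signatures follow the function-pointer casts in
+# deliver_ast(); the names and the DLM_LOCK_EX mode constant are
+# illustrative.
+#
+#    static struct dlm_lksb my_lksb;
+#
+#    static void my_ast(long param)
+#    {
+#        /* completion AST: my_lksb.sb_status holds the result */
+#    }
+#
+#    static void my_bast(long param, int mode)
+#    {
+#        /* another node wants a lock incompatible with "mode" */
+#    }
+#
+#    error = dlm_lock(lockspace, DLM_LOCK_EX, &my_lksb, 0,
+#                     "myres", 5, 0 /* no parent */,
+#                     my_ast, (long) &my_lksb,
+#                     my_bast, NULL /* no range */);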
+diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c
+--- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/config.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,135 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/module.h>
++#include <linux/proc_fs.h>
++#include <asm/uaccess.h>
++
++#include "dlm_internal.h"
++#include "lowcomms.h"
++#include "config.h"
++
++/* Config file defaults */
++#define DEFAULT_TCP_PORT 21064
++#define DEFAULT_LOCK_TIMEOUT 30
++#define DEFAULT_BUFFER_SIZE 4096
++#define DEFAULT_RESHASHTBL 256
++#define DEFAULT_LOCKIDTBL 1024
++#define DEFAULT_MAX_CONNECTIONS 128
++#define DEFAULT_DEADLOCKTIME 10
++
++struct config_info dlm_config = {
++ .tcp_port = DEFAULT_TCP_PORT,
++ .lock_timeout = DEFAULT_LOCK_TIMEOUT,
++ .buffer_size = DEFAULT_BUFFER_SIZE,
++ .reshashtbl = DEFAULT_RESHASHTBL,
++ .lockidtbl = DEFAULT_LOCKIDTBL,
++ .max_connections = DEFAULT_MAX_CONNECTIONS,
++ .deadlocktime = DEFAULT_DEADLOCKTIME,
++};
++
++
++static struct config_proc_info {
++ char *name;
++ int *value;
++} config_proc[] = {
++ {
++ .name = "tcp_port",
++ .value = &dlm_config.tcp_port,
++ },
++ {
++ .name = "lock_timeout",
++ .value = &dlm_config.lock_timeout,
++ },
++ {
++ .name = "buffer_size",
++ .value = &dlm_config.buffer_size,
++ },
++ {
++ .name = "reshashtbl",
++ .value = &dlm_config.reshashtbl,
++ },
++ {
++ .name = "lockidtbl",
++ .value = &dlm_config.lockidtbl,
++ },
++ {
++ .name = "max_connections",
++ .value = &dlm_config.max_connections,
++ },
++ {
++ .name = "deadlocktime",
++ .value = &dlm_config.deadlocktime,
++ },
++};
++static struct proc_dir_entry *dlm_dir;
++
++static int dlm_config_read_proc(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct config_proc_info *cinfo = data;
++ return snprintf(page, count, "%d\n", *cinfo->value);
++}
++
++static int dlm_config_write_proc(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ struct config_proc_info *cinfo = data;
++ char value_buf[16];
++ int value;
++ char *end;
++
++ /* "buffer" is a userspace pointer: copy it into a
++ * NUL-terminated kernel buffer before parsing */
++ if (count > sizeof(value_buf) - 1)
++ count = sizeof(value_buf) - 1;
++ if (copy_from_user(value_buf, buffer, count))
++ return -EFAULT;
++ value_buf[count] = '\0';
++
++ value = simple_strtoul(value_buf, &end, 10);
++ if (end != value_buf)
++ *cinfo->value = value;
++ return count;
++}
++
++int dlm_config_init(void)
++{
++ int i;
++ struct proc_dir_entry *pde;
++
++ dlm_dir = proc_mkdir("cluster/config/dlm", 0);
++ if (!dlm_dir)
++ return -1;
++
++ dlm_dir->owner = THIS_MODULE;
++
++ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
++ pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir);
++ if (pde) {
++ pde->data = &config_proc[i];
++ pde->write_proc = dlm_config_write_proc;
++ pde->read_proc = dlm_config_read_proc;
++ }
++ }
++ return 0;
++}
++
++void dlm_config_exit(void)
++{
++ int i;
++
++ for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++)
++ remove_proc_entry(config_proc[i].name, dlm_dir);
++ remove_proc_entry("cluster/config/dlm", NULL);
++}
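+# Note: dlm_config_init() above exports every field of dlm_config
+# read/write through procfs, so each parameter appears as
+# /proc/cluster/config/dlm/<name> (tcp_port, lock_timeout, buffer_size,
+# reshashtbl, lockidtbl, max_connections, deadlocktime) and can be
+# tuned at runtime. The parent cluster/config directory is assumed to
+# exist already, created by the cluster manager.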
+diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h
+--- linux-orig/cluster/dlm/config.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/config.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,31 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __CONFIG_DOT_H__
++#define __CONFIG_DOT_H__
++
++struct config_info {
++ int tcp_port;
++ int lock_timeout;
++ int buffer_size;
++ int reshashtbl;
++ int lockidtbl;
++ int max_connections;
++ int deadlocktime;
++};
++
++extern struct config_info dlm_config;
++extern int dlm_config_init(void);
++extern void dlm_config_exit(void);
++
++#endif /* __CONFIG_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c
+--- linux-orig/cluster/dlm/device.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/device.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,1031 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * device.c
++ *
++ * This is the userland interface to the DLM.
++ *
++ * The locking is done via a misc char device (find the
++ * registered minor number in /proc/misc).
++ *
++ * User code should not use this interface directly but
++ * call the library routines in libdlm.a instead.
++ *
++ */
++
++#include <linux/miscdevice.h>
++#include <linux/init.h>
++#include <linux/wait.h>
++#include <linux/module.h>
++#include <linux/file.h>
++#include <linux/fs.h>
++#include <linux/poll.h>
++#include <linux/signal.h>
++#include <linux/spinlock.h>
++#include <asm/ioctls.h>
++
++#include "dlm_internal.h"
++#include "device.h"
++
++extern gd_lkb_t *dlm_get_lkb(gd_ls_t *, int);
++static struct file_operations _dlm_fops;
++static const char *name_prefix="dlm";
++static struct list_head user_ls_list;
++
++/* Flags in li_flags */
++#define LI_FLAG_COMPLETE 1
++#define LI_FLAG_FIRSTLOCK 2
++
++struct lock_info {
++ uint8_t li_cmd;
++ struct dlm_lksb li_lksb;
++ wait_queue_head_t li_waitq;
++ unsigned long li_flags;
++ void __user *li_astparam;
++ void __user *li_astaddr;
++ void __user *li_bastaddr;
++ struct file_info *li_file;
++ struct dlm_lksb __user *li_user_lksb;
++ struct semaphore li_firstlock;
++ struct dlm_queryinfo *li_queryinfo;
++ struct dlm_queryinfo __user *li_user_queryinfo;
++};
++
++/* A queued AST no less */
++struct ast_info {
++ struct dlm_lock_result result;
++ struct dlm_queryinfo *queryinfo;
++ struct dlm_queryinfo __user *user_queryinfo;
++ struct list_head list;
++};
++
++/* One of these per userland lockspace */
++struct user_ls {
++ void *ls_lockspace;
++ atomic_t ls_refcnt;
++ long ls_flags; /* bit 1 means LS has been deleted */
++
++ /* Passed into misc_register() */
++ struct miscdevice ls_miscinfo;
++ struct list_head ls_list;
++};
++
++/* misc_device info for the control device */
++static struct miscdevice ctl_device;
++
++/*
++ * Stuff we hang off the file struct.
++ * The first two are to cope with unlocking all the
++ * locks held by a process when it dies.
++ */
++struct file_info {
++ struct list_head fi_lkb_list; /* List of active lkbs */
++ spinlock_t fi_lkb_lock;
++ struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
++ spinlock_t fi_ast_lock;
++ wait_queue_head_t fi_wait;
++ struct user_ls *fi_ls;
++ atomic_t fi_refcnt; /* Number of users */
++ unsigned long fi_flags; /* Bit 1 means the device is open */
++};
++
++
++/* get and put ops for file_info.
++ Actually I don't really like "get" and "put", but everyone
++ else seems to use them and I can't think of anything
++ nicer at the moment */
++static void get_file_info(struct file_info *f)
++{
++ atomic_inc(&f->fi_refcnt);
++}
++
++static void put_file_info(struct file_info *f)
++{
++ if (atomic_dec_and_test(&f->fi_refcnt))
++ kfree(f);
++}
++
++/* Find a lockspace struct given the device minor number */
++static struct user_ls *find_lockspace(int minor)
++{
++ struct user_ls *lsinfo;
++
++ list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
++
++ if (lsinfo->ls_miscinfo.minor == minor)
++ return lsinfo;
++ }
++ return NULL;
++}
++
++static void add_lockspace_to_list(struct user_ls *lsinfo)
++{
++ list_add(&lsinfo->ls_list, &user_ls_list);
++}
++
++/* Register a lockspace with the DLM and create a misc
++ device for userland to access it */
++static int register_lockspace(char *name, struct user_ls **ls)
++{
++ struct user_ls *newls;
++ int status;
++ int namelen;
++
++ namelen = strlen(name)+strlen(name_prefix)+2;
++
++ newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
++ if (!newls)
++ return -ENOMEM;
++ memset(newls, 0, sizeof(struct user_ls));
++
++ newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
++ if (!newls->ls_miscinfo.name) {
++ kfree(newls);
++ return -ENOMEM;
++ }
++ snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name);
++
++ status = dlm_new_lockspace((char *)newls->ls_miscinfo.name+strlen(name_prefix)+1,
++ strlen(newls->ls_miscinfo.name) - strlen(name_prefix) - 1,
++ &newls->ls_lockspace, 0);
++
++ if (status != 0) {
++ kfree(newls->ls_miscinfo.name);
++ kfree(newls);
++ return status;
++ }
++
++ newls->ls_miscinfo.fops = &_dlm_fops;
++ newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
++
++ status = misc_register(&newls->ls_miscinfo);
++ if (status) {
++ log_print("failed to register misc device for %s", name);
++ dlm_release_lockspace(newls->ls_lockspace, 0);
++ kfree(newls->ls_miscinfo.name);
++ kfree(newls);
++ return status;
++ }
++
++
++ add_lockspace_to_list(newls);
++ *ls = newls;
++ return 0;
++}
++
++static int unregister_lockspace(struct user_ls *lsinfo, int force)
++{
++ int status;
++
++ status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
++ if (status)
++ return status;
++
++ status = misc_deregister(&lsinfo->ls_miscinfo);
++ if (status)
++ return status;
++
++ list_del(&lsinfo->ls_list);
++ kfree(lsinfo->ls_miscinfo.name);
++ kfree(lsinfo);
++
++ return 0;
++}
++
++/* Add it to userland's AST queue */
++static void add_to_astqueue(struct lock_info *li, void *astaddr)
++{
++ struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
++ if (!ast)
++ return;
++
++ ast->result.astparam = li->li_astparam;
++ ast->result.astaddr = astaddr;
++ ast->result.user_lksb = li->li_user_lksb;
++ ast->result.cmd = li->li_cmd;
++ memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
++
++ /* These two will both be NULL for anything other than queries */
++ ast->queryinfo = li->li_queryinfo;
++ ast->user_queryinfo = li->li_user_queryinfo;
++
++ spin_lock(&li->li_file->fi_ast_lock);
++ list_add_tail(&ast->list, &li->li_file->fi_ast_list);
++ spin_unlock(&li->li_file->fi_ast_lock);
++ wake_up_interruptible(&li->li_file->fi_wait);
++}
++
++static void bast_routine(void *param, int mode)
++{
++ struct lock_info *li = param;
++
++ if (param) {
++ add_to_astqueue(li, li->li_bastaddr);
++ }
++}
++
++/*
++ * This is the kernel's AST routine.
++ * All lock, unlock & query operations complete here.
++ * The only synchronous ops are those done during device close.
++ */
++static void ast_routine(void *param)
++{
++ struct lock_info *li = param;
++
++ /* Param may be NULL if a persistent lock is unlocked by someone else */
++ if (!param)
++ return;
++
++ /* If it's an async request then post data to the user's AST queue. */
++ if (li->li_astaddr) {
++
++ /* Only queue AST if the device is still open */
++ if (test_bit(1, &li->li_file->fi_flags))
++ add_to_astqueue(li, li->li_astaddr);
++
++ /* If it's a new lock operation that failed, then
++ * remove it from the owner queue and free the
++ * lock_info. The DLM will not free the LKB until this
++ * AST has completed.
++ */
++ if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
++ li->li_lksb.sb_status != 0) {
++ gd_lkb_t *lkb;
++
++ /* Wait till dlm_lock() has finished */
++ down(&li->li_firstlock);
++ lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
++ if (lkb) {
++ spin_lock(&li->li_file->fi_lkb_lock);
++ list_del(&lkb->lkb_ownerqueue);
++ spin_unlock(&li->li_file->fi_lkb_lock);
++ }
++ up(&li->li_firstlock);
++ put_file_info(li->li_file);
++ kfree(li);
++ return;
++ }
++ /* Free unlocks & queries */
++ if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
++ li->li_cmd == DLM_USER_QUERY) {
++ put_file_info(li->li_file);
++ kfree(li);
++ }
++ }
++ else {
++ /* Synchronous request, just wake up the caller */
++ set_bit(LI_FLAG_COMPLETE, &li->li_flags);
++ wake_up_interruptible(&li->li_waitq);
++ }
++}
++
++/*
++ * Wait for the lock op to complete and return the status.
++ */
++static int wait_for_ast(struct lock_info *li)
++{
++ /* Wait for the AST routine to complete */
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
++ schedule();
++
++ set_task_state(current, TASK_RUNNING);
++
++ return li->li_lksb.sb_status;
++}
++
++
++/* Open on control device */
++static int dlm_ctl_open(struct inode *inode, struct file *file)
++{
++ return 0;
++}
++
++/* Close on control device */
++static int dlm_ctl_close(struct inode *inode, struct file *file)
++{
++ return 0;
++}
++
++/* Open on lockspace device */
++static int dlm_open(struct inode *inode, struct file *file)
++{
++ struct file_info *f;
++ struct user_ls *lsinfo;
++
++ lsinfo = find_lockspace(iminor(inode));
++ if (!lsinfo)
++ return -ENOENT;
++
++ f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
++ if (!f)
++ return -ENOMEM;
++
++ atomic_inc(&lsinfo->ls_refcnt);
++ INIT_LIST_HEAD(&f->fi_lkb_list);
++ INIT_LIST_HEAD(&f->fi_ast_list);
++ spin_lock_init(&f->fi_ast_lock);
++ spin_lock_init(&f->fi_lkb_lock);
++ init_waitqueue_head(&f->fi_wait);
++ f->fi_ls = lsinfo;
++ atomic_set(&f->fi_refcnt, 1);
++ set_bit(1, &f->fi_flags);
++
++ file->private_data = f;
++
++ return 0;
++}
++
++/* Check the user's version matches ours */
++static int check_version(struct dlm_lock_params *params)
++{
++ if (params->version[0] != DLM_DEVICE_VERSION_MAJOR ||
++ (params->version[0] == DLM_DEVICE_VERSION_MAJOR &&
++ params->version[1] > DLM_DEVICE_VERSION_MINOR)) {
++
++ log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)",
++ params->version[0],
++ params->version[1],
++ params->version[2],
++ DLM_DEVICE_VERSION_MAJOR,
++ DLM_DEVICE_VERSION_MINOR,
++ DLM_DEVICE_VERSION_PATCH);
++ return -EINVAL;
++ }
++ return 0;
++}
++
++/* Close on lockspace device */
++static int dlm_close(struct inode *inode, struct file *file)
++{
++ struct file_info *f = file->private_data;
++ struct lock_info li;
++ sigset_t tmpsig;
++ sigset_t allsigs;
++ gd_lkb_t *lkb, *safe;
++ struct user_ls *lsinfo;
++ DECLARE_WAITQUEUE(wq, current);
++
++ lsinfo = find_lockspace(iminor(inode));
++ if (!lsinfo)
++ return -ENOENT;
++
++ /* Mark this closed so that ASTs will not be delivered any more */
++ clear_bit(1, &f->fi_flags);
++
++ /* Block signals while we are doing this */
++ sigfillset(&allsigs);
++ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
++
++ /* We use our own lock_info struct here, so that any
++ * outstanding "real" ASTs will be delivered with the
++ * corresponding "real" params, thus freeing the lock_info
++ * that belongs to the lock. This catches the corner case where
++ * a lock is BUSY when we try to unlock it here.
++ */
++ memset(&li, 0, sizeof(li));
++ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
++ init_waitqueue_head(&li.li_waitq);
++ add_wait_queue(&li.li_waitq, &wq);
++
++ /*
++ * Free any outstanding locks. They are on the
++ * list in LIFO order, so there should be no problem
++ * unlocking parents before children.
++ * Although we don't remove the lkbs from the list here
++ * (what would be the point?), foreach_safe is needed
++ * because the lkbs are freed during dlm_unlock operations.
++ */
++ list_for_each_entry_safe(lkb, safe, &f->fi_lkb_list, lkb_ownerqueue) {
++ int status;
++ int lock_status;
++ int flags = 0;
++ struct lock_info *old_li;
++
++ /* Make a copy of this pointer. If all goes well we will
++ * free it later. if not it will be left to the AST routine
++ * to tidy up
++ */
++ old_li = (struct lock_info *)lkb->lkb_astparam;
++
++ /* Don't unlock persistent locks */
++ if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) {
++ list_del(&lkb->lkb_ownerqueue);
++
++ /* But tidy our references in it */
++ kfree(old_li);
++ lkb->lkb_astparam = (long)NULL;
++ put_file_info(f);
++ continue;
++ }
++
++ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
++
++ /* If it's not granted then cancel the request.
++ * If the lock was WAITING then it will be dropped,
++ * if it was converting then it will be reverted to GRANTED,
++ * then we will unlock it.
++ */
++ lock_status = lkb->lkb_status;
++
++ if (lock_status != GDLM_LKSTS_GRANTED)
++ flags = DLM_LKF_CANCEL;
++
++ status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
++
++ /* Must wait for it to complete as the next lock could be its
++ * parent */
++ if (status == 0)
++ wait_for_ast(&li);
++
++ /* If it was waiting for a conversion, it will
++ now be granted so we can unlock it properly */
++ if (lock_status == GDLM_LKSTS_CONVERT) {
++
++ clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
++ status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, 0, &li.li_lksb, &li);
++
++ if (status == 0)
++ wait_for_ast(&li);
++ }
++ /* Unlock succeeded, free the lock_info struct. */
++ if (status == 0) {
++ kfree(old_li);
++ put_file_info(f);
++ }
++ }
++
++ remove_wait_queue(&li.li_waitq, &wq);
++
++ /* If this is the last reference, and the lockspace has been
++ deleted, then free the struct */
++ if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) {
++ kfree(lsinfo);
++ }
++
++ /* Restore signals */
++ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
++ recalc_sigpending();
++
++ return 0;
++}
++
++/*
++ * ioctl on a lockspace device: check how many outstanding
++ * ASTs there are waiting to be read for this lockspace.
++ */
++static int dlm_ioctl(struct inode *inode, struct file *file,
++ uint command, ulong u)
++{
++ struct file_info *fi = file->private_data;
++ int status = -EINVAL;
++ int count;
++ struct list_head *tmp_list;
++
++ switch (command) {
++
++ /* Are there any ASTs for us to read?
++ * Warning, this returns the number of messages (ASTs)
++ * in the queue, NOT the number of bytes to read
++ */
++ case FIONREAD:
++ count = 0;
++ spin_lock(&fi->fi_ast_lock);
++ list_for_each(tmp_list, &fi->fi_ast_list)
++ count++;
++ spin_unlock(&fi->fi_ast_lock);
++ status = put_user(count, (int *)u);
++ break;
++
++ default:
++ return -ENOTTY;
++ }
++
++ return status;
++}
++
++/*
++ * ioctls to create/remove lockspaces.
++ */
++static int dlm_ctl_ioctl(struct inode *inode, struct file *file,
++ uint command, ulong u)
++{
++ int status = -EINVAL;
++ char ls_name[MAX_LS_NAME_LEN];
++ struct user_ls *lsinfo;
++ int force = 0;
++
++ switch (command) {
++ case DLM_CREATE_LOCKSPACE:
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++
++ if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0)
++ return -EFAULT;
++ status = register_lockspace(ls_name, &lsinfo);
++
++ /* If it succeeded then return the minor number */
++ if (status == 0)
++ status = lsinfo->ls_miscinfo.minor;
++ break;
++
++ case DLM_FORCE_RELEASE_LOCKSPACE:
++ force = 2;
++
++ case DLM_RELEASE_LOCKSPACE:
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++
++ lsinfo = find_lockspace(u);
++ if (!lsinfo)
++ return -EINVAL;
++ status = unregister_lockspace(lsinfo, force);
++ break;
++
++ default:
++ return -ENOTTY;
++ }
++
++ return status;
++}
++
++/* Deal with the messy stuff of copying a web of structs
++ from kernel space to userspace */
++static int copy_query_result(struct ast_info *ast)
++{
++ int status = -EFAULT;
++ struct dlm_queryinfo qi;
++
++ /* Get the pointers to userspace structs */
++ if (copy_from_user(&qi, ast->user_queryinfo,
++ sizeof(struct dlm_queryinfo)))
++ goto copy_out;
++
++ /* TODO: does this deref a user pointer? */
++ if (put_user(ast->queryinfo->gqi_lockcount,
++ &ast->user_queryinfo->gqi_lockcount))
++ goto copy_out;
++
++ if (qi.gqi_resinfo) {
++ if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo,
++ sizeof(struct dlm_resinfo)))
++ goto copy_out;
++ }
++
++ if (qi.gqi_lockinfo) {
++ if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo,
++ sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount))
++ goto copy_out;
++ }
++
++ status = 0;
++
++ if (ast->queryinfo->gqi_lockinfo)
++ kfree(ast->queryinfo->gqi_lockinfo);
++
++ if (ast->queryinfo->gqi_resinfo)
++ kfree(ast->queryinfo->gqi_resinfo);
++
++ kfree(ast->queryinfo);
++
++ copy_out:
++ return status;
++}
++
++/* Read call, might block if no ASTs are waiting.
++ * It will only ever return one message at a time, regardless
++ * of how many are pending.
++ */
++static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
++{
++ struct file_info *fi = file->private_data;
++ struct ast_info *ast;
++ int ret;
++ DECLARE_WAITQUEUE(wait, current);
++
++ if (count < sizeof(struct dlm_lock_result))
++ return -EINVAL;
++
++ spin_lock(&fi->fi_ast_lock);
++ if (list_empty(&fi->fi_ast_list)) {
++
++ /* No waiting ASTs: return EOF if the lockspace
++ * has been deleted (don't leak the spinlock) */
++ if (test_bit(1, &fi->fi_ls->ls_flags)) {
++ spin_unlock(&fi->fi_ast_lock);
++ return 0;
++ }
++ if (file->f_flags & O_NONBLOCK) {
++ spin_unlock(&fi->fi_ast_lock);
++ return -EAGAIN;
++ }
++
++ add_wait_queue(&fi->fi_wait, &wait);
++
++ repeat:
++ set_current_state(TASK_INTERRUPTIBLE);
++ if (list_empty(&fi->fi_ast_list) &&
++ !signal_pending(current)) {
++
++ spin_unlock(&fi->fi_ast_lock);
++ schedule();
++ spin_lock(&fi->fi_ast_lock);
++ goto repeat;
++ }
++
++ current->state = TASK_RUNNING;
++ remove_wait_queue(&fi->fi_wait, &wait);
++
++ if (signal_pending(current)) {
++ spin_unlock(&fi->fi_ast_lock);
++ return -ERESTARTSYS;
++ }
++ }
++
++ ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
++ list_del(&ast->list);
++ spin_unlock(&fi->fi_ast_lock);
++
++ ret = sizeof(struct dlm_lock_result);
++ if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
++ ret = -EFAULT;
++
++ /* If it was a query then copy the result block back here */
++ if (ast->queryinfo) {
++ int status = copy_query_result(ast);
++ if (status)
++ ret = status;
++ }
++
++ kfree(ast);
++ return ret;
++}
++
++static unsigned int dlm_poll(struct file *file, poll_table *wait)
++{
++ struct file_info *fi = file->private_data;
++
++ poll_wait(file, &fi->fi_wait, wait);
++
++ spin_lock(&fi->fi_ast_lock);
++ if (!list_empty(&fi->fi_ast_list)) {
++ spin_unlock(&fi->fi_ast_lock);
++ return POLLIN | POLLRDNORM;
++ }
++
++ spin_unlock(&fi->fi_ast_lock);
++ return 0;
++}
++
++static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams)
++{
++ struct lock_info *li;
++ int status;
++
++ li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
++ if (!li)
++ return -ENOMEM;
++
++ get_file_info(fi);
++ li->li_user_lksb = kparams->lksb;
++ li->li_astparam = kparams->astparam;
++ li->li_bastaddr = kparams->bastaddr;
++ li->li_astaddr = kparams->astaddr;
++ li->li_file = fi;
++ li->li_flags = 0;
++ li->li_cmd = kparams->cmd;
++ clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
++
++ if (copy_from_user(&li->li_lksb, kparams->lksb,
++ sizeof(struct dlm_lksb))) {
++ put_file_info(fi);
++ kfree(li);
++ return -EFAULT;
++ }
++ li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr;
++
++ /* Allocate query structs */
++ status = -ENOMEM;
++ li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL);
++ if (!li->li_queryinfo)
++ goto out1;
++
++ /* Mainly to get the gqi_lockinfo buffer size (gqi_locksize) */
++ if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr,
++ sizeof(struct dlm_queryinfo))) {
++ status = -EFAULT;
++ goto out1;
++ }
++
++ /* Overwrite userspace pointers we just copied with kernel space ones */
++ if (li->li_queryinfo->gqi_resinfo) {
++ li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL);
++ if (!li->li_queryinfo->gqi_resinfo)
++ goto out1;
++ }
++ if (li->li_queryinfo->gqi_lockinfo) {
++ li->li_queryinfo->gqi_lockinfo =
++ kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize,
++ GFP_KERNEL);
++ if (!li->li_queryinfo->gqi_lockinfo)
++ goto out2;
++ }
++
++ li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo;
++
++ return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb,
++ kparams->flags, /* query */
++ li->li_queryinfo,
++ ast_routine, li);
++
++ out2:
++ if (li->li_queryinfo->gqi_resinfo)
++ kfree(li->li_queryinfo->gqi_resinfo);
++
++ out1:
++ if (li->li_queryinfo)
++ kfree(li->li_queryinfo);
++ put_file_info(fi);
++ kfree(li);
++ return status;
++}
++
++static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams,
++ const char *buffer)
++{
++ struct lock_info *li;
++ int status;
++ char name[DLM_RESNAME_MAXLEN];
++
++ /*
++ * Validate things that we need to have correct.
++ */
++ if (kparams->namelen > DLM_RESNAME_MAXLEN)
++ return -EINVAL;
++
++ if (!kparams->astaddr)
++ return -EINVAL;
++
++ if (!kparams->lksb)
++ return -EINVAL;
++
++ /* Get the lock name */
++ if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name),
++ kparams->namelen)) {
++ return -EFAULT;
++ }
++
++ /* For conversions, the lock will already have a lock_info
++ block squirrelled away in astparam */
++ if (kparams->flags & DLM_LKF_CONVERT) {
++ gd_lkb_t *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
++ if (!lkb) {
++ return -EINVAL;
++ }
++ li = (struct lock_info *)lkb->lkb_astparam;
++
++ /* Only override these if they are provided */
++ if (li->li_user_lksb)
++ li->li_user_lksb = kparams->lksb;
++ if (li->li_astparam)
++ li->li_astparam = kparams->astparam;
++ if (li->li_bastaddr)
++ li->li_bastaddr = kparams->bastaddr;
++ if (li->li_astaddr)
++ li->li_astaddr = kparams->astaddr;
++ li->li_flags = 0;
++ }
++ else {
++ li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
++ if (!li)
++ return -ENOMEM;
++
++ li->li_user_lksb = kparams->lksb;
++ li->li_astparam = kparams->astparam;
++ li->li_bastaddr = kparams->bastaddr;
++ li->li_astaddr = kparams->astaddr;
++ li->li_file = fi;
++ li->li_flags = 0;
++ li->li_cmd = kparams->cmd;
++ li->li_queryinfo = NULL;
++
++ /* semaphore to allow us to complete our work before
++ the AST routine runs. In fact we only need (and use) this
++ when the initial lock fails */
++ init_MUTEX_LOCKED(&li->li_firstlock);
++ set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
++
++ get_file_info(fi);
++ }
++
++ /* Copy the user's LKSB into kernel space,
++ needed for conversions & value block operations */
++ if (kparams->lksb && copy_from_user(&li->li_lksb, kparams->lksb,
++ sizeof(struct dlm_lksb))) {
++ /* don't leak a newly allocated li (and its fi ref) */
++ if (!(kparams->flags & DLM_LKF_CONVERT)) {
++ put_file_info(fi);
++ kfree(li);
++ }
++ return -EFAULT;
++ }
++
++ /* Lock it ... */
++ status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb,
++ kparams->flags, name, kparams->namelen,
++ kparams->parent,
++ ast_routine,
++ li,
++ li->li_bastaddr ? bast_routine : NULL,
++ kparams->range.ra_end ? &kparams->range : NULL);
++
++ /* If it succeeded (this far) with a new lock then keep track of
++ it on the file's lkb list */
++ if (!status && !(kparams->flags & DLM_LKF_CONVERT)) {
++ gd_lkb_t *lkb;
++ lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
++
++ if (lkb) {
++ spin_lock(&fi->fi_lkb_lock);
++ list_add(&lkb->lkb_ownerqueue,
++ &fi->fi_lkb_list);
++ spin_unlock(&fi->fi_lkb_lock);
++ }
++ else {
++ log_print("failed to get lkb for new lock");
++ }
++ up(&li->li_firstlock);
++ }
++
++ return status;
++}
++
++static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams)
++{
++ struct lock_info *li;
++ gd_lkb_t *lkb;
++ int status;
++
++ lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
++ if (!lkb) {
++ return -EINVAL;
++ }
++
++ li = (struct lock_info *)lkb->lkb_astparam;
++
++ li->li_user_lksb = kparams->lksb;
++ li->li_astparam = kparams->astparam;
++ li->li_cmd = kparams->cmd;
++
++ /* Have to do it here because the lkb may not exist after
++ * dlm_unlock() */
++ spin_lock(&fi->fi_lkb_lock);
++ list_del(&lkb->lkb_ownerqueue);
++ spin_unlock(&fi->fi_lkb_lock);
++
++ /* Use existing lksb & astparams */
++ status = dlm_unlock(fi->fi_ls->ls_lockspace,
++ kparams->lkid,
++ kparams->flags, NULL, NULL);
++
++ return status;
++}
++
++/* Write call, submit a locking request */
++static ssize_t dlm_write(struct file *file, const char __user *buffer,
++ size_t count, loff_t *ppos)
++{
++ struct file_info *fi = file->private_data;
++ struct dlm_lock_params kparams;
++ sigset_t tmpsig;
++ sigset_t allsigs;
++ int status;
++
++ if (count < sizeof(kparams))
++ return -EINVAL;
++
++ /* Has the lockspace been deleted */
++ if (test_bit(1, &fi->fi_ls->ls_flags))
++ return -ENOENT;
++
++ /* Get the command info */
++ if (copy_from_user(&kparams, buffer, sizeof(kparams)))
++ return -EFAULT;
++
++ if (check_version(&kparams))
++ return -EINVAL;
++
++ /* Block signals while we are doing this */
++ sigfillset(&allsigs);
++ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
++
++ switch (kparams.cmd)
++ {
++ case DLM_USER_LOCK:
++ status = do_user_lock(fi, &kparams, buffer);
++ break;
++
++ case DLM_USER_UNLOCK:
++ status = do_user_unlock(fi, &kparams);
++ break;
++
++ case DLM_USER_QUERY:
++ status = do_user_query(fi, &kparams);
++ break;
++
++ default:
++ status = -EINVAL;
++ break;
++ }
++ /* Restore signals */
++ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
++ recalc_sigpending();
++
++ if (status == 0)
++ return count;
++ else
++ return status;
++}
++
++void dlm_device_free_devices(void)
++{
++ struct user_ls *tmp;
++ struct user_ls *lsinfo;
++
++ list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) {
++ misc_deregister(&lsinfo->ls_miscinfo);
++
++ /* Tidy up, but don't delete the lsinfo struct until
++ all the users have closed their devices */
++ list_del(&lsinfo->ls_list);
++ kfree(lsinfo->ls_miscinfo.name);
++ set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
++ }
++}
++
++static struct file_operations _dlm_fops = {
++ .open = dlm_open,
++ .release = dlm_close,
++ .ioctl = dlm_ioctl,
++ .read = dlm_read,
++ .write = dlm_write,
++ .poll = dlm_poll,
++ .owner = THIS_MODULE,
++};
++
++static struct file_operations _dlm_ctl_fops = {
++ .open = dlm_ctl_open,
++ .release = dlm_ctl_close,
++ .ioctl = dlm_ctl_ioctl,
++ .owner = THIS_MODULE,
++};
++
++/*
++ * Create control device
++ */
++int dlm_device_init(void)
++{
++ int r;
++
++ INIT_LIST_HEAD(&user_ls_list);
++
++ ctl_device.name = "dlm-control";
++ ctl_device.fops = &_dlm_ctl_fops;
++ ctl_device.minor = MISC_DYNAMIC_MINOR;
++
++ r = misc_register(&ctl_device);
++ if (r) {
++ log_print("misc_register failed for DLM control device");
++ return r;
++ }
++
++ return 0;
++}
++
++void dlm_device_exit(void)
++{
++ misc_deregister(&ctl_device);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
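+# Example (a minimal sketch, not part of the patch): the userland half
+# of the device protocol above, as libdlm would drive it. The request
+# and result structures, version macros and command codes come from the
+# userland copy of the dlm_device.h header this file relies on; ls_fd
+# is a descriptor for a per-lockspace misc device whose minor number
+# was returned by the DLM_CREATE_LOCKSPACE ioctl on dlm-control.
+# Error handling is omitted.
+#
+#    struct dlm_lock_params p;
+#    struct dlm_lock_result r;
+#
+#    memset(&p, 0, sizeof(p));
+#    p.version[0] = DLM_DEVICE_VERSION_MAJOR;
+#    p.version[1] = DLM_DEVICE_VERSION_MINOR;
+#    p.version[2] = DLM_DEVICE_VERSION_PATCH;
+#    p.cmd = DLM_USER_LOCK;
+#    /* fill in mode, flags, name, namelen, lksb, astaddr ... */
+#
+#    write(ls_fd, &p, sizeof(p));    /* submit the lock request */
+#    read(ls_fd, &r, sizeof(r));     /* blocks until an AST is queued */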
+diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h
+--- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/device.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DEVICE_DOT_H__
++#define __DEVICE_DOT_H__
++
++extern void dlm_device_free_devices(void);
++
++#endif /* __DEVICE_DOT_H__ */
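+# Worked example (illustrative) for the directory-node selection scheme
+# described at the top of dir.c below: with num_nodes = 5, the nearest
+# power of two above is 8, so ls_nodes_mask = 7. For a resource name
+# hashing to 0x12345678, (0x12345678 >> 16) & 7 = 0x1234 & 7 = 4, and
+# 4 % 5 = 4, so the fifth nodeid in the sorted node list is the
+# directory node for that resource.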
+diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c
+--- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/dir.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,430 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++#include "nodes.h"
++#include "lockspace.h"
++#include "lowcomms.h"
++#include "reccomms.h"
++#include "rsb.h"
++#include "config.h"
++#include "memory.h"
++#include "recover.h"
++#include "util.h"
++
++/*
++ * We use the upper 16 bits of the hash value to select the directory node.
++ * Low bits are used for distribution of rsb's among hash buckets on each node.
++ *
++ * From the hash value, we are interested in arriving at a final value between
++ * zero and the number of nodes minus one (num_nodes - 1).
++ *
++ * To accomplish this scaling, we take the nearest power of two larger than
++ * num_nodes and subtract one to create a bit mask. The mask is applied to the
++ * hash, reducing the range to nearer the final range.
++ *
++ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
++ * num_nodes to the previously masked hash value.
++ *
++ * This value in the desired range is used as an offset into the sorted list of
++ * nodeid's to give the particular nodeid of the directory node.
++ */
++
++uint32_t name_to_directory_nodeid(gd_ls_t *ls, char *name, int length)
++{
++ struct list_head *tmp;
++ gd_csb_t *csb = NULL;
++ uint32_t hash, node, n = 0, nodeid;
++
++ if (ls->ls_num_nodes == 1) {
++ nodeid = our_nodeid();
++ goto out;
++ }
++
++ hash = gdlm_hash(name, length);
++ node = (hash >> 16) & ls->ls_nodes_mask;
++ node %= ls->ls_num_nodes;
++
++ list_for_each(tmp, &ls->ls_nodes) {
++ if (n++ != node)
++ continue;
++ csb = list_entry(tmp, gd_csb_t, csb_list);
++ break;
++ }
++
++ GDLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u mask=%x\n",
++ ls->ls_num_nodes, n, node, ls->ls_nodes_mask););
++ nodeid = csb->csb_node->gn_nodeid;
++
++ out:
++ return nodeid;
++}
++
++uint32_t get_directory_nodeid(gd_res_t *rsb)
++{
++ return name_to_directory_nodeid(rsb->res_ls, rsb->res_name,
++ rsb->res_length);
++}
++
++static inline uint32_t rd_hash(gd_ls_t *ls, char *name, int len)
++{
++ uint32_t val;
++
++ val = gdlm_hash(name, len);
++ val &= RESDIRHASH_MASK;
++
++ return val;
++}
++
++static void add_resdata_to_hash(gd_ls_t *ls, gd_resdata_t *rd)
++{
++ gd_resdir_bucket_t *bucket;
++ uint32_t hashval;
++
++ hashval = rd_hash(ls, rd->rd_name, rd->rd_length);
++ bucket = &ls->ls_resdir_hash[hashval];
++
++ list_add_tail(&rd->rd_list, &bucket->rb_reslist);
++}
++
++static gd_resdata_t *search_rdbucket(gd_ls_t *ls, char *name, int namelen,
++ uint32_t bucket)
++{
++ struct list_head *head;
++ gd_resdata_t *rd;
++
++ head = &ls->ls_resdir_hash[bucket].rb_reslist;
++ list_for_each_entry(rd, head, rd_list) {
++ if (rd->rd_length == namelen &&
++ !memcmp(name, rd->rd_name, namelen))
++ goto out;
++ }
++ rd = NULL;
++ out:
++ return rd;
++}
++
++void remove_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
++ uint8_t sequence)
++{
++ gd_resdata_t *rd;
++ uint32_t bucket;
++
++ bucket = rd_hash(ls, name, namelen);
++
++ write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
++
++ rd = search_rdbucket(ls, name, namelen, bucket);
++
++ if (!rd) {
++ log_debug(ls, "remove_resdata not found nodeid=%u", nodeid);
++ goto out;
++ }
++
++ if (rd->rd_master_nodeid != nodeid) {
++ log_debug(ls, "remove_resdata wrong nodeid=%u", nodeid);
++ goto out;
++ }
++
++ if (rd->rd_sequence == sequence) {
++ list_del(&rd->rd_list);
++ free_resdata(rd);
++ } else {
++ /*
++ log_debug(ls, "remove_resdata mismatch nodeid=%u rd=%u in=%u",
++ nodeid, rd->rd_sequence, sequence);
++ */
++ }
++
++ out:
++ write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
++}
++
++void resdir_clear(gd_ls_t *ls)
++{
++ struct list_head *head;
++ gd_resdata_t *rd;
++ int i;
++
++ for (i = 0; i < RESDIRHASH_SIZE; i++) {
++ head = &ls->ls_resdir_hash[i].rb_reslist;
++ while (!list_empty(head)) {
++ rd = list_entry(head->next, gd_resdata_t, rd_list);
++ list_del(&rd->rd_list);
++ free_resdata(rd);
++ }
++ }
++}
++
++static void gdlm_resmov_in(gd_resmov_t *rm, char *buf)
++{
++ gd_resmov_t tmp;
++
++ memcpy(&tmp, buf, sizeof(gd_resmov_t));
++
++ rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid);
++ rm->rm_length = be16_to_cpu(tmp.rm_length);
++}
++
++int resdir_rebuild_local(gd_ls_t *ls)
++{
++ gd_csb_t *csb;
++ gd_resdata_t *rd;
++ gd_rcom_t *rc;
++ gd_resmov_t mov, last_mov;
++ char *b, *last_name;
++ int error = -ENOMEM, count = 0;
++
++ log_all(ls, "rebuild resource directory");
++
++ resdir_clear(ls);
++
++ rc = allocate_rcom_buffer(ls);
++ if (!rc)
++ goto out;
++
++ last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
++ if (!last_name)
++ goto free_rc;
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ last_mov.rm_length = 0;
++ for (;;) {
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto free_last;
++
++ memcpy(rc->rc_buf, last_name, last_mov.rm_length);
++ rc->rc_datalen = last_mov.rm_length;
++
++ error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
++ RECCOMM_RECOVERNAMES, rc, 1);
++ if (error)
++ goto free_last;
++
++ schedule();
++
++ /*
++ * pick each res out of buffer
++ */
++
++ b = rc->rc_buf;
++
++ for (;;) {
++ gdlm_resmov_in(&mov, b);
++ b += sizeof(gd_resmov_t);
++
++ /* Length of 0 with a non-zero nodeid marks the
++ * end of the list */
++ if (!mov.rm_length && mov.rm_nodeid)
++ goto done;
++
++ /* This is just the end of the block */
++ if (!mov.rm_length)
++ break;
++
++ error = -ENOMEM;
++ rd = allocate_resdata(ls, mov.rm_length);
++ if (!rd)
++ goto free_last;
++
++ rd->rd_master_nodeid = mov.rm_nodeid;
++ rd->rd_length = mov.rm_length;
++ rd->rd_sequence = 1;
++
++ memcpy(rd->rd_name, b, mov.rm_length);
++ b += mov.rm_length;
++
++ add_resdata_to_hash(ls, rd);
++ count++;
++
++ last_mov = mov;
++ memset(last_name, 0, DLM_RESNAME_MAXLEN);
++ memcpy(last_name, rd->rd_name, rd->rd_length);
++ }
++ }
++ done:
++ ;
++ }
++
++ set_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
++ error = 0;
++
++ log_all(ls, "rebuilt %d resources", count);
++
++ free_last:
++ kfree(last_name);
++
++ free_rc:
++ free_rcom_buffer(rc);
++
++ out:
++ return error;
++}
++
++/*
++ * The reply end of resdir_rebuild_local/RECOVERNAMES. Collect and send as
++ * many resource names as can fit in the buffer.
++ */
++
++int resdir_rebuild_send(gd_ls_t *ls, char *inbuf, int inlen, char *outbuf,
++ int outlen, uint32_t nodeid)
++{
++ struct list_head *list;
++ gd_res_t *start_rsb = NULL, *rsb;
++ int offset = 0, start_namelen, error;
++ char *start_name;
++ gd_resmov_t tmp;
++ uint32_t dir_nodeid;
++
++ /*
++ * Find the rsb where we left off (or start again)
++ */
++
++ start_namelen = inlen;
++ start_name = inbuf;
++
++ if (start_namelen > 1) {
++ error = find_or_create_rsb(ls, NULL, start_name,
++ start_namelen, 0, &start_rsb);
++ GDLM_ASSERT(!error && start_rsb, printk("error %d\n", error););
++ release_rsb(start_rsb);
++ }
++
++ /*
++ * Send rsb names for rsb's we're master of and whose directory node
++ * matches the requesting node.
++ */
++
++ down_read(&ls->ls_rec_rsblist);
++ if (start_rsb)
++ list = start_rsb->res_rootlist.next;
++ else
++ list = ls->ls_rootres.next;
++
++ for (offset = 0; list != &ls->ls_rootres; list = list->next) {
++ rsb = list_entry(list, gd_res_t, res_rootlist);
++ if (rsb->res_nodeid)
++ continue;
++
++ dir_nodeid = get_directory_nodeid(rsb);
++ if (dir_nodeid != nodeid)
++ continue;
++
++ if (offset + sizeof(gd_resmov_t)*2 + rsb->res_length > outlen) {
++ /* Write end-of-block record */
++ memset(&tmp, 0, sizeof(gd_resmov_t));
++ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
++ offset += sizeof(gd_resmov_t);
++ goto out;
++ }
++
++ memset(&tmp, 0, sizeof(gd_resmov_t));
++ tmp.rm_nodeid = cpu_to_be32(our_nodeid());
++ tmp.rm_length = cpu_to_be16(rsb->res_length);
++
++ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
++ offset += sizeof(gd_resmov_t);
++
++ memcpy(outbuf + offset, rsb->res_name, rsb->res_length);
++ offset += rsb->res_length;
++ }
++
++ /*
++ * If we've reached the end of the list (and there's room) write a
++ * terminating record.
++ */
++
++ if ((list == &ls->ls_rootres) &&
++ (offset + sizeof(gd_resmov_t) <= outlen)) {
++
++ memset(&tmp, 0, sizeof(gd_resmov_t));
++ /* This only needs to be non-zero */
++ tmp.rm_nodeid = cpu_to_be32(1);
++ /* and this must be zero */
++ tmp.rm_length = 0;
++ memcpy(outbuf + offset, &tmp, sizeof(gd_resmov_t));
++ offset += sizeof(gd_resmov_t);
++ }
++
++ out:
++ up_read(&ls->ls_rec_rsblist);
++ return offset;
++}
++
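++/*
++ * Look up the directory entry for a resource name, creating it if it
++ * doesn't exist. The bucket is searched under the read lock first; on a
++ * miss a new entry is allocated and the bucket re-searched under the
++ * write lock, so a duplicate added by a racing thread is detected and
++ * our allocation freed in favour of the existing entry.
++ */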
++int get_resdata(gd_ls_t *ls, uint32_t nodeid, char *name, int namelen,
++ gd_resdata_t **rdp, int recovery)
++{
++ gd_resdata_t *rd;
++ gd_resdata_t *tmp;
++ uint32_t bucket;
++
++ bucket = rd_hash(ls, name, namelen);
++
++ read_lock(&ls->ls_resdir_hash[bucket].rb_lock);
++ rd = search_rdbucket(ls, name, namelen, bucket);
++ read_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
++
++ if (rd)
++ goto out;
++
++ rd = allocate_resdata(ls, namelen);
++ if (!rd)
++ return -ENOMEM;
++
++ rd->rd_master_nodeid = nodeid;
++ rd->rd_length = namelen;
++ memcpy(rd->rd_name, name, namelen);
++
++ write_lock(&ls->ls_resdir_hash[bucket].rb_lock);
++ tmp = search_rdbucket(ls, name, namelen, bucket);
++ if (!tmp)
++ list_add_tail(&rd->rd_list,
++ &ls->ls_resdir_hash[bucket].rb_reslist);
++ write_unlock(&ls->ls_resdir_hash[bucket].rb_lock);
++
++ if (tmp) {
++ free_resdata(rd);
++ rd = tmp;
++ }
++
++ out:
++ *rdp = rd;
++
++ if (!recovery) {
++ if (++rd->rd_sequence == 0)
++ rd->rd_sequence++;
++ } else
++ rd->rd_sequence = 1;
++
++ return 0;
++}
++
++/*
++ * The node with lowest id queries all nodes to determine when all are done.
++ * All other nodes query the low nodeid for this.
++ */
++
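++/*
++ * e.g. in a lockspace with nodes {1,2,5}: node 1 waits for 2 and 5 to
++ * report RESDIR_VALID and then sets LSFL_ALL_RESDIR_VALID; nodes 2 and 5
++ * wait for node 1 to report RESDIR_ALL_VALID.
++ */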
++int resdir_rebuild_wait(gd_ls_t *ls)
++{
++ int error;
++
++ if (ls->ls_low_nodeid == our_nodeid()) {
++ error = gdlm_wait_status_all(ls, RESDIR_VALID);
++ if (!error)
++ set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
++ } else
++ error = gdlm_wait_status_low(ls, RESDIR_ALL_VALID);
++
++ return error;
++}
+diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h
+--- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/dir.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,30 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DIR_DOT_H__
++#define __DIR_DOT_H__
++
++uint32_t name_to_directory_nodeid(gd_ls_t * ls, char *name, int length);
++uint32_t get_directory_nodeid(gd_res_t * rsb);
++void remove_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
++ uint8_t sequence);
++int resdir_rebuild_local(gd_ls_t * ls);
++int resdir_rebuild_send(gd_ls_t * ls, char *inbuf, int inlen, char *outbuf,
++ int outlen, uint32_t nodeid);
++int get_resdata(gd_ls_t * ls, uint32_t nodeid, char *name, int namelen,
++ gd_resdata_t ** rdp, int recovery);
++int resdir_rebuild_wait(gd_ls_t * ls);
++void resdir_clear(gd_ls_t * ls);
++void resdir_dump(gd_ls_t * ls);
++
++#endif /* __DIR_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h
+--- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/dlm_internal.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,634 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DLM_INTERNAL_DOT_H__
++#define __DLM_INTERNAL_DOT_H__
++
++/*
++ * This is the main header file to be included in each DLM source file.
++ */
++
++#define DLM_RELEASE_NAME "<CVS>"
++
++#include <linux/slab.h>
++#include <linux/sched.h>
++#include <asm/semaphore.h>
++#include <linux/types.h>
++#include <linux/spinlock.h>
++#include <linux/vmalloc.h>
++#include <asm/uaccess.h>
++#include <linux/list.h>
++#include <linux/errno.h>
++#include <linux/random.h>
++
++#include <cluster/dlm.h>
++#include <cluster/dlm_device.h>
++#include <cluster/service.h>
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#if (BITS_PER_LONG == 64)
++#define PRIu64 "lu"
++#define PRId64 "ld"
++#define PRIo64 "lo"
++#define PRIx64 "lx"
++#define PRIX64 "lX"
++#define SCNu64 "lu"
++#define SCNd64 "ld"
++#define SCNo64 "lo"
++#define SCNx64 "lx"
++#define SCNX64 "lX"
++#else
++#define PRIu64 "Lu"
++#define PRId64 "Ld"
++#define PRIo64 "Lo"
++#define PRIx64 "Lx"
++#define PRIX64 "LX"
++#define SCNu64 "Lu"
++#define SCNd64 "Ld"
++#define SCNo64 "Lo"
++#define SCNx64 "Lx"
++#define SCNX64 "LX"
++#endif
++
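++/*
++ * Sleep interruptibly on `chan' if sleep_cond evaluates true. The task
++ * state is set to TASK_INTERRUPTIBLE before the condition is tested, so
++ * a wake_up() racing with the test is not lost. Note this schedules at
++ * most once, so callers typically re-test their condition in a loop.
++ */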
++#define wchan_cond_sleep_intr(chan, sleep_cond) \
++do \
++{ \
++ DECLARE_WAITQUEUE(__wait_chan, current); \
++ current->state = TASK_INTERRUPTIBLE; \
++ add_wait_queue(&chan, &__wait_chan); \
++ if ((sleep_cond)) \
++ schedule(); \
++ remove_wait_queue(&chan, &__wait_chan); \
++ current->state = TASK_RUNNING; \
++} \
++while (0)
++
++static inline int check_timeout(unsigned long stamp, unsigned int seconds)
++{
++ return time_after(jiffies, stamp + seconds * HZ);
++}
++
++
++#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args)
++
++#define log_all(ls, fmt, args...) \
++ do { \
++ printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
++ dlm_debug_log(ls, fmt, ##args); \
++ } while (0)
++
++#define log_error log_all
++
++
++#define DLM_DEBUG
++#if defined(DLM_DEBUG)
++#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args)
++#else
++#define log_debug(ls, fmt, args...)
++#endif
++
++#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL)
++#undef log_debug
++#define log_debug log_all
++#endif
++
++
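++/*
++ * Assert macro: `do' is a statement block executed before BUG() when the
++ * assertion fails, typically used to print diagnostic state, e.g.
++ *
++ *   GDLM_ASSERT(!error, printk("error %d\n", error););
++ *
++ * Pass an empty second argument when there is nothing extra to print.
++ */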
++#define GDLM_ASSERT(x, do) \
++{ \
++ if (!(x)) \
++ { \
++ dlm_debug_dump(); \
++ printk("\nDLM: Assertion failed on line %d of file %s\n" \
++ "DLM: assertion: \"%s\"\n" \
++ "DLM: time = %lu\n", \
++ __LINE__, __FILE__, #x, jiffies); \
++ {do} \
++ printk("\n"); \
++ BUG(); \
++ panic("DLM: Record message above and reboot.\n"); \
++ } \
++}
++
++
++struct gd_ls;
++struct gd_lkb;
++struct gd_res;
++struct gd_csb;
++struct gd_node;
++struct gd_resmov;
++struct gd_resdata;
++struct gd_recover;
++struct gd_recinfo;
++struct gd_resdir_bucket;
++struct gd_remlockreply;
++struct gd_remlockrequest;
++struct gd_rcom;
++
++typedef struct gd_ls gd_ls_t;
++typedef struct gd_lkb gd_lkb_t;
++typedef struct gd_res gd_res_t;
++typedef struct gd_csb gd_csb_t;
++typedef struct gd_node gd_node_t;
++typedef struct gd_resmov gd_resmov_t;
++typedef struct gd_resdata gd_resdata_t;
++typedef struct gd_recover gd_recover_t;
++typedef struct gd_resdir_bucket gd_resdir_bucket_t;
++typedef struct gd_rcom gd_rcom_t;
++
++/*
++ * Resource Data - an entry for a resource in the resdir hash table
++ */
++
++struct gd_resdata {
++ struct list_head rd_list;
++ uint32_t rd_master_nodeid;
++ uint16_t rd_length;
++ uint8_t rd_sequence;
++ char rd_name[1]; /* <rd_length> bytes */
++};
++
++/*
++ * Resource Directory Bucket - a hash bucket of resdata entries in the resdir
++ * hash table
++ */
++
++struct gd_resdir_bucket {
++ struct list_head rb_reslist;
++ rwlock_t rb_lock;
++};
++
++/*
++ * A resource description as moved between nodes
++ */
++
++struct gd_resmov {
++ uint32_t rm_nodeid;
++ uint16_t rm_length;
++ uint16_t rm_pad;
++};
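++/*
++ * rm_nodeid and rm_length are sent big-endian on the wire (see the
++ * cpu_to_be32/cpu_to_be16 calls in resdir_rebuild_send) and unpacked
++ * again by gdlm_resmov_in().
++ */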
++
++/*
++ * An entry in the lock ID table. Locks for this bucket are kept on list.
++ * Counter is used to assign an id to locks as they are added to this bucket.
++ */
++
++struct gd_lockidtbl_entry {
++ struct list_head list;
++ uint16_t counter;
++};
++
++/* Elements in the range array */
++
++#define GR_RANGE_START 0
++#define GR_RANGE_END 1
++#define RQ_RANGE_START 2
++#define RQ_RANGE_END 3
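++/*
++ * lkb_range is a 4-element uint64_t array indexed by the constants
++ * above: the GR_* slots hold the currently granted range, the RQ_* slots
++ * the requested range (see lkb_set_range() and grant_lock()).
++ */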
++
++/*
++ * Lockspace structure. The context for GDLM locks.
++ */
++
++#define RESHASHTBL_SIZE (256)
++
++#define RESDIRHASH_SHIFT (9)
++#define RESDIRHASH_SIZE (1 << RESDIRHASH_SHIFT)
++#define RESDIRHASH_MASK (RESDIRHASH_SIZE - 1)
++
++#define LSFL_WORK (0)
++#define LSFL_LS_RUN (1)
++#define LSFL_LS_STOP (2)
++#define LSFL_LS_START (3)
++#define LSFL_LS_FINISH (4)
++#define LSFL_RECCOMM_WAIT (5)
++#define LSFL_RECCOMM_READY (6)
++#define LSFL_NOTIMERS (7)
++#define LSFL_FINISH_RECOVERY (8)
++#define LSFL_RESDIR_VALID (9)
++#define LSFL_ALL_RESDIR_VALID (10)
++#define LSFL_NODES_VALID (11)
++#define LSFL_ALL_NODES_VALID (12)
++#define LSFL_REQUEST_WARN (13)
++
++#define LSST_NONE (0)
++#define LSST_INIT (1)
++#define LSST_INIT_DONE (2)
++#define LSST_CLEAR (3)
++#define LSST_WAIT_START (4)
++#define LSST_RECONFIG_DONE (5)
++
++struct gd_ls {
++ struct list_head ls_list; /* list of lockspaces */
++ uint32_t ls_local_id; /* local unique lockspace ID */
++ uint32_t ls_global_id; /* global unique lockspace ID */
++ int ls_allocation; /* Memory allocation policy */
++ unsigned long ls_flags; /* LSFL_ */
++
++ struct list_head ls_rootres; /* List of root resources */
++
++ int ls_hashsize;
++ int ls_hashmask;
++ struct list_head *ls_reshashtbl; /* Hash table for resources */
++ rwlock_t ls_reshash_lock; /* Lock for hash table */
++
++ struct gd_lockidtbl_entry *ls_lockidtbl;
++ uint32_t ls_lockidtbl_size; /* Size of lock id table */
++ rwlock_t ls_lockidtbl_lock;
++
++ struct list_head ls_nodes; /* current nodes in RC */
++ uint32_t ls_num_nodes; /* number of nodes in RC */
++ uint32_t ls_nodes_mask;
++ uint32_t ls_low_nodeid;
++
++ int ls_state; /* state changes for recovery */
++ struct list_head ls_recover; /* gr_recover_t structs */
++ int ls_last_stop; /* event ids from sm */
++ int ls_last_start;
++ int ls_last_finish;
++ spinlock_t ls_recover_lock;
++ struct list_head ls_nodes_gone; /* dead node list for recovery */
++
++ wait_queue_head_t ls_wait_general;
++
++ gd_rcom_t *ls_rcom;
++ uint32_t ls_rcom_msgid;
++ struct semaphore ls_rcom_lock;
++
++ struct list_head ls_recover_list;
++ int ls_recover_list_count;
++ spinlock_t ls_recover_list_lock;
++
++ struct rw_semaphore ls_in_recovery; /* held in write during
++ * recovery, read for normal
++ * locking ops */
++ struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a
++ * parent lock racing with a
++ * new child lock */
++
++ struct rw_semaphore ls_rec_rsblist; /* To prevent incoming recovery
++ * operations happening while
++ * we are purging */
++
++ struct rw_semaphore ls_gap_rsblist; /* To protect rootres list
++ * in grant_after_purge() which
++ * runs outside recovery */
++
++ struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees
++ * we are deserialising
++ */
++
++ struct list_head ls_deadlockq; /* List of locks in conversion ordered
++ * by duetime. for deadlock detection */
++
++ struct list_head ls_requestqueue; /* List of incoming requests
++ * held while we are in
++ * recovery */
++
++ gd_resdir_bucket_t ls_resdir_hash[RESDIRHASH_SIZE];
++
++ int ls_namelen;
++ char ls_name[1]; /* <namelen> bytes */
++};
++
++/*
++ * Cluster node (per node in cluster)
++ */
++
++struct gd_node {
++ struct list_head gn_list; /* global list of cluster nodes */
++ uint32_t gn_nodeid; /* cluster unique nodeid (cman) */
++ uint32_t gn_ipaddr; /* node's first IP address (cman) */
++ int gn_refcount; /* number of csb's referencing */
++};
++
++/*
++ * Cluster System Block (per node in a ls)
++ */
++
++struct gd_csb {
++ struct list_head csb_list; /* per-lockspace list of nodes */
++ gd_node_t *csb_node; /* global node structure */
++ int csb_gone_event; /* event id when node was removed */
++
++ uint32_t csb_names_send_count;
++ uint32_t csb_names_send_msgid;
++ uint32_t csb_names_recv_count;
++ uint32_t csb_names_recv_msgid;
++ uint32_t csb_locks_send_count;
++ uint32_t csb_locks_send_msgid;
++ uint32_t csb_locks_recv_count;
++ uint32_t csb_locks_recv_msgid;
++};
++
++/*
++ * Resource block
++ */
++
++/* status */
++
++#define GDLM_RESSTS_DIRENTRY 1 /* This is a directory entry */
++#define GDLM_RESSTS_LVBINVALID 2 /* The LVB is invalid */
++
++#define RESFL_NEW_MASTER (0)
++#define RESFL_RECOVER_LIST (1)
++
++struct gd_res {
++ struct list_head res_hashchain; /* Chain of resources in this hash
++ * bucket */
++
++ gd_ls_t *res_ls; /* The owning lockspace */
++
++ struct list_head res_rootlist; /* List of root resources in lockspace */
++
++ struct list_head res_subreslist; /* List of all sub-resources
++ * for this root res. */
++ /* This is a list head on the root res and holds the whole tree below
++ * it. */
++ uint8_t res_depth; /* Depth in resource tree */
++ uint16_t res_status;
++ unsigned long res_flags; /* Flags, RESFL_ */
++
++ struct list_head res_grantqueue;
++ struct list_head res_convertqueue;
++ struct list_head res_waitqueue;
++
++ uint32_t res_nodeid; /* nodeid of master node */
++
++ gd_res_t *res_root; /* If a subresource, this is our root */
++ gd_res_t *res_parent; /* Our parent resource (if any) */
++
++ atomic_t res_ref; /* No of lkb's */
++ uint16_t res_remasterid; /* ID used during remaster */
++ struct list_head res_recover_list; /* General list for use during
++ * recovery */
++ int res_recover_msgid;
++ int res_newlkid_expect;
++
++ struct rw_semaphore res_lock;
++
++ char *res_lvbptr; /* Lock value block */
++
++ uint8_t res_resdir_seq; /* Last directory sequence number */
++
++ uint8_t res_length;
++ char res_name[1]; /* <res_length> bytes */
++};
++
++/*
++ * Lock block. To avoid confusion, where flags mirror the
++ * public flags, they should have the same value.
++ */
++
++#define GDLM_LKSTS_NEW (0)
++#define GDLM_LKSTS_WAITING (1)
++#define GDLM_LKSTS_GRANTED (2)
++#define GDLM_LKSTS_CONVERT (3)
++
++#define GDLM_LKFLG_VALBLK (0x00000008)
++#define GDLM_LKFLG_PERSISTENT (0x00000080) /* Don't unlock when process exits */
++#define GDLM_LKFLG_NODLCKWT (0x00000100) /* Don't do deadlock detection */
++#define GDLM_LKFLG_EXPEDITE (0x00000400) /* Move to head of convert queue */
++
++/* Internal flags */
++#define GDLM_LKFLG_RANGE (0x00001000) /* Range field is present (remote protocol only) */
++#define GDLM_LKFLG_MSTCPY (0x00002000)
++#define GDLM_LKFLG_DELETED (0x00004000) /* LKB is being deleted */
++#define GDLM_LKFLG_DELAST (0x00008000) /* Delete after delivering AST */
++#define GDLM_LKFLG_LQRESEND (0x00010000) /* LKB on lockqueue must be resent */
++#define GDLM_LKFLG_DEMOTED (0x00020000)
++#define GDLM_LKFLG_RESENT (0x00040000)
++#define GDLM_LKFLG_NOREBUILD (0x00080000)
++#define GDLM_LKFLG_LQCONVERT (0x00100000)
++
++struct gd_lkb {
++ void *lkb_astaddr;
++ void *lkb_bastaddr;
++ long lkb_astparam;
++
++ uint32_t lkb_flags;
++ uint16_t lkb_status; /* LKSTS_ granted, waiting, converting */
++ int8_t lkb_rqmode; /* Requested lock mode */
++ int8_t lkb_grmode; /* Granted lock mode */
++ uint8_t lkb_bastmode; /* Requested mode returned in bast */
++ uint8_t lkb_highbast; /* Highest mode we have sent a BAST for */
++ uint32_t lkb_retstatus; /* Status to return in lksb */
++
++ uint32_t lkb_id; /* Our lock ID */
++ struct dlm_lksb *lkb_lksb; /* Lock status block of caller */
++ struct list_head lkb_idtbl_list; /* list pointer into the
++ * lockidtbl */
++
++ struct list_head lkb_statequeue; /* List of locks in this state */
++
++ struct list_head lkb_ownerqueue; /* List of locks owned by a
++ * process */
++
++ gd_lkb_t *lkb_parent; /* Pointer to parent if any */
++
++ atomic_t lkb_childcnt; /* Number of children */
++
++ struct list_head lkb_lockqueue; /* For when we are on the lock queue */
++ int lkb_lockqueue_state;
++ int lkb_lockqueue_flags; /* As passed into lock/unlock */
++ unsigned long lkb_lockqueue_time; /* Time we went on the lock
++ * queue */
++
++ gd_res_t *lkb_resource;
++
++ unsigned long lkb_duetime; /* For deadlock detection */
++
++ uint32_t lkb_remid; /* Remote partner */
++ uint32_t lkb_nodeid;
++
++ struct list_head lkb_astqueue; /* For when we are on the AST queue */
++ uint32_t lkb_asts_to_deliver;
++
++ struct gd_remlockrequest *lkb_request;
++
++ struct list_head lkb_deadlockq; /* on ls_deadlockq list */
++
++ char *lkb_lvbptr; /* Points to lksb on a local lock, allocated
++ * LVB (if necessary) on a remote lock */
++ uint64_t *lkb_range; /* Points to an array of 64 bit numbers that
++ * represent the requested and granted ranges
++ * of the lock. NULL implies 0-ffffffffffffffff
++ */
++};
++
++/*
++ * Used to save and manage recovery state for a lockspace.
++ */
++
++struct gd_recover {
++ struct list_head gr_list;
++ uint32_t *gr_nodeids;
++ int gr_node_count;
++ int gr_event_id;
++};
++
++/*
++ * Header part of the mid-level comms system. All packets start with
++ * this header so we can identify them. The comms packet can
++ * contain many of these structs but they are split into individual
++ * work units before being passed to the lockqueue routines.
++ * The structs that this is a header for follow below.
++ */
++
++struct gd_req_header {
++ uint8_t rh_cmd; /* What we are */
++ uint8_t rh_flags; /* maybe just a pad */
++ uint16_t rh_length; /* Length of struct (so we can send several in
++ * one message) */
++ uint32_t rh_lkid; /* Lock ID tag: ie the local (requesting) lock
++ * ID */
++ uint32_t rh_lockspace; /* Lockspace ID */
++};
++
++/*
++ * This is the struct used in a remote lock/unlock/convert request
++ * The mid-level comms API should turn this into native byte order.
++ * Most "normal" lock operations will use these two structs for
++ * communications. Recovery operations use their own structs
++ * but still with the gd_req_header on the front.
++ */
++
++struct gd_remlockrequest {
++ struct gd_req_header rr_header;
++
++ uint32_t rr_remlkid; /* Remote lock ID */
++ uint32_t rr_remparid; /* Parent's remote lock ID or 0 */
++ uint32_t rr_flags; /* Flags from lock/convert request */
++ uint64_t rr_range_start;/* Yes, these are in the right place... */
++ uint64_t rr_range_end;
++ uint32_t rr_status; /* Status to return if this is an AST request */
++ uint8_t rr_rqmode; /* Requested lock mode */
++ uint8_t rr_asts; /* Whether the LKB has ASTs or not */
++ uint8_t rr_resdir_seq; /* Directory sequence number */
++ char rr_lvb[DLM_LVB_LEN]; /* Value block */
++ char rr_name[1]; /* As long as needs be. Only used for directory
++ * lookups. The length of this can be worked
++ * out from the packet length */
++};
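++/* For example, remote_stage2() in locking.c recovers the name length as
++ * rh_length - sizeof(struct gd_remlockrequest) + 1, the +1 accounting
++ * for the rr_name[1] placeholder byte. */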
++
++/*
++ * This is the struct returned by a remote lock/unlock/convert request
++ * The mid-level comms API should turn this into native byte order.
++ */
++
++struct gd_remlockreply {
++ struct gd_req_header rl_header;
++
++ uint32_t rl_lockstate; /* Whether request was queued/granted/waiting */
++ uint32_t rl_nodeid; /* nodeid of lock master */
++ uint32_t rl_status; /* Status to return to caller */
++ uint32_t rl_lkid; /* Remote lkid */
++ uint8_t rl_resdir_seq; /* Returned directory sequence number */
++ char rl_lvb[DLM_LVB_LEN]; /* LVB itself */
++};
++
++/*
++ * Recovery comms message
++ */
++
++struct gd_rcom {
++ struct gd_req_header rc_header; /* 32 byte aligned */
++ uint32_t rc_msgid;
++ uint16_t rc_datalen;
++ uint8_t rc_expanded;
++ uint8_t rc_subcmd; /* secondary command */
++ char rc_buf[1]; /* first byte of data goes here and extends
++ * beyond here for another datalen - 1 bytes.
++ * rh_length is set to sizeof(gd_rcom_t) +
++ * datalen - 1 */
++};
++
++
++/* A remote query: GDLM_REMCMD_QUERY */
++struct gd_remquery {
++ struct gd_req_header rq_header;
++
++ uint32_t rq_mstlkid; /* LockID on master node */
++ uint32_t rq_query; /* query from the user */
++ uint32_t rq_maxlocks; /* max number of locks we can cope with */
++};
++
++/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */
++/* There may be subsequent blocks of lock info in GDLM_REMCMD_QUERYCONT
++ * messages which just have a normal header. The last of these will have
++ * rh_flags set to GDLM_REMFLAG_ENDQUERY.
++ */
++struct gd_remqueryreply {
++ struct gd_req_header rq_header;
++
++ uint32_t rq_numlocks; /* Number of locks in reply */
++ uint32_t rq_startlock; /* Which lock this block starts at (for multiple block replies) */
++ uint32_t rq_status;
++
++ /* Resource information */
++ uint32_t rq_grantcount; /* No. of nodes on grant queue */
++ uint32_t rq_convcount; /* No. of nodes on convert queue */
++ uint32_t rq_waitcount; /* No. of nodes on wait queue */
++ char rq_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable */
++};
++
++/*
++ * Lockqueue wait lock states
++ */
++
++#define GDLM_LQSTATE_WAIT_RSB 1
++#define GDLM_LQSTATE_WAIT_CONVERT 2
++#define GDLM_LQSTATE_WAIT_CONDGRANT 3
++#define GDLM_LQSTATE_WAIT_UNLOCK 4
++
++/* Commands sent across the comms link */
++#define GDLM_REMCMD_LOOKUP 1
++#define GDLM_REMCMD_LOCKREQUEST 2
++#define GDLM_REMCMD_UNLOCKREQUEST 3
++#define GDLM_REMCMD_CONVREQUEST 4
++#define GDLM_REMCMD_LOCKREPLY 5
++#define GDLM_REMCMD_LOCKGRANT 6
++#define GDLM_REMCMD_SENDBAST 7
++#define GDLM_REMCMD_SENDCAST 8
++#define GDLM_REMCMD_REM_RESDATA 9
++#define GDLM_REMCMD_RECOVERMESSAGE 20
++#define GDLM_REMCMD_RECOVERREPLY 21
++#define GDLM_REMCMD_QUERY 30
++#define GDLM_REMCMD_QUERYREPLY 31
++
++/* Set in rh_flags when this is the last block of
++ query information. Note this could also be the first
++ block */
++#define GDLM_REMFLAG_ENDQUERY 1
++
++/*
++ * This is both a parameter to queue_ast and the bitmap of ASTs in
++ * lkb_asts_to_deliver.
++ */
++
++typedef enum { GDLM_QUEUE_COMPAST = 1, GDLM_QUEUE_BLKAST = 2 } gd_ast_type_t;
++
++#ifndef BUG_ON
++#define BUG_ON(x)
++#endif
++
++void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...);
++void dlm_debug_dump(void);
++
++#endif /* __DLM_INTERNAL_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c
+--- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lkb.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,225 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * lkb.c
++ *
++ * Allocate and free locks on the lock ID table.
++ *
++ * This is slightly naff but I don't really like the
++ * VMS lockidtbl stuff as it uses a realloced array
++ * to hold the locks in. I think this is slightly better
++ * in some ways.
++ *
++ * Any better suggestions gratefully received. Patrick
++ *
++ */
++
++#include "dlm_internal.h"
++#include "lockqueue.h"
++#include "lkb.h"
++#include "config.h"
++#include "rsb.h"
++#include "memory.h"
++#include "lockspace.h"
++#include "util.h"
++
++/*
++ * Internal find lock by ID. Must be called with the lockidtbl spinlock held.
++ */
++
++static gd_lkb_t *__find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
++{
++ uint16_t entry = lkid & 0xFFFF;
++ gd_lkb_t *lkb;
++
++ if (entry >= ls->ls_lockidtbl_size)
++ goto out;
++
++ list_for_each_entry(lkb, &ls->ls_lockidtbl[entry].list, lkb_idtbl_list){
++ if (lkb->lkb_id == lkid)
++ return lkb;
++ }
++
++ out:
++ return NULL;
++}
++
++/*
++ * Should be called at lockspace initialisation time.
++ */
++
++int init_lockidtbl(gd_ls_t *ls, int entries)
++{
++ int i;
++
++ /* Make sure it's a power of two */
++ GDLM_ASSERT(!(entries & (entries - 1)),);
++
++ ls->ls_lockidtbl_size = entries;
++ rwlock_init(&ls->ls_lockidtbl_lock);
++
++ ls->ls_lockidtbl = kmalloc(entries * sizeof(struct gd_lockidtbl_entry),
++ GFP_KERNEL);
++ if (!ls->ls_lockidtbl)
++ return -ENOMEM;
++
++ for (i = 0; i < entries; i++) {
++ INIT_LIST_HEAD(&ls->ls_lockidtbl[i].list);
++ ls->ls_lockidtbl[i].counter = 1;
++ }
++
++ return 0;
++}
++
++/*
++ * Free up the space - returns an error if there are still locks hanging around
++ */
++
++int free_lockidtbl(gd_ls_t *ls)
++{
++ int i;
++
++ write_lock(&ls->ls_lockidtbl_lock);
++
++ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
++ if (!list_empty(&ls->ls_lockidtbl[i].list)) {
++ write_unlock(&ls->ls_lockidtbl_lock);
++ return -1;
++ }
++ }
++ kfree(ls->ls_lockidtbl);
++
++ write_unlock(&ls->ls_lockidtbl_lock);
++
++ return 0;
++}
++
++/*
++ * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a
++ * random number between 0 and lockidtbl_size-1. This random number specifies
++ * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially
++ * assigned per-bucket id.
++ *
++ * Because the 16 bit id's per bucket can roll over, a new lkid must be checked
++ * against the lkid of all lkb's in the bucket to avoid duplication.
++ *
++ */
++
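++/*
++ * Worked example (illustrative numbers): with a lockidtbl of 1024
++ * entries, a random bucket of 0x01a3 and a bucket counter of 7 give
++ * lkid = 0x01a3 | (7 << 16) = 0x000701a3. __find_lock_by_id() recovers
++ * the bucket as 0x000701a3 & 0xFFFF = 0x01a3.
++ */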
++gd_lkb_t *create_lkb(gd_ls_t *ls)
++{
++ gd_lkb_t *lkb;
++ uint32_t lkid;
++ uint16_t bucket;
++
++ lkb = allocate_lkb(ls);
++ if (!lkb)
++ goto out;
++
++ write_lock(&ls->ls_lockidtbl_lock);
++ do {
++ get_random_bytes(&bucket, sizeof(bucket));
++ bucket &= (ls->ls_lockidtbl_size - 1);
++ lkid = bucket | (ls->ls_lockidtbl[bucket].counter++ << 16);
++ }
++ while (__find_lock_by_id(ls, lkid));
++
++ lkb->lkb_id = (uint32_t) lkid;
++ list_add(&lkb->lkb_idtbl_list, &ls->ls_lockidtbl[bucket].list);
++ write_unlock(&ls->ls_lockidtbl_lock);
++
++ out:
++ return lkb;
++}
++
++/*
++ * Free LKB and remove it from the lockidtbl.
++ * NB - this always frees the lkb whereas release_rsb doesn't free an
++ * rsb unless its reference count is zero.
++ */
++
++void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb)
++{
++ if (lkb->lkb_status) {
++ log_error(ls, "release lkb with status %u", lkb->lkb_status);
++ print_lkb(lkb);
++ return;
++ }
++
++ if (lkb->lkb_parent)
++ atomic_dec(&lkb->lkb_parent->lkb_childcnt);
++
++ write_lock(&ls->ls_lockidtbl_lock);
++ list_del(&lkb->lkb_idtbl_list);
++ write_unlock(&ls->ls_lockidtbl_lock);
++
++ /* if this is not a master copy then lvbptr points into the user's
++ * lksb, so don't free it */
++ if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
++ free_lvb(lkb->lkb_lvbptr);
++
++ if (lkb->lkb_range)
++ free_range(lkb->lkb_range);
++
++ free_lkb(lkb);
++}
++
++gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid)
++{
++ gd_lkb_t *lkb;
++
++ read_lock(&ls->ls_lockidtbl_lock);
++ lkb = __find_lock_by_id(ls, lkid);
++ read_unlock(&ls->ls_lockidtbl_lock);
++
++ return lkb;
++}
++
++gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid)
++{
++ gd_ls_t *lspace = find_lockspace_by_local_id(ls);
++ return find_lock_by_id(lspace, lkid);
++}
++
++/*
++ * Initialise the range parts of an LKB.
++ */
++
++int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end)
++{
++ int ret = -ENOMEM;
++
++ /*
++ * if this wasn't already a range lock, make it one
++ */
++ if (!lkb->lkb_range) {
++ lkb->lkb_range = allocate_range(lspace);
++ if (!lkb->lkb_range)
++ goto out;
++
++ /*
++ * This is needed for conversions that contain ranges where the
++ * original lock didn't but it's harmless for new locks too.
++ */
++ lkb->lkb_range[GR_RANGE_START] = 0LL;
++ lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
++ }
++
++ lkb->lkb_range[RQ_RANGE_START] = start;
++ lkb->lkb_range[RQ_RANGE_END] = end;
++
++ ret = 0;
++
++ out:
++ return ret;
++}
+diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h
+--- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lkb.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,27 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LKB_DOT_H__
++#define __LKB_DOT_H__
++
++int free_lockidtbl(gd_ls_t * lspace);
++int init_lockidtbl(gd_ls_t * lspace, int entries);
++
++gd_lkb_t *find_lock_by_id(gd_ls_t *ls, uint32_t lkid);
++gd_lkb_t *create_lkb(gd_ls_t *ls);
++void release_lkb(gd_ls_t *ls, gd_lkb_t *lkb);
++gd_lkb_t *dlm_get_lkb(void *ls, uint32_t lkid);
++int verify_lkb_nodeids(gd_ls_t *ls);
++int lkb_set_range(gd_ls_t *lspace, gd_lkb_t *lkb, uint64_t start, uint64_t end);
++
++#endif /* __LKB_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c
+--- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/locking.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,1225 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * locking.c
++ *
++ * This is where the main work of the DLM goes on
++ *
++ */
++
++#include "dlm_internal.h"
++#include "lockqueue.h"
++#include "locking.h"
++#include "lockspace.h"
++#include "lkb.h"
++#include "nodes.h"
++#include "dir.h"
++#include "ast.h"
++#include "memory.h"
++#include "rsb.h"
++
++#define MAX(a, b) (((a) > (b)) ? (a) : (b))
++
++/*
++ * Lock compatibility matrix - thanks Steve
++ * UN = Unlocked state. Not really a state, used as a flag
++ * PD = Padding. Used to make the matrix a nice power of two in size
++ * Other states are the same as the VMS DLM.
++ * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
++ */
++
++#define modes_compat(gr, rq) \
++ __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
++
++const int __dlm_compat_matrix[8][8] = {
++ /* UN NL CR CW PR PW EX PD */
++ {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
++ {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
++ {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
++ {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
++ {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
++ {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
++ {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
++ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
++};
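++/*
++ * Reading the table: a granted PR lock (row PR) is compatible with NL,
++ * CR and PR requests but not with CW, PW or EX, so e.g.
++ * __dlm_compat_matrix[DLM_LOCK_PR + 1][DLM_LOCK_PW + 1] == 0 and a PW
++ * request must wait for the PR holder.
++ */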
++
++/*
++ * Compatibility matrix for conversions with QUECVT set.
++ * Granted mode is the row; requested mode is the column.
++ * Usage: matrix[grmode+1][rqmode+1]
++ */
++
++const int __quecvt_compat_matrix[8][8] = {
++ /* UN NL CR CW PR PW EX PD */
++ {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
++ {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
++ {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
++ {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
++ {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
++ {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
++ {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
++ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
++};
++
++/*
++ * This defines the direction of transfer of LVB data.
++ * Granted mode is the row; requested mode is the column.
++ * Usage: matrix[grmode+1][rqmode+1]
++ * 1 = LVB is returned to the caller
++ * 0 = LVB is written to the resource
++ * -1 = nothing happens to the LVB
++ */
++
++const int __lvb_operations[8][8] = {
++ /* UN NL CR CW PR PW EX PD*/
++ { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
++ { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
++ { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
++ { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
++ { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
++ { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
++ { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
++ { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
++};
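++/*
++ * For example: granting EX to a lock that held NL gives
++ * __lvb_operations[DLM_LOCK_NL + 1][DLM_LOCK_EX + 1] == 1, so the
++ * resource's LVB is copied out to the caller; down-converting EX to NL
++ * gives __lvb_operations[DLM_LOCK_EX + 1][DLM_LOCK_NL + 1] == 0, so the
++ * caller's LVB is written to the resource. Note grant_lock() below only
++ * tests the value for non-zero, so -1 currently behaves like 1 there.
++ */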
++
++static void grant_lock(gd_lkb_t * lkb, int send_remote);
++static void send_blocking_asts(gd_res_t * rsb, gd_lkb_t * lkb);
++static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb);
++static int convert_lock(gd_ls_t * ls, int mode, struct dlm_lksb *lksb,
++ int flags, void *ast, void *astarg, void *bast,
++ struct dlm_range *range);
++static int dlm_lock_stage1(gd_ls_t * lspace, gd_lkb_t * lkb, int flags,
++ char *name, int namelen);
++
++
++static inline int first_in_list(gd_lkb_t *lkb, struct list_head *head)
++{
++ gd_lkb_t *first = list_entry(head->next, gd_lkb_t, lkb_statequeue);
++
++ if (lkb->lkb_id == first->lkb_id)
++ return 1;
++
++ return 0;
++}
++
++/*
++ * Return 1 if the locks' ranges overlap
++ * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
++ */
++
++static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
++{
++ if (!lkb1->lkb_range || !lkb2->lkb_range)
++ return 1;
++
++ if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
++ lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
++ return 0;
++
++ return 1;
++}
++
++/*
++ * Resolve conversion deadlock by changing to NL the granted mode of deadlocked
++ * locks on the convert queue. One of the deadlocked locks is allowed to
++ * retain its original granted state (we choose the lkb provided although it
++ * shouldn't matter which.) We do not change the granted mode on locks without
++ * the CONVDEADLK flag. If any of these exist (there shouldn't if the app uses
++ * the flag consistently) the false return value is used.
++ */
++
++static int conversion_deadlock_resolve(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ gd_lkb_t *this;
++ int rv = TRUE;
++
++ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
++ if (this == lkb)
++ continue;
++
++ if (!ranges_overlap(lkb, this))
++ continue;
++
++ if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) {
++
++ if (!(this->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK)){
++ rv = FALSE;
++ continue;
++ }
++ this->lkb_grmode = DLM_LOCK_NL;
++ this->lkb_flags |= GDLM_LKFLG_DEMOTED;
++ }
++ }
++ return rv;
++}
++
++/*
++ * "A conversion deadlock arises with a pair of lock requests in the converting
++ * queue for one resource. The granted mode of each lock blocks the requested
++ * mode of the other lock."
++ */
++
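++/*
++ * The classic case: two locks both granted PR, both queued to convert
++ * to EX. Each one's granted PR blocks the other's requested EX, so
++ * neither conversion can complete without intervention.
++ */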
++static int conversion_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ gd_lkb_t *this;
++
++ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
++ if (this == lkb)
++ continue;
++
++ if (!ranges_overlap(lkb, this))
++ continue;
++
++ if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
++ return TRUE;
++ }
++ return FALSE;
++}
++
++/*
++ * Check if the given lkb conflicts with another lkb on the queue.
++ */
++
++static int queue_conflict(struct list_head *head, gd_lkb_t *lkb)
++{
++ gd_lkb_t *this;
++
++ list_for_each_entry(this, head, lkb_statequeue) {
++ if (this == lkb)
++ continue;
++ if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
++ return TRUE;
++ }
++ return FALSE;
++}
++
++/*
++ * Deadlock can arise when using the QUECVT flag if the requested mode of the
++ * first converting lock is incompatible with the granted mode of another
++ * converting lock further down the queue. To prevent this deadlock, a
++ * requested QUECVT lock is granted immediately if adding it to the end of
++ * the queue would prevent a lock ahead of it from being granted.
++ */
++
++static int queuecvt_deadlock_detect(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ gd_lkb_t *this;
++
++ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
++ if (this == lkb)
++ break;
++
++ if (ranges_overlap(lkb, this) && !modes_compat(lkb, this))
++ return TRUE;
++ }
++ return FALSE;
++}
++
++/*
++ * Return 1 if the lock can be granted, 0 otherwise.
++ * Also detect and resolve conversion deadlocks.
++ */
++
++static int can_be_granted(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ if (lkb->lkb_rqmode == DLM_LOCK_NL)
++ return TRUE;
++
++ if (lkb->lkb_rqmode == lkb->lkb_grmode)
++ return TRUE;
++
++ if (queue_conflict(&rsb->res_grantqueue, lkb))
++ return FALSE;
++
++ if (!queue_conflict(&rsb->res_convertqueue, lkb)) {
++ if (!(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
++ return TRUE;
++
++ if (list_empty(&rsb->res_convertqueue) ||
++ first_in_list(lkb, &rsb->res_convertqueue) ||
++ queuecvt_deadlock_detect(rsb, lkb))
++ return TRUE;
++ else
++ return FALSE;
++ }
++
++ /* there *is* a conflict between this lkb and a converting lock so
++ we return false unless conversion deadlock resolution is permitted
++ (only conversion requests will have the CONVDEADLK flag set) */
++
++ if (!(lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK))
++ return FALSE;
++
++ if (!conversion_deadlock_detect(rsb, lkb))
++ return FALSE;
++
++ if (conversion_deadlock_resolve(rsb, lkb))
++ return TRUE;
++
++ return FALSE;
++}
++
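++/*
++ * Example of a minimal caller (illustrative only - "astcb" and the
++ * argument values are hypothetical):
++ *
++ *   static struct dlm_lksb lksb;
++ *   error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0, "myres", 5, 0,
++ *                    astcb, &lksb, NULL, NULL);
++ *
++ * On success the request completes asynchronously: astcb() runs with
++ * the final status in lksb.sb_status and the lock id in lksb.sb_lkid.
++ */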
++int dlm_lock(void *lockspace,
++ uint32_t mode,
++ struct dlm_lksb *lksb,
++ uint32_t flags,
++ void *name,
++ unsigned int namelen,
++ uint32_t parent,
++ void (*ast) (void *astarg),
++ void *astarg,
++ void (*bast) (void *astarg, int mode),
++ struct dlm_range *range)
++{
++ gd_ls_t *lspace;
++ gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
++ int ret = -EINVAL;
++
++ lspace = find_lockspace_by_local_id(lockspace);
++ if (!lspace)
++ goto out;
++
++ if (mode < 0 || mode > DLM_LOCK_EX)
++ goto out;
++
++ if (namelen > DLM_RESNAME_MAXLEN)
++ goto out;
++
++ if (flags & DLM_LKF_CANCEL)
++ goto out;
++
++ if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
++ goto out;
++
++ if (flags & DLM_LKF_EXPEDITE && !(flags & DLM_LKF_CONVERT))
++ goto out;
++
++ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
++ goto out;
++
++ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
++ goto out;
++
++ if (!ast || !lksb)
++ goto out;
++
++	if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
++		goto out;
++
++ /*
++ * Take conversion path.
++ */
++
++ if (flags & DLM_LKF_CONVERT) {
++ ret = convert_lock(lspace, mode, lksb, flags, ast, astarg,
++ bast, range);
++ goto out;
++ }
++
++ /*
++ * Take new lock path.
++ */
++
++ if (parent) {
++ down_read(&lspace->ls_unlock_sem);
++
++ parent_lkb = find_lock_by_id(lspace, parent);
++
++ if (!parent_lkb ||
++ parent_lkb->lkb_flags & GDLM_LKFLG_DELETED ||
++ parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY ||
++ parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) {
++ up_read(&lspace->ls_unlock_sem);
++ goto out;
++ }
++
++ atomic_inc(&parent_lkb->lkb_childcnt);
++ up_read(&lspace->ls_unlock_sem);
++ }
++
++ down_read(&lspace->ls_in_recovery);
++
++ ret = -ENOMEM;
++
++ lkb = create_lkb(lspace);
++ if (!lkb)
++ goto fail_dec;
++ lkb->lkb_astaddr = ast;
++ lkb->lkb_astparam = (long) astarg;
++ lkb->lkb_bastaddr = bast;
++ lkb->lkb_rqmode = mode;
++ lkb->lkb_grmode = DLM_LOCK_IV;
++ lkb->lkb_lksb = lksb;
++ lkb->lkb_parent = parent_lkb;
++ lkb->lkb_lockqueue_flags = flags;
++ lkb->lkb_lvbptr = lksb->sb_lvbptr;
++
++ /* Copy the range if appropriate */
++ if (range) {
++ if (range->ra_start > range->ra_end) {
++ ret = -EINVAL;
++ goto fail_free;
++ }
++
++ if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end))
++ goto fail_free;
++ }
++
++ /* Convert relevant flags to internal numbers */
++ if (flags & DLM_LKF_VALBLK)
++ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
++ if (flags & DLM_LKF_PERSISTENT)
++ lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT;
++ if (flags & DLM_LKF_NODLCKWT)
++ lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
++
++ lksb->sb_lkid = lkb->lkb_id;
++
++ ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen);
++ if (ret)
++ goto fail_free;
++
++ up_read(&lspace->ls_in_recovery);
++
++ wake_astd();
++
++ return 0;
++
++ fail_free:
++ release_lkb(lspace, lkb);
++ goto fail_unlock;
++
++ fail_dec:
++ if (parent_lkb)
++ atomic_dec(&parent_lkb->lkb_childcnt);
++
++ fail_unlock:
++ up_read(&lspace->ls_in_recovery);
++
++ out:
++ return ret;
++}
++
++int dlm_lock_stage1(gd_ls_t *ls, gd_lkb_t *lkb, int flags, char *name,
++ int namelen)
++{
++	gd_res_t *rsb = NULL, *parent_rsb = NULL;
++ gd_lkb_t *parent_lkb = lkb->lkb_parent;
++ gd_resdata_t *rd;
++ uint32_t nodeid;
++ int error;
++
++ if (parent_lkb)
++ parent_rsb = parent_lkb->lkb_resource;
++
++ error = find_or_create_rsb(ls, parent_rsb, name, namelen, 1, &rsb);
++ if (error)
++ goto out;
++
++ lkb->lkb_resource = rsb;
++ lkb->lkb_nodeid = rsb->res_nodeid;
++
++ /*
++ * Next stage, do we need to find the master or can
++ * we get on with the real locking work ?
++ */
++
++ if (rsb->res_nodeid == -1) {
++ if (get_directory_nodeid(rsb) != our_nodeid()) {
++ error = remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
++ goto out;
++ }
++
++ error = get_resdata(ls, our_nodeid(), rsb->res_name,
++ rsb->res_length, &rd, 0);
++ if (error)
++ goto out;
++
++ nodeid = rd->rd_master_nodeid;
++ if (nodeid == our_nodeid())
++ nodeid = 0;
++ rsb->res_nodeid = nodeid;
++ lkb->lkb_nodeid = nodeid;
++ rsb->res_resdir_seq = rd->rd_sequence;
++ }
++
++ error = dlm_lock_stage2(ls, lkb, rsb, flags);
++
++ out:
++	if (error && rsb)
++		release_rsb(rsb);
++
++ return error;
++}
++
++/*
++ * Locking routine called after we have an RSB, either a copy of a remote one
++ * or a local one, or perhaps a shiny new one all of our very own
++ */
++
++int dlm_lock_stage2(gd_ls_t *ls, gd_lkb_t *lkb, gd_res_t *rsb, int flags)
++{
++ int error = 0;
++
++ if (rsb->res_nodeid) {
++ res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
++ error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT);
++ } else {
++ dlm_lock_stage3(lkb);
++ }
++
++ return error;
++}
++
++/*
++ * Called on an RSB's master node to do stage2 locking for a remote lock
++ * request. Returns a proper lkb with rsb ready for lock processing.
++ * This is analogous to sections of dlm_lock() and dlm_lock_stage1().
++ */
++
++gd_lkb_t *remote_stage2(int remote_nodeid, gd_ls_t *ls,
++ struct gd_remlockrequest *freq)
++{
++ gd_res_t *rsb = NULL, *parent_rsb = NULL;
++ gd_lkb_t *lkb = NULL, *parent_lkb = NULL;
++ int error, namelen;
++
++ if (freq->rr_remparid) {
++ parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
++ if (!parent_lkb)
++ goto fail;
++
++ atomic_inc(&parent_lkb->lkb_childcnt);
++ parent_rsb = parent_lkb->lkb_resource;
++ }
++
++ /*
++ * A new MSTCPY lkb. Initialize lkb fields including the real lkid and
++ * node actually holding the (non-MSTCPY) lkb. AST addresses are just
++ * flags in the master copy.
++ */
++
++ lkb = create_lkb(ls);
++ if (!lkb)
++ goto fail_dec;
++ lkb->lkb_grmode = DLM_LOCK_IV;
++ lkb->lkb_rqmode = freq->rr_rqmode;
++ lkb->lkb_parent = parent_lkb;
++ lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & GDLM_QUEUE_COMPAST);
++ lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & GDLM_QUEUE_BLKAST);
++ lkb->lkb_nodeid = remote_nodeid;
++ lkb->lkb_remid = freq->rr_header.rh_lkid;
++ lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
++ lkb->lkb_lockqueue_flags = freq->rr_flags;
++
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
++ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
++ allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
++ if (!lkb->lkb_lvbptr)
++ goto fail_free;
++ }
++
++ if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
++ error = lkb_set_range(ls, lkb, freq->rr_range_start,
++ freq->rr_range_end);
++ if (error)
++ goto fail_free;
++ }
++
++ /*
++ * Get the RSB which this lock is for. Create a new RSB if this is a
++ * new lock on a new resource. We must be the master of any new rsb.
++ */
++
++ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
++
++ error = find_or_create_rsb(ls, parent_rsb, freq->rr_name, namelen, 1,
++ &rsb);
++ if (error)
++ goto fail_free;
++
++ lkb->lkb_resource = rsb;
++ if (rsb->res_nodeid == -1)
++ rsb->res_nodeid = 0;
++ if (freq->rr_resdir_seq)
++ rsb->res_resdir_seq = freq->rr_resdir_seq;
++
++ return lkb;
++
++
++ fail_free:
++ /* release_lkb handles parent */
++ release_lkb(ls, lkb);
++ parent_lkb = NULL;
++
++ fail_dec:
++ if (parent_lkb)
++ atomic_dec(&parent_lkb->lkb_childcnt);
++ fail:
++ return NULL;
++}
++
++/*
++ * The final bit of lock request processing on the master node. Here the lock
++ * is granted and the completion ast is queued, or the lock is put on the
++ * waitqueue and blocking asts are sent.
++ */
++
++void dlm_lock_stage3(gd_lkb_t *lkb)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++
++ /*
++ * This is a locally mastered lock on a resource that already exists,
++ * see if it can be granted or if it must wait. When this function is
++ * called for a remote lock request (process_cluster_request,
++ * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
++ * requesting node at the end of process_cluster_request, not at the
++ * end of grant_lock.
++ */
++
++ down_write(&rsb->res_lock);
++
++ if (can_be_granted(rsb, lkb)) {
++ grant_lock(lkb, 0);
++ goto out;
++ }
++
++ /*
++ * This request is not a conversion, so the lkb didn't exist other than
++ * for this request and should be freed after EAGAIN is returned in the
++ * ast.
++ */
++
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ lkb->lkb_retstatus = -EAGAIN;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
++ send_blocking_asts_all(rsb, lkb);
++ goto out;
++ }
++
++ /*
++ * The requested lkb must wait. Because the rsb of the requested lkb
++ * is mastered here, send blocking asts for the lkb's blocking the
++ * request.
++ */
++
++ lkb->lkb_retstatus = 0;
++ lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
++
++ send_blocking_asts(rsb, lkb);
++
++ out:
++ up_write(&rsb->res_lock);
++}
++
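++/*
++ * Example (continuing the illustrative dlm_lock() caller above):
++ *
++ *   error = dlm_unlock(ls, lksb.sb_lkid, 0, NULL, NULL);
++ *
++ * The unlock completes asynchronously too - the completion ast fires
++ * with sb_status set to -DLM_EUNLOCK once the lock is gone.
++ */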
++int dlm_unlock(void *lockspace,
++ uint32_t lkid,
++ uint32_t flags,
++ struct dlm_lksb *lksb,
++ void *astarg)
++{
++ gd_ls_t *ls = find_lockspace_by_local_id(lockspace);
++ gd_lkb_t *lkb;
++ gd_res_t *rsb;
++ int ret = -EINVAL;
++
++ if (!ls)
++ goto out;
++
++ lkb = find_lock_by_id(ls, lkid);
++ if (!lkb)
++ goto out;
++
++ /* Can't dequeue a master copy (a remote node's mastered lock) */
++ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
++ goto out;
++
++ /* Already waiting for a remote lock operation */
++ if (lkb->lkb_lockqueue_state) {
++ ret = -EBUSY;
++ goto out;
++ }
++
++ /* Can only cancel WAITING or CONVERTing locks.
++	 * This is just a quick check - it is also checked in dlm_unlock_stage2()
++ * (which may be on the master) under the semaphore.
++ */
++ if ((flags & DLM_LKF_CANCEL) &&
++ (lkb->lkb_status == GDLM_LKSTS_GRANTED))
++ goto out;
++
++ /* "Normal" unlocks must operate on a granted lock */
++ if (!(flags & DLM_LKF_CANCEL) &&
++ (lkb->lkb_status != GDLM_LKSTS_GRANTED))
++ goto out;
++
++ down_write(&ls->ls_unlock_sem);
++
++ /* Can't dequeue a lock with sublocks */
++ if (atomic_read(&lkb->lkb_childcnt)) {
++ up_write(&ls->ls_unlock_sem);
++ ret = -ENOTEMPTY;
++ goto out;
++ }
++
++ /* Mark it as deleted so we can't use it as a parent in dlm_lock() */
++ if (!(flags & DLM_LKF_CANCEL))
++ lkb->lkb_flags |= GDLM_LKFLG_DELETED;
++ up_write(&ls->ls_unlock_sem);
++
++ /* Save any new params */
++ if (lksb)
++ lkb->lkb_lksb = lksb;
++ if (astarg)
++ lkb->lkb_astparam = (long) astarg;
++
++ lkb->lkb_lockqueue_flags = flags;
++
++ rsb = lkb->lkb_resource;
++
++ down_read(&ls->ls_in_recovery);
++
++ if (rsb->res_nodeid)
++ ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
++ else
++ ret = dlm_unlock_stage2(lkb, flags);
++
++ up_read(&ls->ls_in_recovery);
++
++ wake_astd();
++
++ out:
++ return ret;
++}
++
++int dlm_unlock_stage2(gd_lkb_t *lkb, uint32_t flags)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++ int old_status;
++ int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;
++
++ down_write(&rsb->res_lock);
++
++ /* Can only cancel WAITING or CONVERTing locks */
++ if ((flags & DLM_LKF_CANCEL) &&
++ (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
++ lkb->lkb_retstatus = -EINVAL;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ goto out;
++ }
++
++ old_status = lkb_dequeue(lkb);
++
++ /*
++	 * If it was granted, grant any converting or waiting locks.
++ */
++
++ if (old_status == GDLM_LKSTS_GRANTED)
++ grant_pending_locks(rsb);
++
++ /*
++ * Cancelling a conversion
++ */
++
++ if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
++ /* VMS semantics say we should send blocking ASTs again here */
++ send_blocking_asts(rsb, lkb);
++
++ /* Remove from deadlock detection */
++ if (lkb->lkb_duetime)
++ remove_from_deadlockqueue(lkb);
++
++ /* Stick it back on the granted queue */
++ lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++ lkb->lkb_rqmode = lkb->lkb_grmode;
++
++ /* Was it blocking any other locks? */
++ if (first_in_list(lkb, &rsb->res_convertqueue))
++ grant_pending_locks(rsb);
++
++ lkb->lkb_retstatus = -DLM_ECANCEL;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ goto out;
++ }
++
++ /*
++ * The lvb can be saved or cleared on unlock.
++ */
++
++ if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
++ if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
++ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
++ if (flags & DLM_LKF_IVVALBLK)
++ memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
++ }
++
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ lkb->lkb_retstatus =
++ (flags & DLM_LKF_CANCEL) ? -DLM_ECANCEL : -DLM_EUNLOCK;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++
++ /*
++ * Only free the LKB if we are the master copy. Otherwise the AST
++ * delivery routine will free it after delivery. queue_ast for MSTCPY
++ * lkb just sends a message.
++ */
++
++ if (remote) {
++ up_write(&rsb->res_lock);
++ release_lkb(rsb->res_ls, lkb);
++ release_rsb(rsb);
++ goto out2;
++ }
++
++ out:
++ up_write(&rsb->res_lock);
++ out2:
++ wake_astd();
++ return 0;
++}
++
++/*
++ * Lock conversion
++ */
++
++static int convert_lock(gd_ls_t *ls, int mode, struct dlm_lksb *lksb,
++ int flags, void *ast, void *astarg, void *bast,
++ struct dlm_range *range)
++{
++ gd_lkb_t *lkb;
++ gd_res_t *rsb;
++ int ret = -EINVAL;
++
++ lkb = find_lock_by_id(ls, lksb->sb_lkid);
++ if (!lkb) {
++ goto out;
++ }
++
++ if (lkb->lkb_status != GDLM_LKSTS_GRANTED) {
++ ret = -EBUSY;
++ goto out;
++ }
++
++ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
++ goto out;
++ }
++
++ if ((flags & DLM_LKF_QUECVT) &&
++ !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) {
++ goto out;
++ }
++
++	if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) {
++		goto out;
++	}
++
++ /* Set up the ranges as appropriate */
++ if (range) {
++ if (range->ra_start > range->ra_end)
++ goto out;
++
++ if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) {
++ ret = -ENOMEM;
++ goto out;
++ }
++ }
++
++ rsb = lkb->lkb_resource;
++ down_read(&rsb->res_ls->ls_in_recovery);
++
++ lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK;
++ lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
++
++ if (flags & DLM_LKF_NODLCKWT)
++ lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
++ if (ast)
++ lkb->lkb_astaddr = ast;
++ if (astarg)
++ lkb->lkb_astparam = (long) astarg;
++ if (bast)
++ lkb->lkb_bastaddr = bast;
++ lkb->lkb_rqmode = mode;
++ lkb->lkb_lockqueue_flags = flags;
++ lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0;
++ lkb->lkb_lvbptr = lksb->sb_lvbptr;
++
++ if (rsb->res_nodeid) {
++ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
++ ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT);
++ } else {
++ ret = dlm_convert_stage2(lkb, FALSE);
++ }
++
++ up_read(&rsb->res_ls->ls_in_recovery);
++
++ wake_astd();
++
++ out:
++ return ret;
++}
++
++/*
++ * For local conversion requests on locally mastered locks this is called
++ * directly from dlm_lock/convert_lock. This function is also called for
++ * remote conversion requests of MSTCPY locks (from process_cluster_request).
++ */
++
++int dlm_convert_stage2(gd_lkb_t *lkb, int do_ast)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++ int ret = 0;
++
++ down_write(&rsb->res_lock);
++
++ if (can_be_granted(rsb, lkb)) {
++ grant_lock(lkb, 0);
++ grant_pending_locks(rsb);
++ goto out;
++ }
++
++ /*
++ * Remove lkb from granted queue.
++ */
++
++ lkb_dequeue(lkb);
++
++ /*
++	 * The user won't wait, so put it back on the granted queue.
++ */
++
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
++ lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++ ret = lkb->lkb_retstatus = -EAGAIN;
++ if (do_ast)
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
++ send_blocking_asts_all(rsb, lkb);
++ goto out;
++ }
++
++ /*
++ * The lkb's status tells which queue it's on. Put back on convert
++ * queue. (QUECVT requests added at end of the queue, all others in
++ * order.)
++ */
++
++ lkb->lkb_retstatus = 0;
++ lkb_enqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
++
++ /*
++	 * The request can't be granted, so notify the locks blocking it.
++ */
++
++ send_blocking_asts(rsb, lkb);
++
++ if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
++ add_to_deadlockqueue(lkb);
++
++ out:
++ up_write(&rsb->res_lock);
++ return ret;
++}
++
++/*
++ * Remove lkb from any queue it's on, add it to the granted queue, and queue a
++ * completion ast. rsb res_lock must be held in write when this is called.
++ */
++
++static void grant_lock(gd_lkb_t *lkb, int send_remote)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++
++ if (lkb->lkb_duetime)
++ remove_from_deadlockqueue(lkb);
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
++ int b;
++ GDLM_ASSERT(lkb->lkb_lvbptr,);
++
++ if (!rsb->res_lvbptr)
++ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
++
++ b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
++ if (b)
++ memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
++ else
++ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
++ }
++
++ if (lkb->lkb_range) {
++ lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
++ lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
++ }
++
++ lkb->lkb_grmode = lkb->lkb_rqmode;
++ lkb->lkb_rqmode = DLM_LOCK_IV;
++ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++
++ lkb->lkb_highbast = 0;
++ lkb->lkb_retstatus = 0;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++
++	/*
++	 * A remote conversion request has been granted, either immediately
++	 * upon being requested or after waiting a bit. In the former case
++	 * lkb_request is still set and reply_and_grant() sends a single
++	 * message combining the request reply with the grant. In the latter
++	 * case the lock is granted "out of band" - ie by another lock being
++	 * converted or unlocked - so send_remote is 1 and remote_grant()
++	 * sends a separate grant message.
++	 */
++
++ if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
++ if (send_remote)
++ remote_grant(lkb);
++ else if (lkb->lkb_request)
++ reply_and_grant(lkb);
++ }
++
++}
++
++static void send_bast_queue(struct list_head *head, gd_lkb_t *lkb)
++{
++ gd_lkb_t *gr;
++
++ list_for_each_entry(gr, head, lkb_statequeue) {
++ if (gr->lkb_bastaddr &&
++ gr->lkb_highbast < lkb->lkb_rqmode &&
++ ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
++ queue_ast(gr, GDLM_QUEUE_BLKAST, lkb->lkb_rqmode);
++ gr->lkb_highbast = lkb->lkb_rqmode;
++ }
++ }
++}
++
++/*
++ * Notify granted locks if they are blocking a newly forced-to-wait lock.
++ */
++
++static void send_blocking_asts(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ send_bast_queue(&rsb->res_grantqueue, lkb);
++ /* check if the following improves performance */
++ /* send_bast_queue(&rsb->res_convertqueue, lkb); */
++}
++
++static void send_blocking_asts_all(gd_res_t *rsb, gd_lkb_t *lkb)
++{
++ send_bast_queue(&rsb->res_grantqueue, lkb);
++ send_bast_queue(&rsb->res_convertqueue, lkb);
++}
++
++/*
++ * Called when a lock has been dequeued. Look for any locks to grant that are
++ * waiting for conversion or waiting to be granted.
++ * The rsb res_lock must be held in write when this function is called.
++ */
++
++int grant_pending_locks(gd_res_t *rsb)
++{
++ gd_lkb_t *lkb;
++ struct list_head *list;
++ struct list_head *temp;
++ int8_t high = DLM_LOCK_IV;
++
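++ /* "high" tracks the strongest requested mode among the locks we
++ * fail to grant below; it decides which blocking ASTs to send. */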
++ list_for_each_safe(list, temp, &rsb->res_convertqueue) {
++ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
++
++ if (can_be_granted(rsb, lkb))
++ grant_lock(lkb, 1);
++ else
++ high = MAX(lkb->lkb_rqmode, high);
++ }
++
++ list_for_each_safe(list, temp, &rsb->res_waitqueue) {
++ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
++
++ if (can_be_granted(rsb, lkb))
++ grant_lock(lkb, 1);
++ else
++ high = MAX(lkb->lkb_rqmode, high);
++ }
++
++ /*
++ * If there are locks left on the wait/convert queue then send blocking
++ * ASTs to granted locks that are blocking
++ *
++ * FIXME: This might generate some spurious blocking ASTs for range
++ * locks.
++ */
++
++ if (high > DLM_LOCK_IV) {
++ list_for_each_safe(list, temp, &rsb->res_grantqueue) {
++ lkb = list_entry(list, gd_lkb_t, lkb_statequeue);
++
++ if (lkb->lkb_bastaddr &&
++ (lkb->lkb_highbast < high) &&
++ !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
++
++ queue_ast(lkb, GDLM_QUEUE_BLKAST, high);
++ lkb->lkb_highbast = high;
++ }
++ }
++ }
++
++ return 0;
++}
++
++/*
++ * Called to cancel a locking operation that failed due to some internal
++ * reason.
++ *
++ * Waiting locks will be removed, converting locks will be reverted to their
++ * granted status, unlocks will be left where they are.
++ *
++ * A completion AST will be delivered to the caller.
++ */
++
++int cancel_lockop(gd_lkb_t *lkb, int status)
++{
++ int state = lkb->lkb_lockqueue_state;
++
++ lkb->lkb_lockqueue_state = 0;
++
++ switch (state) {
++ case GDLM_LQSTATE_WAIT_RSB:
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONDGRANT:
++ res_lkb_dequeue(lkb);
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONVERT:
++ res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
++
++ /* Remove from deadlock detection */
++ if (lkb->lkb_duetime) {
++ remove_from_deadlockqueue(lkb);
++ }
++ break;
++
++ case GDLM_LQSTATE_WAIT_UNLOCK:
++ /* We can leave this. I think.... */
++ break;
++ }
++
++ lkb->lkb_retstatus = status;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++
++ return 0;
++}
++
++/*
++ * Check for conversion deadlock. If a deadlock is found, return the
++ * caller's own lkb as the one to kill; otherwise return NULL.
++ */
++
++gd_lkb_t *conversion_deadlock_check(gd_lkb_t *lkb)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++ struct list_head *entry;
++
++ GDLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,);
++
++ /* Work our way up to the head of the queue looking for locks that
++ * conflict with us */
++
++ down_read(&rsb->res_lock);
++
++ entry = lkb->lkb_statequeue.prev;
++ while (entry != &rsb->res_convertqueue) {
++ gd_lkb_t *lkb2 = list_entry(entry, gd_lkb_t, lkb_statequeue);
++
++ if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) {
++ up_read(&rsb->res_lock);
++ return lkb;
++ }
++ entry = entry->prev;
++ }
++ up_read(&rsb->res_lock);
++
++ return NULL;
++}
++
++/*
++ * Conversion operation was cancelled by us (not the user).
++ * ret contains the return code to pass onto the user
++ */
++
++void cancel_conversion(gd_lkb_t *lkb, int ret)
++{
++ gd_res_t *rsb = lkb->lkb_resource;
++
++ /* Stick it back on the granted queue */
++ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++ lkb->lkb_rqmode = lkb->lkb_grmode;
++
++ remove_from_deadlockqueue(lkb);
++
++ lkb->lkb_retstatus = ret;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ wake_astd();
++}
++
++/*
++ * As new master of the rsb for this lkb, we need to handle these requests
++ * removed from the lockqueue and originating from local processes:
++ * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT,
++ * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT.
++ */
++
++void process_remastered_lkb(gd_lkb_t *lkb, int state)
++{
++ switch (state) {
++ case GDLM_LQSTATE_WAIT_RSB:
++ dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb,
++ lkb->lkb_lockqueue_flags,
++ lkb->lkb_resource->res_name,
++ lkb->lkb_resource->res_length);
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONDGRANT:
++ res_lkb_dequeue(lkb);
++ dlm_lock_stage3(lkb);
++ break;
++
++ case GDLM_LQSTATE_WAIT_UNLOCK:
++ dlm_unlock_stage2(lkb, lkb->lkb_lockqueue_flags);
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONVERT:
++ dlm_convert_stage2(lkb, TRUE);
++ break;
++
++ default:
++ GDLM_ASSERT(0,);
++ }
++}
+diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h
+--- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/locking.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,33 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOCKING_DOT_H__
++#define __LOCKING_DOT_H__
++
++void process_remastered_lkb(gd_lkb_t * lkb, int state);
++void dlm_lock_stage3(gd_lkb_t * lkb);
++int dlm_convert_stage2(gd_lkb_t * lkb, int do_ast);
++int dlm_unlock_stage2(gd_lkb_t * lkb, uint32_t flags);
++int dlm_lock_stage2(gd_ls_t * lspace, gd_lkb_t * lkb, gd_res_t * rsb,
++ int flags);
++gd_res_t *create_rsb(gd_ls_t * lspace, gd_lkb_t * lkb, char *name, int namelen);
++int free_rsb_if_unused(gd_res_t * rsb);
++gd_lkb_t *remote_stage2(int remote_csid, gd_ls_t * lspace,
++ struct gd_remlockrequest *freq);
++int cancel_lockop(gd_lkb_t * lkb, int status);
++int dlm_remove_lock(gd_lkb_t * lkb, uint32_t flags);
++int grant_pending_locks(gd_res_t * rsb);
++void cancel_conversion(gd_lkb_t * lkb, int ret);
++gd_lkb_t *conversion_deadlock_check(gd_lkb_t * lkb);
++
++#endif /* __LOCKING_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c
+--- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lockqueue.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,954 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * lockqueue.c
++ *
++ * This controls the lock queue, which is where locks
++ * come when they need to wait for a remote operation
++ * to complete.
++ *
++ * This could also be thought of as the "high-level" comms
++ * layer.
++ *
++ */
++
++#include "dlm_internal.h"
++#include "lockqueue.h"
++#include "dir.h"
++#include "locking.h"
++#include "lkb.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "reccomms.h"
++#include "nodes.h"
++#include "lockspace.h"
++#include "ast.h"
++#include "memory.h"
++#include "rsb.h"
++#include "queries.h"
++
++static void add_reply_lvb(gd_lkb_t * lkb, struct gd_remlockreply *reply);
++static void add_request_lvb(gd_lkb_t * lkb, struct gd_remlockrequest *req);
++
++/*
++ * format of an entry on the request queue
++ */
++struct rq_entry {
++ struct list_head rqe_list;
++ uint32_t rqe_nodeid;
++ char rqe_request[1];
++};
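++/*
++ * rqe_request[1] is the pre-C99 "struct hack": each entry is
++ * allocated as sizeof(struct rq_entry) + message length and the raw
++ * request is copied in after the fixed fields.
++ */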
++
++/*
++ * Add a new request (if appropriate) to the request queue and send the remote
++ * request out. - runs in the context of the locking caller
++ *
++ * Recovery of a remote_stage request if the remote end fails while the lkb
++ * is still on the lockqueue:
++ *
++ * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in
++ * lockqueue_lkb_mark() at the start of recovery.
++ *
++ * o Some lkb's will be rebuilt on new master rsb's during recovery.
++ * (depends on the type of request, see below).
++ *
++ * o At the end of recovery, resend_cluster_requests() looks at these
++ * LQRESEND lkb's and either:
++ *
++ * i) resends the request to the new master for the rsb where the
++ * request is processed as usual. The lkb remains on the lockqueue until
++ * the new master replies and we run process_lockqueue_reply().
++ *
++ * ii) if we've become the rsb master, removes the lkb from the
++ * lockqueue and processes the request locally via
++ * process_remastered_lkb().
++ *
++ * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue
++ * and the request should be resent if dest node is failed.
++ *
++ * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's
++ * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag
++ * makes send_lkb_queue() skip it). Resend this request to the new master.
++ *
++ * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will
++ * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue).
++ * Resend this request to the new master.
++ *
++ * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue.
++ * It will be rebuilt on the new master rsb's granted queue. Resend this
++ * request to the new master.
++ */
++
++int remote_stage(gd_lkb_t *lkb, int state)
++{
++ int error;
++
++ lkb->lkb_lockqueue_state = state;
++ add_to_lockqueue(lkb);
++
++ error = send_cluster_request(lkb, state);
++ if (error < 0) {
++ log_print("remote_stage error sending request %d", error);
++
++ /* Leave on lockqueue, it will be resent to correct node during
++ * recovery. */
++
++ /*
++ lkb->lkb_lockqueue_state = 0;
++ remove_from_lockqueue(lkb);
++ return -ENOTCONN;
++ */
++ }
++ return 0;
++}
++
++/*
++ * Requests received while the lockspace is in recovery get added to the
++ * request queue and processed when recovery is complete.
++ */
++
++void add_to_requestqueue(gd_ls_t *ls, int nodeid, char *request, int length)
++{
++ struct rq_entry *entry;
++
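++ /* A request from a node that has since left the cluster is stale;
++ * recovery is expected to clean up anything it refers to, so it is
++ * simply dropped. */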
++ if (in_nodes_gone(ls, nodeid))
++ return;
++
++ entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
++ if (!entry) {
++ // TODO something better
++ printk("dlm: add_to_requestqueue: out of memory\n");
++ return;
++ }
++
++ log_debug(ls, "add_to_requestqueue %d", nodeid);
++ entry->rqe_nodeid = nodeid;
++ memcpy(entry->rqe_request, request, length);
++ list_add_tail(&entry->rqe_list, &ls->ls_requestqueue);
++}
++
++int process_requestqueue(gd_ls_t *ls)
++{
++ int error = 0, count = 0;
++ struct rq_entry *entry, *safe;
++ struct gd_req_header *req;
++
++ log_all(ls, "process held requests");
++
++ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
++ req = (struct gd_req_header *) entry->rqe_request;
++ log_debug(ls, "process_requestqueue %u", entry->rqe_nodeid);
++
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
++ log_debug(ls, "process_requestqueue aborted");
++ error = -EINTR;
++ break;
++ }
++
++ error = process_cluster_request(entry->rqe_nodeid, req, TRUE);
++ if (error == -EINTR) {
++ log_debug(ls, "process_requestqueue interrupted");
++ break;
++ }
++
++ list_del(&entry->rqe_list);
++ kfree(entry);
++ count++;
++ error = 0;
++ }
++
++ log_all(ls, "processed %d requests", count);
++ return error;
++}
++
++void wait_requestqueue(gd_ls_t *ls)
++{
++ while (!list_empty(&ls->ls_requestqueue) &&
++ test_bit(LSFL_LS_RUN, &ls->ls_flags))
++ schedule();
++}
++
++/*
++ * Resdir requests (lookup or remove) and replies from before recovery are
++ * invalid since the resdir was rebuilt. Clear them. Requests from nodes now
++ * gone are also invalid.
++ */
++
++void purge_requestqueue(gd_ls_t *ls)
++{
++ int count = 0;
++ struct rq_entry *entry, *safe;
++ struct gd_req_header *req;
++ struct gd_remlockrequest *freq;
++ gd_lkb_t *lkb;
++
++ log_all(ls, "purge requests");
++
++ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
++ req = (struct gd_req_header *) entry->rqe_request;
++ freq = (struct gd_remlockrequest *) req;
++
++ if (req->rh_cmd == GDLM_REMCMD_REM_RESDATA ||
++ req->rh_cmd == GDLM_REMCMD_LOOKUP ||
++ in_nodes_gone(ls, entry->rqe_nodeid)) {
++
++ list_del(&entry->rqe_list);
++ kfree(entry);
++ count++;
++
++ } else if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY) {
++
++ /*
++ * Replies to resdir lookups are invalid and must be
++ * purged. The lookup requests are marked in
++ * lockqueue_lkb_mark and will be resent in
++ * resend_cluster_requests. The only way to check if
++ * this is a lookup reply is to look at the
++ * lockqueue_state of the lkb.
++ */
++
++ lkb = find_lock_by_id(ls, freq->rr_header.rh_lkid);
++ GDLM_ASSERT(lkb,);
++ if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
++ list_del(&entry->rqe_list);
++ kfree(entry);
++ count++;
++ }
++ }
++ }
++
++ log_all(ls, "purged %d requests", count);
++}
++
++/*
++ * Check if there's a reply for the given lkid in the requestqueue.
++ */
++
++int reply_in_requestqueue(gd_ls_t *ls, int lkid)
++{
++ int rv = FALSE;
++ struct rq_entry *entry, *safe;
++ struct gd_req_header *req;
++ struct gd_remlockrequest *freq;
++
++ list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
++ req = (struct gd_req_header *) entry->rqe_request;
++ freq = (struct gd_remlockrequest *) req;
++
++ if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY &&
++ freq->rr_header.rh_lkid == lkid) {
++ rv = TRUE;
++ break;
++ }
++ }
++
++ return rv;
++}
++
++void allocate_and_copy_lvb(gd_ls_t *ls, char **lvbptr, char *src)
++{
++ if (!*lvbptr)
++ *lvbptr = allocate_lvb(ls);
++ if (*lvbptr)
++ memcpy(*lvbptr, src, DLM_LVB_LEN);
++}
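++/*
++ * Note that an allocation failure above is tolerated silently: the
++ * copy is skipped and the caller is left with a NULL lvbptr.
++ */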
++
++/*
++ * Process a lockqueue LKB after its remote processing has completed
++ * and it has been pulled from the lockqueue. Runs in the context of
++ * the DLM recvd thread on the machine that requested the lock.
++ */
++
++static void process_lockqueue_reply(gd_lkb_t *lkb,
++ struct gd_remlockreply *reply)
++{
++ int state = lkb->lkb_lockqueue_state;
++ int oldstate;
++ gd_res_t *rsb = lkb->lkb_resource;
++ gd_ls_t *ls = rsb->res_ls;
++
++ lkb->lkb_lockqueue_state = 0;
++ if (state)
++ remove_from_lockqueue(lkb);
++
++ switch (state) {
++ case GDLM_LQSTATE_WAIT_RSB:
++
++ GDLM_ASSERT(reply->rl_status == 0,);
++
++ if (reply->rl_nodeid == our_nodeid())
++ rsb->res_nodeid = 0;
++ else
++ rsb->res_nodeid = reply->rl_nodeid;
++
++ rsb->res_resdir_seq = reply->rl_resdir_seq;
++ lkb->lkb_nodeid = rsb->res_nodeid;
++
++ dlm_lock_stage2(rsb->res_ls, lkb, rsb,
++ lkb->lkb_lockqueue_flags);
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONVERT:
++ case GDLM_LQSTATE_WAIT_CONDGRANT:
++
++ /*
++ * After a remote lock/conversion/grant request we put the lock
++ * on the right queue and send an AST if appropriate. Any lock
++ * shuffling (e.g. locks newly granted because this one was
++ * converted downwards) will be dealt with in separate messages
++ * (which may arrive in the same network message).
++ */
++
++ if (!lkb->lkb_remid)
++ lkb->lkb_remid = reply->rl_lkid;
++
++ /*
++ * The remote request failed (we assume because of NOQUEUE).
++ * If this is a new request (non-conv) the lkb was created just
++ * for it so the lkb should be freed. If this was a
++ * conversion, the lkb already existed so we should put it back
++ * on the grant queue.
++ */
++
++ if (reply->rl_status != 0) {
++ GDLM_ASSERT(reply->rl_status == -EAGAIN,);
++
++ if (state == GDLM_LQSTATE_WAIT_CONDGRANT) {
++ res_lkb_dequeue(lkb);
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ } else
++ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++
++ lkb->lkb_retstatus = reply->rl_status;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ break;
++ }
++
++ /*
++ * The remote request was successful in granting the request or
++ * queuing it to be granted later. Add the lkb to the
++ * appropriate rsb queue.
++ */
++
++ switch (reply->rl_lockstate) {
++ case GDLM_LKSTS_GRANTED:
++
++ /* Compact version of grant_lock(). */
++
++ down_write(&rsb->res_lock);
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
++ memcpy(lkb->lkb_lvbptr, reply->rl_lvb,
++ DLM_LVB_LEN);
++
++ lkb->lkb_grmode = lkb->lkb_rqmode;
++ lkb->lkb_rqmode = DLM_LOCK_IV;
++ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++
++ if (lkb->lkb_range) {
++ lkb->lkb_range[GR_RANGE_START] =
++ lkb->lkb_range[RQ_RANGE_START];
++ lkb->lkb_range[GR_RANGE_END] =
++ lkb->lkb_range[RQ_RANGE_END];
++ }
++ up_write(&rsb->res_lock);
++
++ lkb->lkb_retstatus = 0;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ break;
++
++ case GDLM_LKSTS_WAITING:
++
++ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
++ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING);
++ else
++ log_error(ls, "wait reply for granted %x %u",
++ lkb->lkb_id, lkb->lkb_nodeid);
++ break;
++
++ case GDLM_LKSTS_CONVERT:
++
++ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
++ res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
++ else
++ log_error(ls, "convert reply for granted %x %u",
++ lkb->lkb_id, lkb->lkb_nodeid);
++ break;
++
++ default:
++ log_error(ls, "process_lockqueue_reply state %d",
++ reply->rl_lockstate);
++ }
++
++ break;
++
++ case GDLM_LQSTATE_WAIT_UNLOCK:
++
++ /*
++ * Unlocks should never fail. Update the local lock info; this
++ * always sends a completion AST with the status in the lksb.
++ */
++
++ GDLM_ASSERT(reply->rl_status == 0,);
++ oldstate = res_lkb_dequeue(lkb);
++
++ /* Differentiate between unlocks and conversion cancellations */
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL &&
++ oldstate == GDLM_LKSTS_CONVERT) {
++ res_lkb_enqueue(lkb->lkb_resource, lkb,
++ GDLM_LKSTS_GRANTED);
++ lkb->lkb_retstatus = -DLM_ECANCEL;
++ } else {
++ lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ lkb->lkb_retstatus = -DLM_EUNLOCK;
++ }
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ break;
++
++ default:
++ log_error(ls, "process_lockqueue_reply id %x state %d",
++ lkb->lkb_id, state);
++ }
++}
++
++/*
++ * Tell a remote node to grant a lock. This happens when we are the master
++ * copy for a lock that is actually held on a remote node. The remote end is
++ * also responsible for sending the completion AST.
++ */
++
++void remote_grant(gd_lkb_t *lkb)
++{
++ struct writequeue_entry *e;
++ struct gd_remlockrequest *req;
++
++ // TODO Error handling
++ e = lowcomms_get_buffer(lkb->lkb_nodeid,
++ sizeof(struct gd_remlockrequest),
++ lkb->lkb_resource->res_ls->ls_allocation,
++ (char **) &req);
++ if (!e)
++ return;
++
++ req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT;
++ req->rr_header.rh_length = sizeof(struct gd_remlockrequest);
++ req->rr_header.rh_flags = 0;
++ req->rr_header.rh_lkid = lkb->lkb_id;
++ req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id;
++ req->rr_remlkid = lkb->lkb_remid;
++ req->rr_flags = 0;
++
++ if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) {
++ /* This is a confusing non-standard use of rr_flags which is
++ * usually used to pass lockqueue_flags. */
++ req->rr_flags |= GDLM_LKFLG_DEMOTED;
++ }
++
++ add_request_lvb(lkb, req);
++ midcomms_send_buffer(&req->rr_header, e);
++}
++
++void reply_and_grant(gd_lkb_t *lkb)
++{
++ struct gd_remlockrequest *req = lkb->lkb_request;
++ struct gd_remlockreply *reply;
++ struct writequeue_entry *e;
++
++ // TODO Error handling
++ e = lowcomms_get_buffer(lkb->lkb_nodeid,
++ sizeof(struct gd_remlockreply),
++ lkb->lkb_resource->res_ls->ls_allocation,
++ (char **) &reply);
++ if (!e)
++ return;
++
++ reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
++ reply->rl_header.rh_flags = 0;
++ reply->rl_header.rh_length = sizeof(struct gd_remlockreply);
++ reply->rl_header.rh_lkid = req->rr_header.rh_lkid;
++ reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace;
++
++ reply->rl_status = lkb->lkb_retstatus;
++ reply->rl_lockstate = lkb->lkb_status;
++ reply->rl_lkid = lkb->lkb_id;
++
++ GDLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),);
++
++ lkb->lkb_request = NULL;
++
++ add_reply_lvb(lkb, reply);
++ midcomms_send_buffer(&reply->rl_header, e);
++}
++
++/*
++ * Request removal of a dead entry in the resource directory
++ */
++
++void remote_remove_resdata(gd_ls_t *ls, int nodeid, char *name, int namelen,
++ uint8_t sequence)
++{
++ struct writequeue_entry *e;
++ struct gd_remlockrequest *req;
++
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
++ gd_rcom_t *rc = allocate_rcom_buffer(ls);
++
++ memcpy(rc->rc_buf, name, namelen);
++ rc->rc_datalen = namelen;
++
++ rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0);
++
++ free_rcom_buffer(rc);
++ return;
++ }
++ // TODO Error handling
++ e = lowcomms_get_buffer(nodeid,
++ sizeof(struct gd_remlockrequest) + namelen - 1,
++ ls->ls_allocation, (char **) &req);
++ if (!e)
++ return;
++
++ memset(req, 0, sizeof(struct gd_remlockrequest) + namelen - 1);
++ req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA;
++ req->rr_header.rh_length =
++ sizeof(struct gd_remlockrequest) + namelen - 1;
++ req->rr_header.rh_flags = 0;
++ req->rr_header.rh_lkid = 0;
++ req->rr_header.rh_lockspace = ls->ls_global_id;
++ req->rr_remlkid = 0;
++ req->rr_resdir_seq = sequence;
++ memcpy(req->rr_name, name, namelen);
++
++ midcomms_send_buffer(&req->rr_header, e);
++}
++
++/*
++ * Send remote cluster request to directory or master node before the request
++ * is put on the lock queue. Runs in the context of the locking caller.
++ */
++
++int send_cluster_request(gd_lkb_t *lkb, int state)
++{
++ uint32_t target_nodeid;
++ gd_res_t *rsb = lkb->lkb_resource;
++ gd_ls_t *ls = rsb->res_ls;
++ struct gd_remlockrequest *req;
++ struct writequeue_entry *e;
++
++ /* Need to know the target nodeid before we allocate a send buffer */
++ target_nodeid = lkb->lkb_nodeid;
++ GDLM_ASSERT(target_nodeid != 0,);
++
++ if (state == GDLM_LQSTATE_WAIT_RSB)
++ target_nodeid = get_directory_nodeid(rsb);
++
++ GDLM_ASSERT(target_nodeid,);
++
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
++ /* this may happen when called by resend_cluster_request */
++ log_error(ls, "send_cluster_request to %u state %d recovery",
++ target_nodeid, state);
++ }
++
++ e = lowcomms_get_buffer(target_nodeid,
++ sizeof(struct gd_remlockrequest) +
++ rsb->res_length - 1, ls->ls_allocation,
++ (char **) &req);
++ if (!e)
++ return -ENOBUFS;
++ memset(req, 0, sizeof(struct gd_remlockrequest) + rsb->res_length - 1);
++
++ /* Common stuff, some are just defaults */
++
++ if (lkb->lkb_bastaddr)
++ req->rr_asts = GDLM_QUEUE_BLKAST;
++ if (lkb->lkb_astaddr)
++ req->rr_asts |= GDLM_QUEUE_COMPAST;
++ if (lkb->lkb_parent)
++ req->rr_remparid = lkb->lkb_parent->lkb_remid;
++
++ req->rr_flags = lkb->lkb_lockqueue_flags;
++ req->rr_rqmode = lkb->lkb_rqmode;
++ req->rr_remlkid = lkb->lkb_remid;
++ req->rr_header.rh_length =
++ sizeof(struct gd_remlockrequest) + rsb->res_length - 1;
++ req->rr_header.rh_flags = 0;
++ req->rr_header.rh_lkid = lkb->lkb_id;
++ req->rr_header.rh_lockspace = ls->ls_global_id;
++
++ switch (state) {
++
++ case GDLM_LQSTATE_WAIT_RSB:
++
++ /* The lock must be a root lock */
++ GDLM_ASSERT(!lkb->lkb_parent,);
++
++ req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP;
++ memcpy(req->rr_name, rsb->res_name, rsb->res_length);
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONVERT:
++
++ req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST;
++ if (lkb->lkb_range) {
++ req->rr_flags |= GDLM_LKFLG_RANGE;
++ req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
++ req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
++ }
++ break;
++
++ case GDLM_LQSTATE_WAIT_CONDGRANT:
++
++ req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST;
++ req->rr_resdir_seq = rsb->res_resdir_seq;
++ memcpy(req->rr_name, rsb->res_name, rsb->res_length);
++ if (lkb->lkb_range) {
++ req->rr_flags |= GDLM_LKFLG_RANGE;
++ req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
++ req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
++ }
++ break;
++
++ case GDLM_LQSTATE_WAIT_UNLOCK:
++
++ req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST;
++ break;
++
++ default:
++ GDLM_ASSERT(!"Unknown cluster request",);
++ }
++
++ add_request_lvb(lkb, req);
++ midcomms_send_buffer(&req->rr_header, e);
++
++ return 0;
++}
++
++/*
++ * We got a request from another cluster node, process it and return an info
++ * structure with the lock state/LVB etc as required. Executes in the DLM's
++ * recvd thread.
++ */
++
++int process_cluster_request(int nodeid, struct gd_req_header *req, int recovery)
++{
++ gd_ls_t *lspace;
++ gd_lkb_t *lkb = NULL;
++ gd_res_t *rsb;
++ int send_reply = 0, status = 0, namelen;
++ struct gd_remlockrequest *freq = (struct gd_remlockrequest *) req;
++ struct gd_remlockreply reply;
++
++ lspace = find_lockspace_by_global_id(req->rh_lockspace);
++
++ if (!lspace) {
++ log_print("process_cluster_request invalid lockspace %x "
++ "from %d req %u", req->rh_lockspace, nodeid,
++ req->rh_cmd);
++ status = -EINVAL;
++ goto out;
++ }
++
++ /* wait for recoverd to drain requestqueue */
++ if (!recovery)
++ wait_requestqueue(lspace);
++
++ /*
++ * If we're in recovery then queue the request for later. Otherwise,
++ * we still need to get the "in_recovery" lock to make sure the
++ * recovery itself doesn't start until we are done.
++ */
++ retry:
++ if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) {
++ if (test_bit(LSFL_REQUEST_WARN, &lspace->ls_flags))
++ log_error(lspace, "process_cluster_request warning %u",
++ nodeid);
++ add_to_requestqueue(lspace, nodeid, (char *) req,
++ req->rh_length);
++ log_debug(lspace, "process_cluster_request abort");
++ status = -EINTR;
++ goto out;
++ }
++ if (!down_read_trylock(&lspace->ls_in_recovery)) {
++ schedule();
++ goto retry;
++ }
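++ /* Retrying rather than sleeping on the rwsem lets the LSFL_LS_RUN
++ * test above run again: if recovery begins in the meantime the
++ * request gets queued for later instead of blocking here. */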
++
++ /*
++ * Process the request.
++ */
++
++ switch (req->rh_cmd) {
++
++ case GDLM_REMCMD_LOOKUP:
++ {
++ gd_resdata_t *rd;
++ int status;
++ uint32_t dir_nodeid;
++
++ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
++
++ dir_nodeid = name_to_directory_nodeid(lspace,
++ freq->rr_name,
++ namelen);
++ if (dir_nodeid != our_nodeid())
++ log_debug(lspace, "ignoring directory lookup");
++
++ status = get_resdata(lspace, nodeid, freq->rr_name,
++ namelen, &rd, 0);
++
++ reply.rl_lockstate = 0;
++ if (status) {
++ /* rd is not valid on failure; don't touch it */
++ reply.rl_status = -ENOMEM;
++ reply.rl_nodeid = 0;
++ reply.rl_resdir_seq = 0;
++ } else {
++ reply.rl_status = 0;
++ reply.rl_nodeid = rd->rd_master_nodeid;
++ reply.rl_resdir_seq = rd->rd_sequence;
++ }
++ }
++ send_reply = 1;
++ break;
++
++ case GDLM_REMCMD_REM_RESDATA:
++
++ namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
++ remove_resdata(lspace, nodeid, freq->rr_name, namelen,
++ freq->rr_resdir_seq);
++ break;
++
++ case GDLM_REMCMD_LOCKREQUEST:
++
++ lkb = remote_stage2(nodeid, lspace, freq);
++ if (lkb) {
++ lkb->lkb_request = freq;
++ dlm_lock_stage3(lkb);
++
++ /*
++ * If the request was granted in lock_stage3, then a
++ * reply message was already sent in combination with
++ * the grant message and lkb_request is NULL.
++ */
++
++ if (lkb->lkb_request) {
++ lkb->lkb_request = NULL;
++ send_reply = 1;
++ reply.rl_status = lkb->lkb_retstatus;
++ reply.rl_lockstate = lkb->lkb_status;
++ reply.rl_lkid = lkb->lkb_id;
++
++ /*
++ * If the request could not be granted and the
++ * user won't wait, then free up the LKB
++ */
++
++ if (lkb->lkb_flags & GDLM_LKFLG_DELAST) {
++ rsb = lkb->lkb_resource;
++ release_lkb(lspace, lkb);
++ release_rsb(rsb);
++ lkb = NULL;
++ }
++ }
++ } else {
++ reply.rl_status = -ENOMEM;
++ send_reply = 1;
++ }
++ break;
++
++ case GDLM_REMCMD_CONVREQUEST:
++
++ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
++ log_error(lspace, "convrequest: invalid status %d",
++ lkb->lkb_status);
++
++ lkb->lkb_rqmode = freq->rr_rqmode;
++ lkb->lkb_lockqueue_flags = freq->rr_flags;
++ lkb->lkb_request = freq;
++ lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK
++ || freq->rr_flags & DLM_LKF_VALBLK) {
++ lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
++ allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr,
++ freq->rr_lvb);
++ }
++
++ if (freq->rr_flags & GDLM_LKFLG_RANGE) {
++ if (lkb_set_range(lspace, lkb, freq->rr_range_start,
++ freq->rr_range_end)) {
++ reply.rl_status = -ENOMEM;
++ send_reply = 1;
++ /* break, not goto out: ls_in_recovery must be released */
++ break;
++ }
++ }
++
++ dlm_convert_stage2(lkb, FALSE);
++
++ /*
++ * If the conv request was granted in stage2, then a reply
++ * message was already sent in combination with the grant
++ * message.
++ */
++
++ if (lkb->lkb_request) {
++ lkb->lkb_request = NULL;
++ send_reply = 1;
++ reply.rl_status = lkb->lkb_retstatus;
++ reply.rl_lockstate = lkb->lkb_status;
++ reply.rl_lkid = lkb->lkb_id;
++ }
++ break;
++
++ case GDLM_REMCMD_LOCKREPLY:
++
++ lkb = find_lock_by_id(lspace, freq->rr_header.rh_lkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ process_lockqueue_reply(lkb, (struct gd_remlockreply *) req);
++ break;
++
++ case GDLM_REMCMD_LOCKGRANT:
++
++ /*
++ * Remote lock has been granted asynchronously. Do a compact
++ * version of what grant_lock() does.
++ */
++
++ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ rsb = lkb->lkb_resource;
++
++ if (lkb->lkb_lockqueue_state)
++ log_error(rsb->res_ls, "granting lock on lockqueue "
++ "id=%x from=%u lqstate=%d flags=%x",
++ lkb->lkb_id, nodeid, lkb->lkb_lockqueue_state,
++ lkb->lkb_flags);
++
++ down_write(&rsb->res_lock);
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
++ memcpy(lkb->lkb_lvbptr, freq->rr_lvb, DLM_LVB_LEN);
++
++ lkb->lkb_grmode = lkb->lkb_rqmode;
++ lkb->lkb_rqmode = DLM_LOCK_IV;
++
++ if (lkb->lkb_range) {
++ lkb->lkb_range[GR_RANGE_START] =
++ lkb->lkb_range[RQ_RANGE_START];
++ lkb->lkb_range[GR_RANGE_END] =
++ lkb->lkb_range[RQ_RANGE_END];
++ }
++
++ lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
++ up_write(&rsb->res_lock);
++
++ if (freq->rr_flags & GDLM_LKFLG_DEMOTED)
++ lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
++
++ lkb->lkb_retstatus = 0;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ break;
++
++ case GDLM_REMCMD_SENDBAST:
++
++ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ if (lkb->lkb_status == GDLM_LKSTS_GRANTED)
++ queue_ast(lkb, GDLM_QUEUE_BLKAST, freq->rr_rqmode);
++ break;
++
++ case GDLM_REMCMD_SENDCAST:
++
++ /* This is only used for some error completion ASTs */
++
++ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ /* Return the lock to granted status */
++ res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
++
++ lkb->lkb_retstatus = freq->rr_status;
++ queue_ast(lkb, GDLM_QUEUE_COMPAST, 0);
++ break;
++
++ case GDLM_REMCMD_UNLOCKREQUEST:
++
++ lkb = find_lock_by_id(lspace, freq->rr_remlkid);
++
++ GDLM_ASSERT(lkb, printk("rr_remlkid=%x rh_lkid=%x from=%u\n",
++ freq->rr_remlkid,
++ freq->rr_header.rh_lkid, nodeid););
++
++ reply.rl_status = dlm_unlock_stage2(lkb, freq->rr_flags);
++ send_reply = 1;
++ break;
++
++ case GDLM_REMCMD_QUERY:
++ remote_query(nodeid, lspace, req);
++ break;
++
++ case GDLM_REMCMD_QUERYREPLY:
++ remote_query_reply(nodeid, lspace, req);
++ break;
++
++ default:
++ log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd);
++ }
++
++ up_read(&lspace->ls_in_recovery);
++
++ out:
++ if (send_reply) {
++ reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
++ reply.rl_header.rh_flags = 0;
++ reply.rl_header.rh_length = sizeof(reply);
++ reply.rl_header.rh_lkid = freq->rr_header.rh_lkid;
++ reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace;
++
++ status = midcomms_send_message(nodeid, &reply.rl_header,
++ GFP_KERNEL);
++ }
++
++ wake_astd();
++
++ return status;
++}
++
++static void add_reply_lvb(gd_lkb_t *lkb, struct gd_remlockreply *reply)
++{
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
++ memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
++}
++
++static void add_request_lvb(gd_lkb_t *lkb, struct gd_remlockrequest *req)
++{
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
++ memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
++}
+diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h
+--- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lockqueue.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOCKQUEUE_DOT_H__
++#define __LOCKQUEUE_DOT_H__
++
++void remote_grant(gd_lkb_t * lkb);
++void reply_and_grant(gd_lkb_t * lkb);
++int remote_stage(gd_lkb_t * lkb, int state);
++int process_cluster_request(int csid, struct gd_req_header *req, int recovery);
++int send_cluster_request(gd_lkb_t * lkb, int state);
++void purge_requestqueue(gd_ls_t * ls);
++int process_requestqueue(gd_ls_t * ls);
++int reply_in_requestqueue(gd_ls_t * ls, int lkid);
++void remote_remove_resdata(gd_ls_t * ls, int nodeid, char *name, int namelen,
++ uint8_t sequence);
++void allocate_and_copy_lvb(gd_ls_t * ls, char **lvbptr, char *src);
++
++#endif /* __LOCKQUEUE_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c
+--- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lockspace.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,706 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/module.h>
++
++#include "dlm_internal.h"
++#include "recoverd.h"
++#include "ast.h"
++#include "lkb.h"
++#include "nodes.h"
++#include "dir.h"
++#include "lowcomms.h"
++#include "config.h"
++#include "memory.h"
++#include "lockspace.h"
++#include "device.h"
++
++#define GDST_NONE (0)
++#define GDST_RUNNING (1)
++
++static int gdlmstate;
++static int gdlmcount;
++static struct semaphore gdlmstate_lock;
++struct list_head lslist;
++spinlock_t lslist_lock;
++struct kcl_service_ops ls_ops;
++
++static int new_lockspace(char *name, int namelen, void **lockspace, int flags);
++
++
++void dlm_lockspace_init(void)
++{
++ gdlmstate = GDST_NONE;
++ gdlmcount = 0;
++ init_MUTEX(&gdlmstate_lock);
++ INIT_LIST_HEAD(&lslist);
++ spin_lock_init(&lslist_lock);
++}
++
++gd_ls_t *find_lockspace_by_global_id(uint32_t id)
++{
++ gd_ls_t *ls;
++
++ spin_lock(&lslist_lock);
++
++ list_for_each_entry(ls, &lslist, ls_list) {
++ if (ls->ls_global_id == id)
++ goto out;
++ }
++ ls = NULL;
++ out:
++ spin_unlock(&lslist_lock);
++ return ls;
++}
++
++/* TODO: make this more efficient */
++gd_ls_t *find_lockspace_by_local_id(void *id)
++{
++ gd_ls_t *ls;
++
++ spin_lock(&lslist_lock);
++
++ list_for_each_entry(ls, &lslist, ls_list) {
++ if (ls->ls_local_id == (uint32_t)(long)id)
++ goto out;
++ }
++ ls = NULL;
++ out:
++ spin_unlock(&lslist_lock);
++ return ls;
++}
++
++gd_ls_t *find_lockspace_by_name(char *name, int namelen)
++{
++ gd_ls_t *ls;
++
++ spin_lock(&lslist_lock);
++
++ list_for_each_entry(ls, &lslist, ls_list) {
++ if (ls->ls_namelen == namelen &&
++ memcmp(ls->ls_name, name, namelen) == 0)
++ goto out;
++ }
++ ls = NULL;
++ out:
++ spin_unlock(&lslist_lock);
++ return ls;
++}
++
++/*
++ * Called from dlm_init. These are the general threads which are not
++ * lockspace-specific and work for all gdlm lockspaces.
++ */
++
++static int threads_start(void)
++{
++ int error;
++
++ /* Thread which interacts with cman for all ls's */
++ error = recoverd_start();
++ if (error) {
++ log_print("cannot start recovery thread %d", error);
++ goto fail;
++ }
++
++ /* Thread which delivers ASTs for all ls's */
++ error = astd_start();
++ if (error) {
++ log_print("cannot start ast thread %d", error);
++ goto recoverd_fail;
++ }
++
++ /* Thread for sending/receiving messages for all ls's */
++ error = lowcomms_start();
++ if (error) {
++ log_print("cannot start lowcomms %d", error);
++ goto astd_fail;
++ }
++
++ return 0;
++
++ astd_fail:
++ astd_stop();
++
++ recoverd_fail:
++ recoverd_stop();
++
++ fail:
++ return error;
++}
++
++static void threads_stop(void)
++{
++ lowcomms_stop();
++ astd_stop();
++ recoverd_stop();
++}
++
++static int init_internal(void)
++{
++ int error = 0;
++
++ if (gdlmstate == GDST_RUNNING)
++ gdlmcount++;
++ else {
++ error = threads_start();
++ if (error)
++ goto out;
++
++ gdlmstate = GDST_RUNNING;
++ gdlmcount = 1;
++ }
++
++ out:
++ return error;
++}
++
++
++/*
++ * Called after gdlm module is loaded and before any lockspaces are created.
++ * Starts and initializes global threads and structures. These global entities
++ * are shared by and independent of all lockspaces.
++ *
++ * There should be a gdlm-specific user command that calls this
++ * function. If no one has run that command by the time something
++ * creates a new lockspace, this is called first.
++ *
++ * This also starts the default lockspace.
++ */
++
++int dlm_init(void)
++{
++ int error;
++
++ down(&gdlmstate_lock);
++ error = init_internal();
++ up(&gdlmstate_lock);
++
++ return error;
++}
++
++int dlm_release(void)
++{
++ int error = 0;
++
++ down(&gdlmstate_lock);
++
++ if (gdlmstate == GDST_NONE)
++ goto out;
++
++ if (gdlmcount)
++ gdlmcount--;
++
++ if (gdlmcount)
++ goto out;
++
++ spin_lock(&lslist_lock);
++ if (!list_empty(&lslist)) {
++ spin_unlock(&lslist_lock);
++ log_print("cannot stop threads, lockspaces still exist");
++ goto out;
++ }
++ spin_unlock(&lslist_lock);
++
++ threads_stop();
++ gdlmstate = GDST_NONE;
++
++ out:
++ up(&gdlmstate_lock);
++
++ return error;
++}
++
++gd_ls_t *allocate_ls(int namelen)
++{
++ gd_ls_t *ls;
++
++ /* FIXME: use appropriate malloc type */
++
++ ls = kmalloc(sizeof(gd_ls_t) + namelen, GFP_KERNEL);
++ if (ls)
++ memset(ls, 0, sizeof(gd_ls_t) + namelen);
++
++ return ls;
++}
++
++void free_ls(gd_ls_t *ls)
++{
++ kfree(ls);
++}
++
++static int new_lockspace(char *name, int namelen, void **lockspace, int flags)
++{
++ gd_ls_t *ls;
++ int i, error = -ENOMEM;
++ uint32_t local_id = 0;
++
++ if (!try_module_get(THIS_MODULE))
++ return -EINVAL;
++
++ if (namelen > MAX_SERVICE_NAME_LEN) {
++ module_put(THIS_MODULE);
++ return -EINVAL;
++ }
++
++ if ((ls = find_lockspace_by_name(name, namelen))) {
++ *lockspace = (void *)ls->ls_local_id;
++ module_put(THIS_MODULE);
++ return -EEXIST;
++ }
++
++ /*
++ * Initialize ls fields
++ */
++
++ ls = allocate_ls(namelen);
++ if (!ls)
++ goto out;
++
++ memcpy(ls->ls_name, name, namelen);
++ ls->ls_namelen = namelen;
++
++ ls->ls_allocation = GFP_KERNEL;
++ memset(&ls->ls_flags, 0, sizeof(unsigned long));
++ INIT_LIST_HEAD(&ls->ls_rootres);
++ ls->ls_hashsize = dlm_config.reshashtbl;
++ ls->ls_hashmask = ls->ls_hashsize - 1;
++
++ ls->ls_reshashtbl =
++ kmalloc(sizeof(struct list_head) * ls->ls_hashsize, GFP_KERNEL);
++ if (!ls->ls_reshashtbl)
++ goto out_lsfree;
++
++ for (i = 0; i < ls->ls_hashsize; i++)
++ INIT_LIST_HEAD(&ls->ls_reshashtbl[i]);
++
++ rwlock_init(&ls->ls_reshash_lock);
++
++ if (init_lockidtbl(ls, dlm_config.lockidtbl) == -1)
++ goto out_htfree;
++
++ INIT_LIST_HEAD(&ls->ls_nodes);
++ ls->ls_num_nodes = 0;
++ INIT_LIST_HEAD(&ls->ls_nodes_gone);
++ INIT_LIST_HEAD(&ls->ls_recover);
++ spin_lock_init(&ls->ls_recover_lock);
++ INIT_LIST_HEAD(&ls->ls_recover_list);
++ ls->ls_recover_list_count = 0;
++ spin_lock_init(&ls->ls_recover_list_lock);
++ init_waitqueue_head(&ls->ls_wait_general);
++ INIT_LIST_HEAD(&ls->ls_requestqueue);
++ INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list);
++ ls->ls_last_stop = 0;
++ ls->ls_last_start = 0;
++ ls->ls_last_finish = 0;
++ ls->ls_rcom_msgid = 0;
++ init_MUTEX(&ls->ls_rcom_lock);
++ init_rwsem(&ls->ls_in_recovery);
++ init_rwsem(&ls->ls_unlock_sem);
++ init_rwsem(&ls->ls_rec_rsblist);
++ init_rwsem(&ls->ls_gap_rsblist);
++ down_write(&ls->ls_in_recovery);
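++ /* Hold the in_recovery lock from birth: request processing stays
++ * blocked until the first recovery cycle, triggered by CMAN's
++ * initial start event, completes and releases it. */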
++
++ for (i = 0; i < RESDIRHASH_SIZE; i++) {
++ INIT_LIST_HEAD(&ls->ls_resdir_hash[i].rb_reslist);
++ rwlock_init(&ls->ls_resdir_hash[i].rb_lock);
++ }
++
++ if (flags & DLM_LSF_NOTIMERS)
++ set_bit(LSFL_NOTIMERS, &ls->ls_flags);
++
++ /*
++ * Connect this lockspace with the cluster manager
++ */
++
++ error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM,
++ &ls_ops, TRUE, (void *) ls, &local_id);
++ if (error)
++ goto out_idtblfree;
++
++ ls->ls_state = LSST_INIT;
++ ls->ls_local_id = local_id;
++
++ spin_lock(&lslist_lock);
++ list_add(&ls->ls_list, &lslist);
++ spin_unlock(&lslist_lock);
++
++ error = kcl_join_service(local_id);
++ if (error) {
++ log_error(ls, "service manager join error %d", error);
++ goto out_reg;
++ }
++
++ /* The ls isn't actually running until it receives a start() from CMAN.
++ * Neither does it have a global ls id until started. */
++
++ /* Return the local ID as the lockspace handle. It is cast to a
++ void* so we can replace it with pretty much anything at a future
++ date without breaking clients. Returning the address of the
++ lockspace itself would be a bad idea: it could get forcibly
++ removed, leaving the client with a dangling pointer. */
++ *lockspace = (void *)local_id;
++
++ return 0;
++
++ out_reg:
++ kcl_unregister_service(ls->ls_local_id);
++
++ out_idtblfree:
++ free_lockidtbl(ls);
++
++ out_htfree:
++ kfree(ls->ls_reshashtbl);
++
++ out_lsfree:
++ free_ls(ls);
++
++ out:
++ /* drop the module reference taken at the top on all error paths */
++ module_put(THIS_MODULE);
++ return error;
++}
++
++/*
++ * Called by a system like GFS which wants independent lock spaces.
++ */
++
++int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags)
++{
++ int error = -ENOSYS;
++
++ down(&gdlmstate_lock);
++
++ error = init_internal();
++ if (error)
++ goto out;
++
++ error = new_lockspace(name, namelen, lockspace, flags);
++
++ out:
++ up(&gdlmstate_lock);
++
++ return error;
++}
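++/*
++ * Minimal usage sketch (hypothetical caller, error handling trimmed):
++ *
++ * void *ls;
++ * int error = dlm_new_lockspace("example", 7, &ls, 0);
++ * if (!error)
++ * ... pass ls as the opaque handle to lock/unlock calls ...
++ * dlm_release_lockspace(ls, 0);
++ *
++ * Note that -EEXIST also returns a usable handle for the existing
++ * lockspace.
++ */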
++
++/* Return 1 if the lockspace still has active remote locks,
++ * 2 if the lockspace still has active local locks.
++ */
++static int lockspace_busy(gd_ls_t *ls)
++{
++ int i;
++ int lkb_found = 0;
++ gd_lkb_t *lkb;
++
++ /* NOTE: We check the lockidtbl here rather than the resource table.
++ * This is because there may be LKBs queued for AST delivery that have
++ * been unlinked from their RSBs and are pending deletion once the AST
++ * has been delivered.
++ read_lock(&ls->ls_lockidtbl_lock);
++ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
++ if (!list_empty(&ls->ls_lockidtbl[i].list)) {
++ lkb_found = 1;
++ list_for_each_entry(lkb, &ls->ls_lockidtbl[i].list, lkb_idtbl_list) {
++ if (!lkb->lkb_nodeid) {
++ read_unlock(&ls->ls_lockidtbl_lock);
++ return 2;
++ }
++ }
++ }
++ }
++ read_unlock(&ls->ls_lockidtbl_lock);
++ return lkb_found;
++}
++
++/* Actually release the lockspace */
++static int release_lockspace(gd_ls_t *ls, int force)
++{
++ gd_lkb_t *lkb;
++ gd_res_t *rsb;
++ gd_recover_t *gr;
++ gd_csb_t *csb;
++ struct list_head *head;
++ int i;
++ int busy = lockspace_busy(ls);
++
++ /* Don't destroy a busy lockspace */
++ if (busy > force)
++ return -EBUSY;
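++ /* (busy is 0 for no LKBs, 1 for remote LKBs, 2 for local LKBs, so
++ * force=1 tolerates remote LKBs only and force>=2 tolerates both.) */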
++
++ if (force < 3) {
++ kcl_leave_service(ls->ls_local_id);
++ kcl_unregister_service(ls->ls_local_id);
++ }
++
++ spin_lock(&lslist_lock);
++ list_del(&ls->ls_list);
++ spin_unlock(&lslist_lock);
++
++ /*
++ * Free resdata structs.
++ */
++
++ resdir_clear(ls);
++
++ /*
++ * Free all lkb's on lockidtbl[] lists.
++ */
++
++ for (i = 0; i < ls->ls_lockidtbl_size; i++) {
++ head = &ls->ls_lockidtbl[i].list;
++ while (!list_empty(head)) {
++ lkb = list_entry(head->next, gd_lkb_t, lkb_idtbl_list);
++ list_del(&lkb->lkb_idtbl_list);
++
++ if (lkb->lkb_lockqueue_state)
++ remove_from_lockqueue(lkb);
++
++ if (lkb->lkb_asts_to_deliver)
++ list_del(&lkb->lkb_astqueue);
++
++ if (lkb->lkb_lvbptr
++ && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
++ free_lvb(lkb->lkb_lvbptr);
++
++ free_lkb(lkb);
++ }
++ }
++
++ /*
++ * Free lkidtbl[] itself
++ */
++
++ kfree(ls->ls_lockidtbl);
++
++ /*
++ * Free all rsb's on reshashtbl[] lists
++ */
++
++ for (i = 0; i < ls->ls_hashsize; i++) {
++ head = &ls->ls_reshashtbl[i];
++ while (!list_empty(head)) {
++ rsb = list_entry(head->next, gd_res_t, res_hashchain);
++ list_del(&rsb->res_hashchain);
++
++ if (rsb->res_lvbptr)
++ free_lvb(rsb->res_lvbptr);
++
++ free_rsb(rsb);
++ }
++ }
++
++ /*
++ * Free reshashtbl[] itself
++ */
++
++ kfree(ls->ls_reshashtbl);
++
++ /*
++ * Free structures on any other lists
++ */
++
++ head = &ls->ls_recover;
++ while (!list_empty(head)) {
++ gr = list_entry(head->next, gd_recover_t, gr_list);
++ list_del(&gr->gr_list);
++ free_dlm_recover(gr);
++ }
++
++ head = &ls->ls_nodes;
++ while (!list_empty(head)) {
++ csb = list_entry(head->next, gd_csb_t, csb_list);
++ list_del(&csb->csb_list);
++ release_csb(csb);
++ }
++
++ head = &ls->ls_nodes_gone;
++ while (!list_empty(head)) {
++ csb = list_entry(head->next, gd_csb_t, csb_list);
++ list_del(&csb->csb_list);
++ release_csb(csb);
++ }
++
++ free_ls(ls);
++
++ dlm_release();
++
++ module_put(THIS_MODULE);
++ return 0;
++}
++
++
++/*
++ * Called when a system has released all its locks and is not going to use the
++ * lockspace any longer. We blindly free everything we're managing for this
++ * lockspace. Remaining nodes will go through the recovery process as if we'd
++ * died. The lockspace must continue to function as usual, participating in
++ * recoveries, until kcl_leave_service returns.
++ *
++ * Force has 4 possible values:
++ * 0 - don't destroy the lockspace if it has any LKBs
++ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
++ * 2 - destroy lockspace regardless of LKBs
++ * 3 - destroy lockspace as part of a forced shutdown
++ */
++
++int dlm_release_lockspace(void *lockspace, int force)
++{
++ gd_ls_t *ls;
++
++ ls = find_lockspace_by_local_id(lockspace);
++ if (!ls)
++ return -EINVAL;
++
++ return release_lockspace(ls, force);
++}
++
++
++/* Called when the cluster is being shut down dirtily */
++void dlm_emergency_shutdown(void)
++{
++ gd_ls_t *ls;
++ gd_ls_t *tmp;
++
++ /* Shut lowcomms down to prevent any socket activity */
++ lowcomms_stop_accept();
++
++ /* Delete the devices that belong to the userland
++ lockspaces to be deleted. */
++ dlm_device_free_devices();
++
++ /* Now try to clean the lockspaces */
++ spin_lock(&lslist_lock);
++
++ list_for_each_entry_safe(ls, tmp, &lslist, ls_list) {
++ spin_unlock(&lslist_lock);
++ release_lockspace(ls, 3);
++ spin_lock(&lslist_lock);
++ }
++
++ spin_unlock(&lslist_lock);
++}
++
++gd_recover_t *allocate_dlm_recover(void)
++{
++ gd_recover_t *gr;
++
++ gr = (gd_recover_t *) kmalloc(sizeof(gd_recover_t), GFP_KERNEL);
++ if (gr)
++ memset(gr, 0, sizeof(gd_recover_t));
++
++ return gr;
++}
++
++void free_dlm_recover(gd_recover_t * gr)
++{
++ kfree(gr);
++}
++
++/*
++ * Called by CMAN on a specific ls. "stop" means set flag which while set
++ * causes all new requests to ls to be queued and not submitted until flag is
++ * cleared. stop on a ls also needs to cancel any prior starts on the ls.
++ * The recoverd thread carries out any work called for by this event.
++ */
++
++static int dlm_ls_stop(void *servicedata)
++{
++ gd_ls_t *ls = (gd_ls_t *) servicedata;
++ int new;
++
++ spin_lock(&ls->ls_recover_lock);
++ ls->ls_last_stop = ls->ls_last_start;
++ set_bit(LSFL_LS_STOP, &ls->ls_flags);
++ new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags);
++ spin_unlock(&ls->ls_recover_lock);
++
++ /*
++ * This in_recovery lock does two things:
++ *
++ * 1) Keeps this function from returning until all threads are out
++ * of locking routines and locking is truly stopped.
++ * 2) Keeps any new requests from being processed until it's unlocked
++ * when recovery is complete.
++ */
++
++ if (new)
++ down_write(&ls->ls_in_recovery);
++
++ clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
++ clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
++ clear_bit(LSFL_NODES_VALID, &ls->ls_flags);
++ clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
++
++ recoverd_kick(ls);
++
++ return 0;
++}
++
++/*
++ * Called by CMAN on a specific ls. "start" means enable the lockspace to do
++ * request processing which first requires that the recovery procedure be
++ * stepped through with all nodes sharing the lockspace (nodeids). The first
++ * start on the ls after it's created is a special case and requires some extra
++ * work like figuring out our own local nodeid. We can't do all this in the
++ * calling CMAN context, so we must pass this work off to the recoverd thread
++ * which was created in gdlm_init(). The recoverd thread carries out any work
++ * called for by this event.
++ */
++
++static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count,
++ int event_id, int type)
++{
++ gd_ls_t *ls = (gd_ls_t *) servicedata;
++ gd_recover_t *gr;
++ int error = -ENOMEM;
++
++ gr = allocate_dlm_recover();
++ if (!gr)
++ goto out;
++
++ gr->gr_nodeids = nodeids;
++ gr->gr_node_count = count;
++ gr->gr_event_id = event_id;
++
++ spin_lock(&ls->ls_recover_lock);
++ ls->ls_last_start = event_id;
++ list_add_tail(&gr->gr_list, &ls->ls_recover);
++ set_bit(LSFL_LS_START, &ls->ls_flags);
++ spin_unlock(&ls->ls_recover_lock);
++
++ recoverd_kick(ls);
++ error = 0;
++
++ out:
++ return error;
++}
++
++/*
++ * Called by CMAN on a specific ls. "finish" means that all nodes which
++ * received a "start" have completed the start and called kcl_start_done.
++ * The recoverd thread carries out any work called for by this event.
++ */
++
++static void dlm_ls_finish(void *servicedata, int event_id)
++{
++ gd_ls_t *ls = (gd_ls_t *) servicedata;
++
++ spin_lock(&ls->ls_recover_lock);
++ ls->ls_last_finish = event_id;
++ set_bit(LSFL_LS_FINISH, &ls->ls_flags);
++ spin_unlock(&ls->ls_recover_lock);
++
++ recoverd_kick(ls);
++}
++
++struct kcl_service_ops ls_ops = {
++ .stop = dlm_ls_stop,
++ .start = dlm_ls_start,
++ .finish = dlm_ls_finish
++};
+diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h
+--- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lockspace.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOCKSPACE_DOT_H__
++#define __LOCKSPACE_DOT_H__
++
++void dlm_lockspace_init(void);
++int dlm_init(void);
++int dlm_release(void);
++int dlm_new_lockspace(char *name, int namelen, void **ls, int flags);
++int dlm_release_lockspace(void *ls, int force);
++gd_ls_t *find_lockspace_by_global_id(uint32_t id);
++gd_ls_t *find_lockspace_by_local_id(void *id);
++gd_ls_t *find_lockspace_by_name(char *name, int namelen);
++void free_dlm_recover(gd_recover_t *gr);
++int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out);
++void dlm_emergency_shutdown(void);
++
++#endif /* __LOCKSPACE_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c
+--- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lowcomms.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,1354 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * lowcomms.c
++ *
++ * This is the "low-level" comms layer.
++ *
++ * It is responsible for sending/receiving messages
++ * from other nodes in the cluster.
++ *
++ * Cluster nodes are referred to by their nodeids. nodeids are
++ * simply 32 bit numbers to the locking module - if they need to
++ * be expanded for the cluster infrastructure then that is its
++ * responsibility. It is this layer's responsibility to resolve
++ * these into IP addresses or whatever else it needs for
++ * inter-node communication.
++ *
++ * The comms level is two kernel threads that deal mainly with
++ * the receiving of messages from other nodes and passing them
++ * up to the mid-level comms layer (which understands the
++ * message format) for execution by the locking core, and
++ * a send thread which does all the setting up of connections
++ * to remote nodes and the sending of data. Threads are not allowed
++ * to send their own data because it may cause them to wait in times
++ * of high load. Also, this way, the sending thread can collect together
++ * messages bound for one node and send them in one block.
++ *
++ * I don't see any problem with the recv thread executing the locking
++ * code on behalf of remote processes as the locking code is
++ * short, efficient and never waits.
++ *
++ */
++
++
++#include <asm/ioctls.h>
++#include <net/sock.h>
++#include <net/tcp.h>
++#include <linux/pagemap.h>
++#include <cluster/cnxman.h>
++
++#include "dlm_internal.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "config.h"
++
++struct cbuf {
++ unsigned base;
++ unsigned len;
++ unsigned mask;
++};
++
++#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
++#define CBUF_ADD(cb, n) do { (cb)->len += (n); } while(0)
++#define CBUF_EMPTY(cb) ((cb)->len == 0)
++#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
++#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \
++ (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
++#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
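++/*
++ * A tiny circular-buffer helper. The size handed to CBUF_INIT must be
++ * a power of two for the mask-based wraparound in CBUF_EAT/CBUF_DATA
++ * to work; CBUF_DATA yields the offset of the next free byte.
++ */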
++
++struct connection {
++ struct socket *sock; /* NULL if not connected */
++ uint32_t nodeid; /* So we know who we are in the list */
++ struct rw_semaphore sock_sem; /* Stop connect races */
++ struct list_head read_list; /* On this list when ready for reading */
++ struct list_head write_list; /* On this list when ready for writing */
++ struct list_head state_list; /* On this list when ready to connect */
++ unsigned long flags; /* bit 1,2 = We are on the read/write lists */
++#define CF_READ_PENDING 1
++#define CF_WRITE_PENDING 2
++#define CF_CONNECT_PENDING 3
++#define CF_IS_OTHERSOCK 4
++ struct list_head writequeue; /* List of outgoing writequeue_entries */
++ struct list_head listenlist; /* List of allocated listening sockets */
++ spinlock_t writequeue_lock;
++ int (*rx_action) (struct connection *); /* What to do when active */
++ struct page *rx_page;
++ struct cbuf cb;
++ int retries;
++#define MAX_CONNECT_RETRIES 3
++ struct connection *othersock;
++};
++#define sock2con(x) ((struct connection *)(x)->sk_user_data)
++#define nodeid2con(x) (&connections[(x)])
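++/*
++ * sk_user_data carries the connection back-pointer so the socket
++ * callbacks below can find their struct connection; nodeid2con relies
++ * on connections[] being indexed directly by nodeid.
++ */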
++
++/* An entry waiting to be sent */
++struct writequeue_entry {
++ struct list_head list;
++ struct page *page;
++ int offset;
++ int len;
++ int end;
++ int users;
++ struct connection *con;
++};
++
++/* "Template" structure for IPv4 and IPv6 used to fill
++ * in the missing bits when converting between cman (which knows
++ * nothing about sockaddr structs) and real life where we actually
++ * have to connect to these addresses. Also one of these structs
++ * will hold the cached "us" address.
++ *
++ * It's an in6 sockaddr just so there's enough space for anything
++ * we're likely to see here.
++ */
++static struct sockaddr_in6 local_addr;
++
++/* Manage daemons */
++static struct semaphore thread_lock;
++static struct completion thread_completion;
++static atomic_t send_run;
++static atomic_t recv_run;
++
++/* An array of connections, indexed by NODEID */
++static struct connection *connections;
++static int conn_array_size;
++static atomic_t writequeue_length;
++static atomic_t accepting;
++
++static wait_queue_t lowcomms_send_waitq_head;
++static wait_queue_head_t lowcomms_send_waitq;
++
++static wait_queue_t lowcomms_recv_waitq_head;
++static wait_queue_head_t lowcomms_recv_waitq;
++
++/* List of sockets that have reads pending */
++static struct list_head read_sockets;
++static spinlock_t read_sockets_lock;
++
++/* List of sockets which have writes pending */
++static struct list_head write_sockets;
++static spinlock_t write_sockets_lock;
++
++/* List of sockets which have connects pending */
++static struct list_head state_sockets;
++static spinlock_t state_sockets_lock;
++
++/* List of allocated listen sockets */
++static struct list_head listen_sockets;
++
++static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr);
++static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len);
++
++
++/* Data available on socket or listen socket received a connect */
++static void lowcomms_data_ready(struct sock *sk, int count_unused)
++{
++ struct connection *con = sock2con(sk);
++
++ if (test_and_set_bit(CF_READ_PENDING, &con->flags))
++ return;
++
++ spin_lock_bh(&read_sockets_lock);
++ list_add_tail(&con->read_list, &read_sockets);
++ spin_unlock_bh(&read_sockets_lock);
++
++ wake_up_interruptible(&lowcomms_recv_waitq);
++}
++
++static void lowcomms_write_space(struct sock *sk)
++{
++ struct connection *con = sock2con(sk);
++
++ if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
++ return;
++
++ spin_lock_bh(&write_sockets_lock);
++ list_add_tail(&con->write_list, &write_sockets);
++ spin_unlock_bh(&write_sockets_lock);
++
++ wake_up_interruptible(&lowcomms_send_waitq);
++}
++
++static inline void lowcomms_connect_sock(struct connection *con)
++{
++ if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
++ return;
++ if (!atomic_read(&accepting))
++ return;
++
++ spin_lock_bh(&state_sockets_lock);
++ list_add_tail(&con->state_list, &state_sockets);
++ spin_unlock_bh(&state_sockets_lock);
++
++ wake_up_interruptible(&lowcomms_send_waitq);
++}
++
++static void lowcomms_state_change(struct sock *sk)
++{
++/* struct connection *con = sock2con(sk); */
++
++ switch (sk->sk_state) {
++ case TCP_ESTABLISHED:
++ lowcomms_write_space(sk);
++ break;
++
++ case TCP_FIN_WAIT1:
++ case TCP_FIN_WAIT2:
++ case TCP_TIME_WAIT:
++ case TCP_CLOSE:
++ case TCP_CLOSE_WAIT:
++ case TCP_LAST_ACK:
++ case TCP_CLOSING:
++ /* FIXME: I think this causes more trouble than it solves.
++		   lowcomms will reconnect anyway when there is something to
++ send. This just attempts reconnection if a node goes down!
++ */
++ /* lowcomms_connect_sock(con); */
++ break;
++
++ default:
++ printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
++ break;
++ }
++}
++
++/* Make a socket active */
++static int add_sock(struct socket *sock, struct connection *con)
++{
++ con->sock = sock;
++
++ /* Install a data_ready callback */
++ con->sock->sk->sk_data_ready = lowcomms_data_ready;
++ con->sock->sk->sk_write_space = lowcomms_write_space;
++ con->sock->sk->sk_state_change = lowcomms_state_change;
++
++ return 0;
++}
++
++/* Add the port number to an IPv6 or IPv4 sockaddr and return the
++   address length */
++static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port,
++ int *addr_len)
++{
++ saddr->sin6_family = local_addr.sin6_family;
++ if (local_addr.sin6_family == AF_INET) {
++ struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
++ in4_addr->sin_port = cpu_to_be16(port);
++ *addr_len = sizeof(struct sockaddr_in);
++ }
++ else {
++ saddr->sin6_port = cpu_to_be16(port);
++ *addr_len = sizeof(struct sockaddr_in6);
++ }
++}
++
++/* Close a remote connection and tidy up */
++static void close_connection(struct connection *con)
++{
++ if (test_bit(CF_IS_OTHERSOCK, &con->flags))
++ return;
++
++ down_write(&con->sock_sem);
++
++ if (con->sock) {
++ sock_release(con->sock);
++ con->sock = NULL;
++ if (con->othersock) {
++ down_write(&con->othersock->sock_sem);
++ sock_release(con->othersock->sock);
++ con->othersock->sock = NULL;
++ up_write(&con->othersock->sock_sem);
++ kfree(con->othersock);
++ con->othersock = NULL;
++ }
++ }
++ if (con->rx_page) {
++ __free_page(con->rx_page);
++ con->rx_page = NULL;
++ }
++ up_write(&con->sock_sem);
++}
++
++/* Data received from remote end */
++static int receive_from_sock(struct connection *con)
++{
++ int ret = 0;
++ struct msghdr msg;
++ struct iovec iov[2];
++ mm_segment_t fs;
++ unsigned len;
++ int r;
++ int call_again_soon = 0;
++
++ down_read(&con->sock_sem);
++
++ if (con->sock == NULL)
++ goto out;
++ if (con->rx_page == NULL) {
++ /*
++ * This doesn't need to be atomic, but I think it should
++ * improve performance if it is.
++ */
++ con->rx_page = alloc_page(GFP_ATOMIC);
++ if (con->rx_page == NULL)
++ goto out_resched;
++ CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
++ }
++	/*
++	 * To avoid doing too many short reads, we will reschedule for
++	 * another time if there are less than 32 bytes left in the buffer.
++	 */
++ if (!CBUF_MAY_ADD(&con->cb, 32))
++ goto out_resched;
++
++ msg.msg_control = NULL;
++ msg.msg_controllen = 0;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = iov;
++ msg.msg_name = NULL;
++ msg.msg_namelen = 0;
++ msg.msg_flags = 0;
++
++	/*
++	 * iov[0] is the free space starting at the current end point
++	 * (cb.base + cb.len); if the free region doesn't wrap it runs
++	 * up to cb.base, otherwise it runs to the end of the buffer.
++	 */
++ iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
++ iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
++ iov[1].iov_len = 0;
++
++	/*
++	 * When the free region wraps, iov[1] covers the start of the
++	 * buffer up to the start of the currently used section (cb.base).
++	 */
++ if (CBUF_DATA(&con->cb) >= con->cb.base) {
++ iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
++ iov[1].iov_len = con->cb.base;
++ iov[1].iov_base = page_address(con->rx_page);
++ msg.msg_iovlen = 2;
++ }
++ len = iov[0].iov_len + iov[1].iov_len;
++
++ fs = get_fs();
++ set_fs(get_ds());
++ r = ret = sock_recvmsg(con->sock, &msg, len,
++ MSG_DONTWAIT | MSG_NOSIGNAL);
++ set_fs(fs);
++
++ if (ret <= 0)
++ goto out_close;
++ if (ret == len)
++ call_again_soon = 1;
++ CBUF_ADD(&con->cb, ret);
++ ret = midcomms_process_incoming_buffer(con->nodeid,
++ page_address(con->rx_page),
++ con->cb.base, con->cb.len,
++ PAGE_CACHE_SIZE);
++ if (ret == -EBADMSG) {
++ printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
++ "iov_len=%u, iov_base[0]=%p, read=%d\n",
++ page_address(con->rx_page), con->cb.base, con->cb.len,
++ len, iov[0].iov_base, r);
++ }
++ if (ret < 0)
++ goto out_close;
++ CBUF_EAT(&con->cb, ret);
++
++ if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
++ __free_page(con->rx_page);
++ con->rx_page = NULL;
++ }
++ out:
++ if (call_again_soon)
++ goto out_resched;
++ up_read(&con->sock_sem);
++ ret = 0;
++ goto out_ret;
++
++ out_resched:
++ lowcomms_data_ready(con->sock->sk, 0);
++ up_read(&con->sock_sem);
++ ret = 0;
++ goto out_ret;
++
++ out_close:
++ up_read(&con->sock_sem);
++ if (ret != -EAGAIN && !test_bit(CF_IS_OTHERSOCK, &con->flags)) {
++ close_connection(con);
++ lowcomms_connect_sock(con);
++ }
++
++ out_ret:
++ return ret;
++}
++
++/* Listening socket is busy, accept a connection */
++static int accept_from_sock(struct connection *con)
++{
++ int result;
++ struct sockaddr_in6 peeraddr;
++ struct socket *newsock;
++ int len;
++ int nodeid;
++ struct connection *newcon;
++
++ memset(&peeraddr, 0, sizeof(peeraddr));
++ newsock = sock_alloc();
++ if (!newsock)
++ return -ENOMEM;
++
++ down_read(&con->sock_sem);
++
++ result = -ENOTCONN;
++ if (con->sock == NULL)
++ goto accept_err;
++
++ newsock->type = con->sock->type;
++ newsock->ops = con->sock->ops;
++
++ result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
++ if (result < 0)
++ goto accept_err;
++
++ /* Get the connected socket's peer */
++ if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
++ &len, 2)) {
++ result = -ECONNABORTED;
++ goto accept_err;
++ }
++
++ /* Get the new node's NODEID */
++ nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len);
++ if (nodeid == 0) {
++ printk("dlm: connect from non cluster node\n");
++ sock_release(newsock);
++ up_read(&con->sock_sem);
++ return -1;
++ }
++
++ log_print("got connection from %d", nodeid);
++
++ /* Check to see if we already have a connection to this node. This
++ * could happen if the two nodes initiate a connection at roughly
++ * the same time and the connections cross on the wire.
++ * TEMPORARY FIX:
++ * In this case we store the incoming one in "othersock"
++ */
++ newcon = nodeid2con(nodeid);
++ down_write(&newcon->sock_sem);
++ if (newcon->sock) {
++ struct connection *othercon;
++
++ othercon = kmalloc(sizeof(struct connection), GFP_KERNEL);
++ if (!othercon) {
++ printk("dlm: failed to allocate incoming socket\n");
++ sock_release(newsock);
++ up_write(&newcon->sock_sem);
++ up_read(&con->sock_sem);
++ goto accept_out;
++ }
++ memset(othercon, 0, sizeof(*othercon));
++ newcon->othersock = othercon;
++ othercon->nodeid = nodeid;
++ othercon->sock = newsock;
++ othercon->rx_action = receive_from_sock;
++ add_sock(newsock, othercon);
++ init_rwsem(&othercon->sock_sem);
++ set_bit(CF_IS_OTHERSOCK, &othercon->flags);
++ newsock->sk->sk_user_data = othercon;
++
++ up_write(&newcon->sock_sem);
++ lowcomms_data_ready(newsock->sk, 0);
++ up_read(&con->sock_sem);
++ goto accept_out;
++ }
++
++ newsock->sk->sk_user_data = newcon;
++ newcon->rx_action = receive_from_sock;
++ add_sock(newsock, newcon);
++ up_write(&newcon->sock_sem);
++
++	/*
++	 * Add it to the active queue in case we got data
++	 * between processing the accept and adding the socket
++	 * to the read_sockets list
++	 */
++ lowcomms_data_ready(newsock->sk, 0);
++
++ up_read(&con->sock_sem);
++
++ accept_out:
++ return 0;
++
++ accept_err:
++ up_read(&con->sock_sem);
++ sock_release(newsock);
++
++ printk("dlm: error accepting connection from node: %d\n", result);
++ return result;
++}
++
++/* Connect a new socket to its peer */
++static int connect_to_sock(struct connection *con)
++{
++ int result = -EHOSTUNREACH;
++ struct sockaddr_in6 saddr;
++ int addr_len;
++ struct socket *sock;
++
++ if (con->nodeid == 0) {
++ log_print("attempt to connect sock 0 foiled");
++ return 0;
++ }
++
++ down_write(&con->sock_sem);
++ if (con->retries++ > MAX_CONNECT_RETRIES)
++ goto out;
++
++	/* FIXME: not sure this should happen, let alone like this. */
++ if (con->sock) {
++ sock_release(con->sock);
++ con->sock = NULL;
++ }
++
++ /* Create a socket to communicate with */
++ result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
++ if (result < 0)
++ goto out_err;
++
++ if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0)
++ goto out_err;
++
++ sock->sk->sk_user_data = con;
++ con->rx_action = receive_from_sock;
++
++ make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
++
++ add_sock(sock, con);
++ result =
++ sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len,
++ O_NONBLOCK);
++ if (result == -EINPROGRESS)
++ result = 0;
++ if (result != 0)
++ goto out_err;
++
++ out:
++ up_write(&con->sock_sem);
++ /*
++ * Returning an error here means we've given up trying to connect to
++	 * a remote node, otherwise we return 0 and reschedule the connection
++ * attempt
++ */
++ return result;
++
++ out_err:
++ if (con->sock) {
++ sock_release(con->sock);
++ con->sock = NULL;
++ }
++ /*
++ * Some errors are fatal and this list might need adjusting. For other
++ * errors we try again until the max number of retries is reached.
++ */
++	if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
++	    result != -ENETDOWN && result != -EINVAL &&
++	    result != -EPROTONOSUPPORT) {
++ lowcomms_connect_sock(con);
++ result = 0;
++ }
++ goto out;
++}
++
++static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len)
++{
++ struct socket *sock = NULL;
++ mm_segment_t fs;
++ int result = 0;
++ int one = 1;
++ struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr;
++
++ /* Create a socket to communicate with */
++ result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
++ if (result < 0) {
++ printk("dlm: Can't create listening comms socket\n");
++ goto create_out;
++ }
++
++ fs = get_fs();
++ set_fs(get_ds());
++ result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
++ set_fs(fs);
++ if (result < 0) {
++ printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
++ }
++ sock->sk->sk_user_data = con;
++ con->rx_action = accept_from_sock;
++ con->sock = sock;
++
++ /* Bind to our port */
++ make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
++ result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
++ if (result < 0) {
++ printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
++ sock_release(sock);
++ sock = NULL;
++ goto create_out;
++ }
++
++ fs = get_fs();
++ set_fs(get_ds());
++
++ result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
++ set_fs(fs);
++ if (result < 0) {
++ printk("dlm: Set keepalive failed: %d\n", result);
++ }
++
++ result = sock->ops->listen(sock, 5);
++ if (result < 0) {
++ printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
++ sock_release(sock);
++ sock = NULL;
++ goto create_out;
++ }
++
++ create_out:
++ return sock;
++}
++
++
++/* Listen on all interfaces */
++static int listen_for_all(void)
++{
++ int result = 0;
++ int nodeid;
++ struct socket *sock = NULL;
++ struct list_head *addr_list;
++ struct connection *con = nodeid2con(0);
++ struct cluster_node_addr *node_addr;
++	char listen_addr[sizeof(struct sockaddr_in6)];
++
++	/* This will also fill in the static local_addr template */
++ nodeid = lowcomms_our_nodeid();
++
++ addr_list = kcl_get_node_addresses(nodeid);
++ if (!addr_list) {
++ printk("dlm: cannot initialise comms layer\n");
++ result = -ENOTCONN;
++ goto create_out;
++ }
++
++ list_for_each_entry(node_addr, addr_list, list) {
++
++ if (!con) {
++ con = kmalloc(sizeof(struct connection), GFP_KERNEL);
++ if (!con) {
++				printk("dlm: failed to allocate listen socket\n");
++				result = -ENOMEM;
++				goto create_out;
++ }
++ memset(con, 0, sizeof(*con));
++ init_rwsem(&con->sock_sem);
++ spin_lock_init(&con->writequeue_lock);
++ INIT_LIST_HEAD(&con->writequeue);
++ set_bit(CF_IS_OTHERSOCK, &con->flags);
++ }
++
++		memcpy(listen_addr, node_addr->addr, node_addr->addr_len);
++		sock = create_listen_sock(con, listen_addr,
++					  node_addr->addr_len);
++		if (sock) {
++			add_sock(sock, con);
++
++			/* Keep a list of dynamically allocated listening
++			   sockets so we can free them at shutdown */
++			if (test_bit(CF_IS_OTHERSOCK, &con->flags))
++				list_add_tail(&con->listenlist,
++					      &listen_sockets);
++		}
++		else {
++			/* Only the dynamically allocated connections may
++			   be freed, and con must not be touched after
++			   the kfree() */
++			if (test_bit(CF_IS_OTHERSOCK, &con->flags))
++				kfree(con);
++		}
++		con = NULL;
++	}
++
++ create_out:
++ return result;
++}
++
++
++
++static struct writequeue_entry *new_writequeue_entry(struct connection *con,
++ int allocation)
++{
++ struct writequeue_entry *entry;
++
++ entry = kmalloc(sizeof(struct writequeue_entry), allocation);
++ if (!entry)
++ return NULL;
++
++ entry->page = alloc_page(allocation);
++ if (!entry->page) {
++ kfree(entry);
++ return NULL;
++ }
++
++ entry->offset = 0;
++ entry->len = 0;
++ entry->end = 0;
++ entry->users = 0;
++ entry->con = con;
++
++ return entry;
++}
++
++struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
++ int allocation, char **ppc)
++{
++ struct connection *con = nodeid2con(nodeid);
++ struct writequeue_entry *e;
++ int offset = 0;
++ int users = 0;
++
++ if (!atomic_read(&accepting))
++ return NULL;
++
++ spin_lock(&con->writequeue_lock);
++ e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
++ if (((struct list_head *) e == &con->writequeue) ||
++ (PAGE_CACHE_SIZE - e->end < len)) {
++ e = NULL;
++ } else {
++ offset = e->end;
++ e->end += len;
++ users = e->users++;
++ }
++ spin_unlock(&con->writequeue_lock);
++
++ if (e) {
++ got_one:
++ if (users == 0)
++ kmap(e->page);
++ *ppc = page_address(e->page) + offset;
++ return e;
++ }
++
++ e = new_writequeue_entry(con, allocation);
++ if (e) {
++ spin_lock(&con->writequeue_lock);
++ offset = e->end;
++ e->end += len;
++ users = e->users++;
++ list_add_tail(&e->list, &con->writequeue);
++ spin_unlock(&con->writequeue_lock);
++ atomic_inc(&writequeue_length);
++ goto got_one;
++ }
++ return NULL;
++}
++
++void lowcomms_commit_buffer(struct writequeue_entry *e)
++{
++ struct connection *con = e->con;
++ int users;
++
++ if (!atomic_read(&accepting))
++ return;
++
++ spin_lock(&con->writequeue_lock);
++ users = --e->users;
++ if (users)
++ goto out;
++ e->len = e->end - e->offset;
++ kunmap(e->page);
++ spin_unlock(&con->writequeue_lock);
++
++ if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
++ spin_lock_bh(&write_sockets_lock);
++ list_add_tail(&con->write_list, &write_sockets);
++ spin_unlock_bh(&write_sockets_lock);
++
++ wake_up_interruptible(&lowcomms_send_waitq);
++ }
++ return;
++
++ out:
++ spin_unlock(&con->writequeue_lock);
++ return;
++}
++
++static void free_entry(struct writequeue_entry *e)
++{
++ __free_page(e->page);
++ kfree(e);
++ atomic_dec(&writequeue_length);
++}
++
++/* Send a message */
++static int send_to_sock(struct connection *con)
++{
++ int ret = 0;
++ ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
++ const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
++ struct writequeue_entry *e;
++ int len, offset;
++
++ down_read(&con->sock_sem);
++ if (con->sock == NULL)
++ goto out_connect;
++
++ sendpage = con->sock->ops->sendpage;
++
++ spin_lock(&con->writequeue_lock);
++ for (;;) {
++ e = list_entry(con->writequeue.next, struct writequeue_entry,
++ list);
++ if ((struct list_head *) e == &con->writequeue)
++ break;
++
++ len = e->len;
++ offset = e->offset;
++ BUG_ON(len == 0 && e->users == 0);
++ spin_unlock(&con->writequeue_lock);
++
++ ret = 0;
++ if (len) {
++ ret = sendpage(con->sock, e->page, offset, len,
++ msg_flags);
++ if (ret == -EAGAIN || ret == 0)
++ goto out;
++ if (ret <= 0)
++ goto send_error;
++ }
++
++ spin_lock(&con->writequeue_lock);
++ e->offset += ret;
++ e->len -= ret;
++
++ if (e->len == 0 && e->users == 0) {
++ list_del(&e->list);
++ free_entry(e);
++ continue;
++ }
++ }
++ spin_unlock(&con->writequeue_lock);
++ out:
++ up_read(&con->sock_sem);
++ return ret;
++
++ send_error:
++ up_read(&con->sock_sem);
++ close_connection(con);
++ lowcomms_connect_sock(con);
++ return ret;
++
++ out_connect:
++ up_read(&con->sock_sem);
++ lowcomms_connect_sock(con);
++ return 0;
++}
++
++/* Called from recoverd when it knows that a node has
++ left the cluster */
++int lowcomms_close(int nodeid)
++{
++ struct connection *con;
++
++ if (!connections)
++ goto out;
++
++ con = nodeid2con(nodeid);
++ if (con->sock) {
++ close_connection(con);
++ return 0;
++ }
++
++ out:
++ return -1;
++}
++
++/* API send message call, may queue the request */
++/* N.B. This is the old interface - use the new one for new calls */
++int lowcomms_send_message(int nodeid, char *buf, int len, int allocation)
++{
++ struct writequeue_entry *e;
++ char *b;
++
++ GDLM_ASSERT(nodeid < dlm_config.max_connections,
++ printk("nodeid=%u\n", nodeid););
++
++ e = lowcomms_get_buffer(nodeid, len, allocation, &b);
++ if (e) {
++ memcpy(b, buf, len);
++ lowcomms_commit_buffer(e);
++ return 0;
++ }
++ return -ENOBUFS;
++}
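++
++/*
++ * Illustrative sketch of the newer two-phase interface, mirroring
++ * lowcomms_send_message() above. This is not part of the original
++ * patch: example_send is a hypothetical caller and the block is
++ * compiled out.
++ */
++#if 0
++static int example_send(int nodeid, char *msg, int len)
++{
++	struct writequeue_entry *e;
++	char *p;
++
++	/* Reserve len bytes in the node's write queue page */
++	e = lowcomms_get_buffer(nodeid, len, GFP_KERNEL, &p);
++	if (!e)
++		return -ENOBUFS;
++
++	memcpy(p, msg, len);		/* build the message in place */
++	lowcomms_commit_buffer(e);	/* queue it; dlm_sendd sends it */
++	return 0;
++}
++#endif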
++
++/* Look for activity on active sockets */
++static void process_sockets(void)
++{
++ struct list_head *list;
++ struct list_head *temp;
++
++ spin_lock_bh(&read_sockets_lock);
++ list_for_each_safe(list, temp, &read_sockets) {
++ struct connection *con =
++ list_entry(list, struct connection, read_list);
++ list_del(&con->read_list);
++ clear_bit(CF_READ_PENDING, &con->flags);
++
++ spin_unlock_bh(&read_sockets_lock);
++
++ con->rx_action(con);
++
++ /* Don't starve out everyone else */
++ schedule();
++ spin_lock_bh(&read_sockets_lock);
++ }
++ spin_unlock_bh(&read_sockets_lock);
++}
++
++/* Try to send any messages that are pending
++ */
++static void process_output_queue(void)
++{
++	struct list_head *list;
++	struct list_head *temp;
++
++ spin_lock_bh(&write_sockets_lock);
++ list_for_each_safe(list, temp, &write_sockets) {
++ struct connection *con =
++ list_entry(list, struct connection, write_list);
++ list_del(&con->write_list);
++ clear_bit(CF_WRITE_PENDING, &con->flags);
++
++ spin_unlock_bh(&write_sockets_lock);
++
++		/* send_to_sock() handles errors itself by closing and
++		   rescheduling the connection, so ignore the result */
++		send_to_sock(con);
++ spin_lock_bh(&write_sockets_lock);
++ }
++ spin_unlock_bh(&write_sockets_lock);
++}
++
++static void process_state_queue(void)
++{
++	struct list_head *list;
++	struct list_head *temp;
++
++ spin_lock_bh(&state_sockets_lock);
++ list_for_each_safe(list, temp, &state_sockets) {
++ struct connection *con =
++ list_entry(list, struct connection, state_list);
++ list_del(&con->state_list);
++ clear_bit(CF_CONNECT_PENDING, &con->flags);
++ spin_unlock_bh(&state_sockets_lock);
++
++		/* connect_to_sock() retries or gives up internally,
++		   so ignore the result */
++		connect_to_sock(con);
++ spin_lock_bh(&state_sockets_lock);
++ }
++ spin_unlock_bh(&state_sockets_lock);
++}
++
++/* Discard all entries on the write queues */
++static void clean_writequeues(void)
++{
++ struct list_head *list;
++ struct list_head *temp;
++ int nodeid;
++
++ for (nodeid = 1; nodeid < dlm_config.max_connections; nodeid++) {
++ struct connection *con = nodeid2con(nodeid);
++
++ spin_lock(&con->writequeue_lock);
++ list_for_each_safe(list, temp, &con->writequeue) {
++ struct writequeue_entry *e =
++ list_entry(list, struct writequeue_entry, list);
++ list_del(&e->list);
++ free_entry(e);
++ }
++ spin_unlock(&con->writequeue_lock);
++ }
++}
++
++static int read_list_empty(void)
++{
++ int status;
++
++ spin_lock_bh(&read_sockets_lock);
++ status = list_empty(&read_sockets);
++ spin_unlock_bh(&read_sockets_lock);
++
++ return status;
++}
++
++/* DLM Transport comms receive daemon */
++static int dlm_recvd(void *data)
++{
++ daemonize("dlm_recvd");
++ atomic_set(&recv_run, 1);
++
++ init_waitqueue_head(&lowcomms_recv_waitq);
++ init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
++ add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
++
++ complete(&thread_completion);
++
++ while (atomic_read(&recv_run)) {
++
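++		/* Lost-wakeup-safe wait: mark ourselves INTERRUPTIBLE
++		   before testing the list, so a wake-up between the
++		   test and schedule() just leaves us runnable */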
++ set_task_state(current, TASK_INTERRUPTIBLE);
++
++ if (read_list_empty())
++ schedule();
++
++ set_task_state(current, TASK_RUNNING);
++
++ process_sockets();
++ }
++
++ down(&thread_lock);
++ up(&thread_lock);
++
++ complete(&thread_completion);
++
++ return 0;
++}
++
++static int write_and_state_lists_empty(void)
++{
++ int status;
++
++ spin_lock_bh(&write_sockets_lock);
++ status = list_empty(&write_sockets);
++ spin_unlock_bh(&write_sockets_lock);
++
++ spin_lock_bh(&state_sockets_lock);
++ if (list_empty(&state_sockets) == 0)
++ status = 0;
++ spin_unlock_bh(&state_sockets_lock);
++
++ return status;
++}
++
++/* DLM Transport send daemon */
++static int dlm_sendd(void *data)
++{
++ daemonize("dlm_sendd");
++ atomic_set(&send_run, 1);
++
++ init_waitqueue_head(&lowcomms_send_waitq);
++ init_waitqueue_entry(&lowcomms_send_waitq_head, current);
++ add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
++
++ complete(&thread_completion);
++
++ while (atomic_read(&send_run)) {
++
++ set_task_state(current, TASK_INTERRUPTIBLE);
++
++ if (write_and_state_lists_empty())
++ schedule();
++
++ set_task_state(current, TASK_RUNNING);
++
++ process_state_queue();
++ process_output_queue();
++ }
++
++ down(&thread_lock);
++ up(&thread_lock);
++
++ complete(&thread_completion);
++
++ return 0;
++}
++
++static void daemons_stop(void)
++{
++ if (atomic_read(&recv_run)) {
++ down(&thread_lock);
++ atomic_set(&recv_run, 0);
++ wake_up_interruptible(&lowcomms_recv_waitq);
++ up(&thread_lock);
++ wait_for_completion(&thread_completion);
++ }
++
++ if (atomic_read(&send_run)) {
++ down(&thread_lock);
++ atomic_set(&send_run, 0);
++ wake_up_interruptible(&lowcomms_send_waitq);
++ up(&thread_lock);
++ wait_for_completion(&thread_completion);
++ }
++}
++
++static int daemons_start(void)
++{
++ int error;
++
++ error = kernel_thread(dlm_recvd, NULL, 0);
++ if (error < 0) {
++ log_print("can't start recvd thread: %d", error);
++ goto out;
++ }
++ wait_for_completion(&thread_completion);
++
++ error = kernel_thread(dlm_sendd, NULL, 0);
++ if (error < 0) {
++ log_print("can't start sendd thread: %d", error);
++ daemons_stop();
++ goto out;
++ }
++ wait_for_completion(&thread_completion);
++
++ error = 0;
++ out:
++ return error;
++}
++
++/*
++ * Return the largest buffer size we can cope with.
++ */
++int lowcomms_max_buffer_size(void)
++{
++ return PAGE_CACHE_SIZE;
++}
++
++void lowcomms_stop(void)
++{
++ int i;
++ struct connection *temp;
++ struct connection *lcon;
++
++ atomic_set(&accepting, 0);
++
++	/* Set the pending flags (CF_READ/WRITE/CONNECT_PENDING,
++	   i.e. bits 1-3) on every connection to prevent any
++	   further socket activity.
++	*/
++	for (i = 0; i < conn_array_size; i++) {
++		connections[i].flags = 0xE;
++	}
++ }
++ daemons_stop();
++ clean_writequeues();
++
++ for (i = 0; i < conn_array_size; i++) {
++ close_connection(nodeid2con(i));
++ }
++
++ kfree(connections);
++ connections = NULL;
++
++ /* Free up any dynamically allocated listening sockets */
++ list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
++ sock_release(lcon->sock);
++ kfree(lcon);
++ }
++
++ kcl_releaseref_cluster();
++}
++
++/* This is quite likely to sleep... */
++int lowcomms_start(void)
++{
++ int error = 0;
++ int i;
++
++ INIT_LIST_HEAD(&read_sockets);
++ INIT_LIST_HEAD(&write_sockets);
++ INIT_LIST_HEAD(&state_sockets);
++ INIT_LIST_HEAD(&listen_sockets);
++
++ spin_lock_init(&read_sockets_lock);
++ spin_lock_init(&write_sockets_lock);
++ spin_lock_init(&state_sockets_lock);
++
++ init_completion(&thread_completion);
++ init_MUTEX(&thread_lock);
++ atomic_set(&send_run, 0);
++ atomic_set(&recv_run, 0);
++
++ error = -ENOTCONN;
++ if (kcl_addref_cluster())
++ goto out;
++
++ /*
++ * Temporarily initialise the waitq head so that lowcomms_send_message
++ * doesn't crash if it gets called before the thread is fully
++ * initialised
++ */
++ init_waitqueue_head(&lowcomms_send_waitq);
++
++ error = -ENOMEM;
++
++ connections = kmalloc(sizeof(struct connection) *
++ dlm_config.max_connections, GFP_KERNEL);
++ if (!connections)
++ goto out;
++
++ memset(connections, 0,
++ sizeof(struct connection) * dlm_config.max_connections);
++ for (i = 0; i < dlm_config.max_connections; i++) {
++ connections[i].nodeid = i;
++ init_rwsem(&connections[i].sock_sem);
++ INIT_LIST_HEAD(&connections[i].writequeue);
++ spin_lock_init(&connections[i].writequeue_lock);
++ }
++ conn_array_size = dlm_config.max_connections;
++
++ /* Start listening */
++ error = listen_for_all();
++ if (error)
++ goto fail_free_conn;
++
++ error = daemons_start();
++ if (error)
++ goto fail_free_conn;
++
++ atomic_set(&accepting, 1);
++
++ return 0;
++
++ fail_free_conn:
++ kfree(connections);
++
++ out:
++ return error;
++}
++
++/* Don't accept any more outgoing work */
++void lowcomms_stop_accept(void)
++{
++ atomic_set(&accepting, 0);
++}
++
++/* Cluster Manager interface functions for looking up
++ nodeids and IP addresses by each other
++*/
++
++/* Return the IP address of a node given its NODEID */
++static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr)
++{
++ struct list_head *addrs;
++ struct cluster_node_addr *node_addr;
++ struct cluster_node_addr *current_addr = NULL;
++ struct sockaddr_in6 *saddr;
++ int interface;
++ int i;
++
++ addrs = kcl_get_node_addresses(nodeid);
++ if (!addrs)
++ return -1;
++
++ interface = kcl_get_current_interface();
++
++ /* Look for address number <interface> */
++ i=0; /* i/f numbers start at 1 */
++ list_for_each_entry(node_addr, addrs, list) {
++ if (interface == ++i) {
++ current_addr = node_addr;
++ break;
++ }
++ }
++
++ /* If that failed then just use the first one */
++ if (!current_addr)
++ current_addr = (struct cluster_node_addr *)addrs->next;
++
++ saddr = (struct sockaddr_in6 *)current_addr->addr;
++
++ /* Extract the IP address */
++ if (saddr->sin6_family == AF_INET) {
++ struct sockaddr_in *in4 = (struct sockaddr_in *)saddr;
++ struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr;
++ ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
++ }
++ else {
++ struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr;
++ memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr));
++ }
++
++ return 0;
++}
++
++/* Return the NODEID for a node given its sockaddr */
++static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len)
++{
++ struct kcl_cluster_node node;
++ struct sockaddr_in6 ipv6_addr;
++ struct sockaddr_in ipv4_addr;
++
++ if (addr->sa_family == AF_INET) {
++ struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
++ memcpy(&ipv4_addr, &local_addr, addr_len);
++ memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr));
++
++ addr = (struct sockaddr *)&ipv4_addr;
++ }
++ else {
++ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
++ memcpy(&ipv6_addr, &local_addr, addr_len);
++ memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr));
++
++ addr = (struct sockaddr *)&ipv6_addr;
++ }
++
++ if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0)
++ return node.node_id;
++ else
++ return 0;
++}
++
++int lowcomms_our_nodeid(void)
++{
++ struct kcl_cluster_node node;
++ struct list_head *addrs;
++ struct cluster_node_addr *first_addr;
++ static int our_nodeid = 0;
++
++ if (our_nodeid)
++ return our_nodeid;
++
++ if (kcl_get_node_by_nodeid(0, &node) == -1)
++ return 0;
++
++ our_nodeid = node.node_id;
++
++ /* Fill in the "template" structure */
++ addrs = kcl_get_node_addresses(our_nodeid);
++ if (!addrs)
++ return 0;
++
++ first_addr = (struct cluster_node_addr *) addrs->next;
++ memcpy(&local_addr, &first_addr->addr, first_addr->addr_len);
++
++ return node.node_id;
++}
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h
+--- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/lowcomms.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,34 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOWCOMMS_DOT_H__
++#define __LOWCOMMS_DOT_H__
++
++/* The old interface */
++int lowcomms_send_message(int csid, char *buf, int len, int allocation);
++
++/* The new interface */
++struct writequeue_entry;
++extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
++ int allocation, char **ppc);
++extern void lowcomms_commit_buffer(struct writequeue_entry *e);
++
++int lowcomms_start(void);
++void lowcomms_stop(void);
++void lowcomms_stop_accept(void);
++int lowcomms_close(int nodeid);
++int lowcomms_max_buffer_size(void);
++
++int lowcomms_our_nodeid(void);
++
++#endif /* __LOWCOMMS_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c
+--- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/main.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,98 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#define EXPORT_SYMTAB
++
++#include <linux/init.h>
++#include <linux/proc_fs.h>
++#include <linux/ctype.h>
++#include <linux/seq_file.h>
++#include <linux/module.h>
++#include <net/sock.h>
++
++#include <cluster/cnxman.h>
++
++#include "dlm_internal.h"
++#include "lockspace.h"
++#include "recoverd.h"
++#include "ast.h"
++#include "lkb.h"
++#include "nodes.h"
++#include "locking.h"
++#include "config.h"
++#include "memory.h"
++#include "recover.h"
++#include "lowcomms.h"
++
++int dlm_device_init(void);
++void dlm_device_exit(void);
++void dlm_proc_init(void);
++void dlm_proc_exit(void);
++
++
++/* Cluster manager callbacks, we want to know if a node dies
++ N.B. this is independent of lockspace-specific event callbacks from SM */
++
++static void cman_callback(kcl_callback_reason reason, long arg)
++{
++ if (reason == DIED) {
++ lowcomms_close((int) arg);
++ }
++
++	/* This is unconditional, so do what we can to tidy up */
++ if (reason == LEAVING) {
++ dlm_emergency_shutdown();
++ }
++}
++
++int __init init_dlm(void)
++{
++ dlm_proc_init();
++ dlm_lockspace_init();
++ dlm_recoverd_init();
++ dlm_nodes_init();
++ dlm_device_init();
++ dlm_memory_init();
++ dlm_config_init();
++
++ kcl_add_callback(cman_callback);
++
++ printk("DLM %s (built %s %s) installed\n",
++ DLM_RELEASE_NAME, __DATE__, __TIME__);
++
++ return 0;
++}
++
++void __exit exit_dlm(void)
++{
++ kcl_remove_callback(cman_callback);
++
++ dlm_device_exit();
++ dlm_memory_exit();
++ dlm_config_exit();
++ dlm_proc_exit();
++}
++
++MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME);
++MODULE_AUTHOR("Red Hat, Inc.");
++MODULE_LICENSE("GPL");
++
++module_init(init_dlm);
++module_exit(exit_dlm);
++
++EXPORT_SYMBOL(dlm_init);
++EXPORT_SYMBOL(dlm_release);
++EXPORT_SYMBOL(dlm_new_lockspace);
++EXPORT_SYMBOL(dlm_release_lockspace);
++EXPORT_SYMBOL(dlm_lock);
++EXPORT_SYMBOL(dlm_unlock);
+diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c
+--- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/memory.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,238 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* memory.c
++ *
++ * memory allocation routines
++ *
++ */
++
++#include "dlm_internal.h"
++#include "memory.h"
++#include "config.h"
++
++/* as the man says...Shouldn't this be in a header file somewhere? */
++#define BYTES_PER_WORD sizeof(void *)
++
++static kmem_cache_t *rsb_cache_small;
++static kmem_cache_t *rsb_cache_large;
++static kmem_cache_t *lkb_cache;
++static kmem_cache_t *lvb_cache;
++static kmem_cache_t *resdir_cache_large;
++static kmem_cache_t *resdir_cache_small;
++
++/* The thresholds above which we allocate large RSBs/resdatas rather than small
++ * ones. This must make the resultant structure end on a word boundary */
++#define LARGE_RSB_NAME 28
++#define LARGE_RES_NAME 28
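++
++/*
++ * Worked example (illustrative): on a 64-bit machine BYTES_PER_WORD
++ * is 8, so the small RSB object size below is
++ * (sizeof(gd_res_t) + 28 + 7) & ~7, i.e. rounded up to the next
++ * multiple of 8 so that the structure ends on a word boundary.
++ */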
++
++int dlm_memory_init(void)
++{
++ int ret = -ENOMEM;
++
++
++ kmem_cache_create("dlm_rsb(small)",
++ (sizeof(gd_res_t) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
++ __alignof__(gd_res_t), 0, NULL, NULL);
++ if (!rsb_cache_small)
++ goto out;
++
++ rsb_cache_large =
++ kmem_cache_create("dlm_rsb(large)",
++ sizeof(gd_res_t) + DLM_RESNAME_MAXLEN,
++ __alignof__(gd_res_t), 0, NULL, NULL);
++ if (!rsb_cache_large)
++ goto out_free_rsbs;
++
++ lkb_cache = kmem_cache_create("dlm_lkb", sizeof(gd_lkb_t),
++ __alignof__(gd_lkb_t), 0, NULL, NULL);
++ if (!lkb_cache)
++ goto out_free_rsbl;
++
++ resdir_cache_large =
++ kmem_cache_create("dlm_resdir(l)",
++ sizeof(gd_resdata_t) + DLM_RESNAME_MAXLEN,
++ __alignof__(gd_resdata_t), 0, NULL, NULL);
++ if (!resdir_cache_large)
++ goto out_free_lkb;
++
++ resdir_cache_small =
++ kmem_cache_create("dlm_resdir(s)",
++ (sizeof(gd_resdata_t) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
++ __alignof__(gd_resdata_t), 0, NULL, NULL);
++ if (!resdir_cache_small)
++ goto out_free_resl;
++
++ /* LVB cache also holds ranges, so should be 64bit aligned */
++ lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN,
++ __alignof__(uint64_t), 0, NULL, NULL);
++	if (!lvb_cache)
++ goto out_free_ress;
++
++ ret = 0;
++ goto out;
++
++ out_free_ress:
++ kmem_cache_destroy(resdir_cache_small);
++
++ out_free_resl:
++ kmem_cache_destroy(resdir_cache_large);
++
++ out_free_lkb:
++ kmem_cache_destroy(lkb_cache);
++
++ out_free_rsbl:
++ kmem_cache_destroy(rsb_cache_large);
++
++ out_free_rsbs:
++ kmem_cache_destroy(rsb_cache_small);
++
++ out:
++ return ret;
++}
++
++void dlm_memory_exit(void)
++{
++ kmem_cache_destroy(rsb_cache_large);
++ kmem_cache_destroy(rsb_cache_small);
++ kmem_cache_destroy(lkb_cache);
++ kmem_cache_destroy(resdir_cache_small);
++ kmem_cache_destroy(resdir_cache_large);
++ kmem_cache_destroy(lvb_cache);
++}
++
++gd_res_t *allocate_rsb(gd_ls_t *ls, int namelen)
++{
++ gd_res_t *r;
++
++ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
++
++ if (namelen >= LARGE_RSB_NAME)
++ r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation);
++ else
++ r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation);
++
++ if (r)
++ memset(r, 0, sizeof(gd_res_t) + namelen);
++
++ return r;
++}
++
++void free_rsb(gd_res_t *r)
++{
++ int length = r->res_length;
++
++#ifdef POISON
++ memset(r, 0x55, sizeof(gd_res_t) + r->res_length);
++#endif
++
++ if (length >= LARGE_RSB_NAME)
++ kmem_cache_free(rsb_cache_large, r);
++ else
++ kmem_cache_free(rsb_cache_small, r);
++}
++
++gd_lkb_t *allocate_lkb(gd_ls_t *ls)
++{
++ gd_lkb_t *l;
++
++ l = kmem_cache_alloc(lkb_cache, ls->ls_allocation);
++ if (l)
++ memset(l, 0, sizeof(gd_lkb_t));
++
++ return l;
++}
++
++void free_lkb(gd_lkb_t *l)
++{
++#ifdef POISON
++ memset(l, 0xAA, sizeof(gd_lkb_t));
++#endif
++ kmem_cache_free(lkb_cache, l);
++}
++
++gd_resdata_t *allocate_resdata(gd_ls_t *ls, int namelen)
++{
++ gd_resdata_t *rd;
++
++ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
++
++ if (namelen >= LARGE_RES_NAME)
++ rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation);
++ else
++ rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation);
++
++ if (rd)
++ memset(rd, 0, sizeof(gd_resdata_t));
++
++ return rd;
++}
++
++void free_resdata(gd_resdata_t *rd)
++{
++ if (rd->rd_length >= LARGE_RES_NAME)
++ kmem_cache_free(resdir_cache_large, rd);
++ else
++ kmem_cache_free(resdir_cache_small, rd);
++}
++
++char *allocate_lvb(gd_ls_t *ls)
++{
++ char *l;
++
++ l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
++ if (l)
++ memset(l, 0, DLM_LVB_LEN);
++
++ return l;
++}
++
++void free_lvb(char *l)
++{
++ kmem_cache_free(lvb_cache, l);
++}
++
++/* Ranges are allocated from the LVB cache as they are the same size (4x64
++ * bits) */
++uint64_t *allocate_range(gd_ls_t * ls)
++{
++ uint64_t *l;
++
++ l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
++ if (l)
++ memset(l, 0, DLM_LVB_LEN);
++
++ return l;
++}
++
++void free_range(uint64_t *l)
++{
++ kmem_cache_free(lvb_cache, l);
++}
++
++gd_rcom_t *allocate_rcom_buffer(gd_ls_t *ls)
++{
++ gd_rcom_t *rc;
++
++ rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation);
++ if (rc)
++ memset(rc, 0, dlm_config.buffer_size);
++
++ return rc;
++}
++
++void free_rcom_buffer(gd_rcom_t *rc)
++{
++ kfree(rc);
++}
+diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h
+--- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/memory.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,32 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __MEMORY_DOT_H__
++#define __MEMORY_DOT_H__
++
++int dlm_memory_init(void);
++void dlm_memory_exit(void);
++gd_res_t *allocate_rsb(gd_ls_t * ls, int namelen);
++void free_rsb(gd_res_t * r);
++gd_lkb_t *allocate_lkb(gd_ls_t * ls);
++void free_lkb(gd_lkb_t * l);
++gd_resdata_t *allocate_resdata(gd_ls_t * ls, int namelen);
++void free_resdata(gd_resdata_t * rd);
++char *allocate_lvb(gd_ls_t * ls);
++void free_lvb(char *l);
++gd_rcom_t *allocate_rcom_buffer(gd_ls_t * ls);
++void free_rcom_buffer(gd_rcom_t * rc);
++uint64_t *allocate_range(gd_ls_t * ls);
++void free_range(uint64_t * l);
++
++#endif /* __MEMORY_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c
+--- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/midcomms.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,351 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * midcomms.c
++ *
++ * This is the appallingly named "mid-level" comms layer.
++ *
++ * Its purpose is to take buffers from the "real" comms layer,
++ * split them up into individual messages and pass them to the
++ * interested part of the locking mechanism.
++ *
++ * It also takes messages from the locking layer, formats them
++ * into packets and sends them to the comms layer.
++ *
++ * It knows the format of the mid-level messages and the nodeids,
++ * but it does not know how to resolve a nodeid into an IP address
++ * or any of the comms channel details.
++ *
++ */
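++
++/*
++ * Wire-format notes (summarising the conversions below; the full
++ * structure definitions live elsewhere in the DLM headers):
++ * multi-byte header fields such as rh_length (16 bit) and
++ * rh_lockspace (32 bit) travel little-endian, while rh_lkid is
++ * opaque to the remote end and is never byteswapped.
++ */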
++
++#include "dlm_internal.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "lockqueue.h"
++#include "nodes.h"
++#include "reccomms.h"
++#include "config.h"
++
++/* Byteorder routines */
++
++static void host_to_network(void *msg)
++{
++ struct gd_req_header *head = msg;
++ struct gd_remlockrequest *req = msg;
++ struct gd_remlockreply *reply = msg;
++ struct gd_remquery *query = msg;
++ struct gd_remqueryreply *queryrep = msg;
++ gd_rcom_t *rc = msg;
++
++ /* Force into network byte order */
++
++ /*
++ * Do the common header first
++ */
++
++ head->rh_length = cpu_to_le16(head->rh_length);
++ head->rh_lockspace = cpu_to_le32(head->rh_lockspace);
++ /* Leave the lkid alone as it is transparent at the remote end */
++
++ /*
++ * Do the fields in the remlockrequest or remlockreply structs
++ */
++
++ switch (req->rr_header.rh_cmd) {
++
++ case GDLM_REMCMD_LOCKREQUEST:
++ case GDLM_REMCMD_CONVREQUEST:
++ req->rr_range_start = cpu_to_le64(req->rr_range_start);
++ req->rr_range_end = cpu_to_le64(req->rr_range_end);
++ /* Deliberate fall through */
++ case GDLM_REMCMD_UNLOCKREQUEST:
++ case GDLM_REMCMD_LOOKUP:
++ case GDLM_REMCMD_LOCKGRANT:
++ case GDLM_REMCMD_SENDBAST:
++ case GDLM_REMCMD_SENDCAST:
++ case GDLM_REMCMD_REM_RESDATA:
++ req->rr_flags = cpu_to_le32(req->rr_flags);
++ req->rr_status = cpu_to_le32(req->rr_status);
++ break;
++
++ case GDLM_REMCMD_LOCKREPLY:
++ reply->rl_lockstate = cpu_to_le32(reply->rl_lockstate);
++ reply->rl_nodeid = cpu_to_le32(reply->rl_nodeid);
++ reply->rl_status = cpu_to_le32(reply->rl_status);
++ break;
++
++ case GDLM_REMCMD_RECOVERMESSAGE:
++ case GDLM_REMCMD_RECOVERREPLY:
++ rc->rc_msgid = cpu_to_le32(rc->rc_msgid);
++ rc->rc_datalen = cpu_to_le16(rc->rc_datalen);
++ break;
++
++ case GDLM_REMCMD_QUERY:
++ query->rq_mstlkid = cpu_to_le32(query->rq_mstlkid);
++ query->rq_query = cpu_to_le32(query->rq_query);
++ query->rq_maxlocks = cpu_to_le32(query->rq_maxlocks);
++ break;
++
++ case GDLM_REMCMD_QUERYREPLY:
++ queryrep->rq_numlocks = cpu_to_le32(queryrep->rq_numlocks);
++ queryrep->rq_status = cpu_to_le32(queryrep->rq_status);
++ queryrep->rq_grantcount = cpu_to_le32(queryrep->rq_grantcount);
++ queryrep->rq_waitcount = cpu_to_le32(queryrep->rq_waitcount);
++ queryrep->rq_convcount = cpu_to_le32(queryrep->rq_convcount);
++ break;
++
++ default:
++ printk("dlm: warning, unknown REMCMD type %u\n",
++ req->rr_header.rh_cmd);
++ }
++}
++
++static void network_to_host(void *msg)
++{
++ struct gd_req_header *head = msg;
++ struct gd_remlockrequest *req = msg;
++ struct gd_remlockreply *reply = msg;
++ struct gd_remquery *query = msg;
++ struct gd_remqueryreply *queryrep = msg;
++ gd_rcom_t *rc = msg;
++
++ /* Force into host byte order */
++
++ /*
++ * Do the common header first
++ */
++
++ head->rh_length = le16_to_cpu(head->rh_length);
++ head->rh_lockspace = le32_to_cpu(head->rh_lockspace);
++ /* Leave the lkid alone as it is transparent at the remote end */
++
++ /*
++ * Do the fields in the remlockrequest or remlockreply structs
++ */
++
++ switch (req->rr_header.rh_cmd) {
++
++ case GDLM_REMCMD_LOCKREQUEST:
++ case GDLM_REMCMD_CONVREQUEST:
++ req->rr_range_start = le64_to_cpu(req->rr_range_start);
++ req->rr_range_end = le64_to_cpu(req->rr_range_end);
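++		/* Deliberate fall through */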
++ case GDLM_REMCMD_LOOKUP:
++ case GDLM_REMCMD_UNLOCKREQUEST:
++ case GDLM_REMCMD_LOCKGRANT:
++ case GDLM_REMCMD_SENDBAST:
++ case GDLM_REMCMD_SENDCAST:
++ case GDLM_REMCMD_REM_RESDATA:
++ /* Actually, not much to do here as the remote lock IDs are
++ * transparent too */
++ req->rr_flags = le32_to_cpu(req->rr_flags);
++ req->rr_status = le32_to_cpu(req->rr_status);
++ break;
++
++ case GDLM_REMCMD_LOCKREPLY:
++ reply->rl_lockstate = le32_to_cpu(reply->rl_lockstate);
++ reply->rl_nodeid = le32_to_cpu(reply->rl_nodeid);
++ reply->rl_status = le32_to_cpu(reply->rl_status);
++ break;
++
++ case GDLM_REMCMD_RECOVERMESSAGE:
++ case GDLM_REMCMD_RECOVERREPLY:
++ rc->rc_msgid = le32_to_cpu(rc->rc_msgid);
++ rc->rc_datalen = le16_to_cpu(rc->rc_datalen);
++ break;
++
++
++ case GDLM_REMCMD_QUERY:
++ query->rq_mstlkid = le32_to_cpu(query->rq_mstlkid);
++ query->rq_query = le32_to_cpu(query->rq_query);
++ query->rq_maxlocks = le32_to_cpu(query->rq_maxlocks);
++ break;
++
++ case GDLM_REMCMD_QUERYREPLY:
++ queryrep->rq_numlocks = le32_to_cpu(queryrep->rq_numlocks);
++ queryrep->rq_status = le32_to_cpu(queryrep->rq_status);
++ queryrep->rq_grantcount = le32_to_cpu(queryrep->rq_grantcount);
++ queryrep->rq_waitcount = le32_to_cpu(queryrep->rq_waitcount);
++ queryrep->rq_convcount = le32_to_cpu(queryrep->rq_convcount);
++ break;
++
++ default:
++ printk("dlm: warning, unknown REMCMD type %u\n",
++ req->rr_header.rh_cmd);
++ }
++}
++
++static void copy_from_cb(void *dst, const void *base, unsigned offset,
++ unsigned len, unsigned limit)
++{
++ unsigned copy = len;
++
++ if ((copy + offset) > limit)
++ copy = limit - offset;
++ memcpy(dst, base + offset, copy);
++ len -= copy;
++ if (len)
++ memcpy(dst + copy, base, len);
++}
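++
++/*
++ * Worked example (illustrative): with base = "ABCDEFGH", limit = 8,
++ * offset = 6 and len = 4, copy_from_cb() copies "GH" from the end
++ * of the ring and then wraps to copy "AB", yielding "GHAB" in dst.
++ */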
++
++static void khexdump(const unsigned char *c, int len)
++{
++ while (len > 16) {
++ printk(KERN_INFO
++ "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n",
++ c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8],
++ c[9], c[10], c[11], c[12], c[13], c[14], c[15]);
++ len -= 16;
++ }
++ while (len > 4) {
++ printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2],
++ c[3]);
++ len -= 4;
++ }
++ while (len > 0) {
++ printk(KERN_INFO "%02x\n", c[0]);
++ len--;
++ }
++}
++
++/*
++ * Called from the low-level comms layer to process a buffer of
++ * commands.
++ *
++ * Only complete messages are processed here, any "spare" bytes from
++ * the end of a buffer are saved and tacked onto the front of the next
++ * message that comes in. I doubt this will happen very often but we
++ * need to be able to cope with it and I don't want the task to be waiting
++ * for packets to come in when there is useful work to be done.
++ *
++ */
++int midcomms_process_incoming_buffer(int nodeid, const void *base,
++ unsigned offset, unsigned len,
++ unsigned limit)
++{
++ unsigned char __tmp[sizeof(struct gd_req_header) + 64];
++ struct gd_req_header *msg = (struct gd_req_header *) __tmp;
++ int ret = 0;
++ int err = 0;
++ unsigned msglen;
++ __u32 id, space;
++
++ while (len > sizeof(struct gd_req_header)) {
++ /* Get message header and check it over */
++ copy_from_cb(msg, base, offset, sizeof(struct gd_req_header),
++ limit);
++ msglen = le16_to_cpu(msg->rh_length);
++ id = msg->rh_lkid;
++ space = msg->rh_lockspace;
++
++ /* Check message size */
++ err = -EINVAL;
++ if (msglen < sizeof(struct gd_req_header))
++ break;
++ err = -E2BIG;
++ if (msglen > dlm_config.buffer_size) {
++ printk("dlm: message size too big %d\n", msglen);
++ break;
++ }
++ err = 0;
++
++ /* Not enough in buffer yet? wait for some more */
++ if (msglen > len)
++ break;
++
++ /* Make sure our temp buffer is large enough */
++ if (msglen > sizeof(__tmp) &&
++ msg == (struct gd_req_header *) __tmp) {
++ msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
++ if (msg == NULL)
++ return ret;
++ }
++
++ copy_from_cb(msg, base, offset, msglen, limit);
++ BUG_ON(id != msg->rh_lkid);
++ BUG_ON(space != msg->rh_lockspace);
++ ret += msglen;
++ offset += msglen;
++ offset &= (limit - 1);
++ len -= msglen;
++ network_to_host(msg);
++
++ if ((msg->rh_cmd > 32) ||
++ (msg->rh_cmd == 0) ||
++ (msg->rh_length < sizeof(struct gd_req_header)) ||
++ (msg->rh_length > dlm_config.buffer_size)) {
++
++ printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, "
++ "lkid=%u, lockspace=%u\n",
++ msg->rh_cmd, msg->rh_flags, msg->rh_length,
++ msg->rh_lkid, msg->rh_lockspace);
++
++ printk("dlm: midcomms: base=%p, offset=%u, len=%u, "
++ "ret=%u, limit=%08x newbuf=%d\n",
++ base, offset, len, ret, limit,
++ ((struct gd_req_header *) __tmp == msg));
++
++ khexdump((const unsigned char *) msg, msg->rh_length);
++
++ return -EBADMSG;
++ }
++
++ switch (msg->rh_cmd) {
++ case GDLM_REMCMD_RECOVERMESSAGE:
++ case GDLM_REMCMD_RECOVERREPLY:
++ process_recovery_comm(nodeid, msg);
++ break;
++ default:
++ process_cluster_request(nodeid, msg, FALSE);
++ }
++ }
++
++ if (msg != (struct gd_req_header *) __tmp)
++ kfree(msg);
++
++ return err ? err : ret;
++}
++
++/*
++ * Send a lowcomms buffer
++ */
++
++void midcomms_send_buffer(struct gd_req_header *msg, struct writequeue_entry *e)
++{
++ host_to_network(msg);
++ lowcomms_commit_buffer(e);
++}
++
++/*
++ * Make the message into network byte order and send it
++ */
++
++int midcomms_send_message(uint32_t nodeid, struct gd_req_header *msg,
++ int allocation)
++{
++ int len = msg->rh_length;
++
++ host_to_network(msg);
++
++ /*
++ * Loopback. In fact, the locking code pretty much prevents this from
++ * being needed but it can happen when the directory node is also the
++ * local node.
++ */
++
++ if (nodeid == our_nodeid())
++ return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0,
++ len, len);
++
++ return lowcomms_send_message(nodeid, (char *) msg, len, allocation);
++}
+diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h
+--- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/midcomms.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,24 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __MIDCOMMS_DOT_H__
++#define __MIDCOMMS_DOT_H__
++
++int midcomms_send_message(uint32_t csid, struct gd_req_header *msg,
++ int allocation);
++int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset,
++ unsigned len, unsigned limit);
++void midcomms_send_buffer(struct gd_req_header *msg,
++ struct writequeue_entry *e);
++
++#endif /* __MIDCOMMS_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c
+--- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/nodes.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,325 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <net/sock.h>
++#include <cluster/cnxman.h>
++
++#include "dlm_internal.h"
++#include "lowcomms.h"
++#include "nodes.h"
++#include "recover.h"
++#include "reccomms.h"
++#include "util.h"
++
++static struct list_head cluster_nodes;
++static spinlock_t node_lock;
++static uint32_t local_nodeid;
++static struct semaphore local_init_lock;
++
++
++void dlm_nodes_init(void)
++{
++ INIT_LIST_HEAD(&cluster_nodes);
++ spin_lock_init(&node_lock);
++ local_nodeid = 0;
++ init_MUTEX(&local_init_lock);
++}
++
++static gd_node_t *search_node(uint32_t nodeid)
++{
++ gd_node_t *node;
++
++ list_for_each_entry(node, &cluster_nodes, gn_list) {
++ if (node->gn_nodeid == nodeid)
++ goto out;
++ }
++ node = NULL;
++ out:
++ return node;
++}
++
++static void put_node(gd_node_t *node)
++{
++ spin_lock(&node_lock);
++ node->gn_refcount--;
++ if (node->gn_refcount == 0) {
++ list_del(&node->gn_list);
++ spin_unlock(&node_lock);
++ kfree(node);
++ return;
++ }
++ spin_unlock(&node_lock);
++}
++
++static int get_node(uint32_t nodeid, gd_node_t **ndp)
++{
++ gd_node_t *node, *node2;
++ int error = -ENOMEM;
++
++ spin_lock(&node_lock);
++ node = search_node(nodeid);
++ if (node)
++ node->gn_refcount++;
++ spin_unlock(&node_lock);
++
++ if (node)
++ goto out;
++
++ node = (gd_node_t *) kmalloc(sizeof(gd_node_t), GFP_KERNEL);
++ if (!node)
++ goto fail;
++
++ memset(node, 0, sizeof(gd_node_t));
++ node->gn_nodeid = nodeid;
++
++ spin_lock(&node_lock);
++ node2 = search_node(nodeid);
++ if (node2) {
++ node2->gn_refcount++;
++ spin_unlock(&node_lock);
++ kfree(node);
++ node = node2;
++ goto out;
++ }
++
++ node->gn_refcount = 1;
++ list_add_tail(&node->gn_list, &cluster_nodes);
++ spin_unlock(&node_lock);
++
++ out:
++ *ndp = node;
++ return 0;
++
++ fail:
++ return error;
++}
++
++int init_new_csb(uint32_t nodeid, gd_csb_t **ret_csb)
++{
++ gd_csb_t *csb;
++ gd_node_t *node;
++ int error = -ENOMEM;
++
++ csb = (gd_csb_t *) kmalloc(sizeof(gd_csb_t), GFP_KERNEL);
++ if (!csb)
++ goto fail;
++
++ memset(csb, 0, sizeof(gd_csb_t));
++
++ error = get_node(nodeid, &node);
++ if (error)
++ goto fail_free;
++
++ csb->csb_node = node;
++
++ down(&local_init_lock);
++
++	if (!local_nodeid && nodeid == our_nodeid())
++		local_nodeid = node->gn_nodeid;
++ up(&local_init_lock);
++
++ *ret_csb = csb;
++ return 0;
++
++ fail_free:
++ kfree(csb);
++ fail:
++ return error;
++}
++
++void release_csb(gd_csb_t *csb)
++{
++ put_node(csb->csb_node);
++ kfree(csb);
++}
++
++uint32_t our_nodeid(void)
++{
++ return lowcomms_our_nodeid();
++}
++
++int nodes_reconfig_wait(gd_ls_t *ls)
++{
++ int error;
++
++ if (ls->ls_low_nodeid == our_nodeid()) {
++ error = gdlm_wait_status_all(ls, NODES_VALID);
++ if (!error)
++ set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
++
++ /* Experimental: this delay should allow any final messages
++ * from the previous node to be received before beginning
++ * recovery. */
++
++ if (ls->ls_num_nodes == 1) {
++			set_current_state(TASK_UNINTERRUPTIBLE);
++			schedule_timeout(2 * HZ);
++ }
++
++ } else
++ error = gdlm_wait_status_low(ls, NODES_ALL_VALID);
++
++ return error;
++}
++
++static void add_ordered_node(gd_ls_t *ls, gd_csb_t *new)
++{
++ gd_csb_t *csb = NULL;
++ struct list_head *tmp;
++ struct list_head *newlist = &new->csb_list;
++ struct list_head *head = &ls->ls_nodes;
++
++ list_for_each(tmp, head) {
++ csb = list_entry(tmp, gd_csb_t, csb_list);
++
++ if (new->csb_node->gn_nodeid < csb->csb_node->gn_nodeid)
++ break;
++ }
++
++ if (!csb)
++ list_add_tail(newlist, head);
++	else {
++		/* Insert before the first node with a higher nodeid */
++		list_add_tail(newlist, tmp);
++	}
++}
++
++int ls_nodes_reconfig(gd_ls_t *ls, gd_recover_t *gr, int *neg_out)
++{
++ gd_csb_t *csb, *safe;
++ int error, i, found, pos = 0, neg = 0;
++ uint32_t low = (uint32_t) (-1);
++
++ /*
++ * Remove (and save) departed nodes from lockspace's nodes list
++ */
++
++ list_for_each_entry_safe(csb, safe, &ls->ls_nodes, csb_list) {
++ found = FALSE;
++ for (i = 0; i < gr->gr_node_count; i++) {
++ if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
++ found = TRUE;
++ break;
++ }
++ }
++
++ if (!found) {
++ neg++;
++ csb->csb_gone_event = gr->gr_event_id;
++ list_del(&csb->csb_list);
++ list_add_tail(&csb->csb_list, &ls->ls_nodes_gone);
++ ls->ls_num_nodes--;
++ log_all(ls, "remove node %u", csb->csb_node->gn_nodeid);
++ }
++ }
++
++ /*
++ * Add new nodes to lockspace's nodes list
++ */
++
++ for (i = 0; i < gr->gr_node_count; i++) {
++ found = FALSE;
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ if (csb->csb_node->gn_nodeid == gr->gr_nodeids[i]) {
++ found = TRUE;
++ break;
++ }
++ }
++
++ if (!found) {
++ pos++;
++
++ error = init_new_csb(gr->gr_nodeids[i], &csb);
++ GDLM_ASSERT(!error,);
++
++ add_ordered_node(ls, csb);
++ ls->ls_num_nodes++;
++ log_all(ls, "add node %u", csb->csb_node->gn_nodeid);
++ }
++ }
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ if (csb->csb_node->gn_nodeid < low)
++ low = csb->csb_node->gn_nodeid;
++ }
++
++ rcom_log_clear(ls);
++ ls->ls_low_nodeid = low;
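++	/* e.g. with 5 nodes gdlm_next_power2() returns 8, so the mask is
++	 * 0x7; presumably used when mapping resource-name hashes to nodes */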
++ ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
++ set_bit(LSFL_NODES_VALID, &ls->ls_flags);
++ *neg_out = neg;
++
++ error = nodes_reconfig_wait(ls);
++
++ log_all(ls, "total nodes %d", ls->ls_num_nodes);
++
++ return error;
++}
++
++int ls_nodes_init(gd_ls_t *ls, gd_recover_t *gr)
++{
++ gd_csb_t *csb;
++ int i, error;
++ uint32_t low = (uint32_t) (-1);
++
++ log_all(ls, "add nodes");
++
++ for (i = 0; i < gr->gr_node_count; i++) {
++ error = init_new_csb(gr->gr_nodeids[i], &csb);
++ if (error)
++ goto fail;
++
++ add_ordered_node(ls, csb);
++ ls->ls_num_nodes++;
++
++ if (csb->csb_node->gn_nodeid < low)
++ low = csb->csb_node->gn_nodeid;
++ }
++
++ ls->ls_low_nodeid = low;
++ ls->ls_nodes_mask = gdlm_next_power2(ls->ls_num_nodes) - 1;
++ set_bit(LSFL_NODES_VALID, &ls->ls_flags);
++
++ error = nodes_reconfig_wait(ls);
++
++ log_all(ls, "total nodes %d", ls->ls_num_nodes);
++
++ return error;
++
++ fail:
++ while (!list_empty(&ls->ls_nodes)) {
++ csb = list_entry(ls->ls_nodes.next, gd_csb_t, csb_list);
++ list_del(&csb->csb_list);
++ release_csb(csb);
++ }
++ ls->ls_num_nodes = 0;
++
++ return error;
++}
++
++int in_nodes_gone(gd_ls_t *ls, uint32_t nodeid)
++{
++ gd_csb_t *csb;
++
++ list_for_each_entry(csb, &ls->ls_nodes_gone, csb_list) {
++ if (csb->csb_node->gn_nodeid == nodeid)
++ return TRUE;
++ }
++ return FALSE;
++}
+diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h
+--- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/nodes.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,25 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __NODES_DOT_H__
++#define __NODES_DOT_H__
++
++void dlm_nodes_init(void);
++int init_new_csb(uint32_t nodeid, gd_csb_t ** ret_csb);
++void release_csb(gd_csb_t * csb);
++uint32_t our_nodeid(void);
++int ls_nodes_reconfig(gd_ls_t * ls, gd_recover_t * gr, int *neg);
++int ls_nodes_init(gd_ls_t * ls, gd_recover_t * gr);
++int in_nodes_gone(gd_ls_t * ls, uint32_t nodeid);
++
++#endif /* __NODES_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c
+--- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/proc.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,469 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/init.h>
++#include <linux/proc_fs.h>
++#include <linux/ctype.h>
++#include <linux/seq_file.h>
++#include <linux/module.h>
++
++#include "dlm_internal.h"
++#include "lockspace.h"
++
++#if defined(DLM_DEBUG)
++#define DLM_DEBUG_SIZE (1024)
++#define MAX_DEBUG_MSG_LEN (64)
++#else
++#define DLM_DEBUG_SIZE (0)
++#define MAX_DEBUG_MSG_LEN (0)
++#endif
++
++static char * debug_buf;
++static unsigned int debug_size;
++static unsigned int debug_point;
++static int debug_wrap;
++static spinlock_t debug_lock;
++static struct proc_dir_entry * debug_proc_entry = NULL;
++static struct proc_dir_entry * rcom_proc_entry = NULL;
++static char proc_ls_name[255] = "";
++
++#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
++static struct proc_dir_entry * locks_proc_entry = NULL;
++static struct seq_operations locks_info_op;
++
++
++static int locks_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &locks_info_op);
++}
++
++/* Write simply sets the lockspace to use */
++static ssize_t locks_write(struct file *file, const char *buf,
++ size_t count, loff_t * ppos)
++{
++	if (count == 0 || count >= sizeof(proc_ls_name))
++		return 0;
++
++	if (copy_from_user(proc_ls_name, buf, count))
++		return -EFAULT;
++	proc_ls_name[count] = '\0';
++
++	/* Remove any trailing LF so that lazy users
++	   can just echo "lsname" > /proc/cluster/dlm_locks */
++	if (proc_ls_name[count - 1] == '\n')
++		proc_ls_name[count - 1] = '\0';
++
++	return count;
++}
++
++static struct file_operations locks_fops = {
++ open:locks_open,
++ write:locks_write,
++ read:seq_read,
++ llseek:seq_lseek,
++ release:seq_release,
++};
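++
++/*
++ * Usage sketch (with CONFIG_CLUSTER_DLM_PROCLOCKS; "myls" is a
++ * hypothetical lockspace name):
++ *
++ *	echo "myls" > /proc/cluster/dlm_locks	(select the lockspace)
++ *	cat /proc/cluster/dlm_locks		(dump its resources and locks)
++ */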
++
++struct ls_dumpinfo {
++ int entry;
++ struct list_head *next;
++ gd_ls_t *ls;
++ gd_res_t *rsb;
++};
++
++static int print_resource(gd_res_t * res, struct seq_file *s);
++
++static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di)
++{
++ read_lock(&di->ls->ls_reshash_lock);
++ if (!di->next) {
++ /* Find the next non-empty hash bucket */
++		while (di->entry < di->ls->ls_hashsize &&
++		       list_empty(&di->ls->ls_reshashtbl[di->entry])) {
++ di->entry++;
++ }
++ if (di->entry >= di->ls->ls_hashsize) {
++ read_unlock(&di->ls->ls_reshash_lock);
++ return NULL; /* End of hash list */
++ }
++
++ di->next = di->ls->ls_reshashtbl[di->entry].next;
++ } else { /* Find the next entry in the list */
++
++ di->next = di->next->next;
++ if (di->next->next == di->ls->ls_reshashtbl[di->entry].next) {
++ /* End of list - move to next bucket */
++ di->next = NULL;
++ di->entry++;
++ read_unlock(&di->ls->ls_reshash_lock);
++
++ return next_rsb(di); /* do the top half of this conditional */
++ }
++ }
++ di->rsb = list_entry(di->next, gd_res_t, res_hashchain);
++ read_unlock(&di->ls->ls_reshash_lock);
++
++ return di;
++}
++
++static void *s_start(struct seq_file *m, loff_t * pos)
++{
++ struct ls_dumpinfo *di;
++ gd_ls_t *ls;
++ int i;
++
++ ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
++ if (!ls)
++ return NULL;
++
++ di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
++ if (!di)
++ return NULL;
++
++ if (*pos == 0)
++ seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
++
++ di->entry = 0;
++ di->next = NULL;
++ di->ls = ls;
++
++	for (i = 0; i < *pos; i++)
++		if (next_rsb(di) == NULL) {
++			/* Ran off the end: don't leak the iterator */
++			kfree(di);
++			return NULL;
++		}
++
++ return next_rsb(di);
++}
++
++static void *s_next(struct seq_file *m, void *p, loff_t * pos)
++{
++ struct ls_dumpinfo *di = p;
++
++ *pos += 1;
++
++ return next_rsb(di);
++}
++
++static int s_show(struct seq_file *m, void *p)
++{
++ struct ls_dumpinfo *di = p;
++ return print_resource(di->rsb, m);
++}
++
++static void s_stop(struct seq_file *m, void *p)
++{
++ kfree(p);
++}
++
++static struct seq_operations locks_info_op = {
++ start:s_start,
++ next:s_next,
++ stop:s_stop,
++ show:s_show
++};
++
++static char *print_lockmode(int mode)
++{
++ switch (mode) {
++ case DLM_LOCK_IV:
++ return "--";
++ case DLM_LOCK_NL:
++ return "NL";
++ case DLM_LOCK_CR:
++ return "CR";
++ case DLM_LOCK_CW:
++ return "CW";
++ case DLM_LOCK_PR:
++ return "PR";
++ case DLM_LOCK_PW:
++ return "PW";
++ case DLM_LOCK_EX:
++ return "EX";
++ default:
++ return "??";
++ }
++}
++
++static void print_lock(struct seq_file *s, gd_lkb_t * lkb, gd_res_t * res)
++{
++
++ seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
++
++ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
++ || lkb->lkb_status == GDLM_LKSTS_WAITING)
++ seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
++
++ if (lkb->lkb_range) {
++ /* This warns on Alpha. Tough. Only I see it */
++ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
++ || lkb->lkb_status == GDLM_LKSTS_GRANTED)
++ seq_printf(s, " %" PRIx64 "-%" PRIx64,
++ lkb->lkb_range[GR_RANGE_START],
++ lkb->lkb_range[GR_RANGE_END]);
++ if (lkb->lkb_status == GDLM_LKSTS_CONVERT
++ || lkb->lkb_status == GDLM_LKSTS_WAITING)
++ seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
++ lkb->lkb_range[RQ_RANGE_START],
++ lkb->lkb_range[RQ_RANGE_END]);
++ }
++
++ if (lkb->lkb_nodeid) {
++ if (lkb->lkb_nodeid != res->res_nodeid)
++ seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
++ lkb->lkb_remid);
++ else
++ seq_printf(s, " Master: %08x", lkb->lkb_remid);
++ }
++
++ if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
++ seq_printf(s, " LQ: %d", lkb->lkb_lockqueue_state);
++
++ seq_printf(s, "\n");
++}
++
++static int print_resource(gd_res_t *res, struct seq_file *s)
++{
++ int i;
++ struct list_head *locklist;
++
++ seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res,
++ res->res_parent, res->res_length);
++ for (i = 0; i < res->res_length; i++) {
++ if (isprint(res->res_name[i]))
++ seq_printf(s, "%c", res->res_name[i]);
++ else
++ seq_printf(s, "%c", '.');
++ }
++ if (res->res_nodeid)
++ seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
++ res->res_nodeid);
++ else
++ seq_printf(s, "\" \nMaster Copy\n");
++
++ /* Print the LVB: */
++ if (res->res_lvbptr) {
++ seq_printf(s, "LVB: ");
++ for (i = 0; i < DLM_LVB_LEN; i++) {
++ if (i == DLM_LVB_LEN / 2)
++ seq_printf(s, "\n ");
++ seq_printf(s, "%02x ",
++ (unsigned char) res->res_lvbptr[i]);
++ }
++ seq_printf(s, "\n");
++ }
++
++ /* Print the locks attached to this resource */
++ seq_printf(s, "Granted Queue\n");
++ list_for_each(locklist, &res->res_grantqueue) {
++ gd_lkb_t *this_lkb =
++ list_entry(locklist, gd_lkb_t, lkb_statequeue);
++ print_lock(s, this_lkb, res);
++ }
++
++ seq_printf(s, "Conversion Queue\n");
++ list_for_each(locklist, &res->res_convertqueue) {
++ gd_lkb_t *this_lkb =
++ list_entry(locklist, gd_lkb_t, lkb_statequeue);
++ print_lock(s, this_lkb, res);
++ }
++
++ seq_printf(s, "Waiting Queue\n");
++ list_for_each(locklist, &res->res_waitqueue) {
++ gd_lkb_t *this_lkb =
++ list_entry(locklist, gd_lkb_t, lkb_statequeue);
++ print_lock(s, this_lkb, res);
++ }
++ return 0;
++}
++#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */
++
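++/*
++ * Append one message to the circular debug buffer. When debug_point
++ * reaches the end of the buffer it wraps to the start and debug_wrap is
++ * set, so readers (dlm_debug_dump, dlm_debug_info) print from debug_point
++ * to the end first, then from the start, recovering chronological order.
++ */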
++void dlm_debug_log(gd_ls_t *ls, const char *fmt, ...)
++{
++ va_list va;
++ int i, n, size, len;
++ char buf[MAX_DEBUG_MSG_LEN+1];
++
++ spin_lock(&debug_lock);
++
++ if (!debug_buf)
++ goto out;
++
++ size = MAX_DEBUG_MSG_LEN;
++ memset(buf, 0, size+1);
++
++ n = snprintf(buf, size, "%s ", ls->ls_name);
++ size -= n;
++
++ va_start(va, fmt);
++ vsnprintf(buf+n, size, fmt, va);
++ va_end(va);
++
++ len = strlen(buf);
++ if (len > MAX_DEBUG_MSG_LEN-1)
++ len = MAX_DEBUG_MSG_LEN-1;
++ buf[len] = '\n';
++ buf[len+1] = '\0';
++
++	for (i = 0; i < len + 1; i++) {	/* len + 1 covers the trailing '\n' */
++ debug_buf[debug_point++] = buf[i];
++
++ if (debug_point == debug_size) {
++ debug_point = 0;
++ debug_wrap = 1;
++ }
++ }
++ out:
++ spin_unlock(&debug_lock);
++}
++
++void dlm_debug_dump(void)
++{
++ int i;
++
++ spin_lock(&debug_lock);
++ if (debug_wrap) {
++ for (i = debug_point; i < debug_size; i++)
++ printk("%c", debug_buf[i]);
++ }
++ for (i = 0; i < debug_point; i++)
++ printk("%c", debug_buf[i]);
++ spin_unlock(&debug_lock);
++}
++
++void dlm_debug_setup(int size)
++{
++ char *b = NULL;
++
++ if (size > PAGE_SIZE)
++ size = PAGE_SIZE;
++ if (size)
++ b = kmalloc(size, GFP_KERNEL);
++
++ spin_lock(&debug_lock);
++	if (debug_buf)
++		kfree(debug_buf);
++	/* Don't leave debug_buf dangling if the new allocation failed */
++	debug_buf = NULL;
++	debug_size = 0;
++	debug_point = 0;
++	debug_wrap = 0;
++	if (!size || !b)
++		goto out;
++ debug_size = size;
++ debug_point = 0;
++ debug_wrap = 0;
++ debug_buf = b;
++ memset(debug_buf, 0, debug_size);
++ out:
++ spin_unlock(&debug_lock);
++}
++
++static void dlm_debug_init(void)
++{
++ debug_buf = NULL;
++ debug_size = 0;
++ debug_point = 0;
++ debug_wrap = 0;
++ spin_lock_init(&debug_lock);
++
++ dlm_debug_setup(DLM_DEBUG_SIZE);
++}
++
++#ifdef CONFIG_PROC_FS
++int dlm_debug_info(char *b, char **start, off_t offset, int length)
++{
++ int i, n = 0;
++
++ spin_lock(&debug_lock);
++
++ if (debug_wrap) {
++ for (i = debug_point; i < debug_size; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++ }
++ for (i = 0; i < debug_point; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++
++ spin_unlock(&debug_lock);
++
++ return n;
++}
++
++int dlm_rcom_info(char *b, char **start, off_t offset, int length)
++{
++ gd_ls_t *ls;
++ gd_csb_t *csb;
++ int n = 0;
++
++ ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
++ if (!ls)
++ return 0;
++
++ n += sprintf(b + n, "nodeid names_send_count names_send_msgid "
++ "names_recv_count names_recv_msgid "
++ "locks_send_count locks_send_msgid "
++ "locks_recv_count locks_recv_msgid\n");
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ n += sprintf(b + n, "%u %u %u %u %u %u %u %u %u\n",
++ csb->csb_node->gn_nodeid,
++ csb->csb_names_send_count,
++ csb->csb_names_send_msgid,
++ csb->csb_names_recv_count,
++ csb->csb_names_recv_msgid,
++ csb->csb_locks_send_count,
++ csb->csb_locks_send_msgid,
++ csb->csb_locks_recv_count,
++ csb->csb_locks_recv_msgid);
++ }
++ return n;
++}
++#endif
++
++void dlm_proc_init(void)
++{
++#ifdef CONFIG_PROC_FS
++ debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO,
++ NULL);
++ if (!debug_proc_entry)
++ return;
++
++ debug_proc_entry->get_info = &dlm_debug_info;
++
++ rcom_proc_entry = create_proc_entry("cluster/dlm_rcom", S_IRUGO, NULL);
++ if (!rcom_proc_entry)
++ return;
++
++ rcom_proc_entry->get_info = &dlm_rcom_info;
++#endif
++ dlm_debug_init();
++
++#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
++ locks_proc_entry = create_proc_read_entry("cluster/dlm_locks",
++ S_IFREG | 0400,
++ NULL, NULL, NULL);
++ if (!locks_proc_entry)
++ return;
++ locks_proc_entry->proc_fops = &locks_fops;
++#endif
++}
++
++void dlm_proc_exit(void)
++{
++#ifdef CONFIG_PROC_FS
++ if (debug_proc_entry) {
++ remove_proc_entry("cluster/dlm_debug", NULL);
++ dlm_debug_setup(0);
++ }
++
++ if (rcom_proc_entry)
++ remove_proc_entry("cluster/dlm_rcom", NULL);
++#endif
++
++#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
++ if (locks_proc_entry)
++ remove_proc_entry("cluster/dlm_locks", NULL);
++#endif
++}
+diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c
+--- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/queries.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,697 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * queries.c
++ *
++ * This file provides the kernel query interface to the DLM.
++ *
++ */
++
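++/*
++ * Usage sketch (hypothetical caller: my_ast, my_arg and MAXLOCKS are
++ * made-up names). The lksb passed in must belong to an existing lock;
++ * its sb_lkid is used to locate the resource being queried.
++ *
++ *	struct dlm_queryinfo qinfo;
++ *	qinfo.gqi_resinfo  = &resinfo;		// resource counts/LVB out
++ *	qinfo.gqi_lockinfo = lockinfo;		// array of MAXLOCKS records
++ *	qinfo.gqi_locksize = MAXLOCKS;
++ *	dlm_query(ls, &lksb, DLM_QUERY_QUEUE_GRANT | DLM_QUERY_LOCKS_ALL,
++ *		  &qinfo, my_ast, my_arg);
++ *
++ * Completion is asynchronous: my_ast() fires once qinfo is filled in,
++ * and qinfo.gqi_lockcount then holds the number of locks returned.
++ */
++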
++#define EXPORT_SYMTAB
++#include <linux/module.h>
++
++#include "dlm_internal.h"
++#include "lockqueue.h"
++#include "locking.h"
++#include "lkb.h"
++#include "nodes.h"
++#include "dir.h"
++#include "ast.h"
++#include "memory.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "rsb.h"
++
++static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo);
++static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo);
++
++/*
++ * API entry point.
++ */
++int dlm_query(void *lockspace,
++ struct dlm_lksb *lksb,
++ int query,
++ struct dlm_queryinfo *qinfo,
++ void (ast_routine(void *)),
++ void *astarg)
++{
++ int status = -EINVAL;
++ gd_lkb_t *target_lkb;
++ gd_lkb_t *query_lkb = NULL; /* Our temporary LKB */
++ gd_ls_t *ls = (gd_ls_t *) find_lockspace_by_local_id(lockspace);
++
++
++ if (!qinfo)
++ goto out;
++ if (!ls)
++ goto out;
++ if (!ast_routine)
++ goto out;
++ if (!lksb)
++ goto out;
++
++ if (!qinfo->gqi_lockinfo)
++ qinfo->gqi_locksize = 0;
++
++ /* Find the lkid */
++ target_lkb = find_lock_by_id(ls, lksb->sb_lkid);
++ if (!target_lkb)
++ goto out;
++
++	/* If the user wants a list of locks that are blocking or not
++	   blocking this lock, then it must be waiting for something */
++ if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING ||
++ (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) &&
++ target_lkb->lkb_status == GDLM_LKSTS_GRANTED)
++		goto out;
++
++ /* We now allocate an LKB for our own use (so we can hang
++ * things like the AST routine and the lksb from it) */
++ lksb->sb_status = -EBUSY;
++ query_lkb = create_lkb(ls);
++ if (!query_lkb) {
++ status = -ENOMEM;
++ goto out;
++ }
++ query_lkb->lkb_astaddr = ast_routine;
++ query_lkb->lkb_astparam = (long)astarg;
++ query_lkb->lkb_resource = target_lkb->lkb_resource;
++ query_lkb->lkb_lksb = lksb;
++
++ /* Don't free the resource while we are querying it. This ref
++ * will be dropped when the LKB is freed */
++ hold_rsb(query_lkb->lkb_resource);
++
++ /* Fill in the stuff that's always local */
++ if (qinfo->gqi_resinfo) {
++ if (target_lkb->lkb_resource->res_nodeid)
++ qinfo->gqi_resinfo->rsi_masternode =
++ target_lkb->lkb_resource->res_nodeid;
++ else
++ qinfo->gqi_resinfo->rsi_masternode = our_nodeid();
++ qinfo->gqi_resinfo->rsi_length =
++ target_lkb->lkb_resource->res_length;
++ memcpy(qinfo->gqi_resinfo->rsi_name,
++ target_lkb->lkb_resource->res_name,
++ qinfo->gqi_resinfo->rsi_length);
++ }
++
++ /* If the master is local (or the user doesn't want the overhead of a
++ * remote call) - fill in the details here */
++ if (target_lkb->lkb_resource->res_nodeid == 0 ||
++ (query & DLM_QUERY_LOCAL)) {
++
++ status = 0;
++ /* Resource info */
++ if (qinfo->gqi_resinfo) {
++ query_resource(target_lkb->lkb_resource,
++ qinfo->gqi_resinfo);
++ }
++
++ /* Lock lists */
++ if (qinfo->gqi_lockinfo) {
++ status = query_locks(query, target_lkb, qinfo);
++ }
++
++ query_lkb->lkb_retstatus = status;
++ query_lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ queue_ast(query_lkb, GDLM_QUEUE_COMPAST, 0);
++ wake_astd();
++
++ /* An AST will be delivered so we must return success here */
++ status = 0;
++ goto out;
++ }
++
++ /* Remote master */
++ if (target_lkb->lkb_resource->res_nodeid != 0)
++ {
++ struct gd_remquery *remquery;
++ struct writequeue_entry *e;
++
++ /* Clear this cos the receiving end adds to it with
++ each incoming packet */
++ qinfo->gqi_lockcount = 0;
++
++ /* Squirrel a pointer to the query info struct
++ somewhere illegal */
++ query_lkb->lkb_request = (struct gd_remlockrequest *) qinfo;
++
++ e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid,
++ sizeof(struct gd_remquery),
++ ls->ls_allocation,
++ (char **) &remquery);
++ if (!e) {
++ status = -ENOBUFS;
++ goto out;
++ }
++
++ /* Build remote packet */
++ memset(remquery, 0, sizeof(struct gd_remquery));
++
++		remquery->rq_query = query;
++		remquery->rq_mstlkid = target_lkb->lkb_remid;
++		/* gqi_locksize was zeroed earlier when gqi_lockinfo is NULL */
++		remquery->rq_maxlocks = qinfo->gqi_locksize;
++
++ remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY;
++ remquery->rq_header.rh_flags = 0;
++ remquery->rq_header.rh_length = sizeof(struct gd_remquery);
++ remquery->rq_header.rh_lkid = query_lkb->lkb_id;
++ remquery->rq_header.rh_lockspace = ls->ls_global_id;
++
++ midcomms_send_buffer(&remquery->rq_header, e);
++ status = 0;
++ }
++
++ out:
++
++ return status;
++}
++
++static inline int valid_range(struct dlm_range *r)
++{
++ if (r->ra_start != 0ULL ||
++ r->ra_end != 0xFFFFFFFFFFFFFFFFULL)
++ return 1;
++ else
++ return 0;
++}
++
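++/* On-the-wire integers are little-endian; these helpers pack/unpack
++ * them at buf + *offp and advance the offset. */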
++static void put_int(int x, char *buf, int *offp)
++{
++ x = cpu_to_le32(x);
++ memcpy(buf + *offp, &x, sizeof(int));
++ *offp += sizeof(int);
++}
++
++static void put_int64(uint64_t x, char *buf, int *offp)
++{
++ x = cpu_to_le64(x);
++ memcpy(buf + *offp, &x, sizeof(uint64_t));
++ *offp += sizeof(uint64_t);
++}
++
++static int get_int(char *buf, int *offp)
++{
++ int value;
++ memcpy(&value, buf + *offp, sizeof(int));
++ *offp += sizeof(int);
++ return le32_to_cpu(value);
++}
++
++static uint64_t get_int64(char *buf, int *offp)
++{
++ uint64_t value;
++
++ memcpy(&value, buf + *offp, sizeof(uint64_t));
++ *offp += sizeof(uint64_t);
++ return le64_to_cpu(value);
++}
++
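++/* Fixed-size part of one serialised lock: four 32-bit fields (lkid,
++ * mstlkid, parent, node) plus four single bytes (state, grmode, rqmode
++ * and the range-present flag). Ranges, when sent, add four uint64_t. */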
++#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4)
++
++/* Called from recvd to get lock info for a remote node */
++int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
++{
++ struct gd_remquery *query = (struct gd_remquery *) msg;
++ struct gd_remqueryreply *reply;
++ struct dlm_resinfo resinfo;
++ struct dlm_queryinfo qinfo;
++ struct writequeue_entry *e;
++ char *buf;
++ gd_lkb_t *lkb;
++ int status = 0;
++ int bufidx;
++ int finished = 0;
++ int cur_lock = 0;
++ int start_lock = 0;
++
++ lkb = find_lock_by_id(ls, query->rq_mstlkid);
++ if (!lkb) {
++ status = -EINVAL;
++ goto send_error;
++ }
++
++ qinfo.gqi_resinfo = &resinfo;
++ qinfo.gqi_locksize = query->rq_maxlocks;
++
++ /* Get the resource bits */
++ query_resource(lkb->lkb_resource, &resinfo);
++
++ /* Now get the locks if wanted */
++ if (query->rq_maxlocks) {
++ qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks,
++ GFP_KERNEL);
++ if (!qinfo.gqi_lockinfo) {
++ status = -ENOMEM;
++ goto send_error;
++ }
++
++ status = query_locks(query->rq_query, lkb, &qinfo);
++ if (status && status != -E2BIG) {
++ kfree(qinfo.gqi_lockinfo);
++ goto send_error;
++ }
++ }
++ else {
++ qinfo.gqi_lockinfo = NULL;
++ qinfo.gqi_lockcount = 0;
++ }
++
++ /* Send as many blocks as needed for all the locks */
++ do {
++ int i;
++ int msg_len = sizeof(struct gd_remqueryreply);
++ int last_msg_len = msg_len; /* keeps compiler quiet */
++ int last_lock;
++
++ /* First work out how many locks we can fit into a block */
++ for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) {
++
++ last_msg_len = msg_len;
++
++ msg_len += LOCK_LEN;
++ if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) ||
++ valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) {
++
++ msg_len += sizeof(uint64_t) * 4;
++ }
++ }
++
++ /* There must be a neater way of doing this... */
++ if (msg_len > PAGE_SIZE) {
++ last_lock = i-1;
++ msg_len = last_msg_len;
++ }
++ else {
++ last_lock = i;
++ }
++
++ e = lowcomms_get_buffer(nodeid,
++ msg_len,
++ ls->ls_allocation,
++ (char **) &reply);
++ if (!e) {
++ kfree(qinfo.gqi_lockinfo);
++ status = -ENOBUFS;
++ goto out;
++ }
++
++ reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
++ reply->rq_header.rh_length = msg_len;
++ reply->rq_header.rh_lkid = msg->rh_lkid;
++ reply->rq_header.rh_lockspace = msg->rh_lockspace;
++
++ reply->rq_status = status;
++ reply->rq_startlock = cur_lock;
++ reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount;
++ reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount;
++ reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount;
++ memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN);
++
++ buf = (char *)reply;
++ bufidx = sizeof(struct gd_remqueryreply);
++
++ for (; cur_lock < last_lock; cur_lock++) {
++
++ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state;
++ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode;
++ buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode;
++ put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx);
++ put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx);
++ put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx);
++ put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx);
++
++ if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) ||
++ valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) {
++
++ buf[bufidx++] = 1;
++ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx);
++ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx);
++ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx);
++ put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx);
++ }
++ else {
++ buf[bufidx++] = 0;
++ }
++ }
++
++ if (cur_lock == qinfo.gqi_lockcount) {
++ reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
++ finished = 1;
++ }
++ else {
++ reply->rq_header.rh_flags = 0;
++ }
++
++ reply->rq_numlocks = cur_lock - start_lock;
++ start_lock = cur_lock;
++
++ midcomms_send_buffer(&reply->rq_header, e);
++ } while (!finished);
++
++ kfree(qinfo.gqi_lockinfo);
++ out:
++ return status;
++
++ send_error:
++ e = lowcomms_get_buffer(nodeid,
++ sizeof(struct gd_remqueryreply),
++ ls->ls_allocation,
++ (char **) &reply);
++ if (!e) {
++ status = -ENOBUFS;
++ goto out;
++ }
++ reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
++ reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; /* Don't support multiple blocks yet */
++ reply->rq_header.rh_length = sizeof(struct gd_remqueryreply);
++ reply->rq_header.rh_lkid = msg->rh_lkid;
++ reply->rq_header.rh_lockspace = msg->rh_lockspace;
++ reply->rq_status = status;
++ reply->rq_numlocks = 0;
++ reply->rq_startlock = 0;
++ reply->rq_grantcount = 0;
++ reply->rq_convcount = 0;
++ reply->rq_waitcount = 0;
++
++ midcomms_send_buffer(&reply->rq_header, e);
++
++ return status;
++}
++
++/* Reply to a remote query */
++int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg)
++{
++ gd_lkb_t *query_lkb;
++ struct dlm_queryinfo *qinfo;
++ struct gd_remqueryreply *reply;
++ char *buf;
++ int i;
++ int bufidx;
++
++ query_lkb = find_lock_by_id(ls, msg->rh_lkid);
++ if (!query_lkb)
++ return -EINVAL;
++
++ qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request;
++ reply = (struct gd_remqueryreply *) msg;
++
++ /* Copy the easy bits first */
++ qinfo->gqi_lockcount += reply->rq_numlocks;
++ if (qinfo->gqi_resinfo) {
++ qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount;
++ qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount;
++ qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount;
++ memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk,
++ DLM_LVB_LEN);
++ }
++
++ /* Now unpack the locks */
++ bufidx = sizeof(struct gd_remqueryreply);
++ buf = (char *) msg;
++
++ GDLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize,
++		    printk("start = %d, num = %d, max = %d\n",
++ reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize););
++
++ for (i = reply->rq_startlock;
++ i < reply->rq_startlock + reply->rq_numlocks; i++) {
++ qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++];
++ qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++];
++ qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++];
++ qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx);
++ if (buf[bufidx++]) {
++ qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx);
++ qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx);
++ }
++ else {
++ qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL;
++ qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
++ qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL;
++ qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
++ }
++ }
++
++ /* If this was the last block then now tell the user */
++ if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) {
++ query_lkb->lkb_retstatus = reply->rq_status;
++ query_lkb->lkb_flags |= GDLM_LKFLG_DELAST;
++ queue_ast(query_lkb, GDLM_QUEUE_COMPAST, 0);
++ wake_astd();
++ }
++
++ return 0;
++}
++
++/* Aggregate resource information */
++static int query_resource(gd_res_t *rsb, struct dlm_resinfo *resinfo)
++{
++ struct list_head *tmp;
++
++
++ if (rsb->res_lvbptr)
++ memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN);
++
++ resinfo->rsi_grantcount = 0;
++ list_for_each(tmp, &rsb->res_grantqueue) {
++ resinfo->rsi_grantcount++;
++ }
++
++ resinfo->rsi_waitcount = 0;
++ list_for_each(tmp, &rsb->res_waitqueue) {
++ resinfo->rsi_waitcount++;
++ }
++
++ resinfo->rsi_convcount = 0;
++ list_for_each(tmp, &rsb->res_convertqueue) {
++ resinfo->rsi_convcount++;
++ }
++
++ return 0;
++}
++
++static int add_lock(gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
++{
++ int entry;
++
++ /* Don't fill it in if the buffer is full */
++ if (qinfo->gqi_lockcount == qinfo->gqi_locksize)
++ return -E2BIG;
++
++ /* gqi_lockcount contains the number of locks we have returned */
++ entry = qinfo->gqi_lockcount++;
++
++ /* Fun with master copies */
++ if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
++ qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid;
++ qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id;
++ }
++ else {
++ qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id;
++ qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid;
++ }
++
++ /* Also make sure we always have a valid nodeid in there, the
++ calling end may not know which node "0" is */
++ if (lkb->lkb_nodeid)
++ qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid;
++ else
++ qinfo->gqi_lockinfo[entry].lki_node = our_nodeid();
++
++ if (lkb->lkb_parent)
++ qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id;
++ else
++ qinfo->gqi_lockinfo[entry].lki_parent = 0;
++
++ qinfo->gqi_lockinfo[entry].lki_state = lkb->lkb_status;
++ qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode;
++ qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode;
++
++ if (lkb->lkb_range) {
++ qinfo->gqi_lockinfo[entry].lki_grrange.ra_start =
++ lkb->lkb_range[GR_RANGE_START];
++ qinfo->gqi_lockinfo[entry].lki_grrange.ra_end =
++ lkb->lkb_range[GR_RANGE_END];
++ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start =
++ lkb->lkb_range[RQ_RANGE_START];
++ qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end =
++ lkb->lkb_range[RQ_RANGE_END];
++ } else {
++		qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL;
++		qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xffffffffffffffffULL;
++		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL;
++		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xffffffffffffffffULL;
++ }
++ return 0;
++}
++
++static int query_lkb_queue(struct list_head *queue, int query,
++ struct dlm_queryinfo *qinfo)
++{
++ struct list_head *tmp;
++ int status = 0;
++ int mode = query & DLM_QUERY_MODE_MASK;
++
++ list_for_each(tmp, queue) {
++ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
++ int lkmode;
++
++ if (query & DLM_QUERY_RQMODE)
++ lkmode = lkb->lkb_rqmode;
++ else
++ lkmode = lkb->lkb_grmode;
++
++ /* Add the LKB info to the list if it matches the criteria in
++ * the query bitmap */
++ switch (query & DLM_QUERY_MASK) {
++ case DLM_QUERY_LOCKS_ALL:
++ status = add_lock(lkb, qinfo);
++ break;
++
++ case DLM_QUERY_LOCKS_HIGHER:
++ if (lkmode > mode)
++ status = add_lock(lkb, qinfo);
++ break;
++
++ case DLM_QUERY_LOCKS_EQUAL:
++ if (lkmode == mode)
++ status = add_lock(lkb, qinfo);
++ break;
++
++ case DLM_QUERY_LOCKS_LOWER:
++ if (lkmode < mode)
++ status = add_lock(lkb, qinfo);
++ break;
++ }
++ }
++ return status;
++}
++
++/*
++ * Return 1 if the locks' ranges overlap
++ * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
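++ * Example: granted [0,100] vs requested [50,150] overlap; granted [0,50]
++ * vs requested [50,150] do not, since touching endpoints count as disjoint.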
++ */
++static inline int ranges_overlap(gd_lkb_t *lkb1, gd_lkb_t *lkb2)
++{
++ if (!lkb1->lkb_range || !lkb2->lkb_range)
++ return 1;
++
++ if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] ||
++ lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END])
++ return 0;
++
++ return 1;
++}
++
++extern const int __dlm_compat_matrix[8][8];
++
++static int get_blocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
++{
++ struct list_head *tmp;
++ int status = 0;
++
++ list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
++ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
++
++ if (ranges_overlap(lkb, qlkb) &&
++ !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])
++ status = add_lock(lkb, qinfo);
++ }
++
++ return status;
++}
++
++static int get_nonblocking_locks(gd_lkb_t *qlkb, struct dlm_queryinfo *qinfo)
++{
++ struct list_head *tmp;
++ int status = 0;
++
++ list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
++ gd_lkb_t *lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
++
++ if (!(ranges_overlap(lkb, qlkb) &&
++ !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]))
++ status = add_lock(lkb, qinfo);
++ }
++
++ return status;
++}
++
++/* Gather a list of appropriate locks */
++static int query_locks(int query, gd_lkb_t *lkb, struct dlm_queryinfo *qinfo)
++{
++ int status = 0;
++
++
++	/* Mask in the actual granted/requested mode of the lock if LOCK_THIS
++ * was requested as the mode
++ */
++ if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) {
++ query &= ~DLM_QUERY_MODE_MASK;
++ if (query & DLM_QUERY_RQMODE)
++ query |= lkb->lkb_rqmode;
++ else
++ query |= lkb->lkb_grmode;
++ }
++
++ qinfo->gqi_lockcount = 0;
++
++ /* BLOCKING/NOTBLOCK only look at the granted queue */
++ if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING)
++ return get_blocking_locks(lkb, qinfo);
++
++ if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK)
++ return get_nonblocking_locks(lkb, qinfo);
++
++ /* Do the lock queues that were requested */
++ if (query & DLM_QUERY_QUEUE_GRANT) {
++ status = query_lkb_queue(&lkb->lkb_resource->res_grantqueue,
++ query, qinfo);
++ }
++
++ if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) {
++ status = query_lkb_queue(&lkb->lkb_resource->res_convertqueue,
++ query, qinfo);
++ }
++
++ if (!status && (query & DLM_QUERY_QUEUE_WAIT)) {
++ status = query_lkb_queue(&lkb->lkb_resource->res_waitqueue,
++ query, qinfo);
++ }
++
++
++ return status;
++}
++
++EXPORT_SYMBOL(dlm_query);
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h
+--- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/queries.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __QUERIES_DOT_H__
++#define __QUERIES_DOT_H__
++
++extern int remote_query(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
++extern int remote_query_reply(int nodeid, gd_ls_t *ls, struct gd_req_header *msg);
++
++#endif /* __QUERIES_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c
+--- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/rebuild.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,1246 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * Rebuild RSB's on new masters. Functions for transferring locks and
++ * subresources to new RSB masters during recovery.
++ */
++
++#include "dlm_internal.h"
++#include "reccomms.h"
++#include "lkb.h"
++#include "rsb.h"
++#include "nodes.h"
++#include "config.h"
++#include "memory.h"
++#include "recover.h"
++
++
++/* Types of entity serialised in remastering messages */
++#define REMASTER_ROOTRSB 1
++#define REMASTER_RSB 2
++#define REMASTER_LKB 3
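++
++/* A remastering message is a byte stream of [1-byte type][serialised
++ * entity] records: a root rsb, its lkb's, then each subrsb followed by
++ * that subrsb's lkb's (see pack_rsb_tree() below). */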
++
++struct rcom_fill {
++ char * outbuf; /* Beginning of data */
++ int offset; /* Current offset into outbuf */
++ int maxlen; /* Max value of offset */
++ int remasterid;
++ int count;
++ gd_res_t * rsb;
++ gd_res_t * subrsb;
++ gd_lkb_t * lkb;
++ struct list_head * lkbqueue;
++ char more;
++};
++typedef struct rcom_fill rcom_fill_t;
++
++
++struct rebuild_node {
++ struct list_head list;
++ int nodeid;
++ gd_res_t * rootrsb;
++};
++typedef struct rebuild_node rebuild_node_t;
++
++
++/*
++ * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new
++ * master. The rsb will be "done" with recovery when the new master has
++ * replied with all the new remote lockid's for this rsb's lkb's.
++ */
++
++void expect_new_lkids(gd_res_t *rsb)
++{
++ rsb->res_newlkid_expect = 0;
++ recover_list_add(rsb);
++}
++
++/*
++ * This function is called on root rsb or subrsb when another lkb is being sent
++ * to the new master for which we expect to receive a corresponding remote lkid
++ */
++
++void need_new_lkid(gd_res_t *rsb)
++{
++ gd_res_t *root = rsb;
++
++ if (rsb->res_parent)
++ root = rsb->res_root;
++
++ if (!root->res_newlkid_expect)
++ recover_list_add(root);
++ else
++ GDLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),);
++
++ root->res_newlkid_expect++;
++}
++
++/*
++ * This function is called for each lkb for which a new remote lkid is
++ * received. Decrement the expected number of remote lkids expected for the
++ * root rsb.
++ */
++
++void have_new_lkid(gd_lkb_t *lkb)
++{
++ gd_res_t *root = lkb->lkb_resource;
++
++ if (root->res_parent)
++ root = root->res_root;
++
++ down_write(&root->res_lock);
++
++ GDLM_ASSERT(root->res_newlkid_expect,
++ printk("newlkid_expect=%d\n", root->res_newlkid_expect););
++
++ root->res_newlkid_expect--;
++
++ if (!root->res_newlkid_expect) {
++ clear_bit(RESFL_NEW_MASTER, &root->res_flags);
++ recover_list_del(root);
++ }
++ up_write(&root->res_lock);
++}
++
++/*
++ * Return the rebuild struct for a node - will create an entry on the rootrsb
++ * list if necessary.
++ *
++ * Currently no locking is needed here as it all happens in the gdlm_recvd
++ * thread
++ */
++
++static rebuild_node_t *find_rebuild_root(gd_ls_t *ls, int nodeid)
++{
++ rebuild_node_t *node = NULL;
++
++ list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) {
++ if (node->nodeid == nodeid)
++ return node;
++ }
++
++ /* Not found, add one */
++ node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL);
++ if (!node)
++ return NULL;
++
++ node->nodeid = nodeid;
++ node->rootrsb = NULL;
++ list_add(&node->list, &ls->ls_rebuild_rootrsb_list);
++
++ return node;
++}
++
++/*
++ * Tidy up after a rebuild run. Called when all recovery has finished
++ */
++
++void rebuild_freemem(gd_ls_t *ls)
++{
++ rebuild_node_t *node = NULL, *s;
++
++ list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) {
++ list_del(&node->list);
++ kfree(node);
++ }
++}
++
++static void put_int(int x, char *buf, int *offp)
++{
++ x = cpu_to_le32(x);
++ memcpy(buf + *offp, &x, sizeof(int));
++ *offp += sizeof(int);
++}
++
++static void put_int64(uint64_t x, char *buf, int *offp)
++{
++ x = cpu_to_le64(x);
++ memcpy(buf + *offp, &x, sizeof(uint64_t));
++ *offp += sizeof(uint64_t);
++}
++
++static void put_bytes(char *x, int len, char *buf, int *offp)
++{
++ put_int(len, buf, offp);
++ memcpy(buf + *offp, x, len);
++ *offp += len;
++}
++
++static void put_char(char x, char *buf, int *offp)
++{
++ buf[*offp] = x;
++ *offp += 1;
++}
++
++static int get_int(char *buf, int *offp)
++{
++ int value;
++ memcpy(&value, buf + *offp, sizeof(int));
++ *offp += sizeof(int);
++ return le32_to_cpu(value);
++}
++
++static uint64_t get_int64(char *buf, int *offp)
++{
++ uint64_t value;
++
++ memcpy(&value, buf + *offp, sizeof(uint64_t));
++ *offp += sizeof(uint64_t);
++ return le64_to_cpu(value);
++}
++
++static char get_char(char *buf, int *offp)
++{
++ char x = buf[*offp];
++
++ *offp += 1;
++ return x;
++}
++
++static void get_bytes(char *bytes, int *len, char *buf, int *offp)
++{
++ *len = get_int(buf, offp);
++ memcpy(bytes, buf + *offp, *len);
++ *offp += *len;
++}
++
++static int lkb_length(gd_lkb_t *lkb)
++{
++ int len = 0;
++
++ len += sizeof(int); /* lkb_id */
++	len += sizeof(int);	/* lkb_resource->res_remasterid */
++ len += sizeof(int); /* lkb_flags */
++ len += sizeof(int); /* lkb_status */
++ len += sizeof(char); /* lkb_rqmode */
++ len += sizeof(char); /* lkb_grmode */
++ len += sizeof(int); /* lkb_childcnt */
++ len += sizeof(int); /* lkb_parent->lkb_id */
++ len += sizeof(int); /* lkb_bastaddr */
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
++ len += sizeof(int); /* number of lvb bytes */
++ len += DLM_LVB_LEN;
++ }
++
++ if (lkb->lkb_range) {
++ len += sizeof(uint64_t);
++ len += sizeof(uint64_t);
++ if (lkb->lkb_status == GDLM_LKSTS_CONVERT) {
++ len += sizeof(uint64_t);
++ len += sizeof(uint64_t);
++ }
++ }
++
++ return len;
++}
++
++/*
++ * It's up to the caller to be sure there's enough space in the buffer.
++ */
++
++static void serialise_lkb(gd_lkb_t *lkb, char *buf, int *offp)
++{
++ int flags;
++
++ /* Need to tell the remote end if we have a range */
++ flags = lkb->lkb_flags;
++ if (lkb->lkb_range)
++ flags |= GDLM_LKFLG_RANGE;
++
++ /*
++ * See lkb_length()
++ * Total: 30 (no lvb) or 66 (with lvb) bytes
++ */
++
++ put_int(lkb->lkb_id, buf, offp);
++ put_int(lkb->lkb_resource->res_remasterid, buf, offp);
++ put_int(flags, buf, offp);
++ put_int(lkb->lkb_status, buf, offp);
++ put_char(lkb->lkb_rqmode, buf, offp);
++ put_char(lkb->lkb_grmode, buf, offp);
++ put_int(atomic_read(&lkb->lkb_childcnt), buf, offp);
++
++ if (lkb->lkb_parent)
++ put_int(lkb->lkb_parent->lkb_id, buf, offp);
++ else
++ put_int(0, buf, offp);
++
++ if (lkb->lkb_bastaddr)
++ put_int(1, buf, offp);
++ else
++ put_int(0, buf, offp);
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
++ GDLM_ASSERT(lkb->lkb_lvbptr,);
++ put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp);
++ }
++
++ /* Only send the range we actually need */
++ if (lkb->lkb_range) {
++ switch (lkb->lkb_status) {
++ case GDLM_LKSTS_CONVERT:
++ put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
++ put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
++ put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
++ put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
++ break;
++ case GDLM_LKSTS_WAITING:
++ put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
++ put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
++ break;
++ case GDLM_LKSTS_GRANTED:
++ put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
++ put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
++ break;
++ default:
++ GDLM_ASSERT(0,);
++ }
++ }
++}
++
++static int rsb_length(gd_res_t *rsb)
++{
++ int len = 0;
++
++ len += sizeof(int); /* number of res_name bytes */
++ len += rsb->res_length; /* res_name */
++ len += sizeof(int); /* res_remasterid */
++ len += sizeof(int); /* res_parent->res_remasterid */
++
++ return len;
++}
++
++static inline gd_res_t *next_subrsb(gd_res_t *subrsb)
++{
++ struct list_head *tmp;
++ gd_res_t *r;
++
++ tmp = subrsb->res_subreslist.next;
++ r = list_entry(tmp, gd_res_t, res_subreslist);
++
++ return r;
++}
++
++static inline int last_in_list(gd_res_t *r, struct list_head *head)
++{
++ gd_res_t *last = list_entry(head->prev, gd_res_t, res_subreslist);
++
++ if (last == r)
++ return 1;
++ return 0;
++}
++
++/*
++ * Used to decide if an rsb should be rebuilt on a new master. An rsb only
++ * needs to be rebuilt if we have lkb's queued on it. NOREBUILD lkb's on the
++ * wait queue are not rebuilt.
++ */
++
++static int lkbs_to_remaster(gd_res_t *r)
++{
++ gd_lkb_t *lkb;
++ gd_res_t *sub;
++
++ if (!list_empty(&r->res_grantqueue) ||
++ !list_empty(&r->res_convertqueue))
++ return TRUE;
++
++ list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
++ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
++ continue;
++ return TRUE;
++ }
++
++ list_for_each_entry(sub, &r->res_subreslist, res_subreslist) {
++ if (!list_empty(&sub->res_grantqueue) ||
++ !list_empty(&sub->res_convertqueue))
++ return TRUE;
++
++ list_for_each_entry(lkb, &sub->res_waitqueue, lkb_statequeue) {
++ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
++ continue;
++ return TRUE;
++ }
++ }
++
++ return FALSE;
++}
++
++static void serialise_rsb(gd_res_t *rsb, char *buf, int *offp)
++{
++ /*
++ * See rsb_length()
++ * Total: 36 bytes (4 + 24 + 4 + 4)
++ */
++
++ put_bytes(rsb->res_name, rsb->res_length, buf, offp);
++ put_int(rsb->res_remasterid, buf, offp);
++
++ if (rsb->res_parent)
++ put_int(rsb->res_parent->res_remasterid, buf, offp);
++ else
++ put_int(0, buf, offp);
++
++ GDLM_ASSERT(!rsb->res_lvbptr,);
++}
++
++/*
++ * Flatten an LKB into a buffer for sending to the new RSB master. As a
++ * side-effect the nodeid of the lock is set to the nodeid of the new RSB
++ * master.
++ */
++
++static int pack_one_lkb(gd_res_t *r, gd_lkb_t *lkb, rcom_fill_t *fill)
++{
++ if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen)
++ goto nospace;
++
++ lkb->lkb_nodeid = r->res_nodeid;
++
++ put_char(REMASTER_LKB, fill->outbuf, &fill->offset);
++ serialise_lkb(lkb, fill->outbuf, &fill->offset);
++
++ fill->count++;
++ need_new_lkid(r);
++ return 0;
++
++ nospace:
++ return -ENOSPC;
++}
++
++/*
++ * Pack all LKB's from a given queue, except for those with the NOREBUILD flag.
++ */
++
++static int pack_lkb_queue(gd_res_t *r, struct list_head *queue,
++ rcom_fill_t *fill)
++{
++ gd_lkb_t *lkb;
++ int error;
++
++ list_for_each_entry(lkb, queue, lkb_statequeue) {
++ if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
++ continue;
++
++ error = pack_one_lkb(r, lkb, fill);
++ if (error)
++ goto nospace;
++ }
++
++ return 0;
++
++ nospace:
++ fill->lkb = lkb;
++ fill->lkbqueue = queue;
++
++ return error;
++}
++
++static int pack_lkb_queues(gd_res_t *r, rcom_fill_t *fill)
++{
++ int error;
++
++ error = pack_lkb_queue(r, &r->res_grantqueue, fill);
++ if (error)
++ goto nospace;
++
++ error = pack_lkb_queue(r, &r->res_convertqueue, fill);
++ if (error)
++ goto nospace;
++
++ error = pack_lkb_queue(r, &r->res_waitqueue, fill);
++
++ nospace:
++ return error;
++}
++
++/*
++ * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb
++ * queue and full lkb queues.
++ */
++
++static int pack_lkb_remaining(gd_res_t *r, rcom_fill_t *fill)
++{
++ struct list_head *tmp, *start, *end;
++ gd_lkb_t *lkb;
++ int error;
++
++ /*
++ * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue.
++ */
++
++ error = pack_one_lkb(r, fill->lkb, fill);
++ if (error)
++ goto out;
++
++ start = fill->lkb->lkb_statequeue.next;
++ end = fill->lkbqueue;
++
++ for (tmp = start; tmp != end; tmp = tmp->next) {
++ lkb = list_entry(tmp, gd_lkb_t, lkb_statequeue);
++
++ error = pack_one_lkb(r, lkb, fill);
++ if (error) {
++ fill->lkb = lkb;
++ goto out;
++ }
++ }
++
++ /*
++ * Pack all lkb's on r's queues following fill->lkbqueue.
++ */
++
++ if (fill->lkbqueue == &r->res_waitqueue)
++ goto out;
++ if (fill->lkbqueue == &r->res_convertqueue)
++ goto skip;
++
++ GDLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,);
++
++ error = pack_lkb_queue(r, &r->res_convertqueue, fill);
++ if (error)
++ goto out;
++ skip:
++ error = pack_lkb_queue(r, &r->res_waitqueue, fill);
++
++ out:
++ return error;
++}
++
++static int pack_one_subrsb(gd_res_t *rsb, gd_res_t *subrsb, rcom_fill_t *fill)
++{
++ int error;
++
++ down_write(&subrsb->res_lock);
++
++ if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen)
++ goto nospace;
++
++ subrsb->res_nodeid = rsb->res_nodeid;
++ subrsb->res_remasterid = ++fill->remasterid;
++
++ put_char(REMASTER_RSB, fill->outbuf, &fill->offset);
++ serialise_rsb(subrsb, fill->outbuf, &fill->offset);
++
++ error = pack_lkb_queues(subrsb, fill);
++ if (error)
++ goto nospace;
++
++ up_write(&subrsb->res_lock);
++
++ return 0;
++
++ nospace:
++ up_write(&subrsb->res_lock);
++ fill->subrsb = subrsb;
++
++ return -ENOSPC;
++}
++
++static int pack_subrsbs(gd_res_t *rsb, gd_res_t *in_subrsb, rcom_fill_t *fill)
++{
++ gd_res_t *subrsb;
++ int error = 0;
++
++ /*
++ * When an initial subrsb is given, we know it needs to be packed.
++ * When no initial subrsb is given, begin with the first (if any exist).
++ */
++
++ if (!in_subrsb) {
++ if (list_empty(&rsb->res_subreslist))
++ goto out;
++
++ subrsb = list_entry(rsb->res_subreslist.next, gd_res_t,
++ res_subreslist);
++ } else
++ subrsb = in_subrsb;
++
++ for (;;) {
++ error = pack_one_subrsb(rsb, subrsb, fill);
++ if (error)
++ goto out;
++
++ if (last_in_list(subrsb, &rsb->res_subreslist))
++ break;
++
++ subrsb = next_subrsb(subrsb);
++ }
++
++ out:
++ return error;
++}
++
++/*
++ * Finish packing whatever is left in an rsb tree. If space runs out while
++ * finishing, save subrsb/lkb and this will be called again for the same rsb.
++ *
++ * !subrsb && lkb, we left off part way through root rsb's lkbs.
++ * subrsb && !lkb, we left off just before starting a new subrsb.
++ * subrsb && lkb, we left off part way through a subrsb's lkbs.
++ * !subrsb && !lkb, we shouldn't be in this function, but starting
++ * a new rsb in pack_rsb_tree().
++ */
++
++static int pack_rsb_tree_remaining(gd_ls_t *ls, gd_res_t *rsb,
++ rcom_fill_t *fill)
++{
++ gd_res_t *subrsb = NULL;
++ int error = 0;
++
++ if (!fill->subrsb && fill->lkb) {
++ error = pack_lkb_remaining(rsb, fill);
++ if (error)
++ goto out;
++
++ error = pack_subrsbs(rsb, NULL, fill);
++ if (error)
++ goto out;
++ }
++
++ else if (fill->subrsb && !fill->lkb) {
++ error = pack_subrsbs(rsb, fill->subrsb, fill);
++ if (error)
++ goto out;
++ }
++
++ else if (fill->subrsb && fill->lkb) {
++ error = pack_lkb_remaining(fill->subrsb, fill);
++ if (error)
++ goto out;
++
++ if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist))
++ goto out;
++
++ subrsb = next_subrsb(fill->subrsb);
++
++ error = pack_subrsbs(rsb, subrsb, fill);
++ if (error)
++ goto out;
++ }
++
++ fill->subrsb = NULL;
++ fill->lkb = NULL;
++
++ out:
++ return error;
++}
++
++/*
++ * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a
++ * buffer. When the buffer runs out of space, save the place to restart (the
++ * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit).
++ */
++
++static int pack_rsb_tree(gd_ls_t *ls, gd_res_t *rsb, rcom_fill_t *fill)
++{
++ int error = -ENOSPC;
++
++ fill->remasterid = 0;
++
++ /*
++ * Pack the root rsb itself. A 1 byte type precedes the serialised
++ * rsb. Then pack the lkb's for the root rsb.
++ */
++
++ down_write(&rsb->res_lock);
++
++ if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen)
++ goto out;
++
++ rsb->res_remasterid = ++fill->remasterid;
++ put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset);
++ serialise_rsb(rsb, fill->outbuf, &fill->offset);
++
++ error = pack_lkb_queues(rsb, fill);
++ if (error)
++ goto out;
++
++ up_write(&rsb->res_lock);
++
++ /*
++ * Pack subrsb/lkb's under the root rsb.
++ */
++
++ error = pack_subrsbs(rsb, NULL, fill);
++
++ return error;
++
++ out:
++ up_write(&rsb->res_lock);
++ return error;
++}
++
++/*
++ * Given an RSB, return the next RSB that should be sent to a new master.
++ */
++
++static gd_res_t *next_remastered_rsb(gd_ls_t *ls, gd_res_t *rsb)
++{
++ struct list_head *tmp, *start, *end;
++ gd_res_t *r;
++
++ if (!rsb)
++ start = ls->ls_rootres.next;
++ else
++ start = rsb->res_rootlist.next;
++
++ end = &ls->ls_rootres;
++
++ for (tmp = start; tmp != end; tmp = tmp->next) {
++ r = list_entry(tmp, gd_res_t, res_rootlist);
++
++ if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) {
++ if (r->res_nodeid && lkbs_to_remaster(r)) {
++ expect_new_lkids(r);
++ return r;
++ } else
++ clear_bit(RESFL_NEW_MASTER, &r->res_flags);
++ }
++ }
++
++ return NULL;
++}
++
++/*
++ * Given an rcom buffer, fill it with RSB's that need to be sent to a single
++ * new master node. In the case where all the data to send to one node
++ * requires multiple messages, this function needs to resume filling each
++ * successive buffer from the point where it left off when the previous buffer
++ * filled up.
++ */
++
++static void fill_rcom_buffer(gd_ls_t *ls, rcom_fill_t *fill, uint32_t *nodeid)
++{
++ gd_res_t *rsb, *prev_rsb = fill->rsb;
++ int error;
++
++ fill->offset = 0;
++
++ if (!prev_rsb) {
++
++ /*
++ * The first time this function is called.
++ */
++
++ rsb = next_remastered_rsb(ls, NULL);
++ if (!rsb)
++ goto no_more;
++
++ } else if (fill->subrsb || fill->lkb) {
++
++ /*
++ * Continue packing an rsb tree that was partially packed last
++ * time (fill->subrsb/lkb indicates where packing of last block
++ * left off)
++ */
++
++ rsb = prev_rsb;
++ *nodeid = rsb->res_nodeid;
++
++ error = pack_rsb_tree_remaining(ls, rsb, fill);
++ if (error == -ENOSPC)
++ goto more;
++
++ rsb = next_remastered_rsb(ls, prev_rsb);
++ if (!rsb)
++ goto no_more;
++
++ if (rsb->res_nodeid != prev_rsb->res_nodeid)
++ goto more;
++ } else {
++ rsb = prev_rsb;
++ }
++
++ /*
++ * Pack rsb trees into the buffer until we run out of space, run out of
++ * new rsb's or hit a new nodeid.
++ */
++
++ *nodeid = rsb->res_nodeid;
++
++ for (;;) {
++ error = pack_rsb_tree(ls, rsb, fill);
++ if (error == -ENOSPC)
++ goto more;
++
++ prev_rsb = rsb;
++
++ rsb = next_remastered_rsb(ls, prev_rsb);
++ if (!rsb)
++ goto no_more;
++
++ if (rsb->res_nodeid != prev_rsb->res_nodeid)
++ goto more;
++ }
++
++ more:
++ fill->more = 1;
++ fill->rsb = rsb;
++ return;
++
++ no_more:
++ fill->more = 0;
++}
++
++/*
++ * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters.
++ */
++
++int rebuild_rsbs_send(gd_ls_t *ls)
++{
++ gd_rcom_t *rc;
++ rcom_fill_t fill;
++ uint32_t nodeid;
++ int error;
++
++ GDLM_ASSERT(recover_list_empty(ls),);
++
++ log_all(ls, "rebuild locks");
++
++ error = -ENOMEM;
++ rc = allocate_rcom_buffer(ls);
++ if (!rc)
++ goto ret;
++
++ error = 0;
++ memset(&fill, 0, sizeof(rcom_fill_t));
++ fill.outbuf = rc->rc_buf;
++ fill.maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
++
++ do {
++ fill_rcom_buffer(ls, &fill, &nodeid);
++ if (!fill.offset)
++ break;
++
++ rc->rc_datalen = fill.offset;
++ error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0);
++ if (error)
++ goto out;
++
++ schedule();
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto out;
++	} while (fill.more);
++
++ error = gdlm_wait_function(ls, &recover_list_empty);
++
++ log_all(ls, "rebuilt %d locks", fill.count);
++
++ out:
++ rebuild_freemem(ls);
++ free_rcom_buffer(rc);
++
++ ret:
++ return error;
++}
++
++static gd_res_t *find_by_remasterid(gd_ls_t *ls, int remasterid,
++ gd_res_t *rootrsb)
++{
++ gd_res_t *rsb;
++
++ GDLM_ASSERT(rootrsb,);
++
++ if (rootrsb->res_remasterid == remasterid) {
++ rsb = rootrsb;
++ goto out;
++ }
++
++ list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
++ if (rsb->res_remasterid == remasterid)
++ goto out;
++ }
++ rsb = NULL;
++
++ out:
++ return rsb;
++}
++
++/*
++ * Search a queue for the given remote lock id (remlkid).
++ */
++
++static gd_lkb_t *search_remlkid(struct list_head *statequeue, int nodeid,
++ int remid)
++{
++ gd_lkb_t *lkb;
++
++ list_for_each_entry(lkb, statequeue, lkb_statequeue) {
++ if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) {
++ return lkb;
++ }
++ }
++
++ return NULL;
++}
++
++/*
++ * Given a remote lock ID (and a parent resource), return the local LKB for it.
++ * Hopefully we don't need to do this too often on deep lock trees.  This is
++ * VERY suboptimal for anything but the smallest lock trees.  It searches the
++ * lock tree for an LKB with the remote id "remid" and the node "nodeid" and
++ * returns the LKB address.  OPTIMISATION: we should keep a list of these while
++ * we are building up the remastered LKBs.
++ */
++
++static gd_lkb_t *find_by_remlkid(gd_res_t *rootrsb, int nodeid, int remid)
++{
++ gd_lkb_t *lkb;
++ gd_res_t *rsb;
++
++ lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++
++ lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++
++ lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++
++ list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
++ lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++
++ lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++
++ lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid);
++ if (lkb)
++ goto out;
++ }
++ lkb = NULL;
++
++ out:
++ return lkb;
++}
++
++/*
++ * Unpack an LKB from a remaster operation
++ */
++
++static int deserialise_lkb(gd_ls_t *ls, int rem_nodeid, gd_res_t *rootrsb,
++ char *buf, int *ptr, char *outbuf, int *outoffp)
++{
++ gd_lkb_t *lkb;
++ gd_res_t *rsb;
++ int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp;
++
++ remote_lkid = get_int(buf, ptr);
++
++ rsb_rmid = get_int(buf, ptr);
++ rsb = find_by_remasterid(ls, rsb_rmid, rootrsb);
++ GDLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid););
++
++ /*
++ * We could have received this lkb already from a previous recovery
++ * that was interrupted. If so, just return the lkid to the remote
++ * node.
++ */
++ lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid);
++ if (lkb)
++ goto put_lkid;
++
++ lkb = create_lkb(rsb->res_ls);
++ if (!lkb)
++ goto out;
++
++ lkb->lkb_remid = remote_lkid;
++ lkb->lkb_flags = get_int(buf, ptr);
++ status = get_int(buf, ptr);
++ lkb->lkb_rqmode = get_char(buf, ptr);
++ lkb->lkb_grmode = get_char(buf, ptr);
++ atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr));
++
++ parentid = get_int(buf, ptr);
++ lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr);
++
++ if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
++ lkb->lkb_lvbptr = allocate_lvb(ls);
++ if (!lkb->lkb_lvbptr)
++ goto out;
++ get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr);
++ }
++
++ if (lkb->lkb_flags & GDLM_LKFLG_RANGE) {
++ uint64_t start, end;
++
++ /* Don't need to keep the range flag, for comms use only */
++ lkb->lkb_flags &= ~GDLM_LKFLG_RANGE;
++ start = get_int64(buf, ptr);
++ end = get_int64(buf, ptr);
++
++ lkb->lkb_range = allocate_range(rsb->res_ls);
++ if (!lkb->lkb_range)
++ goto out;
++
++		switch (status) {
++		case GDLM_LKSTS_CONVERT:
++			/* A converting lock carries two ranges: the
++			   requested range first, then the granted range. */
++			lkb->lkb_range[RQ_RANGE_START] = start;
++			lkb->lkb_range[RQ_RANGE_END] = end;
++			start = get_int64(buf, ptr);
++			end = get_int64(buf, ptr);
++			lkb->lkb_range[GR_RANGE_START] = start;
++			lkb->lkb_range[GR_RANGE_END] = end;
++			break;
++
++		case GDLM_LKSTS_WAITING:
++			lkb->lkb_range[RQ_RANGE_START] = start;
++			lkb->lkb_range[RQ_RANGE_END] = end;
++			break;
++
++ case GDLM_LKSTS_GRANTED:
++ lkb->lkb_range[GR_RANGE_START] = start;
++ lkb->lkb_range[GR_RANGE_END] = end;
++ break;
++ default:
++ GDLM_ASSERT(0,);
++ }
++ }
++
++ /* Resolve local lock LKB address from parent ID */
++ if (parentid)
++ lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid,
++ parentid);
++
++ atomic_inc(&rsb->res_ref);
++ lkb->lkb_resource = rsb;
++
++ lkb->lkb_flags |= GDLM_LKFLG_MSTCPY;
++ lkb->lkb_nodeid = rem_nodeid;
++
++ /*
++ * Put the lkb on an RSB queue. An lkb that's in the midst of a
++ * conversion request (on the requesting node's lockqueue and has
++ * LQCONVERT set) should be put on the granted queue. The convert
++ * request will be resent by the requesting node.
++ */
++
++ if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) {
++ lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
++ GDLM_ASSERT(status == GDLM_LKSTS_CONVERT,
++ printk("status=%d\n", status););
++ lkb->lkb_rqmode = DLM_LOCK_IV;
++ status = GDLM_LKSTS_GRANTED;
++ }
++
++ lkb_enqueue(rsb, lkb, status);
++
++ /*
++ * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL).
++ */
++
++ if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK)
++ && lkb->lkb_grmode > DLM_LOCK_NL) {
++ if (!rsb->res_lvbptr)
++ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
++ if (!rsb->res_lvbptr)
++ goto out;
++ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
++ }
++
++ /*
++ * Clear flags that may have been sent over that are only relevant in
++ * the context of the sender.
++ */
++
++ lkb->lkb_flags &= ~(GDLM_LKFLG_DELAST | GDLM_LKFLG_DELETED |
++ GDLM_LKFLG_LQRESEND | GDLM_LKFLG_NOREBUILD |
++ GDLM_LKFLG_DEMOTED);
++
++ put_lkid:
++ /* Return the new LKID to the caller's buffer */
++ put_int(lkb->lkb_id, outbuf, outoffp);
++ put_int(lkb->lkb_remid, outbuf, outoffp);
++ error = 0;
++
++ out:
++ return error;
++}
++
++static gd_res_t *deserialise_rsb(gd_ls_t *ls, int nodeid, gd_res_t *rootrsb,
++ char *buf, int *ptr)
++{
++ int length;
++ int remasterid;
++ int parent_remasterid;
++ char name[DLM_RESNAME_MAXLEN];
++ int error;
++ gd_res_t *parent = NULL;
++ gd_res_t *rsb;
++
++ get_bytes(name, &length, buf, ptr);
++ remasterid = get_int(buf, ptr);
++ parent_remasterid = get_int(buf, ptr);
++
++ if (parent_remasterid)
++ parent = find_by_remasterid(ls, parent_remasterid, rootrsb);
++
++ /*
++ * The rsb reference from this find_or_create_rsb() will keep the rsb
++ * around while we add new lkb's to it from deserialise_lkb. Each of
++ * the lkb's will add an rsb reference. The reference added here is
++ * removed by release_rsb() after all lkb's are added.
++ */
++
++ error = find_or_create_rsb(ls, parent, name, length, 1, &rsb);
++ GDLM_ASSERT(!error,);
++
++	/* find_or_create_rsb() may have just created the RSB, in which case
++	   it has no master assigned yet and we become the master. */
++ if (rsb->res_nodeid == -1)
++ rsb->res_nodeid = our_nodeid();
++
++ rsb->res_remasterid = remasterid;
++
++ return rsb;
++}
++
++/*
++ * Processing at the receiving end of a NEWLOCKS message from a node in
++ * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote
++ * node whose locks we are now mastering. For a reply we need to send back the
++ * new lockids of the remastered locks so that remote ops can find them.
++ */
++
++int rebuild_rsbs_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
++{
++ gd_rcom_t *rc;
++ gd_res_t *rsb = NULL;
++ rebuild_node_t *rnode;
++ char *outbuf;
++ int outptr, ptr = 0, error = -ENOMEM;
++
++ rnode = find_rebuild_root(ls, nodeid);
++ if (!rnode)
++ goto out;
++
++ /*
++ * Allocate a buffer for the reply message which is a list of remote
++ * lock IDs and their (new) local lock ids. It will always be big
++ * enough to fit <n> ID pairs if it already fit <n> LKBs.
++ */
++
++ rc = allocate_rcom_buffer(ls);
++ if (!rc)
++ goto out;
++ outbuf = rc->rc_buf;
++ outptr = 0;
++
++ /*
++ * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're
++ * created. Each deserialise_rsb adds an rsb reference that must be
++ * removed with release_rsb once all new lkb's for an rsb have been
++ * added.
++ */
++
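++	/*
++	 * A sketch of the record stream, inferred from pack_rsb_tree() and
++	 * the deserialise routines below (type tags are 1 byte, ints 4):
++	 *
++	 *   [REMASTER_ROOTRSB] [name] [remasterid] [parent remasterid]
++	 *   [REMASTER_LKB] [remote lkid] [rsb remasterid] [flags] [status] ...
++	 *   [REMASTER_RSB] ... any subrsbs followed by their own lkb's ...
++	 */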
++ while (ptr < len) {
++ int type;
++
++ type = get_char(buf, &ptr);
++
++ switch (type) {
++ case REMASTER_ROOTRSB:
++ if (rsb)
++ release_rsb(rsb);
++ rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
++ &ptr);
++ rnode->rootrsb = rsb;
++ break;
++
++ case REMASTER_RSB:
++ if (rsb)
++ release_rsb(rsb);
++ rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
++ &ptr);
++ break;
++
++ case REMASTER_LKB:
++ deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr,
++ outbuf, &outptr);
++ break;
++
++ default:
++ GDLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d "
++ "len=%d\n", type, nodeid, ptr,
++ len););
++ }
++ }
++
++ if (rsb)
++ release_rsb(rsb);
++
++ /*
++ * Reply with the new lock IDs.
++ */
++
++ rc->rc_datalen = outptr;
++ error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0);
++
++ free_rcom_buffer(rc);
++
++ out:
++ return error;
++}
++
++/*
++ * Processing for a NEWLOCKIDS message. Called when we get the reply from the
++ * new master telling us what the new remote lock IDs are for the remastered
++ * locks.
++ */
++
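++/*
++ * A sketch of the reply payload, as written by the put_int() pairs at the
++ * end of deserialise_lkb() on the new master: a flat list of 8-byte pairs,
++ *
++ *	[new lkid on the master (4 bytes)] [our original lkid (4 bytes)] ...
++ *
++ * so each pair maps one of our locks to its new remote id.
++ */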
++int rebuild_rsbs_lkids_recv(gd_ls_t *ls, int nodeid, char *buf, int len)
++{
++ int offset = 0;
++
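++	/* rcom_send_message() pads an empty payload out to one byte, so a
++	   len of 1 means there is no data */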
++ if (len == 1)
++ len = 0;
++
++ while (offset < len) {
++ int remote_id;
++ int local_id;
++ gd_lkb_t *lkb;
++
++ if (offset + 8 > len) {
++ log_error(ls, "rebuild_rsbs_lkids_recv: bad data "
++ "length nodeid=%d offset=%d len=%d",
++ nodeid, offset, len);
++ break;
++ }
++
++ remote_id = get_int(buf, &offset);
++ local_id = get_int(buf, &offset);
++
++ lkb = find_lock_by_id(ls, local_id);
++ if (lkb) {
++ lkb->lkb_remid = remote_id;
++ have_new_lkid(lkb);
++ } else {
++ log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid "
++ "nodeid=%d id=%x remid=%x offset=%d len=%d",
++ nodeid, local_id, remote_id, offset, len);
++ }
++ }
++
++ if (recover_list_empty(ls))
++ wake_up(&ls->ls_wait_general);
++
++ return 0;
++}
+diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h
+--- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/rebuild.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __REBUILD_DOT_H__
++#define __REBUILD_DOT_H__
++
++int rebuild_rsbs_send(gd_ls_t * ls);
++int rebuild_rsbs_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
++int rebuild_rsbs_lkids_recv(gd_ls_t * ls, int nodeid, char *buf, int len);
++int rebuild_freemem(gd_ls_t * ls);
++
++#endif /* __REBUILD_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c
+--- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/reccomms.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,502 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++#include "lowcomms.h"
++#include "midcomms.h"
++#include "reccomms.h"
++#include "nodes.h"
++#include "lockspace.h"
++#include "recover.h"
++#include "dir.h"
++#include "config.h"
++#include "rebuild.h"
++#include "memory.h"
++
++/* Running on the basis that only a single recovery communication will be done
++ * at a time per lockspace */
++
++static void rcom_process_message(gd_ls_t * ls, uint32_t nodeid, gd_rcom_t * rc);
++
++/*
++ * Track per-node progress/stats during recovery to help debugging.
++ */
++
++void rcom_log(gd_ls_t *ls, int nodeid, gd_rcom_t *rc, int send)
++{
++ gd_csb_t *csb;
++ int found = 0;
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ if (csb->csb_node->gn_nodeid == nodeid) {
++ found = TRUE;
++ break;
++ }
++ }
++
++ if (!found)
++ return;
++
++ if (rc->rc_subcmd == RECCOMM_RECOVERNAMES) {
++ if (send) {
++ csb->csb_names_send_count++;
++ csb->csb_names_send_msgid = rc->rc_msgid;
++ } else {
++ csb->csb_names_recv_count++;
++ csb->csb_names_recv_msgid = rc->rc_msgid;
++ }
++ } else if (rc->rc_subcmd == RECCOMM_NEWLOCKS) {
++ if (send) {
++ csb->csb_locks_send_count++;
++ csb->csb_locks_send_msgid = rc->rc_msgid;
++ } else {
++ csb->csb_locks_recv_count++;
++ csb->csb_locks_recv_msgid = rc->rc_msgid;
++ }
++ }
++}
++
++void rcom_log_clear(gd_ls_t *ls)
++{
++ gd_csb_t *csb;
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ csb->csb_names_send_count = 0;
++ csb->csb_names_send_msgid = 0;
++ csb->csb_names_recv_count = 0;
++ csb->csb_names_recv_msgid = 0;
++ csb->csb_locks_send_count = 0;
++ csb->csb_locks_send_msgid = 0;
++ csb->csb_locks_recv_count = 0;
++ csb->csb_locks_recv_msgid = 0;
++ }
++}
++
++static int rcom_response(gd_ls_t *ls)
++{
++ return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
++}
++
++/**
++ * rcom_send_message - send or request recovery data
++ * @ls: the lockspace
++ * @nodeid: node to which the message is sent
++ * @type: type of recovery message
++ * @rc: the rc buffer to send
++ * @need_reply: wait for reply if this is set
++ *
++ * Using this interface
++ * i) Allocate an rc buffer:
++ * rc = allocate_rcom_buffer(ls);
++ * ii) Copy data to send beginning at rc->rc_buf:
++ * memcpy(rc->rc_buf, mybuf, mylen);
++ * iii) Set rc->rc_datalen to the number of bytes copied in (ii):
++ * rc->rc_datalen = mylen;
++ * iv) Submit the rc to this function:
++ * rcom_send_message(ls, nodeid, type, rc, need_reply);
++ *
++ * The max value of "mylen" is dlm_config.buffer_size - sizeof(gd_rcom_t). If
++ * more data must be passed in one send, use rcom_expand_buffer() which
++ * incrementally increases the size of the rc buffer by dlm_config.buffer_size
++ * bytes.
++ *
++ * Any data returned for the message (when need_reply is set) will be saved in
++ * rc->rc_buf when this function returns and rc->rc_datalen will be set to the
++ * number of bytes copied into rc->rc_buf.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
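++/*
++ * A minimal synchronous example (a sketch; "mybuf" and "mylen" are
++ * illustrative placeholders, not part of this interface):
++ *
++ *	rc = allocate_rcom_buffer(ls);
++ *	memcpy(rc->rc_buf, mybuf, mylen);
++ *	rc->rc_datalen = mylen;
++ *	error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
++ *	(on success the reply is now in rc->rc_buf, rc->rc_datalen bytes)
++ *	free_rcom_buffer(rc);
++ */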
++int rcom_send_message(gd_ls_t *ls, uint32_t nodeid, int type, gd_rcom_t *rc,
++ int need_reply)
++{
++ int error = 0;
++
++ if (!rc->rc_datalen)
++ rc->rc_datalen = 1;
++
++ /*
++ * Fill in the header.
++ */
++
++ rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE;
++ rc->rc_header.rh_lockspace = ls->ls_global_id;
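++	/* the "- 1" is because gd_rcom_t itself appears to include one byte
++	   of rc_buf, so that byte must not be counted twice */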
++ rc->rc_header.rh_length = sizeof(gd_rcom_t) + rc->rc_datalen - 1;
++ rc->rc_subcmd = type;
++ rc->rc_msgid = ++ls->ls_rcom_msgid;
++
++ rcom_log(ls, nodeid, rc, 1);
++
++ /*
++ * When a reply is received, the reply data goes back into this buffer.
++ * Synchronous rcom requests (need_reply=1) are serialised because of
++ * the single ls_rcom.
++ */
++
++ if (need_reply) {
++ down(&ls->ls_rcom_lock);
++ ls->ls_rcom = rc;
++ }
++
++ /*
++ * After sending the message we'll wait at the end of this function to
++ * get a reply. The READY flag will be set when the reply has been
++ * received and requested data has been copied into
++ * ls->ls_rcom->rc_buf;
++ */
++
++ GDLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),);
++
++ /*
++ * The WAIT bit indicates that we're waiting for and willing to accept a
++ * reply. Any replies are ignored unless this bit is set.
++ */
++
++ set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
++
++ /*
++ * Process the message locally.
++ */
++
++ if (nodeid == our_nodeid()) {
++ rcom_process_message(ls, nodeid, rc);
++ goto out;
++ }
++
++ /*
++ * Send the message.
++ */
++
++ log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid);
++
++ error = midcomms_send_message(nodeid, (struct gd_req_header *) rc,
++ GFP_KERNEL);
++ GDLM_ASSERT(error >= 0, printk("error = %d\n", error););
++ error = 0;
++
++ /*
++ * Wait for a reply. Once a reply is processed from midcomms, the
++ * READY bit will be set and we'll be awoken (gdlm_wait_function will
++ * return 0).
++ */
++
++ if (need_reply) {
++ error = gdlm_wait_function(ls, &rcom_response);
++ if (error)
++ log_debug(ls, "rcom wait error %d", error);
++ }
++
++ out:
++ clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
++ clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
++
++ if (need_reply)
++ up(&ls->ls_rcom_lock);
++
++ return error;
++}
++
++/*
++ * Runs in same context as midcomms.
++ */
++
++static void rcom_process_message(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *rc)
++{
++ gd_rcom_t rc_stack;
++ gd_rcom_t *reply = NULL;
++ gd_resdata_t *rd;
++ int status, datalen, maxlen;
++ uint32_t be_nodeid;
++
++ if (!ls)
++ return;
++
++ rcom_log(ls, nodeid, rc, 0);
++
++ if (gdlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) {
++ log_error(ls, "ignoring recovery message %x from %u",
++ rc->rc_subcmd, nodeid);
++ return;
++ }
++
++ switch (rc->rc_subcmd) {
++
++ case RECCOMM_STATUS:
++
++ memset(&rc_stack, 0, sizeof(gd_rcom_t));
++ reply = &rc_stack;
++
++ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
++ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
++ reply->rc_subcmd = rc->rc_subcmd;
++ reply->rc_msgid = rc->rc_msgid;
++ reply->rc_buf[0] = 0;
++
++ if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags))
++ reply->rc_buf[0] |= RESDIR_VALID;
++
++ if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags))
++ reply->rc_buf[0] |= RESDIR_ALL_VALID;
++
++ if (test_bit(LSFL_NODES_VALID, &ls->ls_flags))
++ reply->rc_buf[0] |= NODES_VALID;
++
++ if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags))
++ reply->rc_buf[0] |= NODES_ALL_VALID;
++
++ reply->rc_datalen = 1;
++ reply->rc_header.rh_length =
++ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
++
++ log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid);
++ break;
++
++ case RECCOMM_RECOVERNAMES:
++
++ reply = allocate_rcom_buffer(ls);
++ GDLM_ASSERT(reply,);
++ maxlen = dlm_config.buffer_size - sizeof(gd_rcom_t);
++
++ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
++ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
++ reply->rc_subcmd = rc->rc_subcmd;
++ reply->rc_msgid = rc->rc_msgid;
++
++ /*
++ * The other node wants a bunch of resource names. The name of
++ * the resource to begin with is in rc->rc_buf.
++ */
++
++ datalen = resdir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen,
++ reply->rc_buf, maxlen, nodeid);
++
++ reply->rc_datalen = datalen;
++ reply->rc_header.rh_length =
++ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
++
++ log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid,
++ reply->rc_msgid);
++ break;
++
++ case RECCOMM_GETMASTER:
++
++ reply = allocate_rcom_buffer(ls);
++ GDLM_ASSERT(reply,);
++
++ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
++ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
++ reply->rc_subcmd = rc->rc_subcmd;
++ reply->rc_msgid = rc->rc_msgid;
++
++ /*
++ * The other node wants to know the master of a named resource.
++ */
++
++ status = get_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen,
++ &rd, 1);
++ if (status != 0) {
++ free_rcom_buffer(reply);
++ reply = NULL;
++ return;
++ }
++ be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
++ memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t));
++ reply->rc_datalen = sizeof(uint32_t);
++ reply->rc_header.rh_length =
++ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
++ break;
++
++ case RECCOMM_BULKLOOKUP:
++
++ reply = allocate_rcom_buffer(ls);
++ GDLM_ASSERT(reply,);
++
++ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
++ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
++ reply->rc_subcmd = rc->rc_subcmd;
++ reply->rc_msgid = rc->rc_msgid;
++
++ /*
++ * This is a bulk version of the above and just returns a
++ * buffer full of node ids to match the resources
++ */
++
++ datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf,
++ rc->rc_datalen, reply->rc_buf);
++ if (datalen < 0) {
++ free_rcom_buffer(reply);
++ reply = NULL;
++ return;
++ }
++
++ reply->rc_datalen = datalen;
++ reply->rc_header.rh_length =
++ sizeof(gd_rcom_t) + reply->rc_datalen - 1;
++ break;
++
++ /*
++ * These RECCOMM messages don't need replies.
++ */
++
++ case RECCOMM_NEWLOCKS:
++ rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
++ break;
++
++ case RECCOMM_NEWLOCKIDS:
++ rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
++ break;
++
++ case RECCOMM_REMRESDATA:
++ remove_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen, 1);
++ break;
++
++ default:
++ GDLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd););
++ }
++
++ if (reply) {
++ if (nodeid == our_nodeid()) {
++ GDLM_ASSERT(rc == ls->ls_rcom,);
++ memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
++ rc->rc_datalen = reply->rc_datalen;
++ } else {
++ midcomms_send_message(nodeid,
++ (struct gd_req_header *) reply,
++ GFP_KERNEL);
++ }
++
++ if (reply != &rc_stack)
++ free_rcom_buffer(reply);
++ }
++}
++
++static void process_reply_sync(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
++{
++ gd_rcom_t *rc = ls->ls_rcom;
++
++ if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) {
++ log_error(ls, "unexpected rcom reply nodeid=%u", nodeid);
++ return;
++ }
++
++ if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) {
++ log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u",
++ reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid);
++ return;
++ }
++
++ memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
++ rc->rc_datalen = reply->rc_datalen;
++
++ /*
++ * Tell the thread waiting in rcom_send_message() that it can go ahead.
++ */
++
++ set_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
++ wake_up(&ls->ls_wait_general);
++}
++
++static void process_reply_async(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
++{
++ restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen,
++ reply->rc_msgid);
++}
++
++/*
++ * Runs in same context as midcomms.
++ */
++
++static void rcom_process_reply(gd_ls_t *ls, uint32_t nodeid, gd_rcom_t *reply)
++{
++ if (gdlm_recovery_stopped(ls)) {
++ log_error(ls, "ignoring recovery reply %x from %u",
++ reply->rc_subcmd, nodeid);
++ return;
++ }
++
++ switch (reply->rc_subcmd) {
++ case RECCOMM_GETMASTER:
++ process_reply_async(ls, nodeid, reply);
++ break;
++ case RECCOMM_STATUS:
++ case RECCOMM_NEWLOCKS:
++ case RECCOMM_NEWLOCKIDS:
++ case RECCOMM_RECOVERNAMES:
++ process_reply_sync(ls, nodeid, reply);
++ break;
++ default:
++ log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u",
++ reply->rc_subcmd, nodeid);
++ }
++}
++
++static int send_ls_not_ready(uint32_t nodeid, struct gd_req_header *header)
++{
++ struct writequeue_entry *wq;
++ gd_rcom_t *rc = (gd_rcom_t *) header;
++ gd_rcom_t *reply;
++
++ wq = lowcomms_get_buffer(nodeid, sizeof(gd_rcom_t), GFP_KERNEL,
++ (char **)&reply);
++ if (!wq)
++ return -ENOMEM;
++
++ reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
++ reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
++ reply->rc_subcmd = rc->rc_subcmd;
++ reply->rc_msgid = rc->rc_msgid;
++ reply->rc_buf[0] = 0;
++
++ reply->rc_datalen = 1;
++ reply->rc_header.rh_length = sizeof(gd_rcom_t) + reply->rc_datalen - 1;
++
++ midcomms_send_buffer((struct gd_req_header *)reply, wq);
++ return 0;
++}
++
++/*
++ * Runs in same context as midcomms. Both recovery requests and recovery
++ * replies come through this function.
++ */
++
++void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header)
++{
++ gd_ls_t *ls = find_lockspace_by_global_id(header->rh_lockspace);
++ gd_rcom_t *rc = (gd_rcom_t *) header;
++
++	/* If the lockspace doesn't exist then still send a status message
++	   back; it's possible that it just doesn't have its global_id
++	   yet. */
++ if (!ls) {
++ send_ls_not_ready(nodeid, header);
++ return;
++ }
++
++ switch (header->rh_cmd) {
++ case GDLM_REMCMD_RECOVERMESSAGE:
++ down_read(&ls->ls_rec_rsblist);
++ rcom_process_message(ls, nodeid, rc);
++ up_read(&ls->ls_rec_rsblist);
++ break;
++
++ case GDLM_REMCMD_RECOVERREPLY:
++ rcom_process_reply(ls, nodeid, rc);
++ break;
++
++ default:
++ GDLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd););
++ }
++}
++
+diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h
+--- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/reccomms.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,37 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RECCOMMS_DOT_H__
++#define __RECCOMMS_DOT_H__
++
++/* Bit flags */
++
++#define RESDIR_VALID (1)
++#define RESDIR_ALL_VALID (2)
++#define NODES_VALID (4)
++#define NODES_ALL_VALID (8)
++
++#define RECCOMM_STATUS (1)
++#define RECCOMM_RECOVERNAMES (2)
++#define RECCOMM_GETMASTER (3)
++#define RECCOMM_BULKLOOKUP (4)
++#define RECCOMM_NEWLOCKS (5)
++#define RECCOMM_NEWLOCKIDS (6)
++#define RECCOMM_REMRESDATA (7)
++
++int rcom_send_message(gd_ls_t * ls, uint32_t nodeid, int type, gd_rcom_t * rc,
++ int need_reply);
++void process_recovery_comm(uint32_t nodeid, struct gd_req_header *header);
++void rcom_log_clear(gd_ls_t *ls);
++
++#endif
+diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c
+--- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/recover.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,632 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++#include "reccomms.h"
++#include "dir.h"
++#include "locking.h"
++#include "rsb.h"
++#include "lockspace.h"
++#include "lkb.h"
++#include "nodes.h"
++#include "config.h"
++#include "ast.h"
++#include "memory.h"
++
++/*
++ * Called in recovery routines to check whether the recovery process has been
++ * interrupted/stopped by another transition. A recovery in-process will abort
++ * if the lockspace is "stopped" so that a new recovery process can start from
++ * the beginning when the lockspace is "started" again.
++ */
++
++int gdlm_recovery_stopped(gd_ls_t *ls)
++{
++ return test_bit(LSFL_LS_STOP, &ls->ls_flags);
++}
++
++static void gdlm_wait_timer_fn(unsigned long data)
++{
++ gd_ls_t *ls = (gd_ls_t *) data;
++
++ wake_up(&ls->ls_wait_general);
++}
++
++/*
++ * Wait until given function returns non-zero or lockspace is stopped (LS_STOP
++ * set due to failure of a node in ls_nodes).  When another function thinks it
++ * could have completed the waited-on task, it should wake up ls_wait_general
++ * to get an immediate response rather than waiting for the timer to detect the
++ * result. A timer wakes us up periodically while waiting to see if we should
++ * abort due to a node failure.
++ */
++
++int gdlm_wait_function(gd_ls_t *ls, int (*testfn) (gd_ls_t * ls))
++{
++ struct timer_list timer;
++ int error = 0;
++
++ init_timer(&timer);
++ timer.function = gdlm_wait_timer_fn;
++ timer.data = (long) ls;
++
++ for (;;) {
++ mod_timer(&timer, jiffies + (5 * HZ));
++
++ wchan_cond_sleep_intr(ls->ls_wait_general,
++ !testfn(ls) &&
++ !test_bit(LSFL_LS_STOP, &ls->ls_flags));
++
++ if (timer_pending(&timer))
++ del_timer(&timer);
++
++ if (testfn(ls))
++ break;
++
++ if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) {
++ error = -1;
++ break;
++ }
++ }
++
++ return error;
++}
++
++int gdlm_wait_status_all(gd_ls_t *ls, unsigned int wait_status)
++{
++ gd_rcom_t rc_stack, *rc;
++ gd_csb_t *csb;
++ int status;
++ int error = 0;
++
++ memset(&rc_stack, 0, sizeof(gd_rcom_t));
++ rc = &rc_stack;
++ rc->rc_datalen = 0;
++
++ list_for_each_entry(csb, &ls->ls_nodes, csb_list) {
++ for (;;) {
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto out;
++
++ error = rcom_send_message(ls, csb->csb_node->gn_nodeid,
++ RECCOMM_STATUS, rc, 1);
++ if (error)
++ goto out;
++
++ status = rc->rc_buf[0];
++ if (status & wait_status)
++ break;
++ else {
++ set_current_state(TASK_INTERRUPTIBLE);
++ schedule_timeout(HZ >> 1);
++ }
++ }
++ }
++
++ out:
++ return error;
++}
++
++int gdlm_wait_status_low(gd_ls_t *ls, unsigned int wait_status)
++{
++ gd_rcom_t rc_stack, *rc;
++ uint32_t nodeid = ls->ls_low_nodeid;
++ int status;
++ int error = 0;
++
++ memset(&rc_stack, 0, sizeof(gd_rcom_t));
++ rc = &rc_stack;
++ rc->rc_datalen = 0;
++
++ for (;;) {
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto out;
++
++ error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
++ if (error)
++ break;
++
++ status = rc->rc_buf[0];
++ if (status & wait_status)
++ break;
++ else {
++ set_current_state(TASK_INTERRUPTIBLE);
++ schedule_timeout(HZ >> 1);
++ }
++ }
++
++ out:
++ return error;
++}
++
++static int purge_queue(gd_ls_t *ls, struct list_head *queue)
++{
++ gd_lkb_t *lkb, *safe;
++ gd_res_t *rsb;
++ int count = 0;
++
++ list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
++ if (!lkb->lkb_nodeid)
++ continue;
++
++ GDLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,);
++
++ if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
++ list_del(&lkb->lkb_statequeue);
++
++			rsb = lkb->lkb_resource;
++
++			/* A waiting conversion is also on the deadlock
++			   queue and must be removed before its status
++			   is cleared. */
++			if (lkb->lkb_status == GDLM_LKSTS_CONVERT)
++				remove_from_deadlockqueue(lkb);
++
++			lkb->lkb_status = 0;
++
++ release_lkb(ls, lkb);
++ release_rsb(rsb);
++ count++;
++ }
++ }
++
++ return count;
++}
++
++/*
++ * Go through local restbl and for each rsb we're master of, clear out any
++ * lkb's held by departed nodes.
++ */
++
++int restbl_lkb_purge(gd_ls_t *ls)
++{
++ struct list_head *tmp2, *safe2;
++ int count = 0;
++ gd_res_t *rootrsb, *safe, *rsb;
++
++ log_all(ls, "purge locks of departed nodes");
++
++ list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) {
++
++ rootrsb->res_resdir_seq = 1;
++
++ if (rootrsb->res_nodeid)
++ continue;
++
++ hold_rsb(rootrsb);
++ down_write(&rootrsb->res_lock);
++
++ /* This traverses the subreslist in reverse order so we purge
++ * the children before their parents. */
++
++ for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev;
++ tmp2 != &rootrsb->res_subreslist;
++ tmp2 = safe2, safe2 = safe2->prev) {
++ rsb = list_entry(tmp2, gd_res_t, res_subreslist);
++
++ hold_rsb(rsb);
++ purge_queue(ls, &rsb->res_grantqueue);
++ purge_queue(ls, &rsb->res_convertqueue);
++ purge_queue(ls, &rsb->res_waitqueue);
++ release_rsb(rsb);
++ }
++ count += purge_queue(ls, &rootrsb->res_grantqueue);
++ count += purge_queue(ls, &rootrsb->res_convertqueue);
++ count += purge_queue(ls, &rootrsb->res_waitqueue);
++
++ up_write(&rootrsb->res_lock);
++ release_rsb(rootrsb);
++ }
++
++ log_all(ls, "purged %d locks", count);
++
++ return 0;
++}
++
++/*
++ * Grant any locks that have become grantable after a purge
++ */
++
++int restbl_grant_after_purge(gd_ls_t *ls)
++{
++ gd_res_t *root, *rsb, *safe;
++ int error = 0;
++
++ down_write(&ls->ls_gap_rsblist);
++
++ list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) {
++ /* only the rsb master grants locks */
++ if (root->res_nodeid)
++ continue;
++
++ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
++ log_debug(ls, "restbl_grant_after_purge aborted");
++ error = -EINTR;
++ up_write(&ls->ls_gap_rsblist);
++ goto out;
++ }
++
++ down_write(&root->res_lock);
++ grant_pending_locks(root);
++ up_write(&root->res_lock);
++
++ list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){
++ down_write(&rsb->res_lock);
++ grant_pending_locks(rsb);
++ up_write(&rsb->res_lock);
++ }
++ }
++ up_write(&ls->ls_gap_rsblist);
++ wake_astd();
++ out:
++ return error;
++}
++
++/*
++ * Set the lock master for all LKBs in a lock queue
++ */
++
++static void set_lock_master(struct list_head *queue, int nodeid)
++{
++ gd_lkb_t *lkb;
++
++ list_for_each_entry(lkb, queue, lkb_statequeue) {
++		/* Don't muck around with pre-existing sublocks */
++ if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY))
++ lkb->lkb_nodeid = nodeid;
++ }
++}
++
++static void set_master_lkbs(gd_res_t *rsb)
++{
++ set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid);
++ set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid);
++ set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid);
++}
++
++/*
++ * This rsb struct is now the master so it is responsible for keeping the
++ * latest lvb.  Find if any current lkb's have an up to date copy of the lvb to
++ * be used as the rsb copy. An equivalent step occurs as new lkb's arrive for
++ * this rsb in deserialise_lkb.
++ */
++
++static void set_rsb_lvb(gd_res_t *rsb)
++{
++ gd_lkb_t *lkb;
++
++ list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) {
++
++ if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
++ (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
++ (lkb->lkb_grmode > DLM_LOCK_NL))
++ {
++ if (!rsb->res_lvbptr)
++ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
++
++ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
++ return;
++ }
++ }
++
++ list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) {
++
++ if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
++ (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
++ (lkb->lkb_grmode > DLM_LOCK_NL))
++ {
++ if (!rsb->res_lvbptr)
++ rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
++
++ memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
++ return;
++ }
++ }
++}
++
++/*
++ * Propagate the new master nodeid to locks, subrsbs, sublocks.
++ * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider.
++ */
++
++static void set_new_master(gd_res_t *rsb)
++{
++ gd_res_t *subrsb;
++
++ down_write(&rsb->res_lock);
++
++ if (rsb->res_nodeid == our_nodeid()) {
++ rsb->res_nodeid = 0;
++ set_rsb_lvb(rsb);
++ }
++
++ set_master_lkbs(rsb);
++
++ list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) {
++ subrsb->res_nodeid = rsb->res_nodeid;
++ set_master_lkbs(subrsb);
++ }
++
++ up_write(&rsb->res_lock);
++
++ set_bit(RESFL_NEW_MASTER, &rsb->res_flags);
++}
++
++/*
++ * The recover_list contains all the rsb's for which we've requested the new
++ * master nodeid. As replies are returned from the resource directories the
++ * rsb's are removed from the list. When the list is empty we're done.
++ *
++ * The recover_list is later similarly used for all rsb's for which we've sent
++ * new lkb's and need to receive new corresponding lkid's.
++ */
++
++int recover_list_empty(gd_ls_t *ls)
++{
++ int empty;
++
++ spin_lock(&ls->ls_recover_list_lock);
++ empty = list_empty(&ls->ls_recover_list);
++ spin_unlock(&ls->ls_recover_list_lock);
++
++ return empty;
++}
++
++int recover_list_count(gd_ls_t *ls)
++{
++ int count;
++
++ spin_lock(&ls->ls_recover_list_lock);
++ count = ls->ls_recover_list_count;
++ spin_unlock(&ls->ls_recover_list_lock);
++
++ return count;
++}
++
++void recover_list_add(gd_res_t *rsb)
++{
++ gd_ls_t *ls = rsb->res_ls;
++
++ spin_lock(&ls->ls_recover_list_lock);
++ if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) {
++ list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list);
++ ls->ls_recover_list_count++;
++ hold_rsb(rsb);
++ }
++ spin_unlock(&ls->ls_recover_list_lock);
++}
++
++void recover_list_del(gd_res_t *rsb)
++{
++ gd_ls_t *ls = rsb->res_ls;
++
++ spin_lock(&ls->ls_recover_list_lock);
++ clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags);
++ list_del(&rsb->res_recover_list);
++ ls->ls_recover_list_count--;
++ spin_unlock(&ls->ls_recover_list_lock);
++
++ release_rsb(rsb);
++}
++
++static gd_res_t *recover_list_find(gd_ls_t *ls, int msgid)
++{
++ gd_res_t *rsb = NULL;
++
++ spin_lock(&ls->ls_recover_list_lock);
++
++ list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) {
++ if (rsb->res_recover_msgid == msgid)
++ goto rec_found;
++ }
++ rsb = NULL;
++
++ rec_found:
++ spin_unlock(&ls->ls_recover_list_lock);
++ return rsb;
++}
++
++#if 0
++static void recover_list_clear(gd_ls_t *ls)
++{
++ gd_res_t *rsb;
++
++ spin_lock(&ls->ls_recover_list_lock);
++
++ while (!list_empty(&ls->ls_recover_list)) {
++ rsb = list_entry(ls->ls_recover_list.next, gd_res_t,
++ res_recover_list);
++ list_del(&rsb->res_recover_list);
++ ls->ls_recover_list_count--;
++ }
++ spin_unlock(&ls->ls_recover_list_lock);
++
++}
++#endif
++
++#if 0
++void recover_list_dump(gd_ls_t *ls)
++{
++ struct list_head *tmp;
++ gd_res_t *rsb;
++
++ spin_lock(&ls->ls_recover_list_lock);
++
++ printk("recover_list_count=%d\n", ls->ls_recover_list_count);
++
++ list_for_each(tmp, &ls->ls_recover_list) {
++ rsb = list_entry(tmp, gd_res_t, res_recover_list);
++ gdlm_res_dbprint(rsb);
++ }
++ spin_unlock(&ls->ls_recover_list_lock);
++}
++#endif
++
++static int rsb_master_lookup(gd_res_t *rsb, gd_rcom_t *rc)
++{
++ gd_ls_t *ls = rsb->res_ls;
++ gd_resdata_t *rd;
++ uint32_t dir_nodeid;
++ int error;
++
++ dir_nodeid = get_directory_nodeid(rsb);
++
++ if (dir_nodeid == our_nodeid()) {
++ error = get_resdata(ls, dir_nodeid, rsb->res_name,
++ rsb->res_length, &rd, 1);
++ if (error)
++ goto fail;
++
++ rsb->res_nodeid = rd->rd_master_nodeid;
++ set_new_master(rsb);
++ } else {
++		/* As we are the only thread doing recovery this
++		   should be safe.  If not, then we need to use a different
++		   ID somehow.  We must set it in the RSB before
++		   rcom_send_message() completes because we may get a reply
++		   quite quickly.
++		 */
++ rsb->res_recover_msgid = ls->ls_rcom_msgid + 1;
++
++ recover_list_add(rsb);
++
++ memcpy(rc->rc_buf, rsb->res_name, rsb->res_length);
++ rc->rc_datalen = rsb->res_length;
++
++ error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER,
++ rc, 0);
++ if (error)
++ goto fail;
++ }
++
++ fail:
++ return error;
++}
++
++/*
++ * Go through local root resources and for each rsb which has a master which
++ * has departed, get the new master nodeid from the resdir. The resdir will
++ * assign mastery to the first node to look up the new master. That means
++ * we'll discover in this lookup if we're the new master of any rsb's.
++ *
++ * We fire off all the resdir requests individually and asynchronously to the
++ * correct resdir node. The replies are processed in rsb_master_recv().
++ */
++
++int restbl_rsb_update(gd_ls_t *ls)
++{
++ gd_res_t *rsb, *safe;
++ gd_rcom_t *rc;
++ int error = -ENOMEM;
++ int count = 0;
++
++ log_all(ls, "update remastered resources");
++
++ rc = allocate_rcom_buffer(ls);
++ if (!rc)
++ goto out;
++
++ list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) {
++ if (!rsb->res_nodeid)
++ continue;
++
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto out_free;
++
++ if (in_nodes_gone(ls, rsb->res_nodeid)) {
++ error = rsb_master_lookup(rsb, rc);
++ if (error)
++ goto out_free;
++ count++;
++ }
++ }
++
++ error = gdlm_wait_function(ls, &recover_list_empty);
++
++ log_all(ls, "updated %d resources", count);
++
++ out_free:
++ free_rcom_buffer(rc);
++
++ out:
++ return error;
++}
++
++int restbl_rsb_update_recv(gd_ls_t *ls, uint32_t nodeid, char *buf, int length,
++ int msgid)
++{
++ gd_res_t *rsb;
++ uint32_t be_nodeid;
++
++ rsb = recover_list_find(ls, msgid);
++ if (!rsb) {
++ log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid);
++ goto out;
++ }
++
++ memcpy(&be_nodeid, buf, sizeof(uint32_t));
++ rsb->res_nodeid = be32_to_cpu(be_nodeid);
++ set_new_master(rsb);
++ recover_list_del(rsb);
++
++ if (recover_list_empty(ls))
++ wake_up(&ls->ls_wait_general);
++
++ out:
++ return 0;
++}
++
++/*
++ * This function is no longer used.
++ */
++
++int bulk_master_lookup(gd_ls_t *ls, int nodeid, char *inbuf, int inlen,
++ char *outbuf)
++{
++ char *inbufptr, *outbufptr;
++
++ /*
++ * The other node wants nodeids matching the resource names in inbuf.
++ * The resource names are packed into inbuf as
++ * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is
++ * lenX bytes. Matching nodeids are packed into outbuf in order
++ * [nodeid1][nodeid2]...
++ */
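++	/*
++	 * For example, looking up "a" and "bc" sends the five input bytes
++	 * 0x01 'a' 0x02 'b' 'c' and returns eight output bytes: two
++	 * big-endian 32-bit nodeids.
++	 */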
++
++ inbufptr = inbuf;
++ outbufptr = outbuf;
++
++ while (inbufptr < inbuf + inlen) {
++ gd_resdata_t *rd;
++ uint32_t be_nodeid;
++ int status;
++
++ status = get_resdata(ls, nodeid, inbufptr + 1, *inbufptr,
++ &rd, 1);
++ if (status != 0)
++ goto fail;
++
++ inbufptr += *inbufptr + 1;
++
++ be_nodeid = cpu_to_be32(rd->rd_master_nodeid);
++ memcpy(outbufptr, &be_nodeid, sizeof(uint32_t));
++ outbufptr += sizeof(uint32_t);
++
++ /* add assertion that outbufptr - outbuf is not > than ... */
++ }
++
++ return (outbufptr - outbuf);
++
++ fail:
++ return -1;
++}
+diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h
+--- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/recover.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,34 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RECOVER_DOT_H__
++#define __RECOVER_DOT_H__
++
++int gdlm_wait_function(gd_ls_t * ls, int (*testfn) (gd_ls_t * ls));
++int gdlm_wait_status_all(gd_ls_t * ls, unsigned int wait_status);
++int gdlm_wait_status_low(gd_ls_t * ls, unsigned int wait_status);
++int gdlm_recovery_stopped(gd_ls_t * ls);
++int recover_list_empty(gd_ls_t * ls);
++int recover_list_count(gd_ls_t * ls);
++void recover_list_add(gd_res_t * rsb);
++void recover_list_del(gd_res_t * rsb);
++void recover_list_dump(gd_ls_t * ls);
++int restbl_lkb_purge(gd_ls_t * ls);
++int restbl_grant_after_purge(gd_ls_t * ls);
++int restbl_rsb_update(gd_ls_t * ls);
++int restbl_rsb_update_recv(gd_ls_t * ls, uint32_t nodeid, char *buf, int len,
++ int msgid);
++int bulk_master_lookup(gd_ls_t * ls, int nodeid, char *inbuf, int inlen,
++ char *outbuf);
++
++#endif /* __RECOVER_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c
+--- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/recoverd.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,692 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++#include "nodes.h"
++#include "dir.h"
++#include "ast.h"
++#include "recover.h"
++#include "lockspace.h"
++#include "lowcomms.h"
++#include "lockqueue.h"
++#include "lkb.h"
++#include "rebuild.h"
++
++/*
++ * next_move actions
++ */
++
++#define DO_STOP (1)
++#define DO_START (2)
++#define DO_FINISH (3)
++#define DO_FINISH_STOP (4)
++#define DO_FINISH_START (5)
++
++/*
++ * recoverd_flags for thread
++ */
++
++#define THREAD_STOP (0)
++
++/*
++ * local thread variables
++ */
++
++static unsigned long recoverd_flags;
++static struct completion recoverd_run;
++static wait_queue_head_t recoverd_wait;
++static struct task_struct *recoverd_task;
++
++/*
++ * Queue of lockspaces (gr_recover_t structs) which need to be
++ * started/recovered
++ */
++
++static struct list_head recoverd_start_queue;
++static atomic_t recoverd_start_count;
++
++extern struct list_head lslist;
++extern spinlock_t lslist_lock;
++
++void dlm_recoverd_init(void)
++{
++ INIT_LIST_HEAD(&recoverd_start_queue);
++ atomic_set(&recoverd_start_count, 0);
++
++ init_completion(&recoverd_run);
++ init_waitqueue_head(&recoverd_wait);
++ memset(&recoverd_flags, 0, sizeof(unsigned long));
++}
++
++static int enable_locking(gd_ls_t *ls, int event_id)
++{
++ int error = 0;
++
++ spin_lock(&ls->ls_recover_lock);
++ if (ls->ls_last_stop < event_id) {
++ set_bit(LSFL_LS_RUN, &ls->ls_flags);
++ up_write(&ls->ls_in_recovery);
++ } else {
++ error = -EINTR;
++ log_debug(ls, "enable_locking: abort %d", event_id);
++ }
++ spin_unlock(&ls->ls_recover_lock);
++ return error;
++}
++
++static int ls_first_start(gd_ls_t *ls, gd_recover_t *gr)
++{
++ int error;
++
++ log_all(ls, "recover event %u (first)", gr->gr_event_id);
++
++ kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id);
++
++ error = ls_nodes_init(ls, gr);
++ if (error) {
++ log_error(ls, "nodes_init failed %d", error);
++ goto out;
++ }
++
++ error = resdir_rebuild_local(ls);
++ if (error) {
++ log_error(ls, "resdir_rebuild_local failed %d", error);
++ goto out;
++ }
++
++ error = resdir_rebuild_wait(ls);
++ if (error) {
++ log_error(ls, "resdir_rebuild_wait failed %d", error);
++ goto out;
++ }
++
++ log_all(ls, "recover event %u done", gr->gr_event_id);
++ kcl_start_done(ls->ls_local_id, gr->gr_event_id);
++
++ out:
++ return error;
++}
++
++/*
++ * We are given here a new group of nodes which are in the lockspace. We first
++ * figure out the differences in ls membership from when we were last running.
++ * If nodes from before are gone, then there will be some lock recovery to do.
++ * If there are only nodes which have joined, then there's no lock recovery.
++ *
++ * note: cman requires an rc to finish starting on an revent (where nodes die)
++ * before it allows an sevent (where nodes join) to be processed. This means
++ * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA
++ * joined.
++ */
++
++static int ls_reconfig(gd_ls_t *ls, gd_recover_t *gr)
++{
++ int error, neg = 0;
++
++ log_all(ls, "recover event %u", gr->gr_event_id);
++
++ /*
++ * Add or remove nodes from the lockspace's ls_nodes list.
++ */
++
++ error = ls_nodes_reconfig(ls, gr, &neg);
++ if (error) {
++ log_error(ls, "nodes_reconfig failed %d", error);
++ goto fail;
++ }
++
++ /*
++ * Rebuild our own share of the resdir by collecting from all other
++ * nodes rsb name/master pairs for which the name hashes to us.
++ */
++
++ error = resdir_rebuild_local(ls);
++ if (error) {
++ log_error(ls, "resdir_rebuild_local failed %d", error);
++ goto fail;
++ }
++
++ /*
++ * Purge resdir-related requests that are being held in requestqueue.
++ * All resdir requests from before recovery started are invalid now due
++ * to the resdir rebuild and will be resent by the requesting nodes.
++ */
++
++ purge_requestqueue(ls);
++ set_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
++
++ /*
++ * Wait for all nodes to complete resdir rebuild.
++ */
++
++ error = resdir_rebuild_wait(ls);
++ if (error) {
++ log_error(ls, "resdir_rebuild_wait failed %d", error);
++ goto fail;
++ }
++
++ /*
++ * Mark our own lkb's waiting in the lockqueue for remote replies from
++ * nodes that are now departed. These will be resent to the new
++ * masters in resend_cluster_requests. Also mark resdir lookup
++ * requests for resending.
++ */
++
++ lockqueue_lkb_mark(ls);
++
++ error = gdlm_recovery_stopped(ls);
++ if (error)
++ goto fail;
++
++ if (neg) {
++ /*
++ * Clear lkb's for departed nodes. This can't fail since it
++ * doesn't involve communicating with other nodes.
++ */
++
++ down_write(&ls->ls_rec_rsblist);
++ restbl_lkb_purge(ls);
++ up_write(&ls->ls_rec_rsblist);
++
++ down_read(&ls->ls_rec_rsblist);
++
++ /*
++ * Get new master id's for rsb's of departed nodes. This fails
++ * if we can't communicate with other nodes.
++ */
++
++ error = restbl_rsb_update(ls);
++ if (error) {
++ log_error(ls, "restbl_rsb_update failed %d", error);
++ goto fail_up;
++ }
++
++ /*
++ * Send our lkb info to new masters. This fails if we can't
++ * communicate with a node.
++ */
++
++ error = rebuild_rsbs_send(ls);
++ if (error) {
++ log_error(ls, "rebuild_rsbs_send failed %d", error);
++ goto fail_up;
++ }
++ up_read(&ls->ls_rec_rsblist);
++ }
++
++ clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
++
++ log_all(ls, "recover event %u done", gr->gr_event_id);
++ kcl_start_done(ls->ls_local_id, gr->gr_event_id);
++ return 0;
++
++ fail_up:
++ up_read(&ls->ls_rec_rsblist);
++ fail:
++ log_all(ls, "recover event %d error %d", gr->gr_event_id, error);
++ return error;
++}
++
++static void clear_finished_nodes(gd_ls_t *ls, int finish_event)
++{
++ gd_csb_t *csb, *safe;
++
++ list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, csb_list) {
++ if (csb->csb_gone_event <= finish_event) {
++ list_del(&csb->csb_list);
++ release_csb(csb);
++ }
++ }
++}
++
++/*
++ * Between calls to this routine for a ls, there can be multiple stop/start
++ * events from cman where every start but the latest is cancelled by stops.
++ * There can only be a single finish from cman because every finish requires us
++ * to call start_done. A single finish event could be followed by multiple
++ * stop/start events. This routine takes any combination of events from cman
++ * and boils them down to one course of action.
++ */
++
++int next_move(gd_ls_t *ls, gd_recover_t **gr_out, int *finish_out)
++{
++ LIST_HEAD(events);
++ unsigned int cmd = 0, stop, start, finish;
++ unsigned int last_stop, last_start, last_finish;
++ gd_recover_t *gr = NULL, *start_gr = NULL;
++
++ /*
++ * Grab the current state of cman/sm events.
++ */
++
++ spin_lock(&ls->ls_recover_lock);
++
++ stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0;
++ start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0;
++ finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0;
++
++ last_stop = ls->ls_last_stop;
++ last_start = ls->ls_last_start;
++ last_finish = ls->ls_last_finish;
++
++ while (!list_empty(&ls->ls_recover)) {
++ gr = list_entry(ls->ls_recover.next, gd_recover_t, gr_list);
++ list_del(&gr->gr_list);
++ list_add_tail(&gr->gr_list, &events);
++ }
++ spin_unlock(&ls->ls_recover_lock);
++
++ log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish,
++ last_stop, last_start, last_finish);
++
++ /*
++ * Toss start events which have since been cancelled.
++ */
++
++ while (!list_empty(&events)) {
++ GDLM_ASSERT(start,);
++ gr = list_entry(events.next, gd_recover_t, gr_list);
++ list_del(&gr->gr_list);
++
++ if (gr->gr_event_id <= last_stop) {
++ log_debug(ls, "move skip event %u", gr->gr_event_id);
++ kfree(gr->gr_nodeids);
++ free_dlm_recover(gr);
++ gr = NULL;
++ } else {
++ log_debug(ls, "move use event %u", gr->gr_event_id);
++ GDLM_ASSERT(!start_gr,);
++ start_gr = gr;
++ }
++ }
++
++ /*
++ * Eight possible combinations of events.
++ */
++
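++	/*
++	 * Summarised as (stop, start, finish) -> action, matching the
++	 * numbered cases below; "invalid" combinations trip an assertion:
++	 *
++	 *   0,0,0 -> none          1,0,0 -> STOP
++	 *   0,0,1 -> FINISH        1,0,1 -> FINISH_STOP
++	 *   0,1,0 -> START         1,1,0 -> START or STOP
++	 *   0,1,1 -> invalid       1,1,1 -> FINISH_START or FINISH_STOP
++	 */
++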
++ /* 0 */
++ if (!stop && !start && !finish) {
++ GDLM_ASSERT(!start_gr,);
++ cmd = 0;
++ goto out;
++ }
++
++ /* 1 */
++ if (!stop && !start && finish) {
++ GDLM_ASSERT(!start_gr,);
++ GDLM_ASSERT(last_start > last_stop,);
++ GDLM_ASSERT(last_finish == last_start,);
++ cmd = DO_FINISH;
++ *finish_out = last_finish;
++ goto out;
++ }
++
++ /* 2 */
++ if (!stop && start && !finish) {
++ GDLM_ASSERT(start_gr,);
++ GDLM_ASSERT(last_start > last_stop,);
++ cmd = DO_START;
++ *gr_out = start_gr;
++ goto out;
++ }
++
++ /* 3 */
++ if (!stop && start && finish) {
++ GDLM_ASSERT(0, printk("finish and start with no stop\n"););
++ }
++
++ /* 4 */
++ if (stop && !start && !finish) {
++ GDLM_ASSERT(!start_gr,);
++ GDLM_ASSERT(last_start == last_stop,);
++ cmd = DO_STOP;
++ goto out;
++ }
++
++ /* 5 */
++ if (stop && !start && finish) {
++ GDLM_ASSERT(!start_gr,);
++ GDLM_ASSERT(last_finish == last_start,);
++ GDLM_ASSERT(last_stop == last_start,);
++ cmd = DO_FINISH_STOP;
++ *finish_out = last_finish;
++ goto out;
++ }
++
++ /* 6 */
++ if (stop && start && !finish) {
++ if (start_gr) {
++ GDLM_ASSERT(last_start > last_stop,);
++ cmd = DO_START;
++ *gr_out = start_gr;
++ } else {
++ GDLM_ASSERT(last_stop == last_start,);
++ cmd = DO_STOP;
++ }
++ goto out;
++ }
++
++ /* 7 */
++ if (stop && start && finish) {
++ if (start_gr) {
++ GDLM_ASSERT(last_start > last_stop,);
++ GDLM_ASSERT(last_start > last_finish,);
++ cmd = DO_FINISH_START;
++ *finish_out = last_finish;
++ *gr_out = start_gr;
++ } else {
++ GDLM_ASSERT(last_start == last_stop,);
++ GDLM_ASSERT(last_start > last_finish,);
++ cmd = DO_FINISH_STOP;
++ *finish_out = last_finish;
++ }
++ goto out;
++ }
++
++ out:
++ return cmd;
++}
++
++/*
++ * This function decides what to do given every combination of current
++ * lockspace state and next lockspace state.
++ */
++
++static void do_ls_recovery(gd_ls_t *ls)
++{
++ gd_recover_t *gr = NULL;
++ int error, cur_state, next_state = 0, do_now, finish_event = 0;
++
++ do_now = next_move(ls, &gr, &finish_event);
++ if (!do_now)
++ goto out;
++
++ cur_state = ls->ls_state;
++ next_state = 0;
++
++ GDLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags),
++ log_error(ls, "curstate=%d donow=%d", cur_state, do_now););
++
++ /*
++ * LSST_CLEAR - we're not in any recovery state. We can get a stop or
++ * a stop and start which together equate to a START.
++ */
++
++ if (cur_state == LSST_CLEAR) {
++ switch (do_now) {
++ case DO_STOP:
++ next_state = LSST_WAIT_START;
++ break;
++
++ case DO_START:
++ error = ls_reconfig(ls, gr);
++ if (error)
++ next_state = LSST_WAIT_START;
++ else
++ next_state = LSST_RECONFIG_DONE;
++ break;
++
++ case DO_FINISH: /* invalid */
++ case DO_FINISH_STOP: /* invalid */
++ case DO_FINISH_START: /* invalid */
++ default:
++ GDLM_ASSERT(0,);
++ }
++ goto out;
++ }
++
++ /*
++ * LSST_WAIT_START - we're not running because of getting a stop or
++ * failing a start. We wait in this state for another stop/start or
++ * just the next start to begin another reconfig attempt.
++ */
++
++ if (cur_state == LSST_WAIT_START) {
++ switch (do_now) {
++ case DO_STOP:
++ break;
++
++ case DO_START:
++ error = ls_reconfig(ls, gr);
++ if (error)
++ next_state = LSST_WAIT_START;
++ else
++ next_state = LSST_RECONFIG_DONE;
++ break;
++
++ case DO_FINISH: /* invalid */
++ case DO_FINISH_STOP: /* invalid */
++ case DO_FINISH_START: /* invalid */
++ default:
++ GDLM_ASSERT(0,);
++ }
++ goto out;
++ }
++
++ /*
++ * LSST_RECONFIG_DONE - we entered this state after successfully
++ * completing ls_reconfig and calling kcl_start_done. We expect to get
++ * a finish if everything goes ok. A finish could be followed by stop
++ * or stop/start before we get here to check it. Or a finish may never
++ * happen, only stop or stop/start.
++ */
++
++ if (cur_state == LSST_RECONFIG_DONE) {
++ switch (do_now) {
++ case DO_FINISH:
++ clear_finished_nodes(ls, finish_event);
++ next_state = LSST_CLEAR;
++
++ error = enable_locking(ls, finish_event);
++ if (error)
++ break;
++
++ error = process_requestqueue(ls);
++ if (error)
++ break;
++
++ error = resend_cluster_requests(ls);
++ if (error)
++ break;
++
++ restbl_grant_after_purge(ls);
++
++ log_all(ls, "recover event %u finished", finish_event);
++ break;
++
++ case DO_STOP:
++ next_state = LSST_WAIT_START;
++ break;
++
++ case DO_FINISH_STOP:
++ clear_finished_nodes(ls, finish_event);
++ next_state = LSST_WAIT_START;
++ break;
++
++ case DO_FINISH_START:
++ clear_finished_nodes(ls, finish_event);
++ /* fall into DO_START */
++
++ case DO_START:
++ error = ls_reconfig(ls, gr);
++ if (error)
++ next_state = LSST_WAIT_START;
++ else
++ next_state = LSST_RECONFIG_DONE;
++ break;
++
++ default:
++ GDLM_ASSERT(0,);
++ }
++ goto out;
++ }
++
++ /*
++ * LSST_INIT - state after ls is created and before it has been
++ * started. A start operation will cause the ls to be started for the
++ * first time. A failed start will cause it to just wait in INIT for
++ * another stop/start.
++ */
++
++ if (cur_state == LSST_INIT) {
++ switch (do_now) {
++ case DO_START:
++ error = ls_first_start(ls, gr);
++ if (!error)
++ next_state = LSST_INIT_DONE;
++ break;
++
++ case DO_STOP:
++ break;
++
++ case DO_FINISH: /* invalid */
++ case DO_FINISH_STOP: /* invalid */
++ case DO_FINISH_START: /* invalid */
++ default:
++ GDLM_ASSERT(0,);
++ }
++ goto out;
++ }
++
++ /*
++ * LSST_INIT_DONE - after the first start operation is completed
++ * successfully and kcl_start_done() called. If there are no errors, a
++ * finish will arrive next and we'll move to LSST_CLEAR.
++ */
++
++ if (cur_state == LSST_INIT_DONE) {
++ switch (do_now) {
++ case DO_STOP:
++ case DO_FINISH_STOP:
++ next_state = LSST_WAIT_START;
++ break;
++
++ case DO_START:
++ case DO_FINISH_START:
++ error = ls_reconfig(ls, gr);
++ if (error)
++ next_state = LSST_WAIT_START;
++ else
++ next_state = LSST_RECONFIG_DONE;
++ break;
++
++ case DO_FINISH:
++ next_state = LSST_CLEAR;
++ enable_locking(ls, finish_event);
++ log_all(ls, "recover event %u finished", finish_event);
++ break;
++
++ default:
++ GDLM_ASSERT(0,);
++ }
++ goto out;
++ }
++
++ out:
++ if (next_state)
++ ls->ls_state = next_state;
++
++ if (gr) {
++ kfree(gr->gr_nodeids);
++ free_dlm_recover(gr);
++ }
++}
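++
++/*
++ * Summary of the transitions above (a restatement of the code, not new
++ * behavior):
++ *
++ *	LSST_INIT          start ok -> LSST_INIT_DONE; otherwise stay in INIT
++ *	LSST_INIT_DONE     finish -> LSST_CLEAR; stop -> LSST_WAIT_START;
++ *	                   start -> LSST_RECONFIG_DONE or LSST_WAIT_START
++ *	LSST_CLEAR         stop -> LSST_WAIT_START;
++ *	                   start -> LSST_RECONFIG_DONE or LSST_WAIT_START
++ *	LSST_WAIT_START    start -> LSST_RECONFIG_DONE or LSST_WAIT_START
++ *	LSST_RECONFIG_DONE finish -> LSST_CLEAR; stop -> LSST_WAIT_START;
++ *	                   start -> LSST_RECONFIG_DONE or LSST_WAIT_START
++ *
++ * where "start -> A or B" means A on a successful ls_reconfig and B on a
++ * failed one.
++ */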
++
++static __inline__ gd_ls_t *get_work(int clear)
++{
++ gd_ls_t *ls;
++
++ spin_lock(&lslist_lock);
++
++ list_for_each_entry(ls, &lslist, ls_list) {
++ if (clear) {
++ if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
++ goto got_work;
++
++ } else {
++ if (test_bit(LSFL_WORK, &ls->ls_flags))
++ goto got_work;
++ }
++ }
++ ls = NULL;
++
++ got_work:
++ spin_unlock(&lslist_lock);
++
++ return ls;
++}
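++
++/*
++ * Note on the "clear" argument: the recovery thread sleeps on
++ * !get_work(0), which only peeks at LSFL_WORK, then claims the work with
++ * get_work(1), which clears the bit so a lockspace is handed out once
++ * per kick.
++ */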
++
++/*
++ * Thread which does recovery for all lockspaces.
++ */
++
++static int dlm_recoverd(void *arg)
++{
++ gd_ls_t *ls;
++
++ daemonize("dlm_recoverd");
++ recoverd_task = current;
++ complete(&recoverd_run);
++
++ while (!test_bit(THREAD_STOP, &recoverd_flags)) {
++ wchan_cond_sleep_intr(recoverd_wait, !get_work(0));
++ if ((ls = get_work(1)))
++ do_ls_recovery(ls);
++ }
++
++ complete(&recoverd_run);
++ return 0;
++}
++
++/*
++ * Mark a specific lockspace as needing work and wake up the thread to do it.
++ */
++
++void recoverd_kick(gd_ls_t *ls)
++{
++ set_bit(LSFL_WORK, &ls->ls_flags);
++ wake_up(&recoverd_wait);
++}
++
++/*
++ * Start the recoverd thread when gdlm is started (before any lockspaces).
++ */
++
++int recoverd_start(void)
++{
++ int error;
++
++ clear_bit(THREAD_STOP, &recoverd_flags);
++ error = kernel_thread(dlm_recoverd, NULL, 0);
++ if (error < 0)
++ goto out;
++
++ error = 0;
++ wait_for_completion(&recoverd_run);
++
++ out:
++ return error;
++}
++
++/*
++ * Stop the recoverd thread when gdlm is shut down (all lockspaces are gone).
++ */
++
++int recoverd_stop(void)
++{
++ set_bit(THREAD_STOP, &recoverd_flags);
++ wake_up(&recoverd_wait);
++ wait_for_completion(&recoverd_run);
++
++ return 0;
++}
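+#
+# A minimal sketch of how these entry points are meant to be driven,
+# assuming the module init/exit and event-callback wiring implied by the
+# comments above (illustrative, not part of the patch):
+#
+#	recoverd_start();	/* once, when the dlm module loads */
+#	...
+#	recoverd_kick(ls);	/* from the cman stop/start/finish callbacks */
+#	...
+#	recoverd_stop();	/* once, after all lockspaces are gone */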
+diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h
+--- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/recoverd.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RECOVERD_DOT_H__
++#define __RECOVERD_DOT_H__
++
++void dlm_recoverd_init(void);
++void recoverd_kick(gd_ls_t * ls);
++int recoverd_start(void);
++int recoverd_stop(void);
++
++#endif /* __RECOVERD_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c
+--- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/rsb.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,307 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++#include "locking.h"
++#include "memory.h"
++#include "lockqueue.h"
++#include "nodes.h"
++#include "dir.h"
++#include "util.h"
++
++static gd_res_t *search_hashchain(struct list_head *head, gd_res_t *parent,
++ char *name, int namelen)
++{
++ gd_res_t *r;
++
++ list_for_each_entry(r, head, res_hashchain) {
++ if ((parent == r->res_parent) && (namelen == r->res_length) &&
++ (memcmp(name, r->res_name, namelen) == 0)) {
++ atomic_inc(&r->res_ref);
++ return r;
++ }
++ }
++
++ return NULL;
++}
++
++/*
++ * A way to hold onto an rsb we already have a reference to, for whatever
++ * reason, to make sure it doesn't go away. Opposite of release_rsb().
++ */
++
++void hold_rsb(gd_res_t *r)
++{
++ atomic_inc(&r->res_ref);
++}
++
++/*
++ * release_rsb() - Decrement reference count on rsb struct. Free the rsb
++ * struct when there are zero references. Every lkb for the rsb adds a
++ * reference. When the count is zero there can be no more lkbs for the rsb,
++ * on the queues or anywhere else.
++ */
++
++void release_rsb(gd_res_t *r)
++{
++ gd_ls_t *ls = r->res_ls;
++ int removed = FALSE;
++
++ write_lock(&ls->ls_reshash_lock);
++ atomic_dec(&r->res_ref);
++
++ if (!atomic_read(&r->res_ref)) {
++ GDLM_ASSERT(list_empty(&r->res_grantqueue),);
++ GDLM_ASSERT(list_empty(&r->res_waitqueue),);
++ GDLM_ASSERT(list_empty(&r->res_convertqueue),);
++ removed = TRUE;
++ list_del(&r->res_hashchain);
++ }
++ write_unlock(&ls->ls_reshash_lock);
++
++ if (removed) {
++ down_read(&ls->ls_gap_rsblist);
++ if (r->res_parent)
++ list_del(&r->res_subreslist);
++ else
++ list_del(&r->res_rootlist);
++ up_read(&ls->ls_gap_rsblist);
++
++ /*
++ * Remove resdir entry if this was a locally mastered root rsb.
++ */
++ if (!r->res_parent && !r->res_nodeid) {
++ if (get_directory_nodeid(r) != our_nodeid())
++ remote_remove_resdata(r->res_ls,
++ get_directory_nodeid(r),
++ r->res_name,
++ r->res_length,
++ r->res_resdir_seq);
++ else
++ remove_resdata(r->res_ls, our_nodeid(),
++ r->res_name, r->res_length,
++ r->res_resdir_seq);
++ }
++
++ if (r->res_lvbptr)
++ free_lvb(r->res_lvbptr);
++
++ free_rsb(r);
++ }
++}
++
++/*
++ * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist.
++ * If the rsb exists, its ref count is incremented by this function. If it
++ * doesn't exist, it's created with a ref count of one.
++ */
++
++int find_or_create_rsb(gd_ls_t *ls, gd_res_t *parent, char *name, int namelen,
++ int create, gd_res_t **rp)
++{
++ uint32_t hash;
++ gd_res_t *r, *tmp;
++ int error = -ENOMEM;
++
++ GDLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
++
++ hash = gdlm_hash(name, namelen);
++ hash &= ls->ls_hashmask;
++
++ read_lock(&ls->ls_reshash_lock);
++ r = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
++ read_unlock(&ls->ls_reshash_lock);
++
++ if (r)
++ goto out_set;
++ if (!create) {
++ *rp = NULL;
++ goto out;
++ }
++
++ r = allocate_rsb(ls, namelen);
++ if (!r)
++ goto fail;
++
++ INIT_LIST_HEAD(&r->res_subreslist);
++ INIT_LIST_HEAD(&r->res_grantqueue);
++ INIT_LIST_HEAD(&r->res_convertqueue);
++ INIT_LIST_HEAD(&r->res_waitqueue);
++
++ memcpy(r->res_name, name, namelen);
++ r->res_length = namelen;
++ r->res_ls = ls;
++ init_rwsem(&r->res_lock);
++ atomic_set(&r->res_ref, 1);
++
++ if (parent) {
++ r->res_parent = parent;
++ r->res_depth = parent->res_depth + 1;
++ r->res_root = parent->res_root;
++ r->res_nodeid = parent->res_nodeid;
++ } else {
++ r->res_parent = NULL;
++ r->res_depth = 1;
++ r->res_root = r;
++ r->res_nodeid = -1;
++ }
++
++ write_lock(&ls->ls_reshash_lock);
++ tmp = search_hashchain(&ls->ls_reshashtbl[hash], parent, name, namelen);
++ if (tmp) {
++ write_unlock(&ls->ls_reshash_lock);
++ free_rsb(r);
++ r = tmp;
++ } else {
++ list_add(&r->res_hashchain, &ls->ls_reshashtbl[hash]);
++ write_unlock(&ls->ls_reshash_lock);
++
++ down_read(&ls->ls_gap_rsblist);
++ if (parent)
++ list_add_tail(&r->res_subreslist,
++ &r->res_root->res_subreslist);
++ else
++ list_add(&r->res_rootlist, &ls->ls_rootres);
++ up_read(&ls->ls_gap_rsblist);
++ }
++
++ out_set:
++ *rp = r;
++
++ out:
++ error = 0;
++
++ fail:
++ return error;
++}
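++
++/*
++ * Typical lookup/use/release pattern (a sketch); release_rsb() drops the
++ * reference that find_or_create_rsb() took:
++ *
++ *	gd_res_t *r;
++ *	error = find_or_create_rsb(ls, NULL, name, namelen, TRUE, &r);
++ *	if (!error && r) {
++ *		... use r ...
++ *		release_rsb(r);
++ *	}
++ */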
++
++/*
++ * Add an LKB to a resource's grant/convert/wait queue, in mode order.
++ */
++
++void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode)
++{
++ gd_lkb_t *lkb = NULL;
++
++ list_for_each_entry(lkb, head, lkb_statequeue) {
++ if (lkb->lkb_rqmode < mode)
++ break;
++ }
++
++ if (!lkb) {
++ /* No entries in the queue, we are alone */
++ list_add_tail(new, head);
++ } else {
++ __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
++ }
++}
++
++/*
++ * The rsb res_lock must be held in write when this function is called.
++ */
++
++void lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
++{
++
++ GDLM_ASSERT(!lkb->lkb_status, printk("status=%u\n", lkb->lkb_status););
++
++ lkb->lkb_status = type;
++
++ switch (type) {
++ case GDLM_LKSTS_WAITING:
++ list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
++ break;
++
++ case GDLM_LKSTS_GRANTED:
++ lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
++ lkb->lkb_grmode);
++ break;
++
++ case GDLM_LKSTS_CONVERT:
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE)
++ list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
++
++ else
++ if (lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT)
++ list_add_tail(&lkb->lkb_statequeue,
++ &r->res_convertqueue);
++ else
++ lkb_add_ordered(&lkb->lkb_statequeue,
++ &r->res_convertqueue, lkb->lkb_rqmode);
++ break;
++
++ default:
++ GDLM_ASSERT(0,);
++ }
++}
++
++void res_lkb_enqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
++{
++ down_write(&r->res_lock);
++ lkb_enqueue(r, lkb, type);
++ up_write(&r->res_lock);
++}
++
++/*
++ * The rsb res_lock must be held in write when this function is called.
++ */
++
++int lkb_dequeue(gd_lkb_t *lkb)
++{
++ int status = lkb->lkb_status;
++
++ if (!status)
++ goto out;
++
++ lkb->lkb_status = 0;
++ list_del(&lkb->lkb_statequeue);
++
++ out:
++ return status;
++}
++
++int res_lkb_dequeue(gd_lkb_t *lkb)
++{
++ int status;
++
++ down_write(&lkb->lkb_resource->res_lock);
++ status = lkb_dequeue(lkb);
++ up_write(&lkb->lkb_resource->res_lock);
++
++ return status;
++}
++
++/*
++ * The rsb res_lock must be held in write when this function is called.
++ */
++
++int lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
++{
++ int status;
++
++ status = lkb_dequeue(lkb);
++ lkb_enqueue(r, lkb, type);
++
++ return status;
++}
++
++int res_lkb_swqueue(gd_res_t *r, gd_lkb_t *lkb, int type)
++{
++ int status;
++
++ down_write(&r->res_lock);
++ status = lkb_swqueue(r, lkb, type);
++ up_write(&r->res_lock);
++
++ return status;
++}
+diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h
+--- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/rsb.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,30 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RSB_DOT_H__
++#define __RSB_DOT_H__
++
++void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode);
++void _release_rsb(gd_res_t * r);
++void release_rsb(gd_res_t * r);
++void hold_rsb(gd_res_t * r);
++int find_or_create_rsb(gd_ls_t * ls, gd_res_t * parent, char *name, int namelen,
++ int create, gd_res_t ** rp);
++void lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
++void res_lkb_enqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
++int lkb_dequeue(gd_lkb_t * lkb);
++int res_lkb_dequeue(gd_lkb_t * lkb);
++int lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
++int res_lkb_swqueue(gd_res_t * r, gd_lkb_t * lkb, int type);
++
++#endif /* __RSB_DOT_H__ */
+diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c
+--- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/util.c 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,130 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "dlm_internal.h"
++
++static const uint32_t crc_32_tab[] = {
++ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
++ 0xe963a535, 0x9e6495a3,
++ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
++ 0xe7b82d07, 0x90bf1d91,
++ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb,
++ 0xf4d4b551, 0x83d385c7,
++ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
++ 0xfa0f3d63, 0x8d080df5,
++ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447,
++ 0xd20d85fd, 0xa50ab56b,
++ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75,
++ 0xdcd60dcf, 0xabd13d59,
++ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
++ 0xcfba9599, 0xb8bda50f,
++ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11,
++ 0xc1611dab, 0xb6662d3d,
++ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
++ 0x9fbfe4a5, 0xe8b8d433,
++ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
++ 0x91646c97, 0xe6635c01,
++ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b,
++ 0x8208f4c1, 0xf50fc457,
++ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49,
++ 0x8cd37cf3, 0xfbd44c65,
++ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
++ 0xa4d1c46d, 0xd3d6f4fb,
++ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
++ 0xaa0a4c5f, 0xdd0d7cc9,
++ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3,
++ 0xb966d409, 0xce61e49f,
++ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
++ 0xb7bd5c3b, 0xc0ba6cad,
++ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af,
++ 0x04db2615, 0x73dc1683,
++ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d,
++ 0x0a00ae27, 0x7d079eb1,
++ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
++ 0x196c3671, 0x6e6b06e7,
++ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
++ 0x17b7be43, 0x60b08ed5,
++ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767,
++ 0x3fb506dd, 0x48b2364b,
++ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
++ 0x316e8eef, 0x4669be79,
++ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
++ 0x220216b9, 0x5505262f,
++ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
++ 0x2cd99e8b, 0x5bdeae1d,
++ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
++ 0x72076785, 0x05005713,
++ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d,
++ 0x7cdcefb7, 0x0bdbdf21,
++ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b,
++ 0x6fb077e1, 0x18b74777,
++ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
++ 0x616bffd3, 0x166ccf45,
++ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
++ 0x4969474d, 0x3e6e77db,
++ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
++ 0x47b2cf7f, 0x30b5ffe9,
++ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
++ 0x54de5729, 0x23d967bf,
++ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1,
++ 0x5a05df1b, 0x2d02ef8d
++};
++
++/**
++ * gdlm_hash - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ *
++ * Copied from GFS.
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * The hash function is a 32-bit CRC of the data. The algorithm uses
++ * the crc_32_tab table above.
++ *
++ * This may not be the fastest hash function, but it does a fair bit better
++ * at providing uniform results than the others I've looked at. That's
++ * really important for efficient directories.
++ *
++ * Returns: the hash
++ */
++
++uint32_t gdlm_hash(const char *data, int len)
++{
++ uint32_t hash = 0xFFFFFFFF;
++
++ for (; len--; data++)
++ hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
++
++ hash = ~hash;
++
++ return hash;
++}
++
++uint32_t gdlm_next_power2(uint32_t val)
++{
++ uint32_t x;
++
++ for (x = 1; x < val; x <<= 1) ;
++
++ return x;
++}
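++
++/*
++ * These two are used together to size and index the resource hash table
++ * (see find_or_create_rsb in rsb.c). Sizing the table to a power of two
++ * lets a mask replace a modulo; the first line below is a sketch of the
++ * lockspace setup done elsewhere:
++ *
++ *	ls->ls_hashmask = gdlm_next_power2(n) - 1;
++ *	bucket = gdlm_hash(name, namelen) & ls->ls_hashmask;
++ */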
++
++void print_lkb(gd_lkb_t *lkb)
++{
++ printk("dlm: lkb id=%x remid=%x flags=%x status=%x rq=%d gr=%d "
++ "nodeid=%u lqstate=%x lqflags=%x\n",
++ lkb->lkb_id, lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_status,
++ lkb->lkb_rqmode, lkb->lkb_grmode, lkb->lkb_nodeid,
++ lkb->lkb_lockqueue_state, lkb->lkb_lockqueue_flags);
++}
+diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h
+--- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/cluster/dlm/util.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __UTIL_DOT_H__
++#define __UTIL_DOT_H__
++
++uint32_t gdlm_hash(const char *data, int len);
++uint32_t gdlm_next_power2(uint32_t val);
++
++void print_lkb(gd_lkb_t *lkb);
++
++#endif
+diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h
+--- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/include/cluster/dlm.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,404 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DLM_DOT_H__
++#define __DLM_DOT_H__
++
++/*
++ * Interface to DLM - routines and structures to use DLM lockspaces.
++ */
++
++/*
++ * Lock Modes
++ */
++
++#define DLM_LOCK_IV (-1) /* invalid */
++#define DLM_LOCK_NL (0) /* null */
++#define DLM_LOCK_CR (1) /* concurrent read */
++#define DLM_LOCK_CW (2) /* concurrent write */
++#define DLM_LOCK_PR (3) /* protected read */
++#define DLM_LOCK_PW (4) /* protected write */
++#define DLM_LOCK_EX (5) /* exclusive */
++
++/*
++ * Maximum size in bytes of a dlm_lock name
++ */
++
++#define DLM_RESNAME_MAXLEN (64)
++
++/*
++ * Size in bytes of Lock Value Block
++ */
++
++#define DLM_LVB_LEN (32)
++
++/*
++ * Flags to dlm_new_lockspace
++ *
++ * DLM_LSF_NOTIMERS
++ *
++ * Do not subject locks in this lockspace to time-outs.
++ *
++ */
++
++#define DLM_LSF_NOTIMERS (1)
++
++/*
++ * Flags to dlm_lock
++ *
++ * DLM_LKF_NOQUEUE
++ *
++ * Do not queue the lock request on the wait queue if it cannot be granted
++ * immediately. If the lock cannot be granted because of this flag, DLM will
++ * either return -EAGAIN from the dlm_lock call or will return 0 from
++ * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
++ *
++ * DLM_LKF_CONVERT
++ *
++ * Indicates a lock conversion request. For conversions the name and namelen
++ * are ignored and the lock ID in the LKSB is used to identify the lock.
++ *
++ * DLM_LKF_VALBLK
++ *
++ * Requests DLM to return the current contents of the lock value block in the
++ * lock status block. When this flag is set in a lock conversion from PW or EX
++ * modes, DLM assigns the value specified in the lock status block to the lock
++ * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
++ * containing application-specific information.
++ *
++ * DLM_LKF_QUECVT
++ *
++ * Force a conversion lock request to the back of the convert queue. All other
++ * conversion requests ahead of it must be granted before it can be granted.
++ * This enforces a FIFO ordering on the convert queue. When this flag is set,
++ * indefinite postponement is averted. This flag is allowed only when
++ * converting a lock to a more restrictive mode.
++ *
++ * DLM_LKF_CANCEL
++ *
++ * Used to cancel a pending conversion (with dlm_unlock). The lock is
++ * returned to its previously granted mode.
++ *
++ * DLM_LKF_IVVALBLK
++ *
++ * Invalidate/clear the lock value block.
++ *
++ * DLM_LKF_CONVDEADLK
++ *
++ * The granted mode of a lock being converted (from a non-NL mode) can be
++ * changed to NL in the process of acquiring the requested mode to avoid
++ * conversion deadlock.
++ *
++ * DLM_LKF_PERSISTENT
++ *
++ * Only relevant to locks originating in userspace. Signals to the ioctl.c code
++ * that this lock should not be unlocked when the process exits.
++ *
++ * DLM_LKF_NODLCKWT
++ *
++ * This lock is not to be checked for conversion deadlocks.
++ *
++ * DLM_LKF_NODLCKBLK
++ *
++ * not yet implemented
++ *
++ * DLM_LKF_EXPEDITE
++ *
++ * If this lock conversion cannot be granted immediately it is to go to the
++ * head of the conversion queue regardless of its requested lock mode.
++ *
++ * DLM_LKF_NOQUEUEBAST
++ *
++ * Send blocking ASTs before returning -EAGAIN to the caller. This flag is
++ * only used along with the NOQUEUE flag; blocking ASTs are not otherwise
++ * sent for failed NOQUEUE requests.
++ *
++ */
++
++#define DLM_LKF_NOQUEUE (0x00000001)
++#define DLM_LKF_CANCEL (0x00000002)
++#define DLM_LKF_CONVERT (0x00000004)
++#define DLM_LKF_VALBLK (0x00000008)
++#define DLM_LKF_QUECVT (0x00000010)
++#define DLM_LKF_IVVALBLK (0x00000020)
++#define DLM_LKF_CONVDEADLK (0x00000040)
++#define DLM_LKF_PERSISTENT (0x00000080)
++#define DLM_LKF_NODLCKWT (0x00000100)
++#define DLM_LKF_NODLCKBLK (0x00000200)
++#define DLM_LKF_EXPEDITE (0x00000400)
++#define DLM_LKF_NOQUEUEBAST (0x00000800)
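++
++/*
++ * For example (illustrative only), a conversion request that must not
++ * block and that wants the LVB returned would pass:
++ *
++ *	flags = DLM_LKF_CONVERT | DLM_LKF_NOQUEUE | DLM_LKF_VALBLK;
++ */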
++
++/*
++ * Some return codes that are not in errno.h
++ */
++
++#define DLM_ECANCEL (0x10001)
++#define DLM_EUNLOCK (0x10002)
++
++typedef void dlm_lockspace_t;
++
++/*
++ * Lock range structure
++ */
++
++struct dlm_range {
++ uint64_t ra_start;
++ uint64_t ra_end;
++};
++
++/*
++ * Lock status block
++ *
++ * Use this structure to specify the contents of the lock value block. For a
++ * conversion request, this structure is used to specify the lock ID of the
++ * lock. DLM writes the status of the lock request and the lock ID assigned
++ * to the request in the lock status block.
++ *
++ * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
++ * It is available when dlm_lock returns.
++ *
++ * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
++ * shown for the DLM_LKF_VALBLK flag.
++ *
++ * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
++ * it was first demoted to NL to avoid conversion deadlock.
++ *
++ * sb_status: the returned status of the lock request set prior to AST
++ * execution. Possible return values:
++ *
++ * 0 if lock request was successful
++ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
++ * -ENOMEM if there is no memory to process request
++ * -EINVAL if there are invalid parameters
++ * -DLM_EUNLOCK if unlock request was successful
++ * -DLM_ECANCEL ?
++ */
++
++#define DLM_SBF_DEMOTED (0x01)
++
++struct dlm_lksb {
++ int sb_status;
++ uint32_t sb_lkid;
++ char sb_flags;
++ char * sb_lvbptr;
++};
++
++/*
++ * These defines are the bits that make up the
++ * query code.
++ */
++
++/* Bits 0, 1, 2: the lock mode or DLM_LOCK_THIS; see DLM_LOCK_NL etc. in
++ * dlm.h. Ignored for DLM_QUERY_LOCKS_ALL */
++#define DLM_LOCK_THIS 0x0007
++#define DLM_QUERY_MODE_MASK 0x0007
++
++/* Bits 3, 4, 5 bitmap of queue(s) to query */
++#define DLM_QUERY_QUEUE_WAIT 0x0008
++#define DLM_QUERY_QUEUE_CONVERT 0x0010
++#define DLM_QUERY_QUEUE_GRANT 0x0020
++#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */
++#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */
++
++/* Bit 6, Return only the information that can be established without a network
++ * round-trip. The caller must be aware of the implications of this. Useful for
++ * just getting the master node id or resource name. */
++#define DLM_QUERY_LOCAL 0x0040
++
++/* Bits 8 up, query type */
++#define DLM_QUERY_LOCKS_HIGHER 0x0100
++#define DLM_QUERY_LOCKS_LOWER 0x0200
++#define DLM_QUERY_LOCKS_EQUAL 0x0300
++#define DLM_QUERY_LOCKS_BLOCKING 0x0400
++#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500
++#define DLM_QUERY_LOCKS_ALL 0x0600
++#define DLM_QUERY_MASK 0x0F00
++
++/* GRMODE is the default for mode comparisons,
++ RQMODE might also be handy */
++#define DLM_QUERY_GRMODE 0x0000
++#define DLM_QUERY_RQMODE 0x1000
++
++/* Structures passed into and out of the query */
++
++struct dlm_lockinfo {
++ int lki_lkid; /* Lock ID on originating node */
++ int lki_mstlkid; /* Lock ID on master node */
++ int lki_parent;
++ int lki_node; /* Originating node (not master) */
++ uint8_t lki_state; /* Queue the lock is on */
++ uint8_t lki_grmode; /* Granted mode */
++ uint8_t lki_rqmode; /* Requested mode */
++ struct dlm_range lki_grrange; /* Granted range, if applicable */
++ struct dlm_range lki_rqrange; /* Requested range, if applicable */
++};
++
++struct dlm_resinfo {
++ int rsi_length;
++ int rsi_grantcount; /* No. of nodes on grant queue */
++ int rsi_convcount; /* No. of nodes on convert queue */
++ int rsi_waitcount; /* No. of nodes on wait queue */
++ int rsi_masternode; /* Master for this resource */
++ char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */
++ char rsi_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable */
++};
++
++struct dlm_queryinfo {
++ struct dlm_resinfo *gqi_resinfo;
++ struct dlm_lockinfo *gqi_lockinfo; /* This points to an array
++ * of structs */
++ int gqi_locksize; /* input */
++ int gqi_lockcount; /* output */
++};
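++
++/*
++ * For example (illustrative only), to ask for every lock on every queue,
++ * regardless of mode, compared by granted mode:
++ *
++ *	query = DLM_QUERY_QUEUE_ALL | DLM_QUERY_LOCKS_ALL | DLM_QUERY_GRMODE;
++ */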
++
++#ifdef __KERNEL__
++/*
++ * dlm_init
++ *
++ * Starts and initializes DLM threads and structures. Creation of the first
++ * lockspace will call this if it has not been called already.
++ *
++ * Returns: 0 if successful, -EXXX on error
++ */
++
++int dlm_init(void);
++
++/*
++ * dlm_release
++ *
++ * Stops DLM threads.
++ *
++ * Returns: 0 if successful, -EXXX on error
++ */
++
++int dlm_release(void);
++
++/*
++ * dlm_new_lockspace
++ *
++ * Starts a lockspace with the given name. If the named lockspace exists in
++ * the cluster, the calling node joins it.
++ */
++
++int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
++ int flags);
++
++/*
++ * dlm_release_lockspace
++ *
++ * Stop a lockspace.
++ */
++
++int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
++
++/*
++ * dlm_lock
++ *
++ * Make an asynchronous request to acquire or convert a lock on a named
++ * resource.
++ *
++ * lockspace: context for the request
++ * mode: the requested mode of the lock (DLM_LOCK_)
++ * lksb: lock status block for input and async return values
++ * flags: input flags (DLM_LKF_)
++ * name: name of the resource to lock, can be binary
++ * namelen: the length in bytes of the resource name (at most DLM_RESNAME_MAXLEN)
++ * parent: the lock ID of a parent lock or 0 if none
++ * lockast: function DLM executes when it completes processing the request
++ * astarg: argument passed to lockast and bast functions
++ * bast: function DLM executes when this lock later blocks another request
++ *
++ * Returns:
++ * 0 if request is successfully queued for processing
++ * -EINVAL if any input parameters are invalid
++ * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
++ * -ENOMEM if there is no memory to process request
++ * -ENOTCONN if there is a communication error
++ *
++ * If the call to dlm_lock returns an error then the operation has failed and
++ * the AST routine will not be called. If dlm_lock returns 0 it is still
++ * possible that the lock operation will fail. The AST routine will be called
++ * when the locking is complete and the status is returned in the lksb.
++ *
++ * If AST routines or an AST argument are passed to a conversion operation,
++ * they overwrite the values that were passed to the previous dlm_lock
++ * call.
++ *
++ * AST routines should not block (at least not for long), but may make
++ * any locking calls they please.
++ */
++
++int dlm_lock(dlm_lockspace_t *lockspace,
++ uint32_t mode,
++ struct dlm_lksb *lksb,
++ uint32_t flags,
++ void *name,
++ unsigned int namelen,
++ uint32_t parent,
++ void (*lockast) (void *astarg),
++ void *astarg,
++ void (*bast) (void *astarg, int mode),
++ struct dlm_range *range);
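++
++/*
++ * A minimal synchronous wrapper over the interface above (a sketch; a
++ * real caller would typically embed the completion in its own per-lock
++ * structure rather than keep it on the stack):
++ *
++ *	static void sync_ast(void *astarg)
++ *	{
++ *		complete((struct completion *) astarg);
++ *	}
++ *
++ *	int lock_sync(dlm_lockspace_t *ls, uint32_t mode,
++ *		      struct dlm_lksb *lksb, uint32_t flags,
++ *		      void *name, unsigned int namelen)
++ *	{
++ *		DECLARE_COMPLETION(done);
++ *		int error;
++ *
++ *		error = dlm_lock(ls, mode, lksb, flags, name, namelen,
++ *				 0, sync_ast, &done, NULL, NULL);
++ *		if (!error) {
++ *			wait_for_completion(&done);
++ *			error = lksb->sb_status;
++ *		}
++ *		return error;
++ *	}
++ */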
++
++/*
++ * dlm_unlock
++ *
++ * Asynchronously release a lock on a resource. The AST routine is called
++ * when the resource is successfully unlocked.
++ *
++ * lockspace: context for the request
++ * lkid: the lock ID as returned in the lksb
++ * flags: input flags (DLM_LKF_)
++ * lksb: if NULL the lksb parameter passed to last lock request is used
++ * astarg: if NULL, astarg in last lock request is used
++ *
++ * Returns:
++ * 0 if request is successfully queued for processing
++ * -EINVAL if any input parameters are invalid
++ * -ENOTEMPTY if the lock still has sublocks
++ * -EBUSY if the lock is waiting for a remote lock operation
++ * -ENOTCONN if there is a communication error
++ */
++
++extern int dlm_unlock(dlm_lockspace_t *lockspace,
++ uint32_t lkid,
++ uint32_t flags,
++ struct dlm_lksb *lksb,
++ void *astarg);
++
++/* Query interface
++ *
++ * Query the other holders of a resource, given a known lock ID
++ *
++ * lockspace: context for the request
++ * lksb: LKSB, sb_lkid contains the lock ID of a valid lock
++ * on the resource. sb_status will contain the status
++ * of the request on completion.
++ * query: query bitmap see DLM_QUERY_* above
++ * qinfo: pointer to dlm_queryinfo structure
++ * ast_routine: AST routine to call on completion
++ * astarg: argument to AST routine. It is "traditional"
++ * to put the qinfo pointer into lksb->sb_lvbptr
++ * and pass the lksb in here.
++ */
++extern int dlm_query(dlm_lockspace_t *lockspace,
++ struct dlm_lksb *lksb,
++ int query,
++ struct dlm_queryinfo *qinfo,
++ void (ast_routine(void *)),
++ void *astarg);
++
++#endif /* __KERNEL__ */
++
++#endif /* __DLM_DOT_H__ */
+diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h
+--- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730
++++ linux-patched/include/cluster/dlm_device.h 2004-06-25 18:31:07.000000000 +0800
+@@ -0,0 +1,63 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* This is the device interface for dlm; most users will use a library
++ * interface.
++ */
++
++/* Version of the device interface */
++#define DLM_DEVICE_VERSION_MAJOR 2
++#define DLM_DEVICE_VERSION_MINOR 0
++#define DLM_DEVICE_VERSION_PATCH 0
++
++/* struct passed to the lock write */
++struct dlm_lock_params {
++ uint32_t version[3];
++ uint8_t cmd;
++ uint8_t mode;
++ uint16_t flags;
++ uint32_t lkid;
++ uint32_t parent;
++ struct dlm_range range;
++ uint8_t namelen;
++ void *astparam;
++ void *astaddr;
++ void *bastaddr;
++ struct dlm_lksb *lksb;
++ char name[1];
++};
++
++
++/* struct read from the "device" fd; it consists mainly of userspace
++ pointers for the library to use */
++struct dlm_lock_result {
++ uint8_t cmd;
++ void *astparam;
++ void (*astaddr)(void *astparam);
++ struct dlm_lksb *user_lksb;
++ struct dlm_lksb lksb; /* But this has real data in it */
++ uint8_t bast_mode; /* Not yet used */
++};
++
++/* commands passed to the device */
++#define DLM_USER_LOCK 1
++#define DLM_USER_UNLOCK 2
++#define DLM_USER_QUERY 3
++
++/* Arbitrary length restriction */
++#define MAX_LS_NAME_LEN 64
++
++/* ioctls on the device */
++#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *)
++#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *)
++#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)
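+#
+# A userspace library would drive this interface roughly as follows (a
+# sketch; the device node name and the exact read/write framing beyond
+# the structs above are assumptions):
+#
+#	fd = open("/dev/dlm", O_RDWR);
+#	ioctl(fd, DLM_CREATE_LOCKSPACE, "myls");
+#	write(fd, &params, sizeof(params) + params.namelen); /* DLM_USER_LOCK */
+#	read(fd, &result, sizeof(result)); /* completed request + lksb data */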
--- /dev/null
+# Make the VFS call down into the FS on flock calls.
+diff -urN -p linux-2.6.7/fs/locks.c linux/fs/locks.c
+--- linux-2.6.7/fs/locks.c 2004-06-16 12:00:44.567463632 -0500
++++ linux/fs/locks.c 2004-06-16 12:01:58.844205936 -0500
+@@ -1294,6 +1294,27 @@ out_unlock:
+ return error;
+ }
+
++/*
++ * Wrapper function around the file_operations lock routine when called for
++ * flock(). The lock routine is called for both fcntl() and flock(), so
++ * the flock parameters must be translated to an equivalent fcntl()-like
++ * lock.
++ *
++ * Don't use locks_alloc_lock() (or flock_make_lock()) here, as
++ * this is just a temporary lock structure. We especially don't
++ * want to fail because we couldn't allocate a lock structure if
++ * this is an unlock operation.
++ */
++int flock_fs_file(struct file *filp, int type, int wait)
++{
++ struct file_lock fl = { .fl_flags = FL_FLOCK,
++ .fl_type = type };
++
++ return filp->f_op->lock(filp,
++ (wait) ? F_SETLKW : F_SETLK,
++ &fl);
++}
++
+ /**
+ * sys_flock: - flock() system call.
+ * @fd: the file descriptor to lock.
+@@ -1342,6 +1363,50 @@ asmlinkage long sys_flock(unsigned int f
+ if (error)
+ goto out_free;
+
++ /*
++ * Execute any filesystem-specific flock routines. The filesystem may
++ * maintain supplemental locks. This code allows the supplemental locks
++ * to be kept in sync with the vfs flock lock. If flock() is called on
++ * a lock already held for the given filp, the current flock lock is
++ * dropped before obtaining the requested lock. This unlock operation
++ * must be completed for any filesystem-specific locks and the vfs
++ * flock lock before proceeding with obtaining the requested lock. When
++ * the filesystem routine drops a lock for such a request, it must
++ * return -EDEADLK, allowing the vfs lock to be dropped, and the
++ * filesystem code is then re-executed to obtain the lock.
++ *
++ * A non-blocking request that returns EWOULDBLOCK also causes any vfs
++ * flock lock to be released, but then returns the error to the caller.
++ */
++ if (filp->f_op && filp->f_op->lock) {
++ repeat:
++ error = flock_fs_file(filp, lock->fl_type, can_sleep);
++ if (error < 0) {
++ /*
++ * We may have dropped a lock. We need to
++ * finish unlocking before returning or
++ * continuing with lock acquisition.
++ */
++ if (error != -ENOLCK)
++ flock_lock_file(filp, &(struct file_lock){.fl_type = F_UNLCK});
++
++ /*
++ * We already held the lock in some mode, and
++ * had to drop filesystem-specific locks before
++ * proceeding. We come back through this
++ * routine to unlock the vfs flock lock. Now go
++ * back and try again. Using EAGAIN as the
++ * error here would be better, but the one valid
++ * error value defined for flock(), EWOULDBLOCK,
++ * is defined as EAGAIN.
++ */
++ if (error == -EDEADLK)
++ goto repeat;
++
++ goto out_free;
++ }
++ }
++
+ for (;;) {
+ error = flock_lock_file(filp, lock);
+ if ((error != -EAGAIN) || !can_sleep)
+@@ -1354,6 +1419,13 @@ asmlinkage long sys_flock(unsigned int f
+ break;
+ }
+
++ /*
++ * If we failed to get the vfs flock, we need to clean up any
++ * filesystem-specific lock state that we previously obtained.
++ */
++ if (error && filp->f_op && filp->f_op->lock)
++ flock_fs_file(filp, F_UNLCK, 1);
++
+ out_free:
+ if (list_empty(&lock->fl_link)) {
+ locks_free_lock(lock);
+@@ -1714,6 +1786,8 @@ void locks_remove_flock(struct file *fil
+ if (fl->fl_file == filp) {
+ if (IS_FLOCK(fl)) {
+ locks_delete_lock(before);
++ if (filp->f_op && filp->f_op->lock)
++ flock_fs_file(filp, F_UNLCK, 1);
+ continue;
+ }
+ if (IS_LEASE(fl)) {
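+#
+# Under the protocol above, a filesystem ->lock() routine handling the
+# flock path behaves roughly as follows (a sketch with hypothetical
+# myfs_* helpers, not part of the patch):
+#
+#	int myfs_lock(struct file *file, int cmd, struct file_lock *fl)
+#	{
+#		if (fl->fl_type == F_UNLCK)
+#			return myfs_drop_flock(file);
+#
+#		/* Already holding the flock in another mode?  Drop it and
+#		   return -EDEADLK so the VFS unlocks its lock and retries. */
+#		if (myfs_flock_held(file)) {
+#			myfs_drop_flock(file);
+#			return -EDEADLK;
+#		}
+#
+#		/* -EAGAIN from a non-blocking request makes sys_flock drop
+#		   the vfs flock lock and return EWOULDBLOCK. */
+#		return myfs_get_flock(file, fl->fl_type, cmd == F_SETLKW);
+#	}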
+# Add lock harness to the build system.
+diff -urN -p linux-2.6.7/fs/Kconfig linux/fs/Kconfig
+--- linux-2.6.7/fs/Kconfig 2004-06-16 12:00:44.558465722 -0500
++++ linux/fs/Kconfig 2004-06-16 12:02:02.401379449 -0500
+@@ -1669,6 +1669,14 @@ config AFS_FS
+ config RXRPC
+ tristate
+
++config LOCK_HARNESS
++ tristate "GFS Lock Harness"
++ help
++ The module that connects GFS to the modules that provide
++ locking for GFS.
++
++ If you want to use GFS (a cluster filesystem), say Y here.
++
+ endmenu
+
+ menu "Partition Types"
+diff -urN -p linux-2.6.7/fs/Makefile linux/fs/Makefile
+--- linux-2.6.7/fs/Makefile 2004-06-16 12:00:44.558465722 -0500
++++ linux/fs/Makefile 2004-06-16 12:02:02.402379216 -0500
+@@ -91,3 +91,4 @@ obj-$(CONFIG_JFS_FS) += jfs/
+ obj-$(CONFIG_XFS_FS) += xfs/
+ obj-$(CONFIG_AFS_FS) += afs/
+ obj-$(CONFIG_BEFS_FS) += befs/
++obj-$(CONFIG_LOCK_HARNESS) += gfs_locking/
+diff -urN -p linux-2.6.7/fs/gfs_locking/Makefile linux/fs/gfs_locking/Makefile
+--- linux-2.6.7/fs/gfs_locking/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs_locking/Makefile 2004-06-16 12:02:02.402379216 -0500
+@@ -0,0 +1,14 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_LOCK_HARNESS) += lock_harness/
++
+diff -urN -p linux-2.6.7/fs/gfs_locking/lock_harness/Makefile linux/fs/gfs_locking/lock_harness/Makefile
+--- linux-2.6.7/fs/gfs_locking/lock_harness/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs_locking/lock_harness/Makefile 2004-06-16 12:02:02.402379216 -0500
+@@ -0,0 +1,16 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_LOCK_HARNESS) += lock_harness.o
++
++lock_harness-y := main.o
++
+# Add GFS to the build system.
+diff -urN -p linux-2.6.7/fs/Kconfig linux/fs/Kconfig
+--- linux-2.6.7/fs/Kconfig 2004-06-25 13:57:24.435829621 -0500
++++ linux/fs/Kconfig 2004-06-25 13:59:16.786347614 -0500
+@@ -316,13 +316,13 @@ config JFS_STATISTICS
+ to be made available to the user in the /proc/fs/jfs/ directory.
+
+ config FS_POSIX_ACL
+-# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs)
++# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/GFS)
+ #
+ # NOTE: you can implement Posix ACLs without these helpers (XFS does).
+ # Never use this symbol for ifdefs.
+ #
+ bool
+- depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL
++ depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL || GFS_FS
+ default y
+
+ config XFS_FS
+@@ -1677,6 +1677,20 @@ config LOCK_HARNESS
+
+ If you want to use GFS (a cluster filesystem) say Y here.
+
++config GFS_FS
++ tristate "GFS file system support"
++ depends on LOCK_HARNESS
++ help
++ A cluster filesystem.
++
++ Allows a cluster of computers to simultaneously use a block device
++ that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads
++ and writes to the block device like a local filesystem, but also uses
++ a lock module to allow the computers to coordinate their I/O so that
++ filesystem consistency is maintained. One of the nifty features of
++ GFS is perfect consistency -- changes made to the filesystem on one
++ machine show up immediately on all other machines in the cluster.
++
+ endmenu
+
+ menu "Partition Types"
+diff -urN -p linux-2.6.7/fs/Makefile linux/fs/Makefile
+--- linux-2.6.7/fs/Makefile 2004-06-25 13:57:24.436829391 -0500
++++ linux/fs/Makefile 2004-06-25 13:57:24.447826863 -0500
+@@ -92,3 +92,4 @@ obj-$(CONFIG_XFS_FS) += xfs/
+ obj-$(CONFIG_AFS_FS) += afs/
+ obj-$(CONFIG_BEFS_FS) += befs/
+ obj-$(CONFIG_LOCK_HARNESS) += gfs_locking/
++obj-$(CONFIG_GFS_FS) += gfs/
+diff -urN -p linux-2.6.7/fs/gfs/Makefile linux/fs/gfs/Makefile
+--- linux-2.6.7/fs/gfs/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs/Makefile 2004-06-25 13:57:24.448826633 -0500
+@@ -0,0 +1,51 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_GFS_FS) += gfs.o
++
++gfs-y := acl.o \
++ bits.o \
++ bmap.o \
++ daemon.o \
++ dio.o \
++ dir.o \
++ eattr.o \
++ file.o \
++ flock.o \
++ glock.o \
++ glops.o \
++ inode.o \
++ ioctl.o \
++ locking.o \
++ log.o \
++ lops.o \
++ lvb.o \
++ main.o \
++ mount.o \
++ ondisk.o \
++ ops_address.o \
++ ops_dentry.o \
++ ops_export.o \
++ ops_file.o \
++ ops_fstype.o \
++ ops_inode.o \
++ ops_super.o \
++ ops_vm.o \
++ page.o \
++ quota.o \
++ recovery.o \
++ rgrp.o \
++ super.o \
++ trans.o \
++ unlinked.o \
++ util.o
++
+# Add lock_nolock to the build system.
+diff -urN -p linux-2.6.7/fs/Kconfig linux/fs/Kconfig
+--- linux-2.6.7/fs/Kconfig 2004-06-16 12:02:09.563715325 -0500
++++ linux/fs/Kconfig 2004-06-16 12:02:09.574712769 -0500
+@@ -1691,6 +1691,12 @@ config GFS_FS
+ GFS is perfect consistency -- changes made to the filesystem on one
+ machine show up immediately on all other machines in the cluster.
+
++config LOCK_NOLOCK
++ tristate "Lock Nolock"
++ depends on LOCK_HARNESS
++ help
++ A "fake" lock module that allows GFS to run as a local filesystem.
++
+ endmenu
+
+ menu "Partition Types"
+diff -urN -p linux-2.6.7/fs/gfs_locking/Makefile linux/fs/gfs_locking/Makefile
+--- linux-2.6.7/fs/gfs_locking/Makefile 2004-06-16 12:02:05.985546690 -0500
++++ linux/fs/gfs_locking/Makefile 2004-06-16 12:02:09.574712769 -0500
+@@ -11,4 +11,5 @@
+ ###############################################################################
+
+ obj-$(CONFIG_LOCK_HARNESS) += lock_harness/
++obj-$(CONFIG_LOCK_NOLOCK) += lock_nolock/
+
+diff -urN -p linux-2.6.7/fs/gfs_locking/lock_nolock/Makefile linux/fs/gfs_locking/lock_nolock/Makefile
+--- linux-2.6.7/fs/gfs_locking/lock_nolock/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs_locking/lock_nolock/Makefile 2004-06-16 12:02:09.575712537 -0500
+@@ -0,0 +1,16 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_LOCK_NOLOCK) += lock_nolock.o
++
++lock_nolock-y := main.o
++
+# Add lock_dlm to the build system.
+diff -urN -p linux-2.6.7/fs/Kconfig linux/fs/Kconfig
+--- linux-2.6.7/fs/Kconfig 2004-06-16 12:02:13.145883030 -0500
++++ linux/fs/Kconfig 2004-06-16 12:02:13.157880243 -0500
+@@ -1697,6 +1697,12 @@ config LOCK_NOLOCK
+ help
+ A "fake" lock module that allows GFS to run as a local filesystem.
+
++config LOCK_DLM
++ tristate "Lock DLM"
++ depends on LOCK_HARNESS
++ help
++ A lock module that allows GFS to use a Distributed Lock Manager.
++
+ endmenu
+
+ menu "Partition Types"
+diff -urN -p linux-2.6.7/fs/gfs_locking/Makefile linux/fs/gfs_locking/Makefile
+--- linux-2.6.7/fs/gfs_locking/Makefile 2004-06-16 12:02:13.146882798 -0500
++++ linux/fs/gfs_locking/Makefile 2004-06-16 12:02:13.157880243 -0500
+@@ -12,4 +12,5 @@
+
+ obj-$(CONFIG_LOCK_HARNESS) += lock_harness/
+ obj-$(CONFIG_LOCK_NOLOCK) += lock_nolock/
++obj-$(CONFIG_LOCK_DLM) += lock_dlm/
+
+diff -urN -p linux-2.6.7/fs/gfs_locking/lock_dlm/Makefile linux/fs/gfs_locking/lock_dlm/Makefile
+--- linux-2.6.7/fs/gfs_locking/lock_dlm/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs_locking/lock_dlm/Makefile 2004-06-16 12:02:13.157880243 -0500
+@@ -0,0 +1,16 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_LOCK_DLM) += lock_dlm.o
++
++lock_dlm-y := main.o group.o lock.o mount.o thread.o plock.o
++
+# Add lock_gulm to the build system.
+diff -urN -p linux-2.6.7/fs/Kconfig linux/fs/Kconfig
+--- linux-2.6.7/fs/Kconfig 2004-06-16 12:02:16.816030294 -0500
++++ linux/fs/Kconfig 2004-06-16 12:02:16.827027739 -0500
+@@ -1703,6 +1703,12 @@ config LOCK_DLM
+ help
+ A lock module that allows GFS to use a Distributed Lock Manager.
+
++config LOCK_GULM
++ tristate "Lock GULM"
++ depends on LOCK_HARNESS
++ help
++ A lock module that allows GFS to use a Failover Lock Manager.
++
+ endmenu
+
+ menu "Partition Types"
+diff -urN -p linux-2.6.7/fs/gfs_locking/Makefile linux/fs/gfs_locking/Makefile
+--- linux-2.6.7/fs/gfs_locking/Makefile 2004-06-16 12:02:16.817030062 -0500
++++ linux/fs/gfs_locking/Makefile 2004-06-16 12:02:16.828027507 -0500
+@@ -13,4 +13,5 @@
+ obj-$(CONFIG_LOCK_HARNESS) += lock_harness/
+ obj-$(CONFIG_LOCK_NOLOCK) += lock_nolock/
+ obj-$(CONFIG_LOCK_DLM) += lock_dlm/
++obj-$(CONFIG_LOCK_GULM) += lock_gulm/
+
+diff -urN -p linux-2.6.7/fs/gfs_locking/lock_gulm/Makefile linux/fs/gfs_locking/lock_gulm/Makefile
+--- linux-2.6.7/fs/gfs_locking/lock_gulm/Makefile 1969-12-31 18:00:00.000000000 -0600
++++ linux/fs/gfs_locking/lock_gulm/Makefile 2004-06-16 12:02:16.828027507 -0500
+@@ -0,0 +1,33 @@
++###############################################################################
++###############################################################################
++##
++## Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++##
++## This copyrighted material is made available to anyone wishing to use,
++## modify, copy, or redistribute it subject to the terms and conditions
++## of the GNU General Public License v.2.
++##
++###############################################################################
++###############################################################################
++
++obj-$(CONFIG_LOCK_GULM) += lock_gulm.o
++
++lock_gulm-y := gulm_core.o \
++ gulm_fs.o \
++ gulm_jid.o \
++ gulm_lt.o \
++ gulm_procinfo.o \
++ handler.o \
++ lg_core.o \
++ lg_lock.o \
++ lg_main.o \
++ linux_gulm_main.o \
++ load_info.o \
++ util.o \
++ utils_crc.o \
++ utils_tostr.o \
++ utils_verb_flags.o \
++ xdr_base.o \
++ xdr_io.o \
++ xdr_socket.o
++
+diff -urN linux-orig/fs/gfs/acl.c linux-patched/fs/gfs/acl.c
+--- linux-orig/fs/gfs/acl.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/acl.c 2004-06-20 22:48:17.946947249 -0500
+@@ -0,0 +1,397 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/xattr_acl.h>
++
++#include "gfs.h"
++#include "acl.h"
++#include "dio.h"
++#include "eattr.h"
++#include "glock.h"
++#include "trans.h"
++#include "inode.h"
++
++/*
++ * Check to make sure that the acl is actually valid
++ */
++int
++gfs_validate_acl(struct gfs_inode *ip, const char *value, int size, int access)
++{
++ int err = 0;
++ struct posix_acl *acl = NULL;
++ struct gfs_sbd *sdp = ip->i_sbd;
++
++ if ((current->fsuid != ip->i_di.di_uid) && !capable(CAP_FOWNER))
++ return -EPERM;
++ if (ip->i_di.di_type == GFS_FILE_LNK)
++ return -EOPNOTSUPP;
++ if (!access && ip->i_di.di_type != GFS_FILE_DIR)
++ return -EACCES;
++ if (!sdp->sd_args.ar_posixacls)
++ return -EOPNOTSUPP;
++
++ if (value) {
++ acl = posix_acl_from_xattr(value, size);
++ if (IS_ERR(acl))
++ return PTR_ERR(acl);
++ else if (acl) {
++ err = posix_acl_valid(acl);
++ posix_acl_release(acl);
++ }
++ }
++ return err;
++}
++
++void
++gfs_acl_set_mode(struct gfs_inode *ip, struct posix_acl *acl)
++{
++ struct inode *inode;
++ mode_t mode;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ mode = inode->i_mode;
++ posix_acl_equiv_mode(acl, &mode);
++ inode->i_mode = mode;
++ iput(inode);
++ gfs_inode_attr_out(ip);
++}
++
++
++/**
++ * gfs_replace_acl - replace the value of the ea with the value of the acl
++ *
++ * NOTE: The new value must be the same size as the old one.
++ */
++int
++gfs_replace_acl(struct inode *inode, struct posix_acl *acl, int access,
++ struct gfs_ea_location location)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_easet_io req;
++ int size;
++ void *data;
++ int error;
++
++ size = posix_acl_to_xattr(acl, NULL, 0);
++ GFS_ASSERT(size == GFS_EA_DATA_LEN(location.ea),
++ printk("new acl size = %d, ea size = %u\n", size,
++ GFS_EA_DATA_LEN(location.ea)););
++
++ data = gmalloc(size);
++
++ posix_acl_to_xattr(acl, data, size);
++
++ req.es_data = data;
++ req.es_name = (access) ? GFS_POSIX_ACL_ACCESS : GFS_POSIX_ACL_DEFAULT;
++ req.es_data_len = size;
++ req.es_name_len = (access) ? GFS_POSIX_ACL_ACCESS_LEN : GFS_POSIX_ACL_DEFAULT_LEN;
++ req.es_cmd = GFS_EACMD_REPLACE;
++ req.es_type = GFS_EATYPE_SYS;
++
++ error = replace_ea(ip->i_sbd, ip, location.ea, &req);
++ if (!error)
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++
++ kfree(data);
++
++ return error;
++}
++
++/**
++ * gfs_findacl - returns the requested posix acl
++ *
++ * this function does not log the inode. It assumes that a lock is already
++ * held on it.
++ */
++int
++gfs_findacl(struct gfs_inode *ip, int access, struct posix_acl **acl_ptr,
++ struct gfs_ea_location *location)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct posix_acl *acl;
++ uint32_t avail_size;
++ void *data;
++ int error;
++
++ avail_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs_meta_header);
++ *acl_ptr = NULL;
++
++ if (!ip->i_di.di_eattr)
++ return 0;
++
++ error = find_eattr(ip,
++ (access) ? GFS_POSIX_ACL_ACCESS : GFS_POSIX_ACL_DEFAULT,
++ (access) ? GFS_POSIX_ACL_ACCESS_LEN : GFS_POSIX_ACL_DEFAULT_LEN,
++ GFS_EATYPE_SYS, location);
++ if (error <= 0)
++ return error;
++
++ data = gmalloc(GFS_EA_DATA_LEN(location->ea));
++
++ error = 0;
++ if (GFS_EA_IS_UNSTUFFED(location->ea))
++ error = read_unstuffed(data, ip, sdp, location->ea, avail_size,
++ gfs_ea_memcpy);
++ else
++ gfs_ea_memcpy(data, GFS_EA_DATA(location->ea),
++ GFS_EA_DATA_LEN(location->ea));
++ if (error)
++ goto out;
++
++ acl = posix_acl_from_xattr(data, GFS_EA_DATA_LEN(location->ea));
++ if (IS_ERR(acl))
++ error = PTR_ERR(acl);
++ else
++ *acl_ptr = acl;
++
++ out:
++ kfree(data);
++ if (error)
++ brelse(location->bh);
++
++ return error;
++}
++
++int
++gfs_getacl(struct inode *inode, int access, struct posix_acl **acl_ptr)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_eaget_io req;
++ struct posix_acl *acl;
++ int size;
++ void *data;
++ int error = 0;
++
++ *acl_ptr = NULL;
++
++ if (!sdp->sd_args.ar_posixacls)
++ return 0;
++
++ req.eg_name = (access) ? GFS_POSIX_ACL_ACCESS : GFS_POSIX_ACL_DEFAULT;
++ req.eg_name_len = (access) ? GFS_POSIX_ACL_ACCESS_LEN : GFS_POSIX_ACL_DEFAULT_LEN;
++ req.eg_type = GFS_EATYPE_SYS;
++ req.eg_len = NULL;
++ req.eg_data = NULL;
++ req.eg_data_len = 0;
++
++ error = gfs_ea_read_permission(&req, ip);
++ if (error)
++ return error;
++
++ if (!ip->i_di.di_eattr)
++ return error;
++
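++	/* A note on the (apparent) get_ea() contract, inferred from this
++	   caller: the first call passes a NULL eg_data buffer so it only
++	   reports the attribute's size, letting us allocate an exactly
++	   sized buffer for the second, copying call. */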
++ size = get_ea(sdp, ip, &req, gfs_ea_memcpy);
++ if (size < 0) {
++ if (size != -ENODATA)
++ error = size;
++ return error;
++ }
++
++ data = gmalloc(size);
++
++ req.eg_data = data;
++ req.eg_data_len = size;
++
++ size = get_ea(sdp, ip, &req, gfs_ea_memcpy);
++ if (size < 0) {
++ error = size;
++ goto out_free;
++ }
++
++ acl = posix_acl_from_xattr(data, size);
++ if (IS_ERR(acl))
++ error = PTR_ERR(acl);
++ else
++ *acl_ptr = acl;
++
++ out_free:
++ kfree(data);
++
++ return error;
++}
++
++int
++gfs_setup_new_acl(struct gfs_inode *dip,
++ unsigned int type, unsigned int *mode,
++ struct posix_acl **acl_ptr)
++{
++ struct gfs_ea_location location;
++ struct posix_acl *acl = NULL;
++ mode_t access_mode = *mode;
++ int error;
++
++ if (type == GFS_FILE_LNK)
++ return 0;
++
++ error = gfs_findacl(dip, FALSE, &acl, &location);
++ if (error)
++ return error;
++ if (!acl) {
++ (*mode) &= ~current->fs->umask;
++ return 0;
++ }
++ brelse(location.bh);
++
++ if (type == GFS_FILE_DIR) {
++ *acl_ptr = acl;
++ return 0;
++ }
++
++ error = posix_acl_create_masq(acl, &access_mode);
++ *mode = access_mode;
++ if (error > 0) {
++ *acl_ptr = acl;
++ return 0;
++ }
++
++ posix_acl_release(acl);
++
++ return error;
++}
++
++/**
++ * gfs_create_default_acl - initializes the default acl
++ *
++ * NOTE: gfs_init_access_acl must be called first
++ */
++int
++gfs_create_default_acl(struct gfs_inode *dip, struct gfs_inode *ip, void *data,
++ int size)
++{
++ struct gfs_easet_io req;
++ struct gfs_ea_location avail;
++ int error;
++
++ memset(&avail, 0, sizeof(struct gfs_ea_location));
++
++ req.es_data = data;
++ req.es_name = GFS_POSIX_ACL_DEFAULT;
++ req.es_data_len = size;
++ req.es_name_len = GFS_POSIX_ACL_DEFAULT_LEN;
++ req.es_cmd = GFS_EACMD_CREATE;
++ req.es_type = GFS_EATYPE_SYS;
++
++ error = find_sys_space(dip, ip, size, &avail);
++ if (error)
++ return error;
++
++ avail.ea = prep_ea(avail.ea);
++
++ error = write_ea(ip->i_sbd, dip, ip, avail.ea, &req);
++ if (!error)
++ gfs_trans_add_bh(ip->i_gl, avail.bh); /* Huh!?! */
++
++ brelse(avail.bh);
++
++ return error;
++}
++
++/**
++ * gfs_init_access_acl - initializes the access acl
++ *
++ * NOTE: This must be the first extended attribute that is created for
++ * this inode.
++ */
++int
++gfs_init_access_acl(struct gfs_inode *dip, struct gfs_inode *ip, void *data,
++ int size)
++{
++ struct gfs_easet_io req;
++
++ req.es_data = data;
++ req.es_name = GFS_POSIX_ACL_ACCESS;
++ req.es_data_len = size;
++ req.es_name_len = GFS_POSIX_ACL_ACCESS_LEN;
++ req.es_cmd = GFS_EACMD_CREATE;
++ req.es_type = GFS_EATYPE_SYS;
++
++ return init_new_inode_eattr(dip, ip, &req);
++}
++
++int
++gfs_init_acl(struct gfs_inode *dip, struct gfs_inode *ip, unsigned int type,
++ struct posix_acl *acl)
++{
++ struct buffer_head *dibh;
++ void *data;
++ int size;
++ int error;
++
++ size = posix_acl_to_xattr(acl, NULL, 0);
++
++ data = gmalloc(size);
++
++ posix_acl_to_xattr(acl, data, size);
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto out;
++
++ error = gfs_init_access_acl(dip, ip, data, size);
++ if (error)
++ goto out_relse;
++
++ if (type == GFS_FILE_DIR) {
++ error = gfs_create_default_acl(dip, ip, data, size);
++ if (error)
++ goto out_relse;
++ }
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++
++ out_relse:
++ brelse(dibh);
++
++ out:
++ kfree(data);
++ posix_acl_release(acl);
++
++ return error;
++}
++
++int
++gfs_acl_setattr(struct inode *inode)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ struct posix_acl *acl;
++ struct gfs_ea_location location;
++ int error;
++
++ if (S_ISLNK(inode->i_mode))
++ return 0;
++
++ memset(&location, 0, sizeof(struct gfs_ea_location));
++
++ error = gfs_findacl(ip, TRUE, &acl, &location); /* Check error here? */
++ if (!location.ea)
++ return error;
++
++ error = posix_acl_chmod_masq(acl, inode->i_mode);
++ if (!error)
++ error = gfs_replace_acl(inode, acl, TRUE, location);
++
++ posix_acl_release(acl);
++ brelse(location.bh);
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/acl.h linux-patched/fs/gfs/acl.h
+--- linux-orig/fs/gfs/acl.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/acl.h 2004-06-20 22:48:17.946947249 -0500
+@@ -0,0 +1,28 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __ACL_DOT_H__
++#define __ACL_DOT_H__
++
++int gfs_setup_new_acl(struct gfs_inode *dip,
++ unsigned int type, unsigned int *mode,
++ struct posix_acl **acl_ptr);
++int gfs_getacl(struct inode *inode, int access, struct posix_acl **acl_ptr);
++int gfs_init_acl(struct gfs_inode *dip, struct gfs_inode *ip, unsigned int type,
++ struct posix_acl *acl);
++int gfs_acl_setattr(struct inode *inode);
++int gfs_validate_acl(struct gfs_inode *ip, const char *value, int size,
++ int access);
++void gfs_acl_set_mode(struct gfs_inode *ip, struct posix_acl *acl);
++
++#endif /* __ACL_DOT_H__ */
+diff -urN linux-orig/fs/gfs/bits.c linux-patched/fs/gfs/bits.c
+--- linux-orig/fs/gfs/bits.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/bits.c 2004-06-20 22:48:17.946947249 -0500
+@@ -0,0 +1,183 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * These routines are used by the resource group routines (rgrp.c)
++ * to keep track of block allocation. Each block is represented by two
++ * bits. One bit indicates whether or not the block is used. (1=used,
++ * 0=free) The other bit indicates whether or not the block contains a
++ * dinode or not. (1=dinode, 0=data block) So, each byte represents
++ * GFS_NBBY (i.e. 4) blocks.
++ */
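++
++/*
++ * Worked example (assuming GFS_BIT_SIZE == 2 and GFS_BIT_MASK == 0x3,
++ * which the GFS_NBBY == 4 packing above implies, and taking bit 0 of
++ * each pair as the "used" bit, as gfs_bitfit below does): the byte
++ * 0x4D == 01 00 11 01 in two-bit groups describes, lowest pair first,
++ * block 0 = 1 (used data), block 1 = 3 (used dinode), block 2 = 0
++ * (free), and block 3 = 1 (used data).
++ */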
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "bits.h"
++
++static const char valid_change[16] = {
++ /* current */
++
++ /* n */ 0, 1, 1, 1,
++ /* e */ 1, 0, 0, 0,
++ /* w */ 1, 0, 0, 1,
++ 0, 0, 1, 0
++};
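++
++/*
++ * The table is indexed as valid_change[new_state * 4 + cur_state] (see
++ * gfs_setbit below): each row is a proposed new state, each column the
++ * current state, and a 1 marks a legal transition.  The "n", "e", "w"
++ * row markers appear to spell "new" down the row axis.
++ */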
++
++/**
++ * gfs_setbit - Set a bit in the bitmaps
++ * @buffer: the buffer that holds the bitmaps
++ * @buflen: the length (in bytes) of the buffer
++ * @block: the block to set
++ * @new_state: the new state of the block
++ *
++ */
++
++void
++gfs_setbit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ uint32_t block, unsigned char new_state)
++{
++ unsigned char *byte, *end, cur_state;
++ unsigned int bit;
++
++ byte = buffer + (block / GFS_NBBY);
++ bit = (block % GFS_NBBY) * GFS_BIT_SIZE;
++ end = buffer + buflen;
++
++ GFS_ASSERT_RGRPD(byte < end, rgd,);
++
++ cur_state = (*byte >> bit) & GFS_BIT_MASK;
++ GFS_ASSERT_RGRPD(valid_change[new_state * 4 + cur_state], rgd,
++ printk("cur_state = %u, new_state = %u\n",
++ cur_state, new_state););
++
++ *byte ^= cur_state << bit;
++ *byte |= new_state << bit;
++}
++
++/**
++ * gfs_testbit - test a bit in the bitmaps
++ * @buffer: the buffer that holds the bitmaps
++ * @buflen: the length (in bytes) of the buffer
++ * @block: the block to read
++ *
++ */
++
++unsigned char
++gfs_testbit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen, uint32_t block)
++{
++ unsigned char *byte, *end, cur_state;
++ unsigned int bit;
++
++ byte = buffer + (block / GFS_NBBY);
++ bit = (block % GFS_NBBY) * GFS_BIT_SIZE;
++ end = buffer + buflen;
++
++ GFS_ASSERT_RGRPD(byte < end, rgd,);
++
++ cur_state = (*byte >> bit) & GFS_BIT_MASK;
++
++ return cur_state;
++}
++
++/**
++ * gfs_bitfit - Find a free block in the bitmaps
++ * @buffer: the buffer that holds the bitmaps
++ * @buflen: the length (in bytes) of the buffer
++ * @goal: the block to try to allocate
++ * @old_state: the state of the block we're looking for
++ *
++ * Return: the block number that was allocated
++ */
++
++uint32_t
++gfs_bitfit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ uint32_t goal, unsigned char old_state)
++{
++ unsigned char *byte, *end, alloc;
++ uint32_t blk = goal;
++ unsigned int bit;
++
++ byte = buffer + (goal / GFS_NBBY);
++ bit = (goal % GFS_NBBY) * GFS_BIT_SIZE;
++ end = buffer + buflen;
++ alloc = (old_state & 1) ? 0 : 0x55;
++
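++	/* 0x55 selects the low ("used") bit of all four block states packed
++	   in a byte.  If every used bit is the opposite of the state we are
++	   searching for, nothing in this byte can match, so skip it whole;
++	   (8 - bit) >> 1 is the number of blocks remaining in the byte. */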
++ while (byte < end) {
++ if ((*byte & 0x55) == alloc) {
++ blk += (8 - bit) >> 1;
++
++ bit = 0;
++ byte++;
++
++ continue;
++ }
++
++ if (((*byte >> bit) & GFS_BIT_MASK) == old_state)
++ return blk;
++
++ bit += GFS_BIT_SIZE;
++ if (bit >= 8) {
++ bit = 0;
++ byte++;
++ }
++
++ blk++;
++ }
++
++ return BFITNOENT;
++}
++
++/**
++ * gfs_bitcount - count the number of bits in a certain state
++ * @buffer: the buffer that holds the bitmaps
++ * @buflen: the length (in bytes) of the buffer
++ * @state: the state of the block we're looking for
++ *
++ * Returns: The number of bits
++ */
++
++uint32_t
++gfs_bitcount(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ unsigned char state)
++{
++ unsigned char *byte = buffer;
++ unsigned char *end = buffer + buflen;
++ unsigned char state1 = state << 2;
++ unsigned char state2 = state << 4;
++ unsigned char state3 = state << 6;
++ uint32_t count = 0;
++
++ for (; byte < end; byte++) {
++ if (((*byte) & 0x03) == state)
++ count++;
++ if (((*byte) & 0x0C) == state1)
++ count++;
++ if (((*byte) & 0x30) == state2)
++ count++;
++ if (((*byte) & 0xC0) == state3)
++ count++;
++ }
++
++ return count;
++}
+diff -urN linux-orig/fs/gfs/bits.h linux-patched/fs/gfs/bits.h
+--- linux-orig/fs/gfs/bits.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/bits.h 2004-06-20 22:48:17.946947249 -0500
+@@ -0,0 +1,32 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __BITS_DOT_H__
++#define __BITS_DOT_H__
++
++#define BFITNOENT (0xFFFFFFFF)
++
++void gfs_setbit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ uint32_t block, unsigned char new_state);
++unsigned char gfs_testbit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ uint32_t block);
++uint32_t gfs_bitfit(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ uint32_t goal, unsigned char old_state);
++uint32_t gfs_bitcount(struct gfs_rgrpd *rgd,
++ unsigned char *buffer, unsigned int buflen,
++ unsigned char state);
++
++#endif /* __BITS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/bmap.c linux-patched/fs/gfs/bmap.c
+--- linux-orig/fs/gfs/bmap.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/bmap.c 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,1404 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "dio.h"
++#include "glock.h"
++#include "inode.h"
++#include "ioctl.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++
++struct metapath {
++ unsigned int mp_list[GFS_MAX_META_HEIGHT];
++};
++
++typedef int (*block_call_t) (struct gfs_inode *ip, struct buffer_head *dibh,
++ struct buffer_head *bh, uint64_t *top,
++ uint64_t *bottom, unsigned int height,
++ void *data);
++
++struct strip_mine {
++ int sm_first;
++ unsigned int sm_height;
++};
++
++/**
++ * gfs_unstuffer_sync - unstuff a dinode synchronously
++ * @ip: the inode
++ * @dibh: the dinode buffer
++ * @block: the block number that was allocated
++ * @private: not used
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_unstuffer_sync(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ int error;
++
++ error = gfs_get_data_buffer(ip, block, TRUE, &bh);
++ if (error)
++ return error;
++
++ gfs_buffer_copy_tail(bh, 0, dibh, sizeof(struct gfs_dinode));
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY | DIO_START | DIO_WAIT);
++
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * gfs_unstuffer_async - unstuff a dinode asynchronously
++ * @ip: the inode
++ * @dibh: the dinode buffer
++ * @block: the block number that was allocated
++ * @private: not used
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_unstuffer_async(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ int error;
++
++ error = gfs_get_data_buffer(ip, block, TRUE, &bh);
++ if (error)
++ return error;
++
++ gfs_buffer_copy_tail(bh, 0, dibh, sizeof(struct gfs_dinode));
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY);
++
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * gfs_unstuff_dinode - Unstuff a dinode when the data has grown too big
++ * @ip: The GFS inode to unstuff
++ * @unstuffer: the routine that handles unstuffing a non-zero length file
++ * @private: private data for the unstuffer
++ *
++ * This routine unstuffs a dinode and returns it to a "normal" state such
++ * that the height can be grown in the traditional way.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_unstuff_dinode(struct gfs_inode *ip, gfs_unstuffer_t unstuffer,
++ void *private)
++{
++ struct buffer_head *bh, *dibh;
++ uint64_t block = 0;
++ int journaled = gfs_is_jdata(ip);
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(ip), ip,);
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ if (ip->i_di.di_size) {
++ /* Get a free block, fill it with the stuffed data,
++ and write it out to disk */
++
++ if (journaled) {
++ error = gfs_metaalloc(ip, &block);
++ if (error)
++ goto fail;
++
++ error = gfs_get_data_buffer(ip, block, TRUE, &bh);
++ if (error)
++ goto fail;
++
++ gfs_buffer_copy_tail(bh, sizeof(struct gfs_meta_header),
++ dibh, sizeof(struct gfs_dinode));
++
++ brelse(bh);
++ } else {
++ gfs_blkalloc(ip, &block);
++
++ error = unstuffer(ip, dibh, block, private);
++ if (error)
++ goto fail;
++ }
++ }
++
++ /* Set up the pointer to the new block */
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ gfs_buffer_clear_tail(dibh, sizeof(struct gfs_dinode));
++
++ if (ip->i_di.di_size) {
++ *(uint64_t *)(dibh->b_data + sizeof(struct gfs_dinode)) = cpu_to_gfs64(block);
++ ip->i_di.di_blocks++;
++ }
++
++ ip->i_di.di_height = 1;
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++
++ fail:
++ brelse(dibh);
++
++ return error;
++}
++
++/**
++ * calc_tree_height - Calculate the height of a metadata tree
++ * @ip: The GFS inode
++ * @size: The proposed size of the file
++ *
++ * Work out how tall a metadata tree needs to be in order to accommodate a
++ * file of a particular size. If size is less than the current size of
++ * the inode, then the current size of the inode is used instead of the
++ * supplied one.
++ *
++ * Returns: the height the tree should be
++ */
++
++static unsigned int
++calc_tree_height(struct gfs_inode *ip, uint64_t size)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t *arr;
++ unsigned int max, height;
++
++ if (ip->i_di.di_size > size)
++ size = ip->i_di.di_size;
++
++ if (gfs_is_jdata(ip)) {
++ arr = sdp->sd_jheightsize;
++ max = sdp->sd_max_jheight;
++ } else {
++ arr = sdp->sd_heightsize;
++ max = sdp->sd_max_height;
++ }
++
++ for (height = 0; height < max; height++)
++ if (arr[height] >= size)
++ break;
++
++ return height;
++}
++
++/**
++ * build_height - Build a metadata tree of the requested height
++ * @ip: The GFS inode
++ * @height: The height to build to
++ *
++ * This routine grows the metadata tree, if necessary, until it is at
++ * least @height levels tall.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++build_height(struct gfs_inode *ip, int height)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh, *dibh;
++ uint64_t block, *bp;
++ unsigned int x;
++ int new_block;
++ int error;
++
++ while (ip->i_di.di_height < height) {
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ new_block = FALSE;
++ bp = (uint64_t *)(dibh->b_data + sizeof(struct gfs_dinode));
++ for (x = 0; x < sdp->sd_diptrs; x++, bp++)
++ if (*bp) {
++ new_block = TRUE;
++ break;
++ }
++
++ if (new_block) {
++ /* Get a new block, fill it with the old direct pointers,
++ and write it out */
++
++ error = gfs_metaalloc(ip, &block);
++ if (error)
++ goto fail;
++
++ error = gfs_dread(sdp, block, ip->i_gl,
++ DIO_NEW | DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto fail;
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_IN,
++ GFS_FORMAT_IN);
++ memset(bh->b_data + sizeof(struct gfs_meta_header),
++ 0,
++ sizeof(struct gfs_indirect) -
++ sizeof(struct gfs_meta_header));
++ gfs_buffer_copy_tail(bh, sizeof(struct gfs_indirect),
++ dibh, sizeof(struct gfs_dinode));
++
++ brelse(bh);
++ }
++
++ /* Set up the new direct pointer and write it out to disk */
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ gfs_buffer_clear_tail(dibh, sizeof(struct gfs_dinode));
++
++ if (new_block) {
++ *(uint64_t *)(dibh->b_data + sizeof(struct gfs_dinode)) = cpu_to_gfs64(block);
++ ip->i_di.di_blocks++;
++ }
++
++ ip->i_di.di_height++;
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++ }
++
++ return 0;
++
++ fail:
++ brelse(dibh);
++
++ return error;
++}
++
++/**
++ * find_metapath - Find path through the metadata tree
++ * @ip: The inode pointer
++ * @mp: The metapath to return the result in
++ * @block: The file (logical) block to look up
++ *
++ * This routine returns a struct metapath structure that defines a path through
++ * the metadata of inode "ip" to get to block "block".
++ *
++ * Example:
++ * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
++ * filesystem with a blocksize of 4096.
++ *
++ * find_metapath() would return a struct metapath structure set to:
++ * mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
++ *
++ * That means that in order to get to the block containing the byte at
++ * offset 101342453, we would load the indirect block pointed to by pointer
++ * 0 in the dinode. We would then load the indirect block pointed to by
++ * pointer 48 in that indirect block. We would then load the data block
++ * pointed to by pointer 165 in that indirect block.
++ *
++ * ----------------------------------------
++ * | Dinode | |
++ * | | 4|
++ * | |0 1 2 3 4 5 9|
++ * | | 6|
++ * ----------------------------------------
++ * |
++ * |
++ * V
++ * ----------------------------------------
++ * | Indirect Block |
++ * | 5|
++ * | 4 4 4 4 4 5 5 1|
++ * |0 5 6 7 8 9 0 1 2|
++ * ----------------------------------------
++ * |
++ * |
++ * V
++ * ----------------------------------------
++ * | Indirect Block |
++ * | 1 1 1 1 1 5|
++ * | 6 6 6 6 6 1|
++ * |0 3 4 5 6 7 2|
++ * ----------------------------------------
++ * |
++ * |
++ * V
++ * ----------------------------------------
++ * | Data block containing offset |
++ * | 101342453 |
++ * | |
++ * | |
++ * ----------------------------------------
++ *
++ */
++
++static struct metapath *
++find_metapath(struct gfs_inode *ip, uint64_t block)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct metapath *mp;
++ uint64_t b = block;
++ unsigned int i;
++
++ mp = gmalloc(sizeof(struct metapath));
++ memset(mp, 0, sizeof(struct metapath));
++
++ for (i = ip->i_di.di_height; i--;)
++ mp->mp_list[i] = do_div(b, sdp->sd_inptrs);
++
++ return mp;
++}
++
++/**
++ * metapointer - Return pointer to start of metadata in a buffer
++ * @bh: The buffer
++ * @height: The metadata height (0 = dinode)
++ * @mp: The metapath
++ *
++ * Return a pointer to the block number of the next height of the metadata
++ * tree given a buffer containing the pointer to the current height of the
++ * metadata tree.
++ */
++
++static __inline__ uint64_t *
++metapointer(struct buffer_head *bh, unsigned int height, struct metapath *mp)
++{
++ unsigned int head_size = (height > 0) ?
++ sizeof(struct gfs_indirect) : sizeof(struct gfs_dinode);
++
++ return ((uint64_t *)(bh->b_data + head_size)) + mp->mp_list[height];
++}
++
++/**
++ * get_metablock - Get the next metadata block in metadata tree
++ * @ip: The GFS inode
++ * @bh: Buffer containing the pointers to metadata blocks
++ * @height: The height of the tree (0 = dinode)
++ * @mp: The metapath
++ * @create: Non-zero if we may create a new metadata block
++ * @new: Used to indicate if we did create a new metadata block
++ * @block: the returned disk block number
++ *
++ * Given a metatree complete to a particular height, this checks whether the
++ * next height of the tree exists. If not, it is created. The block number
++ * of the next height of the metadata tree is returned.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_metablock(struct gfs_inode *ip,
++ struct buffer_head *bh, unsigned int height, struct metapath *mp,
++ int create, int *new, uint64_t *block)
++{
++ uint64_t *ptr = metapointer(bh, height, mp);
++ int error;
++
++ if (*ptr) {
++ *block = gfs64_to_cpu(*ptr);
++ return 0;
++ }
++
++ *block = 0;
++
++ if (!create)
++ return 0;
++
++ error = gfs_metaalloc(ip, block);
++ if (error)
++ return error;
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++
++ *ptr = cpu_to_gfs64(*block);
++ ip->i_di.di_blocks++;
++
++ *new = 1;
++
++ return 0;
++}
++
++/**
++ * get_datablock - Get datablock number from metadata block
++ * @ip: The GFS inode
++ * @bh: The buffer containing pointers to datablocks
++ * @mp: The metapath
++ * @create: Non-zero if we may create a new data block
++ * @new: Used to indicate if we created a new data block
++ * @block: the returned disk block number
++ *
++ * Given a fully built metadata tree, checks to see if a particular data
++ * block exists. It is created if it does not exist and the block number
++ * on disk is returned.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_datablock(struct gfs_inode *ip,
++ struct buffer_head *bh, struct metapath *mp,
++ int create, int *new, uint64_t *block)
++{
++ uint64_t *ptr = metapointer(bh, ip->i_di.di_height - 1, mp);
++
++ if (*ptr) {
++ *block = gfs64_to_cpu(*ptr);
++ return 0;
++ }
++
++ *block = 0;
++
++ if (!create)
++ return 0;
++
++ if (gfs_is_jdata(ip)) {
++ int error;
++ error = gfs_metaalloc(ip, block);
++ if (error)
++ return error;
++ } else
++ gfs_blkalloc(ip, block);
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++
++ *ptr = cpu_to_gfs64(*block);
++ ip->i_di.di_blocks++;
++
++ *new = 1;
++
++ return 0;
++}
++
++/**
++ * gfs_block_map - Map a block from an inode to a disk block
++ * @ip: The GFS inode
++ * @lblock: The logical block number
++ * @new: Value/Result argument (1 = may create/did create new blocks)
++ * @dblock: the disk block number of the start of an extent
++ * @extlen: the size of the extent
++ *
++ * Find the block number on the current device which corresponds to an
++ * inode's block. If the block had to be created, "new" will be set.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_block_map(struct gfs_inode *ip,
++ uint64_t lblock, int *new,
++ uint64_t *dblock, uint32_t *extlen)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ struct metapath *mp;
++ int create = *new;
++ unsigned int bsize;
++ unsigned int height;
++ unsigned int end_of_metadata;
++ unsigned int x;
++ int error;
++
++ *new = 0;
++ *dblock = 0;
++ if (extlen)
++ *extlen = 0;
++
++ if (gfs_is_stuffed(ip)) {
++ if (!lblock) {
++ *dblock = ip->i_num.no_addr;
++ if (extlen)
++ *extlen = 1;
++ }
++ return 0;
++ }
++
++ bsize = (gfs_is_jdata(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
++
++ height = calc_tree_height(ip, (lblock + 1) * bsize);
++ if (ip->i_di.di_height < height) {
++ if (!create)
++ return 0;
++
++ error = build_height(ip, height);
++ if (error)
++ return error;
++ }
++
++ mp = find_metapath(ip, lblock);
++ end_of_metadata = ip->i_di.di_height - 1;
++
++ error = gfs_get_inode_buffer(ip, &bh);
++ if (error)
++ goto out;
++
++ for (x = 0; x < end_of_metadata; x++) {
++ error = get_metablock(ip, bh, x, mp, create, new, dblock);
++ brelse(bh);
++ if (error || !*dblock)
++ goto out;
++
++ error = gfs_get_meta_buffer(ip, x + 1, *dblock, *new, &bh);
++ if (error)
++ goto out;
++ }
++
++ error = get_datablock(ip, bh, mp, create, new, dblock);
++ if (error) {
++ brelse(bh);
++ goto out;
++ }
++
++ if (extlen && *dblock) {
++ *extlen = 1;
++
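++			/* Extent coalescing: walk the remaining pointers in
++			   this leaf block and count how many map to physically
++			   consecutive disk blocks, so the caller can treat
++			   them as one extent. */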
++ if (!*new) {
++ uint64_t tmp_dblock;
++ int tmp_new;
++ unsigned int nptrs;
++
++ nptrs = (end_of_metadata) ? sdp->sd_inptrs : sdp->sd_diptrs;
++
++ while (++mp->mp_list[end_of_metadata] < nptrs) {
++ get_datablock(ip, bh, mp,
++ FALSE, &tmp_new,
++ &tmp_dblock);
++
++ if (*dblock + *extlen != tmp_dblock)
++ break;
++
++ (*extlen)++;
++ }
++ }
++ }
++
++ brelse(bh);
++
++ if (*new) {
++ error = gfs_get_inode_buffer(ip, &bh);
++ if (!error) {
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_dinode_out(&ip->i_di, bh->b_data);
++ brelse(bh);
++ }
++ }
++
++ out:
++ kfree(mp);
++
++ return error;
++}
++
++/**
++ * do_grow - Make a file look bigger than it is
++ * @ip: the inode
++ * @size: the size to set the file to
++ *
++ * Called with an exclusive lock on @ip.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++do_grow(struct gfs_inode *ip, uint64_t size)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al;
++ struct buffer_head *dibh;
++ unsigned int h;
++ int journaled = gfs_is_jdata(ip);
++ int error;
++
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail;
++
++ error = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ if (journaled)
++ al->al_requested_meta = sdp->sd_max_height + 1;
++ else {
++ al->al_requested_meta = sdp->sd_max_height;
++ al->al_requested_data = 1;
++ }
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ Full extension of the metadata tree, block allocation,
++ a dinode modification, and a quota change */
++
++ error = gfs_trans_begin(sdp,
++ sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
++ 1 + !!journaled,
++ 1);
++ if (error)
++ goto fail_ipres;
++
++ if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode)) {
++ if (gfs_is_stuffed(ip)) {
++ error = gfs_unstuff_dinode(ip, gfs_unstuffer_sync, NULL);
++ if (error)
++ goto fail_end_trans;
++ }
++
++ h = calc_tree_height(ip, size);
++ if (ip->i_di.di_height < h) {
++ error = build_height(ip, h);
++ if (error)
++ goto fail_end_trans;
++ }
++ }
++
++ ip->i_di.di_size = size;
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++
++ gfs_inplace_release(ip);
++ gfs_quota_unlock_m(ip);
++ gfs_alloc_put(ip);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ gfs_inplace_release(ip);
++
++ fail_gunlock_q:
++ gfs_quota_unlock_m(ip);
++
++ fail:
++ gfs_alloc_put(ip);
++
++ return error;
++}
++
++/**
++ * recursive_scan - recursively scan through the end of a file
++ * @ip: the inode
++ * @dibh: the dinode buffer
++ * @mp: the path through the metadata to the point to start
++ * @height: the height the recursion is at
++ * @block: the indirect block to look at
++ * @first: TRUE if this is the first block
++ * @bc: the call to make for each piece of metadata
++ * @data: data opaque to this function to pass to @bc
++ *
++ * When this is first called @height and @block should be zero and
++ * @first should be TRUE.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++recursive_scan(struct gfs_inode *ip, struct buffer_head *dibh,
++ struct metapath *mp, unsigned int height, uint64_t block,
++ int first, block_call_t bc, void *data)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh = NULL;
++ uint64_t *top, *bottom;
++ uint64_t bn;
++ int error;
++
++ if (!height) {
++ error = gfs_get_inode_buffer(ip, &bh);
++ if (error)
++ goto fail;
++ dibh = bh;
++
++ top = (uint64_t *)(bh->b_data + sizeof(struct gfs_dinode)) +
++ mp->mp_list[0];
++ bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs_dinode)) +
++ sdp->sd_diptrs;
++ } else {
++ error = gfs_get_meta_buffer(ip, height, block, FALSE, &bh);
++ if (error)
++ goto fail;
++
++ top = (uint64_t *)(bh->b_data + sizeof(struct gfs_indirect)) +
++ ((first) ? mp->mp_list[height] : 0);
++ bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs_indirect)) +
++ sdp->sd_inptrs;
++ }
++
++ error = bc(ip, dibh, bh, top, bottom, height, data);
++ if (error)
++ goto fail;
++
++ if (height < ip->i_di.di_height - 1)
++ for (; top < bottom; top++, first = FALSE) {
++ if (!*top)
++ continue;
++
++ bn = gfs64_to_cpu(*top);
++
++ error = recursive_scan(ip, dibh, mp,
++ height + 1, bn, first,
++ bc, data);
++ if (error)
++ goto fail;
++ }
++
++ brelse(bh);
++
++ return 0;
++
++ fail:
++ if (bh)
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * do_strip - Look for a particular layer of the file and strip it off
++ * @ip: the inode
++ * @dibh: the dinode buffer
++ * @bh: A buffer of pointers
++ * @top: The first pointer in the buffer
++ * @bottom: One more than the last pointer
++ * @height: the height this buffer is at
++ * @data: a pointer to a struct strip_mine
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++do_strip(struct gfs_inode *ip, struct buffer_head *dibh,
++ struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
++ unsigned int height, void *data)
++{
++ struct strip_mine *sm = (struct strip_mine *)data;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_holder ri_gh;
++ struct gfs_rgrp_list rlist;
++ uint64_t bn, bstart;
++ uint32_t blen;
++ uint64_t *p;
++ unsigned int rg_blocks = 0;
++ int metadata;
++ int x;
++ int error;
++
++ if (!*top)
++ sm->sm_first = FALSE;
++
++ if (height != sm->sm_height)
++ return 0;
++
++ if (sm->sm_first) {
++ top++;
++ sm->sm_first = FALSE;
++ }
++
++ metadata = (height != ip->i_di.di_height - 1) || gfs_is_jdata(ip);
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ return error;
++
++ memset(&rlist, 0, sizeof(struct gfs_rgrp_list));
++ bstart = 0;
++ blen = 0;
++
++ for (p = top; p < bottom; p++) {
++ if (!*p)
++ continue;
++
++ bn = gfs64_to_cpu(*p);
++
++ if (bstart + blen == bn)
++ blen++;
++ else {
++ if (bstart)
++ gfs_rlist_add(sdp, &rlist, bstart);
++
++ bstart = bn;
++ blen = 1;
++ }
++ }
++
++ if (bstart)
++ gfs_rlist_add(sdp, &rlist, bstart);
++ else
++ goto out; /* Nothing to do */
++
++ gfs_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
++
++ error = gfs_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
++ if (error)
++ goto fail;
++
++ for (x = 0; x < rlist.rl_rgrps; x++) {
++ struct gfs_rgrpd *rgd;
++ rgd = gl2rgd(rlist.rl_ghs[x].gh_gl);
++ rg_blocks += rgd->rd_ri.ri_length;
++ }
++
++ /* Trans may require:
++ All the bitmaps that were reserved.
++ One block for the dinode.
++ One block for the indirect block being cleared.
++ One block for a quota change. */
++
++ error = gfs_trans_begin(sdp, rg_blocks + 2, 1);
++ if (error)
++ goto fail_rg_gunlock;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_trans_add_bh(ip->i_gl, bh);
++
++ bstart = 0;
++ blen = 0;
++
++ for (p = top; p < bottom; p++) {
++ if (!*p)
++ continue;
++
++ bn = gfs64_to_cpu(*p);
++
++ if (bstart + blen == bn)
++ blen++;
++ else {
++ if (bstart) {
++ if (metadata)
++ gfs_metafree(ip, bstart, blen);
++ else
++ gfs_blkfree(ip, bstart, blen);
++ }
++
++ bstart = bn;
++ blen = 1;
++ }
++
++ *p = 0;
++ ip->i_di.di_blocks--;
++ }
++
++ if (bstart) {
++ if (metadata)
++ gfs_metafree(ip, bstart, blen);
++ else
++ gfs_blkfree(ip, bstart, blen);
++ }
++
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
++ gfs_rlist_free(&rlist);
++
++ out:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return 0;
++
++ fail_rg_gunlock:
++ gfs_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
++
++ fail:
++ gfs_rlist_free(&rlist);
++
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return error;
++}
++
++/**
++ * gfs_truncator_default - truncate a partial data block
++ * @ip: the inode
++ * @size: the size the file should be
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_truncator_default(struct gfs_inode *ip, uint64_t size)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ uint64_t bn;
++ int not_new = 0;
++ int error;
++
++ error = gfs_block_map(ip, size >> sdp->sd_sb.sb_bsize_shift, &not_new,
++ &bn, NULL);
++ if (error)
++ return error;
++ if (!bn)
++ return 0;
++
++ error = gfs_get_data_buffer(ip, bn, FALSE, &bh);
++ if (error)
++ return error;
++
++ gfs_buffer_clear_tail(bh, size & (sdp->sd_sb.sb_bsize - 1));
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY);
++
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * truncator_journaled - truncate a partial data block
++ * @ip: the inode
++ * @size: the size the file should be
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++truncator_journaled(struct gfs_inode *ip, uint64_t size)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ uint64_t lbn, dbn;
++ uint32_t off;
++ int not_new = 0;
++ int error;
++
++ lbn = size;
++ off = do_div(lbn, sdp->sd_jbsize);
++
++ error = gfs_block_map(ip, lbn, &not_new, &dbn, NULL);
++ if (error)
++ return error;
++ if (!dbn)
++ return 0;
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error)
++ return error;
++
++ error = gfs_get_data_buffer(ip, dbn, FALSE, &bh);
++ if (!error) {
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_buffer_clear_tail(bh,
++ sizeof(struct gfs_meta_header) +
++ off);
++ brelse(bh);
++ }
++
++ gfs_trans_end(sdp);
++
++ return error;
++}
++
++/**
++ * gfs_shrink - make a file smaller
++ * @ip: the inode
++ * @size: the size to make the file
++ * @truncator: function to truncate the last partial block
++ *
++ * Called with an exclusive lock on @ip.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_shrink(struct gfs_inode *ip, uint64_t size, gfs_truncator_t truncator)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_holder ri_gh;
++ struct gfs_rgrpd *rgd;
++ struct buffer_head *dibh;
++ uint64_t block;
++ unsigned int height;
++ int journaled = gfs_is_jdata(ip);
++ int error;
++
++ if (!size)
++ block = 0;
++ else if (journaled) {
++ block = size - 1;
++ do_div(block, sdp->sd_jbsize);
++ }
++ else
++ block = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
++
++ /* Get rid of all the data/metadata blocks */
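++	/* The tree is stripped one level per pass, deepest level first:
++	   height counts down from di_height, so data blocks are freed
++	   before the indirect blocks that pointed to them. */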
++
++ height = ip->i_di.di_height;
++ if (height) {
++ struct metapath *mp = find_metapath(ip, block);
++ gfs_alloc_get(ip);
++
++ error = gfs_quota_hold_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error) {
++ gfs_alloc_put(ip);
++ kfree(mp);
++ return error;
++ }
++
++ while (height--) {
++ struct strip_mine sm;
++
++ sm.sm_first = (size) ? TRUE : FALSE;
++ sm.sm_height = height;
++
++ error = recursive_scan(ip, NULL, mp, 0, 0, TRUE,
++ do_strip, &sm);
++ if (error) {
++ gfs_quota_unhold_m(ip);
++ gfs_alloc_put(ip);
++ kfree(mp);
++ return error;
++ }
++ }
++
++ gfs_quota_unhold_m(ip);
++ gfs_alloc_put(ip);
++ kfree(mp);
++ }
++
++ /* If we truncated in the middle of a block, zero out the leftovers. */
++
++ if (gfs_is_stuffed(ip)) {
++ /* Do nothing */
++ } else if (journaled) {
++ if (do_mod(size, sdp->sd_jbsize)) {
++ error = truncator_journaled(ip, size);
++ if (error)
++ return error;
++ }
++ } else if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1)) {
++ error = truncator(ip, size);
++ if (error)
++ return error;
++ }
++
++ /* Set the new size (and possibly the height) */
++
++ if (!size) {
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ return error;
++ }
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error)
++ goto out;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto out_end_trans;
++
++ if (!size) {
++ ip->i_di.di_height = 0;
++
++ rgd = gfs_blk2rgrpd(sdp, ip->i_num.no_addr);
++ GFS_ASSERT_INODE(rgd, ip,);
++
++ ip->i_di.di_goal_rgrp = rgd->rd_ri.ri_addr;
++ ip->i_di.di_goal_dblk =
++ ip->i_di.di_goal_mblk =
++ ip->i_num.no_addr - rgd->rd_ri.ri_data1;
++ }
++
++ ip->i_di.di_size = size;
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ if (!ip->i_di.di_height &&
++ size < sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode))
++ gfs_buffer_clear_tail(dibh, sizeof(struct gfs_dinode) + size);
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ out_end_trans:
++ gfs_trans_end(sdp);
++
++ out:
++ if (!size)
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return error;
++}
++
++/**
++ * do_same - truncate to same size (update time stamps)
++ * @ip:
++ *
++ * Returns: errno
++ */
++
++static int
++do_same(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *dibh;
++ int error;
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error)
++ return error;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto out;
++
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ out:
++ gfs_trans_end(sdp);
++
++ return error;
++}
++
++/**
++ * gfs_truncatei - make a file a given size
++ * @ip: the inode
++ * @size: the size to make the file
++ * @truncator: function to truncate the last partial block
++ *
++ * The file size can grow, shrink, or stay the same size.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_truncatei(struct gfs_inode *ip, uint64_t size,
++ gfs_truncator_t truncator)
++{
++ GFS_ASSERT_INODE(ip->i_di.di_type == GFS_FILE_REG, ip,);
++
++ if (size == ip->i_di.di_size)
++ return do_same(ip);
++ else if (size > ip->i_di.di_size)
++ return do_grow(ip, size);
++ else
++ return gfs_shrink(ip, size, truncator);
++}
++
++/**
++ * gfs_write_calc_reserv - calculate the number of blocks needed to write to a file
++ * @ip: the file
++ * @len: the number of bytes to be written to the file
++ * @data_blocks: returns the number of data blocks required
++ * @ind_blocks: returns the number of indirect blocks required
++ *
++ */
++
++void
++gfs_write_calc_reserv(struct gfs_inode *ip, unsigned int len,
++ unsigned int *data_blocks, unsigned int *ind_blocks)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ unsigned int tmp;
++
++ if (gfs_is_jdata(ip)) {
++ *data_blocks = DIV_RU(len, sdp->sd_jbsize) + 2;
++ *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
++ } else {
++ *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
++ *ind_blocks = 3 * (sdp->sd_max_height - 1);
++ }
++
++ for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
++ tmp = DIV_RU(tmp, sdp->sd_inptrs);
++ *ind_blocks += tmp;
++ }
++}
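++
++/*
++ * Worked example (all values assumed, not read from a real superblock):
++ * with a 4096-byte block size (sb_bsize_shift == 12) and, say, sd_diptrs
++ * == 483, a 1 MiB non-journaled write yields data_blocks = (1048576 >>
++ * 12) + 3 = 259.  Since 259 <= sd_diptrs the loop below adds nothing,
++ * and ind_blocks stays at 3 * (sd_max_height - 1).
++ */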
++
++/**
++ * gfs_write_alloc_required - figure out if a write is going to require an allocation
++ * @ip: the file being written to
++ * @offset: the offset to write to
++ * @len: the number of bytes being written
++ * @alloc_required: the int is set to TRUE if an alloc is required, FALSE otherwise
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_write_alloc_required(struct gfs_inode *ip,
++ uint64_t offset, unsigned int len,
++ int *alloc_required)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t lblock, lblock_stop, dblock;
++ uint32_t extlen;
++ int not_new = FALSE;
++ int error = 0;
++
++ *alloc_required = FALSE;
++
++ if (!len)
++ return 0;
++
++ if (gfs_is_stuffed(ip)) {
++ if (offset + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode))
++ *alloc_required = TRUE;
++ return 0;
++ }
++
++ if (gfs_is_jdata(ip)) {
++ unsigned int bsize = sdp->sd_jbsize;
++ lblock = offset;
++ do_div(lblock, bsize);
++ lblock_stop = offset + len + bsize - 1;
++ do_div(lblock_stop, bsize);
++ } else {
++ unsigned int shift = sdp->sd_sb.sb_bsize_shift;
++ lblock = offset >> shift;
++ lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
++ }
++
++ for (; lblock < lblock_stop; lblock += extlen) {
++ error = gfs_block_map(ip, lblock, &not_new, &dblock, &extlen);
++ if (error)
++ return error;
++
++ if (!dblock) {
++ *alloc_required = TRUE;
++ return 0;
++ }
++ }
++
++ return 0;
++}
++
++/**
++ * do_gfm - Copy out the dinode/indirect blocks of a file
++ * @ip: the file
++ * @dibh: the dinode buffer
++ * @bh: the indirect buffer we're looking at
++ * @top: the first pointer in the block
++ * @bottom: one more than the last pointer in the block
++ * @height: the height the block is at
++ * @data: a pointer to a struct gfs_user_buffer structure
++ *
++ * If this is a journaled file, copy out the data too.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++do_gfm(struct gfs_inode *ip, struct buffer_head *dibh,
++ struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
++ unsigned int height, void *data)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_user_buffer *ub = (struct gfs_user_buffer *)data;
++ struct buffer_head *data_bh;
++ uint64_t *bp, bn;
++ int error;
++
++ error = gfs_add_bh_to_ub(ub, bh);
++ if (error)
++ return error;
++
++ if (ip->i_di.di_type != GFS_FILE_DIR ||
++ height + 1 != ip->i_di.di_height)
++ return 0;
++
++ for (bp = top; bp < bottom; bp++)
++ if (*bp) {
++ bn = gfs64_to_cpu(*bp);
++
++ error = gfs_dread(sdp, bn, ip->i_gl,
++ DIO_START | DIO_WAIT, &data_bh);
++ if (error)
++ return error;
++
++ error = gfs_add_bh_to_ub(ub, data_bh);
++
++ brelse(data_bh);
++
++ if (error)
++ return error;
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_get_file_meta - return all the metadata for a file
++ * @ip: the file
++ * @ub: the structure representing the metadata
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_get_file_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub)
++{
++ struct buffer_head *dibh;
++ struct metapath *mp;
++ int error;
++
++ if (gfs_is_stuffed(ip)) {
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (!error) {
++ error = gfs_add_bh_to_ub(ub, dibh);
++ brelse(dibh);
++ }
++ } else {
++ mp = find_metapath(ip, 0);
++ error = recursive_scan(ip, NULL, mp, 0, 0, TRUE, do_gfm, ub);
++ kfree(mp);
++ }
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/bmap.h linux-patched/fs/gfs/bmap.h
+--- linux-orig/fs/gfs/bmap.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/bmap.h 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,48 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __BMAP_DOT_H__
++#define __BMAP_DOT_H__
++
++typedef int (*gfs_unstuffer_t) (struct gfs_inode * ip,
++ struct buffer_head * dibh, uint64_t block,
++ void *private);
++
++int gfs_unstuffer_sync(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private);
++int gfs_unstuffer_async(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private);
++
++int gfs_unstuff_dinode(struct gfs_inode *ip, gfs_unstuffer_t unstuffer,
++ void *private);
++
++int gfs_block_map(struct gfs_inode *ip,
++ uint64_t lblock, int *new,
++ uint64_t *dblock, uint32_t *extlen);
++
++typedef int (*gfs_truncator_t) (struct gfs_inode * ip, uint64_t size);
++
++int gfs_truncator_default(struct gfs_inode *ip, uint64_t size);
++
++int gfs_shrink(struct gfs_inode *ip, uint64_t size, gfs_truncator_t truncator);
++int gfs_truncatei(struct gfs_inode *ip, uint64_t size,
++ gfs_truncator_t truncator);
++
++void gfs_write_calc_reserv(struct gfs_inode *ip, unsigned int len,
++ unsigned int *data_blocks, unsigned int *ind_blocks);
++int gfs_write_alloc_required(struct gfs_inode *ip, uint64_t offset,
++ unsigned int len, int *alloc_required);
++
++int gfs_get_file_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub);
++
++#endif /* __BMAP_DOT_H__ */
+diff -urN linux-orig/fs/gfs/daemon.c linux-patched/fs/gfs/daemon.c
+--- linux-orig/fs/gfs/daemon.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/daemon.c 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,259 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "daemon.h"
++#include "glock.h"
++#include "log.h"
++#include "quota.h"
++#include "recovery.h"
++#include "super.h"
++#include "unlinked.h"
++
++/**
++ * gfs_scand - Periodic scan of the glock cache
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_scand(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++
++ daemonize("gfs_scand");
++ sdp->sd_scand_process = current;
++ set_bit(SDF_SCAND_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ gfs_scand_internal(sdp);
++
++ if (!test_bit(SDF_SCAND_RUN, &sdp->sd_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(sdp->sd_tune.gt_scand_secs * HZ);
++ }
++
++ down(&sdp->sd_thread_lock);
++ up(&sdp->sd_thread_lock);
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
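++
++/*
++ * All of the daemons below share gfs_scand's shape: daemonize(), announce
++ * startup through sd_thread_completion, loop doing work until the
++ * per-daemon SDF_*_RUN flag is cleared, then complete
++ * sd_thread_completion again on the way out.  Most also take and drop
++ * sd_thread_lock first, serializing exit against the stopping thread.
++ */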
++
++/**
++ * gfs_glockd - Reclaim unused glock structures
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_glockd(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++
++ daemonize("gfs_glockd");
++ set_bit(SDF_GLOCKD_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ while (atomic_read(&sdp->sd_reclaim_count))
++ gfs_reclaim_glock(sdp);
++
++ if (!test_bit(SDF_GLOCKD_RUN, &sdp->sd_flags))
++ break;
++
++ {
++ DECLARE_WAITQUEUE(__wait_chan, current);
++ current->state = TASK_INTERRUPTIBLE;
++ add_wait_queue(&sdp->sd_reclaim_wchan, &__wait_chan);
++ if (!atomic_read(&sdp->sd_reclaim_count)
++ && test_bit(SDF_GLOCKD_RUN, &sdp->sd_flags))
++ schedule();
++ remove_wait_queue(&sdp->sd_reclaim_wchan, &__wait_chan);
++ current->state = TASK_RUNNING;
++ }
++ }
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
++
++/**
++ * gfs_recoverd - Recovery of dead machines' journals
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_recoverd(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++
++ daemonize("gfs_recoverd");
++ sdp->sd_recoverd_process = current;
++ set_bit(SDF_RECOVERD_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ gfs_check_journals(sdp);
++
++ if (!test_bit(SDF_RECOVERD_RUN, &sdp->sd_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(sdp->sd_tune.gt_recoverd_secs * HZ);
++ }
++
++ down(&sdp->sd_thread_lock);
++ up(&sdp->sd_thread_lock);
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
++
++/**
++ * gfs_logd - Writing of cached log changes into the log file
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_logd(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++ struct gfs_holder ji_gh;
++
++ daemonize("gfs_logd");
++ sdp->sd_logd_process = current;
++ set_bit(SDF_LOGD_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ gfs_ail_empty(sdp);
++
++ if (time_after_eq(jiffies,
++ sdp->sd_jindex_refresh_time +
++ sdp->sd_tune.gt_jindex_refresh_secs * HZ)) {
++ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags) &&
++ !gfs_jindex_hold(sdp, &ji_gh))
++ gfs_glock_dq_uninit(&ji_gh);
++ sdp->sd_jindex_refresh_time = jiffies;
++ }
++
++ if (!test_bit(SDF_LOGD_RUN, &sdp->sd_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(sdp->sd_tune.gt_logd_secs * HZ);
++ }
++
++ down(&sdp->sd_thread_lock);
++ up(&sdp->sd_thread_lock);
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
++
++/**
++ * gfs_quotad - Writing of cached quota changes into the quota file
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_quotad(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++ int error;
++
++ daemonize("gfs_quotad");
++ sdp->sd_quotad_process = current;
++ set_bit(SDF_QUOTAD_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ if (time_after_eq(jiffies,
++ sdp->sd_quota_sync_time +
++ sdp->sd_tune.gt_quota_quantum * HZ)) {
++ error = gfs_quota_sync(sdp);
++ if (error && error != -EROFS)
++ printk("GFS: fsid=%s: quotad: error = %d\n",
++ sdp->sd_fsname, error);
++ sdp->sd_quota_sync_time = jiffies;
++ }
++
++ gfs_quota_scan(sdp);
++
++ if (!test_bit(SDF_QUOTAD_RUN, &sdp->sd_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(sdp->sd_tune.gt_quotad_secs * HZ);
++ }
++
++ down(&sdp->sd_thread_lock);
++ up(&sdp->sd_thread_lock);
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
++
++/**
++ * gfs_inoded - Deallocation of unlinked inodes
++ * @sdp: Pointer to GFS superblock
++ *
++ */
++
++int
++gfs_inoded(void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)data;
++
++ daemonize("gfs_inoded");
++ sdp->sd_inoded_process = current;
++ set_bit(SDF_INODED_RUN, &sdp->sd_flags);
++ complete(&sdp->sd_thread_completion);
++
++ for (;;) {
++ gfs_unlinked_dealloc(sdp);
++
++ if (!test_bit(SDF_INODED_RUN, &sdp->sd_flags))
++ break;
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(sdp->sd_tune.gt_inoded_secs * HZ);
++ }
++
++ down(&sdp->sd_thread_lock);
++ up(&sdp->sd_thread_lock);
++
++ complete(&sdp->sd_thread_completion);
++
++ return 0;
++}
+diff -urN linux-orig/fs/gfs/daemon.h linux-patched/fs/gfs/daemon.h
+--- linux-orig/fs/gfs/daemon.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/daemon.h 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,24 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DAEMON_DOT_H__
++#define __DAEMON_DOT_H__
++
++int gfs_scand(void *data);
++int gfs_glockd(void *data);
++int gfs_recoverd(void *data);
++int gfs_logd(void *data);
++int gfs_quotad(void *data);
++int gfs_inoded(void *data);
++
++#endif /* __DAEMON_DOT_H__ */
+diff -urN linux-orig/fs/gfs/dio.c linux-patched/fs/gfs/dio.c
+--- linux-orig/fs/gfs/dio.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/dio.c 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,1302 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/mm.h>
++#include <linux/pagemap.h>
++#include <linux/writeback.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "log.h"
++#include "lops.h"
++#include "rgrp.h"
++#include "trans.h"
++
++#define buffer_busy(bh) ((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
++
++/**
++ * aspace_get_block - dummy get_block() for aspace mappings
++ * @inode: the aspace inode
++ * @lblock: the logical block number
++ * @bh_result: the buffer head to map
++ * @create: non-zero to allocate the block if it doesn't exist
++ *
++ * Aspace buffers are always mapped when they are created (see getbuf()
++ * below), so block_write_full_page() should never need to call this;
++ * it asserts if it ever runs.
++ */
++
++static int
++aspace_get_block(struct inode *inode, sector_t lblock,
++ struct buffer_head *bh_result, int create)
++{
++ struct gfs_sbd *sdp = vfs2sdp(inode->i_sb);
++	GFS_ASSERT_SBD(FALSE, sdp,);	/* should never be called */
++	return -EOPNOTSUPP;	/* keep the compiler happy */
++}
++
++/**
++ * gfs_aspace_writepage - write an aspace page
++ * @page: the page
++ * @wbc:
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_aspace_writepage(struct page *page, struct writeback_control *wbc)
++{
++ return block_write_full_page(page, aspace_get_block, wbc);
++}
++
++/**
++ * stuck_releasepage - We're stuck in gfs_releasepage(). Print stuff out.
++ * @bh: the buffer we're stuck on
++ *
++ */
++
++static void
++stuck_releasepage(struct buffer_head *bh)
++{
++ struct gfs_sbd *sdp = vfs2sdp(bh->b_page->mapping->host->i_sb);
++ struct gfs_bufdata *bd = bh2bd(bh);
++
++ printk("GFS: fsid=%s: stuck in gfs_releasepage()...\n", sdp->sd_fsname);
++ printk("GFS: fsid=%s: blkno = %"PRIu64", bh->b_count = %d\n",
++ sdp->sd_fsname,
++ (uint64_t)bh->b_blocknr,
++ atomic_read(&bh->b_count));
++ printk("GFS: fsid=%s: bh2bd(bh) = %s\n",
++ sdp->sd_fsname,
++ (bd) ? "!NULL" : "NULL");
++
++ if (bd) {
++ struct gfs_glock *gl = bd->bd_gl;
++
++ printk("GFS: fsid=%s: gl = (%u, %"PRIu64")\n",
++ sdp->sd_fsname,
++ gl->gl_name.ln_type,
++ gl->gl_name.ln_number);
++
++ printk("GFS: fsid=%s: bd_new_le.le_trans = %s\n",
++ sdp->sd_fsname,
++ (bd->bd_new_le.le_trans) ? "!NULL" : "NULL");
++ printk("GFS: fsid=%s: bd_incore_le.le_trans = %s\n",
++ sdp->sd_fsname,
++ (bd->bd_incore_le.le_trans) ? "!NULL" : "NULL");
++ printk("GFS: fsid=%s: bd_frozen = %s\n",
++ sdp->sd_fsname,
++ (bd->bd_frozen) ? "!NULL" : "NULL");
++ printk("GFS: fsid=%s: bd_pinned = %u\n",
++ sdp->sd_fsname, bd->bd_pinned);
++ printk("GFS: fsid=%s: bd_ail_tr_list = %s\n",
++ sdp->sd_fsname,
++ (list_empty(&bd->bd_ail_tr_list)) ? "Empty" : "!Empty");
++
++ if (gl->gl_ops == &gfs_inode_glops) {
++ struct gfs_inode *ip = gl2ip(gl);
++
++ if (ip) {
++ unsigned int x;
++
++ printk("GFS: fsid=%s: ip = %"PRIu64"/%"PRIu64"\n",
++ sdp->sd_fsname,
++ ip->i_num.no_formal_ino,
++ ip->i_num.no_addr);
++ printk("GFS: fsid=%s: ip->i_count = %d, ip->i_vnode = %s\n",
++ sdp->sd_fsname,
++ atomic_read(&ip->i_count),
++ (ip->i_vnode) ? "!NULL" : "NULL");
++ for (x = 0; x < GFS_MAX_META_HEIGHT; x++)
++ printk("GFS: fsid=%s: ip->i_cache[%u] = %s\n",
++ sdp->sd_fsname, x,
++ (ip->i_cache[x]) ? "!NULL" : "NULL");
++ }
++ }
++ }
++}
++
++/**
++ * gfs_aspace_releasepage - free the metadata associated with a page
++ * @page: the page that's being released
++ * @gfp_mask: the gfp mask of the allocation that is triggering the release
++ *
++ * Call try_to_free_buffers() if the buffers in this page can be
++ * released.
++ *
++ * Returns: 0 if the buffers could not be released, non-zero if they were
++ */
++
++static int
++gfs_aspace_releasepage(struct page *page, int gfp_mask)
++{
++ struct inode *aspace = page->mapping->host;
++ struct gfs_sbd *sdp = vfs2sdp(aspace->i_sb);
++ struct buffer_head *bh, *head;
++ struct gfs_bufdata *bd;
++ unsigned long t;
++
++ if (!page_has_buffers(page))
++ goto out;
++
++ head = bh = page_buffers(page);
++ do {
++ t = jiffies;
++
++ while (atomic_read(&bh->b_count)) {
++ if (atomic_read(&aspace->i_writecount)) {
++ if (time_after_eq(jiffies,
++ t +
++ sdp->sd_tune.gt_stall_secs * HZ)) {
++ stuck_releasepage(bh);
++ t = jiffies;
++ }
++
++ yield();
++ continue;
++ }
++
++ return 0;
++ }
++
++ bd = bh2bd(bh);
++ if (bd) {
++ GFS_ASSERT_SBD(bd->bd_bh == bh, sdp,);
++ GFS_ASSERT_SBD(!bd->bd_new_le.le_trans, sdp,);
++ GFS_ASSERT_SBD(!bd->bd_incore_le.le_trans, sdp,);
++ GFS_ASSERT_SBD(!bd->bd_frozen, sdp,);
++ GFS_ASSERT_SBD(!bd->bd_pinned, sdp,);
++ GFS_ASSERT_SBD(list_empty(&bd->bd_ail_tr_list), sdp,);
++ kmem_cache_free(gfs_bufdata_cachep, bd);
++ atomic_dec(&sdp->sd_bufdata_count);
++ bh2bd(bh) = NULL;
++ }
++
++ bh = bh->b_this_page;
++ }
++ while (bh != head);
++
++ out:
++ return try_to_free_buffers(page);
++}
++
++static struct address_space_operations aspace_aops = {
++ .writepage = gfs_aspace_writepage,
++ .releasepage = gfs_aspace_releasepage,
++};
++
++/**
++ * gfs_aspace_get - Get and initialize a struct inode structure
++ * @sdp: the filesystem the aspace is in
++ *
++ * Right now a struct inode is just a struct inode. Maybe Linux
++ * will supply a more lightweight address space construct (that works)
++ * in the future.
++ *
++ * Make sure pages/buffers in this aspace aren't in high memory.
++ *
++ * Returns: the aspace
++ */
++
++struct inode *
++gfs_aspace_get(struct gfs_sbd *sdp)
++{
++ struct inode *aspace;
++
++ aspace = new_inode(sdp->sd_vfs);
++ if (aspace) {
++ mapping_set_gfp_mask(aspace->i_mapping, GFP_KERNEL);
++ aspace->i_mapping->a_ops = &aspace_aops;
++ aspace->i_size = ~0ULL;
++ vn2ip(aspace) = NULL;
++ insert_inode_hash(aspace);
++ }
++
++ return aspace;
++}
++
++/**
++ * gfs_aspace_put - get rid of an aspace
++ * @aspace:
++ *
++ */
++
++void
++gfs_aspace_put(struct inode *aspace)
++{
++ remove_inode_hash(aspace);
++ iput(aspace);
++}
++
++/**
++ * gfs_ail_start_trans - Start I/O on a part of the AIL
++ * @sdp: the filesystem
++ * @tr: the part of the AIL
++ *
++ */
++
++void
++gfs_ail_start_trans(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *head, *tmp, *prev;
++ struct gfs_bufdata *bd;
++ struct buffer_head *bh;
++ int retry;
++
++ do {
++ retry = FALSE;
++
++ spin_lock(&sdp->sd_ail_lock);
++
++ for (head = &tr->tr_ail_bufs, tmp = head->prev, prev = tmp->prev;
++ tmp != head;
++ tmp = prev, prev = tmp->prev) {
++ bd = list_entry(tmp, struct gfs_bufdata, bd_ail_tr_list);
++ bh = bd->bd_bh;
++
++ if (gfs_trylock_buffer(bh))
++ continue;
++
++ if (bd->bd_pinned) {
++ gfs_unlock_buffer(bh);
++ continue;
++ }
++
++ if (!buffer_busy(bh)) {
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++
++ list_del_init(&bd->bd_ail_tr_list);
++ list_del(&bd->bd_ail_gl_list);
++
++ gfs_unlock_buffer(bh);
++ brelse(bh);
++ continue;
++ }
++
++ if (buffer_dirty(bh)) {
++ list_move(&bd->bd_ail_tr_list, head);
++
++ spin_unlock(&sdp->sd_ail_lock);
++ wait_on_buffer(bh);
++ ll_rw_block(WRITE, 1, &bh);
++ spin_lock(&sdp->sd_ail_lock);
++
++ gfs_unlock_buffer(bh);
++ retry = TRUE;
++ break;
++ }
++
++ gfs_unlock_buffer(bh);
++ }
++
++ spin_unlock(&sdp->sd_ail_lock);
++ } while (retry);
++}
++
++/**
++ * gfs_ail_empty_trans - Check whether or not a trans in the AIL has been synced
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ * Returns: TRUE if all of @tr's AIL buffers have hit the disk, FALSE otherwise
++ */
++
++int
++gfs_ail_empty_trans(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *head, *tmp, *prev;
++ struct gfs_bufdata *bd;
++ struct buffer_head *bh;
++ int ret;
++
++ spin_lock(&sdp->sd_ail_lock);
++
++ for (head = &tr->tr_ail_bufs, tmp = head->prev, prev = tmp->prev;
++ tmp != head;
++ tmp = prev, prev = tmp->prev) {
++ bd = list_entry(tmp, struct gfs_bufdata, bd_ail_tr_list);
++ bh = bd->bd_bh;
++
++ if (gfs_trylock_buffer(bh))
++ continue;
++
++ if (bd->bd_pinned || buffer_busy(bh)) {
++ gfs_unlock_buffer(bh);
++ continue;
++ }
++
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++
++ list_del_init(&bd->bd_ail_tr_list);
++ list_del(&bd->bd_ail_gl_list);
++
++ gfs_unlock_buffer(bh);
++ brelse(bh);
++ }
++
++ ret = list_empty(head);
++
++ spin_unlock(&sdp->sd_ail_lock);
++
++ return ret;
++}
++
++/**
++ * ail_empty_gl - remove all buffers for a given lock from the AIL
++ * @gl: the glock
++ *
++ * None of the buffers should be dirty, locked, or pinned.
++ */
++
++static void
++ail_empty_gl(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_bufdata *bd;
++ struct buffer_head *bh;
++
++ spin_lock(&sdp->sd_ail_lock);
++
++ while (!list_empty(&gl->gl_ail_bufs)) {
++ bd = list_entry(gl->gl_ail_bufs.next,
++ struct gfs_bufdata, bd_ail_gl_list);
++ bh = bd->bd_bh;
++
++ GFS_ASSERT_GLOCK(!bd->bd_pinned && !buffer_busy(bh), gl,
++ printk("%u %.8lX\n", bd->bd_pinned, bh->b_state););
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++
++ list_del_init(&bd->bd_ail_tr_list);
++ list_del(&bd->bd_ail_gl_list);
++
++ brelse(bh);
++ }
++
++ spin_unlock(&sdp->sd_ail_lock);
++}
++
++/**
++ * gfs_inval_buf - Invalidate all buffers associated with a glock
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_inval_buf(struct gfs_glock *gl)
++{
++ struct inode *aspace = gl->gl_aspace;
++ struct address_space *mapping = gl->gl_aspace->i_mapping;
++
++ ail_empty_gl(gl);
++
++ atomic_inc(&aspace->i_writecount);
++ truncate_inode_pages(mapping, 0);
++ atomic_dec(&aspace->i_writecount);
++
++ GFS_ASSERT_GLOCK(!mapping->nrpages, gl,);
++}
++
++/**
++ * gfs_sync_buf - Sync all buffers associated with a glock
++ * @gl: The glock
++ * @flags: DIO_START | DIO_WAIT, optionally DIO_INVISIBLE and/or DIO_CHECK
++ *
++ */
++
++void
++gfs_sync_buf(struct gfs_glock *gl, int flags)
++{
++ struct address_space *mapping = gl->gl_aspace->i_mapping;
++ int error = 0;
++
++ if (flags & DIO_START)
++ error = filemap_fdatawrite(mapping);
++ if (!error && (flags & DIO_WAIT))
++ error = filemap_fdatawait(mapping);
++ if (!error && (flags & (DIO_INVISIBLE | DIO_CHECK)) == DIO_CHECK)
++ ail_empty_gl(gl);
++
++ if (error)
++ gfs_io_error(gl->gl_sbd);
++}
++
++/**
++ * getbuf - Get a buffer with a given address space
++ * @sdp: the filesystem
++ * @aspace: the address space
++ * @blkno: the block number
++ * @create: TRUE if the buffer should be created
++ *
++ * Returns: the buffer
++ */
++
++static struct buffer_head *
++getbuf(struct gfs_sbd *sdp, struct inode *aspace, uint64_t blkno, int create)
++{
++ struct page *page;
++ struct buffer_head *bh;
++ unsigned int shift;
++ unsigned long index;
++ unsigned int bufnum;
++
++ shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
++ index = blkno >> shift;
++ bufnum = blkno - (index << shift);
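++	/* e.g. with 4096-byte pages and 1024-byte blocks: shift == 2, so
++	   block 13 lives in page 3 (13 >> 2) as buffer 1 (13 - 12) */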
++
++ if (create) {
++ RETRY_MALLOC(page = grab_cache_page(aspace->i_mapping, index), page);
++ } else {
++ page = find_lock_page(aspace->i_mapping, index);
++ if (!page)
++ return NULL;
++ }
++
++ if (!page_has_buffers(page))
++ create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
++
++ for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
++ /* Do nothing */;
++ get_bh(bh);
++
++ if (!buffer_mapped(bh))
++ map_bh(bh, sdp->sd_vfs, blkno);
++ else
++ GFS_ASSERT_SBD(bh->b_bdev == sdp->sd_vfs->s_bdev &&
++ bh->b_blocknr == blkno,
++ sdp,);
++
++ unlock_page(page);
++ page_cache_release(page);
++
++ return bh;
++}
++
++/**
++ * gfs_dgetblk - Get a block
++ * @sdp: The GFS superblock
++ * @blkno: The block number
++ * @gl: The glock associated with this block
++ *
++ * Returns: The buffer
++ */
++
++struct buffer_head *
++gfs_dgetblk(struct gfs_sbd *sdp, uint64_t blkno, struct gfs_glock *gl)
++{
++ struct buffer_head *bh;
++
++ if (gl)
++ bh = getbuf(sdp, gl->gl_aspace, blkno, CREATE);
++ else
++ bh = sb_getblk(sdp->sd_vfs, blkno);
++
++ return bh;
++}
++
++/**
++ * gfs_dread - Read a block from disk
++ * @sdp: The GFS superblock
++ * @blkno: The block number
++ * @gl: The glock covering the block
++ * @flags: flags to gfs_dreread()
++ * @bhp: the place where the buffer is returned
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_dread(struct gfs_sbd *sdp, uint64_t blkno, struct gfs_glock *gl, int flags,
++ struct buffer_head **bhp)
++{
++ int error;
++
++ *bhp = gfs_dgetblk(sdp, blkno, gl);
++ error = gfs_dreread(sdp, *bhp, flags);
++ if (error)
++ brelse(*bhp);
++
++ return error;
++}
++
++/**
++ * gfs_prep_new_buffer - Mark a new buffer we just gfs_dgetblk()ed uptodate
++ * @bh: the buffer
++ *
++ */
++
++void
++gfs_prep_new_buffer(struct buffer_head *bh)
++{
++ wait_on_buffer(bh);
++ clear_buffer_dirty(bh);
++ set_buffer_uptodate(bh);
++}
++
++/**
++ * gfs_dreread - Reread a block from disk
++ * @sdp: the filesystem
++ * @bh: The block to read
++ * @flags: Flags that control the read
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_dreread(struct gfs_sbd *sdp, struct buffer_head *bh, int flags)
++{
++ int error = 0;
++
++ if (flags & DIO_NEW) {
++ if (gfs_mhc_fish(sdp, bh))
++ return 0;
++ clear_buffer_uptodate(bh);
++ }
++
++ if (flags & DIO_FORCE)
++ clear_buffer_uptodate(bh);
++
++ if ((flags & DIO_START) && !buffer_uptodate(bh))
++ ll_rw_block(READ, 1, &bh);
++
++ if (flags & DIO_WAIT) {
++ wait_on_buffer(bh);
++
++ if (!buffer_uptodate(bh)) {
++ gfs_io_error_bh(sdp, bh);
++ error = -EIO;
++ }
++ }
++
++ return error;
++}
++
++/**
++ * gfs_dwrite - Write a buffer
++ * @sdp: the filesystem
++ * @bh: The buffer to write
++ * @flags: The type of write operation to do
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_dwrite(struct gfs_sbd *sdp, struct buffer_head *bh, int flags)
++{
++ int error = 0;
++
++ GFS_ASSERT_SBD(buffer_uptodate(bh), sdp,);
++ GFS_ASSERT_SBD(!test_bit(SDF_ROFS, &sdp->sd_flags), sdp,);
++
++ if (flags & DIO_CLEAN) {
++ lock_buffer(bh);
++ clear_buffer_dirty(bh);
++ unlock_buffer(bh);
++ }
++
++ if (flags & DIO_DIRTY)
++ mark_buffer_dirty(bh);
++
++ if ((flags & DIO_START) && buffer_dirty(bh)) {
++ wait_on_buffer(bh);
++ ll_rw_block(WRITE, 1, &bh);
++ }
++
++ if (flags & DIO_WAIT) {
++ wait_on_buffer(bh);
++
++ if (!buffer_uptodate(bh) || buffer_dirty(bh)) {
++ gfs_io_error_bh(sdp, bh);
++ error = -EIO;
++ }
++ }
++
++ return error;
++}
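++
++/*
++ * A rough summary of how the DIO_* flags compose (illustrative, not
++ * exhaustive -- see the individual routines above for details):
++ *
++ *	DIO_NEW		the block is newly allocated; try the meta-header
++ *			cache before touching the disk
++ *	DIO_FORCE	discard any cached contents and reread from disk
++ *	DIO_START	start the I/O
++ *	DIO_WAIT	wait for the I/O to complete
++ *
++ * so a plain synchronous metadata read looks like:
++ *
++ *	error = gfs_dread(sdp, blkno, gl, DIO_START | DIO_WAIT, &bh);
++ */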
++
++/**
++ * gfs_attach_bufdata - attach a struct gfs_bufdata structure to a buffer
++ * @bh: The buffer to be attached to
++ * @gl: the glock the buffer belongs to
++ *
++ */
++
++void
++gfs_attach_bufdata(struct buffer_head *bh, struct gfs_glock *gl)
++{
++ struct gfs_bufdata *bd;
++
++ lock_page(bh->b_page);
++
++ if (bh2bd(bh)) {
++ unlock_page(bh->b_page);
++ return;
++ }
++
++ RETRY_MALLOC(bd = kmem_cache_alloc(gfs_bufdata_cachep, GFP_KERNEL), bd);
++ atomic_inc(&gl->gl_sbd->sd_bufdata_count);
++
++ memset(bd, 0, sizeof(struct gfs_bufdata));
++
++ bd->bd_bh = bh;
++ bd->bd_gl = gl;
++
++ INIT_LE(&bd->bd_new_le, &gfs_buf_lops);
++ INIT_LE(&bd->bd_incore_le, &gfs_buf_lops);
++
++ init_MUTEX(&bd->bd_lock);
++
++ INIT_LIST_HEAD(&bd->bd_ail_tr_list);
++
++ bh2bd(bh) = bd;
++
++ unlock_page(bh->b_page);
++}
++
++/**
++ * gfs_is_pinned - Figure out if a buffer is pinned or not
++ * @sdp: the filesystem the buffer belongs to
++ * @bh: The buffer to check
++ *
++ * Returns: TRUE if the buffer is pinned, FALSE otherwise
++ */
++
++int
++gfs_is_pinned(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ struct gfs_bufdata *bd = bh2bd(bh);
++ int ret = FALSE;
++
++ if (bd) {
++ gfs_lock_buffer(bh);
++ if (bd->bd_pinned)
++ ret = TRUE;
++ gfs_unlock_buffer(bh);
++ }
++
++ return ret;
++}
++
++/**
++ * gfs_dpin - Pin a metadata buffer in memory
++ * @sdp: the filesystem the buffer belongs to
++ * @bh: The buffer to be pinned
++ *
++ */
++
++void
++gfs_dpin(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ struct gfs_bufdata *bd;
++ char *data;
++
++ GFS_ASSERT_SBD(buffer_uptodate(bh), sdp,);
++ GFS_ASSERT_SBD(!test_bit(SDF_ROFS, &sdp->sd_flags), sdp,);
++
++ bd = bh2bd(bh);
++ GFS_ASSERT_SBD(bd, sdp,);
++
++ gfs_lock_buffer(bh);
++
++ GFS_ASSERT_GLOCK(!bd->bd_frozen, bd->bd_gl,);
++
++ if (!bd->bd_pinned++) {
++ wait_on_buffer(bh);
++
++ /* If this buffer is in the AIL and it has already been written,
++ remove it from the AIL. */
++
++ spin_lock(&sdp->sd_ail_lock);
++ if (!list_empty(&bd->bd_ail_tr_list) && !buffer_busy(bh)) {
++ list_del_init(&bd->bd_ail_tr_list);
++ list_del(&bd->bd_ail_gl_list);
++ brelse(bh);
++ }
++ spin_unlock(&sdp->sd_ail_lock);
++
++ clear_buffer_dirty(bh);
++ wait_on_buffer(bh);
++
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++ } else {
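++		/* The buffer is already pinned by an earlier, still-active
++		   transaction; freeze a copy of its current contents so
++		   that transaction's log write sees stable data. */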
++ gfs_unlock_buffer(bh);
++
++ data = gmalloc(sdp->sd_sb.sb_bsize);
++
++ gfs_lock_buffer(bh);
++ if (bd->bd_pinned > 1) {
++ memcpy(data, bh->b_data, sdp->sd_sb.sb_bsize);
++ bd->bd_frozen = data;
++ } else
++ kfree(data);
++ }
++
++ gfs_unlock_buffer(bh);
++
++ get_bh(bh);
++}
++
++/**
++ * gfs_dunpin - Unpin a buffer
++ * @sdp: the filesystem the buffer belongs to
++ * @bh: The buffer to unpin
++ * @tr: The transaction in the AIL that contains this buffer
++ *
++ */
++
++void
++gfs_dunpin(struct gfs_sbd *sdp, struct buffer_head *bh, struct gfs_trans *tr)
++{
++ struct gfs_bufdata *bd;
++
++ GFS_ASSERT_SBD(buffer_uptodate(bh), sdp,);
++
++ bd = bh2bd(bh);
++ GFS_ASSERT_SBD(bd, sdp,);
++
++ gfs_lock_buffer(bh);
++
++ GFS_ASSERT_GLOCK(bd->bd_pinned, bd->bd_gl,);
++
++ if (bd->bd_pinned == 1)
++ mark_buffer_dirty(bh);
++
++ bd->bd_pinned--;
++
++ gfs_unlock_buffer(bh);
++
++ /* Add the buffer to the AIL
++ and get rid of an old reference if there is one */
++
++ if (tr) {
++ spin_lock(&sdp->sd_ail_lock);
++
++ if (list_empty(&bd->bd_ail_tr_list))
++ list_add(&bd->bd_ail_gl_list, &bd->bd_gl->gl_ail_bufs);
++ else {
++ list_del_init(&bd->bd_ail_tr_list);
++ brelse(bh);
++ }
++ list_add(&bd->bd_ail_tr_list, &tr->tr_ail_bufs);
++
++ spin_unlock(&sdp->sd_ail_lock);
++ } else
++ brelse(bh);
++}
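++
++/*
++ * Pinning keeps a metadata buffer from being written back to its
++ * in-place location while its log copy is outstanding.  A minimal
++ * sketch of the lifecycle (illustrative; the real callers live in the
++ * transaction and log-operations code):
++ *
++ *	gfs_dpin(sdp, bh);	   log flush begins: hold the buffer
++ *	... copy the buffer's contents into the journal ...
++ *	gfs_dunpin(sdp, bh, tr);   log write done: dirty the buffer and
++ *				   move it onto @tr's AIL for writeback
++ */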
++
++/**
++ * logbh_end_io - called at the end of a logbh write
++ * @bh: the buffer
++ * @uptodate: whether or not the write succeeded
++ *
++ * This is an I/O completion callback that may run in interrupt context,
++ * so don't do ENTER() and EXIT() tracing here.
++ *
++ */
++
++static void
++logbh_end_io(struct buffer_head *bh, int uptodate)
++{
++ if (uptodate)
++ set_buffer_uptodate(bh);
++ else
++ clear_buffer_uptodate(bh);
++ unlock_buffer(bh);
++}
++
++/**
++ * gfs_logbh_init - Initialize a fake buffer head
++ * @sdp: the filesystem
++ * @bh: the buffer to initialize
++ * @blkno: the block address of the buffer
++ * @data: the data to be written
++ *
++ */
++
++void
++gfs_logbh_init(struct gfs_sbd *sdp, struct buffer_head *bh,
++ uint64_t blkno, char *data)
++{
++ memset(bh, 0, sizeof(struct buffer_head));
++ bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
++ atomic_set(&bh->b_count, 1);
++ set_bh_page(bh, virt_to_page(data), ((unsigned long)data) & (PAGE_SIZE - 1));
++ bh->b_blocknr = blkno;
++ bh->b_size = sdp->sd_sb.sb_bsize;
++ bh->b_bdev = sdp->sd_vfs->s_bdev;
++ init_buffer(bh, logbh_end_io, NULL);
++ INIT_LIST_HEAD(&bh->b_assoc_buffers);
++}
++
++/**
++ * gfs_logbh_uninit - Clean up a fake buffer head
++ * @sdp: the filesystem
++ * @bh: the buffer to clean
++ *
++ */
++
++void
++gfs_logbh_uninit(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ GFS_ASSERT_SBD(!buffer_busy(bh) &&
++ atomic_read(&bh->b_count) == 1,
++ sdp,);
++}
++
++/**
++ * gfs_logbh_start - Start writing a fake buffer head
++ * @sdp: the filesystem
++ * @bh: the buffer to write
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_logbh_start(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ submit_bh(WRITE, bh);
++ return 0;
++}
++
++/**
++ * gfs_logbh_wait - Wait for the write of a fake buffer head to complete
++ * @sdp: the filesystem
++ * @bh: the buffer to wait on
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_logbh_wait(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ int error = 0;
++
++ wait_on_buffer(bh);
++
++ if (!buffer_uptodate(bh) || buffer_dirty(bh)) {
++ gfs_io_error_bh(sdp, bh);
++ error = -EIO;
++ }
++
++ return error;
++}
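++
++/*
++ * A minimal usage sketch for the fake buffer heads above (illustrative
++ * only):
++ *
++ *	struct buffer_head bh;
++ *
++ *	gfs_logbh_init(sdp, &bh, blkno, data);
++ *	gfs_logbh_start(sdp, &bh);
++ *	error = gfs_logbh_wait(sdp, &bh);
++ *	gfs_logbh_uninit(sdp, &bh);
++ *
++ * The buffer head lives on the caller's stack and @data supplies the
++ * block contents, so log writes don't consume page-cache pages.
++ */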
++
++/**
++ * gfs_replay_buf - write a log buffer to its in-place location
++ * @gl: the journal's glock
++ * @bh: the buffer
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_replay_buf(struct gfs_glock *gl, struct buffer_head *bh)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_bufdata *bd;
++
++ bd = bh2bd(bh);
++ if (!bd) {
++ gfs_attach_bufdata(bh, gl);
++ bd = bh2bd(bh);
++ }
++
++ mark_buffer_dirty(bh);
++
++ if (list_empty(&bd->bd_ail_tr_list)) {
++ get_bh(bh);
++ list_add(&bd->bd_ail_tr_list, &sdp->sd_recovery_bufs);
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_replay_check - Check up on journal replay
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_replay_check(struct gfs_sbd *sdp)
++{
++ struct buffer_head *bh;
++ struct gfs_bufdata *bd;
++
++ while (!list_empty(&sdp->sd_recovery_bufs)) {
++ bd = list_entry(sdp->sd_recovery_bufs.prev,
++ struct gfs_bufdata, bd_ail_tr_list);
++ bh = bd->bd_bh;
++
++ if (buffer_busy(bh)) {
++ list_move(&bd->bd_ail_tr_list,
++ &sdp->sd_recovery_bufs);
++ break;
++ } else {
++ list_del_init(&bd->bd_ail_tr_list);
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++ brelse(bh);
++ }
++ }
++}
++
++/**
++ * gfs_replay_wait - Wait for all replayed buffers to hit the disk
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_replay_wait(struct gfs_sbd *sdp)
++{
++ struct list_head *head, *tmp, *prev;
++ struct buffer_head *bh;
++ struct gfs_bufdata *bd;
++
++ for (head = &sdp->sd_recovery_bufs, tmp = head->prev, prev = tmp->prev;
++ tmp != head;
++ tmp = prev, prev = tmp->prev) {
++ bd = list_entry(tmp, struct gfs_bufdata, bd_ail_tr_list);
++ bh = bd->bd_bh;
++
++ if (!buffer_busy(bh)) {
++ list_del_init(&bd->bd_ail_tr_list);
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++ brelse(bh);
++ continue;
++ }
++
++ if (buffer_dirty(bh)) {
++ wait_on_buffer(bh);
++ ll_rw_block(WRITE, 1, &bh);
++ }
++ }
++
++ while (!list_empty(head)) {
++ bd = list_entry(head->prev, struct gfs_bufdata, bd_ail_tr_list);
++ bh = bd->bd_bh;
++
++ wait_on_buffer(bh);
++
++ GFS_ASSERT_SBD(!buffer_busy(bh), sdp,);
++
++ list_del_init(&bd->bd_ail_tr_list);
++ if (!buffer_uptodate(bh))
++ gfs_io_error_bh(sdp, bh);
++ brelse(bh);
++ }
++}
++
++/**
++ * gfs_wipe_buffers - make sure freed blocks' buffers are neither dirty nor pinned
++ * @ip: the inode that owns the buffers
++ * @rgd: the resource group the blocks are being freed from
++ * @bstart: the first buffer in the run
++ * @blen: the number of buffers in the run
++ *
++ */
++
++void
++gfs_wipe_buffers(struct gfs_inode *ip, struct gfs_rgrpd *rgd,
++ uint64_t bstart, uint32_t blen)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct inode *aspace = ip->i_gl->gl_aspace;
++ struct buffer_head *bh;
++ struct gfs_bufdata *bd;
++ int busy;
++ int add = FALSE;
++
++ while (blen) {
++ bh = getbuf(sdp, aspace, bstart, NO_CREATE);
++ if (bh) {
++
++ bd = bh2bd(bh);
++
++ if (buffer_uptodate(bh)) {
++ if (bd) {
++ gfs_lock_buffer(bh);
++ gfs_mhc_add(rgd, &bh, 1);
++ busy = bd->bd_pinned || buffer_busy(bh);
++ gfs_unlock_buffer(bh);
++
++ if (busy)
++ add = TRUE;
++ else {
++ spin_lock(&sdp->sd_ail_lock);
++ if (!list_empty(&bd->bd_ail_tr_list)) {
++ list_del_init(&bd->bd_ail_tr_list);
++ list_del(&bd->bd_ail_gl_list);
++ brelse(bh);
++ }
++ spin_unlock(&sdp->sd_ail_lock);
++ }
++ } else {
++ GFS_ASSERT_INODE(!buffer_dirty(bh), ip,);
++ wait_on_buffer(bh);
++ GFS_ASSERT_INODE(!buffer_busy(bh), ip,);
++ gfs_mhc_add(rgd, &bh, 1);
++ }
++ } else {
++ GFS_ASSERT_INODE(!bd || !bd->bd_pinned, ip,);
++ GFS_ASSERT_INODE(!buffer_dirty(bh), ip,);
++ wait_on_buffer(bh);
++ GFS_ASSERT_INODE(!buffer_busy(bh), ip,);
++ }
++
++ brelse(bh);
++ }
++
++ bstart++;
++ blen--;
++ }
++
++ if (add)
++ gfs_depend_add(rgd, ip->i_num.no_formal_ino);
++}
++
++/**
++ * gfs_sync_meta - sync all the buffers in a filesystem
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_sync_meta(struct gfs_sbd *sdp)
++{
++ gfs_log_flush(sdp);
++ for (;;) {
++ gfs_ail_start(sdp, DIO_ALL);
++ if (gfs_ail_empty(sdp))
++ break;
++
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ / 10);
++ }
++}
++
++/**
++ * gfs_flush_meta_cache - get rid of any references on buffers for this inode
++ * @ip: The GFS inode
++ *
++ */
++
++void
++gfs_flush_meta_cache(struct gfs_inode *ip)
++{
++ struct buffer_head **bh_slot;
++ unsigned int x;
++
++ spin_lock(&ip->i_lock);
++
++ for (x = 0; x < GFS_MAX_META_HEIGHT; x++) {
++ bh_slot = &ip->i_cache[x];
++ if (*bh_slot) {
++ brelse(*bh_slot);
++ *bh_slot = NULL;
++ }
++ }
++
++ spin_unlock(&ip->i_lock);
++}
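++
++/* ip->i_cache[] caches at most one buffer head per height of the
++   metadata tree; gfs_get_meta_buffer() below keeps it current and
++   gfs_flush_meta_cache() above drops the references. */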
++
++/**
++ * gfs_get_meta_buffer - Get a metadata buffer
++ * @ip: The GFS inode
++ * @height: The height in the metadata tree (0 is the dinode itself)
++ * @num: The block number (device relative) of the buffer
++ * @new: Non-zero if we may create a new buffer
++ * @bhp: the buffer is returned here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_get_meta_buffer(struct gfs_inode *ip, int height, uint64_t num, int new,
++ struct buffer_head **bhp)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh, **bh_slot = &ip->i_cache[height];
++ int flags = ((new) ? DIO_NEW : 0) | DIO_START | DIO_WAIT;
++ int error;
++
++ spin_lock(&ip->i_lock);
++ bh = *bh_slot;
++ if (bh) {
++ if (bh->b_blocknr == num)
++ get_bh(bh);
++ else
++ bh = NULL;
++ }
++ spin_unlock(&ip->i_lock);
++
++ if (bh) {
++ error = gfs_dreread(sdp, bh, flags);
++ if (error) {
++ brelse(bh);
++ return error;
++ }
++ } else {
++ error = gfs_dread(sdp, num, ip->i_gl, flags, &bh);
++ if (error)
++ return error;
++
++ spin_lock(&ip->i_lock);
++ if (*bh_slot != bh) {
++ if (*bh_slot)
++ brelse(*bh_slot);
++ *bh_slot = bh;
++ get_bh(bh);
++ }
++ spin_unlock(&ip->i_lock);
++ }
++
++ if (new) {
++ GFS_ASSERT_INODE(height, ip,);
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_IN, GFS_FORMAT_IN);
++ gfs_buffer_clear_tail(bh, sizeof(struct gfs_meta_header));
++ } else
++ gfs_metatype_check(sdp, bh,
++ (height) ? GFS_METATYPE_IN : GFS_METATYPE_DI);
++
++ *bhp = bh;
++
++ return 0;
++}
++
++/**
++ * gfs_get_data_buffer - Get a data buffer
++ * @ip: The GFS inode
++ * @block: The block number (device relative) of the data block
++ * @new: Non-zero if this is a new allocation
++ * @bhp: the buffer is returned here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_get_data_buffer(struct gfs_inode *ip, uint64_t block, int new,
++ struct buffer_head **bhp)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ int error = 0;
++
++ if (block == ip->i_num.no_addr) {
++ GFS_ASSERT_INODE(!new, ip,);
++
++ error = gfs_dread(sdp, block, ip->i_gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_DI);
++ } else if (gfs_is_jdata(ip)) {
++ if (new) {
++ error = gfs_dread(sdp, block, ip->i_gl,
++ DIO_NEW | DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_JD, GFS_FORMAT_JD);
++ gfs_buffer_clear_tail(bh, sizeof(struct gfs_meta_header));
++ } else {
++ error = gfs_dread(sdp, block, ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_JD);
++ }
++ } else {
++ if (new) {
++ bh = gfs_dgetblk(sdp, block, ip->i_gl);
++ gfs_prep_new_buffer(bh);
++ } else {
++ error = gfs_dread(sdp, block, ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++ }
++ }
++
++ *bhp = bh;
++
++ return 0;
++}
++
++/**
++ * gfs_start_ra - start readahead on an extent of a file
++ * @gl: the glock the blocks belong to
++ * @dblock: the starting disk block
++ * @extlen: the number of blocks in the extent
++ *
++ */
++
++void
++gfs_start_ra(struct gfs_glock *gl, uint64_t dblock, uint32_t extlen)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct inode *aspace = gl->gl_aspace;
++ struct buffer_head *first_bh, *bh;
++ uint32_t max_ra = sdp->sd_tune.gt_max_readahead >> sdp->sd_sb.sb_bsize_shift;
++ int error;
++
++ GFS_ASSERT_GLOCK(extlen, gl,);
++ if (!max_ra)
++ return;
++ if (extlen > max_ra)
++ extlen = max_ra;
++
++ first_bh = getbuf(sdp, aspace, dblock, CREATE);
++
++ if (buffer_uptodate(first_bh))
++ goto out;
++ if (!buffer_locked(first_bh)) {
++ error = gfs_dreread(sdp, first_bh, DIO_START);
++ if (error)
++ goto out;
++ }
++
++ dblock++;
++ extlen--;
++
++ while (extlen) {
++ bh = getbuf(sdp, aspace, dblock, CREATE);
++
++ if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
++ error = gfs_dreread(sdp, bh, DIO_START);
++ brelse(bh);
++ if (error)
++ goto out;
++ } else
++ brelse(bh);
++
++ dblock++;
++ extlen--;
++
++ if (buffer_uptodate(first_bh))
++ break;
++ }
++
++ out:
++ brelse(first_bh);
++}
+diff -urN linux-orig/fs/gfs/dio.h linux-patched/fs/gfs/dio.h
+--- linux-orig/fs/gfs/dio.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/dio.h 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,195 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DIO_DOT_H__
++#define __DIO_DOT_H__
++
++void gfs_ail_start_trans(struct gfs_sbd *sdp, struct gfs_trans *tr);
++int gfs_ail_empty_trans(struct gfs_sbd *sdp, struct gfs_trans *tr);
++
++/* Asynchronous I/O Routines */
++
++struct buffer_head *gfs_dgetblk(struct gfs_sbd *sdp, uint64_t blkno,
++ struct gfs_glock *gl);
++int gfs_dread(struct gfs_sbd *sdp, uint64_t blkno, struct gfs_glock *gl,
++ int flags, struct buffer_head **bhp);
++
++void gfs_prep_new_buffer(struct buffer_head *bh);
++int gfs_dreread(struct gfs_sbd *sdp, struct buffer_head *bh, int flags);
++int gfs_dwrite(struct gfs_sbd *sdp, struct buffer_head *bh, int flags);
++
++void gfs_attach_bufdata(struct buffer_head *bh, struct gfs_glock *gl);
++int gfs_is_pinned(struct gfs_sbd *sdp, struct buffer_head *bh);
++void gfs_dpin(struct gfs_sbd *sdp, struct buffer_head *bh);
++void gfs_dunpin(struct gfs_sbd *sdp, struct buffer_head *bh,
++ struct gfs_trans *tr);
++
++static __inline__
++void gfs_lock_buffer(struct buffer_head *bh)
++{
++ struct gfs_bufdata *bd = bh2bd(bh);
++ down(&bd->bd_lock);
++}
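++/* Note: down_trylock() semantics -- returns zero when the lock was
++   acquired and non-zero when it was not, the opposite sense of most
++   *_trylock() helpers.  Callers treat non-zero as "skip this buffer". */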
++static __inline__
++int gfs_trylock_buffer(struct buffer_head *bh)
++{
++ struct gfs_bufdata *bd = bh2bd(bh);
++ return down_trylock(&bd->bd_lock);
++}
++static __inline__
++void gfs_unlock_buffer(struct buffer_head *bh)
++{
++ struct gfs_bufdata *bd = bh2bd(bh);
++ up(&bd->bd_lock);
++}
++
++void gfs_logbh_init(struct gfs_sbd *sdp, struct buffer_head *bh, uint64_t blkno,
++ char *data);
++void gfs_logbh_uninit(struct gfs_sbd *sdp, struct buffer_head *bh);
++int gfs_logbh_start(struct gfs_sbd *sdp, struct buffer_head *bh);
++int gfs_logbh_wait(struct gfs_sbd *sdp, struct buffer_head *bh);
++
++int gfs_replay_buf(struct gfs_glock *gl, struct buffer_head *bh);
++void gfs_replay_check(struct gfs_sbd *sdp);
++void gfs_replay_wait(struct gfs_sbd *sdp);
++
++void gfs_wipe_buffers(struct gfs_inode *ip, struct gfs_rgrpd *rgd,
++ uint64_t bstart, uint32_t blen);
++
++void gfs_sync_meta(struct gfs_sbd *sdp);
++
++/* Buffer Caching routines */
++
++int gfs_get_meta_buffer(struct gfs_inode *ip, int height, uint64_t num, int new,
++ struct buffer_head **bhp);
++int gfs_get_data_buffer(struct gfs_inode *ip, uint64_t block, int new,
++ struct buffer_head **bhp);
++void gfs_start_ra(struct gfs_glock *gl, uint64_t dblock, uint32_t extlen);
++
++static __inline__ int
++gfs_get_inode_buffer(struct gfs_inode *ip, struct buffer_head **bhp)
++{
++ return gfs_get_meta_buffer(ip, 0, ip->i_num.no_addr, FALSE, bhp);
++}
++
++struct inode *gfs_aspace_get(struct gfs_sbd *sdp);
++void gfs_aspace_put(struct inode *aspace);
++
++void gfs_inval_buf(struct gfs_glock *gl);
++void gfs_sync_buf(struct gfs_glock *gl, int flags);
++
++void gfs_flush_meta_cache(struct gfs_inode *ip);
++
++/* Buffer Content Functions */
++
++/**
++ * gfs_buffer_clear - Zeros out a buffer
++ * @bh: The buffer to zero
++ *
++ */
++
++static __inline__ void
++gfs_buffer_clear(struct buffer_head *bh)
++{
++ memset(bh->b_data, 0, bh->b_size);
++}
++
++/**
++ * gfs_buffer_clear_tail - Clear buffer beyond the dinode
++ * @bh: The buffer containing the on-disk inode
++ * @head: the size of the head of the buffer
++ *
++ * Clears the part of the block that lies beyond the header, i.e. the
++ * data part of a stuffed dinode, or the top level of metadata of a
++ * non-stuffed dinode.
++ */
++
++static __inline__ void
++gfs_buffer_clear_tail(struct buffer_head *bh, int head)
++{
++ memset(bh->b_data + head, 0, bh->b_size - head);
++}
++
++/**
++ * gfs_buffer_clear_ends - Zero out any bits of a buffer which are not being written
++ * @bh: The buffer
++ * @offset: Offset in buffer where write starts
++ * @amount: Amount of data being written
++ * @journaled: TRUE if this is a journaled buffer
++ *
++ */
++
++static __inline__ void
++gfs_buffer_clear_ends(struct buffer_head *bh, int offset, int amount,
++ int journaled)
++{
++ int z_off1 = (journaled) ? sizeof(struct gfs_meta_header) : 0;
++ int z_len1 = offset - z_off1;
++ int z_off2 = offset + amount;
++ int z_len2 = (bh)->b_size - z_off2;
++
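++	/* e.g. a 512-byte write at offset 1024 into a 4096-byte
++	   non-journaled block zeroes bytes [0, 1024) and [1536, 4096) */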
++ if (z_len1)
++ memset(bh->b_data + z_off1, 0, z_len1);
++
++ if (z_len2)
++ memset(bh->b_data + z_off2, 0, z_len2);
++}
++
++/**
++ * gfs_buffer_copy_tail - copies the tail of one buffer to another
++ * @to_bh: the buffer to copy to
++ * @to_head: the size of the head of to_bh
++ * @from_bh: the buffer to copy from
++ * @from_head: the size of the head of from_bh
++ *
++ * from_head is guaranteed to be bigger than to_head
++ */
++
++static __inline__ void
++gfs_buffer_copy_tail(struct buffer_head *to_bh, int to_head,
++ struct buffer_head *from_bh, int from_head)
++{
++ memcpy(to_bh->b_data + to_head,
++ from_bh->b_data + from_head,
++ from_bh->b_size - from_head);
++ memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
++ 0,
++ from_head - to_head);
++}
++
++/**
++ * gfs_buffer_print - print a buffer to the debug console
++ * @bh: the buffer
++ * @string: what to print before the contents of the buffer
++ *
++ */
++
++static __inline__ void
++gfs_buffer_print(struct buffer_head *bh, char *string)
++{
++ unsigned int x, size = (bh)->b_size;
++ unsigned char *c = (bh)->b_data;
++
++ printk("%s\n", string);
++
++ for (x = 0; x < size; x++) {
++ printk("%.2X ", c[x]);
++ if (x % 16 == 15)
++ printk("\n");
++ }
++ if (x % 16 != 0)
++ printk("\n");
++}
++
++#endif /* __DIO_DOT_H__ */
+diff -urN linux-orig/fs/gfs/dir.c linux-patched/fs/gfs/dir.c
+--- linux-orig/fs/gfs/dir.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/dir.c 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,2273 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++* Implements Extendible Hashing as described in:
++* "Extendible Hashing" by Fagin, et al in
++* __ACM Trans. on Database Systems__, Sept 1979.
++*
++*
++* Here's the layout of dirents which is essentially the same as that of ext2
++* within a single block. The field de_name_len is the number of bytes
++* actually required for the name (no null terminator). The field de_rec_len
++* is the number of bytes allocated to the dirent. The offset of the next
++* dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
++* deleted, the preceding dirent inherits its allocated space, ie
++* prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
++* by adding de_rec_len to the current dirent, this essentially causes the
++* deleted dirent to get jumped over when iterating through all the dirents.
++* When deleting the first dirent in a block, there is no previous dirent so
++* the field de_ino is set to zero to designate it as deleted. When allocating
++* a dirent, gfs_dirent_alloc iterates through the dirents in a block. If the
++* first dirent has (de_ino == 0) and de_rec_len is large enough, this first
++* dirent is allocated. Otherwise it must go through all the 'used' dirents
++* searching for one in which the amount of total space minus the amount of
++* used space will provide enough space for the new dirent.
++* There are two types of blocks in which dirents reside. In a stuffed dinode,
++* the dirents begin at offset sizeof(struct gfs_dinode) from the beginning of the block.
++* In leaves, they begin at offset sizeof (struct gfs_leaf) from the beginning of the
++* leaf block. The dirents reside in leaves when
++*
++* dip->i_di.di_regime == GFS_DIR_EXHASH.
++*
++* The dirents are in the stuffed dinode when dip->i_di.di_regime == GFS_DIR_LINEAR.
++* When the dirents are in leaves, the actual contents of the directory file are
++* used as an array of 64-bit block pointers pointing to the leaf blocks. The
++* dirents are NOT in the directory file itself. There can be more than one block
++* pointer in the array that points to the same leaf. In fact, when a directory is
++* first converted from linear to exhash, all of the pointers point to the same
++* leaf. When a leaf is completely full, the size of the hash table can be doubled
++* unless it is already at the maximum size which is hard coded into
++* GFS_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list but
++* never before the maximum hash table size has been reached.
++*/
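++
++/*
++* A worked example (illustrative numbers only): with di_depth == 2, the
++* hash table holds 1 << 2 == 4 leaf pointers, and a name whose hash is
++* 0xC0000000 selects slot
++*
++*   index = hash >> (32 - di_depth) = 0xC0000000 >> 30 = 3.
++*
++* Within a block, dirents are walked by record length:
++*
++*   struct gfs_dirent *dent = first dirent in the block;
++*   char *end = bh->b_data + bh->b_size;
++*
++*   while ((char *)dent + gfs16_to_cpu(dent->de_rec_len) < end)
++*           dent = (struct gfs_dirent *)((char *)dent +
++*                                        gfs16_to_cpu(dent->de_rec_len));
++*
++* which is exactly how a deleted entry, whose space was merged into its
++* predecessor's de_rec_len, gets skipped.
++*/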
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "dir.h"
++#include "file.h"
++#include "glock.h"
++#include "inode.h"
++#include "ioctl.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++
++#define IS_LEAF (1)
++#define IS_DINODE (2)
++
++#if 1
++#define gfs_dir_hash2offset(h) (((uint64_t)(h)) >> 1)
++#define gfs_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p)) << 1))
++#else
++#define gfs_dir_hash2offset(h) (((uint64_t)(h)))
++#define gfs_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p))))
++#endif
++
++typedef int (*leaf_call_t) (struct gfs_inode *dip,
++ uint32_t index, uint32_t len, uint64_t leaf_no,
++ void *data);
++
++/**
++ * gfs_filecmp - Compare two filenames
++ * @file1: The first filename
++ * @file2: The second filename
++ * @len_of_file2: The length of the second filename
++ *
++ * This routine compares two filenames and returns TRUE if they are equal.
++ *
++ * Returns: TRUE (!=0) if the files are the same, otherwise FALSE (0).
++ */
++
++int
++gfs_filecmp(struct qstr *file1, char *file2, int len_of_file2)
++{
++ if (file1->len != len_of_file2)
++ return FALSE;
++ if (memcmp(file1->name, file2, file1->len))
++ return FALSE;
++ return TRUE;
++}
++
++/**
++ * dirent_first - Return the first dirent
++ * @dip: the directory
++ * @bh: The buffer
++ * @dent: Pointer to list of dirents
++ *
++ * Return the first dirent, whether bh points to a leaf or a stuffed dinode
++ *
++ * Returns: IS_LEAF or IS_DINODE
++ */
++
++static int
++dirent_first(struct gfs_inode *dip, struct buffer_head *bh,
++ struct gfs_dirent **dent)
++{
++ struct gfs_meta_header *h = (struct gfs_meta_header *)bh->b_data;
++
++ if (gfs32_to_cpu(h->mh_type) == GFS_METATYPE_LF) {
++ gfs_meta_check(dip->i_sbd, bh);
++ *dent = (struct gfs_dirent *)(bh->b_data + sizeof(struct gfs_leaf));
++ return IS_LEAF;
++ } else {
++ gfs_metatype_check(dip->i_sbd, bh, GFS_METATYPE_DI);
++ *dent = (struct gfs_dirent *)(bh->b_data + sizeof(struct gfs_dinode));
++ return IS_DINODE;
++ }
++}
++
++/**
++ * dirent_next - Next dirent
++ * @dip: the directory
++ * @bh: The buffer
++ * @dent: Pointer to list of dirents
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++dirent_next(struct gfs_inode *dip, struct buffer_head *bh,
++ struct gfs_dirent **dent)
++{
++ struct gfs_dirent *tmp, *cur;
++ char *bh_end;
++ uint32_t cur_rec_len;
++
++ cur = *dent;
++ bh_end = bh->b_data + bh->b_size;
++
++ cur_rec_len = gfs16_to_cpu(cur->de_rec_len);
++
++ if ((char *)cur + cur_rec_len >= bh_end) {
++ GFS_ASSERT_INODE((char *)cur + cur_rec_len == bh_end, dip,);
++ return -ENOENT;
++ }
++
++ tmp = (struct gfs_dirent *)((char *)cur + cur_rec_len);
++
++ GFS_ASSERT_INODE((char *)tmp + gfs16_to_cpu(tmp->de_rec_len) <= bh_end,
++ dip,);
++ /* Only the first dent could ever have de_ino == 0 */
++ GFS_ASSERT_INODE(tmp->de_inum.no_formal_ino, dip,);
++
++ *dent = tmp;
++
++ return 0;
++}
++
++/**
++ * dirent_del - Delete a dirent
++ * @dip: The GFS inode
++ * @bh: The buffer
++ * @prev: The previous dirent
++ * @cur: The current dirent
++ *
++ */
++
++static void
++dirent_del(struct gfs_inode *dip, struct buffer_head *bh,
++ struct gfs_dirent *prev, struct gfs_dirent *cur)
++{
++ uint32_t cur_rec_len, prev_rec_len;
++
++ GFS_ASSERT_INODE(cur->de_inum.no_formal_ino, dip,);
++
++ gfs_trans_add_bh(dip->i_gl, bh);
++
++ /* If there is no prev entry, this is the first entry in the block.
++ The de_rec_len is already as big as it needs to be. Just zero
++ out the inode number and return. */
++
++ if (!prev) {
++		cur->de_inum.no_formal_ino = 0;	/* No endianness worries */
++ return;
++ }
++
++ /* Combine this dentry with the previous one. */
++
++ prev_rec_len = gfs16_to_cpu(prev->de_rec_len);
++ cur_rec_len = gfs16_to_cpu(cur->de_rec_len);
++
++ GFS_ASSERT_INODE((char *)prev + prev_rec_len == (char *)cur, dip,);
++ GFS_ASSERT_INODE((char *)cur + cur_rec_len <=
++ bh->b_data + bh->b_size, dip,);
++
++ prev_rec_len += cur_rec_len;
++ prev->de_rec_len = cpu_to_gfs16(prev_rec_len);
++}
++
++/**
++ * gfs_dirent_alloc - Allocate a directory entry
++ * @dip: The GFS inode
++ * @bh: The buffer
++ * @name_len: The length of the name
++ * @dent_out: Pointer to list of dirents
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_dirent_alloc(struct gfs_inode *dip, struct buffer_head *bh, int name_len,
++ struct gfs_dirent **dent_out)
++{
++ struct gfs_dirent *dent, *new;
++ unsigned int rec_len = GFS_DIRENT_SIZE(name_len);
++ unsigned int entries = 0, offset = 0, x = 0;
++ int type;
++
++ type = dirent_first(dip, bh, &dent);
++
++ if (type == IS_LEAF) {
++ struct gfs_leaf *leaf = (struct gfs_leaf *)bh->b_data;
++ entries = gfs16_to_cpu(leaf->lf_entries);
++ offset = sizeof(struct gfs_leaf);
++ } else {
++ struct gfs_dinode *dinode = (struct gfs_dinode *)bh->b_data;
++ entries = gfs32_to_cpu(dinode->di_entries);
++ offset = sizeof(struct gfs_dinode);
++ }
++
++ if (!entries) {
++ gfs_trans_add_bh(dip->i_gl, bh);
++
++ dent->de_rec_len = bh->b_size - offset;
++ dent->de_rec_len = cpu_to_gfs16(dent->de_rec_len);
++ dent->de_name_len = cpu_to_gfs16(name_len);
++
++ *dent_out = dent;
++ return 0;
++ }
++
++ do {
++ uint32_t cur_rec_len, cur_name_len;
++
++ cur_rec_len = gfs16_to_cpu(dent->de_rec_len);
++ cur_name_len = gfs16_to_cpu(dent->de_name_len);
++
++ if ((!dent->de_inum.no_formal_ino && cur_rec_len >= rec_len) ||
++ (cur_rec_len >= GFS_DIRENT_SIZE(cur_name_len) + rec_len)) {
++ gfs_trans_add_bh(dip->i_gl, bh);
++
++ if (dent->de_inum.no_formal_ino) {
++ new = (struct gfs_dirent *)((char *)dent +
++ GFS_DIRENT_SIZE(cur_name_len));
++ memset(new, 0, sizeof(struct gfs_dirent));
++
++ new->de_rec_len = cpu_to_gfs16(cur_rec_len -
++ GFS_DIRENT_SIZE(cur_name_len));
++ new->de_name_len = cpu_to_gfs16(name_len);
++
++ dent->de_rec_len = cur_rec_len - gfs16_to_cpu(new->de_rec_len);
++ dent->de_rec_len = cpu_to_gfs16(dent->de_rec_len);
++
++ *dent_out = new;
++ return 0;
++ }
++
++ dent->de_name_len = cpu_to_gfs16(name_len);
++
++ *dent_out = dent;
++ return 0;
++ }
++
++ GFS_ASSERT_INODE(x < entries, dip,);
++
++ if (dent->de_inum.no_formal_ino)
++ x++;
++ }
++ while (dirent_next(dip, bh, &dent) == 0);
++
++ return -ENOSPC;
++}
++
++/**
++ * dirent_fits - See if we can fit an entry in this buffer
++ * @dip: The GFS inode
++ * @bh: The buffer
++ * @name_len: The length of the name
++ *
++ * Returns: TRUE if it can fit, FALSE otherwise
++ */
++
++static int
++dirent_fits(struct gfs_inode *dip, struct buffer_head *bh, int name_len)
++{
++ struct gfs_dirent *dent;
++ unsigned int rec_len = GFS_DIRENT_SIZE(name_len);
++ unsigned int entries = 0, x = 0;
++ int type;
++
++ type = dirent_first(dip, bh, &dent);
++
++ if (type == IS_LEAF) {
++ struct gfs_leaf *leaf = (struct gfs_leaf *)bh->b_data;
++ entries = gfs16_to_cpu(leaf->lf_entries);
++ } else {
++ struct gfs_dinode *dinode = (struct gfs_dinode *)bh->b_data;
++ entries = gfs32_to_cpu(dinode->di_entries);
++ }
++
++ if (!entries)
++ return TRUE;
++
++ do {
++ uint32_t cur_rec_len, cur_name_len;
++
++ cur_rec_len = gfs16_to_cpu(dent->de_rec_len);
++ cur_name_len = gfs16_to_cpu(dent->de_name_len);
++
++ if ((!dent->de_inum.no_formal_ino && cur_rec_len >= rec_len) ||
++ (cur_rec_len >= GFS_DIRENT_SIZE(cur_name_len) + rec_len))
++ return TRUE;
++
++ GFS_ASSERT_INODE(x < entries, dip,);
++
++ if (dent->de_inum.no_formal_ino)
++ x++;
++ }
++ while (dirent_next(dip, bh, &dent) == 0);
++
++ return FALSE;
++}
++
++/**
++ * leaf_search - look for a name in a leaf (or stuffed dinode) block
++ * @dip: the directory
++ * @bh: the buffer holding the block to search
++ * @filename: the name to look for
++ * @dent_out: the dirent, if found
++ * @dent_prev: if non-NULL, the dirent preceding @dent_out is returned here
++ *
++ * Returns: 0 if found, -ENOENT if not
++ */
++
++static int
++leaf_search(struct gfs_inode *dip,
++ struct buffer_head *bh, struct qstr *filename,
++ struct gfs_dirent **dent_out, struct gfs_dirent **dent_prev)
++{
++ uint32_t hash;
++ struct gfs_dirent *dent, *prev = NULL;
++ unsigned int entries = 0, x = 0;
++ int type;
++
++ type = dirent_first(dip, bh, &dent);
++
++ if (type == IS_LEAF) {
++ struct gfs_leaf *leaf = (struct gfs_leaf *)bh->b_data;
++ entries = gfs16_to_cpu(leaf->lf_entries);
++ } else if (type == IS_DINODE) {
++ struct gfs_dinode *dinode = (struct gfs_dinode *)bh->b_data;
++ entries = gfs32_to_cpu(dinode->di_entries);
++ }
++
++ hash = gfs_dir_hash(filename->name, filename->len);
++
++ do {
++ if (!dent->de_inum.no_formal_ino) {
++ prev = dent;
++ continue;
++ }
++
++ if (gfs32_to_cpu(dent->de_hash) == hash &&
++ gfs_filecmp(filename, (char *)(dent + 1),
++ gfs16_to_cpu(dent->de_name_len))) {
++ *dent_out = dent;
++ if (dent_prev)
++ *dent_prev = prev;
++
++ return 0;
++ }
++
++ GFS_ASSERT_INODE(x < entries, dip,);
++ x++;
++ prev = dent;
++ }
++ while (dirent_next(dip, bh, &dent) == 0);
++
++ return -ENOENT;
++}
++
++/**
++ * get_leaf - Read in a directory leaf block
++ * @dip: the directory
++ * @leaf_no: the block number of the leaf
++ * @bhp: the leaf's buffer is returned here
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++get_leaf(struct gfs_inode *dip, uint64_t leaf_no, struct buffer_head **bhp)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ int error;
++
++ error = gfs_dread(sdp, leaf_no, dip->i_gl, DIO_START | DIO_WAIT, bhp);
++ if (!error)
++ gfs_metatype_check(sdp, *bhp, GFS_METATYPE_LF);
++
++ return error;
++}
++
++/**
++ * get_leaf_nr - Get a leaf number associated with the index
++ * @dip: The GFS inode
++ * @index: the slot in the hash table
++ * @leaf_out: the leaf block number is returned here
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++get_leaf_nr(struct gfs_inode *dip, uint32_t index, uint64_t *leaf_out)
++{
++ uint64_t leaf_no;
++ int error;
++
++ error = gfs_internal_read(dip, (char *)&leaf_no,
++ index * sizeof(uint64_t),
++ sizeof(uint64_t));
++ if (error != sizeof(uint64_t))
++ return (error < 0) ? error : -EIO;
++
++ *leaf_out = gfs64_to_cpu(leaf_no);
++
++ return 0;
++}
++
++/**
++ * get_first_leaf - Get first leaf
++ * @dip: The GFS inode
++ * @index: the slot in the hash table
++ * @bh_out: the first leaf's buffer is returned here
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++get_first_leaf(struct gfs_inode *dip, uint32_t index,
++ struct buffer_head **bh_out)
++{
++ uint64_t leaf_no;
++ int error;
++
++ error = get_leaf_nr(dip, index, &leaf_no);
++ if (!error)
++ error = get_leaf(dip, leaf_no, bh_out);
++
++ return error;
++}
++
++/**
++ * get_next_leaf - Get next leaf
++ * @dip: The GFS inode
++ * @bh_in: the buffer holding the current leaf
++ * @bh_out: the next chained leaf's buffer is returned here
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++get_next_leaf(struct gfs_inode *dip, struct buffer_head *bh_in,
++ struct buffer_head **bh_out)
++{
++ struct gfs_leaf *leaf;
++ int error;
++
++ leaf = (struct gfs_leaf *)bh_in->b_data;
++
++ if (!leaf->lf_next)
++ error = -ENOENT;
++ else
++ error = get_leaf(dip, gfs64_to_cpu(leaf->lf_next), bh_out);
++
++ return error;
++}
++
++/**
++ * linked_leaf_search - Linked leaf search
++ * @dip: The GFS inode
++ * @filename: The filename to search for
++ * @dent_out: the dirent, if found
++ * @dent_prev: the preceding dirent, if requested
++ * @bh_out: the buffer holding the dirent is returned here
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++linked_leaf_search(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_dirent **dent_out, struct gfs_dirent **dent_prev,
++ struct buffer_head **bh_out)
++{
++ struct buffer_head *bh = NULL, *bh_next;
++ uint32_t hsize, index;
++ uint32_t hash;
++ int error;
++
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size, dip,);
++
++ /* Figure out the address of the leaf node. */
++
++ hash = gfs_dir_hash(filename->name, filename->len);
++ index = hash >> (32 - dip->i_di.di_depth);
++
++ error = get_first_leaf(dip, index, &bh_next);
++ if (error)
++ return error;
++
++ /* Find the entry */
++
++ do {
++ if (bh)
++ brelse(bh);
++
++ bh = bh_next;
++
++ error = leaf_search(dip, bh, filename, dent_out, dent_prev);
++ switch (error) {
++ case 0:
++ *bh_out = bh;
++ return 0;
++
++ case -ENOENT:
++ break;
++
++ default:
++ brelse(bh);
++ return error;
++ }
++
++ error = get_next_leaf(dip, bh, &bh_next);
++ }
++ while (!error);
++
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * dir_make_exhash - Convert a stuffed directory into an ExHash directory
++ * @dip: The GFS inode
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++dir_make_exhash(struct gfs_inode *dip)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_dirent *dent;
++ struct buffer_head *bh, *dibh;
++ struct gfs_leaf *leaf;
++ int y;
++ uint32_t x;
++ uint64_t *lp, bn;
++ int error;
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ /* Allocate a new block for the first leaf node */
++
++ error = gfs_metaalloc(dip, &bn);
++ if (error)
++ goto fail;
++
++ /* Turn over a new leaf */
++
++ error = gfs_dread(sdp, bn, dip->i_gl, DIO_NEW | DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto fail;
++
++ gfs_trans_add_bh(dip->i_gl, bh);
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_LF, GFS_FORMAT_LF);
++ gfs_buffer_clear_tail(bh, sizeof(struct gfs_meta_header));
++
++ /* Fill in the leaf structure */
++
++ leaf = (struct gfs_leaf *)bh->b_data;
++
++ GFS_ASSERT_INODE(dip->i_di.di_entries < (1 << 16), dip,);
++
++ leaf->lf_dirent_format = cpu_to_gfs32(GFS_FORMAT_DE);
++ leaf->lf_entries = cpu_to_gfs16(dip->i_di.di_entries);
++
++ /* Copy dirents */
++
++ gfs_buffer_copy_tail(bh, sizeof(struct gfs_leaf), dibh,
++ sizeof(struct gfs_dinode));
++
++ /* Find last entry */
++
++ x = 0;
++ dirent_first(dip, bh, &dent);
++
++ do {
++ if (!dent->de_inum.no_formal_ino)
++ continue;
++ if (++x == dip->i_di.di_entries)
++ break;
++ }
++ while (dirent_next(dip, bh, &dent) == 0);
++
++ /* Adjust the last dirent's record length
++ (Remember that dent still points to the last entry.) */
++
++ dent->de_rec_len = gfs16_to_cpu(dent->de_rec_len) +
++ sizeof(struct gfs_dinode) -
++ sizeof(struct gfs_leaf);
++ dent->de_rec_len = cpu_to_gfs16(dent->de_rec_len);
++
++ brelse(bh);
++
++ /* We're done with the new leaf block, now setup the new
++ hash table. */
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++ gfs_buffer_clear_tail(dibh, sizeof (struct gfs_dinode));
++
++ lp = (uint64_t *)(dibh->b_data + sizeof(struct gfs_dinode));
++
++ for (x = sdp->sd_hash_ptrs; x--; lp++)
++ *lp = cpu_to_gfs64(bn);
++
++ dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
++ dip->i_di.di_blocks++;
++ dip->i_di.di_flags |= GFS_DIF_EXHASH;
++ dip->i_di.di_payload_format = 0;
++
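++	/* set di_depth to log2(the number of pointers in the hash table) */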
++ for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
++ dip->i_di.di_depth = y;
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ return 0;
++
++ fail:
++ brelse(dibh);
++ return error;
++}
++
++/**
++ * dir_split_leaf - Split a leaf block into two
++ * @dip: The GFS inode
++ * @index: a hash table slot that points to the leaf being split
++ * @leaf_no: the block number of the leaf to split
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_split_leaf(struct gfs_inode *dip, uint32_t index, uint64_t leaf_no)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *nbh, *obh, *dibh;
++ struct gfs_leaf *nleaf, *oleaf;
++ struct gfs_dirent *dent, *prev = NULL, *next = NULL, *new;
++ uint32_t start, len, half_len, divider;
++ uint64_t bn, *lp;
++ uint32_t name_len;
++ int x, moved = FALSE;
++ int error;
++
++ /* Allocate the new leaf block */
++
++ error = gfs_metaalloc(dip, &bn);
++ if (error)
++ return error;
++
++ /* Get the new leaf block */
++
++ error = gfs_dread(sdp, bn, dip->i_gl,
++ DIO_NEW | DIO_START | DIO_WAIT, &nbh);
++ if (error)
++ return error;
++
++ gfs_trans_add_bh(dip->i_gl, nbh);
++ gfs_metatype_set(sdp, nbh, GFS_METATYPE_LF, GFS_FORMAT_LF);
++ gfs_buffer_clear_tail(nbh, sizeof (struct gfs_meta_header));
++
++ nleaf = (struct gfs_leaf *)nbh->b_data;
++
++ nleaf->lf_dirent_format = cpu_to_gfs32(GFS_FORMAT_DE);
++
++ /* Get the old leaf block */
++
++ error = get_leaf(dip, leaf_no, &obh);
++ if (error)
++ goto fail;
++
++ gfs_trans_add_bh(dip->i_gl, obh);
++
++ oleaf = (struct gfs_leaf *)obh->b_data;
++
++ /* Compute the start and len of leaf pointers in the hash table. */
++
++ len = 1 << (dip->i_di.di_depth - gfs16_to_cpu(oleaf->lf_depth));
++ GFS_ASSERT_INODE(len != 1, dip,);
++ half_len = len >> 1;
++
++ start = (index & ~(len - 1));
++
++ /* Change the pointers.
++ Don't bother distinguishing stuffed from non-stuffed.
++ This code is complicated enough already. */
++
++ lp = gmalloc(half_len * sizeof(uint64_t));
++
++ error = gfs_internal_read(dip, (char *)lp, start * sizeof(uint64_t),
++ half_len * sizeof(uint64_t));
++ if (error != half_len * sizeof(uint64_t)) {
++ if (error >= 0)
++ error = -EIO;
++ goto fail_lpfree;
++ }
++
++ /* Change the pointers */
++
++ for (x = 0; x < half_len; x++)
++ lp[x] = cpu_to_gfs64(bn);
++
++ error = gfs_internal_write(dip, (char *)lp, start * sizeof(uint64_t),
++ half_len * sizeof(uint64_t));
++ if (error != half_len * sizeof(uint64_t)) {
++ if (error >= 0)
++ error = -EIO;
++ goto fail_lpfree;
++ }
++
++ kfree(lp);
++
++ /* Compute the divider */
++
++ divider = (start + half_len) << (32 - dip->i_di.di_depth);
++
++ /* Copy the entries */
++
++ dirent_first(dip, obh, &dent);
++
++ do {
++ next = dent;
++ if (dirent_next(dip, obh, &next))
++ next = NULL;
++
++ if (dent->de_inum.no_formal_ino &&
++ gfs32_to_cpu(dent->de_hash) < divider) {
++ name_len = gfs16_to_cpu(dent->de_name_len);
++
++ error = gfs_dirent_alloc(dip, nbh, name_len, &new);
++ GFS_ASSERT_INODE(!error, dip,);
++
++ new->de_inum = dent->de_inum; /* No endianness worries */
++ new->de_hash = dent->de_hash; /* No endianness worries */
++ new->de_type = dent->de_type; /* No endianness worries */
++ memcpy((char *)(new + 1), (char *)(dent + 1),
++ name_len);
++
++ nleaf->lf_entries = gfs16_to_cpu(nleaf->lf_entries) + 1;
++ nleaf->lf_entries = cpu_to_gfs16(nleaf->lf_entries);
++
++ dirent_del(dip, obh, prev, dent);
++
++ GFS_ASSERT_INODE(gfs16_to_cpu(oleaf->lf_entries), dip,);
++ oleaf->lf_entries = gfs16_to_cpu(oleaf->lf_entries) - 1;
++ oleaf->lf_entries = cpu_to_gfs16(oleaf->lf_entries);
++
++ if (!prev)
++ prev = dent;
++
++ moved = TRUE;
++ } else
++ prev = dent;
++
++ dent = next;
++ }
++ while (dent);
++
++ /* If none of the entries got moved into the new leaf,
++ artificially fill in the first entry. */
++
++ if (!moved) {
++ error = gfs_dirent_alloc(dip, nbh, 0, &new);
++ GFS_ASSERT_INODE(!error, dip,);
++ new->de_inum.no_formal_ino = 0;
++ }
++
++ oleaf->lf_depth = gfs16_to_cpu(oleaf->lf_depth) + 1;
++ oleaf->lf_depth = cpu_to_gfs16(oleaf->lf_depth);
++ nleaf->lf_depth = oleaf->lf_depth;
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ GFS_ASSERT_INODE(!error, dip,); /* Pinned in gfs_internal_write() */
++
++ dip->i_di.di_blocks++;
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ brelse(obh);
++ brelse(nbh);
++
++ return 0;
++
++ fail_lpfree:
++ kfree(lp);
++
++ brelse(obh);
++
++ fail:
++ brelse(nbh);
++ return error;
++}
++
++/**
++ * dir_double_exhash - Double size of ExHash table
++ * @dip: The GFS dinode
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_double_exhash(struct gfs_inode *dip)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *dibh;
++ uint32_t hsize;
++ uint64_t *buf;
++ uint64_t *from, *to;
++ uint64_t block;
++ int x;
++ int error = 0;
++
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size, dip,);
++
++ /* Allocate both the "from" and "to" buffers in one big chunk */
++
++ buf = gmalloc(3 * sdp->sd_hash_bsize);
++
++ for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
++ error = gfs_internal_read(dip, (char *)buf,
++ block * sdp->sd_hash_bsize,
++ sdp->sd_hash_bsize);
++ if (error != sdp->sd_hash_bsize) {
++ if (error >= 0)
++ error = -EIO;
++ goto fail;
++ }
++
++ from = buf;
++ to = (uint64_t *)((char *)buf + sdp->sd_hash_bsize);
++
++ for (x = sdp->sd_hash_ptrs; x--; from++) {
++ *to++ = *from; /* No endianness worries */
++ *to++ = *from;
++ }
++
++ error = gfs_internal_write(dip, (char *)buf + sdp->sd_hash_bsize,
++ block * sdp->sd_sb.sb_bsize,
++ sdp->sd_sb.sb_bsize);
++ if (error != sdp->sd_sb.sb_bsize) {
++ if (error >= 0)
++ error = -EIO;
++ goto fail;
++ }
++ }
++
++ kfree(buf);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ GFS_ASSERT_INODE(!error, dip,); /* Pinned in gfs_internal_write() */
++
++ dip->i_di.di_depth++;
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++
++ fail:
++ kfree(buf);
++
++ return error;
++}
++
++/**
++ * compare_dents - compare directory entries by hash value
++ * @a: first dent
++ * @b: second dent
++ *
++ * When comparing the hash entries of @a to @b:
++ * gt: returns 1
++ * lt: returns -1
++ * eq: returns 0
++ */
++
++static int
++compare_dents(void *a, void *b)
++{
++ struct gfs_dirent *dent_a, *dent_b;
++ uint32_t hash_a, hash_b;
++ int ret = 0;
++
++ dent_a = *(struct gfs_dirent **)a;
++ hash_a = dent_a->de_hash;
++ hash_a = gfs32_to_cpu(hash_a);
++
++ dent_b = *(struct gfs_dirent **)b;
++ hash_b = dent_b->de_hash;
++ hash_b = gfs32_to_cpu(hash_b);
++
++ if (hash_a > hash_b)
++ ret = 1;
++ else if (hash_a < hash_b)
++ ret = -1;
++ else {
++ unsigned int len_a = gfs16_to_cpu(dent_a->de_name_len);
++ unsigned int len_b = gfs16_to_cpu(dent_b->de_name_len);
++
++ if (len_a > len_b)
++ ret = 1;
++ else if (len_a < len_b)
++ ret = -1;
++ else
++ ret = memcmp((char *)(dent_a + 1),
++ (char *)(dent_b + 1),
++ len_a);
++ }
++
++ return ret;
++}
++
++/**
++ * do_filldir_main - read out directory entries
++ * @dip: The GFS inode
++ * @offset: The offset in the file to read from
++ * @opaque: opaque data to pass to filldir
++ * @filldir: The function to pass entries to
++ * @darr: an array of struct gfs_dirent pointers to read
++ * @entries: the number of entries in darr
++ * @copied: pointer to int that's non-zero if an entry has been copied out
++ *
++ * Jump through some hoops to make sure that if there are hash collisions,
++ * they are read out at the beginning of a buffer. We want to minimize
++ * the possibility that they will fall into different readdir buffers or
++ * that someone will want to seek to that location.
++ *
++ * Returns: 0 on success, -EXXX on failure, >0 on exception from filldir
++ */
++
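++ /* Illustrative note (restating the comment above with an example): two
++ entries whose hashes collide map to the same readdir offset and form a
++ "run"; if a run would begin after something has already been copied
++ into the buffer, the loop below bails out (returns 1) so the entire
++ run is emitted at the start of the next readdir buffer instead. */
++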
++static int
++do_filldir_main(struct gfs_inode *dip, uint64_t *offset,
++ void *opaque, gfs_filldir_t filldir,
++ struct gfs_dirent **darr, uint32_t entries, int *copied)
++{
++ struct gfs_dirent *dent, *dent_next;
++ struct gfs_inum inum;
++ uint64_t off, off_next;
++ unsigned int x, y;
++ int run = FALSE;
++ int error = 0;
++
++ gfs_sort(darr, entries, sizeof(struct gfs_dirent *), compare_dents);
++
++ dent_next = darr[0];
++ off_next = gfs32_to_cpu(dent_next->de_hash);
++ off_next = gfs_dir_hash2offset(off_next);
++
++ for (x = 0, y = 1; x < entries; x++, y++) {
++ dent = dent_next;
++ off = off_next;
++
++ if (y < entries) {
++ dent_next = darr[y];
++ off_next = gfs32_to_cpu(dent_next->de_hash);
++ off_next = gfs_dir_hash2offset(off_next);
++
++ if (off < *offset)
++ continue;
++ *offset = off;
++
++ if (off_next == off) {
++ if (*copied && !run)
++ return 1;
++ run = TRUE;
++ } else
++ run = FALSE;
++ } else {
++ if (off < *offset)
++ continue;
++ *offset = off;
++ }
++
++ gfs_inum_in(&inum, (char *)&dent->de_inum);
++
++ error = filldir(opaque, (char *)(dent + 1),
++ gfs16_to_cpu(dent->de_name_len),
++ off, &inum,
++ gfs16_to_cpu(dent->de_type));
++ if (error)
++ return 1;
++
++ *copied = TRUE;
++ }
++
++ /* Increment *offset by one, so the next time we come into a do_filldir
++ function, we get the next entry instead of the last one in the current leaf */
++
++ (*offset)++;
++
++ return 0;
++}
++
++/**
++ * do_filldir_single - Read directory entries out of a single block
++ * @dip: The GFS inode
++ * @offset: The offset in the file to read from
++ * @opaque: opaque data to pass to filldir
++ * @filldir: The function to pass entries to
++ * @bh: the block
++ * @entries: the number of entries in the block
++ * @copied: pointer to int that's non-zero if an entry has been copied out
++ *
++ * Returns: 0 on success, -EXXX on failure, >0 on exception from filldir
++ */
++
++static int
++do_filldir_single(struct gfs_inode *dip, uint64_t *offset,
++ void *opaque, gfs_filldir_t filldir,
++ struct buffer_head *bh, uint32_t entries, int *copied)
++{
++ struct gfs_dirent **darr;
++ struct gfs_dirent *de;
++ unsigned int e = 0;
++ int error = 0;
++
++ if (!entries)
++ return 0;
++
++ darr = gmalloc(entries * sizeof(struct gfs_dirent *));
++
++ dirent_first(dip, bh, &de);
++ do {
++ if (!de->de_inum.no_formal_ino)
++ continue;
++ darr[e++] = de;
++ }
++ while (dirent_next(dip, bh, &de) == 0);
++
++ GFS_ASSERT_INODE(e == entries, dip,);
++
++ error = do_filldir_main(dip, offset, opaque, filldir, darr,
++ entries, copied);
++
++ kfree(darr);
++
++ return error;
++}
++
++/**
++ * do_filldir_multi - Read directory entries out of a linked leaf list
++ * @dip: The GFS inode
++ * @offset: The offset in the file to read from
++ * @opaque: opaque data to pass to filldir
++ * @filldir: The function to pass entries to
++ * @bh: the first leaf in the list
++ * @copied: pointer to int that's non-zero if an entry has been copied out
++ *
++ * Returns: 0 on success, -EXXX on failure, >0 on exception from filldir
++ */
++
++static int
++do_filldir_multi(struct gfs_inode *dip, uint64_t *offset,
++ void *opaque, gfs_filldir_t filldir,
++ struct buffer_head *bh, int *copied)
++{
++ struct buffer_head **larr = NULL;
++ struct gfs_dirent **darr;
++ struct gfs_leaf *leaf;
++ struct buffer_head *tmp_bh;
++ struct gfs_dirent *de;
++ unsigned int entries, e = 0;
++ unsigned int leaves = 0, l = 0;
++ unsigned int x;
++ uint64_t ln;
++ int error = 0;
++
++ /* Count leaves and entries */
++
++ leaf = (struct gfs_leaf *)bh->b_data;
++ entries = gfs16_to_cpu(leaf->lf_entries);
++ ln = leaf->lf_next;
++
++ while (ln) {
++ ln = gfs64_to_cpu(ln);
++
++ error = get_leaf(dip, ln, &tmp_bh);
++ if (error)
++ return error;
++
++ leaf = (struct gfs_leaf *)tmp_bh->b_data;
++ if (leaf->lf_entries) {
++ entries += gfs16_to_cpu(leaf->lf_entries);
++ leaves++;
++ }
++ ln = leaf->lf_next;
++
++ brelse(tmp_bh);
++ }
++
++ /* Bail out if there's nothing to do */
++
++ if (!entries)
++ return 0;
++
++ /* Alloc arrays */
++
++ if (leaves)
++ larr = gmalloc(leaves * sizeof(struct buffer_head *));
++
++ darr = gmalloc(entries * sizeof(struct gfs_dirent *));
++
++ /* Fill in arrays */
++
++ leaf = (struct gfs_leaf *)bh->b_data;
++ if (leaf->lf_entries) {
++ dirent_first(dip, bh, &de);
++ do {
++ if (!de->de_inum.no_formal_ino)
++ continue;
++ darr[e++] = de;
++ }
++ while (dirent_next(dip, bh, &de) == 0);
++ }
++ ln = leaf->lf_next;
++
++ while (ln) {
++ ln = gfs64_to_cpu(ln);
++
++ error = get_leaf(dip, ln, &tmp_bh);
++ if (error)
++ goto out;
++
++ leaf = (struct gfs_leaf *)tmp_bh->b_data;
++ if (leaf->lf_entries) {
++ dirent_first(dip, tmp_bh, &de);
++ do {
++ if (!de->de_inum.no_formal_ino)
++ continue;
++ darr[e++] = de;
++ }
++ while (dirent_next(dip, tmp_bh, &de) == 0);
++
++ larr[l++] = tmp_bh;
++
++ ln = leaf->lf_next;
++ } else {
++ ln = leaf->lf_next;
++ brelse(tmp_bh);
++ }
++ }
++
++ GFS_ASSERT_INODE(l == leaves, dip,);
++ GFS_ASSERT_INODE(e == entries, dip,);
++
++ /* Do work */
++
++ error = do_filldir_main(dip, offset, opaque, filldir, darr,
++ entries, copied);
++
++ /* Clean up */
++
++ out:
++ kfree(darr);
++
++ for (x = 0; x < l; x++)
++ brelse(larr[x]);
++
++ if (leaves)
++ kfree(larr);
++
++ return error;
++}
++
++/**
++ * dir_e_search - search an exhash directory for an entry
++ * @dip: The GFS inode
++ * @filename: the filename to look up
++ * @inum: if found, filled in with the entry's inode number
++ * @type: if found, filled in with the entry's type
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_e_search(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int *type)
++{
++ struct buffer_head *bh;
++ struct gfs_dirent *dent;
++ int error;
++
++ error = linked_leaf_search(dip, filename, &dent, NULL, &bh);
++ if (error)
++ return error;
++
++ if (inum)
++ gfs_inum_in(inum, (char *)&dent->de_inum);
++ if (type)
++ *type = gfs16_to_cpu(dent->de_type);
++
++ brelse(bh);
++
++ return 0;
++}
++
++/**
++ * dir_e_add - add an entry to an exhash directory
++ * @dip: The GFS inode
++ * @filename: the new name
++ * @inum: the inode number of the entry
++ * @type: the type of the entry
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_e_add(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int type)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *bh, *nbh, *dibh;
++ struct gfs_leaf *leaf, *nleaf;
++ struct gfs_dirent *dent;
++ uint32_t hsize, index;
++ uint32_t hash;
++ uint64_t leaf_no, bn;
++ int error;
++
++ restart:
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size, dip,);
++
++ /* Figure out the address of the leaf node. */
++
++ hash = gfs_dir_hash(filename->name, filename->len);
++ index = hash >> (32 - dip->i_di.di_depth);
++
++ error = get_leaf_nr(dip, index, &leaf_no);
++ if (error)
++ return error;
++
++ /* Add entry to the leaf */
++
++ for (;;) {
++ error = get_leaf(dip, leaf_no, &bh);
++ if (error)
++ return error;
++
++ leaf = (struct gfs_leaf *)bh->b_data;
++
++ if (gfs_dirent_alloc(dip, bh, filename->len, &dent)) {
++
++ if (gfs16_to_cpu(leaf->lf_depth) < dip->i_di.di_depth) {
++ /* Can we split the leaf? */
++
++ brelse(bh);
++
++ error = dir_split_leaf(dip, index, leaf_no);
++ if (error)
++ return error;
++
++ goto restart;
++
++ } else if (dip->i_di.di_depth < GFS_DIR_MAX_DEPTH) {
++ /* Can we double the hash table? */
++
++ brelse(bh);
++
++ error = dir_double_exhash(dip);
++ if (error)
++ return error;
++
++ goto restart;
++
++ } else if (leaf->lf_next) {
++ /* Can we try the next leaf in the list? */
++ leaf_no = gfs64_to_cpu(leaf->lf_next);
++ brelse(bh);
++ continue;
++
++ } else {
++ /* Create a new leaf and add it to the list. */
++
++ error = gfs_metaalloc(dip, &bn);
++ if (error) {
++ brelse(bh);
++ return error;
++ }
++
++ error = gfs_dread(sdp, bn, dip->i_gl,
++ DIO_NEW | DIO_START | DIO_WAIT,
++ &nbh);
++ if (error) {
++ brelse(bh);
++ return error;
++ }
++
++ gfs_trans_add_bh(dip->i_gl, nbh);
++ gfs_metatype_set(sdp, nbh, GFS_METATYPE_LF,
++ GFS_FORMAT_LF);
++ gfs_buffer_clear_tail(nbh,
++ sizeof(struct gfs_meta_header));
++
++ gfs_trans_add_bh(dip->i_gl, bh);
++ leaf->lf_next = cpu_to_gfs64(bn);
++
++ nleaf = (struct gfs_leaf *)nbh->b_data;
++ nleaf->lf_depth = leaf->lf_depth;
++ nleaf->lf_dirent_format = cpu_to_gfs32(GFS_FORMAT_DE);
++
++ if (gfs_dirent_alloc(dip, nbh, filename->len, &dent))
++ GFS_ASSERT_INODE(FALSE, dip,);
++
++ dip->i_di.di_blocks++;
++
++ brelse(bh);
++
++ bh = nbh;
++ leaf = nleaf;
++ }
++ }
++
++ /* If the gfs_dirent_alloc() succeeded, it pinned the "bh". */
++
++ gfs_inum_out(inum, (char *)&dent->de_inum);
++ dent->de_hash = cpu_to_gfs32(hash);
++ dent->de_type = cpu_to_gfs16(type);
++ memcpy((char *)(dent + 1), filename->name, filename->len);
++
++ leaf->lf_entries = gfs16_to_cpu(leaf->lf_entries) + 1;
++ leaf->lf_entries = cpu_to_gfs16(leaf->lf_entries);
++
++ brelse(bh);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ dip->i_di.di_entries++;
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++ }
++
++ return -ENOENT;
++}
++
++/**
++ * dir_e_del - delete an entry from an exhash directory
++ * @dip: The GFS inode
++ * @filename: the name of the entry to delete
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_e_del(struct gfs_inode *dip, struct qstr *filename)
++{
++ struct buffer_head *bh, *dibh;
++ struct gfs_dirent *dent, *prev;
++ struct gfs_leaf *leaf;
++ unsigned int entries;
++ int error;
++
++ error = linked_leaf_search(dip, filename, &dent, &prev, &bh);
++ GFS_ASSERT_INODE(error != -ENOENT, dip,);
++ if (error)
++ return error;
++
++ dirent_del(dip, bh, prev, dent); /* Pins bh */
++
++ leaf = (struct gfs_leaf *)bh->b_data;
++ entries = gfs16_to_cpu(leaf->lf_entries);
++ GFS_ASSERT_INODE(entries, dip,);
++ entries--;
++ leaf->lf_entries = cpu_to_gfs16(entries);
++
++ brelse(bh);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_entries, dip,);
++ dip->i_di.di_entries--;
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * dir_e_read - Reads the entries from a directory into a filldir buffer
++ * @dip: dinode pointer
++ * @offset: the hash of the last entry read shifted to the right once
++ * @opaque: buffer for the filldir function to fill
++ * @filldir: points to the filldir function to use
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
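++/* Offset/hash mapping, for illustration (per the @offset description
++ above): an entry with de_hash 0x80000000 is reported back at readdir
++ offset 0x40000000, and gfs_dir_offset2hash() below recovers the hash
++ range to resume from. */
++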
++static int
++dir_e_read(struct gfs_inode *dip, uint64_t *offset, void *opaque,
++ gfs_filldir_t filldir)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *bh;
++ struct gfs_leaf leaf;
++ uint32_t hsize, len;
++ uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
++ uint32_t hash, index;
++ uint64_t *lp;
++ int copied = FALSE;
++ int error = 0;
++
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size, dip,);
++
++ hash = gfs_dir_offset2hash(*offset);
++ index = hash >> (32 - dip->i_di.di_depth);
++
++ lp = gmalloc(sdp->sd_hash_bsize);
++
++ while (index < hsize) {
++ lp_offset = index & (sdp->sd_hash_ptrs - 1);
++ ht_offset = index - lp_offset;
++
++ if (ht_offset_cur != ht_offset) {
++ error = gfs_internal_read(dip, (char *)lp,
++ ht_offset * sizeof(uint64_t),
++ sdp->sd_hash_bsize);
++ if (error != sdp->sd_hash_bsize) {
++ if (error >= 0)
++ error = -EIO;
++ goto out;
++ }
++ ht_offset_cur = ht_offset;
++ }
++
++ error = get_leaf(dip, gfs64_to_cpu(lp[lp_offset]), &bh);
++ if (error)
++ goto out;
++
++ gfs_leaf_in(&leaf, bh->b_data);
++
++ if (leaf.lf_next)
++ error = do_filldir_multi(dip, offset,
++ opaque, filldir,
++ bh, &copied);
++ else
++ error = do_filldir_single(dip, offset,
++ opaque, filldir,
++ bh, leaf.lf_entries,
++ &copied);
++
++ brelse(bh);
++
++ if (error) {
++ if (error > 0)
++ error = 0;
++ goto out;
++ }
++
++ len = 1 << (dip->i_di.di_depth - leaf.lf_depth);
++ index = (index & ~(len - 1)) + len;
++ }
++
++ out:
++ kfree(lp);
++
++ return error;
++}
++
++/**
++ * dir_e_mvino - change the inode number of an exhash directory entry
++ * @dip: The GFS inode
++ * @filename: the name of the entry to change
++ * @inum: the new inode number
++ * @new_type: the new entry type
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_e_mvino(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int new_type)
++{
++ struct buffer_head *bh, *dibh;
++ struct gfs_dirent *dent;
++ int error;
++
++ error = linked_leaf_search(dip, filename, &dent, NULL, &bh);
++ GFS_ASSERT_INODE(error != -ENOENT, dip,);
++ if (error)
++ return error;
++
++ gfs_trans_add_bh(dip->i_gl, bh);
++
++ gfs_inum_out(inum, (char *)&dent->de_inum);
++ dent->de_type = cpu_to_gfs16(new_type);
++
++ brelse(bh);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * dir_l_search - search a stuffed (linear) directory for an entry
++ * @dip: The GFS inode
++ * @filename: the filename to look up
++ * @inum: if found, filled in with the entry's inode number
++ * @type: if found, filled in with the entry's type
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_l_search(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int *type)
++{
++ struct buffer_head *dibh;
++ struct gfs_dirent *dent;
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(dip), dip,);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ error = leaf_search(dip, dibh, filename, &dent, NULL);
++ if (!error) {
++ if (inum)
++ gfs_inum_in(inum, (char *)&dent->de_inum);
++ if (type)
++ *type = gfs16_to_cpu(dent->de_type);
++ }
++
++ brelse(dibh);
++
++ return error;
++}
++
++/**
++ * dir_l_add - add an entry to a stuffed (linear) directory
++ * @dip: The GFS inode
++ * @filename: the new name
++ * @inum: the inode number of the entry
++ * @type: the type of the entry
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_l_add(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int type)
++{
++ struct buffer_head *dibh;
++ struct gfs_dirent *dent;
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(dip), dip,);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ if (gfs_dirent_alloc(dip, dibh, filename->len, &dent)) {
++ brelse(dibh);
++
++ error = dir_make_exhash(dip);
++ if (!error)
++ error = dir_e_add(dip, filename, inum, type);
++
++ return error;
++ }
++
++ /* gfs_dirent_alloc() pins */
++
++ gfs_inum_out(inum, (char *)&dent->de_inum);
++ dent->de_hash = gfs_dir_hash(filename->name, filename->len);
++ dent->de_hash = cpu_to_gfs32(dent->de_hash);
++ dent->de_type = cpu_to_gfs16(type);
++ memcpy((char *)(dent + 1), filename->name, filename->len);
++
++ dip->i_di.di_entries++;
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * dir_l_del - delete an entry from a stuffed (linear) directory
++ * @dip: The GFS inode
++ * @filename: the name of the entry to delete
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_l_del(struct gfs_inode *dip, struct qstr *filename)
++{
++ struct buffer_head *dibh;
++ struct gfs_dirent *dent, *prev;
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(dip), dip,);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ error = leaf_search(dip, dibh, filename, &dent, &prev);
++ GFS_ASSERT_INODE(!error, dip,);
++
++ dirent_del(dip, dibh, prev, dent);
++
++ /* dirent_del() pins */
++
++ GFS_ASSERT_INODE(dip->i_di.di_entries, dip,);
++ dip->i_di.di_entries--;
++
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * dir_l_read - read the entries from a stuffed (linear) directory
++ * @dip: dinode pointer
++ * @offset: the hash of the last entry read shifted to the right once
++ * @opaque: buffer for the filldir function to fill
++ * @filldir: points to the filldir function to use
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_l_read(struct gfs_inode *dip, uint64_t *offset, void *opaque,
++ gfs_filldir_t filldir)
++{
++ struct buffer_head *dibh;
++ int copied = FALSE;
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(dip), dip,);
++
++ if (!dip->i_di.di_entries)
++ return 0;
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ error = do_filldir_single(dip, offset,
++ opaque, filldir,
++ dibh, dip->i_di.di_entries,
++ &copied);
++ if (error > 0)
++ error = 0;
++
++ brelse(dibh);
++
++ return error;
++}
++
++/**
++ * dir_l_mvino - change the inode number of a stuffed directory entry
++ * @dip: The GFS inode
++ * @filename: the name of the entry to change
++ * @inum: the new inode number
++ * @new_type: the new entry type
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++static int
++dir_l_mvino(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int new_type)
++{
++ struct buffer_head *dibh;
++ struct gfs_dirent *dent;
++ int error;
++
++ GFS_ASSERT_INODE(gfs_is_stuffed(dip), dip,);
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ return error;
++
++ error = leaf_search(dip, dibh, filename, &dent, NULL);
++ GFS_ASSERT_INODE(!error, dip,);
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++
++ gfs_inum_out(inum, (char *)&dent->de_inum);
++ dent->de_type = cpu_to_gfs16(new_type);
++
++ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
++
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * gfs_dir_search - Search a directory
++ * @dip: The GFS inode
++ * @filename: the filename to look up
++ * @inum: if found, filled in with the entry's inode number
++ * @type: if found, filled in with the entry's type
++ *
++ * This routine searches a directory for a file or another directory.
++ * Assumes a glock is held on dip.
++ *
++ * Returns: 0 if found (@inum and @type are filled in), -EXXX on failure
++ */
++
++int
++gfs_dir_search(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int *type)
++{
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH)
++ error = dir_e_search(dip, filename, inum, type);
++ else
++ error = dir_l_search(dip, filename, inum, type);
++
++ return error;
++}
++
++/**
++ * gfs_dir_add - Add new filename into directory
++ * @dip: The GFS inode
++ * @filename: The new name
++ * @inum: The inode number of the entry
++ * @type: The type of the entry
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++int
++gfs_dir_add(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int type)
++{
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH)
++ error = dir_e_add(dip, filename, inum, type);
++ else
++ error = dir_l_add(dip, filename, inum, type);
++
++ return error;
++}
++
++/**
++ * gfs_dir_del - Delete a directory entry
++ * @dip: The GFS inode
++ * @filename: The filename
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++int
++gfs_dir_del(struct gfs_inode *dip, struct qstr *filename)
++{
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH)
++ error = dir_e_del(dip, filename);
++ else
++ error = dir_l_del(dip, filename);
++
++ return error;
++}
++
++/**
++ * gfs_dir_read - Read entries from a directory
++ * @dip: The GFS inode
++ * @offset: the hash of the last entry read shifted to the right once
++ * @opaque: buffer for the filldir function to fill
++ * @filldir: points to the filldir function to use
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_dir_read(struct gfs_inode *dip, uint64_t * offset, void *opaque,
++ gfs_filldir_t filldir)
++{
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH)
++ error = dir_e_read(dip, offset, opaque, filldir);
++ else
++ error = dir_l_read(dip, offset, opaque, filldir);
++
++ return error;
++}
++
++/**
++ * gfs_dir_mvino - Change inode number of directory entry
++ * @dip: The GFS inode
++ * @filename: the name of the entry to change
++ * @inum: the new inode number
++ * @new_type: the new entry type
++ *
++ * This routine changes the inode number of a directory entry. It's used
++ * by rename to change ".." when a directory is moved.
++ * Assumes a glock is held on dip.
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++int
++gfs_dir_mvino(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int new_type)
++{
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH)
++ error = dir_e_mvino(dip, filename, inum, new_type);
++ else
++ error = dir_l_mvino(dip, filename, inum, new_type);
++
++ return error;
++}
++
++/**
++ * foreach_leaf - call a function for each leaf in a directory
++ * @dip: the directory
++ * @lc: the function to call for each leaf
++ * @data: private data to pass to it
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
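++/* Illustrative walk (hypothetical values): a leaf whose lf_depth is two
++ less than di_depth is pointed to by len == 4 consecutive hash-table
++ slots, so after lc() is called once for it, index = (index & ~3) + 4
++ skips the remaining pointers to the same leaf. */
++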
++static int
++foreach_leaf(struct gfs_inode *dip, leaf_call_t lc, void *data)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *bh;
++ struct gfs_leaf leaf;
++ uint32_t hsize, len;
++ uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
++ uint32_t index = 0;
++ uint64_t *lp;
++ uint64_t leaf_no;
++ int error = 0;
++
++ GFS_ASSERT_INODE(dip->i_di.di_flags & GFS_DIF_EXHASH, dip,);
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size, dip,);
++
++ lp = gmalloc(sdp->sd_hash_bsize);
++
++ while (index < hsize) {
++ lp_offset = index & (sdp->sd_hash_ptrs - 1);
++ ht_offset = index - lp_offset;
++
++ if (ht_offset_cur != ht_offset) {
++ error = gfs_internal_read(dip, (char *)lp,
++ ht_offset * sizeof(uint64_t),
++ sdp->sd_hash_bsize);
++ if (error != sdp->sd_hash_bsize) {
++ if (error >= 0)
++ error = -EIO;
++ goto out;
++ }
++ ht_offset_cur = ht_offset;
++ }
++
++ leaf_no = gfs64_to_cpu(lp[lp_offset]);
++ if (leaf_no) {
++ error = get_leaf(dip, leaf_no, &bh);
++ if (error)
++ goto out;
++ gfs_leaf_in(&leaf, bh->b_data);
++ brelse(bh);
++
++ len = 1 << (dip->i_di.di_depth - leaf.lf_depth);
++
++ error = lc(dip, index, len, leaf_no, data);
++ if (error)
++ goto out;
++
++ index = (index & ~(len - 1)) + len;
++ } else
++ index++;
++ }
++
++ GFS_ASSERT_INODE(index == hsize, dip,);
++
++ out:
++ kfree(lp);
++
++ return error;
++}
++
++/**
++ * leaf_free - Deallocate a directory leaf
++ * @dip: the directory
++ * @index: the hash table offset in the directory
++ * @len: the number of pointers to this leaf
++ * @leaf_no: the leaf number
++ * @data: not used
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++leaf_free(struct gfs_inode *dip,
++ uint32_t index, uint32_t len,
++ uint64_t leaf_no, void *data)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_holder ri_gh;
++ struct gfs_leaf tmp_leaf;
++ struct gfs_rgrp_list rlist;
++ struct buffer_head *bh, *dibh;
++ uint64_t blk;
++ unsigned int rg_blocks = 0;
++ char *ht;
++ unsigned int x, size = len * sizeof(uint64_t);
++ int error;
++
++ memset(&rlist, 0, sizeof(struct gfs_rgrp_list));
++
++ ht = gmalloc(size);
++ memset(ht, 0, size);
++
++ gfs_alloc_get(dip);
++
++ error = gfs_quota_hold_m(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail;
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ goto fail_qs;
++
++ /* Count the number of leaves */
++
++ for (blk = leaf_no; blk; blk = tmp_leaf.lf_next) {
++ error = get_leaf(dip, blk, &bh);
++ if (error)
++ goto fail_rlist;
++ gfs_leaf_in(&tmp_leaf, (bh)->b_data);
++ brelse(bh);
++
++ gfs_rlist_add(sdp, &rlist, blk);
++ }
++
++ gfs_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
++
++ error = gfs_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
++ if (error)
++ goto fail_rlist;
++
++ for (x = 0; x < rlist.rl_rgrps; x++) {
++ struct gfs_rgrpd *rgd;
++ rgd = gl2rgd(rlist.rl_ghs[x].gh_gl);
++ rg_blocks += rgd->rd_ri.ri_length;
++ }
++
++ /* Trans may require:
++ All the bitmaps that were reserved.
++ One block for the dinode.
++ All the hash blocks that will be changed.
++ One block for a quota change. */
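++ /* Worked example (hypothetical numbers): a leaf chain referenced by
++ len == 64 hash slots gives size == 512 bytes of zeroed pointers to
++ write back; if the resource groups holding the leaves span
++ rg_blocks == 3 bitmap blocks and 512 bytes fit in one journaled
++ block, the reservation is 3 + 1 + (1 + 1) == 6 metadata blocks,
++ plus the one quota-change block passed as the last argument. */
++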
++
++ error = gfs_trans_begin(sdp,
++ rg_blocks + 1 + (DIV_RU(size, sdp->sd_jbsize) + 1),
++ 1);
++ if (error)
++ goto fail_rg_gunlock;
++
++ for (blk = leaf_no; blk; blk = tmp_leaf.lf_next) {
++ error = get_leaf(dip, blk, &bh);
++ if (error)
++ goto fail_end_trans;
++ gfs_leaf_in(&tmp_leaf, bh->b_data);
++ brelse(bh);
++
++ gfs_metafree(dip, blk, 1);
++
++ dip->i_di.di_blocks--;
++ }
++
++ error = gfs_internal_write(dip, ht, index * sizeof(uint64_t), size);
++ if (error != size) {
++ if (error >= 0)
++ error = -EIO;
++ goto fail_end_trans;
++ }
++
++ error = gfs_get_inode_buffer(dip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_add_bh(dip->i_gl, dibh);
++ gfs_dinode_out(&dip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
++ gfs_rlist_free(&rlist);
++ gfs_glock_dq_uninit(&ri_gh);
++ gfs_quota_unhold_m(dip);
++ gfs_alloc_put(dip);
++ kfree(ht);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_rg_gunlock:
++ gfs_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
++
++ fail_rlist:
++ gfs_rlist_free(&rlist);
++ gfs_glock_dq_uninit(&ri_gh);
++
++ fail_qs:
++ gfs_quota_unhold_m(dip);
++
++ fail:
++ gfs_alloc_put(dip);
++ kfree(ht);
++
++ return error;
++}
++
++/**
++ * gfs_dir_exhash_free - free all the leaf blocks in a directory
++ * @dip: the directory
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_dir_exhash_free(struct gfs_inode *dip)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct buffer_head *bh;
++ int error;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ error = foreach_leaf(dip, leaf_free, NULL);
++ if (error)
++ return error;
++
++ /* Make this a regular file in case we crash.
++ (We don't want to free these blocks a second time.) */
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error)
++ return error;
++
++ error = gfs_get_inode_buffer(dip, &bh);
++ if (error)
++ goto fail;
++
++ gfs_trans_add_bh(dip->i_gl, bh);
++ ((struct gfs_dinode *)bh->b_data)->di_type = cpu_to_gfs16(GFS_FILE_REG);
++
++ brelse(bh);
++
++ gfs_trans_end(sdp);
++
++ return 0;
++
++ fail:
++ gfs_trans_end(sdp);
++ return error;
++}
++
++/**
++ * gfs_diradd_alloc_required - figure out if an entry addition is going to require an allocation
++ * @dip: the directory being added to
++ * @filename: the filename that's going to be added
++ * @alloc_required: the int is set to TRUE if an alloc is required, FALSE otherwise
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_diradd_alloc_required(struct gfs_inode *dip, struct qstr *filename,
++ int *alloc_required)
++{
++ struct buffer_head *bh = NULL, *bh_next;
++ uint32_t hsize, hash, index;
++ int error = 0;
++
++ *alloc_required = FALSE;
++
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++
++ if (dip->i_di.di_flags & GFS_DIF_EXHASH) {
++ hsize = 1 << dip->i_di.di_depth;
++ GFS_ASSERT_INODE(hsize * sizeof(uint64_t) == dip->i_di.di_size,
++ dip,);
++
++ hash = gfs_dir_hash(filename->name, filename->len);
++ index = hash >> (32 - dip->i_di.di_depth);
++
++ error = get_first_leaf(dip, index, &bh_next);
++ if (error)
++ return error;
++
++ do {
++ if (bh)
++ brelse(bh);
++
++ bh = bh_next;
++
++ if (dirent_fits(dip, bh, filename->len))
++ break;
++
++ error = get_next_leaf(dip, bh, &bh_next);
++ if (error == -ENOENT) {
++ *alloc_required = TRUE;
++ error = 0;
++ break;
++ }
++ }
++ while (!error);
++
++ brelse(bh);
++ } else {
++ error = gfs_get_inode_buffer(dip, &bh);
++ if (error)
++ return error;
++
++ if (!dirent_fits(dip, bh, filename->len))
++ *alloc_required = TRUE;
++
++ brelse(bh);
++ }
++
++ return error;
++}
++
++/**
++ * do_gdm - copy out one leaf (or list of leaves)
++ * @dip: the directory
++ * @index: the hash table offset in the directory
++ * @len: the number of pointers to this leaf
++ * @leaf_no: the leaf number
++ * @data: a pointer to a struct gfs_user_buffer structure
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++do_gdm(struct gfs_inode *dip, uint32_t index, uint32_t len, uint64_t leaf_no,
++ void *data)
++{
++ struct gfs_user_buffer *ub = (struct gfs_user_buffer *)data;
++ struct gfs_leaf leaf;
++ struct buffer_head *bh;
++ uint64_t blk;
++ int error = 0;
++
++ for (blk = leaf_no; blk; blk = leaf.lf_next) {
++ error = get_leaf(dip, blk, &bh);
++ if (error)
++ break;
++
++ gfs_leaf_in(&leaf, bh->b_data);
++
++ error = gfs_add_bh_to_ub(ub, bh);
++
++ brelse(bh);
++
++ if (error)
++ break;
++ }
++
++ return error;
++}
++
++/**
++ * gfs_get_dir_meta - return all the leaf blocks of a directory
++ * @dip: the directory
++ * @ub: the structure representing the meta
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_get_dir_meta(struct gfs_inode *dip, struct gfs_user_buffer *ub)
++{
++ GFS_ASSERT_INODE(dip->i_di.di_type == GFS_FILE_DIR, dip,);
++ return foreach_leaf(dip, do_gdm, ub);
++}
+diff -urN linux-orig/fs/gfs/dir.h linux-patched/fs/gfs/dir.h
+--- linux-orig/fs/gfs/dir.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/dir.h 2004-06-20 22:48:17.947946967 -0500
+@@ -0,0 +1,55 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __DIR_DOT_H__
++#define __DIR_DOT_H__
++
++/**
++ * gfs_filldir_t - Report a directory entry to the caller of gfs_dir_read()
++ * @opaque: opaque data used by the function
++ * @name: the name of the directory entry
++ * @length: the length of the name
++ * @offset: the entry's offset in the directory
++ * @inum: the inode number the entry points to
++ * @type: the type of inode the entry points to
++ *
++ * Returns: 0 on success, 1 if buffer full
++ */
++
++typedef int (*gfs_filldir_t) (void *opaque,
++ const char *name, unsigned int length,
++ uint64_t offset,
++ struct gfs_inum *inum, unsigned int type);
++
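++/* A minimal filldir callback might look like the sketch below; "my_ctx"
++ is a hypothetical caller-side structure, not part of GFS:
++
++ static int my_filldir(void *opaque, const char *name, unsigned int length,
++ uint64_t offset, struct gfs_inum *inum, unsigned int type)
++ {
++ struct my_ctx *ctx = opaque;
++ return (ctx->count++ < ctx->max) ? 0 : 1;
++ }
++*/
++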
++int gfs_filecmp(struct qstr *file1, char *file2, int len_of_file2);
++int gfs_dirent_alloc(struct gfs_inode *dip, struct buffer_head *bh,
++ int name_len, struct gfs_dirent **dent_out);
++
++int gfs_dir_search(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int *type);
++int gfs_dir_add(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *inum, unsigned int type);
++int gfs_dir_del(struct gfs_inode *dip, struct qstr *filename);
++int gfs_dir_read(struct gfs_inode *dip, uint64_t * offset, void *opaque,
++ gfs_filldir_t filldir);
++int gfs_dir_mvino(struct gfs_inode *dip, struct qstr *filename,
++ struct gfs_inum *new_inum, unsigned int new_type);
++
++int gfs_dir_exhash_free(struct gfs_inode *dip);
++
++int gfs_diradd_alloc_required(struct gfs_inode *dip, struct qstr *filename,
++ int *alloc_required);
++
++int gfs_get_dir_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub);
++
++#endif /* __DIR_DOT_H__ */
+diff -urN linux-orig/fs/gfs/eattr.c linux-patched/fs/gfs/eattr.c
+--- linux-orig/fs/gfs/eattr.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/eattr.c 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,2340 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <asm/uaccess.h>
++#include <linux/xattr_acl.h>
++
++#include "gfs.h"
++#include "acl.h"
++#include "dio.h"
++#include "eattr.h"
++#include "glock.h"
++#include "inode.h"
++#include "ioctl.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++
++#define GFS_EA_REC_LEN(x) gfs32_to_cpu((x)->ea_rec_len)
++#define GFS_EA_NAME(x) ((char *)(x) + sizeof(struct gfs_ea_header))
++#define GFS_EA_DATA_PTRS(x) ((uint64_t *)((char *)(x) + sizeof(struct gfs_ea_header) + (((x)->ea_name_len + 7) & ~7)))
++
++#define GFS_EA_NEXT(x) (struct gfs_ea_header *)((char *)(x) + GFS_EA_REC_LEN(x))
++#define GFS_EA_FREESPACE(x) (struct gfs_ea_header *)((char *)(x) + GFS_EA_SIZE(x))
++
++#define GFS_EAREQ_IS_STUFFED(x, y) (((sizeof(struct gfs_ea_header) + (x)->es_data_len + (x)->es_name_len + 7) & ~7) <= y)
++
++#define GFS_EADATA_NUM_PTRS(x, y) (((x) + (y) - 1) / (y))
++
++#define GFS_EA_SIZE(x) ((sizeof(struct gfs_ea_header) + (x)->ea_name_len + (GFS_EA_IS_UNSTUFFED(x)? (8 * (x)->ea_num_ptrs) : GFS_EA_DATA_LEN(x)) + 7) & ~ 7)
++
++#define GFS_EACMD_VALID(x) ((x) <= GFS_EACMD_REMOVE)
++
++#define GFS_EA_IS_LAST(x) ((x)->ea_flags & GFS_EAFLAG_LAST)
++
++#define GFS_EA_STRLEN(x) ((x)->ea_name_len + 1 + (((x)->ea_type == GFS_EATYPE_USR)? 5 : 7))
++
++#define GFS_FIRST_EA(x) ((struct gfs_ea_header *) ((x)->b_data + sizeof(struct gfs_meta_header)))
++
++#define EA_ALLOC 1
++#define EA_DEALLOC 2
++
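++/* On-disk eattr record layout assumed by the macros above (illustrative):
++ a struct gfs_ea_header, then the name padded to a multiple of 8 bytes,
++ then either the stuffed data or (if GFS_EA_IS_UNSTUFFED) one uint64_t
++ block pointer per data block. GFS_EA_SIZE() is that record rounded up
++ to 8 bytes; ea_rec_len may be larger, and the slack starting at
++ GFS_EA_FREESPACE() is available for reuse. */
++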
++static struct buffer_head *alloc_eattr_blk(struct gfs_sbd *sdp,
++ struct gfs_inode *alloc_ip,
++ struct gfs_inode *ip,
++ uint64_t * block);
++
++/**
++ * can_replace - returns true if ea is large enough to hold the data in
++ * the request
++ */
++
++static __inline__ int
++can_replace(struct gfs_ea_header *ea, struct gfs_easet_io *req,
++ uint32_t avail_size)
++{
++ int data_space = GFS_EA_REC_LEN(ea) - sizeof (struct gfs_ea_header) -
++ ea->ea_name_len;
++
++ if (GFS_EAREQ_IS_STUFFED(req, avail_size) && !GFS_EA_IS_UNSTUFFED(ea))
++ return (req->es_data_len <= data_space);
++ else
++ return (GFS_EADATA_NUM_PTRS(req->es_data_len, avail_size) <=
++ ea->ea_num_ptrs);
++}
++
++/**
++ * get_req_size - returns the actual number of bytes the request will take up
++ * (not counting any unstuffed data blocks)
++ */
++
++static __inline__ uint32_t
++get_req_size(struct gfs_easet_io *req, uint32_t avail_size)
++{
++ uint32_t size = ((sizeof (struct gfs_ea_header) + req->es_data_len +
++ req->es_name_len + 7) & ~7);
++
++ if (size <= avail_size)
++ return size;
++
++ return ((sizeof (struct gfs_ea_header) + req->es_name_len + 7) & ~7) +
++ (8 * GFS_EADATA_NUM_PTRS(req->es_data_len, avail_size));
++}
++
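++/* Illustrative sizing: a stuffed request costs the header plus name plus
++ data, rounded up to 8 bytes; an unstuffed one costs the header plus name
++ rounded up to 8 bytes, plus 8 bytes for each of the
++ GFS_EADATA_NUM_PTRS(es_data_len, avail_size) data-block pointers. */
++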
++/**
++ * gfs_ea_write_permission - decides if the user has permission to write to
++ * the ea
++ * @req: the write request
++ * @ip: inode of file with the ea
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_ea_write_permission(struct gfs_easet_io *req, struct gfs_inode *ip)
++{
++ struct inode *inode = gfs_iget(ip, NO_CREATE);
++ int error = 0;
++
++ GFS_ASSERT_INODE(inode, ip,);
++
++ if (req->es_type == GFS_EATYPE_USR) {
++ if (!S_ISREG(inode->i_mode) &&
++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++ error = -EPERM;
++ else {
++ error = permission(inode, MAY_WRITE, NULL);
++ if (error == -EACCES)
++ error = -EPERM;
++ }
++ } else if (req->es_type == GFS_EATYPE_SYS) {
++ if (IS_ACCESS_ACL(req->es_name, req->es_name_len))
++ error = gfs_validate_acl(ip, req->es_data,
++ req->es_data_len, 1);
++ else if (IS_DEFAULT_ACL(req->es_name, req->es_name_len))
++ error = gfs_validate_acl(ip, req->es_data,
++ req->es_data_len, 0);
++ else {
++ if (!capable(CAP_SYS_ADMIN))
++ error = -EPERM;
++ }
++ } else
++ error = -EOPNOTSUPP;
++
++ iput(inode);
++
++ return error;
++}
++
++/**
++ * gfs_ea_read_permission - decides if the user has permission to read from
++ * the ea
++ * @req: the read request
++ * @ip: inode of file with the ea
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_ea_read_permission(struct gfs_eaget_io *req, struct gfs_inode *ip)
++{
++ struct inode *inode = gfs_iget(ip, NO_CREATE);
++ int error = 0;
++
++ GFS_ASSERT_INODE(inode, ip,);
++
++ if (req->eg_type == GFS_EATYPE_USR){
++ error = permission(inode, MAY_READ, NULL);
++ if (error == -EACCES)
++ error = -EPERM;
++ }
++ else if (req->eg_type == GFS_EATYPE_SYS) {
++ if (IS_ACCESS_ACL(req->eg_name, req->eg_name_len) ||
++ IS_DEFAULT_ACL(req->eg_name, req->eg_name_len))
++ error = 0;
++ else{
++ if (!capable(CAP_SYS_ADMIN))
++ error = -EPERM;
++ }
++ } else
++ error = -EOPNOTSUPP;
++
++ iput(inode);
++
++ return error;
++}
++
++/**
++ * gfs_ea_memcpy - gfs memcpy wrapper with a return value
++ *
++ */
++
++int
++gfs_ea_memcpy(void *dest, void *src, unsigned long size)
++{
++ memcpy(dest, src, size);
++ return 0;
++}
++
++/**
++ * gfs_ea_copy_to_user - copy_to_user wrapper
++ */
++
++int
++gfs_ea_copy_to_user(void *dest, void *src, unsigned long size)
++{
++ int error;
++ error = (copy_to_user(dest, src, size)) ? -EFAULT : 0;
++ return error;
++}
++
++/**
++ * find_direct_eattr - scan a single eattr block for a matching eattr
++ *
++ * Returns: 1 if find_eattr should stop checking (if the eattr was found,
++ * @location will be set),
++ * 0 if find_eattr should keep on checking,
++ * -EXXX on error
++ */
++int
++find_direct_eattr(struct gfs_inode *ip, uint64_t blkno, char *name,
++ int name_len, int type, struct gfs_ea_location *location)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_ea_header *curr, *prev = NULL;
++
++ err = gfs_dread(sdp, blkno, ip->i_gl, DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ curr = GFS_FIRST_EA(bh);
++ if (curr->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(curr))
++ goto out_drelse;
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ if (type != curr->ea_type && ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ if (type == GFS_EATYPE_SYS)
++ err = 1;
++ goto out_drelse;
++ }
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++
++ if (type == curr->ea_type && name_len == curr->ea_name_len &&
++ !memcmp(name, GFS_EA_NAME(curr), name_len)) {
++ location->bh = bh;
++ location->ea = curr;
++ location->prev = prev;
++ err = 1;
++ goto out;
++ }
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++
++ out_drelse:
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++/**
++ * find_eattr - find a matching eattr
++ *
++ * Returns: 1 if ea found, 0 if no ea found, -EXXX on error
++ */
++int
++find_eattr(struct gfs_inode *ip, char *name, int name_len, int type,
++ struct gfs_ea_location *location)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t *eablk, *end;
++
++ memset(location, 0, sizeof (struct gfs_ea_location));
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ err = gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto fail;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_IN);
++ eablk = (uint64_t *) ((bh)->b_data + sizeof (struct gfs_indirect));
++ end = eablk + ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++ while (eablk < end && *eablk) {
++ err = find_direct_eattr(ip, gfs64_to_cpu(*eablk), name,
++ name_len, type, location);
++ if (err || location->ea)
++ break;
++ eablk++;
++ }
++ brelse(bh);
++ if (err < 0)
++ goto fail;
++ } else {
++ err = find_direct_eattr(ip, ip->i_di.di_eattr, name, name_len,
++ type, location);
++ if (err < 0)
++ goto fail;
++ }
++
++ return (location->ea != NULL);
++
++ fail:
++ return err;
++}
++
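++/**
++ * make_space - compact an eattr block so @size bytes of record space are free
++ * @ip: the inode the eattrs belong to
++ * @bh: the eattr block to compact
++ * @size: the record size that needs to fit
++ * @blkno: the block number of @bh
++ * @avail: set to the reclaimed space, if enough could be recovered
++ */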
++static void
++make_space(struct gfs_inode *ip, struct buffer_head *bh, uint32_t size,
++ uint64_t blkno, struct gfs_ea_location *avail)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint32_t free_size, avail_size;
++ struct gfs_ea_header *ea, *new_ea;
++ void *buf;
++
++ free_size = 0;
++ avail_size = sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ ea = GFS_FIRST_EA(bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++ if (ea->ea_type == GFS_EATYPE_UNUSED) {
++ free_size = GFS_EA_REC_LEN(ea);
++ ea = GFS_EA_NEXT(ea);
++ }
++ while (free_size < size) {
++ free_size += (GFS_EA_REC_LEN(ea) - GFS_EA_SIZE(ea));
++ if (GFS_EA_IS_LAST(ea))
++ break;
++ ea = GFS_EA_NEXT(ea);
++ }
++ if (free_size < size)
++ goto out;
++ buf = gmalloc(avail_size);
++
++ free_size = avail_size;
++ ea = GFS_FIRST_EA(bh);
++ if (ea->ea_type == GFS_EATYPE_UNUSED)
++ ea = GFS_EA_NEXT(ea);
++ new_ea = (struct gfs_ea_header *) buf;
++ new_ea->ea_flags = 0;
++ new_ea->ea_rec_len = cpu_to_gfs32(size);
++ new_ea->ea_num_ptrs = 0;
++ new_ea->ea_type = GFS_EATYPE_UNUSED;
++ free_size -= size;
++ new_ea = GFS_EA_NEXT(new_ea);
++ while (1) {
++ memcpy(new_ea, ea, GFS_EA_SIZE(ea));
++ if (GFS_EA_IS_LAST(ea))
++ break;
++ new_ea->ea_rec_len = cpu_to_gfs32(GFS_EA_SIZE(ea));
++ free_size -= GFS_EA_SIZE(ea);
++ ea = GFS_EA_NEXT(ea);
++ new_ea = GFS_EA_NEXT(new_ea);
++ }
++ new_ea->ea_rec_len = cpu_to_gfs32(free_size);
++ memcpy(GFS_FIRST_EA(bh), buf, avail_size);
++ kfree(buf);
++ avail->ea = GFS_FIRST_EA(bh);
++ avail->prev = NULL;
++ avail->bh = bh;
++
++ out:
++ return;
++}
++
++static int
++expand_to_indirect(struct gfs_inode *alloc_ip, struct gfs_inode *ip,
++ struct buffer_head **bh)
++{
++ int err;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh1 = NULL, *bh2 = NULL, *indbh = NULL;
++ uint64_t blkno, *blkptr;
++ uint32_t free_size, avail_size;
++ struct gfs_ea_header *prev, *curr, *new_ea = NULL;
++
++ avail_size = sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ free_size = avail_size;
++ ip->i_di.di_flags |= GFS_DIF_EA_INDIRECT;
++ blkno = ip->i_di.di_eattr;
++ err = gfs_metaalloc(alloc_ip, &ip->i_di.di_eattr);
++ if (err)
++ goto out;
++ ip->i_di.di_blocks++;
++ err = gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl, DIO_NEW | DIO_START |
++ DIO_WAIT, &indbh);
++ if (err)
++ goto out;
++ bh1 = *bh;
++ *bh = indbh;
++ gfs_trans_add_bh(ip->i_gl, indbh);
++ gfs_metatype_set(sdp, indbh, GFS_METATYPE_IN, GFS_FORMAT_IN);
++ memset((indbh)->b_data + sizeof (struct gfs_meta_header), 0,
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header));
++ blkptr = (uint64_t *) ((indbh)->b_data + sizeof (struct gfs_indirect));
++ *blkptr++ = cpu_to_gfs64(blkno);
++ prev = NULL;
++ curr = GFS_FIRST_EA(bh1);
++ while (curr->ea_type != GFS_EATYPE_USR) {
++ if (GFS_EA_IS_LAST(curr))
++ goto out_drelse1;
++ free_size -= GFS_EA_REC_LEN(curr);
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ if (!prev || prev->ea_type == GFS_EATYPE_UNUSED)
++ goto out_drelse1;
++ gfs_trans_add_bh(ip->i_gl, bh1);
++ prev->ea_rec_len = cpu_to_gfs32(GFS_EA_REC_LEN(prev) + free_size);
++ prev->ea_flags |= GFS_EAFLAG_LAST;
++ bh2 = alloc_eattr_blk(sdp, alloc_ip, ip, &blkno);
++ if (!bh2) {
++ err = -EIO;
++ goto out_drelse1;
++ }
++ free_size = avail_size;
++ new_ea = GFS_FIRST_EA(bh2);
++ while (1) {
++ memcpy(new_ea, curr, GFS_EA_SIZE(curr));
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ new_ea->ea_rec_len = cpu_to_gfs32(GFS_EA_SIZE(curr));
++ free_size -= GFS_EA_SIZE(curr);
++ curr = GFS_EA_NEXT(curr);
++ new_ea = GFS_EA_NEXT(new_ea);
++ }
++ new_ea->ea_rec_len = cpu_to_gfs32(free_size);
++ *blkptr = cpu_to_gfs64(blkno);
++ brelse(bh2);
++
++ out_drelse1:
++ brelse(bh1);
++
++ out:
++ return err;
++}
++
++static void
++find_direct_sys_space(struct gfs_inode *ip, int size, struct buffer_head *bh,
++ struct gfs_ea_location *avail)
++{
++ struct gfs_ea_header *curr, *prev = NULL;
++
++ curr = GFS_FIRST_EA(bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (curr->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_REC_LEN(curr) >= size) {
++ avail->ea = curr;
++ avail->prev = NULL;
++ avail->bh = bh;
++ goto out;
++ }
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ while (curr->ea_type == GFS_EATYPE_SYS) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (GFS_EA_REC_LEN(curr) >= GFS_EA_SIZE(curr) + size) {
++ avail->ea = curr;
++ avail->prev = prev;
++ avail->bh = bh;
++ goto out;
++ }
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ make_space(ip, bh, size, ip->i_di.di_eattr, avail);
++
++ out:
++ return;
++}
++
++/**
++ * find_indirect_space - look for usable space in one block of an indirect eattr chain
++ *
++ * @avail: set to the location of the usable space, if any was found
++ * @blktype: returns the type of block GFS_EATYPE_...
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++static int
++find_indirect_space(struct gfs_inode *ip, uint64_t blkno, int type,
++ int size, struct gfs_ea_location *avail, int *blktype)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_ea_header *curr, *prev = NULL;
++
++ err = gfs_dread(sdp, blkno, ip->i_gl, DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ curr = GFS_FIRST_EA(bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (curr->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(curr)) {
++ avail->ea = curr;
++ avail->prev = NULL;
++ avail->bh = bh;
++ *blktype = GFS_EATYPE_UNUSED;
++ goto out;
++ }
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ if (type != curr->ea_type) {
++ *blktype = curr->ea_type;
++ goto out_drelse;
++ } else
++ *blktype = type;
++ if (prev && GFS_EA_REC_LEN(prev) >= size) {
++ avail->ea = prev;
++ avail->prev = NULL;
++ avail->bh = bh;
++ goto out;
++ }
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (GFS_EA_REC_LEN(curr) >= GFS_EA_SIZE(curr) + size) {
++ avail->ea = curr;
++ avail->prev = prev;
++ avail->bh = bh;
++ goto out;
++ }
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++
++ out_drelse:
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++static int
++find_indirect_sys_space(struct gfs_inode *alloc_ip, struct gfs_inode *ip,
++ int size, struct buffer_head *bh,
++ struct gfs_ea_location *avail)
++{
++ int err = 0;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t *eablk, *end, *first_usr_blk = NULL;
++ int blktype;
++ uint64_t blkno;
++
++ eablk = (uint64_t *) ((bh)->b_data + sizeof (struct gfs_indirect));
++ end = eablk + ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++
++ while (eablk < end && *eablk) {
++ err = find_indirect_space(ip, gfs64_to_cpu(*eablk),
++ GFS_EATYPE_SYS, size, avail, &blktype);
++ if (err)
++ goto out;
++ if (blktype == GFS_EATYPE_USR && !first_usr_blk)
++ first_usr_blk = eablk;
++ if (avail->ea) {
++ if (!first_usr_blk)
++ goto out;
++ gfs_trans_add_bh(ip->i_gl, bh);
++ blkno = *eablk;
++ *eablk = *first_usr_blk;
++ *first_usr_blk = blkno;
++ goto out;
++ }
++ eablk++;
++ }
++ if (eablk >= end) {
++ err = -ENOSPC;
++ goto out;
++ }
++ avail->bh = alloc_eattr_blk(sdp, alloc_ip, ip, &blkno);
++ if (!avail->bh) {
++ err = -EIO;
++ goto out;
++ }
++ avail->ea = GFS_FIRST_EA(avail->bh);
++ avail->prev = NULL;
++ gfs_trans_add_bh(ip->i_gl, bh);
++ if (first_usr_blk) {
++ *eablk = *first_usr_blk;
++ *first_usr_blk = cpu_to_gfs64(blkno);
++ } else
++ *eablk = cpu_to_gfs64(blkno);
++
++ out:
++ return err;
++}
++
++int
++find_sys_space(struct gfs_inode *alloc_ip, struct gfs_inode *ip, int size,
++ struct gfs_ea_location *avail)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++
++ err = gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_IN);
++ err = find_indirect_sys_space(alloc_ip, ip, size, bh, avail);
++ } else {
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ find_direct_sys_space(ip, size, bh, avail);
++ if (!avail->ea) {
++ err = expand_to_indirect(alloc_ip, ip, &bh);
++ if (err)
++ goto out_drelse;
++ err = find_indirect_sys_space(alloc_ip, ip, size, bh, avail);
++ }
++ }
++
++ out_drelse:
++ if (avail->bh != bh)
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++static int
++get_blk_type(struct gfs_inode *ip, uint64_t blkno, int *blktype)
++{
++ int err = 0;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ struct gfs_ea_header *ea;
++
++ err = gfs_dread(sdp, blkno, ip->i_gl, DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ ea = GFS_FIRST_EA(bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++ if (ea->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(ea)) {
++ *blktype = GFS_EATYPE_UNUSED;
++ goto out_drelse;
++ }
++ ea = GFS_EA_NEXT(ea);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++ }
++ *blktype = ea->ea_type;
++
++ out_drelse:
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++static void
++find_direct_usr_space(struct gfs_inode *ip, int size, struct buffer_head *bh,
++ struct gfs_ea_location *avail)
++{
++ struct gfs_ea_header *curr, *prev = NULL;
++
++ curr = GFS_FIRST_EA(bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (curr->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(curr)) {
++ avail->ea = curr;
++ avail->prev = NULL;
++ avail->bh = bh;
++ goto out;
++ }
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ if (curr->ea_type == GFS_EATYPE_USR
++ && GFS_EA_REC_LEN(prev) >= size) {
++ avail->ea = prev;
++ avail->prev = NULL;
++ avail->bh = bh;
++ goto out;
++ }
++ }
++ while (curr->ea_type != GFS_EATYPE_USR) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (GFS_EA_REC_LEN(curr) >= GFS_EA_SIZE(curr) + size) {
++ avail->ea = curr;
++ avail->prev = prev;
++ avail->bh = bh;
++ goto out;
++ }
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++
++ out:
++ return;
++}
++
++static int
++find_indirect_usr_space(struct gfs_inode *ip, int size, struct buffer_head *bh,
++ struct gfs_ea_location *avail)
++{
++ int err = 0;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t *eablk, *end, *last_sys_blk = NULL, *first_usr_blk = NULL;
++ int blktype;
++ uint64_t blkno;
++
++ eablk = (uint64_t *) ((bh)->b_data + sizeof (struct gfs_indirect));
++ end = eablk + ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++
++ while (eablk < end && *eablk) {
++ err = find_indirect_space(ip, gfs64_to_cpu(*eablk),
++ GFS_EATYPE_USR, size, avail, &blktype);
++ if (err)
++ goto out;
++ if (blktype == GFS_EATYPE_SYS)
++ last_sys_blk = eablk;
++ if (blktype == GFS_EATYPE_USR && !first_usr_blk)
++ first_usr_blk = eablk;
++ if (avail->ea) {
++ if (first_usr_blk)
++ goto out;
++ first_usr_blk = eablk + 1;
++ while (first_usr_blk < end && *first_usr_blk) {
++ err = get_blk_type(ip, gfs64_to_cpu(*first_usr_blk),
++ &blktype);
++ if (blktype == GFS_EATYPE_SYS)
++ last_sys_blk = first_usr_blk;
++ if (blktype == GFS_EATYPE_USR)
++ break;
++ first_usr_blk++;
++ }
++ if (last_sys_blk > eablk) {
++ gfs_trans_add_bh(ip->i_gl, bh);
++ blkno = *eablk;
++ *eablk = *last_sys_blk;
++ *last_sys_blk = blkno;
++ }
++ goto out;
++ }
++ eablk++;
++ }
++
++ if (eablk >= end) {
++ err = -ENOSPC;
++ goto out;
++ }
++ avail->bh = alloc_eattr_blk(sdp, ip, ip, &blkno);
++ if (!avail->bh) {
++ err = -EIO;
++ goto out;
++ }
++ avail->ea = GFS_FIRST_EA(avail->bh);
++ avail->prev = NULL;
++ gfs_trans_add_bh(ip->i_gl, bh);
++ *eablk = cpu_to_gfs64(blkno);
++
++ out:
++ return err;
++}
++
++static int
++find_usr_space(struct gfs_inode *ip, int size, struct gfs_ea_location *avail)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++
++ err = gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_IN);
++ err = find_indirect_usr_space(ip, size, bh, avail);
++ } else {
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ find_direct_usr_space(ip, size, bh, avail);
++ if (!avail->ea) {
++ err = expand_to_indirect(ip, ip, &bh);
++ if (err)
++ goto out_drelse;
++ err = find_indirect_usr_space(ip, size, bh, avail);
++ }
++ }
++
++ out_drelse:
++ if (avail->bh != bh)
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++static int
++find_space(struct gfs_inode *ip, int size, int type,
++ struct gfs_ea_location *avail)
++{
++ int err;
++
++ memset(avail, 0, sizeof (struct gfs_ea_location));
++
++ if (type == GFS_EATYPE_SYS)
++ err = find_sys_space(ip, ip, size, avail);
++ else
++ err = find_usr_space(ip, size, avail);
++
++ return err;
++}
++
++static int
++can_replace_in_block(struct gfs_inode *ip, int size,
++ struct gfs_ea_location found, struct gfs_ea_header **space)
++{
++ struct gfs_ea_header *curr, *prev = NULL;
++
++ *space = NULL;
++ curr = GFS_FIRST_EA(found.bh);
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (curr->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_REC_LEN(curr) >= size) {
++ *space = curr;
++ goto out;
++ }
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(curr), ip,);
++ if (curr == found.ea) {
++ /*
++ * See if there will be enough space after the old version of the eattr
++ * is deleted.
++ */
++ if (prev) {
++ if (prev->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_REC_LEN(prev) +
++ GFS_EA_REC_LEN(curr) >= size) {
++ *space = prev;
++ goto out;
++ }
++ } else if (GFS_EA_REC_LEN(prev) +
++ GFS_EA_REC_LEN(curr) >=
++ GFS_EA_SIZE(prev) + size) {
++ *space = prev;
++ goto out;
++ }
++ } else if (GFS_EA_REC_LEN(curr) >= size) {
++ *space = curr;
++ goto out;
++ }
++ } else if (GFS_EA_REC_LEN(curr) >= GFS_EA_SIZE(curr) + size) {
++ *space = curr;
++ goto out;
++ }
++ if (GFS_EA_IS_LAST(curr))
++ break;
++ prev = curr;
++ curr = GFS_EA_NEXT(curr);
++ }
++
++ out:
++ return (*space != NULL);
++}
++
++/**
++ * read_unstuffed - copy unstuffed ea data into the request buffer
++ * @dest: the destination buffer
++ * @copy_fn: the function to use to do the actual copying
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++read_unstuffed(void *dest, struct gfs_inode *ip, struct gfs_sbd *sdp,
++ struct gfs_ea_header *ea, uint32_t avail_size,
++ gfs_ea_copy_fn_t copy_fn)
++{
++ struct buffer_head *bh[66]; /* This is the maximum number of data ptrs possible */
++ int err = 0;
++ int max = GFS_EADATA_NUM_PTRS(GFS_EA_DATA_LEN(ea), avail_size);
++ int i, j, left = GFS_EA_DATA_LEN(ea);
++ char *outptr, *buf;
++ uint64_t *indptr = GFS_EA_DATA_PTRS(ea);
++
++ for (i = 0; i < max; i++) {
++ err =
++ gfs_dread(sdp, gfs64_to_cpu(*indptr), ip->i_gl, DIO_START,
++ &bh[i]);
++ indptr++;
++ if (err) {
++ for (j = 0; j < i; j++)
++ brelse(bh[j]);
++ goto out;
++ }
++ }
++
++ outptr = dest;
++
++ for (i = 0; i < max; i++) {
++ err = gfs_dreread(sdp, bh[i], DIO_WAIT);
++ if (err) {
++ for (j = i; j < max; j++)
++ brelse(bh[j]);
++ goto out;
++ }
++ gfs_metatype_check(sdp, bh[i], GFS_METATYPE_EA);
++ buf = (bh[i])->b_data + sizeof (struct gfs_meta_header);
++ err =
++ copy_fn(outptr, buf,
++ (avail_size > left) ? left : avail_size);
++ if (err) {
++ for (j = i; j < max; j++)
++ brelse(bh[j]);
++ goto out;
++ }
++ left -= avail_size;
++ outptr += avail_size;
++ brelse(bh[i]);
++ }
++
++ out:
++
++ return err;
++}
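++
++/*
++ * A note on the bh[66] bound in read_unstuffed(): GFS_MAX_EA_ACL_BLKS
++ * is 66 (65 unstuffed data blocks plus one for the ea block itself,
++ * per eattr.h), so an ea record never carries more than 65 data
++ * pointers. A rough sketch of the arithmetic, assuming a 4096-byte
++ * block and a 24-byte gfs_meta_header:
++ *
++ * avail_size = 4096 - 24 = 4072 bytes of ea data per block
++ * max unstuffed data = 65 * 4072 = 264680 bytes per ea
++ *
++ * so GFS_EADATA_NUM_PTRS() (which rounds the data length up to whole
++ * avail_size-sized blocks) stays within the array.
++ */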
++
++/**
++ * get_ea - read an extended attribute, or just its size
++ * @sdp: pointer to the superblock
++ * @ip: pointer to the inode
++ * @req: the request information
++ * @copy_fn: the function to use to do the actual copying
++ *
++ * If req->eg_data_len is zero, only the size of the data is returned.
++ *
++ * Returns: the size of the ea's data on success, -EXXX on error
++ */
++int
++get_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_eaget_io *req,
++ gfs_ea_copy_fn_t copy_fn)
++{
++ int err;
++ struct gfs_ea_location location;
++ uint32_t avail_size;
++
++ avail_size = sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++
++ err = find_eattr(ip, req->eg_name, req->eg_name_len, req->eg_type,
++ &location);
++ if (err != 1) {
++ if (err == 0)
++ err = -ENODATA;
++ goto out;
++ }
++
++ if (req->eg_data_len) {
++ if (req->eg_data_len < GFS_EA_DATA_LEN(location.ea))
++ err = -ERANGE;
++ else if (GFS_EA_IS_UNSTUFFED(location.ea))
++ err =
++ read_unstuffed(req->eg_data, ip, sdp, location.ea,
++ avail_size, copy_fn);
++ else
++ err = copy_fn(req->eg_data, GFS_EA_DATA(location.ea),
++ GFS_EA_DATA_LEN(location.ea));
++ if (!err)
++ err = GFS_EA_DATA_LEN(location.ea);
++ } else
++ err = GFS_EA_DATA_LEN(location.ea);
++
++ brelse(location.bh);
++
++ out:
++ return err;
++}
++
++/**
++ * prep_ea - prepare an ea record to receive a new ea
++ * @ea: the ea record to prepare
++ *
++ * If @ea is unused, it is reused in place. Otherwise the free space
++ * at the end of @ea is split off into a new record, and the
++ * GFS_EAFLAG_LAST flag is moved to whichever record now ends the block.
++ *
++ * Returns: the record that the new ea should be written into
++ */
++
++struct gfs_ea_header *
++prep_ea(struct gfs_ea_header *ea)
++{
++ struct gfs_ea_header *new = ea;
++
++ if (ea->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(ea))
++ ea->ea_flags = GFS_EAFLAG_LAST;
++ else
++ ea->ea_flags = 0;
++ } else {
++ new = GFS_EA_FREESPACE(ea);
++ new->ea_rec_len =
++ cpu_to_gfs32(GFS_EA_REC_LEN(ea) - GFS_EA_SIZE(ea));
++ ea->ea_rec_len = cpu_to_gfs32(GFS_EA_SIZE(ea));
++ if (GFS_EA_IS_LAST(ea)) {
++ ea->ea_flags &= ~GFS_EAFLAG_LAST;
++ new->ea_flags = GFS_EAFLAG_LAST;
++ } else
++ new->ea_flags = 0;
++ }
++
++ return new;
++}
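++
++/*
++ * An illustrative sketch of the split prep_ea() performs when @ea is
++ * in use but has free space at its end (sizes are made up):
++ *
++ * before: [ rec_len = 96: 40 bytes used, 56 bytes free ]
++ * after: [ rec_len = 40 ][ new: rec_len = 56, filled by write_ea() ]
++ *
++ * GFS_EA_SIZE() is the used portion, so the new record starts at
++ * GFS_EA_FREESPACE() and inherits GFS_EAFLAG_LAST if @ea had it.
++ */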
++
++/**
++ * replace_ea - replaces the existing data with the request data
++ */
++int
++replace_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_ea_header *ea,
++ struct gfs_easet_io *req)
++{
++ int err = 0;
++ int i;
++ uint32_t copy_size, data_left = req->es_data_len;
++ struct buffer_head *bh;
++ uint64_t *datablk = GFS_EA_DATA_PTRS(ea);
++ const char *dataptr = req->es_data;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++
++ ea->ea_data_len = cpu_to_gfs32(req->es_data_len);
++ if (!GFS_EA_IS_UNSTUFFED(ea))
++ memcpy(GFS_EA_DATA(ea), req->es_data, req->es_data_len);
++ else {
++ for (i = 0; i < ea->ea_num_ptrs && data_left > 0; i++) {
++ err = gfs_dread(sdp, gfs64_to_cpu(*datablk), ip->i_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++ gfs_trans_add_bh(ip->i_gl, bh);
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++ copy_size =
++ (data_left > avail_size) ? avail_size : data_left;
++ memcpy((bh)->b_data + sizeof (struct gfs_meta_header),
++ dataptr, copy_size);
++ dataptr += copy_size;
++ data_left -= copy_size;
++ datablk++;
++ brelse(bh);
++ }
++ GFS_ASSERT_INODE(data_left == 0, ip,
++ printk
++ ("req->es_data_len = %u, ea->ea_num_ptrs = %d\n",
++ req->es_data_len, ea->ea_num_ptrs);
++ );
++ }
++
++ out:
++ return err;
++}
++
++/**
++ * write_ea - writes the request info to an ea, creating new blocks if
++ * necessary
++ *
++ * @sdp: superblock pointer
++ * @alloc_ip: inode that has the blocks reserved for allocation
++ * @ip: inode that is being modified
++ * @ea: the location of the new ea in a block
++ * @req: the write request
++ *
++ * Note: does not update ea_rec_len or the GFS_EAFLAG_LAST bit of ea_flags
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++write_ea(struct gfs_sbd *sdp, struct gfs_inode *alloc_ip, struct gfs_inode *ip,
++ struct gfs_ea_header *ea, struct gfs_easet_io *req)
++{
++ int err = 0;
++ uint64_t *blkptr;
++ uint64_t temp;
++ const char *dataptr;
++ uint32_t data_left, copy;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ int i;
++ struct buffer_head *bh = NULL;
++
++ ea->ea_data_len = cpu_to_gfs32(req->es_data_len);
++ ea->ea_name_len = req->es_name_len;
++ ea->ea_type = req->es_type;
++ ea->ea_pad = 0;
++
++ memcpy(GFS_EA_NAME(ea), req->es_name, req->es_name_len);
++
++ if (GFS_EAREQ_IS_STUFFED(req, avail_size)) {
++ ea->ea_num_ptrs = 0;
++ memcpy(GFS_EA_DATA(ea), req->es_data, req->es_data_len);
++ } else {
++ blkptr = GFS_EA_DATA_PTRS(ea);
++ dataptr = req->es_data;
++ data_left = req->es_data_len;
++ ea->ea_num_ptrs =
++ GFS_EADATA_NUM_PTRS(req->es_data_len, avail_size);
++
++ for (i = 0; i < ea->ea_num_ptrs; i++) {
++ if ((bh =
++ alloc_eattr_blk(sdp, alloc_ip, ip,
++ &temp)) == NULL) {
++ err = -EIO;
++ goto out;
++ }
++ copy =
++ (data_left > avail_size) ? avail_size : data_left;
++ memcpy((bh)->b_data + sizeof (struct gfs_meta_header),
++ dataptr, copy);
++ *blkptr = cpu_to_gfs64(temp);
++ dataptr += copy;
++ data_left -= copy;
++ blkptr++;
++ brelse(bh);
++ }
++
++ GFS_ASSERT_INODE(!data_left, ip,);
++ }
++
++ out:
++
++ return err;
++}
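++
++/*
++ * A rough example of the stuffed/unstuffed decision in write_ea(),
++ * assuming a 4096-byte block so that avail_size = 4072: an ea is
++ * stuffed when its header, name, and data all fit in the record
++ * itself; otherwise the record holds ea_num_ptrs 64-bit block
++ * pointers and the data is spread over those blocks, avail_size
++ * bytes per block after the gfs_meta_header. E.g. a 10000-byte
++ * value needs three data blocks (two full, one partial), which is
++ * the rounding GFS_EADATA_NUM_PTRS(10000, 4072) performs.
++ */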
++
++/**
++ * erase_ea_data_ptrs - deallocate all the unstuffed data blocks pointed to
++ * by ea records in this block
++ * @sdp: the superblock
++ * @ip: the inode
++ * @dibh: the inode's dinode buffer
++ * @blk: the block to check for data pointers
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++erase_ea_data_ptrs(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct buffer_head *dibh, uint64_t blk)
++{
++ struct gfs_holder rgd_gh;
++ int i, err = 0;
++ uint64_t *datablk;
++ struct buffer_head *eabh;
++ char *buf;
++ struct gfs_ea_header *ea;
++ struct gfs_rgrpd *rgd = NULL;
++
++ err = gfs_dread(sdp, blk, ip->i_gl, DIO_WAIT | DIO_START, &eabh);
++ if (err)
++ goto fail;
++
++ gfs_metatype_check(sdp, eabh, GFS_METATYPE_EA);
++ buf = (eabh)->b_data + sizeof (struct gfs_meta_header);
++ ea = (struct gfs_ea_header *) buf;
++
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++ if (GFS_EA_IS_UNSTUFFED(ea)) {
++ datablk = GFS_EA_DATA_PTRS(ea);
++ rgd = gfs_blk2rgrpd(sdp, gfs64_to_cpu(*datablk));
++ GFS_ASSERT_INODE(rgd, ip,
++ printk("block = %" PRIu64 "\n",
++ gfs64_to_cpu(*datablk)););
++ err =
++ gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
++ &rgd_gh);
++ if (err)
++ goto fail_eabh;
++ /* Trans may require:
++ One block for the RG header. One block for each ea data block.
++ One block for the dinode. One block for the current ea block.
++ One block for a quota change.
++ FIXME */
++ err =
++ gfs_trans_begin(sdp,
++ 3 + ea->ea_num_ptrs, 1);
++ if (err)
++ goto fail_glock_rg;
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ for (i = 0; i < ea->ea_num_ptrs; i++, datablk++) {
++ gfs_metafree(ip, gfs64_to_cpu(*datablk), 1);
++ ip->i_di.di_blocks--;
++ }
++ ea->ea_num_ptrs = 0;
++ gfs_trans_add_bh(ip->i_gl, eabh);
++ gfs_dinode_out(&ip->i_di, (dibh)->b_data);
++ gfs_trans_end(sdp);
++ gfs_glock_dq_uninit(&rgd_gh);
++ }
++ if (GFS_EA_IS_LAST(ea))
++ break;
++ ea = GFS_EA_NEXT(ea);
++ }
++
++ brelse(eabh);
++
++ return err;
++
++ fail_glock_rg:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ fail_eabh:
++ brelse(eabh);
++
++ fail:
++ return err;
++}
++
++/**
++ * gfs_ea_dealloc - deallocate the extended attribute fork
++ * @ip: the inode
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_ea_dealloc(struct gfs_inode *ip)
++{
++ struct gfs_holder ri_gh, rgd_gh;
++ int err = 0;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *dibh, *indbh = NULL;
++ uint64_t *startblk, *eablk, *end, *next;
++ uint64_t temp;
++ int num_blks;
++ struct gfs_rgrpd *rgd = NULL;
++
++ if (!ip->i_di.di_eattr)
++ goto out;
++
++ gfs_alloc_get(ip);
++
++ err = gfs_quota_hold_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (err)
++ goto out_alloc;
++
++ err = gfs_rindex_hold(sdp, &ri_gh);
++ if (err)
++ goto out_unhold_q;
++
++ err = gfs_get_inode_buffer(ip, &dibh);
++ if (err)
++ goto out_rindex_release;
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ err =
++ gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_WAIT | DIO_START, &indbh);
++ if (err)
++ goto out_dibh;
++
++ gfs_metatype_check(sdp, indbh, GFS_METATYPE_IN);
++
++ eablk =
++ (uint64_t *) ((indbh)->b_data +
++ sizeof (struct gfs_indirect));
++ end =
++ eablk +
++ ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++
++ while (eablk < end && *eablk) {
++ err =
++ erase_ea_data_ptrs(sdp, ip, dibh,
++ gfs64_to_cpu(*eablk));
++ if (err)
++ goto out_indbh;
++ eablk++;
++ }
++
++ startblk = eablk - 1;
++ end =
++ (uint64_t *) ((indbh)->b_data +
++ sizeof (struct gfs_indirect));
++
++ while (startblk >= end) {
++ rgd = gfs_blk2rgrpd(sdp, gfs64_to_cpu(*startblk));
++ GFS_ASSERT_INODE(rgd, ip,);
++
++ num_blks = 1;
++ next = eablk = startblk - 1;
++
++ while (eablk >= end) {
++ if (rgd ==
++ gfs_blk2rgrpd(sdp, gfs64_to_cpu(*eablk))) {
++ if (eablk != next) {
++ temp = *eablk;
++ *eablk = *next;
++ *next = temp;
++ }
++ num_blks++;
++ next--;
++ }
++ eablk--;
++ }
++
++ err =
++ gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
++ &rgd_gh);
++ if (err)
++ goto out_rindex_release;
++
++ /* Trans may require:
++ One block for the RG header. One block for each block freed from
++ this resource group. One block for the indirect ea block.
++ One block for the quota change. */
++
++ err =
++ gfs_trans_begin(sdp, 3 + num_blks,
++ 1);
++ if (err)
++ goto out_gunlock_rg;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ while (startblk > next) {
++ gfs_metafree(ip, gfs64_to_cpu(*startblk), 1);
++ ip->i_di.di_blocks--;
++ *startblk = 0;
++ startblk--;
++ }
++
++ gfs_trans_add_bh(ip->i_gl, indbh);
++ gfs_dinode_out(&ip->i_di, (dibh)->b_data);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_uninit(&rgd_gh);
++ }
++
++ brelse(indbh);
++ indbh = NULL;
++ } else {
++ err = erase_ea_data_ptrs(sdp, ip, dibh, ip->i_di.di_eattr);
++ if (err)
++ goto out_rindex_release;
++ }
++
++ rgd = gfs_blk2rgrpd(sdp, ip->i_di.di_eattr);
++ GFS_ASSERT_INODE(rgd, ip,
++ printk("block = %" PRIu64 "\n", ip->i_di.di_eattr);
++ );
++
++ err = gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
++ if (err)
++ goto out_rindex_release;
++
++ err = gfs_trans_begin(sdp, 3, 1);
++ if (err)
++ goto out_gunlock_rg;
++
++ gfs_metafree(ip, ip->i_di.di_eattr, 1);
++
++ ip->i_di.di_blocks--;
++ ip->i_di.di_eattr = 0;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, (dibh)->b_data);
++
++ gfs_trans_end(sdp);
++
++ out_gunlock_rg:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ out_indbh:
++ if (indbh)
++ brelse(indbh);
++
++ out_dibh:
++ brelse(dibh);
++
++ out_rindex_release:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ out_unhold_q:
++ gfs_quota_unhold_m(ip);
++
++ out_alloc:
++ gfs_alloc_put(ip);
++
++ out:
++
++ return err;
++}
++
++/**
++ * remove_ea - remove an ea record from its block
++ * @ip: the inode that owns the ea
++ * @ea: the ea record to remove
++ * @prev: the record before @ea in the block (or NULL)
++ *
++ * Frees any unstuffed data blocks, marks the record unused, and
++ * coalesces it with @prev when possible.
++ */
++
++static void
++remove_ea(struct gfs_inode *ip, struct gfs_ea_header *ea,
++ struct gfs_ea_header *prev)
++{
++ uint64_t *datablk;
++ int i;
++
++ if (GFS_EA_IS_UNSTUFFED(ea)) {
++ datablk = GFS_EA_DATA_PTRS(ea);
++ for (i = 0; i < ea->ea_num_ptrs; i++, datablk++) {
++ gfs_metafree(ip, gfs64_to_cpu(*datablk), 1);
++ ip->i_di.di_blocks--;
++ }
++ }
++
++ ea->ea_type = GFS_EATYPE_UNUSED;
++ ea->ea_num_ptrs = 0;
++
++ if (prev && prev != ea) {
++ prev->ea_rec_len =
++ cpu_to_gfs32(GFS_EA_REC_LEN(prev) + GFS_EA_REC_LEN(ea));
++ if (GFS_EA_IS_LAST(ea))
++ prev->ea_flags |= GFS_EAFLAG_LAST;
++ }
++}
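++
++/*
++ * Sketch of the coalescing done by remove_ea() (sizes are made up):
++ * removing the middle record of
++ *
++ * [ prev: rec_len = 40 ][ ea: rec_len = 64 ][ next ... ]
++ *
++ * marks @ea unused and folds its length into @prev, leaving
++ *
++ * [ prev: rec_len = 104 ][ next ... ]
++ *
++ * and @prev inherits GFS_EAFLAG_LAST if @ea ended the block.
++ */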
++
++int
++init_new_inode_eattr(struct gfs_inode *dip, struct gfs_inode *ip,
++ struct gfs_easet_io *req)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_ea_header *ea;
++
++ err = gfs_metaalloc(dip, &ip->i_di.di_eattr);
++ if (err)
++ goto out;
++
++ err = gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_NEW | DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_EA, GFS_FORMAT_EA);
++
++ ip->i_di.di_blocks++;
++
++ ea = GFS_FIRST_EA(bh);
++ ea->ea_flags = GFS_EAFLAG_LAST;
++ ea->ea_rec_len =
++ cpu_to_gfs32(sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header));
++ ea->ea_num_ptrs = 0;
++ ea->ea_type = GFS_EATYPE_UNUSED;
++ err = write_ea(sdp, dip, ip, ea, req);
++ if (err)
++ goto out_drelse;
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++
++ out_drelse:
++ brelse(bh);
++
++ out:
++ return err;
++}
++
++int
++do_init_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_easet_io *req)
++{
++ int err;
++ struct buffer_head *bh;
++ struct gfs_ea_header *ea;
++
++ bh = alloc_eattr_blk(sdp, ip, ip, &ip->i_di.di_eattr);
++ if (bh) {
++ ea = GFS_FIRST_EA(bh);
++ err = write_ea(sdp, ip, ip, ea, req);
++ brelse(bh);
++ } else
++ err = -EIO;
++
++ return err;
++}
++
++/**
++ * init_eattr - allocate and initialize an inode's first eattr block
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++static int
++init_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_easet_io *req)
++{
++ int err = 0;
++ struct gfs_alloc *al;
++ uint32_t ea_metablks;
++ struct buffer_head *dibh;
++ struct posix_acl *acl = NULL;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++
++ ea_metablks =
++ GFS_EAREQ_IS_STUFFED(req,
++ avail_size) ? 1 : (1 +
++ GFS_EADATA_NUM_PTRS(req->
++ es_data_len,
++ avail_size));
++
++ if (IS_ACCESS_ACL(req->es_name, req->es_name_len)){
++ acl = posix_acl_from_xattr(req->es_data, req->es_data_len);
++ if (IS_ERR(acl)) {
++ err = PTR_ERR(acl);
++ goto out;
++ }
++ }
++
++ al = gfs_alloc_get(ip);
++
++ err = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (err)
++ goto out_alloc;
++
++ al->al_requested_meta = ea_metablks;
++
++ err = gfs_inplace_reserve(ip);
++ if (err)
++ goto out_gunlock_q;
++
++ err = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (err)
++ goto out_ipres;
++
++ err = gfs_get_inode_buffer(ip, &dibh);
++ if (err)
++ goto out_ipres;
++
++ /* Trans may require:
++ A modified dinode, multiple EA metadata blocks, and all blocks for a RG
++ bitmap */
++
++ err =
++ gfs_trans_begin(sdp,
++ 1 + ea_metablks + al->al_rgd->rd_ri.ri_length, 1);
++ if (err)
++ goto out_dibh;
++
++ err = do_init_eattr(sdp, ip, req);
++ if (err)
++ goto out_end_trans;
++
++ if (acl)
++ gfs_acl_set_mode(ip, acl);
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, (dibh)->b_data);
++
++ out_end_trans:
++ gfs_trans_end(sdp);
++
++ out_dibh:
++ brelse(dibh);
++
++ out_ipres:
++ gfs_inplace_release(ip);
++
++ out_gunlock_q:
++ gfs_quota_unlock_m(ip);
++
++ out_alloc:
++ gfs_alloc_put(ip);
++ posix_acl_release(acl);
++
++ out:
++ return err;
++}
++
++/**
++ * alloc_eattr_blk - allocates a new block for extended attributes.
++ * @sdp: A pointer to the superblock
++ * @alloc_ip: A pointer to the inode that has reserved the blocks for
++ * allocation
++ * @ip: A pointer to the inode that's getting extended attributes
++ * @block: the block allocated
++ *
++ * Returns: the buffer head on success, NULL on failure
++ */
++
++static struct buffer_head *
++alloc_eattr_blk(struct gfs_sbd *sdp, struct gfs_inode *alloc_ip,
++ struct gfs_inode *ip, uint64_t * block)
++{
++ int err = 0;
++ struct buffer_head *bh = NULL;
++ struct gfs_ea_header *ea;
++
++ err = gfs_metaalloc(alloc_ip, block);
++ if (err)
++ goto out;
++
++ err =
++ gfs_dread(sdp, *block, ip->i_gl, DIO_NEW | DIO_START | DIO_WAIT, &bh);
++ if (err)
++ goto out;
++
++ gfs_metatype_set(sdp, bh, GFS_METATYPE_EA, GFS_FORMAT_EA);
++
++ ip->i_di.di_blocks++;
++
++ ea = GFS_FIRST_EA(bh);
++ ea->ea_flags = GFS_EAFLAG_LAST;
++ ea->ea_rec_len =
++ cpu_to_gfs32(sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header));
++ ea->ea_num_ptrs = 0;
++ ea->ea_type = GFS_EATYPE_UNUSED;
++
++ gfs_trans_add_bh(ip->i_gl, bh);
++
++ out:
++
++ return bh;
++}
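++
++/*
++ * A freshly allocated ea block, as initialized above, holds a single
++ * unused record spanning everything after the header (assuming a
++ * 4096-byte block and a 24-byte gfs_meta_header):
++ *
++ * [ gfs_meta_header | rec_len = 4072, GFS_EATYPE_UNUSED, LAST ]
++ *
++ * prep_ea() and write_ea() then carve real eas out of that record.
++ */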
++
++/**
++ * list_direct_ea - copy out the names of the eas in one ea block
++ * @sdp: pointer to the superblock
++ * @ip: pointer to the inode
++ * @bh: the buffer holding the ea block
++ * @req: the request information
++ * @copy_fn: the function to use to do the actual copying
++ * @size: running total of the space used by the names
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++static int
++list_direct_ea(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct buffer_head *bh, struct gfs_eaget_io *req,
++ gfs_ea_copy_fn_t copy_fn, uint32_t * size)
++{
++ int err = 0;
++ struct gfs_ea_header *ea;
++ char buf[264]; /* room for "system." + a 255-byte name + NUL */
++ char *ptr;
++
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_EA);
++
++ ea = (struct gfs_ea_header *) ((bh)->b_data +
++ sizeof (struct gfs_meta_header));
++ if (ea->ea_type == GFS_EATYPE_UNUSED) {
++ if (GFS_EA_IS_LAST(ea))
++ goto out;
++ else
++ ea = GFS_EA_NEXT(ea);
++ }
++
++ while (1) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++
++ if (req->eg_data_len) {
++ if (*size > req->eg_data_len) {
++ err = -ERANGE;
++ break;
++ }
++ ptr = buf;
++
++ GFS_ASSERT_INODE(GFS_EATYPE_VALID(ea->ea_type), ip,);
++ if (ea->ea_type == GFS_EATYPE_USR) {
++ memcpy(ptr, "user.", 5);
++ ptr += 5;
++ } else {
++ memcpy(ptr, "system.", 7);
++ ptr += 7;
++ }
++ memcpy(ptr, GFS_EA_NAME(ea), ea->ea_name_len);
++ ptr += ea->ea_name_len;
++ *ptr = 0;
++ err =
++ copy_fn(req->eg_data + *size, buf,
++ GFS_EA_STRLEN(ea));
++ if (err)
++ break;
++ }
++
++ *size = *size + GFS_EA_STRLEN(ea);
++
++ if (GFS_EA_IS_LAST(ea))
++ break;
++ ea = GFS_EA_NEXT(ea);
++ }
++
++ out:
++
++ return err;
++}
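++
++/*
++ * The name list built by list_direct_ea() mirrors the listxattr(2)
++ * format: each name is emitted with its namespace prefix and a
++ * terminating NUL, e.g. (illustrative names only)
++ *
++ * "user.foo\0system.posix_acl_access\0user.bar\0"
++ *
++ * GFS_EA_STRLEN(ea) covers the prefix, name, and NUL, and *size
++ * accumulates it even when only the total length is wanted
++ * (req->eg_data_len == 0).
++ */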
++
++/**
++ * list_ea - list the names of all of an inode's extended attributes
++ * @sdp: pointer to the superblock
++ * @ip: pointer to the inode
++ * @req: the request information
++ * @copy_fn: the function to use to do the actual copying
++ *
++ * Returns: the size of the name list on success, -EXXX on error
++ */
++
++static int
++list_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_eaget_io *req,
++ gfs_ea_copy_fn_t copy_fn)
++{
++ int err;
++ struct buffer_head *bh, *eabh;
++ uint64_t *eablk, *end;
++ uint32_t size = 0;
++
++ err =
++ gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl, DIO_START | DIO_WAIT,
++ &bh);
++ if (err)
++ goto out;
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_IN);
++ eablk =
++ (uint64_t *) ((bh)->b_data + sizeof (struct gfs_indirect));
++ end =
++ eablk +
++ ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++
++ while (eablk < end && *eablk) {
++ err =
++ gfs_dread(sdp, gfs64_to_cpu(*eablk), ip->i_gl,
++ DIO_START | DIO_WAIT, &eabh);
++ if (err)
++ goto out_drelse;
++ err = list_direct_ea(sdp, ip, eabh, req, copy_fn, &size);
++ brelse(eabh);
++ if (err)
++ goto out_drelse;
++ eablk++;
++ }
++ } else {
++ err = list_direct_ea(sdp, ip, bh, req, copy_fn, &size);
++ if (err)
++ goto out_drelse;
++ }
++
++ if (!err)
++ err = size;
++
++ out_drelse:
++ brelse(bh);
++
++ out:
++
++ return err;
++}
++
++/**
++ * gfs_get_eattr - read an extended attribute, or a list of ea names
++ * @sdp: pointer to the superblock
++ * @ip: pointer to the inode for the target file
++ * @req: the request information
++ * @copy_fn: the function to use to do the actual copying
++ *
++ * Returns: actual size of data on success, -EXXX on error
++ */
++int
++gfs_get_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_eaget_io *req, gfs_ea_copy_fn_t copy_fn)
++{
++ struct gfs_holder i_gh;
++ int err;
++
++ if (req->eg_name) {
++ err = gfs_ea_read_permission(req, ip);
++ if (err)
++ goto out;
++ }
++
++ /* This is a read, so the glock is acquired in LM_ST_SHARED; LM_FLAG_ANY allows any compatible cached state. */
++
++ err = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (err)
++ goto out;
++
++ if (ip->i_di.di_eattr == 0) {
++ if (!req->eg_name) {
++ if (!req->eg_data_len && req->eg_len) {
++ uint32_t no_data = 0;
++
++ err =
++ copy_fn(req->eg_len, &no_data,
++ sizeof (uint32_t));
++ }
++ } else
++ err = -ENODATA;
++
++ goto out_gunlock;
++ }
++
++ if (req->eg_name)
++ err = get_ea(sdp, ip, req, copy_fn);
++ else
++ err = list_ea(sdp, ip, req, copy_fn);
++
++ out_gunlock:
++ gfs_glock_dq_uninit(&i_gh);
++
++ out:
++
++ return err;
++}
++
++static int
++do_set_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_easet_io *req,
++ struct gfs_ea_location location)
++{
++ int err = 0;
++ int req_size;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ struct gfs_ea_location space;
++
++ req_size = get_req_size(req, avail_size);
++
++ if (location.ea) {
++ struct gfs_ea_header *new_space;
++ if (req->es_cmd == GFS_EACMD_REMOVE) {
++ remove_ea(ip, location.ea, location.prev);
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++ goto out;
++ }
++ if (can_replace(location.ea, req, avail_size)) {
++ err = replace_ea(sdp, ip, location.ea, req);
++ if (!err)
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++ goto out;
++ }
++ /*
++ * This part is kind of confusing. If the inode has direct EAs,
++ * then adding another EA can't run it out of space, so it is safe to
++ * delete the old EA before looking for space. If the inode has
++ * indirect EAs, there may not be enough space left, so first you
++ * check for space and then you delete the EA.
++ */
++ if ((ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) == 0) {
++ remove_ea(ip, location.ea, location.prev);
++ err = find_space(ip, req_size, req->es_type, &space);
++ if (err)
++ goto out;
++ new_space = prep_ea(space.ea);
++ err = write_ea(sdp, ip, ip, new_space, req);
++ if (!err) {
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++ gfs_trans_add_bh(ip->i_gl, space.bh);
++ }
++ brelse(space.bh);
++ goto out;
++ }
++ if (can_replace_in_block(ip, req_size, location, &new_space)) {
++ remove_ea(ip, location.ea, location.prev);
++ new_space = prep_ea(new_space);
++ err = write_ea(sdp, ip, ip, new_space, req);
++ if (!err)
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++ goto out;
++ }
++ err = find_space(ip, req_size, req->es_type, &space);
++ if (err)
++ /* A non-I/O error can be returned here: if there is no space
++ * left, this returns -ENOSPC. So no buffer may have been added
++ * to the transaction yet.
++ */
++ goto out;
++ remove_ea(ip, location.ea, location.prev);
++ new_space = prep_ea(space.ea);
++ err = write_ea(sdp, ip, ip, new_space, req);
++ if (!err) {
++ gfs_trans_add_bh(ip->i_gl, location.bh);
++ gfs_trans_add_bh(ip->i_gl, space.bh);
++ }
++ brelse(space.bh);
++ goto out;
++ }
++ err = find_space(ip, req_size, req->es_type, &space);
++ if (err)
++ /* you can also get -ENOSPC here */
++ goto out;
++ space.ea = prep_ea(space.ea);
++ err = write_ea(sdp, ip, ip, space.ea, req);
++ if (!err)
++ gfs_trans_add_bh(ip->i_gl, space.bh);
++ brelse(space.bh);
++
++ out:
++ return err;
++}
++
++static int
++set_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_easet_io *req,
++ struct gfs_ea_location location)
++{
++ int err;
++ struct gfs_alloc *al;
++ struct gfs_rgrpd *rgd = NULL;
++ struct buffer_head *dibh;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ int unstuffed_ea_blks = 0;
++ struct gfs_holder ri_gh, rgd_gh;
++ struct posix_acl *acl = NULL;
++
++ if (IS_ACCESS_ACL(req->es_name, req->es_name_len) && req->es_data){
++ acl = posix_acl_from_xattr(req->es_data, req->es_data_len);
++ if (IS_ERR(acl)) {
++ err = PTR_ERR(acl);
++ goto out;
++ }
++ }
++
++ err = gfs_get_inode_buffer(ip, &dibh);
++ if (err)
++ goto out_acl;
++ al = gfs_alloc_get(ip);
++
++ err = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (err)
++ goto out_alloc;
++
++ /*
++ * Worst case: switching from direct to indirect EAs can take up
++ * to three new blocks, plus enough unstuffed data blocks to hold
++ * all of the data.
++ */
++ al->al_requested_meta = 3 + GFS_EADATA_NUM_PTRS(req->es_data_len, avail_size);
++
++ err = gfs_inplace_reserve(ip);
++ if (err)
++ goto out_lock_quota;
++
++ err = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (err)
++ goto out_reserve;
++
++ if (location.ea && GFS_EA_IS_UNSTUFFED(location.ea)) {
++ /*
++ * If there is an EA, we might need to delete it.
++ * Since all unstuffed data blocks are added at the same time,
++ * they are all from the same resource group.
++ */
++ err = gfs_rindex_hold(sdp, &ri_gh);
++ if (err)
++ goto out_reserve;
++ rgd =
++ gfs_blk2rgrpd(sdp,
++ gfs64_to_cpu(*GFS_EA_DATA_PTRS(location.ea)));
++ GFS_ASSERT_INODE(rgd, ip,
++ printk("block = %" PRIu64 "\n",
++ gfs64_to_cpu(*GFS_EA_DATA_PTRS
++ (location.ea)));
++ );
++ err =
++ gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
++ if (err)
++ goto out_rindex;
++ unstuffed_ea_blks = location.ea->ea_num_ptrs;
++ }
++
++ /*
++ * The transaction may require: modifying the dinode block,
++ * modifying the indirect ea block, modifying an ea block, all the
++ * allocation blocks, all the blocks for a RG bitmap, the RG header
++ * block, and a RG block for each unstuffed data block being deleted.
++ */
++ err = gfs_trans_begin(sdp, 4 + al->al_requested_meta +
++ al->al_rgd->rd_ri.ri_length + unstuffed_ea_blks,
++ 1);
++ if (err)
++ goto out_lock_rg;
++
++ err = do_set_ea(sdp, ip, req, location);
++
++ if (!err) {
++ if (acl)
++ gfs_acl_set_mode(ip, acl);
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, (dibh)->b_data);
++ }
++
++ gfs_trans_end(sdp);
++
++ out_lock_rg:
++ if (rgd)
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ out_rindex:
++ if (rgd)
++ gfs_glock_dq_uninit(&ri_gh);
++
++ out_reserve:
++ gfs_inplace_release(ip);
++
++ out_lock_quota:
++ gfs_quota_unlock_m(ip);
++
++ out_alloc:
++ gfs_alloc_put(ip);
++ brelse(dibh);
++
++ out_acl:
++ posix_acl_release(acl);
++
++ out:
++ return err;
++}
++
++/**
++ * gfs_set_eattr - sets (or creates or replaces) an extended attribute
++ * @sdp: pointer to the superblock
++ * @ip: pointer to the inode of the target file
++ * @req: request information
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++int
++gfs_set_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_easet_io *req)
++{
++ struct gfs_holder i_gh;
++ int err;
++ uint32_t req_size;
++ uint32_t avail_size =
++ sdp->sd_sb.sb_bsize - sizeof (struct gfs_meta_header);
++ struct gfs_ea_location location;
++
++ if (!GFS_EACMD_VALID(req->es_cmd)) {
++ err = -EOPNOTSUPP;
++ goto out;
++ }
++
++ if (strlen(req->es_name) == 0) {
++ err = -EINVAL;
++ goto out;
++ }
++
++ err = gfs_ea_write_permission(req, ip);
++ if (err)
++ goto out;
++
++ if ((req_size = get_req_size(req, avail_size)) > avail_size) {
++ /* This can only happen with 512 byte blocks */
++ err = -ERANGE;
++ goto out;
++ }
++ err = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (err)
++ goto out;
++
++ if (ip->i_di.di_eattr == 0) {
++ if (req->es_cmd == GFS_EACMD_REPLACE
++ || req->es_cmd == GFS_EACMD_REMOVE) {
++ err = -ENODATA;
++ goto out_gunlock;
++ }
++ err = init_eattr(sdp, ip, req);
++ goto out_gunlock;
++ }
++
++ err = find_eattr(ip, req->es_name, req->es_name_len, req->es_type,
++ &location);
++ if (err < 0)
++ goto out_gunlock;
++ if (err == 0 && (req->es_cmd == GFS_EACMD_REPLACE ||
++ req->es_cmd == GFS_EACMD_REMOVE)) {
++ err = -ENODATA;
++ goto out_relse;
++ }
++ err = set_ea(sdp, ip, req, location);
++
++ out_relse:
++ if (location.bh)
++ brelse(location.bh);
++
++ out_gunlock:
++ gfs_glock_dq_uninit(&i_gh);
++
++ out:
++ return err;
++}
++
++/**
++ * gfs_set_eattr_ioctl - creates, modifies, or removes an extended attribute.
++ * @sdp: pointer to the superblock
++ * @ip: a pointer to the gfs inode for the file
++ * @arg: a pointer to gfs_set_eattr_io_t struct with the request
++ *
++ * Notes: ioctl wrapper for gfs_set_eattr
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_set_eattr_ioctl(struct gfs_sbd *sdp, struct gfs_inode *ip, void *arg)
++{
++ struct gfs_easet_io req;
++ int err = 0;
++ char *name = NULL;
++ char *data = NULL;
++
++ if (copy_from_user(&req, arg, sizeof (struct gfs_easet_io))) {
++ err = -EFAULT;
++ goto out;
++ }
++
++ name = gmalloc(req.es_name_len);
++
++ if (req.es_data) {
++ data = gmalloc(req.es_data_len);
++
++ if (copy_from_user(data, req.es_data, req.es_data_len)) {
++ err = -EFAULT;
++ goto out_free;
++ }
++ }
++ if (copy_from_user(name, req.es_name, req.es_name_len)) {
++ err = -EFAULT;
++ goto out_free;
++ }
++ req.es_data = data;
++ req.es_name = name;
++ err = gfs_set_eattr(sdp, ip, &req);
++
++ out_free:
++ kfree(name);
++ if (data)
++ kfree(data);
++
++ out:
++ return err;
++}
++
++/**
++ * gfs_get_eattr_ioctl - gets the value for the requested attribute name,
++ * or a list of all the extended attribute names.
++ * @sdp: pointer to the superblock
++ * @ip: a pointer to the inode for the file
++ * @arg: a pointer to the struct gfs_eaget_io struct holding the request
++ *
++ * Notes: ioctl wrapper for the gfs_get_eattr function
++ * Returns: 0 on success, -EXXX on error.
++ */
++
++int
++gfs_get_eattr_ioctl(struct gfs_sbd *sdp, struct gfs_inode *ip, void *arg)
++{
++ struct gfs_eaget_io req;
++ int result = 0;
++ char *name = NULL;
++ uint32_t size;
++
++ if (copy_from_user(&req, arg, sizeof (struct gfs_eaget_io))) {
++ result = -EFAULT;
++ goto out;
++ }
++
++ if (req.eg_name) {
++ name = gmalloc(req.eg_name_len);
++
++ if (copy_from_user(name, req.eg_name, req.eg_name_len)) {
++ result = -EFAULT;
++ goto out_free;
++ }
++ req.eg_name = name;
++ }
++ result = gfs_get_eattr(sdp, ip, &req, gfs_ea_copy_to_user);
++
++ out_free:
++ if (name)
++ kfree(name);
++
++ if (result >= 0) {
++ size = result;
++ result =
++ gfs_ea_copy_to_user(req.eg_len, &size, sizeof(uint32_t));
++ }
++
++ out:
++
++ return result;
++}
++
++/**
++ * gfs_get_direct_eattr_meta - copy one ea block and its data blocks to a user buffer
++ * @ip: the inode
++ * @ub: the structure representing the user buffer to copy to
++ * @blk: the ea block to copy
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_get_direct_eattr_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub,
++ uint64_t blk)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *databh, *bh;
++ struct gfs_ea_header *ea;
++ uint64_t *datablk;
++ unsigned int i;
++ int error;
++
++ error = gfs_dread(sdp, blk, ip->i_gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto out;
++
++ error = gfs_add_bh_to_ub(ub, bh);
++
++ ea = (struct gfs_ea_header *) ((bh)->b_data +
++ sizeof (struct gfs_meta_header));
++ for (;;) {
++ GFS_ASSERT_INODE(GFS_EA_REC_LEN(ea), ip,);
++
++ datablk = GFS_EA_DATA_PTRS(ea);
++
++ for (i = 0; i < ea->ea_num_ptrs; i++) {
++ error =
++ gfs_dread(sdp, gfs64_to_cpu(*datablk), ip->i_gl,
++ DIO_START | DIO_WAIT, &databh);
++ if (error)
++ goto out_relse;
++
++ error = gfs_add_bh_to_ub(ub, databh);
++
++ brelse(databh);
++
++ if (error)
++ goto out_relse;
++
++ datablk++;
++ }
++
++ if (GFS_EA_IS_LAST(ea))
++ break;
++ ea = GFS_EA_NEXT(ea);
++ }
++
++ out_relse:
++ brelse(bh);
++
++ out:
++
++ return error;
++}
++
++/**
++ * gfs_get_eattr_meta - return all the eattr blocks of a file
++ * @ip: the inode
++ * @ub: the structure representing the user buffer to copy to
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_get_eattr_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ int error;
++ uint64_t *eablk, *end;
++
++ if (ip->i_di.di_flags & GFS_DIF_EA_INDIRECT) {
++ error =
++ gfs_dread(sdp, ip->i_di.di_eattr, ip->i_gl,
++ DIO_WAIT | DIO_START, &bh);
++ if (error)
++ goto out;
++
++ error = gfs_add_bh_to_ub(ub, bh);
++
++ eablk =
++ (uint64_t *) ((bh)->b_data + sizeof (struct gfs_indirect));
++ end =
++ eablk +
++ ((sdp->sd_sb.sb_bsize - sizeof (struct gfs_indirect)) / 8);
++
++ while (eablk < end && *eablk) {
++ error =
++ gfs_get_direct_eattr_meta(ip, ub,
++ gfs64_to_cpu(*eablk));
++ if (error) {
++ brelse(bh);
++ goto out;
++ }
++ eablk++;
++ }
++ brelse(bh);
++ } else
++ error = gfs_get_direct_eattr_meta(ip, ub, ip->i_di.di_eattr);
++
++ out:
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/eattr.h linux-patched/fs/gfs/eattr.h
+--- linux-orig/fs/gfs/eattr.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/eattr.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,90 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __EATTR_DOT_H__
++#define __EATTR_DOT_H__
++
++#define GFS_EA_MAY_WRITE 1
++#define GFS_EA_MAY_READ 2
++
++#define GFS_EA_DATA_LEN(x) gfs32_to_cpu((x)->ea_data_len)
++#define GFS_EA_IS_UNSTUFFED(x) ((x)->ea_num_ptrs)
++#define GFS_EA_DATA(x) ((char *)(x) + sizeof(struct gfs_ea_header) + (x)->ea_name_len)
++
++struct gfs_ea_location {
++ struct buffer_head *bh;
++ struct gfs_ea_header *ea;
++ struct gfs_ea_header *prev;
++};
++
++#define GFS_POSIX_ACL_ACCESS "posix_acl_access"
++#define GFS_POSIX_ACL_ACCESS_LEN 16
++#define GFS_POSIX_ACL_DEFAULT "posix_acl_default"
++#define GFS_POSIX_ACL_DEFAULT_LEN 17
++
++#define IS_ACCESS_ACL(name, len) \
++ ((len) == GFS_POSIX_ACL_ACCESS_LEN && \
++ !memcmp(GFS_POSIX_ACL_ACCESS, (name), (len)))
++
++#define IS_DEFAULT_ACL(name, len) \
++ ((len) == GFS_POSIX_ACL_DEFAULT_LEN && \
++ !memcmp(GFS_POSIX_ACL_DEFAULT, (name), (len)))
++
++#define GFS_MAX_EA_ACL_BLKS 66 /* 65 for unstuffed data blocks, 1 for the ea
++ itself */
++
++typedef int (*gfs_ea_copy_fn_t) (void *dest, void *src, unsigned long size);
++
++int gfs_ea_memcpy(void *dest, void *src, unsigned long size);
++int gfs_ea_copy_to_user(void *dest, void *src, unsigned long size);
++
++int find_sys_space(struct gfs_inode *alloc_ip, struct gfs_inode *ip, int size,
++ struct gfs_ea_location *avail);
++
++struct gfs_ea_header *prep_ea(struct gfs_ea_header *ea);
++
++int write_ea(struct gfs_sbd *sdp, struct gfs_inode *alloc_ip,
++ struct gfs_inode *ip, struct gfs_ea_header *ea,
++ struct gfs_easet_io *req);
++
++int gfs_get_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_eaget_io *req, gfs_ea_copy_fn_t copy_fn);
++int gfs_set_eattr(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_easet_io *req);
++
++int gfs_set_eattr_ioctl(struct gfs_sbd *sdp, struct gfs_inode *ip, void *arg);
++int gfs_get_eattr_ioctl(struct gfs_sbd *sdp, struct gfs_inode *ip, void *arg);
++
++int gfs_ea_dealloc(struct gfs_inode *ip);
++
++int gfs_get_eattr_meta(struct gfs_inode *ip, struct gfs_user_buffer *ub);
++
++int replace_ea(struct gfs_sbd *sdp, struct gfs_inode *ip,
++ struct gfs_ea_header *ea, struct gfs_easet_io *req);
++
++int find_eattr(struct gfs_inode *ip, char *name, int name_len, int type,
++ struct gfs_ea_location *location);
++
++int read_unstuffed(void *dest, struct gfs_inode *ip, struct gfs_sbd *sdp,
++ struct gfs_ea_header *ea, uint32_t avail_size,
++ gfs_ea_copy_fn_t copy_fn);
++
++int get_ea(struct gfs_sbd *sdp, struct gfs_inode *ip, struct gfs_eaget_io *req,
++ gfs_ea_copy_fn_t copy_fn);
++
++int init_new_inode_eattr(struct gfs_inode *dip, struct gfs_inode *ip,
++ struct gfs_easet_io *req);
++
++int gfs_ea_read_permission(struct gfs_eaget_io *req, struct gfs_inode *ip);
++
++#endif /* __EATTR_DOT_H__ */
+diff -urN linux-orig/fs/gfs/file.c linux-patched/fs/gfs/file.c
+--- linux-orig/fs/gfs/file.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/file.c 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,382 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <asm/uaccess.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "dio.h"
++#include "file.h"
++#include "inode.h"
++#include "trans.h"
++
++/**
++ * gfs_copy2mem - Trivial copy function for gfs_readi()
++ * @bh: The buffer to copy from, or NULL meaning zero the buffer
++ * @buf: The buffer to copy/zero
++ * @offset: The offset in the buffer to copy from
++ * @size: The amount of data to copy/zero
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_copy2mem(struct buffer_head *bh, void **buf, unsigned int offset,
++ unsigned int size)
++{
++ char **p = (char **)buf;
++
++ if (bh)
++ memcpy(*p, bh->b_data + offset, size);
++ else
++ memset(*p, 0, size);
++
++ *p += size;
++
++ return 0;
++}
++
++/**
++ * gfs_copy2user - Copy data to user space
++ * @bh: The buffer
++ * @buf: The destination of the data
++ * @offset: The offset into the buffer
++ * @size: The amount of data to copy
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_copy2user(struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size)
++{
++ char **p = (char **)buf;
++ int error;
++
++ if (bh)
++ error = copy_to_user(*p, bh->b_data + offset, size);
++ else
++ error = clear_user(*p, size);
++
++ if (error)
++ error = -EFAULT;
++ else
++ *p += size;
++
++ return error;
++}
++
++/**
++ * gfs_readi - Read a file
++ * @ip: The GFS Inode
++ * @buf: The buffer to place result into
++ * @offset: File offset to begin reading from
++ * @size: Amount of data to transfer
++ * @copy_fn: Function to actually perform the copy
++ *
++ * The @copy_fn only copies a maximum of a single block at once so
++ * we are safe calling it with int arguments. It is done so that
++ * we don't needlessly put 64bit arguments on the stack and it
++ * also makes the code in the @copy_fn nicer too.
++ *
++ * Returns: The amount of data actually copied or the error
++ */
++
++int
++gfs_readi(struct gfs_inode *ip, void *buf,
++ uint64_t offset, unsigned int size,
++ read_copy_fn_t copy_fn)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *bh;
++ uint64_t lblock, dblock;
++ unsigned int o;
++ uint32_t extlen = 0;
++ unsigned int amount;
++ int not_new = 0;
++ int journaled = gfs_is_jdata(ip);
++ int copied = 0;
++ int error = 0;
++
++ if (offset >= ip->i_di.di_size)
++ return 0;
++
++ if ((offset + size) > ip->i_di.di_size)
++ size = ip->i_di.di_size - offset;
++
++ if (!size)
++ return 0;
++
++ if (journaled) {
++ lblock = offset;
++ o = do_div(lblock, sdp->sd_jbsize);
++ } else {
++ lblock = offset >> sdp->sd_sb.sb_bsize_shift;
++ o = offset & (sdp->sd_sb.sb_bsize - 1);
++ }
++
++ if (gfs_is_stuffed(ip))
++ o += sizeof(struct gfs_dinode);
++ else if (journaled)
++ o += sizeof(struct gfs_meta_header);
++
++ while (copied < size) {
++ amount = size - copied;
++ if (amount > sdp->sd_sb.sb_bsize - o)
++ amount = sdp->sd_sb.sb_bsize - o;
++
++ if (!extlen) {
++ error = gfs_block_map(ip, lblock, ¬_new,
++ &dblock, &extlen);
++ if (error)
++ goto fail;
++ }
++
++ if (extlen > 1)
++ gfs_start_ra(ip->i_gl, dblock, extlen);
++
++ if (dblock) {
++ error = gfs_get_data_buffer(ip, dblock, not_new, &bh);
++ if (error)
++ goto fail;
++
++ dblock++;
++ extlen--;
++ } else
++ bh = NULL;
++
++ error = copy_fn(bh, &buf, o, amount);
++ if (bh)
++ brelse(bh);
++ if (error)
++ goto fail;
++
++ copied += amount;
++ lblock++;
++
++ o = (journaled) ? sizeof(struct gfs_meta_header) : 0;
++ }
++
++ return copied;
++
++ fail:
++ return (copied) ? copied : error;
++}
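++
++/*
++ * Offset arithmetic sketch for the non-journaled case above, assuming
++ * a 4096-byte block: offset 10000 maps to lblock = 10000 >> 12 = 2
++ * and o = 10000 & 4095 = 1808, so the first pass copies at most
++ * 4096 - 1808 bytes. Journaled files divide by sd_jbsize instead,
++ * since each of their data blocks loses a gfs_meta_header.
++ */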
++
++/**
++ * gfs_copy_from_mem - Trivial copy function for gfs_writei()
++ * @ip: The file to write to
++ * @bh: The buffer to copy to or clear
++ * @buf: The buffer to copy from
++ * @offset: The offset in the buffer to write to
++ * @size: The amount of data to write
++ * @new: Flag indicating that remaining space in the buffer should be zeroed
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_copy_from_mem(struct gfs_inode *ip, struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size, int new)
++{
++ char **p = (char **)buf;
++ int error = 0;
++
++ if (bh->b_blocknr == ip->i_num.no_addr) {
++ GFS_ASSERT_INODE(!new, ip,);
++ gfs_trans_add_bh(ip->i_gl, bh);
++ memcpy(bh->b_data + offset, *p, size);
++ } else if (gfs_is_jdata(ip)) {
++ gfs_trans_add_bh(ip->i_gl, bh);
++ memcpy(bh->b_data + offset, *p, size);
++ if (new)
++ gfs_buffer_clear_ends(bh, offset, size, TRUE);
++ } else {
++ memcpy(bh->b_data + offset, *p, size);
++ if (new)
++ gfs_buffer_clear_ends(bh, offset, size, FALSE);
++ error = gfs_dwrite(ip->i_sbd, bh, DIO_DIRTY);
++ }
++
++ if (!error)
++ *p += size;
++
++ return error;
++}
++
++/**
++ * gfs_copy_from_user - Copy bytes from user space for gfs_writei()
++ * @ip: The file to write to
++ * @bh: The buffer to copy to or clear
++ * @buf: The buffer to copy from
++ * @offset: The offset in the buffer to write to
++ * @size: The amount of data to write
++ * @new: Flag indicating that remaining space in the buffer should be zeroed
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_copy_from_user(struct gfs_inode *ip, struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size, int new)
++{
++ char **p = (char **)buf;
++ int error = 0;
++
++ if (bh->b_blocknr == ip->i_num.no_addr) {
++ GFS_ASSERT_INODE(!new, ip,);
++ gfs_trans_add_bh(ip->i_gl, bh);
++ if (copy_from_user(bh->b_data + offset, *p, size))
++ error = -EFAULT;
++ } else if (gfs_is_jdata(ip)) {
++ gfs_trans_add_bh(ip->i_gl, bh);
++ if (copy_from_user(bh->b_data + offset, *p, size))
++ error = -EFAULT;
++ if (new) {
++ gfs_buffer_clear_ends(bh, offset, size, TRUE);
++ if (error)
++ memset(bh->b_data + offset, 0, size);
++ }
++ } else {
++ if (copy_from_user(bh->b_data + offset, *p, size))
++ error = -EFAULT;
++ if (error) {
++ if (new)
++ gfs_buffer_clear(bh);
++ gfs_dwrite(ip->i_sbd, bh, DIO_DIRTY);
++ } else {
++ if (new)
++ gfs_buffer_clear_ends(bh, offset, size, FALSE);
++ error = gfs_dwrite(ip->i_sbd, bh, DIO_DIRTY);
++ }
++ }
++
++ if (!error)
++ *p += size;
++
++ return error;
++}
++
++/**
++ * gfs_writei - Write bytes to a file
++ * @ip: The GFS inode
++ * @buf: The buffer containing information to be written
++ * @offset: The file offset to start writing at
++ * @size: The amount of data to write
++ * @copy_fn: Function to do the actual copying
++ *
++ * Returns: The number of bytes correctly written or error code
++ */
++
++int
++gfs_writei(struct gfs_inode *ip, void *buf,
++ uint64_t offset, unsigned int size,
++ write_copy_fn_t copy_fn)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct buffer_head *dibh, *bh;
++ uint64_t lblock, dblock;
++ unsigned int o;
++ uint32_t extlen = 0;
++ unsigned int amount;
++ int new;
++ int journaled = gfs_is_jdata(ip);
++ const uint64_t start = offset;
++ int copied = 0;
++ int error = 0;
++
++ if (!size)
++ return 0;
++
++ if (gfs_is_stuffed(ip) &&
++ ((start + size) > (sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode)))) {
++ error = gfs_unstuff_dinode(ip, gfs_unstuffer_async, NULL);
++ if (error)
++ return error;
++ }
++
++ if (journaled) {
++ lblock = offset;
++ o = do_div(lblock, sdp->sd_jbsize);
++ } else {
++ lblock = offset >> sdp->sd_sb.sb_bsize_shift;
++ o = offset & (sdp->sd_sb.sb_bsize - 1);
++ }
++
++ if (gfs_is_stuffed(ip))
++ o += sizeof(struct gfs_dinode);
++ else if (journaled)
++ o += sizeof(struct gfs_meta_header);
++
++ while (copied < size) {
++ amount = size - copied;
++ if (amount > sdp->sd_sb.sb_bsize - o)
++ amount = sdp->sd_sb.sb_bsize - o;
++
++ if (!extlen) {
++ new = TRUE;
++ error = gfs_block_map(ip, lblock, &new, &dblock, &extlen);
++ if (error)
++ goto fail;
++ GFS_ASSERT_INODE(dblock, ip,);
++ }
++
++ if (journaled && extlen > 1)
++ gfs_start_ra(ip->i_gl, dblock, extlen);
++
++ error = gfs_get_data_buffer(ip, dblock,
++ (amount == sdp->sd_sb.sb_bsize) ? TRUE : new,
++ &bh);
++ if (error)
++ goto fail;
++
++ error = copy_fn(ip, bh, &buf, o, amount, new);
++ brelse(bh);
++ if (error)
++ goto fail;
++
++ copied += amount;
++ lblock++;
++ dblock++;
++ extlen--;
++
++ o = (journaled) ? sizeof(struct gfs_meta_header) : 0;
++ }
++
++ out:
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ if (ip->i_di.di_size < start + copied)
++ ip->i_di.di_size = start + copied;
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return copied;
++
++ fail:
++ if (copied)
++ goto out;
++ return error;
++}
+diff -urN linux-orig/fs/gfs/file.h linux-patched/fs/gfs/file.h
+--- linux-orig/fs/gfs/file.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/file.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,51 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __FILE_DOT_H__
++#define __FILE_DOT_H__
++
++typedef int (*read_copy_fn_t) (struct buffer_head * bh, void **buf,
++ unsigned int offset, unsigned int size);
++typedef int (*write_copy_fn_t) (struct gfs_inode * ip, struct buffer_head * bh,
++ void **buf, unsigned int offset,
++ unsigned int size, int new);
++
++int gfs_copy2mem(struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size);
++int gfs_copy2user(struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size);
++int gfs_readi(struct gfs_inode *ip, void *buf, uint64_t offset,
++ unsigned int size, read_copy_fn_t copy_fn);
++
++int gfs_copy_from_mem(struct gfs_inode *ip, struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size, int new);
++int gfs_copy_from_user(struct gfs_inode *ip, struct buffer_head *bh, void **buf,
++ unsigned int offset, unsigned int size, int new);
++int gfs_writei(struct gfs_inode *ip, void *buf, uint64_t offset,
++ unsigned int size, write_copy_fn_t copy_fn);
++
++static __inline__ int
++gfs_internal_read(struct gfs_inode *ip, char *buf, uint64_t offset,
++ unsigned int size)
++{
++ return gfs_readi(ip, buf, offset, size, gfs_copy2mem);
++}
++
++static __inline__ int
++gfs_internal_write(struct gfs_inode *ip, char *buf, uint64_t offset,
++ unsigned int size)
++{
++ return gfs_writei(ip, buf, offset, size, gfs_copy_from_mem);
++}
++
++#endif /* __FILE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/fixed_div64.h linux-patched/fs/gfs/fixed_div64.h
+--- linux-orig/fs/gfs/fixed_div64.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/fixed_div64.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,142 @@
++/*
++ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of version 2 of the GNU General Public License as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it would be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
++ *
++ * Further, this software is distributed without any warranty that it is
++ * free of the rightful claim of any third person regarding infringement
++ * or the like. Any license provided herein, whether implied or
++ * otherwise, applies only to this software file. Patent licenses, if
++ * any, provided herein do not apply to combinations of this program with
++ * other software, or any other product whatsoever.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with this program; if not, write the Free Software Foundation, Inc., 59
++ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
++ *
++ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
++ * Mountain View, CA 94043, or:
++ *
++ * http://www.sgi.com
++ *
++ * For further information regarding this notice, see:
++ *
++ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
++ *
++ * Additional munging:
++ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++ */
++
++#ifndef __FIXED_DIV64_DOT_H__
++#define __FIXED_DIV64_DOT_H__
++
++#include <asm/div64.h>
++
++#if defined __i386__
++/* For ia32 we need to pull some tricks to get past various versions
++ * of the compiler which do not like us using do_div in the middle
++ * of large functions.
++ */
++static inline __u32 fixed_div64_do_div(void *a, __u32 b, int n)
++{
++ __u32 mod;
++
++ switch (n) {
++ case 4:
++ mod = *(__u32 *)a % b;
++ *(__u32 *)a = *(__u32 *)a / b;
++ return mod;
++ case 8:
++ {
++ unsigned long __upper, __low, __high, __mod;
++ __u64 c = *(__u64 *)a;
++ __upper = __high = c >> 32;
++ __low = c;
++ if (__high) {
++ __upper = __high % (b);
++ __high = __high / (b);
++ }
++ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
++ asm("":"=A" (c):"a" (__low),"d" (__high));
++ *(__u64 *)a = c;
++ return __mod;
++ }
++ }
++
++ /* NOTREACHED */
++ return 0;
++}
++
++/* Side effect free 64 bit mod operation */
++static inline __u32 fixed_div64_do_mod(void *a, __u32 b, int n)
++{
++ switch (n) {
++ case 4:
++ return *(__u32 *)a % b;
++ case 8:
++ {
++ unsigned long __upper, __low, __high, __mod;
++ __u64 c = *(__u64 *)a;
++ __upper = __high = c >> 32;
++ __low = c;
++ if (__high) {
++ __upper = __high % (b);
++ __high = __high / (b);
++ }
++ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
++ asm("":"=A" (c):"a" (__low),"d" (__high));
++ return __mod;
++ }
++ }
++
++ /* NOTREACHED */
++ return 0;
++}
++#else
++static inline __u32 fixed_div64_do_div(void *a, __u32 b, int n)
++{
++ __u32 mod;
++
++ switch (n) {
++ case 4:
++ mod = *(__u32 *)a % b;
++ *(__u32 *)a = *(__u32 *)a / b;
++ return mod;
++ case 8:
++ mod = do_div(*(__u64 *)a, b);
++ return mod;
++ }
++
++ /* NOTREACHED */
++ return 0;
++}
++
++/* Side effect free 64 bit mod operation */
++static inline __u32 fixed_div64_do_mod(void *a, __u32 b, int n)
++{
++ switch (n) {
++ case 4:
++ return *(__u32 *)a % b;
++ case 8:
++ {
++ __u64 c = *(__u64 *)a;
++ return do_div(c, b);
++ }
++ }
++
++ /* NOTREACHED */
++ return 0;
++}
++#endif
++
++#undef do_div
++#define do_div(a, b) fixed_div64_do_div(&(a), (b), sizeof(a))
++#define do_mod(a, b) fixed_div64_do_mod(&(a), (b), sizeof(a))
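++
++/*
++ * Usage sketch (illustrative): both macros take the dividend by name,
++ * and only do_div() modifies it in place:
++ *
++ * uint64_t bytes = 10000;
++ * uint32_t rem = do_div(bytes, 4096); (bytes == 2, rem == 1808)
++ * uint32_t mod = do_mod(offset, bsize); (offset is unchanged)
++ */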
++
++#endif /* __FIXED_DIV64_DOT_H__ */
+diff -urN linux-orig/fs/gfs/flock.c linux-patched/fs/gfs/flock.c
+--- linux-orig/fs/gfs/flock.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/flock.c 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,98 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "flock.h"
++#include "glock.h"
++#include "glops.h"
++
++/**
++ * gfs_flock - Acquire a flock on a file
++ * @fp: the file
++ * @ex: exclusive lock
++ * @wait: wait for lock
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_flock(struct gfs_file *fp, int ex, int wait)
++{
++ struct gfs_holder *fl_gh = &fp->f_fl_gh;
++ struct gfs_inode *ip = fp->f_inode;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_glock *gl;
++ int error = 0;
++
++ down(&fp->f_fl_lock);
++
++ if (fl_gh->gh_gl) {
++ gfs_glock_dq_uninit(fl_gh);
++ error = -EDEADLK;
++ goto out;
++ }
++
++ error = gfs_glock_get(sdp,
++ ip->i_num.no_formal_ino, &gfs_flock_glops,
++ CREATE, &gl);
++ if (error)
++ goto out;
++
++ gfs_holder_init(gl, (ex) ? LM_ST_EXCLUSIVE : LM_ST_SHARED,
++ ((wait) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE,
++ fl_gh);
++ fl_gh->gh_owner = NULL;
++
++ gfs_glock_put(gl);
++
++ error = gfs_glock_nq(fl_gh);
++ if (error) {
++ gfs_holder_uninit(fl_gh);
++ if (error == GLR_TRYFAILED) {
++ GFS_ASSERT_INODE(!wait, ip,);
++ error = -EAGAIN;
++ }
++ }
++
++ out:
++ up(&fp->f_fl_lock);
++
++ return error;
++}
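++
++/*
++ * Sketch of the mapping above (an illustration of existing behavior):
++ * flock(fd, LOCK_EX) takes the glock in LM_ST_EXCLUSIVE,
++ * flock(fd, LOCK_SH) in LM_ST_SHARED, and LOCK_NB clears @wait so a
++ * contended request fails with -EAGAIN via GLR_TRYFAILED. A second
++ * flock on an already-flocked file drops the first and returns
++ * -EDEADLK.
++ */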
++
++/**
++ * gfs_funlock - Release a flock on a file
++ * @fp: the file
++ *
++ * Returns: 0
++ */
++
++int
++gfs_funlock(struct gfs_file *fp)
++{
++ struct gfs_holder *fl_gh = &fp->f_fl_gh;
++
++ down(&fp->f_fl_lock);
++ if (fl_gh->gh_gl)
++ gfs_glock_dq_uninit(fl_gh);
++ up(&fp->f_fl_lock);
++
++ return 0;
++}
+diff -urN linux-orig/fs/gfs/flock.h linux-patched/fs/gfs/flock.h
+--- linux-orig/fs/gfs/flock.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/flock.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __FLOCK_DOT_H__
++#define __FLOCK_DOT_H__
++
++int gfs_flock(struct gfs_file *fp, int ex, int wait);
++int gfs_funlock(struct gfs_file *fp);
++
++#endif /* __FLOCK_DOT_H__ */
+diff -urN linux-orig/fs/gfs/format.h linux-patched/fs/gfs/format.h
+--- linux-orig/fs/gfs/format.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/format.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,30 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __FORMAT_DOT_H__
++#define __FORMAT_DOT_H__
++
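++/*
++ * Zero-terminated tables of older on-disk format version numbers.
++ * These presumably let the mount code recognize filesystems written
++ * by earlier releases when checking the superblock's format fields.
++ */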
++static const uint32_t gfs_old_fs_formats[] = {
++ 1308,
++ 1307,
++ 1306,
++ 1305,
++ 0
++};
++
++static const uint32_t gfs_old_multihost_formats[] = {
++ 1400,
++ 0
++};
++
++#endif /* __FORMAT_DOT_H__ */
+diff -urN linux-orig/fs/gfs/gfs.h linux-patched/fs/gfs/gfs.h
+--- linux-orig/fs/gfs/gfs.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/gfs.h 2004-06-20 22:48:17.948946686 -0500
+@@ -0,0 +1,130 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __GFS_DOT_H__
++#define __GFS_DOT_H__
++
++#define GFS_RELEASE_NAME "<CVS>"
++
++#include <linux/lm_interface.h>
++#include <linux/gfs_ondisk.h>
++#include <linux/gfs_ioctl.h>
++
++#include "fixed_div64.h"
++#include "lvb.h"
++#include "incore.h"
++#include "util.h"
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#define NO_CREATE (0)
++#define CREATE (1)
++
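++/*
++ * printf()/scanf() conversion specifiers for 64-bit quantities, in the
++ * style of C99 <inttypes.h>, which the kernel does not provide.
++ */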
++#if (BITS_PER_LONG == 64)
++#define PRIu64 "lu"
++#define PRId64 "ld"
++#define PRIo64 "lo"
++#define PRIx64 "lx"
++#define PRIX64 "lX"
++#define SCNu64 "lu"
++#define SCNd64 "ld"
++#define SCNo64 "lo"
++#define SCNx64 "lx"
++#define SCNX64 "lX"
++#else
++#define PRIu64 "Lu"
++#define PRId64 "Ld"
++#define PRIo64 "Lo"
++#define PRIx64 "Lx"
++#define PRIX64 "LX"
++#define SCNu64 "Lu"
++#define SCNd64 "Ld"
++#define SCNo64 "Lo"
++#define SCNx64 "Lx"
++#define SCNX64 "LX"
++#endif
++
++/* Divide x by y. Round up if there is a remainder. */
++#define DIV_RU(x, y) (((x) + (y) - 1) / (y))
++
++#define GFS_FAST_NAME_SIZE (8)
++
++#define vfs2sdp(sb) ((struct gfs_sbd *)(sb)->s_fs_info)
++#define vn2ip(inode) ((struct gfs_inode *)(inode)->u.generic_ip)
++#define vf2fp(file) ((struct gfs_file *)(file)->private_data)
++#define bh2bd(bh) ((struct gfs_bufdata *)(bh)->b_private)
++#define current_transaction ((struct gfs_trans *)(current->journal_info))
++
++#define gl2ip(gl) ((struct gfs_inode *)(gl)->gl_object)
++#define gl2rgd(gl) ((struct gfs_rgrpd *)(gl)->gl_object)
++#define gl2gl(gl) ((struct gfs_glock *)(gl)->gl_object)
++
++#define gfs_meta_check(sdp, bh) \
++do \
++{ \
++ uint32_t meta_check_magic = ((struct gfs_meta_header *)(bh)->b_data)->mh_magic; \
++ meta_check_magic = gfs32_to_cpu(meta_check_magic); \
++ GFS_ASSERT_SBD(meta_check_magic == GFS_MAGIC, (sdp), \
++ struct gfs_meta_header meta_check_mh; \
++ printk("Bad metadata at %"PRIu64"\n", (uint64_t)(bh)->b_blocknr); \
++ gfs_meta_header_in(&meta_check_mh, (bh)->b_data); \
++ gfs_meta_header_print(&meta_check_mh);); \
++} \
++while (0)
++
++#define gfs_metatype_check(sdp, bh, type) \
++do \
++{ \
++ uint32_t metatype_check_magic = ((struct gfs_meta_header *)(bh)->b_data)->mh_magic; \
++ uint32_t metatype_check_type = ((struct gfs_meta_header *)(bh)->b_data)->mh_type; \
++ metatype_check_magic = gfs32_to_cpu(metatype_check_magic); \
++ metatype_check_type = gfs32_to_cpu(metatype_check_type); \
++ GFS_ASSERT_SBD(metatype_check_magic == GFS_MAGIC && \
++ metatype_check_type == (type), (sdp), \
++ struct gfs_meta_header metatype_check_mh; \
++ printk("Bad metadata at %"PRIu64", should be %u\n", (uint64_t)(bh)->b_blocknr, (type)); \
++ gfs_meta_header_in(&metatype_check_mh, (bh)->b_data); \
++ gfs_meta_header_print(&metatype_check_mh);); \
++} \
++while (0)
++
++#define gfs_metatype_set(sdp, bh, type, format) \
++do \
++{ \
++ gfs_meta_check((sdp), (bh)); \
++ ((struct gfs_meta_header *)(bh)->b_data)->mh_type = cpu_to_gfs32((type)); \
++ ((struct gfs_meta_header *)(bh)->b_data)->mh_format = cpu_to_gfs32((format)); \
++} \
++while (0)
++
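++/*
++ * Append formatted output to a buffer, or printk() it if there is no
++ * buffer. Relies on local variables (buf, size, count, error) and an
++ * "out:" label in the calling function; see dump_holder() in glock.c.
++ */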
++#define gfs_sprintf(fmt, args...) \
++do { \
++ if (buf) { \
++ if (*count + 256 > size) { \
++ error = -ENOMEM; \
++ goto out; \
++ } \
++ *count += snprintf(buf + *count, 256, fmt, ##args); \
++ } \
++ else \
++ printk(fmt, ##args); \
++} \
++while (0)
++
++#endif /* __GFS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/glock.c linux-patched/fs/gfs/glock.c
+--- linux-orig/fs/gfs/glock.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/glock.c 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,2524 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <asm/uaccess.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "lops.h"
++#include "quota.h"
++#include "recovery.h"
++
++/* Must be kept in sync with the beginning of struct gfs_glock */
++struct glock_plug {
++ struct list_head gl_list;
++ unsigned long gl_flags;
++};
++
++typedef void (*glock_examiner) (struct gfs_glock * gl);
++
++/**
++ * relaxed_state_ok - is a requested lock compatible with the current lock mode?
++ * @actual: the current state of the lock
++ * @requested: the lock state that was requested by the caller
++ * @flags: the modifier flags passed in by the caller
++ *
++ * Returns: TRUE if the locks are compatible, FALSE otherwise
++ */
++
++static __inline__ int
++relaxed_state_ok(unsigned int actual, unsigned requested, int flags)
++{
++ if (actual == requested)
++ return TRUE;
++
++ if (flags & GL_EXACT)
++ return FALSE;
++
++ if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
++ return TRUE;
++
++ if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
++ return TRUE;
++
++ return FALSE;
++}
++
++/**
++ * gl_hash() - Turn glock number into hash bucket number
++ * @name: The lock name
++ *
++ * Returns: The number of the corresponding hash bucket
++ */
++
++static unsigned int
++gl_hash(struct lm_lockname *name)
++{
++ unsigned int h;
++
++ h = gfs_hash(&name->ln_number, sizeof(uint64_t));
++ h = gfs_hash_more(&name->ln_type, sizeof(unsigned int), h);
++ h &= GFS_GL_HASH_MASK;
++
++ return h;
++}
++
++/**
++ * glock_hold() - increment reference count on glock
++ * @gl: The glock to hold
++ *
++ */
++
++static __inline__ void
++glock_hold(struct gfs_glock *gl)
++{
++ atomic_inc(&gl->gl_count);
++}
++
++/**
++ * glock_put() - Decrement reference count on glock
++ * @gl: The glock to put
++ *
++ */
++
++static __inline__ void
++glock_put(struct gfs_glock *gl)
++{
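++ /* When the last reference is about to go away, put the glock on the
++ reclaim list (which takes a reference of its own) so it can be
++ disposed of later. */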
++ if (atomic_read(&gl->gl_count) == 1)
++ gfs_glock_schedule_for_reclaim(gl);
++ atomic_dec(&gl->gl_count);
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_count) >= 0, gl,);
++}
++
++/**
++ * queue_empty - check to see if a glock's queue is empty
++ * @gl: the glock
++ * @head: the head of the queue to check
++ *
++ * Returns: TRUE if the queue is empty
++ */
++
++static __inline__ int
++queue_empty(struct gfs_glock *gl, struct list_head *head)
++{
++ int empty;
++ spin_lock(&gl->gl_spin);
++ empty = list_empty(head);
++ spin_unlock(&gl->gl_spin);
++ return empty;
++}
++
++/**
++ * search_bucket() - Find struct gfs_glock by lock number
++ * @bucket: the bucket to search
++ * @name: The lock name
++ *
++ * Returns: NULL, or the struct gfs_glock with the requested number
++ */
++
++static struct gfs_glock *
++search_bucket(struct gfs_gl_hash_bucket *bucket, struct lm_lockname *name)
++{
++ struct list_head *tmp, *head;
++ struct gfs_glock *gl;
++
++ for (head = &bucket->hb_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gl = list_entry(tmp, struct gfs_glock, gl_list);
++
++ if (test_bit(GLF_PLUG, &gl->gl_flags))
++ continue;
++ if (!lm_name_equal(&gl->gl_name, name))
++ continue;
++
++ glock_hold(gl);
++
++ return gl;
++ }
++
++ return NULL;
++}
++
++/**
++ * gfs_glock_find() - Find glock by lock number
++ * @sdp: The GFS superblock
++ * @name: The lock name
++ *
++ * Figure out what bucket the lock is in, acquire the read lock on
++ * it and call search_bucket().
++ *
++ * Returns: NULL, or the struct gfs_glock with the requested number
++ */
++
++struct gfs_glock *
++gfs_glock_find(struct gfs_sbd *sdp, struct lm_lockname *name)
++{
++ struct gfs_gl_hash_bucket *bucket = &sdp->sd_gl_hash[gl_hash(name)];
++ struct gfs_glock *gl;
++
++ read_lock(&bucket->hb_lock);
++ gl = search_bucket(bucket, name);
++ read_unlock(&bucket->hb_lock);
++
++ return gl;
++}
++
++/**
++ * glock_free() - Perform a few checks and then release struct gfs_glock
++ * @gl: The glock to release
++ *
++ */
++
++static void
++glock_free(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct inode *aspace = gl->gl_aspace;
++
++ GFS_ASSERT_GLOCK(list_empty(&gl->gl_list), gl,);
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_count) == 1, gl,);
++ GFS_ASSERT_GLOCK(list_empty(&gl->gl_holders), gl,);
++ GFS_ASSERT_GLOCK(list_empty(&gl->gl_waiters1), gl,);
++ GFS_ASSERT_GLOCK(list_empty(&gl->gl_waiters2), gl,);
++ GFS_ASSERT_GLOCK(gl->gl_state == LM_ST_UNLOCKED, gl,);
++ GFS_ASSERT_GLOCK(!gl->gl_object, gl,);
++ GFS_ASSERT_GLOCK(!gl->gl_lvb, gl,);
++ GFS_ASSERT_GLOCK(list_empty(&gl->gl_reclaim), gl,);
++
++ sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
++
++ if (aspace)
++ gfs_aspace_put(aspace);
++
++ kmem_cache_free(gfs_glock_cachep, gl);
++
++ atomic_dec(&sdp->sd_glock_count);
++}
++
++/**
++ * gfs_glock_get() - Get a glock, or create one if one doesn't exist
++ * @sdp: The GFS superblock
++ * @number: the lock number
++ * @glops: The glock_operations to use
++ * @create: If FALSE, don't create the glock if it doesn't exist
++ * @glp: the glock is returned here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_glock_get(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ int create, struct gfs_glock **glp)
++{
++ struct lm_lockname name;
++ struct gfs_glock *gl, *tmp;
++ struct gfs_gl_hash_bucket *bucket;
++ int error;
++
++ name.ln_number = number;
++ name.ln_type = glops->go_type;
++ bucket = &sdp->sd_gl_hash[gl_hash(&name)];
++
++ read_lock(&bucket->hb_lock);
++ gl = search_bucket(bucket, &name);
++ read_unlock(&bucket->hb_lock);
++
++ if (gl || !create) {
++ *glp = gl;
++ return 0;
++ }
++
++ gl = kmem_cache_alloc(gfs_glock_cachep, GFP_KERNEL);
++ if (!gl)
++ return -ENOMEM;
++
++ memset(gl, 0, sizeof(struct gfs_glock));
++
++ INIT_LIST_HEAD(&gl->gl_list);
++ gl->gl_name = name;
++ atomic_set(&gl->gl_count, 1);
++
++ spin_lock_init(&gl->gl_spin);
++
++ gl->gl_state = LM_ST_UNLOCKED;
++ INIT_LIST_HEAD(&gl->gl_holders);
++ INIT_LIST_HEAD(&gl->gl_waiters1);
++ INIT_LIST_HEAD(&gl->gl_waiters2);
++
++ gl->gl_ops = glops;
++
++ INIT_LE(&gl->gl_new_le, &gfs_glock_lops);
++ INIT_LE(&gl->gl_incore_le, &gfs_glock_lops);
++
++ gl->gl_bucket = bucket;
++ INIT_LIST_HEAD(&gl->gl_reclaim);
++
++ gl->gl_sbd = sdp;
++
++ INIT_LIST_HEAD(&gl->gl_dirty_buffers);
++ INIT_LIST_HEAD(&gl->gl_ail_bufs);
++
++ if (glops == &gfs_inode_glops ||
++ glops == &gfs_rgrp_glops ||
++ glops == &gfs_meta_glops) {
++ gl->gl_aspace = gfs_aspace_get(sdp);
++ if (!gl->gl_aspace) {
++ error = -ENOMEM;
++ goto fail;
++ }
++ }
++
++ error = sdp->sd_lockstruct.ls_ops->lm_get_lock(sdp->sd_lockstruct.ls_lockspace,
++ &name,
++ &gl->gl_lock);
++ if (error)
++ goto fail_aspace;
++
++ atomic_inc(&sdp->sd_glock_count);
++
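++ /* Another CPU may have created and inserted the same glock while we
++ were allocating; re-search under the write lock and discard our
++ copy if we lost the race. */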
++ write_lock(&bucket->hb_lock);
++ tmp = search_bucket(bucket, &name);
++ if (tmp) {
++ write_unlock(&bucket->hb_lock);
++ glock_free(gl);
++ gl = tmp;
++ } else {
++ list_add_tail(&gl->gl_list, &bucket->hb_list);
++ write_unlock(&bucket->hb_lock);
++ }
++
++ *glp = gl;
++
++ return 0;
++
++ fail_aspace:
++ if (gl->gl_aspace)
++ gfs_aspace_put(gl->gl_aspace);
++
++ fail:
++ kmem_cache_free(gfs_glock_cachep, gl);
++
++ return error;
++}
++
++/**
++ * gfs_glock_hold() - As glock_hold(), but suitable for exporting
++ * @gl: The glock to hold
++ *
++ */
++
++void
++gfs_glock_hold(struct gfs_glock *gl)
++{
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_count) > 0, gl,);
++ glock_hold(gl);
++}
++
++/**
++ * gfs_glock_put() - As glock_put(), but suitable for exporting
++ * @gl: The glock to put
++ *
++ */
++
++void
++gfs_glock_put(struct gfs_glock *gl)
++{
++ glock_put(gl);
++}
++
++/**
++ * gfs_holder_init - initialize a struct gfs_holder in the default way
++ * @gl: the glock
++ * @state: the state we're requesting
++ * @flags: the modifier flags
++ * @gh: the holder structure
++ *
++ */
++
++void
++gfs_holder_init(struct gfs_glock *gl, unsigned int state, int flags,
++ struct gfs_holder *gh)
++{
++ memset(gh, 0, sizeof(struct gfs_holder));
++
++ INIT_LIST_HEAD(&gh->gh_list);
++ gh->gh_gl = gl;
++ gh->gh_owner = current;
++ gh->gh_state = state;
++ gh->gh_flags = flags;
++
++ if (gh->gh_state == LM_ST_EXCLUSIVE)
++ gh->gh_flags |= GL_LOCAL_EXCL;
++
++ init_completion(&gh->gh_wait);
++
++ glock_hold(gl);
++}
++
++/**
++ * gfs_holder_reinit - reinitialize a struct gfs_holder so we can requeue it
++ * @state: the state we're requesting
++ * @flags: the modifier flags
++ * @gh: the holder structure
++ *
++ * Don't mess with the glock.
++ *
++ */
++
++void
++gfs_holder_reinit(unsigned int state, int flags, struct gfs_holder *gh)
++{
++ int alloced;
++
++ GFS_ASSERT_GLOCK(list_empty(&gh->gh_list), gh->gh_gl,);
++
++ gh->gh_state = state;
++ gh->gh_flags = flags;
++
++ if (gh->gh_state == LM_ST_EXCLUSIVE)
++ gh->gh_flags |= GL_LOCAL_EXCL;
++
++ alloced = test_bit(HIF_ALLOCED, &gh->gh_iflags);
++ memset(&gh->gh_iflags, 0, sizeof(unsigned long));
++ if (alloced)
++ set_bit(HIF_ALLOCED, &gh->gh_iflags);
++}
++
++/**
++ * gfs_holder_uninit - uninitialize a holder structure (drop reference on glock)
++ * @gh: the holder structure
++ *
++ */
++
++void
++gfs_holder_uninit(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++
++ GFS_ASSERT_GLOCK(list_empty(&gh->gh_list), gl,);
++ gh->gh_gl = NULL;
++
++ glock_put(gl);
++}
++
++/**
++ * gfs_holder_get - get a struct gfs_holder structure
++ * @gl: the glock
++ * @state: the state we're requesting
++ * @flags: the modifier flags
++ *
++ * Figure out how big an impact this function has. Either:
++ * 1) Replace it with a cache of structures hanging off the struct gfs_sbd
++ * 2) Get rid of it and call gmalloc() directly
++ * 3) Leave it like it is
++ *
++ * Returns: the holder structure
++ */
++
++struct gfs_holder *
++gfs_holder_get(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ struct gfs_holder *gh;
++
++ gh = gmalloc(sizeof(struct gfs_holder));
++ gfs_holder_init(gl, state, flags, gh);
++ set_bit(HIF_ALLOCED, &gh->gh_iflags);
++
++ return gh;
++}
++
++/**
++ * gfs_holder_put - get rid of a struct gfs_holder structure
++ * @gh: the holder structure
++ *
++ */
++
++void
++gfs_holder_put(struct gfs_holder *gh)
++{
++ GFS_ASSERT_GLOCK(test_bit(HIF_ALLOCED, &gh->gh_iflags), gh->gh_gl,);
++ gfs_holder_uninit(gh);
++ kfree(gh);
++}
++
++/**
++ * handle_recurse - put other holder structures (marked recursive) into the holders list
++ * @gh: the holder structure
++ *
++ */
++
++static void
++handle_recurse(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct list_head *tmp, *head, *next;
++ struct gfs_holder *tmp_gh;
++ int found = FALSE;
++
++ GFS_ASSERT_GLOCK(gh->gh_owner, gl,);
++
++ for (head = &gl->gl_waiters2, tmp = head->next, next = tmp->next;
++ tmp != head;
++ tmp = next, next = tmp->next) {
++ tmp_gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (tmp_gh->gh_owner != gh->gh_owner)
++ continue;
++
++ GFS_ASSERT_GLOCK(test_bit(HIF_RECURSE, &tmp_gh->gh_iflags),
++ gl,);
++
++ list_move_tail(&tmp_gh->gh_list, &gl->gl_holders);
++ tmp_gh->gh_error = 0;
++ set_bit(HIF_HOLDER, &tmp_gh->gh_iflags);
++
++ complete(&tmp_gh->gh_wait);
++
++ found = TRUE;
++ }
++
++ GFS_ASSERT_GLOCK(found, gl,);
++}
++
++/**
++ * do_unrecurse - a recursive holder was just dropped off the waiters2 list
++ * @gh: the holder
++ *
++ * If there is only one other recursive holder left, clear its HIF_RECURSE bit.
++ * If there is more than one, leave them alone.
++ *
++ */
++
++static void
++do_unrecurse(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct list_head *tmp, *head;
++ struct gfs_holder *tmp_gh, *last_gh = NULL;
++ int found = FALSE;
++
++ GFS_ASSERT_GLOCK(gh->gh_owner, gl,);
++
++ for (head = &gl->gl_waiters2, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ tmp_gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (tmp_gh->gh_owner != gh->gh_owner)
++ continue;
++
++ GFS_ASSERT_GLOCK(test_bit(HIF_RECURSE, &tmp_gh->gh_iflags),
++ gl,);
++
++ if (found)
++ return;
++
++ found = TRUE;
++ last_gh = tmp_gh;
++ }
++
++ GFS_ASSERT_GLOCK(found, gl,);
++ clear_bit(HIF_RECURSE, &last_gh->gh_iflags);
++}
++
++/**
++ * rq_mutex - process a mutex request in the queue
++ * @gh: the glock holder
++ *
++ * Returns: TRUE if the queue is blocked
++ */
++
++static int
++rq_mutex(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++
++ list_del_init(&gh->gh_list);
++ /* gh->gh_error never examined. */
++ set_bit(GLF_LOCK, &gl->gl_flags);
++ complete(&gh->gh_wait);
++
++ return TRUE;
++}
++
++/**
++ * rq_promote - process a promote request in the queue
++ * @gh: the glock holder
++ * @promote_ok: It's ok to ask the LM to do promotes on a sync lock module
++ *
++ * Returns: TRUE if the queue is blocked
++ */
++
++static int
++rq_promote(struct gfs_holder *gh, int promote_ok)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ int recurse;
++
++ if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
++ if (list_empty(&gl->gl_holders)) {
++ if (promote_ok || GFS_ASYNC_LM(sdp)) {
++ gl->gl_req_gh = gh;
++ set_bit(GLF_LOCK, &gl->gl_flags);
++ spin_unlock(&gl->gl_spin);
++
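++ /* Promotes add cached lock state; if the reclaim list has
++ grown past the tunable limit, reclaim a couple of glocks
++ before issuing another lock request. */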
++ if (atomic_read(&sdp->sd_reclaim_count) >
++ sdp->sd_tune.gt_reclaim_limit &&
++ !(gh->gh_flags & LM_FLAG_PRIORITY)) {
++ gfs_reclaim_glock(sdp);
++ gfs_reclaim_glock(sdp);
++ }
++
++ glops->go_xmote_th(gl, gh->gh_state,
++ gh->gh_flags);
++
++ spin_lock(&gl->gl_spin);
++ } else
++ if (!test_and_set_bit(HIF_WAKEUP, &gh->gh_iflags))
++ complete(&gh->gh_wait);
++ }
++ return TRUE;
++ }
++
++ if (list_empty(&gl->gl_holders)) {
++ set_bit(HIF_FIRST, &gh->gh_iflags);
++ set_bit(GLF_LOCK, &gl->gl_flags);
++ recurse = FALSE;
++ } else {
++ struct gfs_holder *next_gh;
++ if (gh->gh_flags & GL_LOCAL_EXCL)
++ return TRUE;
++ next_gh = list_entry(gl->gl_holders.next, struct gfs_holder, gh_list);
++ if (next_gh->gh_flags & GL_LOCAL_EXCL)
++ return TRUE;
++ recurse = test_bit(HIF_RECURSE, &gh->gh_iflags);
++ }
++
++ list_move_tail(&gh->gh_list, &gl->gl_holders);
++ gh->gh_error = 0;
++ set_bit(HIF_HOLDER, &gh->gh_iflags);
++
++ if (recurse)
++ handle_recurse(gh);
++
++ complete(&gh->gh_wait);
++
++ return FALSE;
++}
++
++/**
++ * rq_demote - process a demote request in the queue
++ * @gh: the glock holder
++ *
++ * Returns: TRUE if the queue is blocked
++ */
++
++static int
++rq_demote(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++
++ if (!list_empty(&gl->gl_holders))
++ return TRUE;
++
++ if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
++ list_del_init(&gh->gh_list);
++ gh->gh_error = 0;
++ spin_unlock(&gl->gl_spin);
++ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
++ gfs_holder_put(gh);
++ else
++ complete(&gh->gh_wait);
++ spin_lock(&gl->gl_spin);
++ } else {
++ gl->gl_req_gh = gh;
++ set_bit(GLF_LOCK, &gl->gl_flags);
++ spin_unlock(&gl->gl_spin);
++
++ if (gh->gh_state == LM_ST_UNLOCKED ||
++ gl->gl_state != LM_ST_EXCLUSIVE)
++ glops->go_drop_th(gl);
++ else
++ glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
++
++ spin_lock(&gl->gl_spin);
++ }
++
++ return FALSE;
++}
++
++/**
++ * run_queue - process holder structures on a glock
++ * @gl: the glock
++ * @promote_ok: It's ok to ask the LM to do promotes on a sync lock module
++ *
++ */
++
++static void
++run_queue(struct gfs_glock *gl, int promote_ok)
++{
++ struct gfs_holder *gh;
++ int blocked;
++
++ for (;;) {
++ if (test_bit(GLF_LOCK, &gl->gl_flags))
++ break;
++
++ if (!list_empty(&gl->gl_waiters1)) {
++ gh = list_entry(gl->gl_waiters1.next,
++ struct gfs_holder, gh_list);
++
++ if (test_bit(HIF_MUTEX, &gh->gh_iflags))
++ blocked = rq_mutex(gh);
++ else
++ GFS_ASSERT_GLOCK(FALSE, gl,);
++
++ } else if (!list_empty(&gl->gl_waiters2)) {
++ gh = list_entry(gl->gl_waiters2.next,
++ struct gfs_holder, gh_list);
++
++ if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
++ blocked = rq_promote(gh, promote_ok);
++ else if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
++ blocked = rq_demote(gh);
++ else
++ GFS_ASSERT_GLOCK(FALSE, gl,);
++
++ } else
++ break;
++
++ if (blocked)
++ break;
++ }
++}
++
++/**
++ * lock_on_glock - acquire a local lock on a glock
++ * @gl: the glock
++ *
++ */
++
++static void
++lock_on_glock(struct gfs_glock *gl)
++{
++ struct gfs_holder gh;
++
++ gfs_holder_init(gl, 0, 0, &gh);
++ set_bit(HIF_MUTEX, &gh.gh_iflags);
++
++ spin_lock(&gl->gl_spin);
++ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
++ list_add_tail(&gh.gh_list, &gl->gl_waiters1);
++ else
++ complete(&gh.gh_wait);
++ spin_unlock(&gl->gl_spin);
++
++ wait_for_completion(&gh.gh_wait);
++ gfs_holder_uninit(&gh);
++}
++
++/**
++ * trylock_on_glock - try to acquire a local lock on a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if the glock is acquired
++ */
++
++static int
++trylock_on_glock(struct gfs_glock *gl)
++{
++ int acquired = TRUE;
++
++ spin_lock(&gl->gl_spin);
++ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
++ acquired = FALSE;
++ spin_unlock(&gl->gl_spin);
++
++ return acquired;
++}
++
++/**
++ * unlock_on_glock - release a local lock on a glock
++ * @gl: the glock
++ *
++ */
++
++static void
++unlock_on_glock(struct gfs_glock *gl)
++{
++ spin_lock(&gl->gl_spin);
++ clear_bit(GLF_LOCK, &gl->gl_flags);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++}
++
++/**
++ * handle_callback - add a demote request to a lock's queue
++ * @gl: the glock
++ * @state: the state the callback asks us to change to
++ *
++ */
++
++static void
++handle_callback(struct gfs_glock *gl, unsigned int state)
++{
++ struct list_head *tmp, *head;
++ struct gfs_holder *gh, *new_gh = NULL;
++
++ GFS_ASSERT_GLOCK(state != LM_ST_EXCLUSIVE, gl,);
++
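++ /* We cannot allocate while holding gl_spin, so the demote holder is
++ allocated with the lock dropped and the scan retried; if an
++ equivalent demote request was queued meanwhile, ours is freed. */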
++ restart:
++ spin_lock(&gl->gl_spin);
++
++ for (head = &gl->gl_waiters2, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
++ gl->gl_req_gh != gh) {
++ if (gh->gh_state != state)
++ gh->gh_state = LM_ST_UNLOCKED;
++ goto out;
++ }
++ }
++
++ if (new_gh) {
++ list_add(&new_gh->gh_list, &gl->gl_waiters2);
++ new_gh = NULL;
++ } else {
++ spin_unlock(&gl->gl_spin);
++
++ new_gh = gfs_holder_get(gl, state, LM_FLAG_TRY);
++ set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
++ set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
++ new_gh->gh_owner = NULL;
++
++ goto restart;
++ }
++
++ out:
++ spin_unlock(&gl->gl_spin);
++
++ if (new_gh)
++ gfs_holder_put(new_gh);
++}
++
++/**
++ * state_change - record that the glock is now in a different state
++ * @gl: the glock
++ * @new_state: the new state
++ *
++ */
++
++static void
++state_change(struct gfs_glock *gl, unsigned int new_state)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ int held1, held2;
++
++ held1 = (gl->gl_state != LM_ST_UNLOCKED);
++ held2 = (new_state != LM_ST_UNLOCKED);
++
++ if (held1 != held2) {
++ if (held2) {
++ atomic_inc(&sdp->sd_glock_held_count);
++ glock_hold(gl);
++ } else {
++ atomic_dec(&sdp->sd_glock_held_count);
++ glock_put(gl);
++ }
++ }
++
++ gl->gl_state = new_state;
++}
++
++/**
++ * xmote_bh - Called after the lock module is done acquiring a lock
++ * @gl: The glock in question
++ * @ret: the int returned from the lock module
++ *
++ */
++
++static void
++xmote_bh(struct gfs_glock *gl, unsigned int ret)
++{
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ struct gfs_holder *gh = gl->gl_req_gh;
++ int prev_state = gl->gl_state;
++ int op_done = TRUE;
++
++ GFS_ASSERT_GLOCK(test_bit(GLF_LOCK, &gl->gl_flags), gl,);
++ GFS_ASSERT_GLOCK(queue_empty(gl, &gl->gl_holders), gl,);
++ GFS_ASSERT_GLOCK(!(ret & LM_OUT_ASYNC), gl,);
++
++ state_change(gl, ret & LM_OUT_ST_MASK);
++
++ if (ret & LM_OUT_NEED_E)
++ handle_callback(gl, LM_ST_UNLOCKED);
++ else if (ret & LM_OUT_NEED_D)
++ handle_callback(gl, LM_ST_DEFERRED);
++ else if (ret & LM_OUT_NEED_S)
++ handle_callback(gl, LM_ST_SHARED);
++
++ if (ret & LM_OUT_LVB_INVALID)
++ set_bit(GLF_LVB_INVALID, &gl->gl_flags);
++
++ if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
++ if (glops->go_inval)
++ glops->go_inval(gl, DIO_METADATA | DIO_DATA);
++ } else if (gl->gl_state == LM_ST_DEFERRED) {
++ /* We might not want to do this here.
++ Look at moving to the inode glops. */
++ if (glops->go_inval)
++ glops->go_inval(gl, DIO_DATA);
++ }
++
++ /* Deal with each possible exit condition */
++
++ if (!gh)
++ gl->gl_stamp = jiffies;
++
++ else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ if (gl->gl_state == gh->gh_state ||
++ gl->gl_state == LM_ST_UNLOCKED)
++ gh->gh_error = 0;
++ else
++ gh->gh_error = GLR_TRYFAILED;
++ spin_unlock(&gl->gl_spin);
++
++ if (ret & LM_OUT_CANCELED)
++ handle_callback(gl, LM_ST_UNLOCKED); /* Lame */
++
++ } else if (ret & LM_OUT_CANCELED) {
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ gh->gh_error = GLR_CANCELED;
++ if (test_bit(HIF_RECURSE, &gh->gh_iflags))
++ do_unrecurse(gh);
++ spin_unlock(&gl->gl_spin);
++
++ } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
++ spin_lock(&gl->gl_spin);
++ list_move_tail(&gh->gh_list, &gl->gl_holders);
++ gh->gh_error = 0;
++ set_bit(HIF_HOLDER, &gh->gh_iflags);
++ spin_unlock(&gl->gl_spin);
++
++ set_bit(HIF_FIRST, &gh->gh_iflags);
++
++ op_done = FALSE;
++
++ } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ gh->gh_error = GLR_TRYFAILED;
++ if (test_bit(HIF_RECURSE, &gh->gh_iflags))
++ do_unrecurse(gh);
++ spin_unlock(&gl->gl_spin);
++
++ } else
++ GFS_ASSERT_GLOCK(FALSE, gl,);
++
++ if (glops->go_xmote_bh)
++ glops->go_xmote_bh(gl);
++
++ if (op_done) {
++ spin_lock(&gl->gl_spin);
++ gl->gl_req_gh = NULL;
++ gl->gl_req_bh = NULL;
++ clear_bit(GLF_LOCK, &gl->gl_flags);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++ }
++
++ glock_put(gl);
++
++ if (gh) {
++ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
++ gfs_holder_put(gh);
++ else
++ complete(&gh->gh_wait);
++ }
++}
++
++/**
++ * gfs_glock_xmote_th - Call into the lock module to acquire a glock
++ * @gl: The glock in question
++ * @state: the requested state
++ * @flags: modifier flags to the lock call
++ *
++ */
++
++void
++gfs_glock_xmote_th(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
++ LM_FLAG_NOEXP | LM_FLAG_ANY |
++ LM_FLAG_PRIORITY);
++ unsigned int lck_ret;
++
++ GFS_ASSERT_GLOCK(test_bit(GLF_LOCK, &gl->gl_flags), gl,);
++ GFS_ASSERT_GLOCK(queue_empty(gl, &gl->gl_holders), gl,);
++ GFS_ASSERT_GLOCK(state != LM_ST_UNLOCKED, gl,);
++ GFS_ASSERT_GLOCK(state != gl->gl_state, gl,);
++
++ if (gl->gl_state == LM_ST_EXCLUSIVE) {
++ if (glops->go_sync)
++ glops->go_sync(gl, DIO_METADATA | DIO_DATA);
++ }
++
++ glock_hold(gl);
++ gl->gl_req_bh = xmote_bh;
++
++ atomic_inc(&sdp->sd_lm_lock_calls);
++
++ lck_ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl->gl_lock,
++ gl->gl_state,
++ state, lck_flags);
++
++ if (lck_ret & LM_OUT_ASYNC)
++ GFS_ASSERT_GLOCK(lck_ret == LM_OUT_ASYNC, gl,);
++ else
++ xmote_bh(gl, lck_ret);
++}
++
++/**
++ * drop_bh - Called after a lock module unlock completes
++ * @gl: the glock
++ * @ret: the return status
++ *
++ * Doesn't wake up the process waiting on the struct gfs_holder (if any)
++ * Doesn't drop the reference on the glock the top half took out
++ *
++ */
++
++static void
++drop_bh(struct gfs_glock *gl, unsigned int ret)
++{
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ struct gfs_holder *gh = gl->gl_req_gh;
++
++ clear_bit(GLF_PREFETCH, &gl->gl_flags);
++
++ GFS_ASSERT_GLOCK(test_bit(GLF_LOCK, &gl->gl_flags), gl,);
++ GFS_ASSERT_GLOCK(queue_empty(gl, &gl->gl_holders), gl,);
++ GFS_ASSERT_GLOCK(!ret, gl,);
++
++ state_change(gl, LM_ST_UNLOCKED);
++
++ if (glops->go_inval)
++ glops->go_inval(gl, DIO_METADATA | DIO_DATA);
++
++ if (gh) {
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ gh->gh_error = 0;
++ spin_unlock(&gl->gl_spin);
++ }
++
++ if (glops->go_drop_bh)
++ glops->go_drop_bh(gl);
++
++ spin_lock(&gl->gl_spin);
++ gl->gl_req_gh = NULL;
++ gl->gl_req_bh = NULL;
++ clear_bit(GLF_LOCK, &gl->gl_flags);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++
++ glock_put(gl);
++
++ if (gh) {
++ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
++ gfs_holder_put(gh);
++ else
++ complete(&gh->gh_wait);
++ }
++}
++
++/**
++ * gfs_glock_drop_th - call into the lock module to unlock a lock
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_glock_drop_th(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ unsigned int ret;
++
++ GFS_ASSERT_GLOCK(test_bit(GLF_LOCK, &gl->gl_flags), gl,);
++ GFS_ASSERT_GLOCK(queue_empty(gl, &gl->gl_holders), gl,);
++ GFS_ASSERT_GLOCK(gl->gl_state != LM_ST_UNLOCKED, gl,);
++
++ if (gl->gl_state == LM_ST_EXCLUSIVE) {
++ if (glops->go_sync)
++ glops->go_sync(gl, DIO_METADATA | DIO_DATA);
++ }
++
++ glock_hold(gl);
++ gl->gl_req_bh = drop_bh;
++
++ atomic_inc(&sdp->sd_lm_unlock_calls);
++
++ ret = sdp->sd_lockstruct.ls_ops->lm_unlock(gl->gl_lock, gl->gl_state);
++
++ if (!ret)
++ drop_bh(gl, ret);
++ else
++ GFS_ASSERT_GLOCK(ret == LM_OUT_ASYNC, gl,);
++}
++
++/**
++ * handle_cancels - cancel requests for locks stuck waiting on an expire flag
++ * @gh: the LM_FLAG_NOEXP holder waiting to acquire the lock
++ *
++ */
++
++static void
++handle_cancels(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++
++ spin_lock(&gl->gl_spin);
++
++ while (gl->gl_req_gh != gh &&
++ !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
++ !test_bit(HIF_WAKEUP, &gh->gh_iflags) &&
++ !list_empty(&gh->gh_list)) {
++ if (gl->gl_req_bh) {
++ spin_unlock(&gl->gl_spin);
++ gl->gl_sbd->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
++ yield();
++ spin_lock(&gl->gl_spin);
++ } else {
++ spin_unlock(&gl->gl_spin);
++ yield();
++ spin_lock(&gl->gl_spin);
++ }
++ }
++
++ spin_unlock(&gl->gl_spin);
++}
++
++/**
++ * glock_wait_internal - wait on a glock acquisition
++ * @gh: the glock holder
++ *
++ * Returns: 0 on success
++ */
++
++static int
++glock_wait_internal(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ int error = 0;
++
++ if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
++ spin_lock(&gl->gl_spin);
++ if (gl->gl_req_gh != gh &&
++ !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
++ !test_bit(HIF_WAKEUP, &gh->gh_iflags) &&
++ !list_empty(&gh->gh_list)) {
++ list_del_init(&gh->gh_list);
++ gh->gh_error = GLR_TRYFAILED;
++ if (test_bit(HIF_RECURSE, &gh->gh_iflags))
++ do_unrecurse(gh);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++ return GLR_TRYFAILED;
++ }
++ spin_unlock(&gl->gl_spin);
++ }
++
++ if (gh->gh_flags & LM_FLAG_NOEXP)
++ handle_cancels(gh);
++
++ for (;;) {
++ wait_for_completion(&gh->gh_wait);
++
++ spin_lock(&gl->gl_spin);
++ if (test_and_clear_bit(HIF_WAKEUP, &gh->gh_iflags)) {
++ run_queue(gl, TRUE);
++ spin_unlock(&gl->gl_spin);
++ } else {
++ spin_unlock(&gl->gl_spin);
++ break;
++ }
++ }
++
++ if (gh->gh_error)
++ return gh->gh_error;
++
++ GFS_ASSERT_GLOCK(test_bit(HIF_HOLDER, &gh->gh_iflags), gl,);
++ GFS_ASSERT_GLOCK(relaxed_state_ok(gl->gl_state, gh->gh_state,
++ gh->gh_flags), gl,);
++
++ if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
++ GFS_ASSERT_GLOCK(test_bit(GLF_LOCK, &gl->gl_flags), gl,);
++
++ if (glops->go_lock) {
++ error = glops->go_lock(gl, gh->gh_flags);
++ if (error) {
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ gh->gh_error = error;
++ if (test_and_clear_bit(HIF_RECURSE, &gh->gh_iflags))
++ do_unrecurse(gh);
++ spin_unlock(&gl->gl_spin);
++ }
++ }
++
++ spin_lock(&gl->gl_spin);
++ gl->gl_req_gh = NULL;
++ gl->gl_req_bh = NULL;
++ clear_bit(GLF_LOCK, &gl->gl_flags);
++ if (test_bit(HIF_RECURSE, &gh->gh_iflags))
++ handle_recurse(gh);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++ }
++
++ return error;
++}
++
++/**
++ * add_to_queue - Add a holder to the wait queue (but look for recursion)
++ * @gh: the holder structure
++ *
++ */
++
++static void
++add_to_queue(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct list_head *tmp, *head;
++ struct gfs_holder *tmp_gh;
++
++ if (gh->gh_owner) {
++ for (head = &gl->gl_holders, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ tmp_gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (tmp_gh->gh_owner == gh->gh_owner) {
++ GFS_ASSERT_GLOCK((gh->gh_flags & LM_FLAG_ANY) ||
++ !(tmp_gh->gh_flags & LM_FLAG_ANY),
++ gl,);
++ GFS_ASSERT_GLOCK((tmp_gh->gh_flags & GL_LOCAL_EXCL) ||
++ !(gh->gh_flags & GL_LOCAL_EXCL),
++ gl,);
++ GFS_ASSERT_GLOCK(relaxed_state_ok(gl->gl_state,
++ gh->gh_state,
++ gh->gh_flags),
++ gl,);
++
++ list_add_tail(&gh->gh_list, &gl->gl_holders);
++ set_bit(HIF_HOLDER, &gh->gh_iflags);
++
++ gh->gh_error = 0;
++ complete(&gh->gh_wait);
++
++ return;
++ }
++ }
++
++ for (head = &gl->gl_waiters2, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ tmp_gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (tmp_gh->gh_owner == gh->gh_owner) {
++ GFS_ASSERT_GLOCK(test_bit(HIF_PROMOTE,
++ &tmp_gh->gh_iflags),
++ gl,);
++ GFS_ASSERT_GLOCK((gh->gh_flags & LM_FLAG_ANY) ||
++ !(tmp_gh->gh_flags & LM_FLAG_ANY),
++ gl,);
++ GFS_ASSERT_GLOCK((tmp_gh->gh_flags & GL_LOCAL_EXCL) ||
++ !(gh->gh_flags & GL_LOCAL_EXCL),
++ gl,);
++ GFS_ASSERT_GLOCK(relaxed_state_ok(tmp_gh->gh_state,
++ gh->gh_state,
++ gh->gh_flags),
++ gl,);
++
++ set_bit(HIF_RECURSE, &gh->gh_iflags);
++ set_bit(HIF_RECURSE, &tmp_gh->gh_iflags);
++
++ list_add_tail(&gh->gh_list, &gl->gl_waiters2);
++
++ return;
++ }
++ }
++ }
++
++ if (gh->gh_flags & LM_FLAG_PRIORITY)
++ list_add(&gh->gh_list, &gl->gl_waiters2);
++ else
++ list_add_tail(&gh->gh_list, &gl->gl_waiters2);
++}
++
++/**
++ * gfs_glock_nq - enqueue a struct gfs_holder onto a glock (acquire a glock)
++ * @gh: the holder structure
++ *
++ * if (gh->gh_flags & GL_ASYNC), this never returns an error
++ *
++ * Returns: 0, GLR_TRYFAILED, or -EXXX on failure
++ */
++
++int
++gfs_glock_nq(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ int error = 0;
++
++ GFS_ASSERT_GLOCK(list_empty(&gh->gh_list), gl,);
++ GFS_ASSERT_GLOCK(gh->gh_state != LM_ST_UNLOCKED, gl,);
++ GFS_ASSERT_GLOCK((gh->gh_flags & (LM_FLAG_ANY | GL_EXACT)) !=
++ (LM_FLAG_ANY | GL_EXACT), gl,);
++ GFS_ASSERT_GLOCK(GFS_ASYNC_LM(sdp) ||
++ !(gh->gh_flags & GL_ASYNC), gl,);
++
++ atomic_inc(&sdp->sd_glock_nq_calls);
++
++ restart:
++ set_bit(HIF_PROMOTE, &gh->gh_iflags);
++
++ spin_lock(&gl->gl_spin);
++ add_to_queue(gh);
++ run_queue(gl, TRUE);
++ spin_unlock(&gl->gl_spin);
++
++ if (!(gh->gh_flags & GL_ASYNC)) {
++ error = glock_wait_internal(gh);
++ if (error == GLR_CANCELED) {
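++ /* The request was canceled (see handle_cancels());
++ back off for a second and queue it again. */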
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ goto restart;
++ }
++ }
++
++ clear_bit(GLF_PREFETCH, &gl->gl_flags);
++
++ return error;
++}
++
++/**
++ * gfs_glock_poll - poll to see if an async request has been completed
++ * @gh: the holder
++ *
++ * Returns: TRUE if the request is ready to be gfs_glock_wait()ed on
++ */
++
++int
++gfs_glock_poll(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ int ready = FALSE;
++
++ GFS_ASSERT_GLOCK(gh->gh_flags & GL_ASYNC, gl,);
++ GFS_ASSERT_GLOCK(!test_bit(HIF_WAKEUP, &gh->gh_iflags), gl,);
++
++ spin_lock(&gl->gl_spin);
++
++ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
++ ready = TRUE;
++ else if (list_empty(&gh->gh_list)) {
++ if (gh->gh_error == GLR_CANCELED) {
++ spin_unlock(&gl->gl_spin);
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ gfs_glock_nq(gh);
++ return FALSE;
++ } else
++ ready = TRUE;
++ }
++
++ spin_unlock(&gl->gl_spin);
++
++ return ready;
++}
++
++/**
++ * gfs_glock_wait - wait for a GL_ASYNC lock acquisition to complete
++ * @gh: the holder structure
++ *
++ * Returns: 0, GLR_TRYFAILED, or -EXXX on failure
++ */
++
++int
++gfs_glock_wait(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ int error;
++
++ GFS_ASSERT_GLOCK(gh->gh_flags & GL_ASYNC, gl,);
++ GFS_ASSERT_GLOCK(!test_bit(HIF_WAKEUP, &gh->gh_iflags), gl,);
++
++ error = glock_wait_internal(gh);
++ if (error == GLR_CANCELED) {
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ gh->gh_flags &= ~GL_ASYNC;
++ error = gfs_glock_nq(gh);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_glock_dq - dequeue a struct gfs_holder from a glock (release a glock)
++ * @gh: the glock holder
++ *
++ */
++
++void
++gfs_glock_dq(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++
++ GFS_ASSERT_GLOCK(!queue_empty(gl, &gh->gh_list), gl,);
++ GFS_ASSERT_GLOCK(test_bit(HIF_HOLDER, &gh->gh_iflags), gl,);
++
++ atomic_inc(&gl->gl_sbd->sd_glock_dq_calls);
++
++ if (gh->gh_flags & GL_SYNC)
++ set_bit(GLF_SYNC, &gl->gl_flags);
++ if (gh->gh_flags & GL_NOCACHE)
++ handle_callback(gl, LM_ST_UNLOCKED);
++
++ lock_on_glock(gl);
++
++ spin_lock(&gl->gl_spin);
++ list_del_init(&gh->gh_list);
++ if (list_empty(&gl->gl_holders)) {
++ spin_unlock(&gl->gl_spin);
++
++ if (glops->go_unlock)
++ glops->go_unlock(gl, gh->gh_flags);
++
++ if (test_bit(GLF_SYNC, &gl->gl_flags)) {
++ if (glops->go_sync)
++ glops->go_sync(gl,
++ DIO_METADATA |
++ DIO_DATA |
++ DIO_INVISIBLE);
++ }
++
++ gl->gl_stamp = jiffies;
++
++ spin_lock(&gl->gl_spin);
++ }
++
++ clear_bit(GLF_LOCK, &gl->gl_flags);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++}
++
++/**
++ * gfs_glock_prefetch - Try to prefetch a glock
++ * @gl: the glock
++ * @state: the state to prefetch in
++ * @flags: flags passed to go_xmote_th()
++ *
++ */
++
++void
++gfs_glock_prefetch(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ struct gfs_glock_operations *glops = gl->gl_ops;
++
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_count) > 0, gl,);
++ GFS_ASSERT_GLOCK(state != LM_ST_UNLOCKED, gl,);
++ GFS_ASSERT_GLOCK((flags & (LM_FLAG_ANY | GL_EXACT)) !=
++ (LM_FLAG_ANY | GL_EXACT), gl,);
++
++ spin_lock(&gl->gl_spin);
++
++ if (test_bit(GLF_LOCK, &gl->gl_flags) ||
++ !list_empty(&gl->gl_holders) ||
++ !list_empty(&gl->gl_waiters1) ||
++ !list_empty(&gl->gl_waiters2) ||
++ relaxed_state_ok(gl->gl_state, state, flags)) {
++ spin_unlock(&gl->gl_spin);
++ return;
++ }
++
++ set_bit(GLF_PREFETCH, &gl->gl_flags);
++
++ GFS_ASSERT_GLOCK(!gl->gl_req_gh, gl,);
++ set_bit(GLF_LOCK, &gl->gl_flags);
++ spin_unlock(&gl->gl_spin);
++
++ glops->go_xmote_th(gl, state, flags);
++
++ atomic_inc(&gl->gl_sbd->sd_glock_prefetch_calls);
++}
++
++/**
++ * gfs_glock_force_drop - Force a glock to be uncached
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_glock_force_drop(struct gfs_glock *gl)
++{
++ struct gfs_holder gh;
++
++ gfs_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
++ set_bit(HIF_DEMOTE, &gh.gh_iflags);
++ gh.gh_owner = NULL;
++
++ spin_lock(&gl->gl_spin);
++ list_add(&gh.gh_list, &gl->gl_waiters2);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++
++ wait_for_completion(&gh.gh_wait);
++ gfs_holder_uninit(&gh);
++}
++
++/**
++ * gfs_glock_nq_init - initialize a holder and enqueue it on a glock
++ * @gl: the glock
++ * @state: the state we're requesting
++ * @flags: the modifier flags
++ * @gh: the holder structure
++ *
++ * Returns: 0, GLR_*, or -EXXX
++ */
++
++int
++gfs_glock_nq_init(struct gfs_glock *gl, unsigned int state, int flags,
++ struct gfs_holder *gh)
++{
++ int error;
++
++ gfs_holder_init(gl, state, flags, gh);
++
++ error = gfs_glock_nq(gh);
++ if (error)
++ gfs_holder_uninit(gh);
++
++ return error;
++}
++
++/**
++ * gfs_glock_dq_uninit - dequeue a holder from a glock and uninitialize it
++ * @gh: the holder structure
++ *
++ */
++
++void
++gfs_glock_dq_uninit(struct gfs_holder *gh)
++{
++ gfs_glock_dq(gh);
++ gfs_holder_uninit(gh);
++}
++
++/**
++ * gfs_glock_nq_num - acquire a glock based on lock number
++ * @sdp: the filesystem
++ * @number: the lock number
++ * @glops: the glock operations for the type of glock
++ * @state: the state to acquire the glock in
++ * @flags: modifier flags for the acquisition
++ * @gh: the struct gfs_holder
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_glock_nq_num(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ unsigned int state, int flags, struct gfs_holder *gh)
++{
++ struct gfs_glock *gl;
++ int error;
++
++ error = gfs_glock_get(sdp, number, glops, CREATE, &gl);
++ if (!error) {
++ error = gfs_glock_nq_init(gl, state, flags, gh);
++ glock_put(gl);
++ }
++
++ return error;
++}
++
++/**
++ * glock_compare - Compare two struct gfs_glock structures for sorting
++ * @arg_a: the first structure
++ * @arg_b: the second structure
++ *
++ * Sort order is by lock number, then exclusive before shared and
++ * GL_LOCAL_EXCL holders first, so that nq_m_sync() always acquires a
++ * set of glocks in one canonical, deadlock-free order.
++ *
++ * Returns: -1, 0, or 1 to sort A before, with, or after B
++ */
++
++static int
++glock_compare(void *arg_a, void *arg_b)
++{
++ struct gfs_holder *gh_a = *(struct gfs_holder **)arg_a;
++ struct gfs_holder *gh_b = *(struct gfs_holder **)arg_b;
++ struct lm_lockname *a = &gh_a->gh_gl->gl_name;
++ struct lm_lockname *b = &gh_b->gh_gl->gl_name;
++ int ret = 0;
++
++ if (a->ln_number > b->ln_number)
++ ret = 1;
++ else if (a->ln_number < b->ln_number)
++ ret = -1;
++ else {
++ if (gh_a->gh_state == LM_ST_SHARED &&
++ gh_b->gh_state == LM_ST_EXCLUSIVE)
++ ret = 1;
++ else if (!(gh_a->gh_flags & GL_LOCAL_EXCL) &&
++ (gh_b->gh_flags & GL_LOCAL_EXCL))
++ ret = 1;
++ }
++
++ return ret;
++}
++
++/**
++ * nq_m_sync - synchronously acquire more than one glock in deadlock-free order
++ * @num_gh: the number of structures
++ * @ghs: an array of struct gfs_holder structures
++ *
++ * Returns: 0 on success (all glocks acquired), -EXXX on failure (no glocks acquired)
++ */
++
++static int
++nq_m_sync(unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct gfs_holder *p[num_gh];
++ unsigned int x;
++ int error = 0;
++
++ for (x = 0; x < num_gh; x++)
++ p[x] = &ghs[x];
++
++ gfs_sort(p, num_gh, sizeof(struct gfs_holder *), glock_compare);
++
++ for (x = 0; x < num_gh; x++) {
++ p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
++
++ error = gfs_glock_nq(p[x]);
++ if (error) {
++ while (x--)
++ gfs_glock_dq(p[x]);
++ break;
++ }
++ }
++
++ return error;
++}
++
++/**
++ * gfs_glock_nq_m - acquire multiple glocks
++ * @num_gh: the number of structures
++ * @ghs: an array of struct gfs_holder structures
++ *
++ * Figure out how big an impact this function has. Either:
++ * 1) Replace this code with code that calls gfs_glock_prefetch()
++ * 2) Forget async stuff and just call nq_m_sync()
++ * 3) Leave it like it is
++ *
++ * Returns: 0 on success (all glocks acquired), -EXXX on failure (no glocks acquired)
++ */
++
++int
++gfs_glock_nq_m(unsigned int num_gh, struct gfs_holder *ghs)
++{
++ int e[num_gh];
++ unsigned int x;
++ int borked = FALSE, serious = 0;
++ int error = 0;
++
++ GFS_ASSERT(num_gh,);
++
++ if (num_gh == 1) {
++ ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
++ error = gfs_glock_nq(ghs);
++ return error;
++ }
++
++ if (!GFS_ASYNC_LM(ghs->gh_gl->gl_sbd)) {
++ error = nq_m_sync(num_gh, ghs);
++ return error;
++ }
++
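++ /* Optimistically queue all requests asynchronously with LM_FLAG_TRY;
++ if any of them fail, drop whatever was acquired and fall back to
++ nq_m_sync()'s sorted, blocking pass. */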
++ for (x = 0; x < num_gh; x++) {
++ ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
++ gfs_glock_nq(&ghs[x]);
++ }
++
++ for (x = 0; x < num_gh; x++) {
++ error = e[x] = glock_wait_internal(&ghs[x]);
++ if (error) {
++ borked = TRUE;
++ if (error != GLR_TRYFAILED && error != GLR_CANCELED)
++ serious = error;
++ }
++ }
++
++ if (!borked)
++ return 0;
++
++ for (x = 0; x < num_gh; x++)
++ if (!e[x])
++ gfs_glock_dq(&ghs[x]);
++
++ if (serious)
++ error = serious;
++ else {
++ for (x = 0; x < num_gh; x++)
++ gfs_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
++ &ghs[x]);
++ error = nq_m_sync(num_gh, ghs);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_glock_dq_m - release multiple glocks
++ * @num_gh: the number of structures
++ * @ghs: an array of struct gfs_holder structures
++ *
++ */
++
++void
++gfs_glock_dq_m(unsigned int num_gh, struct gfs_holder *ghs)
++{
++ unsigned int x;
++
++ for (x = 0; x < num_gh; x++)
++ gfs_glock_dq(&ghs[x]);
++}
++
++/**
++ * gfs_glock_prefetch_num - prefetch a glock based on lock number
++ * @sdp: the filesystem
++ * @number: the lock number
++ * @glops: the glock operations for the type of glock
++ * @state: the state to acquire the glock in
++ * @flags: modifier flags for the acquisition
++ *
++ */
++
++void
++gfs_glock_prefetch_num(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ unsigned int state, int flags)
++{
++ struct gfs_glock *gl;
++ int error;
++
++ if (atomic_read(&sdp->sd_reclaim_count) < sdp->sd_tune.gt_reclaim_limit) {
++ error = gfs_glock_get(sdp, number, glops, CREATE, &gl);
++ if (!error) {
++ gfs_glock_prefetch(gl, state, flags);
++ glock_put(gl);
++ }
++ }
++}
++
++/**
++ * gfs_lvb_hold - attach an LVB to a glock
++ * @gl: The glock in question
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_lvb_hold(struct gfs_glock *gl)
++{
++ int error = 0;
++
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_count) > 0, gl,);
++
++ lock_on_glock(gl);
++
++ atomic_inc(&gl->gl_lvb_count);
++ if (atomic_read(&gl->gl_lvb_count) == 1) {
++ glock_hold(gl);
++ GFS_ASSERT_GLOCK(!gl->gl_lvb, gl,);
++ error = gl->gl_sbd->sd_lockstruct.ls_ops->lm_hold_lvb(gl->gl_lock,
++ &gl->gl_lvb);
++ if (error) {
++ glock_put(gl);
++ atomic_dec(&gl->gl_lvb_count);
++ }
++ }
++
++ unlock_on_glock(gl);
++
++ return error;
++}
++
++/**
++ * gfs_lvb_unhold - detach an LVB from a glock
++ * @gl: The glock in question
++ *
++ */
++
++void
++gfs_lvb_unhold(struct gfs_glock *gl)
++{
++ glock_hold(gl);
++
++ lock_on_glock(gl);
++
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_lvb_count), gl,);
++ if (atomic_dec_and_test(&gl->gl_lvb_count)) {
++ GFS_ASSERT_GLOCK(gl->gl_lvb, gl,);
++ gl->gl_sbd->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock,
++ gl->gl_lvb);
++ gl->gl_lvb = NULL;
++ glock_put(gl);
++ }
++
++ unlock_on_glock(gl);
++
++ glock_put(gl);
++}
++
++/**
++ * gfs_lvb_sync - sync a LVB
++ * @gl: The glock in question
++ *
++ */
++
++void
++gfs_lvb_sync(struct gfs_glock *gl)
++{
++ GFS_ASSERT_GLOCK(atomic_read(&gl->gl_lvb_count), gl,);
++
++ lock_on_glock(gl);
++
++ GFS_ASSERT_GLOCK(gfs_glock_is_held_excl(gl), gl,);
++ gl->gl_sbd->sd_lockstruct.ls_ops->lm_sync_lvb(gl->gl_lock, gl->gl_lvb);
++
++ unlock_on_glock(gl);
++}
++
++/**
++ * gfs_glock_cb - Callback used by locking module
++ * @fsdata: Pointer to the superblock
++ * @type: Type of callback
++ * @data: Type dependent data pointer
++ *
++ * Called by the locking module when it wants to tell us something.
++ * Either we need to drop a lock or another client expired.
++ */
++
++void
++gfs_glock_cb(lm_fsdata_t * fsdata, unsigned int type, void *data)
++{
++ struct gfs_sbd *sdp = (struct gfs_sbd *)fsdata;
++ struct gfs_glock *gl;
++ struct lm_lockname *name = NULL;
++ unsigned int state = 0;
++ struct lm_async_cb *async;
++ unsigned int journal;
++
++ atomic_inc(&sdp->sd_lm_callbacks);
++
++ switch (type) {
++ case LM_CB_NEED_E:
++ name = (struct lm_lockname *)data;
++ state = LM_ST_UNLOCKED;
++ break;
++
++ case LM_CB_NEED_D:
++ name = (struct lm_lockname *)data;
++ state = LM_ST_DEFERRED;
++ break;
++
++ case LM_CB_NEED_S:
++ name = (struct lm_lockname *)data;
++ state = LM_ST_SHARED;
++ break;
++
++ case LM_CB_ASYNC:
++ async = (struct lm_async_cb *)data;
++
++ gl = gfs_glock_find(sdp, &async->lc_name);
++ GFS_ASSERT_SBD(gl, sdp,);
++ GFS_ASSERT_GLOCK(gl->gl_req_bh, gl,);
++ gl->gl_req_bh(gl, async->lc_ret);
++ glock_put(gl);
++
++ break;
++
++ case LM_CB_NEED_RECOVERY:
++ journal = *(unsigned int *)data;
++
++ gfs_add_dirty_j(sdp, journal);
++
++ if (test_bit(SDF_RECOVERD_RUN, &sdp->sd_flags))
++ wake_up_process(sdp->sd_recoverd_process);
++
++ break;
++
++ case LM_CB_DROPLOCKS:
++ gfs_gl_hash_clear(sdp, FALSE);
++ gfs_quota_scan(sdp);
++ break;
++
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,
++ printk("type = %u\n", type););
++ break;
++ }
++
++ if (name) {
++ gl = gfs_glock_find(sdp, name);
++ if (gl) {
++ if (gl->gl_ops->go_callback)
++ gl->gl_ops->go_callback(gl, state);
++ handle_callback(gl, state);
++ spin_lock(&gl->gl_spin);
++ run_queue(gl, FALSE);
++ spin_unlock(&gl->gl_spin);
++ glock_put(gl);
++ }
++ }
++}
++
++/**
++ * gfs_try_toss_inode - try to remove a particular inode from GFS' cache
++ * @sdp: the filesystem
++ * @inum: the inode number
++ *
++ */
++
++void
++gfs_try_toss_inode(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ struct gfs_glock *gl;
++ struct gfs_inode *ip;
++ int error;
++
++ error = gfs_glock_get(sdp,
++ inum->no_formal_ino, &gfs_inode_glops,
++ NO_CREATE, &gl);
++ if (error || !gl)
++ return;
++
++ if (!trylock_on_glock(gl))
++ goto out;
++
++ if (!queue_empty(gl, &gl->gl_holders))
++ goto out_unlock;
++
++ ip = gl2ip(gl);
++ if (!ip)
++ goto out_unlock;
++
++ if (atomic_read(&ip->i_count))
++ goto out_unlock;
++
++ gfs_inode_destroy(ip);
++
++ out_unlock:
++ unlock_on_glock(gl);
++
++ out:
++ glock_put(gl);
++}
++
++/**
++ * gfs_iopen_go_callback - Try to kick the inode/vnode associated with an iopen glock from memory
++ * @io_gl: the iopen glock
++ * @state: the state into which the glock should be put
++ *
++ */
++
++void
++gfs_iopen_go_callback(struct gfs_glock *io_gl, unsigned int state)
++{
++ struct gfs_glock *i_gl;
++ struct gfs_inode *ip;
++
++ if (state != LM_ST_UNLOCKED)
++ return;
++
++ spin_lock(&io_gl->gl_spin);
++ i_gl = gl2gl(io_gl);
++ if (i_gl) {
++ glock_hold(i_gl);
++ spin_unlock(&io_gl->gl_spin);
++ } else {
++ spin_unlock(&io_gl->gl_spin);
++ return;
++ }
++
++ if (trylock_on_glock(i_gl)) {
++ if (queue_empty(i_gl, &i_gl->gl_holders)) {
++ ip = gl2ip(i_gl);
++ if (ip) {
++ gfs_try_toss_vnode(ip);
++ unlock_on_glock(i_gl);
++ gfs_glock_schedule_for_reclaim(i_gl);
++ goto out;
++ }
++ }
++ unlock_on_glock(i_gl);
++ }
++
++ out:
++ glock_put(i_gl);
++}
++
++/**
++ * demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++demote_ok(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_glock_operations *glops = gl->gl_ops;
++ int demote = TRUE;
++
++ if (test_bit(GLF_STICKY, &gl->gl_flags))
++ demote = FALSE;
++ else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
++ demote = time_after_eq(jiffies,
++ gl->gl_stamp +
++ sdp->sd_tune.gt_prefetch_secs * HZ);
++ else if (glops->go_demote_ok)
++ demote = glops->go_demote_ok(gl);
++
++ return demote;
++}
++
++/**
++ * gfs_glock_schedule_for_reclaim - Add a glock to the reclaim list
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_glock_schedule_for_reclaim(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++
++ spin_lock(&sdp->sd_reclaim_lock);
++ if (list_empty(&gl->gl_reclaim)) {
++ glock_hold(gl);
++ list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
++ atomic_inc(&sdp->sd_reclaim_count);
++ }
++ spin_unlock(&sdp->sd_reclaim_lock);
++
++ wake_up(&sdp->sd_reclaim_wchan);
++}
++
++/**
++ * gfs_reclaim_glock - process a glock on the reclaim list
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_reclaim_glock(struct gfs_sbd *sdp)
++{
++ struct gfs_glock *gl;
++ struct gfs_gl_hash_bucket *bucket;
++
++ spin_lock(&sdp->sd_reclaim_lock);
++
++ if (list_empty(&sdp->sd_reclaim_list)) {
++ spin_unlock(&sdp->sd_reclaim_lock);
++ return;
++ }
++
++ gl = list_entry(sdp->sd_reclaim_list.next,
++ struct gfs_glock, gl_reclaim);
++ list_del_init(&gl->gl_reclaim);
++
++ spin_unlock(&sdp->sd_reclaim_lock);
++
++ atomic_dec(&sdp->sd_reclaim_count);
++ atomic_inc(&sdp->sd_reclaimed);
++
++ if (trylock_on_glock(gl)) {
++ if (queue_empty(gl, &gl->gl_holders)) {
++ if (gl->gl_ops == &gfs_inode_glops) {
++ struct gfs_inode *ip = gl2ip(gl);
++ if (ip && !atomic_read(&ip->i_count))
++ gfs_inode_destroy(ip);
++ }
++ if (gl->gl_state != LM_ST_UNLOCKED &&
++ demote_ok(gl))
++ handle_callback(gl, LM_ST_UNLOCKED);
++ }
++ unlock_on_glock(gl);
++ }
++
++ bucket = gl->gl_bucket;
++
++ write_lock(&bucket->hb_lock);
++ if (atomic_read(&gl->gl_count) == 1) {
++ list_del_init(&gl->gl_list);
++ write_unlock(&bucket->hb_lock);
++ glock_free(gl);
++ } else {
++ write_unlock(&bucket->hb_lock);
++ glock_put(gl);
++ }
++}
++
++/**
++ * examine_bucket - Call a function for each glock in a hash bucket
++ * @examiner: the function
++ * @sdp: the filesystem
++ * @bucket: the bucket
++ *
++ * Returns: TRUE if the bucket has entries
++ */
++
++static int
++examine_bucket(glock_examiner examiner,
++ struct gfs_sbd *sdp, struct gfs_gl_hash_bucket *bucket)
++{
++ struct glock_plug plug;
++ struct list_head *tmp;
++ struct gfs_glock *gl;
++ int entries;
++
++ memset(&plug.gl_flags, 0, sizeof(unsigned long));
++ set_bit(GLF_PLUG, &plug.gl_flags);
++
++ write_lock(&bucket->hb_lock);
++ list_add(&plug.gl_list, &bucket->hb_list);
++ write_unlock(&bucket->hb_lock);
++
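++ /* The plug is a dummy entry that marks our position in the bucket,
++ letting us drop the bucket lock while the examiner runs and then
++ resume the walk where we left off. */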
++ for (;;) {
++ write_lock(&bucket->hb_lock);
++
++ for (;;) {
++ tmp = plug.gl_list.next;
++ if (tmp == &bucket->hb_list) {
++ list_del(&plug.gl_list);
++ entries = !list_empty(&bucket->hb_list);
++ write_unlock(&bucket->hb_lock);
++ return entries;
++ }
++ gl = list_entry(tmp, struct gfs_glock, gl_list);
++
++ list_move(&plug.gl_list, &gl->gl_list);
++
++ if (test_bit(GLF_PLUG, &gl->gl_flags))
++ continue;
++
++ glock_hold(gl);
++
++ break;
++ }
++
++ write_unlock(&bucket->hb_lock);
++
++ examiner(gl);
++ }
++}
++
++/**
++ * scan_glock - look at a glock and see if we can do stuff to it
++ * @gl: the glock to look at
++ *
++ */
++
++static void
++scan_glock(struct gfs_glock *gl)
++{
++ if (trylock_on_glock(gl)) {
++ if (queue_empty(gl, &gl->gl_holders)) {
++ if (gl->gl_ops == &gfs_inode_glops) {
++ struct gfs_inode *ip = gl2ip(gl);
++ if (ip && !atomic_read(&ip->i_count)) {
++ unlock_on_glock(gl);
++ gfs_glock_schedule_for_reclaim(gl);
++ goto out;
++ }
++ }
++ if (gl->gl_state != LM_ST_UNLOCKED &&
++ demote_ok(gl)) {
++ unlock_on_glock(gl);
++ gfs_glock_schedule_for_reclaim(gl);
++ goto out;
++ }
++ }
++
++ unlock_on_glock(gl);
++ }
++
++ out:
++ glock_put(gl);
++}
++
++/**
++ * gfs_scand_internal - Look for glocks and inodes to toss from memory
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_scand_internal(struct gfs_sbd *sdp)
++{
++ unsigned int x;
++
++ for (x = 0; x < GFS_GL_HASH_SIZE; x++) {
++ examine_bucket(scan_glock, sdp, &sdp->sd_gl_hash[x]);
++ cond_resched();
++ }
++}
++
++/**
++ * clear_glock - look at a glock and see if we can do stuff to it
++ * @gl: the glock to look at
++ *
++ */
++
++static void
++clear_glock(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_gl_hash_bucket *bucket = gl->gl_bucket;
++
++ spin_lock(&sdp->sd_reclaim_lock);
++ if (!list_empty(&gl->gl_reclaim)) {
++ list_del_init(&gl->gl_reclaim);
++ atomic_dec(&sdp->sd_reclaim_count);
++ glock_put(gl);
++ }
++ spin_unlock(&sdp->sd_reclaim_lock);
++
++ if (trylock_on_glock(gl)) {
++ if (queue_empty(gl, &gl->gl_holders)) {
++ if (gl->gl_ops == &gfs_inode_glops) {
++ struct gfs_inode *ip = gl2ip(gl);
++ if (ip && !atomic_read(&ip->i_count))
++ gfs_inode_destroy(ip);
++ }
++ if (gl->gl_state != LM_ST_UNLOCKED)
++ handle_callback(gl, LM_ST_UNLOCKED);
++ }
++
++ unlock_on_glock(gl);
++ }
++
++ write_lock(&bucket->hb_lock);
++ if (atomic_read(&gl->gl_count) == 1) {
++ list_del_init(&gl->gl_list);
++ write_unlock(&bucket->hb_lock);
++ glock_free(gl);
++ } else {
++ write_unlock(&bucket->hb_lock);
++ glock_put(gl);
++ }
++}
++
++/**
++ * gfs_gl_hash_clear - Empty out the glock hash table
++ * @sdp: the filesystem
++ * @wait: wait until it's all gone
++ *
++ */
++
++void
++gfs_gl_hash_clear(struct gfs_sbd *sdp, int wait)
++{
++ unsigned long t;
++ unsigned int x;
++ int cont;
++
++ t = jiffies;
++
++ for (;;) {
++ cont = FALSE;
++
++ for (x = 0; x < GFS_GL_HASH_SIZE; x++)
++ if (examine_bucket(clear_glock, sdp, &sdp->sd_gl_hash[x]))
++ cont = TRUE;
++
++ if (!wait || !cont)
++ break;
++
++ if (time_after_eq(jiffies, t + sdp->sd_tune.gt_stall_secs * HZ)) {
++ printk("GFS: fsid=%s: Unmount seems to be stalled. Dumping lock state...\n",
++ sdp->sd_fsname);
++ gfs_dump_lockstate(sdp, NULL);
++ t = jiffies;
++ }
++
++ invalidate_inodes(sdp->sd_vfs);
++ yield();
++ }
++}
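++
++/*
++ * Usage sketch (an assumption): unmount would call
++ * gfs_gl_hash_clear(sdp, TRUE), which loops here until every bucket
++ * empties; the printk above fires if that stalls past gt_stall_secs.
++ */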
++
++/*
++ * Diagnostic routines to help debug distributed deadlock
++ */
++
++/**
++ * dump_holder - print information about a glock holder
++ * @str: a string naming the type of holder
++ * @gh: the glock holder
++ * @buf: the buffer
++ * @size: the size of the buffer
++ * @count: where we are in the buffer
++ *
++ * Returns: 0 on success, -ENOBUFS when we run out of space
++ */
++
++static int
++dump_holder(char *str, struct gfs_holder *gh,
++ char *buf, unsigned int size, unsigned int *count)
++{
++ unsigned int x;
++ int error = 0;
++
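++ /* Editor's inference: gfs_sprintf() is presumably a macro that appends
++ to @buf, bumps *count, and jumps to the "out" label with error set to
++ -ENOBUFS once @size would be exceeded; that is why "out" and "error"
++ look otherwise unused in this function. */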
++ gfs_sprintf(" %s\n", str);
++ gfs_sprintf(" owner = %ld\n",
++ (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
++ gfs_sprintf(" gh_state = %u\n", gh->gh_state);
++ gfs_sprintf(" gh_flags =");
++ for (x = 0; x < 32; x++)
++ if (gh->gh_flags & (1 << x))
++ gfs_sprintf(" %u", x);
++ gfs_sprintf(" \n");
++ gfs_sprintf(" error = %d\n", gh->gh_error);
++ gfs_sprintf(" gh_iflags =");
++ for (x = 0; x < 32; x++)
++ if (test_bit(x, &gh->gh_iflags))
++ gfs_sprintf(" %u", x);
++ gfs_sprintf(" \n");
++
++ out:
++ return error;
++}
++
++/**
++ * dump_inode - print information about an inode
++ * @ip: the inode
++ * @buf: the buffer
++ * @size: the size of the buffer
++ * @count: where we are in the buffer
++ *
++ * Returns: 0 on success, -ENOBUFS when we run out of space
++ */
++
++static int
++dump_inode(struct gfs_inode *ip,
++ char *buf, unsigned int size, unsigned int *count)
++{
++ unsigned int x;
++ int error = 0;
++
++ gfs_sprintf(" Inode:\n");
++ gfs_sprintf(" num = %" PRIu64 "/%" PRIu64 "\n",
++ ip->i_num.no_formal_ino, ip->i_num.no_addr);
++ gfs_sprintf(" type = %u\n", ip->i_di.di_type);
++ gfs_sprintf(" i_count = %d\n", atomic_read(&ip->i_count));
++ gfs_sprintf(" i_flags =");
++ for (x = 0; x < 32; x++)
++ if (test_bit(x, &ip->i_flags))
++ gfs_sprintf(" %u", x);
++ gfs_sprintf(" \n");
++ gfs_sprintf(" vnode = %s\n", (ip->i_vnode) ? "yes" : "no");
++
++ out:
++ return error;
++}
++
++/**
++ * dump_glock - print information about a glock
++ * @gl: the glock
++ * @buf: the buffer
++ * @size: the size of the buffer
++ * @count: where we are in the buffer
++ *
++ * Returns: 0 on success, -ENOBUFS when we run out of space
++ */
++
++static int
++dump_glock(struct gfs_glock *gl,
++ char *buf, unsigned int size, unsigned int *count)
++{
++ struct list_head *head, *tmp;
++ struct gfs_holder *gh;
++ unsigned int x;
++ int error = 0;
++
++ spin_lock(&gl->gl_spin);
++
++ gfs_sprintf("Glock (%u, %" PRIu64 ")\n",
++ gl->gl_name.ln_type,
++ gl->gl_name.ln_number);
++ gfs_sprintf(" gl_flags =");
++ for (x = 0; x < 32; x++)
++ if (test_bit(x, &gl->gl_flags))
++ gfs_sprintf(" %u", x);
++ gfs_sprintf(" \n");
++ gfs_sprintf(" gl_count = %d\n", atomic_read(&gl->gl_count));
++ gfs_sprintf(" gl_state = %u\n", gl->gl_state);
++ gfs_sprintf(" lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
++ gfs_sprintf(" object = %s\n", (gl->gl_object) ? "yes" : "no");
++ if (gl->gl_aspace)
++ gfs_sprintf(" aspace = %lu\n",
++ gl->gl_aspace->i_mapping->nrpages);
++ else
++ gfs_sprintf(" aspace = no\n");
++ gfs_sprintf(" reclaim = %s\n",
++ (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
++ if (gl->gl_req_gh) {
++ error = dump_holder("Request", gl->gl_req_gh, buf, size, count);
++ if (error)
++ goto out;
++ }
++ for (head = &gl->gl_holders, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gh = list_entry(tmp, struct gfs_holder, gh_list);
++ error = dump_holder("Holder", gh, buf, size, count);
++ if (error)
++ goto out;
++ }
++ for (head = &gl->gl_waiters1, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gh = list_entry(tmp, struct gfs_holder, gh_list);
++ error = dump_holder("Waiter1", gh, buf, size, count);
++ if (error)
++ goto out;
++ }
++ for (head = &gl->gl_waiters2, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gh = list_entry(tmp, struct gfs_holder, gh_list);
++ error = dump_holder("Waiter2", gh, buf, size, count);
++ if (error)
++ goto out;
++ }
++ if (gl->gl_ops == &gfs_inode_glops && gl2ip(gl)) {
++ if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
++ list_empty(&gl->gl_holders)) {
++ error = dump_inode(gl2ip(gl), buf, size, count);
++ if (error)
++ goto out;
++ } else
++ gfs_sprintf(" Inode: busy\n");
++ }
++
++ out:
++ spin_unlock(&gl->gl_spin);
++
++ return error;
++}
++
++/**
++ * gfs_dump_lockstate - print out the current lockstate
++ * @sdp: the filesystem
++ * @ub: the buffer to copy the information into
++ *
++ * If @ub is NULL, dump the lockstate to the console.
++ *
++ * Returns: 0 on success, -errno on failure
++ */
++
++int
++gfs_dump_lockstate(struct gfs_sbd *sdp, struct gfs_user_buffer *ub)
++{
++ struct gfs_gl_hash_bucket *bucket;
++ struct list_head *tmp, *head;
++ struct gfs_glock *gl;
++ char *buf = NULL;
++ unsigned int size = sdp->sd_tune.gt_lockdump_size;
++ unsigned int x, count;
++ int error = 0;
++
++ if (ub) {
++ buf = kmalloc(size, GFP_KERNEL);
++ if (!buf)
++ return -ENOMEM;
++ }
++
++ for (x = 0; x < GFS_GL_HASH_SIZE; x++) {
++ bucket = &sdp->sd_gl_hash[x];
++ count = 0;
++
++ read_lock(&bucket->hb_lock);
++
++ for (head = &bucket->hb_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gl = list_entry(tmp, struct gfs_glock, gl_list);
++
++ if (test_bit(GLF_PLUG, &gl->gl_flags))
++ continue;
++
++ error = dump_glock(gl, buf, size, &count);
++ if (error)
++ break;
++ }
++
++ read_unlock(&bucket->hb_lock);
++
++ if (error)
++ break;
++
++ if (ub) {
++ if (ub->ub_count + count > ub->ub_size) {
++ error = -ENOMEM;
++ break;
++ }
++ if (copy_to_user(ub->ub_data + ub->ub_count, buf, count)) {
++ error = -EFAULT;
++ break;
++ }
++ ub->ub_count += count;
++ }
++ }
++
++ if (ub)
++ kfree(buf);
++
++ return error;
++}
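++
++/*
++ * Caller sketch (an editor's illustration; the ioctl plumbing is an
++ * assumption): a userland lock dump would describe its buffer with a
++ * struct gfs_user_buffer, which this function fills via copy_to_user():
++ *
++ *  struct gfs_user_buffer ub;
++ *  ub.ub_data = user_ptr;
++ *  ub.ub_size = user_size;
++ *  ub.ub_count = 0;
++ *  error = gfs_dump_lockstate(sdp, &ub);
++ */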
+diff -urN linux-orig/fs/gfs/glock.h linux-patched/fs/gfs/glock.h
+--- linux-orig/fs/gfs/glock.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/glock.h 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,134 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __GFS_GLOCK_DOT_H__
++#define __GFS_GLOCK_DOT_H__
++
++/*
++#define LM_FLAG_TRY (0x00000001)
++#define LM_FLAG_TRY_1CB (0x00000002)
++#define LM_FLAG_NOEXP (0x00000004)
++#define LM_FLAG_ANY (0x00000008)
++#define LM_FLAG_PRIORITY (0x00000010)
++*/
++#define GL_LOCAL_EXCL (0x00000020)
++#define GL_ASYNC (0x00000040)
++#define GL_EXACT (0x00000080)
++#define GL_SKIP (0x00000100)
++#define GL_ATIME (0x00000200)
++#define GL_NOCACHE (0x00000400)
++#define GL_SYNC (0x00000800)
++
++#define GLR_TRYFAILED (13)
++#define GLR_CANCELED (14)
++
++static __inline__ int
++gfs_glock_is_locked_by_me(struct gfs_glock *gl)
++{
++ struct list_head *tmp, *head;
++ struct gfs_holder *gh;
++ int locked = FALSE;
++
++ spin_lock(&gl->gl_spin);
++ for (head = &gl->gl_holders, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gh = list_entry(tmp, struct gfs_holder, gh_list);
++ if (gh->gh_owner == current) {
++ locked = TRUE;
++ break;
++ }
++ }
++ spin_unlock(&gl->gl_spin);
++
++ return locked;
++}
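++
++/*
++ * Usage sketch for gfs_glock_is_locked_by_me() (an assumption): paths
++ * that require the caller to already hold the glock can assert it, e.g.
++ *  GFS_ASSERT_GLOCK(gfs_glock_is_locked_by_me(gl), gl,);
++ */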
++static __inline__ int
++gfs_glock_is_held_excl(struct gfs_glock *gl)
++{
++ return (gl->gl_state == LM_ST_EXCLUSIVE);
++}
++static __inline__ int
++gfs_glock_is_held_dfrd(struct gfs_glock *gl)
++{
++ return (gl->gl_state == LM_ST_DEFERRED);
++}
++static __inline__ int
++gfs_glock_is_held_shrd(struct gfs_glock *gl)
++{
++ return (gl->gl_state == LM_ST_SHARED);
++}
++
++#define GFS_ASYNC_LM(sdp) ((sdp)->sd_lockstruct.ls_flags & LM_LSFLAG_ASYNC)
++
++struct gfs_glock *gfs_glock_find(struct gfs_sbd *sdp,
++ struct lm_lockname *name);
++int gfs_glock_get(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ int create, struct gfs_glock **glp);
++void gfs_glock_hold(struct gfs_glock *gl);
++void gfs_glock_put(struct gfs_glock *gl);
++
++void gfs_holder_init(struct gfs_glock *gl, unsigned int state, int flags,
++ struct gfs_holder *gh);
++void gfs_holder_reinit(unsigned int state, int flags, struct gfs_holder *gh);
++void gfs_holder_uninit(struct gfs_holder *gh);
++struct gfs_holder *gfs_holder_get(struct gfs_glock *gl, unsigned int state,
++ int flags);
++void gfs_holder_put(struct gfs_holder *gh);
++
++void gfs_glock_xmote_th(struct gfs_glock *gl, unsigned int state, int flags);
++void gfs_glock_drop_th(struct gfs_glock *gl);
++
++int gfs_glock_nq(struct gfs_holder *gh);
++int gfs_glock_poll(struct gfs_holder *gh);
++int gfs_glock_wait(struct gfs_holder *gh);
++void gfs_glock_dq(struct gfs_holder *gh);
++
++void gfs_glock_prefetch(struct gfs_glock *gl, unsigned int state, int flags);
++void gfs_glock_force_drop(struct gfs_glock *gl);
++
++int gfs_glock_nq_init(struct gfs_glock *gl, unsigned int state, int flags,
++ struct gfs_holder *gh);
++void gfs_glock_dq_uninit(struct gfs_holder *gh);
++int gfs_glock_nq_num(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ unsigned int state, int flags, struct gfs_holder *gh);
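++
++/*
++ * Typical acquire/release pattern (editor's sketch; a shared hold on an
++ * inode glock is assumed):
++ *
++ *  struct gfs_holder gh;
++ *  int error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
++ *  if (error)
++ *   return error;
++ *  (access the object protected by the glock)
++ *  gfs_glock_dq_uninit(&gh);
++ */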
++
++int gfs_glock_nq_m(unsigned int num_gh, struct gfs_holder *ghs);
++void gfs_glock_dq_m(unsigned int num_gh, struct gfs_holder *ghs);
++
++void gfs_glock_prefetch_num(struct gfs_sbd *sdp,
++ uint64_t number, struct gfs_glock_operations *glops,
++ unsigned int state, int flags);
++
++/* Lock Value Block functions */
++
++int gfs_lvb_hold(struct gfs_glock *gl);
++void gfs_lvb_unhold(struct gfs_glock *gl);
++void gfs_lvb_sync(struct gfs_glock *gl);
++
++void gfs_glock_cb(lm_fsdata_t * fsdata, unsigned int type, void *data);
++
++void gfs_try_toss_inode(struct gfs_sbd *sdp, struct gfs_inum *inum);
++void gfs_iopen_go_callback(struct gfs_glock *gl, unsigned int state);
++
++void gfs_glock_schedule_for_reclaim(struct gfs_glock *gl);
++void gfs_reclaim_glock(struct gfs_sbd *sdp);
++
++void gfs_scand_internal(struct gfs_sbd *sdp);
++void gfs_gl_hash_clear(struct gfs_sbd *sdp, int wait);
++
++int gfs_dump_lockstate(struct gfs_sbd *sdp, struct gfs_user_buffer *ub);
++
++#endif /* __GFS_GLOCK_DOT_H__ */
+diff -urN linux-orig/fs/gfs/glops.c linux-patched/fs/gfs/glops.c
+--- linux-orig/fs/gfs/glops.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/glops.c 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,526 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "log.h"
++#include "page.h"
++#include "recovery.h"
++#include "rgrp.h"
++
++/**
++ * meta_go_sync - sync out the metadata for this glock
++ * @gl: the glock
++ * @flags: DIO_*
++ *
++ */
++
++static void
++meta_go_sync(struct gfs_glock *gl, int flags)
++{
++ if (!(flags & DIO_METADATA))
++ return;
++
++ if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
++ gfs_log_flush_glock(gl);
++ gfs_sync_buf(gl, flags | DIO_START | DIO_WAIT | DIO_CHECK);
++ }
++
++ clear_bit(GLF_DIRTY, &gl->gl_flags);
++ clear_bit(GLF_SYNC, &gl->gl_flags);
++}
++
++/**
++ * meta_go_inval - invalidate the metadata for this glock
++ * @gl: the glock
++ * @flags:
++ *
++ */
++
++static void
++meta_go_inval(struct gfs_glock *gl, int flags)
++{
++ if (!(flags & DIO_METADATA))
++ return;
++
++ gfs_inval_buf(gl);
++ gl->gl_vn++;
++}
++
++/**
++ * meta_go_demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++meta_go_demote_ok(struct gfs_glock *gl)
++{
++ return (gl->gl_aspace->i_mapping->nrpages) ? FALSE : TRUE;
++}
++
++/**
++ * inode_go_xmote_th - promote/demote a glock
++ * @gl: the glock
++ * @state: the requested state
++ * @flags: the flags passed into gfs_glock()
++ *
++ */
++
++static void
++inode_go_xmote_th(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ if (gl->gl_state != LM_ST_UNLOCKED)
++ gfs_inval_pte(gl);
++ gfs_glock_xmote_th(gl, state, flags);
++}
++
++/**
++ * inode_go_xmote_bh - promote/demote a glock
++ * @gl: the glock
++ *
++ * This will be really broken when (no_formal_ino != no_addr)
++ *
++ */
++
++static void
++inode_go_xmote_bh(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_holder *gh = gl->gl_req_gh;
++ struct buffer_head *bh;
++ int error;
++
++ if (gl->gl_state != LM_ST_UNLOCKED &&
++ (!gh || !(gh->gh_flags & GL_SKIP))) {
++ error = gfs_dread(sdp, gl->gl_name.ln_number, gl, DIO_START, &bh);
++ if (!error)
++ brelse(bh);
++ }
++}
++
++/**
++ * inode_go_drop_th - unlock a glock
++ * @gl: the glock
++ *
++ */
++
++static void
++inode_go_drop_th(struct gfs_glock *gl)
++{
++ gfs_inval_pte(gl);
++ gfs_glock_drop_th(gl);
++}
++
++/**
++ * inode_go_sync - Sync the dirty data for an inode glock
++ * @gl: the glock
++ * @flags:
++ *
++ */
++
++static void
++inode_go_sync(struct gfs_glock *gl, int flags)
++{
++ int meta = (flags & DIO_METADATA);
++ int data = (flags & DIO_DATA);
++
++ if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
++ if (meta && data) {
++ gfs_sync_page(gl, flags | DIO_START);
++ gfs_log_flush_glock(gl);
++ gfs_sync_buf(gl, flags | DIO_START | DIO_WAIT | DIO_CHECK);
++ gfs_sync_page(gl, flags | DIO_WAIT | DIO_CHECK);
++ } else if (meta) {
++ gfs_log_flush_glock(gl);
++ gfs_sync_buf(gl, flags | DIO_START | DIO_WAIT | DIO_CHECK);
++ } else if (data)
++ gfs_sync_page(gl, flags | DIO_START | DIO_WAIT | DIO_CHECK);
++ }
++
++ if (meta && data) {
++ if (!(flags & DIO_INVISIBLE))
++ clear_bit(GLF_DIRTY, &gl->gl_flags);
++ clear_bit(GLF_SYNC, &gl->gl_flags);
++ }
++}
++
++/**
++ * inode_go_inval - prepare an inode glock to be released
++ * @gl: the glock
++ * @flags:
++ *
++ */
++
++static void
++inode_go_inval(struct gfs_glock *gl, int flags)
++{
++ int meta = (flags & DIO_METADATA);
++ int data = (flags & DIO_DATA);
++
++ if (meta) {
++ gfs_inval_buf(gl);
++ gl->gl_vn++;
++ }
++ if (data)
++ gfs_inval_page(gl);
++}
++
++/**
++ * inode_go_demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++inode_go_demote_ok(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ int demote = FALSE;
++
++ if (!gl2ip(gl) && !gl->gl_aspace->i_mapping->nrpages)
++ demote = TRUE;
++ else if (!sdp->sd_args.ar_localcaching &&
++ time_after_eq(jiffies, gl->gl_stamp + sdp->sd_tune.gt_demote_secs * HZ))
++ demote = TRUE;
++
++ return demote;
++}
++
++/**
++ * inode_go_lock - operation done after an inode lock is locked by a process
++ * @gl: the glock
++ * @flags: the flags passed into gfs_glock()
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++inode_go_lock(struct gfs_glock *gl, int flags)
++{
++ struct gfs_inode *ip = gl2ip(gl);
++ int error = 0;
++
++ if (ip && ip->i_vn != gl->gl_vn) {
++ error = gfs_copyin_dinode(ip);
++ if (!error)
++ gfs_inode_attr_in(ip);
++ }
++
++ return error;
++}
++
++/**
++ * inode_go_unlock - operation done before an inode lock is unlocked by a process
++ * @gl: the glock
++ * @flags: the flags passed into gfs_gunlock()
++ *
++ */
++
++static void
++inode_go_unlock(struct gfs_glock *gl, int flags)
++{
++ struct gfs_inode *ip = gl2ip(gl);
++
++ if (ip && test_bit(GLF_DIRTY, &gl->gl_flags))
++ gfs_inode_attr_in(ip);
++
++ if (ip)
++ gfs_flush_meta_cache(ip);
++}
++
++/**
++ * rgrp_go_xmote_th - promote/demote a glock
++ * @gl: the glock
++ * @state: the requested state
++ * @flags: the flags passed into gfs_glock()
++ *
++ */
++
++static void
++rgrp_go_xmote_th(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ struct gfs_rgrpd *rgd = gl2rgd(gl);
++
++ GFS_ASSERT_GLOCK(rgd && gl->gl_lvb, gl,);
++
++ gfs_mhc_zap(rgd);
++ gfs_depend_sync(rgd);
++ gfs_glock_xmote_th(gl, state, flags);
++}
++
++/**
++ * rgrp_go_drop_th - unlock a glock
++ * @gl: the glock
++ *
++ */
++
++static void
++rgrp_go_drop_th(struct gfs_glock *gl)
++{
++ struct gfs_rgrpd *rgd = gl2rgd(gl);
++
++ GFS_ASSERT_GLOCK(rgd && gl->gl_lvb, gl,);
++
++ gfs_mhc_zap(rgd);
++ gfs_depend_sync(rgd);
++ gfs_glock_drop_th(gl);
++}
++
++/**
++ * rgrp_go_demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++rgrp_go_demote_ok(struct gfs_glock *gl)
++{
++ struct gfs_rgrpd *rgd = gl2rgd(gl);
++ int demote = TRUE;
++
++ if (gl->gl_aspace->i_mapping->nrpages)
++ demote = FALSE;
++ else if (rgd && !list_empty(&rgd->rd_mhc)) /* Don't bother with lock here */
++ demote = FALSE;
++
++ return demote;
++}
++
++/**
++ * rgrp_go_lock - operation done after an rgrp lock is locked by a process
++ * @gl: the glock
++ * @flags: the flags passed into gfs_glock()
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++rgrp_go_lock(struct gfs_glock *gl, int flags)
++{
++ struct gfs_rgrpd *rgd = gl2rgd(gl);
++ int error = 0;
++
++ GFS_ASSERT_GLOCK(rgd && gl->gl_lvb, gl,);
++
++ if (!(flags & GL_SKIP))
++ error = gfs_rgrp_read(rgd);
++
++ return error;
++}
++
++/**
++ * rgrp_go_unlock - operation done before an rgrp lock is unlocked by a process
++ * @gl: the glock
++ * @flags: the flags passed into gfs_gunlock()
++ *
++ */
++
++static void
++rgrp_go_unlock(struct gfs_glock *gl, int flags)
++{
++ struct gfs_rgrpd *rgd = gl2rgd(gl);
++
++ GFS_ASSERT_GLOCK(rgd && gl->gl_lvb, gl,);
++
++ if (!(flags & GL_SKIP)) {
++ gfs_rgrp_relse(rgd);
++ if (test_bit(GLF_DIRTY, &gl->gl_flags))
++ gfs_rgrp_lvb_fill(rgd);
++ }
++}
++
++/**
++ * trans_go_xmote_th - promote/demote a metadata glock
++ * @gl: the glock
++ * @state: the requested state
++ * @flags: the flags passed into gfs_glock()
++ *
++ */
++
++static void
++trans_go_xmote_th(struct gfs_glock *gl, unsigned int state, int flags)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ int error;
++
++ if (gl->gl_state != LM_ST_UNLOCKED &&
++ test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
++ gfs_sync_meta(sdp);
++
++ error = gfs_log_shutdown(sdp);
++ if (error)
++ gfs_io_error(sdp);
++ }
++
++ gfs_glock_xmote_th(gl, state, flags);
++}
++
++/**
++ * trans_go_xmote_bh - promote/demote a metadata glock
++ * @gl: the glock
++ *
++ */
++
++static void
++trans_go_xmote_bh(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_glock *j_gl = sdp->sd_journal_gh.gh_gl;
++ struct gfs_log_header head;
++ int error;
++
++ if (gl->gl_state != LM_ST_UNLOCKED &&
++ test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
++ j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
++
++ error = gfs_find_jhead(sdp, &sdp->sd_jdesc, j_gl, &head);
++ GFS_ASSERT_SBD(!error, sdp,); /* FixMe!!! */
++ GFS_ASSERT_SBD(head.lh_flags & GFS_LOG_HEAD_UNMOUNT, sdp,);
++
++ /* Initialize some head of the log stuff */
++ sdp->sd_sequence = head.lh_sequence;
++ sdp->sd_log_head = head.lh_first + 1;
++ }
++}
++
++/**
++ * trans_go_drop_th - prepare the transaction glock to be released
++ * @gl: the glock
++ *
++ * We want to sync the device even with localcaching. Remember
++ * that localcaching journal replay only marks buffers dirty.
++ */
++
++static void
++trans_go_drop_th(struct gfs_glock *gl)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ int error;
++
++ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
++ gfs_sync_meta(sdp);
++
++ error = gfs_log_shutdown(sdp);
++ if (error)
++ gfs_io_error(sdp);
++ }
++
++ gfs_glock_drop_th(gl);
++}
++
++/**
++ * nondisk_go_demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++nondisk_go_demote_ok(struct gfs_glock *gl)
++{
++ return FALSE;
++}
++
++/**
++ * quota_go_demote_ok - check to see if it's ok to unlock a glock
++ * @gl: the glock
++ *
++ * Returns: TRUE if it's ok
++ */
++
++static int
++quota_go_demote_ok(struct gfs_glock *gl)
++{
++ return !atomic_read(&gl->gl_lvb_count);
++}
++
++struct gfs_glock_operations gfs_meta_glops = {
++ .go_xmote_th = gfs_glock_xmote_th,
++ .go_drop_th = gfs_glock_drop_th,
++ .go_sync = meta_go_sync,
++ .go_inval = meta_go_inval,
++ .go_demote_ok = meta_go_demote_ok,
++ .go_type = LM_TYPE_META
++};
++
++struct gfs_glock_operations gfs_inode_glops = {
++ .go_xmote_th = inode_go_xmote_th,
++ .go_xmote_bh = inode_go_xmote_bh,
++ .go_drop_th = inode_go_drop_th,
++ .go_sync = inode_go_sync,
++ .go_inval = inode_go_inval,
++ .go_demote_ok = inode_go_demote_ok,
++ .go_lock = inode_go_lock,
++ .go_unlock = inode_go_unlock,
++ .go_type = LM_TYPE_INODE
++};
++
++struct gfs_glock_operations gfs_rgrp_glops = {
++ .go_xmote_th = rgrp_go_xmote_th,
++ .go_drop_th = rgrp_go_drop_th,
++ .go_sync = meta_go_sync,
++ .go_inval = meta_go_inval,
++ .go_demote_ok = rgrp_go_demote_ok,
++ .go_lock = rgrp_go_lock,
++ .go_unlock = rgrp_go_unlock,
++ .go_type = LM_TYPE_RGRP
++};
++
++struct gfs_glock_operations gfs_trans_glops = {
++ .go_xmote_th = trans_go_xmote_th,
++ .go_xmote_bh = trans_go_xmote_bh,
++ .go_drop_th = trans_go_drop_th,
++ .go_type = LM_TYPE_NONDISK
++};
++
++struct gfs_glock_operations gfs_iopen_glops = {
++ .go_xmote_th = gfs_glock_xmote_th,
++ .go_drop_th = gfs_glock_drop_th,
++ .go_callback = gfs_iopen_go_callback,
++ .go_type = LM_TYPE_IOPEN
++};
++
++struct gfs_glock_operations gfs_flock_glops = {
++ .go_xmote_th = gfs_glock_xmote_th,
++ .go_drop_th = gfs_glock_drop_th,
++ .go_type = LM_TYPE_FLOCK
++};
++
++struct gfs_glock_operations gfs_nondisk_glops = {
++ .go_xmote_th = gfs_glock_xmote_th,
++ .go_drop_th = gfs_glock_drop_th,
++ .go_demote_ok = nondisk_go_demote_ok,
++ .go_type = LM_TYPE_NONDISK
++};
++
++struct gfs_glock_operations gfs_quota_glops = {
++ .go_xmote_th = gfs_glock_xmote_th,
++ .go_drop_th = gfs_glock_drop_th,
++ .go_demote_ok = quota_go_demote_ok,
++ .go_type = LM_TYPE_QUOTA
++};
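++
++/*
++ * Dispatch sketch (an editor's assumption; the real helpers live in
++ * glock.c): the glock core calls through these tables with NULL checks,
++ * since each type fills in only the operations it needs. demote_ok(),
++ * used by the glock reclaim code, presumably reduces to something like:
++ *
++ *  static int demote_ok(struct gfs_glock *gl)
++ *  {
++ *   if (gl->gl_ops->go_demote_ok)
++ *    return gl->gl_ops->go_demote_ok(gl);
++ *   return TRUE;
++ *  }
++ */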
+diff -urN linux-orig/fs/gfs/glops.h linux-patched/fs/gfs/glops.h
+--- linux-orig/fs/gfs/glops.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/glops.h 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,26 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __GLOPS_DOT_H__
++#define __GLOPS_DOT_H__
++
++extern struct gfs_glock_operations gfs_meta_glops;
++extern struct gfs_glock_operations gfs_inode_glops;
++extern struct gfs_glock_operations gfs_rgrp_glops;
++extern struct gfs_glock_operations gfs_trans_glops;
++extern struct gfs_glock_operations gfs_iopen_glops;
++extern struct gfs_glock_operations gfs_flock_glops;
++extern struct gfs_glock_operations gfs_nondisk_glops;
++extern struct gfs_glock_operations gfs_quota_glops;
++
++#endif /* __GLOPS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/incore.h linux-patched/fs/gfs/incore.h
+--- linux-orig/fs/gfs/incore.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/incore.h 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,726 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __INCORE_DOT_H__
++#define __INCORE_DOT_H__
++
++#define DIO_NEW (0x00000001)
++#define DIO_FORCE (0x00000002)
++#define DIO_CLEAN (0x00000004)
++#define DIO_DIRTY (0x00000008)
++#define DIO_START (0x00000010)
++#define DIO_WAIT (0x00000020)
++#define DIO_METADATA (0x00000040)
++#define DIO_DATA (0x00000080)
++#define DIO_INVISIBLE (0x00000100)
++#define DIO_CHECK (0x00000200)
++#define DIO_ALL (0x00000400)
++
++/* Structure prototypes */
++
++struct gfs_log_operations;
++struct gfs_log_element;
++struct gfs_meta_header_cache;
++struct gfs_depend;
++struct gfs_bitmap;
++struct gfs_rgrpd;
++struct gfs_bufdata;
++struct gfs_glock_operations;
++struct gfs_holder;
++struct gfs_glock;
++struct gfs_alloc;
++struct gfs_inode;
++struct gfs_file;
++struct gfs_unlinked;
++struct gfs_quota_le;
++struct gfs_quota_data;
++struct gfs_log_buf;
++struct gfs_trans;
++struct gfs_gl_hash_bucket;
++struct gfs_sbd;
++
++typedef void (*gfs_glop_bh_t) (struct gfs_glock * gl, unsigned int ret);
++
++/*
++ * Structure of operations that are associated with each
++ * type of element in the log.
++ */
++
++struct gfs_log_operations {
++ /* Operations specific to a given log element */
++
++ void (*lo_add) (struct gfs_sbd * sdp, struct gfs_log_element * le);
++ void (*lo_trans_end) (struct gfs_sbd * sdp,
++ struct gfs_log_element * le);
++ void (*lo_print) (struct gfs_sbd * sdp, struct gfs_log_element * le,
++ unsigned int where);
++ struct gfs_trans *(*lo_overlap_trans) (struct gfs_sbd * sdp,
++ struct gfs_log_element * le);
++ void (*lo_incore_commit) (struct gfs_sbd * sdp, struct gfs_trans * tr,
++ struct gfs_log_element * le);
++ void (*lo_add_to_ail) (struct gfs_sbd * sdp,
++ struct gfs_log_element * le);
++ void (*lo_clean_dump) (struct gfs_sbd * sdp,
++ struct gfs_log_element * le);
++
++ /* Operations specific to a class of log elements */
++
++ void (*lo_trans_size) (struct gfs_sbd * sdp, struct gfs_trans * tr,
++ unsigned int *mblks, unsigned int *eblks,
++ unsigned int *blocks, unsigned int *bmem);
++ void (*lo_trans_combine) (struct gfs_sbd * sdp, struct gfs_trans * tr,
++ struct gfs_trans * new_tr);
++ void (*lo_build_bhlist) (struct gfs_sbd * sdp, struct gfs_trans * tr);
++ void (*lo_dump_size) (struct gfs_sbd * sdp, unsigned int *elements,
++ unsigned int *blocks, unsigned int *bmem);
++ void (*lo_build_dump) (struct gfs_sbd * sdp, struct gfs_trans * tr);
++
++ /* Operations that happen at recovery time */
++
++ void (*lo_before_scan) (struct gfs_sbd * sdp, unsigned int jid,
++ struct gfs_log_header * head,
++ unsigned int pass);
++ int (*lo_scan_elements) (struct gfs_sbd * sdp,
++ struct gfs_jindex * jdesc,
++ struct gfs_glock * gl, uint64_t start,
++ struct gfs_log_descriptor * desc,
++ unsigned int pass);
++ void (*lo_after_scan) (struct gfs_sbd * sdp, unsigned int jid,
++ unsigned int pass);
++
++ char *lo_name;
++};
++
++/*
++ * Structure that gets added to struct gfs_trans->tr_elements. They
++ * make up the "stuff" in each transaction.
++ */
++
++struct gfs_log_element {
++ struct gfs_log_operations *le_ops;
++
++ struct gfs_trans *le_trans;
++ struct list_head le_list;
++};
++
++struct gfs_meta_header_cache {
++ struct list_head mc_list_hash;
++ struct list_head mc_list_single;
++ struct list_head mc_list_rgd;
++
++ uint64_t mc_block;
++ struct gfs_meta_header mc_mh;
++};
++
++struct gfs_depend {
++ struct list_head gd_list_hash;
++ struct list_head gd_list_rgd;
++
++ struct gfs_rgrpd *gd_rgd;
++ uint64_t gd_formal_ino;
++ unsigned long gd_time;
++};
++
++/*
++ * Structure containing information about the allocation bitmaps.
++ * There is one of these for each fs block that the bitmap for
++ * the resource group header covers.
++ */
++
++struct gfs_bitmap {
++ uint32_t bi_offset; /* The offset in the buffer of the first byte */
++ uint32_t bi_start; /* The position of the first byte in this block */
++ uint32_t bi_len; /* The number of bytes in this block */
++};
++
++/*
++ * Structure containing information about Resource Groups
++ */
++
++struct gfs_rgrpd {
++ struct list_head rd_list; /* Link with superblock */
++ struct list_head rd_list_mru;
++ struct list_head rd_recent; /* Recently used rgrps */
++
++ struct gfs_glock *rd_gl; /* Glock for rgrp */
++
++ unsigned long rd_flags;
++
++ struct gfs_rindex rd_ri; /* Resource Index structure */
++ struct gfs_rgrp rd_rg; /* Resource Group structure */
++ uint64_t rd_rg_vn;
++
++ struct gfs_bitmap *rd_bits;
++ struct buffer_head **rd_bh;
++
++ uint32_t rd_last_alloc_data;
++ uint32_t rd_last_alloc_meta;
++
++ struct list_head rd_mhc;
++ struct list_head rd_depend;
++
++ struct gfs_sbd *rd_sbd;
++};
++
++/*
++ * Per-buffer data
++ */
++
++struct gfs_bufdata {
++ struct buffer_head *bd_bh; /* struct buffer_head which this struct belongs to */
++ struct gfs_glock *bd_gl; /* Pointer to Glock struct for this bh */
++
++ struct gfs_log_element bd_new_le;
++ struct gfs_log_element bd_incore_le;
++
++ char *bd_frozen;
++ struct semaphore bd_lock;
++
++ unsigned int bd_pinned; /* Pin count */
++ struct list_head bd_ail_tr_list; /* List of buffers hanging off tr_ail_bufs */
++ struct list_head bd_ail_gl_list; /* List of buffers hanging off gl_ail_bufs */
++};
++
++/*
++ * Glock operations
++ */
++
++struct gfs_glock_operations {
++ void (*go_xmote_th) (struct gfs_glock * gl, unsigned int state,
++ int flags);
++ void (*go_xmote_bh) (struct gfs_glock * gl);
++ void (*go_drop_th) (struct gfs_glock * gl);
++ void (*go_drop_bh) (struct gfs_glock * gl);
++ void (*go_sync) (struct gfs_glock * gl, int flags);
++ void (*go_inval) (struct gfs_glock * gl, int flags);
++ int (*go_demote_ok) (struct gfs_glock * gl);
++ int (*go_lock) (struct gfs_glock * gl, int flags);
++ void (*go_unlock) (struct gfs_glock * gl, int flags);
++ void (*go_callback) (struct gfs_glock * gl, unsigned int state);
++ int go_type;
++};
++
++/* Actions */
++#define HIF_MUTEX (0)
++#define HIF_PROMOTE (1)
++#define HIF_DEMOTE (2)
++
++/* States */
++#define HIF_ALLOCED (3)
++#define HIF_DEALLOC (4)
++#define HIF_HOLDER (5)
++#define HIF_FIRST (6)
++#define HIF_WAKEUP (7)
++#define HIF_RECURSE (8)
++
++struct gfs_holder {
++ struct list_head gh_list;
++
++ struct gfs_glock *gh_gl;
++ struct task_struct *gh_owner;
++ unsigned int gh_state;
++ int gh_flags;
++
++ int gh_error;
++ unsigned long gh_iflags;
++ struct completion gh_wait;
++};
++
++/*
++ * Glock Structure
++ */
++
++#define GLF_PLUG (0)
++#define GLF_LOCK (1)
++#define GLF_STICKY (2)
++#define GLF_PREFETCH (3)
++#define GLF_SYNC (4)
++#define GLF_DIRTY (5)
++#define GLF_LVB_INVALID (6)
++
++struct gfs_glock {
++ struct list_head gl_list;
++ unsigned long gl_flags;
++ struct lm_lockname gl_name;
++ atomic_t gl_count;
++
++ spinlock_t gl_spin;
++
++ unsigned int gl_state;
++ struct list_head gl_holders;
++ struct list_head gl_waiters1; /* HIF_MUTEX */
++ struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_PROMOTE */
++
++ struct gfs_glock_operations *gl_ops;
++
++ struct gfs_holder *gl_req_gh;
++ gfs_glop_bh_t gl_req_bh;
++
++ lm_lock_t *gl_lock;
++ char *gl_lvb;
++ atomic_t gl_lvb_count;
++
++ uint64_t gl_vn;
++ unsigned long gl_stamp;
++ void *gl_object;
++
++ struct gfs_log_element gl_new_le;
++ struct gfs_log_element gl_incore_le;
++
++ struct gfs_gl_hash_bucket *gl_bucket;
++ struct list_head gl_reclaim;
++
++ struct gfs_sbd *gl_sbd;
++
++ struct inode *gl_aspace;
++ struct list_head gl_dirty_buffers;
++ struct list_head gl_ail_bufs;
++};
++
++/*
++ * In-Place Reservation structure
++ */
++
++struct gfs_alloc {
++ /* Quota stuff */
++
++ unsigned int al_qd_num;
++ struct gfs_quota_data *al_qd[4];
++ struct gfs_holder al_qd_ghs[4];
++
++ /* Filled in by the caller to gfs_inplace_reserve() */
++
++ uint32_t al_requested_di;
++ uint32_t al_requested_meta;
++ uint32_t al_requested_data;
++
++ /* Filled in by gfs_inplace_reserve() */
++
++ char *al_file;
++ unsigned int al_line;
++ struct gfs_holder al_ri_gh;
++ struct gfs_holder al_rgd_gh;
++ struct gfs_rgrpd *al_rgd;
++ uint32_t al_reserved_meta;
++ uint32_t al_reserved_data;
++
++ /* Filled in by gfs_blkalloc() */
++
++ uint32_t al_alloced_di;
++ uint32_t al_alloced_meta;
++ uint32_t al_alloced_data;
++
++ /* Dinode allocation crap */
++
++ struct gfs_unlinked *al_ul;
++};
++
++/*
++ * Incore inode structure
++ */
++
++#define GIF_QD_LOCKED (0)
++#define GIF_PAGED (1)
++#define GIF_SW_PAGED (2)
++
++struct gfs_inode {
++ struct gfs_inum i_num;
++
++ atomic_t i_count;
++ unsigned long i_flags;
++
++ uint64_t i_vn;
++ struct gfs_dinode i_di;
++
++ struct gfs_glock *i_gl;
++ struct gfs_sbd *i_sbd;
++ struct inode *i_vnode;
++
++ struct gfs_holder i_iopen_gh;
++
++ struct gfs_alloc *i_alloc;
++ uint64_t i_last_rg_alloc;
++
++ struct task_struct *i_creat_task;
++ pid_t i_creat_pid;
++
++ spinlock_t i_lock;
++ struct buffer_head *i_cache[GFS_MAX_META_HEIGHT];
++};
++
++/*
++ * GFS per-fd structure
++ */
++
++#define GFF_DID_DIRECT_ALLOC (0)
++
++struct gfs_file {
++ unsigned long f_flags;
++
++ struct semaphore f_fl_lock;
++ struct gfs_holder f_fl_gh;
++
++ struct gfs_inode *f_inode;
++ struct file *f_vfile;
++};
++
++/*
++ * Unlinked inode log entry
++ */
++
++#define ULF_NEW_UL (0)
++#define ULF_INCORE_UL (1)
++#define ULF_IC_LIST (2)
++#define ULF_OD_LIST (3)
++#define ULF_LOCK (4)
++
++struct gfs_unlinked {
++ struct list_head ul_list;
++ unsigned int ul_count;
++
++ struct gfs_inum ul_inum;
++ unsigned long ul_flags;
++
++ struct gfs_log_element ul_new_le;
++ struct gfs_log_element ul_incore_le;
++ struct gfs_log_element ul_ondisk_le;
++};
++
++/*
++ * Quota log element
++ */
++
++struct gfs_quota_le {
++ struct gfs_log_element ql_le;
++
++ struct gfs_quota_data *ql_data;
++ struct list_head ql_data_list;
++
++ int64_t ql_change;
++};
++
++#define QDF_USER (0)
++#define QDF_OD_LIST (1)
++#define QDF_LOCK (2)
++
++struct gfs_quota_data {
++ struct list_head qd_list;
++ unsigned int qd_count;
++
++ uint32_t qd_id;
++ unsigned long qd_flags;
++
++ struct list_head qd_le_list;
++
++ int64_t qd_change_new;
++ int64_t qd_change_ic;
++ int64_t qd_change_od;
++ int64_t qd_change_sync;
++
++ struct gfs_quota_le qd_ondisk_ql;
++ uint64_t qd_sync_gen;
++
++ struct gfs_glock *qd_gl;
++ struct gfs_quota_lvb qd_qb;
++
++ unsigned long qd_last_warn;
++};
++
++struct gfs_log_buf {
++ struct list_head lb_list;
++
++ struct buffer_head lb_bh;
++ struct buffer_head *lb_unlock;
++};
++
++/*
++ * Transaction structures
++ */
++
++#define TRF_LOG_DUMP (0x00000001)
++
++struct gfs_trans {
++ struct list_head tr_list;
++
++ /* Initial creation stuff */
++
++ char *tr_file;
++ unsigned int tr_line;
++
++ unsigned int tr_mblks_asked; /* Number of log blocks asked to be reserved */
++ unsigned int tr_eblks_asked;
++ unsigned int tr_seg_reserved; /* Number of segments reserved */
++
++ struct gfs_holder *tr_t_gh;
++
++ /* Stuff filled in during creation */
++
++ unsigned int tr_flags;
++ struct list_head tr_elements;
++
++ /* Stuff modified during the commit */
++
++ unsigned int tr_num_free_bufs;
++ struct list_head tr_free_bufs;
++ unsigned int tr_num_free_bmem;
++ struct list_head tr_free_bmem;
++
++ uint64_t tr_log_head; /* The current log head */
++ uint64_t tr_first_head; /* First header block */
++
++ struct list_head tr_bufs; /* List of buffers going to the log */
++
++ /* Stuff that's part of the AIL */
++
++ struct list_head tr_ail_bufs;
++
++ /* Private data for different log element types */
++
++ unsigned int tr_num_gl;
++ unsigned int tr_num_buf;
++ unsigned int tr_num_iul;
++ unsigned int tr_num_ida;
++ unsigned int tr_num_q;
++};
++
++/*
++ * One bucket of the glock hash table.
++ */
++
++struct gfs_gl_hash_bucket {
++ rwlock_t hb_lock;
++ struct list_head hb_list;
++} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
++
++/*
++ * Super Block Data Structure (One per filesystem)
++ */
++
++#define SDF_JOURNAL_LIVE (0)
++#define SDF_SCAND_RUN (1)
++#define SDF_GLOCKD_RUN (2)
++#define SDF_RECOVERD_RUN (3)
++#define SDF_LOGD_RUN (4)
++#define SDF_QUOTAD_RUN (5)
++#define SDF_INODED_RUN (6)
++#define SDF_NOATIME (7)
++#define SDF_ROFS (8)
++#define SDF_NEED_LOG_DUMP (9)
++#define SDF_FOUND_UL_DUMP (10)
++#define SDF_FOUND_Q_DUMP (11)
++#define SDF_IN_LOG_DUMP (12)
++
++#define GFS_GL_HASH_SHIFT (13)
++#define GFS_GL_HASH_SIZE (1 << GFS_GL_HASH_SHIFT)
++#define GFS_GL_HASH_MASK (GFS_GL_HASH_SIZE - 1)
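++
++/* Editor's note (an assumption): a lock name presumably maps to its
++ bucket as sd_gl_hash[hash(ln_type, ln_number) & GFS_GL_HASH_MASK];
++ the actual hash function lives in glock.c. */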
++
++#define GFS_MHC_HASH_SHIFT (10)
++#define GFS_MHC_HASH_SIZE (1 << GFS_MHC_HASH_SHIFT)
++#define GFS_MHC_HASH_MASK (GFS_MHC_HASH_SIZE - 1)
++
++#define GFS_DEPEND_HASH_SHIFT (10)
++#define GFS_DEPEND_HASH_SIZE (1 << GFS_DEPEND_HASH_SHIFT)
++#define GFS_DEPEND_HASH_MASK (GFS_DEPEND_HASH_SIZE - 1)
++
++struct gfs_sbd {
++ struct gfs_sb sd_sb; /* Super Block */
++
++ struct super_block *sd_vfs; /* FS's device independent sb */
++
++ struct gfs_args sd_args;
++ unsigned long sd_flags;
++
++ struct gfs_tune sd_tune; /* FS tuning structure */
++
++ /* Resource group stuff */
++
++ struct gfs_inode *sd_riinode; /* rindex inode */
++ uint64_t sd_riinode_vn; /* Version number of the resource index inode */
++
++ struct list_head sd_rglist; /* List of resource groups */
++ struct semaphore sd_rindex_lock;
++
++ struct list_head sd_rg_mru_list; /* List of resource groups in MRU order */
++ spinlock_t sd_rg_mru_lock; /* Lock for MRU list */
++ struct list_head sd_rg_recent; /* Recently used rgrps */
++ spinlock_t sd_rg_recent_lock;
++ struct gfs_rgrpd *sd_rg_forward; /* Next new rgrp to try for allocation */
++ spinlock_t sd_rg_forward_lock;
++
++ unsigned int sd_rgcount; /* Count of resource groups */
++
++ /* Constants computed on mount */
++
++ uint32_t sd_fsb2bb;
++ uint32_t sd_fsb2bb_shift; /* Shift FS Block numbers to the left by
++ this to get buffer cache blocks */
++ uint32_t sd_diptrs; /* Number of pointers in a dinode */
++ uint32_t sd_inptrs; /* Number of pointers in an indirect block */
++ uint32_t sd_jbsize; /* Size of a journaled data block */
++ uint32_t sd_hash_bsize; /* sizeof(exhash block) */
++ uint32_t sd_hash_bsize_shift;
++ uint32_t sd_hash_ptrs; /* Number of pointers in a hash block */
++ uint32_t sd_max_dirres; /* Maximum space needed to add a directory entry */
++ uint32_t sd_max_height; /* Maximum height of a file's metadata tree */
++ uint64_t sd_heightsize[GFS_MAX_META_HEIGHT];
++ uint32_t sd_max_jheight; /* Maximum height of a journaled file's metadata tree */
++ uint64_t sd_jheightsize[GFS_MAX_META_HEIGHT];
++
++ /* Lock Stuff */
++
++ struct gfs_gl_hash_bucket sd_gl_hash[GFS_GL_HASH_SIZE];
++
++ struct list_head sd_reclaim_list;
++ spinlock_t sd_reclaim_lock;
++ wait_queue_head_t sd_reclaim_wchan;
++ atomic_t sd_reclaim_count;
++
++ struct lm_lockstruct sd_lockstruct;
++
++ struct list_head sd_mhc[GFS_MHC_HASH_SIZE];
++ struct list_head sd_mhc_single;
++ spinlock_t sd_mhc_lock;
++ atomic_t sd_mhc_count;
++
++ struct list_head sd_depend[GFS_DEPEND_HASH_SIZE];
++ spinlock_t sd_depend_lock;
++ atomic_t sd_depend_count;
++
++ struct gfs_holder sd_live_gh;
++
++ struct gfs_holder sd_freeze_gh;
++ struct semaphore sd_freeze_lock;
++ unsigned int sd_freeze_count;
++
++ /* Inode Stuff */
++
++ struct gfs_inode *sd_rooti; /* FS's root inode */
++
++ struct gfs_glock *sd_rename_gl; /* rename glock */
++
++ /* Daemon stuff */
++
++ struct task_struct *sd_scand_process;
++ unsigned int sd_glockd_num;
++ struct task_struct *sd_recoverd_process;
++ struct task_struct *sd_logd_process;
++ struct task_struct *sd_quotad_process;
++ struct task_struct *sd_inoded_process;
++
++ struct semaphore sd_thread_lock;
++ struct completion sd_thread_completion;
++
++ /* Log stuff */
++
++ struct gfs_glock *sd_trans_gl; /* transaction glock */
++
++ struct gfs_inode *sd_jiinode; /* jindex inode */
++ uint64_t sd_jiinode_vn; /* Version number of the journal index inode */
++
++ unsigned int sd_journals; /* Number of journals in the FS */
++ struct gfs_jindex *sd_jindex; /* Array of Jindex structures describing this FS's journals */
++ struct semaphore sd_jindex_lock;
++ unsigned long sd_jindex_refresh_time;
++
++ struct gfs_jindex sd_jdesc; /* Jindex structure describing this machine's journal */
++ struct gfs_holder sd_journal_gh; /* the glock for this machine's journal */
++
++ uint64_t sd_sequence; /* Assigned to xactions in order they commit */
++ uint64_t sd_log_head; /* Block number of next journal write */
++ uint64_t sd_log_wrap;
++
++ spinlock_t sd_log_seg_lock;
++ unsigned int sd_log_seg_free; /* Free segments in the log */
++ struct list_head sd_log_seg_list;
++ wait_queue_head_t sd_log_seg_wait;
++
++ struct list_head sd_log_ail; /* struct gfs_trans structures that form the Active Items List;
++ "next" is the head, "prev" is the tail */
++
++ struct list_head sd_log_incore; /* transactions that have been committed incore (but not ondisk);
++ "next" is the newest, "prev" is the oldest */
++ unsigned int sd_log_buffers; /* Number of buffers in the incore log */
++
++ struct semaphore sd_log_lock; /* Lock for access to log values */
++
++ uint64_t sd_log_dump_last;
++ uint64_t sd_log_dump_last_wrap;
++
++ /* unlinked crap */
++
++ struct list_head sd_unlinked_list;
++ spinlock_t sd_unlinked_lock;
++
++ atomic_t sd_unlinked_ic_count;
++ atomic_t sd_unlinked_od_count;
++
++ /* quota crap */
++
++ struct list_head sd_quota_list;
++ spinlock_t sd_quota_lock;
++
++ atomic_t sd_quota_count;
++ atomic_t sd_quota_od_count;
++
++ struct gfs_inode *sd_qinode;
++
++ uint64_t sd_quota_sync_gen;
++ unsigned long sd_quota_sync_time;
++
++ /* license crap */
++
++ struct gfs_inode *sd_linode;
++
++ /* Recovery stuff */
++
++ struct list_head sd_dirty_j;
++ spinlock_t sd_dirty_j_lock;
++
++ unsigned int sd_recovery_replays;
++ unsigned int sd_recovery_skips;
++ unsigned int sd_recovery_sames;
++
++ /* Counters */
++
++ atomic_t sd_glock_count;
++ atomic_t sd_glock_held_count;
++ atomic_t sd_inode_count;
++ atomic_t sd_bufdata_count;
++ atomic_t sd_fh2dentry_misses;
++ atomic_t sd_reclaimed;
++ atomic_t sd_glock_nq_calls;
++ atomic_t sd_glock_dq_calls;
++ atomic_t sd_glock_prefetch_calls;
++ atomic_t sd_lm_lock_calls;
++ atomic_t sd_lm_unlock_calls;
++ atomic_t sd_lm_callbacks;
++ atomic_t sd_ops_address;
++ atomic_t sd_ops_dentry;
++ atomic_t sd_ops_export;
++ atomic_t sd_ops_file;
++ atomic_t sd_ops_inode;
++ atomic_t sd_ops_super;
++ atomic_t sd_ops_vm;
++
++ char sd_fsname[256];
++
++ /* Debugging crud */
++
++ unsigned long sd_last_readdirplus;
++ unsigned long sd_last_unlocked_aop;
++
++ spinlock_t sd_ail_lock;
++ struct list_head sd_recovery_bufs;
++};
++
++#endif /* __INCORE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/inode.c linux-patched/fs/gfs/inode.c
+--- linux-orig/fs/gfs/inode.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/inode.c 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,1993 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/xattr_acl.h>
++
++#include "gfs.h"
++#include "acl.h"
++#include "bmap.h"
++#include "dio.h"
++#include "dir.h"
++#include "eattr.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "log.h"
++#include "ops_address.h"
++#include "ops_file.h"
++#include "ops_inode.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++#include "unlinked.h"
++
++/**
++ * inode_attr_in - Copy attributes from the dinode into the VFS inode
++ * @ip: The GFS inode
++ * @ino: The VFS inode to fill in
++ *
++ */
++
++static void
++inode_attr_in(struct gfs_inode *ip, struct inode *ino)
++{
++ unsigned int mode;
++
++ ino->i_ino = ip->i_num.no_formal_ino;
++
++ switch (ip->i_di.di_type) {
++ case GFS_FILE_REG:
++ mode = S_IFREG;
++ ino->i_rdev = 0;
++ break;
++ case GFS_FILE_DIR:
++ mode = S_IFDIR;
++ ino->i_rdev = 0;
++ break;
++ case GFS_FILE_LNK:
++ mode = S_IFLNK;
++ ino->i_rdev = 0;
++ break;
++ case GFS_FILE_BLK:
++ mode = S_IFBLK;
++ ino->i_rdev = MKDEV(ip->i_di.di_major, ip->i_di.di_minor);
++ break;
++ case GFS_FILE_CHR:
++ mode = S_IFCHR;
++ ino->i_rdev = MKDEV(ip->i_di.di_major, ip->i_di.di_minor);
++ break;
++ case GFS_FILE_FIFO:
++ mode = S_IFIFO;
++ ino->i_rdev = 0;
++ break;
++ case GFS_FILE_SOCK:
++ mode = S_IFSOCK;
++ ino->i_rdev = 0;
++ break;
++ default:
++ GFS_ASSERT_INODE(FALSE, ip,
++ printk("type = %u\n", ip->i_di.di_type););
++ break;
++ }
++
++ ino->i_mode = mode | (ip->i_di.di_mode & S_IALLUGO);
++ ino->i_nlink = ip->i_di.di_nlink;
++ ino->i_uid = ip->i_di.di_uid;
++ ino->i_gid = ip->i_di.di_gid;
++ i_size_write(ino, ip->i_di.di_size);
++ ino->i_atime.tv_sec = ip->i_di.di_atime;
++ ino->i_mtime.tv_sec = ip->i_di.di_mtime;
++ ino->i_ctime.tv_sec = ip->i_di.di_ctime;
++ ino->i_atime.tv_nsec = ino->i_mtime.tv_nsec = ino->i_ctime.tv_nsec = 0;
++ ino->i_blksize = PAGE_SIZE;
++ ino->i_blocks = ip->i_di.di_blocks <<
++ (ip->i_sbd->sd_sb.sb_bsize_shift - GFS_BASIC_BLOCK_SHIFT);
++ ino->i_generation = ip->i_di.di_header.mh_incarn;
++}
++
++/**
++ * gfs_inode_attr_in - Copy attributes from the dinode into the VFS inode
++ * @ip: The GFS inode
++ *
++ */
++
++void
++gfs_inode_attr_in(struct gfs_inode *ip)
++{
++ struct inode *inode;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (inode) {
++ inode_attr_in(ip, inode);
++ iput(inode);
++ }
++}
++
++/**
++ * gfs_inode_attr_out - Copy attributes from VFS inode into the dinode
++ * @ip: The GFS inode
++ *
++ * Only copy out the attributes that we want the VFS layer
++ * to be able to modify.
++ */
++
++void
++gfs_inode_attr_out(struct gfs_inode *ip)
++{
++ struct inode *inode;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (inode) {
++ ip->i_di.di_mode = inode->i_mode & S_IALLUGO;
++ ip->i_di.di_uid = inode->i_uid;
++ ip->i_di.di_gid = inode->i_gid;
++ ip->i_di.di_atime = inode->i_atime.tv_sec;
++ ip->i_di.di_mtime = inode->i_mtime.tv_sec;
++ ip->i_di.di_ctime = inode->i_ctime.tv_sec;
++ iput(inode);
++ }
++}
++
++/**
++ * gfs_iget - Get/Create a struct inode for a struct gfs_inode
++ * @ip: the struct gfs_inode to get the struct inode for
++ * @create: whether to allocate a new VFS inode if one doesn't exist
++ *
++ * Returns: An inode
++ */
++
++struct inode *
++gfs_iget(struct gfs_inode *ip, int create)
++{
++ struct inode *inode = NULL, *tmp;
++
++ spin_lock(&ip->i_lock);
++ if (ip->i_vnode)
++ inode = igrab(ip->i_vnode);
++ spin_unlock(&ip->i_lock);
++
++ if (inode || !create)
++ return inode;
++
++ tmp = new_inode(ip->i_sbd->sd_vfs);
++ if (!tmp)
++ return NULL;
++
++ inode_attr_in(ip, tmp);
++
++ if (ip->i_di.di_type == GFS_FILE_REG) {
++ tmp->i_op = &gfs_file_iops;
++ tmp->i_fop = &gfs_file_fops;
++ tmp->i_mapping->a_ops = &gfs_file_aops;
++ } else if (ip->i_di.di_type == GFS_FILE_DIR) {
++ tmp->i_op = &gfs_dir_iops;
++ tmp->i_fop = &gfs_dir_fops;
++ } else if (ip->i_di.di_type == GFS_FILE_LNK) {
++ tmp->i_op = &gfs_symlink_iops;
++ } else {
++ tmp->i_op = &gfs_dev_iops;
++ init_special_inode(tmp, tmp->i_mode, tmp->i_rdev);
++ }
++
++ vn2ip(tmp) = NULL;
++
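++ /* Another task may have raced us and installed its own VFS inode in
++ ip->i_vnode while we were building tmp; if so, grab a reference on
++ that one and discard tmp. Otherwise, install tmp under i_lock. */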
++ for (;;) {
++ spin_lock(&ip->i_lock);
++ if (!ip->i_vnode)
++ break;
++ inode = igrab(ip->i_vnode);
++ spin_unlock(&ip->i_lock);
++
++ if (inode) {
++ iput(tmp);
++ return inode;
++ }
++ yield();
++ }
++
++ inode = tmp;
++
++ gfs_inode_hold(ip);
++ ip->i_vnode = inode;
++ vn2ip(inode) = ip;
++
++ spin_unlock(&ip->i_lock);
++
++ insert_inode_hash(inode);
++
++ return inode;
++}
++
++/**
++ * gfs_copyin_dinode - Refresh the incore copy of the dinode
++ * @ip: The GFS inode
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_copyin_dinode(struct gfs_inode *ip)
++{
++ struct buffer_head *dibh;
++ int error;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ gfs_metatype_check(ip->i_sbd, dibh, GFS_METATYPE_DI);
++ gfs_dinode_in(&ip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ GFS_ASSERT_INODE(ip->i_num.no_formal_ino ==
++ ip->i_di.di_num.no_formal_ino, ip,
++ gfs_dinode_print(&ip->i_di););
++
++ /* Handle a moved inode */
++
++ if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
++ /* Not implemented yet */
++ GFS_ASSERT_INODE(FALSE, ip,);
++ }
++
++ ip->i_vn = ip->i_gl->gl_vn;
++
++ return 0;
++}
++
++/**
++ * inode_create - create a struct gfs_inode
++ * @i_gl: The glock covering the inode
++ * @inum: The inode number
++ * @io_gl: the iopen glock, or NULL
++ * @io_state: the state the iopen glock should be acquire in
++ * @ipp: pointer to put the returned inode in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++inode_create(struct gfs_glock *i_gl, struct gfs_inum *inum,
++ struct gfs_glock *io_gl, unsigned int io_state,
++ struct gfs_inode **ipp)
++{
++ struct gfs_sbd *sdp = i_gl->gl_sbd;
++ struct gfs_inode *ip;
++ int error = 0;
++
++ RETRY_MALLOC(ip = kmem_cache_alloc(gfs_inode_cachep, GFP_KERNEL), ip);
++ memset(ip, 0, sizeof(struct gfs_inode));
++
++ ip->i_num = *inum;
++
++ atomic_set(&ip->i_count, 1);
++
++ ip->i_gl = i_gl;
++ ip->i_sbd = sdp;
++
++ spin_lock_init(&ip->i_lock);
++
++ error = gfs_glock_nq_init(io_gl,
++ io_state, GL_LOCAL_EXCL | GL_EXACT,
++ &ip->i_iopen_gh);
++ if (error)
++ goto fail;
++
++ ip->i_iopen_gh.gh_owner = NULL;
++
++ spin_lock(&io_gl->gl_spin);
++ gfs_glock_hold(i_gl);
++ gl2gl(io_gl) = i_gl;
++ spin_unlock(&io_gl->gl_spin);
++
++ error = gfs_copyin_dinode(ip);
++ if (error)
++ goto fail_iopen;
++
++ gfs_glock_hold(i_gl);
++ gl2ip(i_gl) = ip;
++
++ atomic_inc(&sdp->sd_inode_count);
++
++ *ipp = ip;
++
++ return 0;
++
++ fail_iopen:
++ spin_lock(&io_gl->gl_spin);
++ gl2gl(io_gl) = NULL;
++ gfs_glock_put(i_gl);
++ spin_unlock(&io_gl->gl_spin);
++
++ gfs_glock_dq_uninit(&ip->i_iopen_gh);
++
++ fail:
++ gfs_flush_meta_cache(ip);
++ kmem_cache_free(gfs_inode_cachep, ip);
++ *ipp = NULL;
++
++ return error;
++}
++
++/**
++ * gfs_inode_get - Get an inode given its number
++ * @i_gl: The glock covering the inode
++ * @inum: The inode number
++ * @create: Flag to say if we are allowed to create a new struct gfs_inode
++ * @ipp: pointer to put the returned inode in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_inode_get(struct gfs_glock *i_gl, struct gfs_inum *inum, int create,
++ struct gfs_inode **ipp)
++{
++ struct gfs_glock *io_gl;
++ int error = 0;
++
++ *ipp = gl2ip(i_gl);
++ if (*ipp) {
++ atomic_inc(&(*ipp)->i_count);
++ GFS_ASSERT_INODE((*ipp)->i_num.no_formal_ino ==
++ inum->no_formal_ino,
++ (*ipp),);
++ } else if (create) {
++ error = gfs_glock_get(i_gl->gl_sbd,
++ inum->no_addr, &gfs_iopen_glops,
++ CREATE, &io_gl);
++ if (!error) {
++ error = inode_create(i_gl, inum, io_gl,
++ LM_ST_SHARED, ipp);
++ gfs_glock_put(io_gl);
++ }
++ }
++
++ return error;
++}
++
++/**
++ * gfs_inode_hold - hold a struct gfs_inode structure
++ * @ip: The GFS inode
++ *
++ */
++
++void
++gfs_inode_hold(struct gfs_inode *ip)
++{
++ GFS_ASSERT_INODE(atomic_read(&ip->i_count), ip,);
++ atomic_inc(&ip->i_count);
++}
++
++/**
++ * gfs_inode_put - put a struct gfs_inode structure
++ * @ip: The GFS inode
++ *
++ */
++
++void
++gfs_inode_put(struct gfs_inode *ip)
++{
++ atomic_dec(&ip->i_count);
++ GFS_ASSERT_INODE(atomic_read(&ip->i_count) >= 0, ip,);
++}
++
++/**
++ * gfs_inode_destroy - Destroy an inode structure with no references on it
++ * @ip: The GFS inode
++ *
++ * This function must be called with a glock held on the inode.
++ *
++ */
++
++void
++gfs_inode_destroy(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_glock *io_gl = ip->i_iopen_gh.gh_gl;
++ struct gfs_glock *i_gl = ip->i_gl;
++
++ GFS_ASSERT_INODE(!atomic_read(&ip->i_count), ip,);
++ GFS_ASSERT_INODE(gl2gl(io_gl) == i_gl, ip,);
++
++ spin_lock(&io_gl->gl_spin);
++ gl2gl(io_gl) = NULL;
++ gfs_glock_put(i_gl);
++ spin_unlock(&io_gl->gl_spin);
++
++ gfs_glock_dq_uninit(&ip->i_iopen_gh);
++
++ gfs_flush_meta_cache(ip);
++ kmem_cache_free(gfs_inode_cachep, ip);
++
++ gl2ip(i_gl) = NULL;
++ gfs_glock_put(i_gl);
++
++ atomic_dec(&sdp->sd_inode_count);
++}
++
++/**
++ * dinode_mark_unused - mark an on-disk dinode as unused
++ * @ip: the inode whose dinode should be marked unused
++ *
++ * Returns: errno
++ */
++
++static int
++dinode_mark_unused(struct gfs_inode *ip)
++{
++ struct buffer_head *dibh;
++ struct gfs_dinode *di;
++ uint32_t incarn;
++ uint64_t ctime;
++ uint32_t flags;
++ int error;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ di = (struct gfs_dinode *)dibh->b_data;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ incarn = gfs32_to_cpu(di->di_header.mh_incarn) + 1;
++ di->di_header.mh_incarn = cpu_to_gfs32(incarn);
++
++ ctime = get_seconds();
++ di->di_ctime = cpu_to_gfs64(ctime);
++
++ flags = (gfs32_to_cpu(di->di_flags)) | GFS_DIF_UNUSED;
++ di->di_flags = cpu_to_gfs32(flags);
++
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * dinode_dealloc - Deallocate a dinode
++ * @ip: The GFS inode
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++dinode_dealloc(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_rgrpd *rgd;
++ struct gfs_holder ri_gh, rgd_gh;
++ int error;
++
++ gfs_alloc_get(ip);
++
++ error = gfs_quota_hold_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail;
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ goto fail_qs;
++
++ rgd = gfs_blk2rgrpd(sdp, ip->i_num.no_addr);
++ GFS_ASSERT_INODE(rgd, ip,);
++
++ error = gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
++ if (error)
++ goto fail_rindex_relse;
++
++ GFS_ASSERT_INODE(ip->i_di.di_blocks == 1, ip,
++ gfs_dinode_print(&ip->i_di););
++
++ /* Trans may require:
++ One block for the RG header.
++ One block for the dinode bit.
++ One block for the dinode.
++ We also need a block for the unlinked change.
++ One block for the quota change. */
++
++ error = gfs_trans_begin(sdp, 3, 2);
++ if (error)
++ goto fail_rg_gunlock;
++
++ error = dinode_mark_unused(ip);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_difree(rgd, ip);
++
++ gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IDA, &ip->i_num);
++ clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_uninit(&rgd_gh);
++ gfs_glock_dq_uninit(&ri_gh);
++
++ gfs_quota_unhold_m(ip);
++ gfs_alloc_put(ip);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_rg_gunlock:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ fail_rindex_relse:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ fail_qs:
++ gfs_quota_unhold_m(ip);
++
++ fail:
++ gfs_alloc_put(ip);
++
++ return error;
++}
++
++/**
++ * inode_dealloc - Deallocate an inode
++ * @sdp: the filesystem
++ * @inum: the inode number to deallocate
++ * @io_gh: a holder for the iopen glock for this inode
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++inode_dealloc(struct gfs_sbd *sdp, struct gfs_inum *inum,
++ struct gfs_holder *io_gh)
++{
++ struct gfs_inode *ip;
++ struct gfs_holder i_gh;
++ int error;
++
++ error = gfs_glock_nq_num(sdp,
++ inum->no_formal_ino, &gfs_inode_glops,
++ LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (error)
++ return error;
++
++ /* We reacquire the iopen lock here to avoid a race with the NFS server
++ calling gfs_read_inode() with the inode number of an inode we're in the
++ process of deallocating. And we can't keep our hold on the lock
++ from try_dealloc_inode() for deadlock reasons. */
++
++ gfs_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY, io_gh);
++ error = gfs_glock_nq(io_gh);
++ switch (error) {
++ case 0:
++ break;
++ case GLR_TRYFAILED:
++ error = 0;
++ goto fail;
++ default:
++ GFS_ASSERT_SBD(error < 0, sdp,);
++ goto fail;
++ }
++
++ GFS_ASSERT_GLOCK(!gl2ip(i_gh.gh_gl), i_gh.gh_gl,);
++ error = inode_create(i_gh.gh_gl, inum, io_gh->gh_gl, LM_ST_EXCLUSIVE,
++ &ip);
++
++ gfs_glock_dq(io_gh);
++
++ if (error)
++ goto fail;
++
++ GFS_ASSERT_INODE(!ip->i_di.di_nlink, ip,
++ gfs_dinode_print(&ip->i_di););
++ GFS_ASSERT_INODE(atomic_read(&ip->i_count) == 1, ip,);
++ GFS_ASSERT_INODE(!ip->i_vnode, ip,);
++
++ if (ip->i_di.di_type == GFS_FILE_DIR &&
++ (ip->i_di.di_flags & GFS_DIF_EXHASH)) {
++ error = gfs_dir_exhash_free(ip);
++ if (error)
++ goto fail_iput;
++ }
++
++ if (ip->i_di.di_eattr) {
++ error = gfs_ea_dealloc(ip);
++ if (error)
++ goto fail_iput;
++ }
++
++ error = gfs_shrink(ip, 0, NULL);
++ if (error)
++ goto fail_iput;
++
++ error = dinode_dealloc(ip);
++ if (error)
++ goto fail_iput;
++
++ gfs_inode_put(ip);
++ gfs_inode_destroy(ip);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return 0;
++
++ fail_iput:
++ gfs_inode_put(ip);
++ gfs_inode_destroy(ip);
++
++ fail:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * inode_dealloc_init - Try to deallocate an inode and all its blocks
++ * @sdp: the filesystem
++ * @inum: the inode number to deallocate
++ *
++ * Returns: 0 on success, -errno on error, 1 on busy
++ */
++
++static int
++inode_dealloc_init(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ struct gfs_holder io_gh;
++ int error = 0;
++
++ gfs_try_toss_inode(sdp, inum);
++
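++ /* Try to grab the iopen glock exclusively; any node still using
++ the inode holds it shared (LM_FLAG_TRY_1CB presumably also sends
++ one callback asking holders to release). A failed try means the
++ inode is busy. */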
++ error = gfs_glock_nq_num(sdp,
++ inum->no_addr, &gfs_iopen_glops,
++ LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB, &io_gh);
++ switch (error) {
++ case 0:
++ break;
++ case GLR_TRYFAILED:
++ return 1;
++ default:
++ GFS_ASSERT_SBD(error < 0, sdp,);
++ return error;
++ }
++
++ gfs_glock_dq(&io_gh);
++ error = inode_dealloc(sdp, inum, &io_gh);
++ gfs_holder_uninit(&io_gh);
++
++ return error;
++}
++
++/**
++ * inode_dealloc_uninit - deallocate an uninitialized dinode
++ * @sdp: the filesystem
++ * @inum: the inode number to deallocate
++ *
++ * Returns: 0 on success, -errno on error, 1 on busy
++ */
++
++static int
++inode_dealloc_uninit(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ struct gfs_rgrpd *rgd;
++ struct gfs_holder ri_gh, rgd_gh;
++ int error;
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ return error;
++
++ rgd = gfs_blk2rgrpd(sdp, inum->no_addr);
++ GFS_ASSERT_SBD(rgd, sdp,);
++
++ error = gfs_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
++ if (error)
++ goto fail;
++
++ /* Trans may require:
++ One block for the RG header.
++ One block for the dinode bit.
++ We also need a block for the unlinked change. */
++
++ error = gfs_trans_begin(sdp, 2, 1);
++ if (error)
++ goto fail_gunlock;
++
++ gfs_difree_uninit(rgd, inum->no_addr);
++ gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IDA, inum);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_uninit(&rgd_gh);
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return 0;
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ fail:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return error;
++}
++
++/**
++ * gfs_inode_dealloc - Grab an unlinked inode off the list and try to free it.
++ * @sdp: the filesystem
++ * @inum: the inode number of the unlinked inode
++ *
++ * Returns: 0 on success, -errno on error, 1 on busy
++ */
++
++int
++gfs_inode_dealloc(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ if (inum->no_formal_ino)
++ return inode_dealloc_init(sdp, inum);
++ else
++ return inode_dealloc_uninit(sdp, inum);
++}
++
++/**
++ * gfs_change_nlink - Change nlink count on inode
++ * @ip: The GFS inode
++ * @diff: The change in the nlink count required
++ *
++ * Returns: 0 on success, -EXXXX on failure.
++ */
++
++int
++gfs_change_nlink(struct gfs_inode *ip, int diff)
++{
++ struct buffer_head *dibh;
++ uint32_t nlink;
++ int error;
++
++ nlink = ip->i_di.di_nlink + diff;
++
++ if (diff < 0)
++ GFS_ASSERT_INODE(nlink < ip->i_di.di_nlink, ip,
++ gfs_dinode_print(&ip->i_di););
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ return error;
++
++ ip->i_di.di_nlink = nlink;
++ ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * gfs_lookupi - Look up a filename in a directory and return its inode
++ * @d_gh: An initialized holder for the directory glock
++ * @name: The name of the inode to look for
++ * @is_root: If TRUE, ignore the caller's permissions
++ * @i_gh: An uninitialized holder for the new inode glock
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++int
++gfs_lookupi(struct gfs_holder *d_gh, struct qstr *name,
++ int is_root, struct gfs_holder *i_gh)
++{
++ struct gfs_inode *dip = gl2ip(d_gh->gh_gl);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_glock *gl;
++ struct gfs_inode *ip;
++ struct gfs_inum inum, inum2;
++ unsigned int type;
++ int error;
++
++ i_gh->gh_gl = NULL;
++
++ if (!name->len || name->len > GFS_FNAMESIZE)
++ return -ENAMETOOLONG;
++
++ if (gfs_filecmp(name, ".", 1)) {
++ gfs_holder_reinit(LM_ST_SHARED, 0, d_gh);
++ error = gfs_glock_nq(d_gh);
++ if (!error) {
++ error = gfs_glock_nq_init(dip->i_gl,
++ LM_ST_SHARED, 0,
++ i_gh);
++ GFS_ASSERT_INODE(!error, dip,);
++ gfs_inode_hold(dip);
++ }
++
++ return error;
++ }
++
++ if (gfs_glock_is_locked_by_me(d_gh->gh_gl))
++ bitch_about(sdp, &sdp->sd_last_readdirplus,
++ "readdirplus-type behavior");
++
++ gfs_holder_reinit(LM_ST_SHARED, 0, d_gh);
++ error = gfs_glock_nq(d_gh);
++ if (error)
++ return error;
++
++ if (!is_root) {
++ struct inode *dir = gfs_iget(dip, NO_CREATE);
++ if (dir) {
++ error = permission(dir, MAY_EXEC, NULL);
++ iput(dir);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ return error;
++ }
++ }
++ }
++
++ error = gfs_dir_search(dip, name, &inum, &type);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ if (error == -ENOENT)
++ error = 0;
++ return error;
++ }
++
++ restart:
++ error = gfs_glock_get(sdp, inum.no_formal_ino, &gfs_inode_glops,
++ CREATE, &gl);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ return error;
++ }
++
++ /* Acquire the second lock. Glocks must be taken in ascending
++ lock-number order to avoid deadlock, so if the target inode's
++ lock sorts below the directory's, drop the directory glock and
++ reacquire both in order. */
++
++ if (gl->gl_name.ln_number < dip->i_gl->gl_name.ln_number) {
++ gfs_glock_dq(d_gh);
++
++ error = gfs_glock_nq_init(gl, LM_ST_SHARED,
++ LM_FLAG_ANY | GL_LOCAL_EXCL,
++ i_gh);
++ if (error)
++ goto out;
++
++ gfs_holder_reinit(LM_ST_SHARED, 0, d_gh);
++ error = gfs_glock_nq(d_gh);
++ if (error) {
++ gfs_glock_dq_uninit(i_gh);
++ goto out;
++ }
++
++ if (!is_root) {
++ struct inode *dir = gfs_iget(dip, NO_CREATE);
++ if (dir) {
++ error = permission(dir, MAY_EXEC, NULL);
++ iput(dir);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ gfs_glock_dq_uninit(i_gh);
++ goto out;
++ }
++ }
++ }
++
++ error = gfs_dir_search(dip, name, &inum2, &type);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ gfs_glock_dq_uninit(i_gh);
++ if (error == -ENOENT)
++ error = 0;
++ goto out;
++ }
++
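++ /* The directory was unlocked while the locks were reordered, so
++ the entry may have changed. If it now points at a different
++ inode, start over with the new inode number. */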
++ if (!gfs_inum_equal(&inum, &inum2)) {
++ gfs_glock_dq_uninit(i_gh);
++ gfs_glock_put(gl);
++ inum = inum2;
++ goto restart;
++ }
++ } else {
++ error = gfs_glock_nq_init(gl, LM_ST_SHARED,
++ LM_FLAG_ANY | GL_LOCAL_EXCL,
++ i_gh);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ goto out;
++ }
++ }
++
++ error = gfs_inode_get(gl, &inum, CREATE, &ip);
++ if (error) {
++ gfs_glock_dq(d_gh);
++ gfs_glock_dq_uninit(i_gh);
++ } else
++ GFS_ASSERT_INODE(ip->i_di.di_type == type, ip,);
++
++ out:
++ gfs_glock_put(gl);
++
++ return error;
++}
++
++/**
++ * create_ok - Check that a new entry may be created in a directory
++ * @dip: the directory inode
++ * @name: the name of the new entry
++ * @type: the file type of the new entry
++ *
++ * Returns: errno
++ */
++
++static int
++create_ok(struct gfs_inode *dip, struct qstr *name, unsigned int type)
++{
++ int error;
++
++ {
++ struct inode *dir = gfs_iget(dip, NO_CREATE);
++ if (dir) {
++ error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
++ iput(dir);
++ if (error)
++ return error;
++ }
++ }
++
++ /* Don't create entries in an unlinked directory */
++
++ if (!dip->i_di.di_nlink)
++ return -EPERM;
++
++ error = gfs_dir_search(dip, name, NULL, NULL);
++ switch (error) {
++ case -ENOENT:
++ error = 0;
++ break;
++ case 0:
++ return -EEXIST;
++ default:
++ return error;
++ }
++
++ if (dip->i_di.di_entries == (uint32_t)-1)
++ return -EFBIG;
++ if (type == GFS_FILE_DIR && dip->i_di.di_nlink == (uint32_t)-1)
++ return -EMLINK;
++
++ return 0;
++}
++
++/**
++ * dinode_alloc - Allocate a new dinode block
++ * @dip: the directory in which the inode is being created
++ * @ul: returns the unlinked structure tagging the new dinode
++ *
++ * Returns: errno
++ */
++
++static int
++dinode_alloc(struct gfs_inode *dip, struct gfs_unlinked **ul)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_alloc *al;
++ struct gfs_inum inum;
++ int error;
++
++ al = gfs_alloc_get(dip);
++
++ al->al_requested_di = 1;
++
++ error = gfs_inplace_reserve(dip);
++ if (error)
++ goto out;
++
++ error = gfs_trans_begin(sdp, al->al_rgd->rd_ri.ri_length, 1);
++ if (error)
++ goto out_inplace;
++
++ inum.no_formal_ino = 0;
++ error = gfs_dialloc(dip, &inum.no_addr);
++ if (error)
++ goto out_end_trans;
++
++ *ul = gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IUL, &inum);
++ gfs_unlinked_lock(sdp, *ul);
++
++ gfs_trans_add_gl(dip->i_gl);
++
++ out_end_trans:
++ gfs_trans_end(sdp);
++
++ out_inplace:
++ gfs_inplace_release(dip);
++
++ out:
++ gfs_alloc_put(dip);
++
++ return error;
++}
++
++/**
++ * pick_formal_ino - Pick a formal inode number for a given inode
++ * @sdp: the filesystem
++ * @inum: the inode number structure
++ *
++ */
++
++static void
++pick_formal_ino(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ /* This won't always be true */
++ inum->no_formal_ino = inum->no_addr;
++}
++
++/**
++ * make_dinode - Fill in a new dinode structure
++ * @dip: the directory this inode is being created in
++ * @gl: The glock covering the new inode
++ * @inum: the inode number
++ * @type: the file type
++ * @mode: the file permissions
++ * @uid:
++ * @gid:
++ *
++ */
++
++static int
++make_dinode(struct gfs_inode *dip,
++ struct gfs_glock *gl, struct gfs_inum *inum,
++ unsigned int type, unsigned int mode,
++ unsigned int uid, unsigned int gid)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_dinode di;
++ struct buffer_head *dibh;
++ struct gfs_rgrpd *rgd;
++ int error;
++
++ error = gfs_dread(sdp, inum->no_addr, gl,
++ DIO_NEW | DIO_START | DIO_WAIT,
++ &dibh);
++ if (error)
++ return error;
++
++ gfs_trans_add_bh(gl, dibh);
++ gfs_metatype_set(sdp, dibh, GFS_METATYPE_DI, GFS_FORMAT_DI);
++ gfs_buffer_clear_tail(dibh, sizeof(struct gfs_dinode));
++
++ memset(&di, 0, sizeof(struct gfs_dinode));
++
++ gfs_meta_header_in(&di.di_header, dibh->b_data);
++
++ di.di_num = *inum;
++
++ di.di_mode = mode & S_IALLUGO;
++ di.di_uid = uid;
++ di.di_gid = gid;
++ di.di_nlink = 1;
++ di.di_blocks = 1;
++ di.di_atime = di.di_mtime = di.di_ctime = get_seconds();
++
++ rgd = gfs_blk2rgrpd(sdp, inum->no_addr);
++ GFS_ASSERT_SBD(rgd, sdp,
++ printk("block = %"PRIu64"\n", inum->no_addr););
++
++ di.di_rgrp = rgd->rd_ri.ri_addr;
++ di.di_goal_rgrp = di.di_rgrp;
++ di.di_goal_dblk = di.di_goal_mblk = inum->no_addr - rgd->rd_ri.ri_data1;
++
++ if (type == GFS_FILE_REG) {
++ if ((dip->i_di.di_flags & GFS_DIF_INHERIT_JDATA) ||
++ sdp->sd_tune.gt_new_files_jdata)
++ di.di_flags |= GFS_DIF_JDATA;
++ if ((dip->i_di.di_flags & GFS_DIF_INHERIT_DIRECTIO) ||
++ sdp->sd_tune.gt_new_files_directio)
++ di.di_flags |= GFS_DIF_DIRECTIO;
++ } else if (type == GFS_FILE_DIR) {
++ di.di_flags |= (dip->i_di.di_flags & GFS_DIF_INHERIT_DIRECTIO);
++ di.di_flags |= (dip->i_di.di_flags & GFS_DIF_INHERIT_JDATA);
++ }
++
++ di.di_type = type;
++
++ gfs_dinode_out(&di, dibh->b_data);
++ brelse(dibh);
++
++ return 0;
++}
++
++/**
++ * inode_init_and_link - Initialize a new dinode and link it into a directory
++ * @dip: the directory inode
++ * @name: the name of the new entry
++ * @inum: the inode number of the new inode
++ * @gl: the glock covering the new inode
++ * @type: the file type
++ * @mode: the file permissions
++ *
++ * Returns: errno
++ */
++
++static int
++inode_init_and_link(struct gfs_inode *dip, struct qstr *name,
++ struct gfs_inum *inum, struct gfs_glock *gl,
++ unsigned int type, unsigned int mode)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct posix_acl *acl = NULL;
++ struct gfs_alloc *al;
++ struct gfs_inode *ip;
++ unsigned int gid;
++ int alloc_required;
++ int error;
++
++ error = gfs_setup_new_acl(dip, type, &mode, &acl);
++ if (error)
++ return error;
++
++ if (dip->i_di.di_mode & S_ISGID) {
++ if (type == GFS_FILE_DIR)
++ mode |= S_ISGID;
++ gid = dip->i_di.di_gid;
++ } else
++ gid = current->fsgid;
++
++ al = gfs_alloc_get(dip);
++
++ error = gfs_quota_lock_m(dip,
++ current->fsuid,
++ gid);
++ if (error)
++ goto fail;
++
++ error = gfs_quota_check(dip, current->fsuid, gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ if (acl)
++ alloc_required = TRUE;
++ else {
++ error = gfs_diradd_alloc_required(dip, name, &alloc_required);
++ if (error)
++ goto fail_gunlock_q;
++ }
++
++ if (alloc_required) {
++ error = gfs_quota_check(dip, dip->i_di.di_uid, dip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ al->al_requested_meta = sdp->sd_max_dirres + GFS_MAX_EA_ACL_BLKS;
++
++ error = gfs_inplace_reserve(dip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ blocks for two dinodes, the directory blocks necessary for
++ a new entry, RG bitmap blocks for an allocation,
++ and one block for a quota change and
++ one block for an unlinked tag. */
++
++ error = gfs_trans_begin(sdp,
++ 2 + sdp->sd_max_dirres +
++ al->al_rgd->rd_ri.ri_length +
++ GFS_MAX_EA_ACL_BLKS, 2);
++ if (error)
++ goto fail_inplace;
++ } else {
++ /* Trans may require:
++ blocks for two dinodes, a leaf block,
++ and one block for a quota change and
++ one block for an unlinked tag. */
++
++ error = gfs_trans_begin(sdp, 3, 2);
++ if (error)
++ goto fail_gunlock_q;
++ }
++
++ error = gfs_dir_add(dip, name, inum, type);
++ if (error)
++ goto fail_end_trans;
++
++ error = make_dinode(dip, gl, inum, type, mode, current->fsuid, gid);
++ if (error)
++ goto fail_end_trans;
++
++ al->al_ul = gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IDA,
++ &(struct gfs_inum){0, inum->no_addr});
++ gfs_trans_add_quota(sdp, +1, current->fsuid, gid);
++
++ /* Gfs_inode_get() can't fail here. But then again, it shouldn't be
++ here (it should be in gfs_createi()). Gfs_init_acl() has no
++ business needing a memory-resident inode. */
++
++ gfs_inode_get(gl, inum, CREATE, &ip);
++
++ if (acl) {
++ error = gfs_init_acl(dip, ip, type, acl);
++ GFS_ASSERT(!error, ); /* Sigh. */
++ }
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_inplace:
++ if (alloc_required)
++ gfs_inplace_release(dip);
++
++ fail_gunlock_q:
++ gfs_quota_unlock_m(dip);
++
++ fail:
++ gfs_alloc_put(dip);
++ if (acl)
++ posix_acl_release(acl);
++
++ return error;
++}
++
++/**
++ * gfs_createi - Create a new inode
++ * @d_gh: An initialized holder for the directory glock
++ * @name: The name of the new file
++ * @type: The type of dinode (GFS_FILE_REG, GFS_FILE_DIR, GFS_FILE_LNK, ...)
++ * @mode: the permissions on the new inode
++ * @i_gh: An uninitialized holder for the new inode glock
++ *
++ * If the return value is 0, the glocks on both the directory and the new
++ * file are held. A transaction has been started and an inplace reservation
++ * is held, as well.
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++int
++gfs_createi(struct gfs_holder *d_gh, struct qstr *name,
++ unsigned int type, unsigned int mode,
++ struct gfs_holder *i_gh)
++{
++ struct gfs_inode *dip = gl2ip(d_gh->gh_gl);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_unlinked *ul;
++ struct gfs_inum inum;
++ struct gfs_holder io_gh;
++ int error;
++
++ if (!name->len || name->len > GFS_FNAMESIZE)
++ return -ENAMETOOLONG;
++
++ gfs_holder_reinit(LM_ST_EXCLUSIVE, 0, d_gh);
++ error = gfs_glock_nq(d_gh);
++ if (error)
++ return error;
++
++ error = create_ok(dip, name, type);
++ if (error)
++ goto fail;
++
++ error = dinode_alloc(dip, &ul);
++ if (error)
++ goto fail;
++
++ inum.no_addr = ul->ul_inum.no_addr;
++ pick_formal_ino(sdp, &inum);
++
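++ /* Honor glock ordering by lock number: if the new inode's lock
++ sorts below the directory's, drop the directory glock, take the
++ inode glock first, reacquire the directory, and re-check that
++ the create is still valid. */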
++ if (inum.no_formal_ino < dip->i_num.no_formal_ino) {
++ gfs_glock_dq(d_gh);
++
++ error = gfs_glock_nq_num(sdp,
++ inum.no_formal_ino, &gfs_inode_glops,
++ LM_ST_EXCLUSIVE, GL_SKIP, i_gh);
++ if (error) {
++ gfs_unlinked_unlock(sdp, ul);
++ return error;
++ }
++
++ gfs_holder_reinit(LM_ST_EXCLUSIVE, 0, d_gh);
++ error = gfs_glock_nq(d_gh);
++ if (error) {
++ gfs_glock_dq_uninit(i_gh);
++ gfs_unlinked_unlock(sdp, ul);
++ return error;
++ }
++
++ error = create_ok(dip, name, type);
++ if (error)
++ goto fail_gunlock_i;
++ } else {
++ error = gfs_glock_nq_num(sdp,
++ inum.no_formal_ino, &gfs_inode_glops,
++ LM_ST_EXCLUSIVE, GL_SKIP, i_gh);
++ if (error)
++ goto fail_ul;
++ }
++
++ error = gfs_glock_nq_num(sdp,
++ inum.no_addr, &gfs_iopen_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL | GL_EXACT,
++ &io_gh);
++ if (error)
++ goto fail_gunlock_i;
++
++ error = inode_init_and_link(dip, name, &inum, i_gh->gh_gl, type, mode);
++ if (error)
++ goto fail_gunlock_io;
++
++ gfs_glock_dq_uninit(&io_gh);
++
++ return 0;
++
++ fail_gunlock_io:
++ gfs_glock_dq_uninit(&io_gh);
++
++ fail_gunlock_i:
++ gfs_glock_dq_uninit(i_gh);
++
++ fail_ul:
++ gfs_unlinked_unlock(sdp, ul);
++
++ fail:
++ gfs_glock_dq(d_gh);
++
++ return error;
++}
++
++/**
++ * gfs_unlinki - Unlink a file
++ * @dip: The inode of the directory
++ * @name: The name of the file to be unlinked
++ * @ip: The inode of the file to be removed
++ *
++ * Assumes Glocks on both dip and ip are held.
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++int
++gfs_unlinki(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ int error;
++
++ error = gfs_dir_del(dip, name);
++ if (error)
++ return error;
++
++ error = gfs_change_nlink(ip, -1);
++ if (error)
++ return error;
++
++ /* If this inode is being unlinked from the directory structure,
++ we need to mark that in the log so that it isn't lost during
++ a crash. */
++
++ if (!ip->i_di.di_nlink) {
++ gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IUL, &ip->i_num);
++ set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_rmdiri - Remove a directory
++ * @dip: The parent directory of the directory to be removed
++ * @name: The name of the directory to be removed
++ * @ip: The GFS inode of the directory to be removed
++ *
++ * Assumes Glocks on dip and ip are held
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++int
++gfs_rmdiri(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct qstr dotname;
++ int error;
++
++ GFS_ASSERT_INODE(ip->i_di.di_entries == 2, ip,
++ gfs_dinode_print(&ip->i_di););
++
++ error = gfs_dir_del(dip, name);
++ if (error)
++ return error;
++
++ error = gfs_change_nlink(dip, -1);
++ if (error)
++ return error;
++
++ dotname.len = 1;
++ dotname.name = ".";
++ error = gfs_dir_del(ip, &dotname);
++ if (error)
++ return error;
++
++ dotname.len = 2;
++ dotname.name = "..";
++ error = gfs_dir_del(ip, &dotname);
++ if (error)
++ return error;
++
++ error = gfs_change_nlink(ip, -2);
++ if (error)
++ return error;
++
++ /* This inode is being unlinked from the directory structure and
++ we need to mark that in the log so that it isn't lost during
++ a crash. */
++
++ gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IUL, &ip->i_num);
++ set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
++
++ return 0;
++}
++
++/**
++ * gfs_revalidate - check to see that an inode is still in a directory
++ * @dip: the directory
++ * @name: the name of the file
++ * @ip: the inode
++ *
++ * Assumes that the lock on (at least) @dip is held.
++ *
++ * Returns: 0 if the parent/child relationship is correct, -ENOENT if it isn't
++ */
++
++int
++gfs_revalidate(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip)
++{
++ struct gfs_inum inum;
++ unsigned int type;
++ int error;
++
++ error = gfs_dir_search(dip, name, &inum, &type);
++ if (!error) {
++ if (inum.no_formal_ino == ip->i_num.no_formal_ino)
++ GFS_ASSERT_INODE(ip->i_di.di_type == type, ip,);
++ else
++ error = -ENOENT;
++ }
++
++ return error;
++}
++
++/**
++ * gfs_ok_to_move - check if it's ok to move a directory to another directory
++ * @this: move this
++ * @to: to here
++ *
++ * Follow @to back to the root and make sure we don't encounter @this
++ * Assumes we already hold the rename lock.
++ *
++ * Returns: 0 if it's ok to move, -EXXX if it isn't
++ */
++
++int
++gfs_ok_to_move(struct gfs_inode *this, struct gfs_inode *to)
++{
++ struct gfs_sbd *sdp = this->i_sbd;
++ struct gfs_inode *tmp;
++ struct gfs_holder to_gh, tmp_gh;
++ struct qstr dotdot;
++ int error = 0;
++
++ memset(&dotdot, 0, sizeof (struct qstr));
++ dotdot.name = "..";
++ dotdot.len = 2;
++
++ gfs_inode_hold(to);
++
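++ /* Walk up the ".." chain from @to. Reaching the root means the
++ move is safe; encountering @this means the move would create a
++ disconnected loop. */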
++ for (;;) {
++ if (to == this) {
++ error = -EINVAL;
++ break;
++ }
++ if (to == sdp->sd_rooti) {
++ error = 0;
++ break;
++ }
++
++ gfs_holder_init(to->i_gl, 0, 0, &to_gh);
++
++ error = gfs_lookupi(&to_gh, &dotdot, TRUE, &tmp_gh);
++ if (error) {
++ gfs_holder_uninit(&to_gh);
++ break;
++ }
++ if (!tmp_gh.gh_gl) {
++ gfs_holder_uninit(&to_gh);
++ error = -ENOENT;
++ break;
++ }
++
++ tmp = gl2ip(tmp_gh.gh_gl);
++
++ gfs_glock_dq_uninit(&to_gh);
++ gfs_glock_dq_uninit(&tmp_gh);
++
++ gfs_inode_put(to);
++ to = tmp;
++ }
++
++ gfs_inode_put(to);
++
++ return error;
++}
++
++/**
++ * gfs_readlinki - return the contents of a symlink
++ * @ip: the symlink's inode
++ * @buf: a pointer to the buffer to be filled
++ * @len: a pointer to the length of @buf
++ *
++ * If @buf is too small, a piece of memory is gmalloc()ed and needs
++ * to be freed by the caller.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_readlinki(struct gfs_inode *ip, char **buf, unsigned int *len)
++{
++ struct gfs_holder i_gh;
++ struct buffer_head *dibh;
++ unsigned int x;
++ int error;
++
++ gfs_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
++ error = gfs_glock_nq_atime(&i_gh);
++ if (error) {
++ gfs_holder_uninit(&i_gh);
++ return error;
++ }
++
++ GFS_ASSERT_INODE(ip->i_di.di_size, ip,);
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto out;
++
++ x = ip->i_di.di_size + 1;
++ if (x > *len)
++ *buf = gmalloc(x);
++
++ memcpy(*buf, dibh->b_data + sizeof(struct gfs_dinode), x);
++ *len = x;
++
++ brelse(dibh);
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_glock_nq_atime - Acquire the glock and conditionally update the atime on an inode
++ * @gh: the holder to acquire
++ *
++ * Tests atime for gfs_read, gfs_readdir and gfs_test_mmap
++ * Update if the difference between the current time and the current atime
++ * is greater than an interval specified at mount.
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_glock_nq_atime(struct gfs_holder *gh)
++{
++ struct gfs_glock *gl = gh->gh_gl;
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_inode *ip;
++ int64_t curtime, quantum = sdp->sd_tune.gt_atime_quantum;
++ unsigned int state;
++ int flags;
++ int error;
++
++ GFS_ASSERT_GLOCK(gh->gh_flags & GL_ATIME, gl,);
++ GFS_ASSERT_GLOCK(!(gh->gh_flags & GL_ASYNC), gl,);
++ GFS_ASSERT_GLOCK(gl->gl_ops == &gfs_inode_glops, gl,);
++
++ ip = gl2ip(gl);
++ GFS_ASSERT_GLOCK(ip, gl,);
++
++ state = gh->gh_state;
++ flags = gh->gh_flags;
++
++ error = gfs_glock_nq(gh);
++ if (error)
++ return error;
++
++ if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
++ test_bit(SDF_ROFS, &sdp->sd_flags))
++ return 0;
++
++ curtime = get_seconds();
++ if (curtime - ip->i_di.di_atime >= quantum) {
++ int was_exclusive = (gl->gl_state == LM_ST_EXCLUSIVE);
++
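++ /* The atime looks stale; trade the hold for an exclusive one so
++ the dinode can be updated, then re-check under that lock. */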
++ gfs_glock_dq(gh);
++ gfs_holder_reinit(LM_ST_EXCLUSIVE,
++ gh->gh_flags & ~LM_FLAG_ANY,
++ gh);
++ error = gfs_glock_nq(gh);
++ if (error)
++ return error;
++
++ /* Verify this hasn't been updated while we were
++ trying to get exclusive lock. */
++
++ curtime = get_seconds();
++ if (curtime - ip->i_di.di_atime >= quantum) {
++ struct buffer_head *dibh;
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error == -EROFS)
++ return 0;
++ if (error)
++ goto fail;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ ip->i_di.di_atime = curtime;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++ }
++
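++ /* The caller asked for something weaker than exclusive; drop back
++ and reacquire in the originally requested state. GL_EXACT
++ presumably prevents LM_FLAG_ANY-style state substitutions here. */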
++ if (!was_exclusive) {
++ gfs_glock_dq(gh);
++ flags &= ~LM_FLAG_ANY;
++ flags |= GL_EXACT;
++ gfs_holder_reinit(state, flags, gh);
++ error = gfs_glock_nq(gh);
++ return error;
++ }
++ }
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail:
++ gfs_glock_dq(gh);
++
++ return error;
++}
++
++/**
++ * glock_compare_atime - Compare two struct gfs_holder structures for sorting
++ * @arg_a: the first structure
++ * @arg_b: the second structure
++ *
++ */
++
++static int
++glock_compare_atime(void *arg_a, void *arg_b)
++{
++ struct gfs_holder *gh_a = *(struct gfs_holder **)arg_a;
++ struct gfs_holder *gh_b = *(struct gfs_holder **)arg_b;
++ struct lm_lockname *a = &gh_a->gh_gl->gl_name;
++ struct lm_lockname *b = &gh_b->gh_gl->gl_name;
++ int ret = 0;
++
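++ /* Sort by lock number first. For holders of the same lock,
++ exclusive requests and GL_ATIME requests (which may upgrade to
++ exclusive) sort ahead of plain shared requests. */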
++ if (a->ln_number > b->ln_number)
++ ret = 1;
++ else if (a->ln_number < b->ln_number)
++ ret = -1;
++ else {
++ if (gh_a->gh_state == LM_ST_SHARED &&
++ gh_b->gh_state == LM_ST_EXCLUSIVE)
++ ret = 1;
++ else if (gh_a->gh_state == LM_ST_SHARED &&
++ (gh_b->gh_flags & GL_ATIME))
++ ret = 1;
++ }
++
++ return ret;
++}
++
++/**
++ * gfs_glock_nq_m_atime - acquire multiple glocks where one may need an atime update
++ * @num_gh: the number of structures
++ * @ghs: an array of struct gfs_holder structures
++ *
++ * Returns: 0 on success (all glocks acquired), -EXXX on failure (no glocks acquired)
++ */
++
++int
++gfs_glock_nq_m_atime(unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct gfs_holder *p[num_gh];
++ unsigned int x;
++ int error = 0;
++
++ GFS_ASSERT(num_gh,);
++
++ if (num_gh == 1) {
++ ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
++ if (ghs->gh_flags & GL_ATIME)
++ error = gfs_glock_nq_atime(ghs);
++ else
++ error = gfs_glock_nq(ghs);
++ return error;
++ }
++
++ for (x = 0; x < num_gh; x++)
++ p[x] = &ghs[x];
++
++ gfs_sort(p, num_gh, sizeof(struct gfs_holder *), glock_compare_atime);
++
++ for (x = 0; x < num_gh; x++) {
++ p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
++
++ if (p[x]->gh_flags & GL_ATIME)
++ error = gfs_glock_nq_atime(p[x]);
++ else
++ error = gfs_glock_nq(p[x]);
++
++ if (error) {
++ while (x--)
++ gfs_glock_dq(p[x]);
++ break;
++ }
++ }
++
++ return error;
++}
++
++/**
++ * gfs_try_toss_vnode - See if we can toss a vnode from memory
++ * @ip: the inode
++ *
++ */
++
++void
++gfs_try_toss_vnode(struct gfs_inode *ip)
++{
++ struct inode *inode;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (!inode)
++ return;
++
++ d_prune_aliases(inode);
++
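++ /* For a directory, also try to shrink its cached dentry subtree,
++ unless something is mounted beneath it. */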
++ if (ip->i_di.di_type == GFS_FILE_DIR) {
++ struct list_head *head = &inode->i_dentry;
++ struct dentry *d = NULL;
++
++ spin_lock(&dcache_lock);
++ if (list_empty(head))
++ spin_unlock(&dcache_lock);
++ else {
++ d = list_entry(head->next, struct dentry, d_alias);
++ dget_locked(d);
++ spin_unlock(&dcache_lock);
++
++ if (have_submounts(d))
++ dput(d);
++ else {
++ shrink_dcache_parent(d);
++ dput(d);
++ d_prune_aliases(inode);
++ }
++ }
++ }
++
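++ /* Zeroing i_nlink before the final iput() presumably makes the
++ VFS treat the inode as deleted, so it is evicted immediately
++ instead of lingering in the inode cache. */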
++ inode->i_nlink = 0;
++ iput(inode);
++}
++
++/**
++ * iah_make_jdata - Set the journaled-data flag on a hidden inode's dinode
++ * @gl: the glock covering the inode
++ * @inum: the inode number
++ *
++ */
++
++static void
++iah_make_jdata(struct gfs_glock *gl, struct gfs_inum *inum)
++{
++ struct buffer_head *bh;
++ struct gfs_dinode *di;
++ uint32_t flags;
++ int error;
++
++ error = gfs_dread(gl->gl_sbd, inum->no_addr, gl, DIO_START | DIO_WAIT, &bh);
++ GFS_ASSERT_GLOCK(!error, gl,); /* Already pinned */
++
++ di = (struct gfs_dinode *)bh->b_data;
++
++ flags = di->di_flags;
++ flags = gfs32_to_cpu(flags) | GFS_DIF_JDATA;
++ di->di_flags = cpu_to_gfs32(flags);
++
++ brelse(bh);
++}
++
++/**
++ * iah_super_update - Write the in-core superblock back to disk
++ * @sdp: the filesystem
++ *
++ * Returns: errno
++ */
++
++static int
++iah_super_update(struct gfs_sbd *sdp)
++{
++ struct gfs_glock *gl;
++ struct buffer_head *bh;
++ int error;
++
++ error = gfs_glock_get(sdp,
++ GFS_SB_LOCK, &gfs_meta_glops,
++ NO_CREATE, &gl);
++ GFS_ASSERT_SBD(!error && gl, sdp,); /* This should already be held. */
++
++ error = gfs_dread(sdp,
++ GFS_SB_ADDR >> sdp->sd_fsb2bb_shift, gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (!error) {
++ gfs_trans_add_bh(gl, bh);
++ gfs_sb_out(&sdp->sd_sb, bh->b_data);
++ brelse(bh);
++ }
++
++ gfs_glock_put(gl);
++
++ return error;
++}
++
++/**
++ * inode_alloc_hidden - Allocate one of the special hidden inodes
++ * @sdp: the filesystem
++ * @inum: returns the inode number of the new hidden inode
++ *
++ * Returns: errno
++ */
++
++static int
++inode_alloc_hidden(struct gfs_sbd *sdp, struct gfs_inum *inum)
++{
++ struct gfs_inode *dip = sdp->sd_rooti;
++ struct gfs_holder d_gh, i_gh;
++ struct gfs_unlinked *ul;
++ int error;
++
++ error = gfs_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh);
++ if (error)
++ return error;
++
++ error = dinode_alloc(dip, &ul);
++ if (error)
++ goto fail;
++
++ inum->no_addr = ul->ul_inum.no_addr;
++ pick_formal_ino(sdp, inum);
++
++ /* Don't worry about deadlock ordering here. We're the first
++ mounter and still under the mount lock (i.e. there is no
++ contention). */
++
++ error = gfs_glock_nq_num(sdp,
++ inum->no_formal_ino, &gfs_inode_glops,
++ LM_ST_EXCLUSIVE, GL_SKIP, &i_gh);
++ if (error)
++ goto fail_ul;
++
++ gfs_alloc_get(dip);
++
++ error = gfs_quota_hold_m(dip, 0, 0);
++ if (error)
++ goto fail_al;
++
++ /* Trans may require:
++ The new inode, the superblock,
++ and one block for a quota change and
++ one block for an unlinked tag. */
++
++ error = gfs_trans_begin(sdp, 2, 2);
++ if (error)
++ goto fail_unhold;
++
++ error = make_dinode(dip, i_gh.gh_gl, inum, GFS_FILE_REG, 0600, 0, 0);
++ if (error)
++ goto fail_end_trans;
++
++ iah_make_jdata(i_gh.gh_gl, inum);
++
++ error = iah_super_update(sdp);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_add_unlinked(sdp, GFS_LOG_DESC_IDA,
++ &(struct gfs_inum){0, inum->no_addr});
++ gfs_trans_add_quota(sdp, +1, 0, 0);
++ gfs_trans_add_gl(dip->i_gl);
++
++ gfs_trans_end(sdp);
++ gfs_quota_unhold_m(dip);
++ gfs_alloc_put(dip);
++
++ gfs_glock_dq_uninit(&i_gh);
++ gfs_glock_dq_uninit(&d_gh);
++
++ gfs_unlinked_unlock(sdp, ul);
++
++ gfs_log_flush(sdp);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_unhold:
++ gfs_quota_unhold_m(dip);
++
++ fail_al:
++ gfs_alloc_put(dip);
++ gfs_glock_dq_uninit(&i_gh);
++
++ fail_ul:
++ gfs_unlinked_unlock(sdp, ul);
++
++ fail:
++ gfs_glock_dq_uninit(&d_gh);
++
++ return error;
++}
++
++/**
++ * gfs_alloc_qinode - allocate a quota inode
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_alloc_qinode(struct gfs_sbd *sdp)
++{
++ return inode_alloc_hidden(sdp, &sdp->sd_sb.sb_quota_di);
++}
++
++/**
++ * gfs_alloc_linode - allocate a license inode
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_alloc_linode(struct gfs_sbd *sdp)
++{
++ return inode_alloc_hidden(sdp, &sdp->sd_sb.sb_license_di);
++}
+diff -urN linux-orig/fs/gfs/inode.h linux-patched/fs/gfs/inode.h
+--- linux-orig/fs/gfs/inode.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/inode.h 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,68 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __INODE_DOT_H__
++#define __INODE_DOT_H__
++
++void gfs_inode_attr_in(struct gfs_inode *ip);
++void gfs_inode_attr_out(struct gfs_inode *ip);
++struct inode *gfs_iget(struct gfs_inode *ip, int create);
++
++int gfs_copyin_dinode(struct gfs_inode *ip);
++
++int gfs_inode_get(struct gfs_glock *i_gl, struct gfs_inum *inum, int create,
++ struct gfs_inode **ipp);
++void gfs_inode_hold(struct gfs_inode *ip);
++void gfs_inode_put(struct gfs_inode *ip);
++void gfs_inode_destroy(struct gfs_inode *ip);
++
++int gfs_inode_dealloc(struct gfs_sbd *sdp, struct gfs_inum *inum);
++
++int gfs_change_nlink(struct gfs_inode *ip, int diff);
++int gfs_lookupi(struct gfs_holder *d_gh, struct qstr *name,
++ int is_root, struct gfs_holder *i_gh);
++int gfs_createi(struct gfs_holder *d_gh, struct qstr *name,
++ unsigned int type, unsigned int mode,
++ struct gfs_holder *i_gh);
++int gfs_unlinki(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip);
++int gfs_rmdiri(struct gfs_inode *dip, struct qstr *name, struct gfs_inode *ip);
++int gfs_revalidate(struct gfs_inode *dip, struct qstr *name,
++ struct gfs_inode *ip);
++int gfs_ok_to_move(struct gfs_inode *this, struct gfs_inode *to);
++int gfs_readlinki(struct gfs_inode *ip, char **buf, unsigned int *len);
++
++int gfs_glock_nq_atime(struct gfs_holder *gh);
++int gfs_glock_nq_m_atime(unsigned int num_gh, struct gfs_holder *ghs);
++
++void gfs_try_toss_vnode(struct gfs_inode *ip);
++
++/* Backwards compatibility functions */
++
++int gfs_alloc_qinode(struct gfs_sbd *sdp);
++int gfs_alloc_linode(struct gfs_sbd *sdp);
++
++/* Inlines */
++
++static __inline__ int
++gfs_is_stuffed(struct gfs_inode *ip)
++{
++ return !ip->i_di.di_height;
++}
++
++static __inline__ int
++gfs_is_jdata(struct gfs_inode *ip)
++{
++ return ip->i_di.di_flags & GFS_DIF_JDATA;
++}
++
++#endif /* __INODE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ioctl.c linux-patched/fs/gfs/ioctl.c
+--- linux-orig/fs/gfs/ioctl.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ioctl.c 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,983 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <asm/uaccess.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "dio.h"
++#include "dir.h"
++#include "eattr.h"
++#include "file.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "ioctl.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "super.h"
++#include "trans.h"
++
++/**
++ * gfs_add_bh_to_ub - copy a buffer up to user space
++ * @ub: the structure representing where to copy
++ * @bh: the buffer
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_add_bh_to_ub(struct gfs_user_buffer *ub, struct buffer_head *bh)
++{
++ uint64_t blkno = bh->b_blocknr;
++
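++ /* Each buffer is exported as its 64-bit block number followed by
++ the raw block contents; fail rather than truncate if the user
++ buffer can't hold both. */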
++ if (ub->ub_count + sizeof(uint64_t) + bh->b_size > ub->ub_size)
++ return -ENOMEM;
++
++ if (copy_to_user(ub->ub_data + ub->ub_count,
++ &blkno,
++ sizeof(uint64_t)))
++ return -EFAULT;
++ ub->ub_count += sizeof(uint64_t);
++
++ if (copy_to_user(ub->ub_data + ub->ub_count,
++ bh->b_data,
++ bh->b_size))
++ return -EFAULT;
++ ub->ub_count += bh->b_size;
++
++ return 0;
++}
++
++/**
++ * get_meta - Read out all the metadata for a file
++ * @ip: the file
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_meta(struct gfs_inode *ip, void *arg)
++{
++ struct gfs_holder i_gh;
++ struct gfs_user_buffer ub;
++ int error;
++
++ if (copy_from_user(&ub, arg, sizeof(struct gfs_user_buffer)))
++ return -EFAULT;
++ ub.ub_count = 0;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_get_file_meta(ip, &ub);
++ if (error)
++ goto out;
++
++ if (ip->i_di.di_type == GFS_FILE_DIR &&
++ (ip->i_di.di_flags & GFS_DIF_EXHASH)) {
++ error = gfs_get_dir_meta(ip, &ub);
++ if (error)
++ goto out;
++ }
++
++ if (ip->i_di.di_eattr) {
++ error = gfs_get_eattr_meta(ip, &ub);
++ if (error)
++ goto out;
++ }
++
++ if (copy_to_user(arg, &ub, sizeof(struct gfs_user_buffer)))
++ error = -EFAULT;
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * file_stat - return the struct gfs_dinode of a file to user space
++ * @ip: the inode
++ * @arg: where to copy to
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++file_stat(struct gfs_inode *ip, void *arg)
++{
++ struct gfs_holder i_gh;
++ struct gfs_dinode di;
++ int error;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (error)
++ return error;
++
++ memcpy(&di, &ip->i_di, sizeof(struct gfs_dinode));
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ if (copy_to_user(arg, &di, sizeof(struct gfs_dinode)))
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * do_get_super - Dump the superblock into a buffer
++ * @sdp: the filesystem
++ * @arg: the user buffer to fill with the superblock
++ *
++ * Returns: 0 or error code
++ */
++
++static int
++do_get_super(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_sb *sb;
++ struct gfs_holder sb_gh;
++ struct buffer_head *bh;
++ int error;
++
++ sb = gmalloc(sizeof(struct gfs_sb));
++
++ error = gfs_glock_nq_num(sdp,
++ GFS_SB_LOCK, &gfs_meta_glops,
++ LM_ST_SHARED, 0, &sb_gh);
++ if (error)
++ goto out;
++
++ error = gfs_dread(sdp, GFS_SB_ADDR >> sdp->sd_fsb2bb_shift, sb_gh.gh_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (error) {
++ gfs_glock_dq_uninit(&sb_gh);
++ goto out;
++ }
++
++ gfs_sb_in(sb, bh->b_data);
++ brelse(bh);
++
++ gfs_glock_dq_uninit(&sb_gh);
++
++ if (copy_to_user(arg, sb, sizeof(struct gfs_sb)))
++ error = -EFAULT;
++
++ out:
++ kfree(sb);
++
++ return error;
++}
++
++/**
++ * jt2ip - convert the file type in a jio struct to the right hidden ip
++ * @sdp: the filesystem
++ * @jt: the gfs_jio structure
++ *
++ * Returns: The inode structure for the correct hidden file
++ */
++
++static struct gfs_inode *
++jt2ip(struct gfs_sbd *sdp, struct gfs_jio *jt)
++{
++ struct gfs_inode *ip = NULL;
++
++ switch (jt->jio_file) {
++ case GFS_HIDDEN_JINDEX:
++ ip = sdp->sd_jiinode;
++ break;
++
++ case GFS_HIDDEN_RINDEX:
++ ip = sdp->sd_riinode;
++ break;
++
++ case GFS_HIDDEN_QUOTA:
++ ip = sdp->sd_qinode;
++ break;
++
++ case GFS_HIDDEN_LICENSE:
++ ip = sdp->sd_linode;
++ break;
++ }
++
++ return ip;
++}
++
++/**
++ * jread_ioctl - Read from a journaled data file via ioctl
++ * @sdp: the filesystem
++ * @arg: The argument from ioctl
++ *
++ * Returns: 0 on success (byte count returned in @arg), -EXXX on failure
++ */
++
++static int
++jread_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_jio jt;
++ struct gfs_inode *ip;
++ struct gfs_holder i_gh;
++ int error;
++
++ if (copy_from_user(&jt, arg, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ ip = jt2ip(sdp, &jt);
++ if (!ip)
++ return -EINVAL;
++
++ GFS_ASSERT_INODE(gfs_is_jdata(ip), ip,);
++
++ if (!access_ok(VERIFY_WRITE, jt.jio_data, jt.jio_size))
++ return -EFAULT;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_readi(ip, jt.jio_data, jt.jio_offset, jt.jio_size,
++ gfs_copy2user);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ if (error < 0)
++ return error;
++ jt.jio_count = error;
++
++ if (copy_to_user(arg, &jt, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * jwrite_ioctl - Write to a journaled file via ioctl
++ * @sdp: the filesystem
++ * @arg: The argument from ioctl
++ *
++ * Returns: 0 on success (byte count returned in @arg), -EXXX on failure
++ */
++
++static int
++jwrite_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_jio jt;
++ struct gfs_inode *ip;
++ struct gfs_alloc *al = NULL;
++ struct gfs_holder i_gh;
++ unsigned int data_blocks, ind_blocks;
++ int alloc_required;
++ int error;
++
++ if (copy_from_user(&jt, arg, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ ip = jt2ip(sdp, &jt);
++ if (!ip)
++ return -EINVAL;
++
++ GFS_ASSERT_INODE(gfs_is_jdata(ip), ip,);
++
++ if (!access_ok(VERIFY_READ, jt.jio_data, jt.jio_size))
++ return -EFAULT;
++
++ gfs_write_calc_reserv(ip, jt.jio_size, &data_blocks, &ind_blocks);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE,
++ LM_FLAG_PRIORITY | GL_SYNC, &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_write_alloc_required(ip, jt.jio_offset, jt.jio_size,
++ &alloc_required);
++ if (error)
++ goto out;
++
++ if (alloc_required) {
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_hold_m(ip, NO_QUOTA_CHANGE,
++ NO_QUOTA_CHANGE);
++ if (error)
++ goto out_alloc;
++
++ al->al_requested_meta = ind_blocks + data_blocks;
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto out_qs;
++
++ /* Trans may require:
++ All blocks for a RG bitmap, all the "data" blocks, whatever
++ indirect blocks we need, a modified dinode, and a quota change */
++
++ error = gfs_trans_begin(sdp,
++ 1 + al->al_rgd->rd_ri.ri_length +
++ ind_blocks + data_blocks, 1);
++ if (error)
++ goto out_relse;
++ } else {
++ /* Trans may require:
++ All the "data" blocks and a modified dinode. */
++
++ error = gfs_trans_begin(sdp, 1 + data_blocks, 0);
++ if (error)
++ goto out_relse;
++ }
++
++ error = gfs_writei(ip, jt.jio_data, jt.jio_offset, jt.jio_size,
++ gfs_copy_from_user);
++ if (error >= 0) {
++ jt.jio_count = error;
++ error = 0;
++ }
++
++ gfs_trans_end(sdp);
++
++ out_relse:
++ if (alloc_required) {
++ GFS_ASSERT_INODE(error || al->al_alloced_meta, ip,);
++ gfs_inplace_release(ip);
++ }
++
++ out_qs:
++ if (alloc_required)
++ gfs_quota_unhold_m(ip);
++
++ out_alloc:
++ if (alloc_required)
++ gfs_alloc_put(ip);
++
++ out:
++ ip->i_gl->gl_vn++;
++ gfs_glock_dq_uninit(&i_gh);
++
++ if (!error && copy_to_user(arg, &jt, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ return error;
++}
++
++/**
++ * jstat_ioctl - Stat a journaled file via ioctl
++ * @sdp: the filesystem
++ * @arg: The argument from ioctl
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++jstat_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_jio jt;
++ struct gfs_inode *ip;
++ struct gfs_holder i_gh;
++ int error;
++
++ if (copy_from_user(&jt, arg, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ ip = jt2ip(sdp, &jt);
++ if (!ip)
++ return -EINVAL;
++
++ if (jt.jio_size < sizeof(struct gfs_dinode))
++ return -EINVAL;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (error)
++ return error;
++
++ error = copy_to_user(jt.jio_data, &ip->i_di, sizeof(struct gfs_dinode));
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ if (error)
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * jtrunc_ioctl - Truncate a journaled file via ioctl
++ * @sdp: the filesystem
++ * @arg: The argument from ioctl
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++jtrunc_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_jio jt;
++ struct gfs_inode *ip;
++ struct gfs_holder i_gh;
++ int error;
++
++ if (copy_from_user(&jt, arg, sizeof(struct gfs_jio)))
++ return -EFAULT;
++
++ ip = jt2ip(sdp, &jt);
++ if (!ip)
++ return -EINVAL;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SYNC, &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_truncatei(ip, jt.jio_offset, NULL);
++
++ ip->i_gl->gl_vn++;
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * lock_dump - copy out info about the GFS lock space
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_user_buffer in user space
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++lock_dump(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_user_buffer ub;
++ int error;
++
++ if (copy_from_user(&ub, arg, sizeof(struct gfs_user_buffer)))
++ return -EFAULT;
++ ub.ub_count = 0;
++
++ error = gfs_dump_lockstate(sdp, &ub);
++ if (error)
++ return error;
++
++ if (copy_to_user(arg, &ub, sizeof(struct gfs_user_buffer)))
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * stat_gfs_ioctl - Do a GFS specific statfs
++ * @sdp: the filesystem
++ * @arg: the struct gfs_usage structure
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++stat_gfs_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_usage *u;
++ int error;
++
++ u = gmalloc(sizeof(struct gfs_usage));
++
++ error = gfs_stat_gfs(sdp, u, TRUE);
++ if (!error && copy_to_user(arg, u, sizeof(struct gfs_usage)))
++ error = -EFAULT;
++
++ kfree(u);
++
++ return error;
++}
++
++/**
++ * reclaim_ioctl - ioctl called to perform metadata reclamation
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_reclaim_stats in user space
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++reclaim_ioctl(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_reclaim_stats stats;
++ int error;
++
++ memset(&stats, 0, sizeof(struct gfs_reclaim_stats));
++
++ error = gfs_reclaim_metadata(sdp, &stats);
++ if (error)
++ return error;
++
++ if (copy_to_user(arg, &stats, sizeof(struct gfs_reclaim_stats)))
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * get_tune - pass the current tuneable parameters up to user space
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_tune in user space
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_tune(struct gfs_sbd *sdp, void *arg)
++{
++ if (copy_to_user(arg, &sdp->sd_tune, sizeof(struct gfs_tune)))
++ return -EFAULT;
++
++ return 0;
++}
++
++/**
++ * set_tune - replace the current tuneable parameters with a set from user space
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_tune in user space
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++set_tune(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_tune *gt;
++ int error = 0;
++
++ gt = gmalloc(sizeof(struct gfs_tune));
++
++ if (copy_from_user(gt, arg, sizeof(struct gfs_tune)))
++ error = -EFAULT;
++ else {
++ if (gt->gt_tune_version != GFS_TUNE_VERSION) {
++ printk("GFS: fsid=%s: invalid version of tuneable parameters\n",
++ sdp->sd_fsname);
++ error = -EINVAL;
++ } else
++ memcpy(&sdp->sd_tune, gt, sizeof(struct gfs_tune));
++ }
++
++ kfree(gt);
++
++ return error;
++}
++
++/**
++ * gfs_set_flag - set/clear a flag on an inode
++ * @ip: the inode
++ * @cmd: GFS_SET_FLAG or GFS_CLEAR_FLAG
++ * @arg: the flag to change (in user space)
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_set_flag(struct gfs_inode *ip, unsigned int cmd, void *arg)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_holder i_gh;
++ struct buffer_head *dibh;
++ uint32_t flag;
++ int error;
++
++ if (copy_from_user(&flag, arg, sizeof(uint32_t)))
++ return -EFAULT;
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (error)
++ return error;
++
++ error = -EACCES;
++ if (ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
++ goto out;
++
++ error = -EINVAL;
++
++ switch (flag) {
++ case GFS_DIF_EXHASH:
++ case GFS_DIF_UNUSED:
++ case GFS_DIF_EA_INDIRECT:
++ goto out;
++
++ case GFS_DIF_JDATA:
++ if (ip->i_di.di_type != GFS_FILE_REG || ip->i_di.di_size)
++ goto out;
++ break;
++
++ case GFS_DIF_DIRECTIO:
++ if (ip->i_di.di_type != GFS_FILE_REG)
++ goto out;
++ break;
++
++ case GFS_DIF_IMMUTABLE:
++ case GFS_DIF_APPENDONLY:
++ case GFS_DIF_NOATIME:
++ case GFS_DIF_SYNC:
++ /* FixMe!!! */
++ error = -ENOSYS;
++ goto out;
++
++ case GFS_DIF_INHERIT_DIRECTIO:
++ case GFS_DIF_INHERIT_JDATA:
++ if (ip->i_di.di_type != GFS_FILE_DIR)
++ goto out;
++ break;
++
++ default:
++ goto out;
++ }
++
++ error = gfs_trans_begin(sdp, 1, 0);
++ if (error)
++ goto out;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto out_trans_end;
++
++ if (cmd == GFS_SET_FLAG)
++ ip->i_di.di_flags |= flag;
++ else
++ ip->i_di.di_flags &= ~flag;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ out_trans_end:
++ gfs_trans_end(sdp);
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * handle_roll - Read an atomic_t as an unsigned int
++ * @a: a counter
++ *
++ * if @a is negative, reset it to zero
++ *
++ * Returns: the value of the counter
++ */
++
++static unsigned int
++handle_roll(atomic_t *a)
++{
++ int x = atomic_read(a);
++ if (x < 0) {
++ atomic_set(a, 0);
++ return 0;
++ }
++ return (unsigned int)x;
++}
++
++/**
++ * fill_counters - Write a FS' counters into a buffer
++ * @sdp: the filesystem
++ * @buf: the buffer
++ * @size: the size of the buffer
++ * @count: where we are in the buffer
++ *
++ * Returns: errno
++ */
++
++static int
++fill_counters(struct gfs_sbd *sdp,
++ char *buf, unsigned int size, unsigned int *count)
++{
++ int error = 0;
++
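++ /* gfs_sprintf() is presumably a macro that appends formatted text
++ to @buf at *count, bounded by @size, and jumps to the out: label
++ with an error once the buffer fills. */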
++ gfs_sprintf("sd_glock_count:locks::%d\n",
++ atomic_read(&sdp->sd_glock_count));
++ gfs_sprintf("sd_glock_held_count:locks held::%d\n",
++ atomic_read(&sdp->sd_glock_held_count));
++ gfs_sprintf("sd_inode_count:incore inodes::%d\n",
++ atomic_read(&sdp->sd_inode_count));
++ gfs_sprintf("sd_bufdata_count:metadata buffers::%d\n",
++ atomic_read(&sdp->sd_bufdata_count));
++ gfs_sprintf("sd_unlinked_ic_count:unlinked inodes::%d\n",
++ atomic_read(&sdp->sd_unlinked_ic_count));
++ gfs_sprintf("sd_quota_count:quota IDs::%d\n",
++ atomic_read(&sdp->sd_quota_count));
++ gfs_sprintf("sd_log_buffers:incore log buffers::%u\n",
++ sdp->sd_log_buffers);
++ gfs_sprintf("sd_log_seg_free:log segments free::%u\n",
++ sdp->sd_log_seg_free);
++ gfs_sprintf("ji_nsegment:log segments total::%u\n",
++ sdp->sd_jdesc.ji_nsegment);
++ gfs_sprintf("sd_mhc_count:meta header cache entries::%d\n",
++ atomic_read(&sdp->sd_mhc_count));
++ gfs_sprintf("sd_depend_count:glock dependencies::%d\n",
++ atomic_read(&sdp->sd_depend_count));
++ gfs_sprintf("sd_reclaim_count:glocks on reclaim list::%d\n",
++ atomic_read(&sdp->sd_reclaim_count));
++ gfs_sprintf("sd_log_wrap:log wraps::%"PRIu64"\n",
++ sdp->sd_log_wrap);
++ gfs_sprintf("sd_fh2dentry_misses:fh2dentry misses:diff:%u\n",
++ handle_roll(&sdp->sd_fh2dentry_misses));
++ gfs_sprintf("sd_reclaimed:glocks reclaimed:diff:%u\n",
++ handle_roll(&sdp->sd_reclaimed));
++ gfs_sprintf("sd_glock_nq_calls:glock nq calls:diff:%u\n",
++ handle_roll(&sdp->sd_glock_nq_calls));
++ gfs_sprintf("sd_glock_dq_calls:glock dq calls:diff:%u\n",
++ handle_roll(&sdp->sd_glock_dq_calls));
++ gfs_sprintf("sd_glock_prefetch_calls:glock prefetch calls:diff:%u\n",
++ handle_roll(&sdp->sd_glock_prefetch_calls));
++ gfs_sprintf("sd_lm_lock_calls:lm_lock calls:diff:%u\n",
++ handle_roll(&sdp->sd_lm_lock_calls));
++ gfs_sprintf("sd_lm_unlock_calls:lm_unlock calls:diff:%u\n",
++ handle_roll(&sdp->sd_lm_unlock_calls));
++ gfs_sprintf("sd_lm_callbacks:lm callbacks:diff:%u\n",
++ handle_roll(&sdp->sd_lm_callbacks));
++ gfs_sprintf("sd_ops_address:address operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_address));
++ gfs_sprintf("sd_ops_dentry:dentry operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_dentry));
++ gfs_sprintf("sd_ops_export:export operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_export));
++ gfs_sprintf("sd_ops_file:file operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_file));
++ gfs_sprintf("sd_ops_inode:inode operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_inode));
++ gfs_sprintf("sd_ops_super:super operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_super));
++ gfs_sprintf("sd_ops_vm:vm operations:diff:%u\n",
++ handle_roll(&sdp->sd_ops_vm));
++
++ out:
++ return error;
++}
++
++/**
++ * get_counters - return usage counters to user space
++ * @sdp: the filesystem
++ * @arg: the counter structure to fill
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_counters(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_user_buffer ub;
++ unsigned int size = sdp->sd_tune.gt_lockdump_size;
++ char *buf;
++ int error;
++
++ if (copy_from_user(&ub, arg, sizeof(struct gfs_user_buffer)))
++ return -EFAULT;
++ ub.ub_count = 0;
++
++ if (size > ub.ub_size)
++ size = ub.ub_size;
++
++ buf = kmalloc(size, GFP_KERNEL);
++ if (!buf)
++ return -ENOMEM;
++
++ error = fill_counters(sdp, buf, size, &ub.ub_count);
++ if (!error) {
++ if (copy_to_user(ub.ub_data, buf, ub.ub_count) ||
++ copy_to_user(arg, &ub, sizeof(struct gfs_user_buffer)))
++ error = -EFAULT;
++ }
++
++ kfree(buf);
++
++ return error;
++}
++
++/**
++ * gfs_ioctli - filesystem independent ioctl function
++ * @ip: the inode the ioctl was on
++ * @cmd: the ioctl number
++ * @arg: the argument (still in user space)
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_ioctli(struct gfs_inode *ip, unsigned int cmd, void *arg)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ int error = 0;
++
++ switch (cmd) {
++ case GFS_GET_META:
++ error = get_meta(ip, arg);
++ break;
++
++ case GFS_FILE_STAT:
++ error = file_stat(ip, arg);
++ break;
++
++ case GFS_SHRINK:
++ if (capable(CAP_SYS_ADMIN))
++ gfs_gl_hash_clear(sdp, FALSE);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_GET_ARGS:
++ if (copy_to_user(arg, &sdp->sd_args,
++ sizeof(struct gfs_args)))
++ error = -EFAULT;
++ break;
++
++ case GFS_GET_LOCKSTRUCT:
++ if (copy_to_user(arg, &sdp->sd_lockstruct,
++ sizeof(struct lm_lockstruct)))
++ error = -EFAULT;
++ break;
++
++ case GFS_GET_SUPER:
++ error = do_get_super(sdp, arg);
++ break;
++
++ case GFS_JREAD:
++ if (capable(CAP_SYS_ADMIN))
++ error = jread_ioctl(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_JWRITE:
++ if (capable(CAP_SYS_ADMIN))
++ error = jwrite_ioctl(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_JSTAT:
++ error = jstat_ioctl(sdp, arg);
++ break;
++
++ case GFS_JTRUNC:
++ if (capable(CAP_SYS_ADMIN))
++ error = jtrunc_ioctl(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_LOCK_DUMP:
++ if (capable(CAP_SYS_ADMIN))
++ error = lock_dump(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_STATGFS:
++ error = stat_gfs_ioctl(sdp, arg);
++ break;
++
++ case GFS_FREEZE:
++ if (capable(CAP_SYS_ADMIN))
++ error = gfs_freeze_fs(sdp);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_UNFREEZE:
++ if (capable(CAP_SYS_ADMIN))
++ gfs_unfreeze_fs(sdp);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_RECLAIM_METADATA:
++ if (capable(CAP_SYS_ADMIN))
++ error = reclaim_ioctl(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_QUOTA_SYNC:
++ if (capable(CAP_SYS_ADMIN))
++ error = gfs_quota_sync(sdp);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_QUOTA_REFRESH:
++ if (capable(CAP_SYS_ADMIN))
++ error = gfs_quota_refresh(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_QUOTA_READ:
++ /* Permissions handled later */
++ error = gfs_quota_read(sdp, arg);
++ break;
++
++ case GFS_GET_TUNE:
++ error = get_tune(sdp, arg);
++ break;
++
++ case GFS_SET_TUNE:
++ if (capable(CAP_SYS_ADMIN))
++ error = set_tune(sdp, arg);
++ else
++ error = -EACCES;
++ break;
++
++ case GFS_EATTR_GET:
++ /* Permissions handled later */
++ error = gfs_get_eattr_ioctl(sdp, ip, arg);
++ break;
++
++ case GFS_EATTR_SET:
++ /* Permissions handled later */
++ error = gfs_set_eattr_ioctl(sdp, ip, arg);
++ break;
++
++ case GFS_WHERE_ARE_YOU:
++ {
++ unsigned int x = GFS_MAGIC;
++ if (copy_to_user(arg, &x, sizeof(unsigned int)))
++ error = -EFAULT;
++ }
++ break;
++
++ case GFS_SET_FLAG:
++ case GFS_CLEAR_FLAG:
++ /* Permissions handled later */
++ error = gfs_set_flag(ip, cmd, arg);
++ break;
++
++ case GFS_GET_COUNTERS:
++ error = get_counters(sdp, arg);
++ break;
++
++ case GFS_FILE_FLUSH:
++ gfs_glock_force_drop(ip->i_gl);
++ break;
++
++ default:
++ error = -ENOTTY;
++ break;
++ }
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/ioctl.h linux-patched/fs/gfs/ioctl.h
+--- linux-orig/fs/gfs/ioctl.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ioctl.h 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,21 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __IOCTL_DOT_H__
++#define __IOCTL_DOT_H__
++
++int gfs_add_bh_to_ub(struct gfs_user_buffer *ub, struct buffer_head *bh);
++
++int gfs_ioctli(struct gfs_inode *ip, unsigned int cmd, void *arg);
++
++#endif /* __IOCTL_DOT_H__ */
+diff -urN linux-orig/fs/gfs/locking.c linux-patched/fs/gfs/locking.c
+--- linux-orig/fs/gfs/locking.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/locking.c 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,114 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "locking.h"
++#include "super.h"
++
++/**
++ * gfs_mount_lockproto - mount a locking protocol
++ * @sdp: the filesystem
++ * @silent: if TRUE, don't complain if the FS isn't a GFS fs
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_mount_lockproto(struct gfs_sbd *sdp, int silent)
++{
++ struct gfs_sb *sb = NULL;
++ struct buffer_head *bh;
++ char *proto, *table, *p = NULL;
++ int error = 0;
++
++ proto = sdp->sd_args.ar_lockproto;
++ table = sdp->sd_args.ar_locktable;
++
++ /* Try to autodetect */
++
++ if (!proto[0] || !table[0]) {
++ error = gfs_dread(sdp, GFS_SB_ADDR >> sdp->sd_fsb2bb_shift, NULL,
++ DIO_FORCE | DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto out;
++
++ sb = gmalloc(sizeof(struct gfs_sb));
++ gfs_sb_in(sb, bh->b_data);
++ brelse(bh);
++
++ error = gfs_check_sb(sdp, sb, silent);
++ if (error)
++ goto out;
++
++ if (!proto[0])
++ proto = sb->sb_lockproto;
++
++ if (!table[0])
++ table = sb->sb_locktable;
++ }
++
++ error = lm_mount(proto, table, sdp->sd_args.ar_hostdata,
++ gfs_glock_cb, sdp,
++ GFS_MIN_LVB_SIZE, &sdp->sd_lockstruct);
++ if (error) {
++ printk("GFS: can't mount proto = %s, table = %s, hostdata = %s\n",
++ proto, table, sdp->sd_args.ar_hostdata);
++ goto out;
++ }
++
++ GFS_ASSERT_SBD(sdp->sd_lockstruct.ls_lockspace, sdp,);
++ GFS_ASSERT_SBD(sdp->sd_lockstruct.ls_ops, sdp,);
++ GFS_ASSERT_SBD(sdp->sd_lockstruct.ls_lvb_size >= GFS_MIN_LVB_SIZE,
++ sdp,);
++
++ if (!*table) {
++ table = p = gmalloc(sizeof(sdp->sd_vfs->s_id) + 1);
++ strncpy(table, sdp->sd_vfs->s_id, sizeof(sdp->sd_vfs->s_id));
++ table[sizeof(sdp->sd_vfs->s_id)] = 0;
++ }
++
++ snprintf(sdp->sd_fsname, 256, "%s.%u", table,
++ sdp->sd_lockstruct.ls_jid);
++
++ if (p)
++ kfree(p);
++
++ out:
++ if (sb)
++ kfree(sb);
++
++ return error;
++}
++
++/**
++ * gfs_unmount_lockproto - Unmount lock protocol
++ * @sdp: The GFS superblock
++ *
++ */
++
++void
++gfs_unmount_lockproto(struct gfs_sbd *sdp)
++{
++ lm_unmount(&sdp->sd_lockstruct);
++}
+diff -urN linux-orig/fs/gfs/locking.h linux-patched/fs/gfs/locking.h
+--- linux-orig/fs/gfs/locking.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/locking.h 2004-06-20 22:48:17.950946122 -0500
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOCKING_DOT_H__
++#define __LOCKING_DOT_H__
++
++int gfs_mount_lockproto(struct gfs_sbd *sdp, int silent);
++void gfs_unmount_lockproto(struct gfs_sbd *sdp);
++
++#endif /* __LOCKING_DOT_H__ */
+diff -urN linux-orig/fs/gfs/log.c linux-patched/fs/gfs/log.c
+--- linux-orig/fs/gfs/log.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/log.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,1315 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ What rolls down stairs
++ Alone or in pairs
++ Rolls over your neighbor's dog.
++ What's great for a snack
++ And fits on your back
++ It's log, log, log!
++ It's lo-og, lo-og,
++ It's big, it's heavy, it's wood.
++ It's lo-og, lo-og,
++ It's better than bad, it's good.
++ Everyone wants a log,
++ You're gonna love it, log
++ Come on and get your log,
++ Everyone needs a log...
++ LOG... FROM BLAMMO!
++
++ -- The Ren and Stimpy Show
++*/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "log.h"
++#include "lops.h"
++
++/**
++ * gfs_struct2blk - compute the number of log blocks needed for structures
++ * @sdp: the filesystem
++ * @nstruct: the number of structures
++ * @ssize: the size of the structures
++ *
++ * Compute the number of log descriptor blocks needed to hold a certain number
++ * of structures of a certain size.
++ *
++ * Returns: the number of blocks needed
++ */
++
++unsigned int
++gfs_struct2blk(struct gfs_sbd *sdp, unsigned int nstruct, unsigned int ssize)
++{
++ unsigned int blks;
++ unsigned int first, second;
++
++ blks = 1;
++ first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs_log_descriptor)) / ssize;
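++	/* The first block also holds the log descriptor, so it fits
++	   fewer structures than the fully-packed blocks that follow. */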
++
++ if (nstruct > first) {
++ second = sdp->sd_sb.sb_bsize / ssize;
++ blks += DIV_RU(nstruct - first, second);
++ }
++
++ return blks;
++}
++
++/**
++ * gfs_blk2seg - Convert number of blocks into number of segments
++ * @sdp: The GFS superblock
++ * @blocks: The number of blocks
++ *
++ * Returns: The number of journal segments
++ */
++
++unsigned int
++gfs_blk2seg(struct gfs_sbd *sdp, unsigned int blocks)
++{
++ return DIV_RU(blocks, sdp->sd_sb.sb_seg_size - 1);
++}
++
++/**
++ * log_distance - Compute distance between two journal blocks
++ * @sdp: The GFS superblock
++ * @newer: The most recent journal block of the pair
++ * @older: The older journal block of the pair
++ *
++ * Compute the distance (in the journal direction) between two
++ * blocks in the journal
++ *
++ * Returns: the distance in blocks
++ */
++
++static __inline__ unsigned int
++log_distance(struct gfs_sbd *sdp, uint64_t newer, uint64_t older)
++{
++ int64_t dist;
++
++ dist = newer - older;
++ if (dist < 0)
++ dist += sdp->sd_jdesc.ji_nsegment * sdp->sd_sb.sb_seg_size;
++
++ return dist;
++}
++
++/**
++ * log_incr_head - Increment journal head
++ * @sdp: The GFS superblock
++ * @head: the variable holding the head of the journal
++ *
++ * Increment journal head by one.
++ * At the end of the journal, wrap head back to the start.
++ *
++ */
++
++static __inline__ void
++log_incr_head(struct gfs_sbd *sdp, uint64_t * head)
++{
++ struct gfs_jindex *jdesc = &sdp->sd_jdesc;
++
++ if (++*head ==
++ jdesc->ji_addr + jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size)
++ *head = jdesc->ji_addr;
++}
++
++/**
++ * gfs_ail_start - Start I/O on the AIL
++ * @sdp: the filesystem
++ * @flags: DIO_ALL to start I/O on every transaction in the AIL
++ *
++ */
++
++void
++gfs_ail_start(struct gfs_sbd *sdp, int flags)
++{
++ struct list_head *head = &sdp->sd_log_ail;
++ struct list_head *first, *tmp;
++ struct gfs_trans *first_tr, *tr;
++
++ gfs_log_lock(sdp);
++
++ if (list_empty(head)) {
++ gfs_log_unlock(sdp);
++ return;
++ }
++
++ first = head->prev;
++ first_tr = list_entry(first, struct gfs_trans, tr_list);
++ gfs_ail_start_trans(sdp, first_tr);
++
++ if (flags & DIO_ALL)
++ first_tr = NULL;
++
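++	/* Start I/O on the oldest transaction first, then walk toward the
++	   newer ones.  Unless DIO_ALL was requested, stop as soon as the
++	   oldest transaction's I/O has completed. */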
++ for (tmp = first->prev; tmp != head; tmp = tmp->prev) {
++ if (first_tr && gfs_ail_empty_trans(sdp, first_tr))
++ break;
++
++ tr = list_entry(tmp, struct gfs_trans, tr_list);
++ gfs_ail_start_trans(sdp, tr);
++ }
++
++ gfs_log_unlock(sdp);
++}
++
++/**
++ * current_tail - Find block number of current log tail
++ * @sdp: The GFS superblock
++ *
++ * Find the block number of the current tail of the log.
++ * Assumes that the log lock is held.
++ *
++ * Returns: The tail's block number
++ */
++
++static uint64_t
++current_tail(struct gfs_sbd *sdp)
++{
++ struct gfs_trans *tr;
++ uint64_t tail;
++
++ if (list_empty(&sdp->sd_log_ail)) {
++ tail = sdp->sd_log_head;
++
++ if (!gfs_log_is_header(sdp, tail)) {
++ tail--;
++ GFS_ASSERT_SBD(gfs_log_is_header(sdp, tail), sdp,);
++ }
++ } else {
++ tr = list_entry(sdp->sd_log_ail.prev,
++ struct gfs_trans, tr_list);
++ tail = tr->tr_first_head;
++ }
++
++ return tail;
++}
++
++/**
++ * gfs_ail_empty - move the tail of the log forward (if possible)
++ * @sdp: the filesystem
++ *
++ * Returns: TRUE if the AIL is empty
++ */
++
++int
++gfs_ail_empty(struct gfs_sbd *sdp)
++{
++ struct list_head *head, *tmp, *prev;
++ struct gfs_trans *tr;
++ uint64_t oldtail, newtail;
++ unsigned int dist;
++ unsigned int segments;
++ int ret;
++
++ gfs_log_lock(sdp);
++
++ oldtail = current_tail(sdp);
++
++ for (head = &sdp->sd_log_ail, tmp = head->prev, prev = tmp->prev;
++ tmp != head;
++ tmp = prev, prev = tmp->prev) {
++ tr = list_entry(tmp, struct gfs_trans, tr_list);
++
++ if (gfs_ail_empty_trans(sdp, tr)) {
++ list_del(&tr->tr_list);
++ kfree(tr);
++ }
++ }
++
++ newtail = current_tail(sdp);
++
++ if (oldtail != newtail) {
++ dist = log_distance(sdp, newtail, oldtail);
++
++ segments = dist / sdp->sd_sb.sb_seg_size;
++ GFS_ASSERT_SBD(segments * sdp->sd_sb.sb_seg_size == dist, sdp,);
++
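++		/* The tail moved, so return the freed segments to the pool.
++		   The current log head always occupies at least one segment,
++		   so the free count stays strictly below the total. */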
++ spin_lock(&sdp->sd_log_seg_lock);
++ sdp->sd_log_seg_free += segments;
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free < sdp->sd_jdesc.ji_nsegment,
++ sdp,);
++ spin_unlock(&sdp->sd_log_seg_lock);
++ }
++
++ ret = list_empty(head);
++
++ gfs_log_unlock(sdp);
++
++ return ret;
++}
++
++/**
++ * gfs_log_reserve - Make a log reservation
++ * @sdp: The GFS superblock
++ * @segments: The number of segments to reserve
++ * @jump_queue: if TRUE, don't care about fairness ordering
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_log_reserve(struct gfs_sbd *sdp, unsigned int segments, int jump_queue)
++{
++ unsigned long start;
++ struct list_head list;
++ unsigned int try = 0;
++
++ GFS_ASSERT_SBD(segments, sdp,);
++
++ if (segments >= sdp->sd_jdesc.ji_nsegment) {
++ printk("GFS: fsid=%s: error reserving log space (%u, %u)\n",
++ sdp->sd_fsname, segments, sdp->sd_jdesc.ji_nsegment);
++ return -EINVAL;
++ }
++
++ INIT_LIST_HEAD(&list);
++ start = jiffies;
++
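++	/* Reservations are granted in FIFO order: each caller queues a node
++	   on sd_log_seg_list and only the head of the queue may take
++	   segments.  jump_queue callers push in at the front instead. */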
++ for (;;) {
++ spin_lock(&sdp->sd_log_seg_lock);
++
++ if (list_empty(&list)) {
++ if (jump_queue)
++ list_add(&list, &sdp->sd_log_seg_list);
++ else {
++ list_add_tail(&list, &sdp->sd_log_seg_list);
++ while (sdp->sd_log_seg_list.next != &list) {
++ DECLARE_WAITQUEUE(__wait_chan, current);
++ current->state = TASK_UNINTERRUPTIBLE;
++ add_wait_queue(&sdp->sd_log_seg_wait,
++ &__wait_chan);
++ spin_unlock(&sdp->sd_log_seg_lock);
++ schedule();
++ spin_lock(&sdp->sd_log_seg_lock);
++ remove_wait_queue(&sdp->sd_log_seg_wait,
++ &__wait_chan);
++ current->state = TASK_RUNNING;
++ }
++ }
++ }
++
++ if (sdp->sd_log_seg_free >= segments) {
++ sdp->sd_log_seg_free -= segments;
++ list_del(&list);
++ spin_unlock(&sdp->sd_log_seg_lock);
++ wake_up(&sdp->sd_log_seg_wait);
++ break;
++ }
++
++ spin_unlock(&sdp->sd_log_seg_lock);
++
++ if (try) {
++ gfs_log_flush(sdp);
++ gfs_ail_start(sdp, 0);
++ }
++
++ gfs_ail_empty(sdp);
++
++ try++;
++ if (time_after_eq(jiffies, start + 60 * HZ))
++ printk("GFS: fsid=%s: pid %d can't make log reservation (asking for %u segments)\n",
++ sdp->sd_fsname, current->pid, segments);
++ yield();
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_log_release - Release a given number of log segments
++ * @sdp: The GFS superblock
++ * @segments: The number of segments
++ *
++ */
++
++void
++gfs_log_release(struct gfs_sbd *sdp, unsigned int segments)
++{
++ spin_lock(&sdp->sd_log_seg_lock);
++ sdp->sd_log_seg_free += segments;
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free < sdp->sd_jdesc.ji_nsegment, sdp,);
++ spin_unlock(&sdp->sd_log_seg_lock);
++}
++
++/**
++ * log_get_header - Get the journal header buffer
++ * @sdp: The GFS superblock
++ * @tr: The transaction
++ * @next: TRUE if this is not a continuation of an existing transaction
++ *
++ * Returns: the log buffer
++ */
++
++static struct gfs_log_buf *
++log_get_header(struct gfs_sbd *sdp, struct gfs_trans *tr, int next)
++{
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++ struct gfs_log_header header;
++
++ GFS_ASSERT_SBD(gfs_log_is_header(sdp, tr->tr_log_head), sdp,);
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bufs &&
++ !list_empty(&tr->tr_free_bufs), sdp,);
++ lb = list_entry(tr->tr_free_bufs.next, struct gfs_log_buf, lb_list);
++ list_del(&lb->lb_list);
++ tr->tr_num_free_bufs--;
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bmem &&
++ !list_empty(&tr->tr_free_bmem), sdp,);
++ bmem = tr->tr_free_bmem.next;
++ list_del(bmem);
++ tr->tr_num_free_bmem--;
++
++ gfs_logbh_init(sdp, &lb->lb_bh, tr->tr_log_head, (char *)bmem);
++ memset(bmem, 0, sdp->sd_sb.sb_bsize);
++
++ memset(&header, 0, sizeof (header));
++
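++	/* A "next" header starts a new sequence and commits the transaction;
++	   otherwise this is a continuation header inside the transaction
++	   and is queued with the rest of the transaction's buffers. */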
++ if (next) {
++ header.lh_header.mh_magic = GFS_MAGIC;
++ header.lh_header.mh_type = GFS_METATYPE_LH;
++ header.lh_header.mh_format = GFS_FORMAT_LH;
++ header.lh_first = tr->tr_log_head;
++ header.lh_sequence = sdp->sd_sequence + 1;
++ header.lh_tail = current_tail(sdp);
++ header.lh_last_dump = sdp->sd_log_dump_last;
++ } else {
++ header.lh_header.mh_magic = GFS_MAGIC;
++ header.lh_header.mh_type = GFS_METATYPE_LH;
++ header.lh_header.mh_format = GFS_FORMAT_LH;
++ header.lh_first = tr->tr_first_head;
++ header.lh_sequence = sdp->sd_sequence;
++ header.lh_tail = current_tail(sdp);
++ header.lh_last_dump = sdp->sd_log_dump_last;
++
++ list_add(&lb->lb_list, &tr->tr_bufs);
++ }
++
++ gfs_log_header_out(&header, lb->lb_bh.b_data);
++ gfs_log_header_out(&header,
++ lb->lb_bh.b_data + GFS_BASIC_BLOCK -
++ sizeof(struct gfs_log_header));
++
++ log_incr_head(sdp, &tr->tr_log_head);
++
++ return lb;
++}
++
++/**
++ * gfs_log_get_buf - Get a buffer to use for control data
++ * @sdp: The GFS superblock
++ * @tr: The GFS transaction
++ *
++ * Generate a regular buffer for use in the journal as control data.
++ *
++ * Returns: the buffer
++ */
++
++struct gfs_log_buf *
++gfs_log_get_buf(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++
++ if (gfs_log_is_header(sdp, tr->tr_log_head))
++ log_get_header(sdp, tr, FALSE);
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bufs &&
++ !list_empty(&tr->tr_free_bufs), sdp,);
++ lb = list_entry(tr->tr_free_bufs.next, struct gfs_log_buf, lb_list);
++ list_del(&lb->lb_list);
++ tr->tr_num_free_bufs--;
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bmem
++ && !list_empty(&tr->tr_free_bmem), sdp,);
++ bmem = tr->tr_free_bmem.next;
++ list_del(bmem);
++ tr->tr_num_free_bmem--;
++
++ gfs_logbh_init(sdp, &lb->lb_bh, tr->tr_log_head, (char *)bmem);
++ memset(bmem, 0, sdp->sd_sb.sb_bsize);
++
++ list_add(&lb->lb_list, &tr->tr_bufs);
++
++ log_incr_head(sdp, &tr->tr_log_head);
++
++ return lb;
++}
++
++/**
++ * gfs_log_fake_buf - Build a fake buffer head
++ * @sdp: the filesystem
++ * @tr: the transaction this is part of
++ * @data: the data the buffer should point to
++ * @unlock: a buffer that is unlocked as this struct gfs_log_buf is torn down
++ *
++ */
++
++void
++gfs_log_fake_buf(struct gfs_sbd *sdp, struct gfs_trans *tr, char *data,
++ struct buffer_head *unlock)
++{
++ struct gfs_log_buf *lb;
++
++ if (gfs_log_is_header(sdp, tr->tr_log_head))
++ log_get_header(sdp, tr, FALSE);
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bufs &&
++ !list_empty(&tr->tr_free_bufs), sdp,);
++ lb = list_entry(tr->tr_free_bufs.next, struct gfs_log_buf, lb_list);
++ list_del(&lb->lb_list);
++ tr->tr_num_free_bufs--;
++
++ gfs_logbh_init(sdp, &lb->lb_bh, tr->tr_log_head, data);
++ lb->lb_unlock = unlock;
++
++ list_add(&lb->lb_list, &tr->tr_bufs);
++
++ log_incr_head(sdp, &tr->tr_log_head);
++}
++
++/**
++ * check_seg_usage - Check that we didn't use too many segments
++ * @sdp: The GFS superblock
++ * @tr: The transaction
++ *
++ * Also, make sure we don't ever get to a point where there are no
++ * dumps in the log (which would corrupt the log). Panic before we
++ * let that happen.
++ *
++ */
++
++static void
++check_seg_usage(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_jindex *jdesc = &sdp->sd_jdesc;
++ unsigned int dist;
++ unsigned int segments;
++ uint64_t head_off, head_wrap;
++ uint64_t dump_off, dump_wrap;
++
++ dist = log_distance(sdp, tr->tr_log_head, tr->tr_first_head);
++
++ segments = dist / sdp->sd_sb.sb_seg_size;
++ GFS_ASSERT_SBD(segments * sdp->sd_sb.sb_seg_size == dist, sdp,);
++ GFS_ASSERT_SBD(segments == tr->tr_seg_reserved, sdp,);
++
++ if (sdp->sd_log_dump_last) {
++ head_off = tr->tr_first_head +
++ tr->tr_seg_reserved * sdp->sd_sb.sb_seg_size;
++ head_wrap = sdp->sd_log_wrap;
++ if (head_off >= jdesc->ji_addr +
++ jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size) {
++ head_off -= jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size;
++ head_wrap++;
++ }
++
++ dump_off = sdp->sd_log_dump_last;
++ dump_wrap = sdp->sd_log_dump_last_wrap;
++
++ switch (head_wrap - dump_wrap) {
++ case 0:
++ break;
++
++ case 1:
++ if (head_off < dump_off)
++ break;
++ else if (head_off == dump_off &&
++ (tr->tr_flags & TRF_LOG_DUMP))
++ break;
++
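++		/* fall through: the head has caught up with the last dump */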
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,
++ printk("head_off = %"PRIu64", head_wrap = %"PRIu64"\n",
++ head_off, head_wrap);
++ printk("dump_off = %"PRIu64", dump_wrap = %"PRIu64"\n",
++ dump_off, dump_wrap););
++ break;
++ }
++ }
++}
++
++/**
++ * log_free_buf - Free a struct gfs_log_buf (and possibly the data it points to)
++ * @sdp: the filesystem
++ * @lb: the log buffer
++ *
++ */
++
++static void
++log_free_buf(struct gfs_sbd *sdp, struct gfs_log_buf *lb)
++{
++ char *bmem;
++
++ bmem = lb->lb_bh.b_data;
++ gfs_logbh_uninit(sdp, &lb->lb_bh);
++
++ if (lb->lb_unlock)
++ gfs_unlock_buffer(lb->lb_unlock);
++ else
++ kfree(bmem);
++
++ kfree(lb);
++}
++
++/**
++ * sync_trans - Add "last" descriptor to transaction and sync to disk
++ * @sdp: The GFS superblock
++ * @tr: The transaction
++ *
++ * Add the "last" descriptor on to the end of the current transaction
++ * and sync it out to disk. Don't commit it yet, though.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++sync_trans(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head, *prev;
++ struct gfs_log_descriptor desc;
++ struct gfs_log_buf *lb;
++ uint64_t blk;
++ int error = 0, e;
++
++ /* Build LAST descriptor */
++
++ lb = gfs_log_get_buf(sdp, tr);
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = GFS_LOG_DESC_LAST;
++ desc.ld_length = 1;
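++	/* Claim everything from here to the next segment boundary, padding
++	   the transaction out to a whole number of segments. */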
++ for (blk = tr->tr_log_head; !gfs_log_is_header(sdp, blk); blk++)
++ desc.ld_length++;
++ gfs_desc_out(&desc, lb->lb_bh.b_data);
++
++ while (!gfs_log_is_header(sdp, tr->tr_log_head))
++ log_incr_head(sdp, &tr->tr_log_head);
++
++ check_seg_usage(sdp, tr);
++
++ /* Start I/O
++ Go in "prev" direction to start the I/O in order. */
++
++ for (head = &tr->tr_bufs, tmp = head->prev, prev = tmp->prev;
++ tmp != head;
++ tmp = prev, prev = tmp->prev) {
++ lb = list_entry(tmp, struct gfs_log_buf, lb_list);
++
++ if (error) {
++ list_del(&lb->lb_list);
++ log_free_buf(sdp, lb);
++ } else {
++ e = gfs_logbh_start(sdp, &lb->lb_bh);
++ if (e) {
++ list_del(&lb->lb_list);
++ log_free_buf(sdp, lb);
++ error = e;
++ }
++ }
++ }
++
++ /* Wait on I/O
++ Go in "next" direction to minimize sleeps/wakeups. */
++
++ while (!list_empty(&tr->tr_bufs)) {
++ lb = list_entry(tr->tr_bufs.next, struct gfs_log_buf, lb_list);
++
++ e = gfs_logbh_wait(sdp, &lb->lb_bh);
++ if (e)
++ error = e;
++
++ list_del(&lb->lb_list);
++ log_free_buf(sdp, lb);
++ }
++
++ return error;
++}
++
++/**
++ * commit_trans - Commit the current transaction
++ * @sdp: The GFS superblock
++ * @tr: The transaction
++ *
++ * Write out the next journal header to commit the transaction.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++commit_trans(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_log_buf *lb;
++ int error;
++
++ lb = log_get_header(sdp, tr, TRUE);
++
++ error = gfs_logbh_start(sdp, &lb->lb_bh);
++ if (!error)
++ error = gfs_logbh_wait(sdp, &lb->lb_bh);
++
++ log_free_buf(sdp, lb);
++
++ return error;
++}
++
++/**
++ * disk_commit - Write a transaction to the on-disk journal
++ * @sdp: The GFS superblock
++ * @tr: The transaction
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++disk_commit(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ uint64_t last_dump, last_dump_wrap;
++ int error = 0;
++
++ GFS_ASSERT_SBD(!test_bit(SDF_ROFS, &sdp->sd_flags), sdp,);
++ tr->tr_log_head = sdp->sd_log_head;
++ tr->tr_first_head = tr->tr_log_head - 1;
++ GFS_ASSERT_SBD(gfs_log_is_header(sdp, tr->tr_first_head), sdp,);
++
++ LO_BUILD_BHLIST(sdp, tr);
++
++ GFS_ASSERT_SBD(!list_empty(&tr->tr_bufs), sdp,);
++
++ error = sync_trans(sdp, tr);
++ if (error) {
++ /* Eat unusable commit buffer */
++ log_free_buf(sdp, log_get_header(sdp, tr, TRUE));
++ goto out;
++ }
++
++ if (tr->tr_flags & TRF_LOG_DUMP) {
++ /* This commit header should point to the log dump we're
++		   committing as the current one.  But save a copy of the
++		   old one in case we have problems committing the dump. */
++
++ last_dump = sdp->sd_log_dump_last;
++ last_dump_wrap = sdp->sd_log_dump_last_wrap;
++
++ sdp->sd_log_dump_last = tr->tr_first_head;
++ sdp->sd_log_dump_last_wrap = sdp->sd_log_wrap;
++
++ error = commit_trans(sdp, tr);
++ if (error) {
++ sdp->sd_log_dump_last = last_dump;
++ sdp->sd_log_dump_last_wrap = last_dump_wrap;
++ goto out;
++ }
++ } else {
++ error = commit_trans(sdp, tr);
++ if (error)
++ goto out;
++ }
++
++ if (sdp->sd_log_head > tr->tr_log_head)
++ sdp->sd_log_wrap++;
++ sdp->sd_log_head = tr->tr_log_head;
++ sdp->sd_sequence++;
++
++ out:
++ GFS_ASSERT_SBD(!tr->tr_num_free_bufs &&
++ list_empty(&tr->tr_free_bufs), sdp,);
++ GFS_ASSERT_SBD(!tr->tr_num_free_bmem &&
++ list_empty(&tr->tr_free_bmem), sdp,);
++
++ return error;
++}
++
++/**
++ * add_trans_to_ail - Add an on-disk committed transaction to the AIL
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ */
++
++static void
++add_trans_to_ail(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_log_element *le;
++
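++	/* Each LO_ADD_TO_AIL call unlinks the element from tr_elements
++	   (see the per-type add_to_ail handlers), so this loop terminates. */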
++ while (!list_empty(&tr->tr_elements)) {
++ le = list_entry(tr->tr_elements.next,
++ struct gfs_log_element, le_list);
++ LO_ADD_TO_AIL(sdp, le);
++ }
++
++ list_add(&tr->tr_list, &sdp->sd_log_ail);
++}
++
++/**
++ * log_refund - Refund log segments to the free pool
++ * @sdp: The GFS superblock
++ * @tr: The transaction to examine
++ *
++ * Look at the number of segments reserved for this transaction and the
++ * number of segments actually needed for it. If they aren't the
++ * same, refund the difference to the free segment pool.
++ *
++ * Called with the log lock held
++ */
++
++static void
++log_refund(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++ unsigned int num_bufs = 0, num_bmem = 0;
++ unsigned int segments;
++
++ LO_TRANS_SIZE(sdp, tr, NULL, NULL, &num_bufs, &num_bmem);
++
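++	/* Every segment written needs its own header block, plus one more
++	   block for the commit header that follows the transaction. */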
++ segments = gfs_blk2seg(sdp, num_bufs + 1);
++ num_bufs += segments + 1;
++ num_bmem += segments + 1;
++
++ if (tr->tr_seg_reserved > segments) {
++ spin_lock(&sdp->sd_log_seg_lock);
++ sdp->sd_log_seg_free += tr->tr_seg_reserved - segments;
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free < sdp->sd_jdesc.ji_nsegment,
++ sdp,);
++ spin_unlock(&sdp->sd_log_seg_lock);
++
++ tr->tr_seg_reserved = segments;
++ } else
++ GFS_ASSERT_SBD(tr->tr_seg_reserved == segments, sdp,);
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bufs >= num_bufs, sdp,);
++ while (tr->tr_num_free_bufs > num_bufs) {
++ lb = list_entry(tr->tr_free_bufs.next,
++ struct gfs_log_buf, lb_list);
++ list_del(&lb->lb_list);
++ kfree(lb);
++ tr->tr_num_free_bufs--;
++ }
++
++ GFS_ASSERT_SBD(tr->tr_num_free_bmem >= num_bmem, sdp,);
++ while (tr->tr_num_free_bmem > num_bmem) {
++ bmem = tr->tr_free_bmem.next;
++ list_del(bmem);
++ kfree(bmem);
++ tr->tr_num_free_bmem--;
++ }
++}
++
++/**
++ * trans_combine - combine two transactions
++ * @sdp: the filesystem
++ * @tr: the surviving transaction
++ * @new_tr: the transaction that gets freed
++ *
++ * Assumes that the two transactions are independent.
++ */
++
++static void
++trans_combine(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_trans *new_tr)
++{
++ struct gfs_log_element *le;
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++
++ tr->tr_file = __FILE__;
++ tr->tr_line = __LINE__;
++ tr->tr_seg_reserved += new_tr->tr_seg_reserved;
++ tr->tr_flags |= new_tr->tr_flags;
++ tr->tr_num_free_bufs += new_tr->tr_num_free_bufs;
++ tr->tr_num_free_bmem += new_tr->tr_num_free_bmem;
++
++ /* Combine the elements of the two transactions */
++
++ while (!list_empty(&new_tr->tr_elements)) {
++ le = list_entry(new_tr->tr_elements.next,
++ struct gfs_log_element, le_list);
++ GFS_ASSERT_SBD(le->le_trans == new_tr, sdp,);
++ le->le_trans = tr;
++ list_move(&le->le_list, &tr->tr_elements);
++ }
++
++ LO_TRANS_COMBINE(sdp, tr, new_tr);
++
++ while (!list_empty(&new_tr->tr_free_bufs)) {
++ lb = list_entry(new_tr->tr_free_bufs.next,
++ struct gfs_log_buf, lb_list);
++ list_move(&lb->lb_list, &tr->tr_free_bufs);
++ new_tr->tr_num_free_bufs--;
++ }
++ while (!list_empty(&new_tr->tr_free_bmem)) {
++ bmem = new_tr->tr_free_bmem.next;
++ list_move(bmem, &tr->tr_free_bmem);
++ new_tr->tr_num_free_bmem--;
++ }
++
++ GFS_ASSERT_SBD(!new_tr->tr_num_free_bufs, sdp,);
++ GFS_ASSERT_SBD(!new_tr->tr_num_free_bmem, sdp,);
++
++ kfree(new_tr);
++}
++
++/**
++ * log_flush_internal - flush incore transactions
++ * @sdp: the filesystem
++ * @gl: The glock structure to flush. If NULL, flush the whole incore log
++ *
++ */
++
++static void
++log_flush_internal(struct gfs_sbd *sdp, struct gfs_glock *gl)
++{
++ struct gfs_trans *trans = NULL, *tr;
++ int error;
++
++ gfs_log_lock(sdp);
++
++ if (list_empty(&sdp->sd_log_incore))
++ goto out;
++
++ if (gl) {
++ if (!gl->gl_incore_le.le_trans)
++ goto out;
++
++ trans = gl->gl_incore_le.le_trans;
++
++ list_del(&trans->tr_list);
++ } else {
++ while (!list_empty(&sdp->sd_log_incore)) {
++ tr = list_entry(sdp->sd_log_incore.next,
++ struct gfs_trans, tr_list);
++
++ list_del(&tr->tr_list);
++
++ if (trans)
++ trans_combine(sdp, trans, tr);
++ else
++ trans = tr;
++ }
++ }
++
++ GFS_ASSERT_SBD(trans, sdp,);
++
++ log_refund(sdp, trans);
++
++ /* Actually do the stuff to commit the transaction */
++
++ error = disk_commit(sdp, trans);
++ if (error)
++ gfs_io_error(sdp);
++
++ add_trans_to_ail(sdp, trans);
++
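++	/* Request a log dump once the head has moved more than
++	   1/GFS_DUMPS_PER_LOG of the journal past the last dump. */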
++ if (log_distance(sdp, sdp->sd_log_head, sdp->sd_log_dump_last) * GFS_DUMPS_PER_LOG >=
++ sdp->sd_jdesc.ji_nsegment * sdp->sd_sb.sb_seg_size)
++ set_bit(SDF_NEED_LOG_DUMP, &sdp->sd_flags);
++
++ out:
++ if (list_empty(&sdp->sd_log_incore))
++ sdp->sd_vfs->s_dirt = FALSE;
++
++ gfs_log_unlock(sdp);
++
++ /* Dump if we need to. */
++
++ if (test_bit(SDF_NEED_LOG_DUMP, &sdp->sd_flags))
++ gfs_log_dump(sdp, FALSE);
++}
++
++/**
++ * gfs_log_flush - flush the whole incore log
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_log_flush(struct gfs_sbd *sdp)
++{
++ log_flush_internal(sdp, NULL);
++}
++
++/**
++ * gfs_log_flush_glock - flush the incore log for a glock
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_log_flush_glock(struct gfs_glock *gl)
++{
++ log_flush_internal(gl->gl_sbd, gl);
++}
++
++/**
++ * incore_commit - commit a transaction in-core
++ * @sdp: the filesystem
++ * @new_tr: the transaction to commit
++ *
++ * Add the transaction @new_tr to the end of the incore commit list.
++ * Pull up and merge any previously committed transactions that share
++ * locks. Also pull up any rename transactions that need it.
++ */
++
++static void
++incore_commit(struct gfs_sbd *sdp, struct gfs_trans *new_tr)
++{
++ struct gfs_log_element *le;
++ struct gfs_trans *trans = NULL, *exist_tr;
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++ struct list_head *tmp, *head, *next;
++
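++	/* First, find any incore transactions that share log elements with
++	   the new one and combine them into a single transaction. */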
++ for (head = &new_tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++
++ exist_tr = LO_OVERLAP_TRANS(sdp, le);
++ if (!exist_tr)
++ continue;
++
++ if (exist_tr != trans) {
++ list_del(&exist_tr->tr_list);
++ if (trans)
++ trans_combine(sdp, trans, exist_tr);
++ else
++ trans = exist_tr;
++ }
++ }
++
++ if (trans) {
++ trans->tr_file = __FILE__;
++ trans->tr_line = __LINE__;
++ trans->tr_seg_reserved += new_tr->tr_seg_reserved;
++ trans->tr_flags |= new_tr->tr_flags;
++ trans->tr_num_free_bufs += new_tr->tr_num_free_bufs;
++ trans->tr_num_free_bmem += new_tr->tr_num_free_bmem;
++
++ while (!list_empty(&new_tr->tr_free_bufs)) {
++ lb = list_entry(new_tr->tr_free_bufs.next,
++ struct gfs_log_buf, lb_list);
++ list_move(&lb->lb_list, &trans->tr_free_bufs);
++ new_tr->tr_num_free_bufs--;
++ }
++ while (!list_empty(&new_tr->tr_free_bmem)) {
++ bmem = new_tr->tr_free_bmem.next;
++ list_move(bmem, &trans->tr_free_bmem);
++ new_tr->tr_num_free_bmem--;
++ }
++ } else
++ trans = new_tr;
++
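++	/* Now commit each of the new transaction's elements into the
++	   (possibly combined) incore transaction. */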
++ for (head = &new_tr->tr_elements, tmp = head->next, next = tmp->next;
++ tmp != head;
++ tmp = next, next = next->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ LO_INCORE_COMMIT(sdp, trans, le);
++ }
++
++ if (trans != new_tr) {
++ GFS_ASSERT_SBD(!new_tr->tr_num_free_bufs, sdp,);
++ GFS_ASSERT_SBD(!new_tr->tr_num_free_bmem, sdp,);
++ GFS_ASSERT_SBD(list_empty(&new_tr->tr_elements), sdp,);
++ kfree(new_tr);
++ }
++
++ log_refund(sdp, trans);
++
++ list_add(&trans->tr_list, &sdp->sd_log_incore);
++}
++
++/**
++ * gfs_log_commit - Commit a transaction to the log
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ */
++
++void
++gfs_log_commit(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++ unsigned int num_mblks = 0, num_eblks = 0, num_bufs = 0, num_bmem = 0;
++ unsigned int segments;
++
++ LO_TRANS_SIZE(sdp, tr, &num_mblks, &num_eblks, &num_bufs, &num_bmem);
++
++ GFS_ASSERT_SBD(num_mblks <= tr->tr_mblks_asked &&
++ num_eblks <= tr->tr_eblks_asked, sdp,
++ printk("type = (%s, %u)\n",
++ tr->tr_file, tr->tr_line);
++ printk("num_mblks = %u, tr->tr_mblks_asked = %u\n",
++ num_mblks, tr->tr_mblks_asked);
++ printk("num_eblks = %u, tr->tr_eblks_asked = %u\n",
++ num_eblks, tr->tr_eblks_asked););
++
++ segments = gfs_blk2seg(sdp, num_bufs + 1);
++ num_bufs += segments + 1;
++ num_bmem += segments + 1;
++
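++	/* Preallocate the log buffer descriptors and block-sized buffers
++	   before taking the log lock. */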
++ while (num_bufs--) {
++ lb = gmalloc(sizeof(struct gfs_log_buf));
++ memset(lb, 0, sizeof(struct gfs_log_buf));
++ list_add(&lb->lb_list, &tr->tr_free_bufs);
++ tr->tr_num_free_bufs++;
++ }
++ while (num_bmem--) {
++ bmem = gmalloc(sdp->sd_sb.sb_bsize);
++ list_add(bmem, &tr->tr_free_bmem);
++ tr->tr_num_free_bmem++;
++ }
++
++ gfs_log_lock(sdp);
++
++ incore_commit(sdp, tr);
++
++ if (sdp->sd_log_buffers > sdp->sd_tune.gt_incore_log_blocks) {
++ gfs_log_unlock(sdp);
++ gfs_log_flush(sdp);
++ } else {
++ sdp->sd_vfs->s_dirt = TRUE;
++ gfs_log_unlock(sdp);
++ }
++}
++
++/**
++ * gfs_log_dump - make a Log Dump entry in the log
++ * @sdp: the filesystem
++ * @force: if TRUE, always make the dump even if one has been made recently
++ *
++ */
++
++void
++gfs_log_dump(struct gfs_sbd *sdp, int force)
++{
++ struct gfs_log_element *le;
++ struct gfs_trans tr;
++ struct gfs_log_buf *lb;
++ struct list_head *bmem;
++ unsigned int num_bufs, num_bmem;
++ unsigned int segments;
++ int error;
++
++ if (test_and_set_bit(SDF_IN_LOG_DUMP, &sdp->sd_flags)) {
++ GFS_ASSERT_SBD(!force, sdp,);
++ return;
++ }
++
++ memset(&tr, 0, sizeof(struct gfs_trans));
++ INIT_LIST_HEAD(&tr.tr_elements);
++ INIT_LIST_HEAD(&tr.tr_free_bufs);
++ INIT_LIST_HEAD(&tr.tr_free_bmem);
++ INIT_LIST_HEAD(&tr.tr_bufs);
++ tr.tr_flags = TRF_LOG_DUMP;
++ tr.tr_file = __FILE__;
++ tr.tr_line = __LINE__;
++
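++	/* Size the dump under the log lock, then drop the lock to reserve
++	   segments and allocate buffers.  Loop, because the required size
++	   may have changed while the lock was released. */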
++ for (;;) {
++ gfs_log_lock(sdp);
++
++ if (!force && !test_bit(SDF_NEED_LOG_DUMP, &sdp->sd_flags))
++ goto out;
++
++ num_bufs = num_bmem = 0;
++ LO_DUMP_SIZE(sdp, NULL, &num_bufs, &num_bmem);
++ GFS_ASSERT_SBD(num_bufs, sdp,);
++ segments = gfs_blk2seg(sdp, num_bufs + 1);
++ num_bufs += segments + 1;
++ num_bmem += segments + 1;
++
++ if (tr.tr_seg_reserved >= segments &&
++ tr.tr_num_free_bufs >= num_bufs &&
++ tr.tr_num_free_bmem >= num_bmem)
++ break;
++
++ gfs_log_unlock(sdp);
++
++ if (tr.tr_seg_reserved < segments) {
++ error = gfs_log_reserve(sdp,
++ segments - tr.tr_seg_reserved,
++ TRUE);
++ GFS_ASSERT_SBD(!error, sdp,);
++ tr.tr_seg_reserved = segments;
++ }
++ while (tr.tr_num_free_bufs < num_bufs) {
++ lb = gmalloc(sizeof(struct gfs_log_buf));
++ memset(lb, 0, sizeof(struct gfs_log_buf));
++ list_add(&lb->lb_list, &tr.tr_free_bufs);
++ tr.tr_num_free_bufs++;
++ }
++ while (tr.tr_num_free_bmem < num_bmem) {
++ bmem = gmalloc(sdp->sd_sb.sb_bsize);
++ list_add(bmem, &tr.tr_free_bmem);
++ tr.tr_num_free_bmem++;
++ }
++ }
++
++ if (tr.tr_seg_reserved > segments) {
++ spin_lock(&sdp->sd_log_seg_lock);
++ sdp->sd_log_seg_free += tr.tr_seg_reserved - segments;
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free < sdp->sd_jdesc.ji_nsegment,
++ sdp,);
++ spin_unlock(&sdp->sd_log_seg_lock);
++ tr.tr_seg_reserved = segments;
++ }
++ while (tr.tr_num_free_bufs > num_bufs) {
++ lb = list_entry(tr.tr_free_bufs.next,
++ struct gfs_log_buf, lb_list);
++ list_del(&lb->lb_list);
++ kfree(lb);
++ tr.tr_num_free_bufs--;
++ }
++ while (tr.tr_num_free_bmem > num_bmem) {
++ bmem = tr.tr_free_bmem.next;
++ list_del(bmem);
++ kfree(bmem);
++ tr.tr_num_free_bmem--;
++ }
++
++ LO_BUILD_DUMP(sdp, &tr);
++
++ error = disk_commit(sdp, &tr);
++ if (error)
++ gfs_io_error(sdp);
++
++ while (!list_empty(&tr.tr_elements)) {
++ le = list_entry(tr.tr_elements.next,
++ struct gfs_log_element, le_list);
++ LO_CLEAN_DUMP(sdp, le);
++ }
++
++	/* If there isn't anything in the AIL, we won't get back the log
++ space we reserved unless we do it ourselves. */
++
++ if (list_empty(&sdp->sd_log_ail)) {
++ spin_lock(&sdp->sd_log_seg_lock);
++ sdp->sd_log_seg_free += tr.tr_seg_reserved;
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free < sdp->sd_jdesc.ji_nsegment,
++ sdp,);
++ spin_unlock(&sdp->sd_log_seg_lock);
++ }
++
++ clear_bit(SDF_NEED_LOG_DUMP, &sdp->sd_flags);
++
++ out:
++ gfs_log_unlock(sdp);
++ clear_bit(SDF_IN_LOG_DUMP, &sdp->sd_flags);
++}
++
++/**
++ * gfs_log_shutdown - write a shutdown header into a journal
++ * @sdp: the filesystem
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_log_shutdown(struct gfs_sbd *sdp)
++{
++ struct gfs_log_buf *lb;
++ char *bmem;
++ struct gfs_log_header head;
++ struct gfs_log_descriptor desc;
++ unsigned int elements = 0;
++ int error;
++
++ lb = gmalloc(sizeof(struct gfs_log_buf));
++ memset(lb, 0, sizeof(struct gfs_log_buf));
++ bmem = gmalloc(sdp->sd_sb.sb_bsize);
++
++ gfs_log_lock(sdp);
++
++ GFS_ASSERT_SBD(list_empty(&sdp->sd_log_ail), sdp,);
++ GFS_ASSERT_SBD(sdp->sd_log_seg_free == sdp->sd_jdesc.ji_nsegment - 1,
++ sdp,);
++ GFS_ASSERT_SBD(!sdp->sd_log_buffers, sdp,);
++ GFS_ASSERT_SBD(gfs_log_is_header(sdp, sdp->sd_log_head - 1), sdp,);
++
++ /* Build a "last" log descriptor */
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = GFS_LOG_DESC_LAST;
++ desc.ld_length = sdp->sd_sb.sb_seg_size - 1;
++
++ /* Write the descriptor */
++
++ gfs_logbh_init(sdp, &lb->lb_bh, sdp->sd_log_head, bmem);
++ memset(bmem, 0, sdp->sd_sb.sb_bsize);
++ gfs_desc_out(&desc, lb->lb_bh.b_data);
++ error = gfs_logbh_start(sdp, &lb->lb_bh);
++ if (!error)
++ error = gfs_logbh_wait(sdp, &lb->lb_bh);
++ gfs_logbh_uninit(sdp, &lb->lb_bh);
++
++ if (error)
++ goto out;
++
++ /* Move to the next header */
++
++ while (!gfs_log_is_header(sdp, sdp->sd_log_head))
++ log_incr_head(sdp, &sdp->sd_log_head);
++
++ LO_DUMP_SIZE(sdp, &elements, NULL, NULL);
++
++ /* Build the shutdown header */
++
++ memset(&head, 0, sizeof (struct gfs_log_header));
++ head.lh_header.mh_magic = GFS_MAGIC;
++ head.lh_header.mh_type = GFS_METATYPE_LH;
++ head.lh_header.mh_format = GFS_FORMAT_LH;
++ head.lh_flags = GFS_LOG_HEAD_UNMOUNT;
++ head.lh_first = sdp->sd_log_head;
++ head.lh_sequence = sdp->sd_sequence + 1;
++ /* Don't care about tail */
++ head.lh_last_dump = (elements) ? sdp->sd_log_dump_last : 0;
++
++ /* Write out the shutdown header */
++
++ gfs_logbh_init(sdp, &lb->lb_bh, sdp->sd_log_head, bmem);
++ memset(bmem, 0, sdp->sd_sb.sb_bsize);
++ gfs_log_header_out(&head, lb->lb_bh.b_data);
++ gfs_log_header_out(&head,
++ lb->lb_bh.b_data + GFS_BASIC_BLOCK -
++ sizeof(struct gfs_log_header));
++ error = gfs_logbh_start(sdp, &lb->lb_bh);
++ if (!error)
++ error = gfs_logbh_wait(sdp, &lb->lb_bh);
++ gfs_logbh_uninit(sdp, &lb->lb_bh);
++
++ out:
++ gfs_log_unlock(sdp);
++
++ kfree(lb);
++ kfree(bmem);
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/log.h linux-patched/fs/gfs/log.h
+--- linux-orig/fs/gfs/log.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/log.h 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,79 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOG_DOT_H__
++#define __LOG_DOT_H__
++
++/**
++ * gfs_log_lock - acquire the right to mess with the log manager
++ * @sdp: the filesystem
++ *
++ */
++
++static __inline__ void
++gfs_log_lock(struct gfs_sbd *sdp)
++{
++ down(&sdp->sd_log_lock);
++}
++
++/**
++ * gfs_log_unlock - release the right to mess with the log manager
++ * @sdp: the filesystem
++ *
++ */
++
++static __inline__ void
++gfs_log_unlock(struct gfs_sbd *sdp)
++{
++ up(&sdp->sd_log_lock);
++}
++
++unsigned int gfs_struct2blk(struct gfs_sbd *sdp, unsigned int nstruct,
++ unsigned int ssize);
++unsigned int gfs_blk2seg(struct gfs_sbd *sdp, unsigned int blocks);
++
++int gfs_log_reserve(struct gfs_sbd *sdp, unsigned int segments, int jump_queue);
++void gfs_log_release(struct gfs_sbd *sdp, unsigned int segments);
++
++void gfs_ail_start(struct gfs_sbd *sdp, int flags);
++int gfs_ail_empty(struct gfs_sbd *sdp);
++
++void gfs_log_commit(struct gfs_sbd *sdp, struct gfs_trans *trans);
++void gfs_log_flush(struct gfs_sbd *sdp);
++void gfs_log_flush_glock(struct gfs_glock *gl);
++
++int gfs_log_shutdown(struct gfs_sbd *sdp);
++
++void gfs_log_dump(struct gfs_sbd *sdp, int force);
++
++/* Internal crap used by the log operations */
++
++/**
++ * gfs_log_is_header - Discover if a block is a journal header
++ * @sdp: The GFS superblock
++ * @block: The block number
++ *
++ * Returns: TRUE if the block is on a journal segment boundary, FALSE otherwise
++ */
++
++static __inline__ int
++gfs_log_is_header(struct gfs_sbd *sdp, uint64_t block)
++{
++ return !do_mod(block, sdp->sd_sb.sb_seg_size);
++}
++
++struct gfs_log_buf *gfs_log_get_buf(struct gfs_sbd *sdp, struct gfs_trans *tr);
++void gfs_log_fake_buf(struct gfs_sbd *sdp, struct gfs_trans *tr, char *data,
++ struct buffer_head *unlock);
++
++#endif /* __LOG_DOT_H__ */
+diff -urN linux-orig/fs/gfs/lops.c linux-patched/fs/gfs/lops.c
+--- linux-orig/fs/gfs/lops.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/lops.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,1563 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "log.h"
++#include "lops.h"
++#include "quota.h"
++#include "recovery.h"
++#include "trans.h"
++#include "unlinked.h"
++
++/**
++ * generic_le_add - generic routine to add a log element to a transaction
++ * @sdp: the filesystem
++ * @le: the log entry
++ *
++ */
++
++static void
++generic_le_add(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_trans *tr;
++
++ GFS_ASSERT_SBD(le->le_ops &&
++ !le->le_trans &&
++ list_empty(&le->le_list), sdp,);
++
++ tr = current_transaction;
++ GFS_ASSERT_SBD(tr, sdp,);
++
++ le->le_trans = tr;
++ list_add(&le->le_list, &tr->tr_elements);
++}
++
++/**
++ * glock_trans_end - drop a glock reference
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++glock_trans_end(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_glock *gl = container_of(le, struct gfs_glock, gl_new_le);
++
++ GFS_ASSERT_GLOCK(gfs_glock_is_locked_by_me(gl) &&
++ gfs_glock_is_held_excl(gl), gl,);
++ gfs_glock_put(gl);
++}
++
++/**
++ * glock_print - print debug info about a log element
++ * @sdp: the filesystem
++ * @le: the log element
++ * @where: is this a new transaction or an incore transaction
++ *
++ */
++
++static void
++glock_print(struct gfs_sbd *sdp, struct gfs_log_element *le, unsigned int where)
++{
++ struct gfs_glock *gl;
++
++ switch (where) {
++ case TRANS_IS_NEW:
++ gl = container_of(le, struct gfs_glock, gl_new_le);
++ break;
++ case TRANS_IS_INCORE:
++ gl = container_of(le, struct gfs_glock, gl_incore_le);
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,);
++ }
++
++ printk(" Glock: (%u, %"PRIu64")\n",
++ gl->gl_name.ln_type,
++ gl->gl_name.ln_number);
++}
++
++/**
++ * glock_overlap_trans - Find any incore transactions that might overlap with this LE
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static struct gfs_trans *
++glock_overlap_trans(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_glock *gl = container_of(le, struct gfs_glock, gl_new_le);
++
++ return gl->gl_incore_le.le_trans;
++}
++
++/**
++ * glock_incore_commit - commit this LE to the incore log
++ * @sdp: the filesystem
++ * @tr: the incore transaction this LE is a part of
++ * @le: the log element
++ *
++ */
++
++static void
++glock_incore_commit(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_log_element *le)
++{
++ struct gfs_glock *gl = container_of(le, struct gfs_glock, gl_new_le);
++
++ if (gl->gl_incore_le.le_trans)
++ GFS_ASSERT_GLOCK(gl->gl_incore_le.le_trans == tr, gl,);
++ else {
++ gl->gl_incore_le.le_trans = tr;
++ list_add(&gl->gl_incore_le.le_list, &tr->tr_elements);
++ if (tr != le->le_trans)
++ tr->tr_num_gl++;
++ }
++
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++}
++
++/**
++ * glock_add_to_ail - Add this LE to the AIL
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++glock_add_to_ail(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++}
++
++/**
++ * glock_trans_combine - combine two incore transactions
++ * @sdp: the filesystem
++ * @tr: the surviving transaction
++ * @new_tr: the transaction that's going to disappear
++ *
++ */
++
++static void
++glock_trans_combine(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_trans *new_tr)
++{
++ tr->tr_num_gl += new_tr->tr_num_gl;
++}
++
++/**
++ * buf_print - print debug info about a log element
++ * @sdp: the filesystem
++ * @le: the log element
++ * @where: is this a new transaction or an incore transaction
++ *
++ */
++
++static void
++buf_print(struct gfs_sbd *sdp, struct gfs_log_element *le, unsigned int where)
++{
++ struct gfs_bufdata *bd;
++
++ switch (where) {
++ case TRANS_IS_NEW:
++ bd = container_of(le, struct gfs_bufdata, bd_new_le);
++ break;
++ case TRANS_IS_INCORE:
++ bd = container_of(le, struct gfs_bufdata, bd_incore_le);
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,);
++ }
++
++ printk(" Buffer: %"PRIu64"\n", (uint64_t)bd->bd_bh->b_blocknr);
++}
++
++/**
++ * buf_incore_commit - commit this LE to the incore log
++ * @sdp: the filesystem
++ * @tr: the incore transaction this LE is a part of
++ * @le: the log element
++ *
++ */
++
++static void
++buf_incore_commit(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_log_element *le)
++{
++ struct gfs_bufdata *bd = container_of(le, struct gfs_bufdata, bd_new_le);
++
++ if (bd->bd_frozen) {
++ kfree(bd->bd_frozen);
++ bd->bd_frozen = NULL;
++ }
++
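++	/* If the buffer is already part of this incore transaction, just
++	   drop the extra pin; otherwise attach it to the transaction and
++	   count it against the incore log. */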
++ if (bd->bd_incore_le.le_trans) {
++ GFS_ASSERT_SBD(bd->bd_incore_le.le_trans == tr, sdp,);
++ gfs_dunpin(sdp, bd->bd_bh, NULL);
++ } else {
++ bd->bd_incore_le.le_trans = tr;
++ list_add(&bd->bd_incore_le.le_list, &tr->tr_elements);
++ if (tr != le->le_trans)
++ tr->tr_num_buf++;
++
++ sdp->sd_log_buffers++;
++ }
++
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++}
++
++/**
++ * buf_add_to_ail - Add this LE to the AIL
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++buf_add_to_ail(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_bufdata *bd = container_of(le,
++ struct gfs_bufdata,
++ bd_incore_le);
++
++ gfs_dunpin(sdp, bd->bd_bh, le->le_trans);
++
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++
++ GFS_ASSERT_SBD(sdp->sd_log_buffers, sdp,);
++ sdp->sd_log_buffers--;
++}
++
++/**
++ * buf_trans_size - compute how much space the LE class takes up in a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ * @mblks: the number of regular metadata blocks
++ * @eblks: the number of extra blocks
++ * @blocks: the number of log blocks
++ * @bmem: the number of buffer-sized chunks of memory we need
++ *
++ */
++
++static void
++buf_trans_size(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ unsigned int *mblks, unsigned int *eblks,
++ unsigned int *blocks, unsigned int *bmem)
++{
++ unsigned int cblks;
++
++ if (tr->tr_num_buf) {
++ cblks = gfs_struct2blk(sdp, tr->tr_num_buf,
++ sizeof(struct gfs_block_tag));
++
++ if (mblks)
++ *mblks += tr->tr_num_buf;
++ if (blocks)
++ *blocks += tr->tr_num_buf + cblks;
++ if (bmem)
++ *bmem += cblks;
++ }
++}
++
++/**
++ * buf_trans_combine - combine two incore transactions
++ * @sdp: the filesystem
++ * @tr: the surviving transaction
++ * @new_tr: the transaction that's going to disappear
++ *
++ */
++
++static void
++buf_trans_combine(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_trans *new_tr)
++{
++ tr->tr_num_buf += new_tr->tr_num_buf;
++}
++
++/**
++ * increment_generation - increment the generation number in a metadata buffer
++ * @sdp: the filesystem
++ * @bd: the struct gfs_bufdata structure associated with the buffer
++ *
++ */
++
++static void
++increment_generation(struct gfs_sbd *sdp, struct gfs_bufdata *bd)
++{
++ struct gfs_meta_header *mh, *mh2;
++ uint64_t tmp64;
++
++ mh = (struct gfs_meta_header *)bd->bd_bh->b_data;
++
++ tmp64 = gfs64_to_cpu(mh->mh_generation) + 1;
++ tmp64 = cpu_to_gfs64(tmp64);
++
++ if (bd->bd_frozen) {
++ mh2 = (struct gfs_meta_header *)bd->bd_frozen;
++ GFS_ASSERT_SBD(mh->mh_generation == mh2->mh_generation, sdp,);
++ mh2->mh_generation = tmp64;
++ }
++ mh->mh_generation = tmp64;
++}
++
++/**
++ * buf_build_bhlist - create the buffers that will make up the ondisk part of a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ */
++
++static void
++buf_build_bhlist(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head;
++ struct gfs_log_element *le;
++ struct gfs_bufdata *bd;
++ struct gfs_log_descriptor desc;
++ struct gfs_block_tag tag;
++ struct gfs_log_buf *clb = NULL;
++ unsigned int num_ctl;
++ unsigned int offset = sizeof(struct gfs_log_descriptor);
++ unsigned int x, bufs;
++
++ if (!tr->tr_num_buf)
++ return;
++
++ /* set up control buffers for descriptor and tags */
++
++ num_ctl = gfs_struct2blk(sdp, tr->tr_num_buf,
++ sizeof(struct gfs_block_tag));
++
++ for (x = 0; x < num_ctl; x++) {
++ if (clb)
++ gfs_log_get_buf(sdp, tr);
++ else
++ clb = gfs_log_get_buf(sdp, tr);
++ }
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = GFS_LOG_DESC_METADATA;
++ desc.ld_length = num_ctl + tr->tr_num_buf;
++ desc.ld_data1 = tr->tr_num_buf;
++ gfs_desc_out(&desc, clb->lb_bh.b_data);
++
++ x = 1;
++ bufs = 0;
++
++ for (head = &tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ if (le->le_ops != &gfs_buf_lops)
++ continue;
++ bd = container_of(le, struct gfs_bufdata, bd_incore_le);
++
++ gfs_meta_check(sdp, bd->bd_bh);
++
++ gfs_lock_buffer(bd->bd_bh);
++
++ increment_generation(sdp, bd);
++
++ gfs_log_fake_buf(sdp, tr,
++ (bd->bd_frozen) ? bd->bd_frozen : bd->bd_bh->b_data,
++ bd->bd_bh);
++
++ if (offset + sizeof(struct gfs_block_tag) > sdp->sd_sb.sb_bsize) {
++ clb = list_entry(clb->lb_list.prev,
++ struct gfs_log_buf, lb_list);
++ if (gfs_log_is_header(sdp, clb->lb_bh.b_blocknr))
++ clb = list_entry(clb->lb_list.prev,
++ struct gfs_log_buf, lb_list);
++ x++;
++ offset = 0;
++ }
++
++ memset(&tag, 0, sizeof(struct gfs_block_tag));
++ tag.bt_blkno = bd->bd_bh->b_blocknr;
++
++ gfs_block_tag_out(&tag, clb->lb_bh.b_data + offset);
++
++ offset += sizeof(struct gfs_block_tag);
++ bufs++;
++ }
++
++ GFS_ASSERT_SBD(x == num_ctl, sdp,);
++ GFS_ASSERT_SBD(bufs == tr->tr_num_buf, sdp,);
++}
++
++/**
++ * buf_before_scan - called before journal replay
++ * @sdp: the filesystem
++ * @jid: the journal ID about to be replayed
++ * @head: the current head of the log
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++buf_before_scan(struct gfs_sbd *sdp, unsigned int jid,
++ struct gfs_log_header *head, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_A1)
++ sdp->sd_recovery_replays =
++ sdp->sd_recovery_skips =
++ sdp->sd_recovery_sames = 0;
++}
++
++/**
++ * replay_block - Replay a single metadata block
++ * @sdp: the filesystem
++ * @jdesc: the struct gfs_jindex structure for the journal being replayed
++ * @gl: the journal's glock
++ * @tag: the block tag describing the inplace location of the block
++ * @blkno: the location of the log's copy of the block
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++replay_block(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, struct gfs_block_tag *tag, uint64_t blkno)
++{
++ struct buffer_head *inplace_bh, *log_bh;
++ struct gfs_meta_header inplace_mh, log_mh;
++ int replay_block = TRUE;
++ int error = 0;
++
++ gfs_replay_check(sdp);
++
++ /* Warning: Using a real buffer here instead of a tempbh can be bad
++	   on an OS that won't support multiple simultaneous buffers for the
++ same block on different glocks. */
++
++ error = gfs_dread(sdp, tag->bt_blkno, gl,
++ DIO_START | DIO_WAIT, &inplace_bh);
++ if (error)
++ return error;
++ gfs_meta_check(sdp, inplace_bh);
++ gfs_meta_header_in(&inplace_mh, inplace_bh->b_data);
++
++ error = gfs_dread(sdp, blkno, gl, DIO_START | DIO_WAIT, &log_bh);
++ if (error) {
++ brelse(inplace_bh);
++ return error;
++ }
++ gfs_meta_check(sdp, log_bh);
++ gfs_meta_header_in(&log_mh, log_bh->b_data);
++
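++	/* Replay only if the journal's copy is at least as new as the
++	   in-place block; identical same-generation blocks are skipped. */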
++ if (log_mh.mh_generation < inplace_mh.mh_generation) {
++ replay_block = FALSE;
++ sdp->sd_recovery_skips++;
++ } else if (log_mh.mh_generation == inplace_mh.mh_generation) {
++ if (memcmp(log_bh->b_data,
++ inplace_bh->b_data,
++ sdp->sd_sb.sb_bsize) == 0) {
++ replay_block = FALSE;
++ sdp->sd_recovery_sames++;
++ }
++ }
++
++ if (replay_block) {
++ memcpy(inplace_bh->b_data,
++ log_bh->b_data,
++ sdp->sd_sb.sb_bsize);
++
++ error = gfs_replay_buf(gl, inplace_bh);
++ if (!error)
++ sdp->sd_recovery_replays++;
++ }
++
++ brelse(log_bh);
++ brelse(inplace_bh);
++
++ return error;
++}
++
++/**
++ * buf_scan_elements - Replay a metadata log descriptor
++ * @sdp: the filesystem
++ * @jdesc: the struct gfs_jindex structure for the journal being replayed
++ * @gl: the journal's glock
++ * @start: the starting block of the descriptor
++ * @desc: the descriptor structure
++ * @pass: the pass through the journal
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++buf_scan_elements(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t start,
++ struct gfs_log_descriptor *desc, unsigned int pass)
++{
++ struct gfs_block_tag tag;
++ struct buffer_head *bh;
++ uint64_t cblk = start;
++ unsigned int num_tags = desc->ld_data1;
++ unsigned int offset = sizeof(struct gfs_log_descriptor);
++ unsigned int x;
++ int error;
++
++ if (pass != GFS_RECPASS_A1)
++ return 0;
++ if (desc->ld_type != GFS_LOG_DESC_METADATA)
++ return 0;
++
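++	/* Skip past the blocks holding the tags themselves to find the
++	   first of the metadata blocks they describe. */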
++ x = gfs_struct2blk(sdp, num_tags, sizeof(struct gfs_block_tag));
++ while (x--) {
++ error = gfs_increment_blkno(sdp, jdesc, gl, &start, TRUE);
++ if (error)
++ return error;
++ }
++
++ for (;;) {
++ GFS_ASSERT_SBD(num_tags, sdp,);
++
++ error = gfs_dread(sdp, cblk, gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++
++ /* Do readahead for the inplace blocks in this control block */
++ {
++ unsigned int o2 = offset;
++ unsigned int nt2 = num_tags;
++
++ while (o2 + sizeof(struct gfs_block_tag) <=
++ sdp->sd_sb.sb_bsize) {
++ gfs_block_tag_in(&tag, bh->b_data + o2);
++ gfs_start_ra(gl, tag.bt_blkno, 1);
++ if (!--nt2)
++ break;
++ o2 += sizeof(struct gfs_block_tag);
++ }
++ }
++
++ while (offset + sizeof(struct gfs_block_tag) <=
++ sdp->sd_sb.sb_bsize) {
++ gfs_block_tag_in(&tag, bh->b_data + offset);
++
++ error = replay_block(sdp, jdesc, gl, &tag, start);
++ if (error)
++ goto out_drelse;
++
++ if (!--num_tags)
++ goto out_drelse;
++
++ error = gfs_increment_blkno(sdp, jdesc, gl, &start, TRUE);
++ if (error)
++ goto out_drelse;
++
++ offset += sizeof(struct gfs_block_tag);
++ }
++
++ brelse(bh);
++
++ error = gfs_increment_blkno(sdp, jdesc, gl, &cblk, TRUE);
++ if (error)
++ return error;
++
++ offset = 0;
++ }
++
++ return 0;
++
++ out_drelse:
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * buf_after_scan - called after journal replay
++ * @sdp: the filesystem
++ * @jid: the journal ID that was just replayed
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++buf_after_scan(struct gfs_sbd *sdp, unsigned int jid, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_A1) {
++ printk("GFS: fsid=%s: jid=%u: Replayed %u of %u blocks\n",
++ sdp->sd_fsname, jid,
++ sdp->sd_recovery_replays,
++ sdp->sd_recovery_replays + sdp->sd_recovery_skips +
++ sdp->sd_recovery_sames);
++ printk("GFS: fsid=%s: jid=%u: replays = %u, skips = %u, sames = %u\n",
++ sdp->sd_fsname, jid, sdp->sd_recovery_replays,
++ sdp->sd_recovery_skips, sdp->sd_recovery_sames);
++ }
++}
++
++/**
++ * unlinked_print - print debug info about a log element
++ * @sdp: the filesystem
++ * @le: the log element
++ * @where: is this a new transaction or an incore transaction
++ *
++ */
++
++static void
++unlinked_print(struct gfs_sbd *sdp, struct gfs_log_element *le,
++ unsigned int where)
++{
++ struct gfs_unlinked *ul;
++ char *type;
++
++ switch (where) {
++ case TRANS_IS_NEW:
++ ul = container_of(le, struct gfs_unlinked, ul_new_le);
++ type = (test_bit(ULF_NEW_UL, &ul->ul_flags)) ?
++ "unlink" : "dealloc";
++ break;
++ case TRANS_IS_INCORE:
++ ul = container_of(le, struct gfs_unlinked, ul_incore_le);
++ type = (test_bit(ULF_INCORE_UL, &ul->ul_flags)) ?
++ "unlink" : "dealloc";
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,);
++ }
++
++ printk(" unlinked: %"PRIu64"/%"PRIu64", %s\n",
++ ul->ul_inum.no_formal_ino, ul->ul_inum.no_addr,
++ type);
++}
++
++/**
++ * unlinked_incore_commit - commit this LE to the incore log
++ * @sdp: the filesystem
++ * @tr: the incore transaction this LE is a part of
++ * @le: the log element
++ *
++ */
++
++static void
++unlinked_incore_commit(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_log_element *le)
++{
++ struct gfs_unlinked *ul = container_of(le,
++ struct gfs_unlinked,
++ ul_new_le);
++ int n = !!test_bit(ULF_NEW_UL, &ul->ul_flags);
++ int i = !!test_bit(ULF_INCORE_UL, &ul->ul_flags);
++
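++ /* "n" is the state asserted by the new LE (unlink vs. dealloc);
++ "i" is the state currently in the incore log.  If this inode
++ already has an incore LE in the transaction, the two changes
++ must be opposites and cancel each other out. */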
++ if (ul->ul_incore_le.le_trans) {
++ GFS_ASSERT_SBD(ul->ul_incore_le.le_trans == tr, sdp,);
++ GFS_ASSERT_SBD(n != i, sdp,);
++
++ ul->ul_incore_le.le_trans = NULL;
++ list_del_init(&ul->ul_incore_le.le_list);
++ gfs_unlinked_put(sdp, ul);
++
++ if (i) {
++ GFS_ASSERT_SBD(tr->tr_num_iul, sdp,);
++ tr->tr_num_iul--;
++ } else {
++ GFS_ASSERT_SBD(tr->tr_num_ida, sdp,);
++ tr->tr_num_ida--;
++ }
++ } else {
++ gfs_unlinked_hold(sdp, ul);
++ ul->ul_incore_le.le_trans = tr;
++ list_add(&ul->ul_incore_le.le_list, &tr->tr_elements);
++
++ if (n) {
++ set_bit(ULF_INCORE_UL, &ul->ul_flags);
++ if (tr != le->le_trans)
++ tr->tr_num_iul++;
++ } else {
++ clear_bit(ULF_INCORE_UL, &ul->ul_flags);
++ if (tr != le->le_trans)
++ tr->tr_num_ida++;
++ }
++ }
++
++ if (n) {
++ gfs_unlinked_hold(sdp, ul);
++ GFS_ASSERT_SBD(!test_bit(ULF_IC_LIST, &ul->ul_flags), sdp,);
++ set_bit(ULF_IC_LIST, &ul->ul_flags);
++ atomic_inc(&sdp->sd_unlinked_ic_count);
++ } else {
++ GFS_ASSERT_SBD(test_bit(ULF_IC_LIST, &ul->ul_flags), sdp,);
++ clear_bit(ULF_IC_LIST, &ul->ul_flags);
++ gfs_unlinked_put(sdp, ul);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_ic_count), sdp,);
++ atomic_dec(&sdp->sd_unlinked_ic_count);
++ }
++
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++ gfs_unlinked_put(sdp, ul);
++}
++
++/**
++ * unlinked_add_to_ail - Add this LE to the AIL
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++unlinked_add_to_ail(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_unlinked *ul = container_of(le,
++ struct gfs_unlinked,
++ ul_incore_le);
++ int i = !!test_bit(ULF_INCORE_UL, &ul->ul_flags);
++
++ if (i) {
++ gfs_unlinked_hold(sdp, ul);
++ GFS_ASSERT_SBD(!test_bit(ULF_OD_LIST, &ul->ul_flags), sdp,);
++ set_bit(ULF_OD_LIST, &ul->ul_flags);
++ atomic_inc(&sdp->sd_unlinked_od_count);
++ } else {
++ GFS_ASSERT_SBD(test_bit(ULF_OD_LIST, &ul->ul_flags), sdp,);
++ clear_bit(ULF_OD_LIST, &ul->ul_flags);
++ gfs_unlinked_put(sdp, ul);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++ atomic_dec(&sdp->sd_unlinked_od_count);
++ }
++
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++ gfs_unlinked_put(sdp, ul);
++}
++
++/**
++ * unlinked_clean_dump - clean up a LE after a log dump
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++unlinked_clean_dump(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++}
++
++/**
++ * unlinked_trans_size - compute how much space the LE class takes up in a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ * @mblks: the number of regular metadata blocks
++ * @eblks: the number of extra blocks
++ * @blocks: the number of log blocks
++ * @bmem: the number of buffer-sized chunks of memory we need
++ *
++ */
++
++static void
++unlinked_trans_size(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ unsigned int *mblks, unsigned int *eblks,
++ unsigned int *blocks, unsigned int *bmem)
++{
++ unsigned int ublks = 0;
++
++ if (tr->tr_num_iul)
++ ublks = gfs_struct2blk(sdp, tr->tr_num_iul,
++ sizeof(struct gfs_inum));
++ if (tr->tr_num_ida)
++ ublks += gfs_struct2blk(sdp, tr->tr_num_ida,
++ sizeof(struct gfs_inum));
++
++ if (eblks)
++ *eblks += ublks;
++ if (blocks)
++ *blocks += ublks;
++ if (bmem)
++ *bmem += ublks;
++}
++
++/**
++ * unlinked_trans_combine - combine two incore transactions
++ * @sdp: the filesystem
++ * @tr: the surviving transaction
++ * @new_tr: the transaction that's going to disappear
++ *
++ */
++
++static void
++unlinked_trans_combine(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_trans *new_tr)
++{
++ tr->tr_num_iul += new_tr->tr_num_iul;
++ tr->tr_num_ida += new_tr->tr_num_ida;
++}
++
++/**
++ * unlinked_build_bhlist - create the buffers that will make up the ondisk part of a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ */
++
++static void
++unlinked_build_bhlist(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head;
++ struct gfs_log_element *le;
++ struct gfs_unlinked *ul;
++ struct gfs_log_descriptor desc;
++ struct gfs_log_buf *lb;
++ unsigned int pass = 2;
++ unsigned int type, number;
++ unsigned int offset, entries;
++
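++ /* Two passes: the first emits the unlinked (IUL) entries, the
++ second the deallocated (IDA) entries.  A log dump carries only
++ IUL entries, so the second pass is skipped for dumps. */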
++ while (pass--) {
++ if (tr->tr_flags & TRF_LOG_DUMP) {
++ if (pass) {
++ type = GFS_LOG_DESC_IUL;
++ number = tr->tr_num_iul;
++ } else
++ break;
++ } else {
++ if (pass) {
++ type = GFS_LOG_DESC_IUL;
++ number = tr->tr_num_iul;
++ } else {
++ type = GFS_LOG_DESC_IDA;
++ number = tr->tr_num_ida;
++ }
++
++ if (!number)
++ continue;
++ }
++
++ lb = gfs_log_get_buf(sdp, tr);
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = type;
++ desc.ld_length = gfs_struct2blk(sdp, number, sizeof(struct gfs_inum));
++ desc.ld_data1 = (tr->tr_flags & TRF_LOG_DUMP) ? TRUE : FALSE;
++ gfs_desc_out(&desc, lb->lb_bh.b_data);
++
++ offset = sizeof(struct gfs_log_descriptor);
++ entries = 0;
++
++ for (head = &tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ if (le->le_ops != &gfs_unlinked_lops)
++ continue;
++ if (tr->tr_flags & TRF_LOG_DUMP)
++ ul = container_of(le,
++ struct gfs_unlinked,
++ ul_ondisk_le);
++ else {
++ ul = container_of(le,
++ struct gfs_unlinked,
++ ul_incore_le);
++ if (!!test_bit(ULF_INCORE_UL, &ul->ul_flags) != pass)
++ continue;
++ }
++
++ if (offset + sizeof(struct gfs_inum) > sdp->sd_sb.sb_bsize) {
++ offset = 0;
++ lb = gfs_log_get_buf(sdp, tr);
++ }
++
++ gfs_inum_out(&ul->ul_inum,
++ lb->lb_bh.b_data + offset);
++
++ offset += sizeof(struct gfs_inum);
++ entries++;
++ }
++
++ GFS_ASSERT_SBD(entries == number, sdp,);
++ }
++}
++
++/**
++ * unlinked_dump_size - compute how much space the LE class takes up in a log dump
++ * @sdp: the filesystem
++ * @elements: the number of log elements in the dump
++ * @blocks: the number of blocks in the dump
++ * @bmem: the number of buffer-sized chunks of memory we need
++ *
++ */
++
++static void
++unlinked_dump_size(struct gfs_sbd *sdp, unsigned int *elements,
++ unsigned int *blocks, unsigned int *bmem)
++{
++ unsigned int c = atomic_read(&sdp->sd_unlinked_od_count);
++ unsigned int b = gfs_struct2blk(sdp, c, sizeof(struct gfs_inum));
++
++ if (elements)
++ *elements += c;
++ if (blocks)
++ *blocks += b;
++ if (bmem)
++ *bmem += b;
++}
++
++/**
++ * unlinked_build_dump - create a transaction that represents a log dump for this LE class
++ * @sdp: the filesystem
++ * @tr: the transaction to fill
++ *
++ */
++
++static void
++unlinked_build_dump(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head;
++ struct gfs_unlinked *ul;
++ unsigned int x = 0;
++
++ tr->tr_num_iul = atomic_read(&sdp->sd_unlinked_od_count);
++
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ for (head = &sdp->sd_unlinked_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ ul = list_entry(tmp, struct gfs_unlinked, ul_list);
++ if (!test_bit(ULF_OD_LIST, &ul->ul_flags))
++ continue;
++
++ GFS_ASSERT_SBD(!ul->ul_ondisk_le.le_trans, sdp,);
++ ul->ul_ondisk_le.le_trans = tr;
++ list_add(&ul->ul_ondisk_le.le_list, &tr->tr_elements);
++
++ x++;
++ }
++
++ spin_unlock(&sdp->sd_unlinked_lock);
++
++ GFS_ASSERT_SBD(x == atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++}
++
++/**
++ * unlinked_before_scan - called before a log dump is recovered
++ * @sdp: the filesystem
++ * @jid: the journal ID about to be scanned
++ * @head: the current head of the log
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++unlinked_before_scan(struct gfs_sbd *sdp, unsigned int jid,
++ struct gfs_log_header *head, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_B1)
++ clear_bit(SDF_FOUND_UL_DUMP, &sdp->sd_flags);
++}
++
++/**
++ * unlinked_scan_elements - scan unlinked inodes from the journal
++ * @sdp: the filesystem
++ * @jdesc: the struct gfs_jindex structure for the journal being scanned
++ * @gl: the journal's glock
++ * @start: the starting block of the descriptor
++ * @desc: the descriptor structure
++ * @pass: the pass through the journal
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++unlinked_scan_elements(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t start,
++ struct gfs_log_descriptor *desc, unsigned int pass)
++{
++ struct gfs_inum inum;
++ struct buffer_head *bh;
++ unsigned int offset = sizeof(struct gfs_log_descriptor);
++ unsigned int x;
++ int error;
++
++ if (pass != GFS_RECPASS_B1)
++ return 0;
++
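++ /* ld_data1 marks a descriptor that begins a log dump.  The first
++ IUL descriptor in a journal must be a dump, later ones must not
++ be, and IDA descriptors may only follow the dump. */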
++ switch (desc->ld_type) {
++ case GFS_LOG_DESC_IUL:
++ if (test_bit(SDF_FOUND_UL_DUMP, &sdp->sd_flags))
++ GFS_ASSERT_SBD(!desc->ld_data1, sdp,);
++ else {
++ GFS_ASSERT_SBD(desc->ld_data1, sdp,);
++ set_bit(SDF_FOUND_UL_DUMP, &sdp->sd_flags);
++ }
++ break;
++
++ case GFS_LOG_DESC_IDA:
++ GFS_ASSERT_SBD(test_bit(SDF_FOUND_UL_DUMP, &sdp->sd_flags),
++ sdp,);
++ break;
++
++ default:
++ return 0;
++ }
++
++ for (x = 0; x < desc->ld_length; x++) {
++ error = gfs_dread(sdp, start, gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++
++ for (;
++ offset + sizeof(struct gfs_inum) <= sdp->sd_sb.sb_bsize;
++ offset += sizeof(struct gfs_inum)) {
++ gfs_inum_in(&inum, bh->b_data + offset);
++
++ if (inum.no_addr)
++ gfs_unlinked_merge(sdp, desc->ld_type, &inum);
++ }
++
++ brelse(bh);
++
++ error = gfs_increment_blkno(sdp, jdesc, gl, &start, TRUE);
++ if (error)
++ return error;
++
++ offset = 0;
++ }
++
++ return 0;
++}
++
++/**
++ * unlinked_after_scan - called after a log dump is recovered
++ * @sdp: the filesystem
++ * @jid: the journal ID that was just scanned
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++unlinked_after_scan(struct gfs_sbd *sdp, unsigned int jid, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_B1) {
++ GFS_ASSERT_SBD(test_bit(SDF_FOUND_UL_DUMP, &sdp->sd_flags),
++ sdp,);
++ printk("GFS: fsid=%s: Found %d unlinked inodes\n",
++ sdp->sd_fsname, atomic_read(&sdp->sd_unlinked_ic_count));
++ }
++}
++
++/**
++ * quota_print - print debug info about a log element
++ * @sdp: the filesystem
++ * @le: the log element
++ * @where: is this a new transaction or an incore transaction
++ *
++ */
++
++static void
++quota_print(struct gfs_sbd *sdp, struct gfs_log_element *le, unsigned int where)
++{
++ struct gfs_quota_le *ql;
++
++ ql = container_of(le, struct gfs_quota_le, ql_le);
++ printk(" quota: %s %u: %"PRId64" blocks\n",
++ (test_bit(QDF_USER, &ql->ql_data->qd_flags)) ? "user" : "group",
++ ql->ql_data->qd_id, ql->ql_change);
++}
++
++/**
++ * quota_incore_commit - commit this LE to the incore log
++ * @sdp: the filesystem
++ * @tr: the incore transaction this LE is a part of
++ * @le: the log element
++ *
++ */
++
++static void
++quota_incore_commit(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_log_element *le)
++{
++ struct gfs_quota_le *ql = container_of(le, struct gfs_quota_le, ql_le);
++ struct gfs_quota_data *qd = ql->ql_data;
++
++ GFS_ASSERT_SBD(ql->ql_change, sdp,);
++
++ /* Make this change under the sd_quota_lock, so other processes
++ checking qd_change_ic don't have to acquire the log lock. */
++
++ spin_lock(&sdp->sd_quota_lock);
++ qd->qd_change_new -= ql->ql_change;
++ qd->qd_change_ic += ql->ql_change;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ if (le->le_trans == tr)
++ list_add(&ql->ql_data_list, &qd->qd_le_list);
++ else {
++ struct list_head *tmp, *head;
++ struct gfs_quota_le *tmp_ql;
++ int found = FALSE;
++
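++		/* If the target transaction already has an LE for this
++		   quota data, fold the new change into it and free the
++		   duplicate; a resulting net change of zero drops the
++		   element entirely. */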
++ for (head = &qd->qd_le_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ tmp_ql = list_entry(tmp, struct gfs_quota_le, ql_data_list);
++ if (tmp_ql->ql_le.le_trans != tr)
++ continue;
++
++ tmp_ql->ql_change += ql->ql_change;
++
++ list_del(&le->le_list);
++ gfs_quota_put(sdp, qd);
++ kfree(ql);
++
++ if (!tmp_ql->ql_change) {
++ list_del(&tmp_ql->ql_data_list);
++ list_del(&tmp_ql->ql_le.le_list);
++ gfs_quota_put(sdp, tmp_ql->ql_data);
++ kfree(tmp_ql);
++ tr->tr_num_q--;
++ }
++
++ found = TRUE;
++ break;
++ }
++
++ if (!found) {
++ le->le_trans = tr;
++ list_move(&le->le_list, &tr->tr_elements);
++ tr->tr_num_q++;
++ list_add(&ql->ql_data_list, &qd->qd_le_list);
++ }
++ }
++}
++
++/**
++ * quota_add_to_ail - Add this LE to the AIL
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++quota_add_to_ail(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ struct gfs_quota_le *ql = container_of(le, struct gfs_quota_le, ql_le);
++ struct gfs_quota_data *qd = ql->ql_data;
++
++ qd->qd_change_od += ql->ql_change;
++ if (qd->qd_change_od) {
++ if (!test_bit(QDF_OD_LIST, &qd->qd_flags)) {
++ gfs_quota_hold(sdp, qd);
++ set_bit(QDF_OD_LIST, &qd->qd_flags);
++ atomic_inc(&sdp->sd_quota_od_count);
++ }
++ } else {
++ GFS_ASSERT_SBD(test_bit(QDF_OD_LIST, &qd->qd_flags), sdp,);
++ clear_bit(QDF_OD_LIST, &qd->qd_flags);
++ gfs_quota_put(sdp, qd);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_quota_od_count), sdp,);
++ atomic_dec(&sdp->sd_quota_od_count);
++ }
++
++ list_del(&ql->ql_data_list);
++ list_del(&le->le_list);
++ gfs_quota_put(sdp, qd);
++ kfree(ql);
++}
++
++/**
++ * quota_clean_dump - clean up a LE after a log dump
++ * @sdp: the filesystem
++ * @le: the log element
++ *
++ */
++
++static void
++quota_clean_dump(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ le->le_trans = NULL;
++ list_del_init(&le->le_list);
++}
++
++/**
++ * quota_trans_size - compute how much space the LE class takes up in a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ * @mblks: the number of regular metadata blocks
++ * @eblks: the number of extra blocks
++ * @blocks: the number of log blocks
++ * @bmem: the number of buffer-sized chunks of memory we need
++ *
++ */
++
++static void
++quota_trans_size(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ unsigned int *mblks, unsigned int *eblks,
++ unsigned int *blocks, unsigned int *bmem)
++{
++ unsigned int qblks;
++
++ if (tr->tr_num_q) {
++ qblks = gfs_struct2blk(sdp, tr->tr_num_q,
++ sizeof(struct gfs_quota_tag));
++
++ if (eblks)
++ *eblks += qblks;
++ if (blocks)
++ *blocks += qblks;
++ if (bmem)
++ *bmem += qblks;
++ }
++}
++
++/**
++ * quota_trans_combine - combine two incore transactions
++ * @sdp: the filesystem
++ * @tr: the surviving transaction
++ * @new_tr: the transaction that's going to disappear
++ *
++ */
++
++static void
++quota_trans_combine(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ struct gfs_trans *new_tr)
++{
++ tr->tr_num_q += new_tr->tr_num_q;
++}
++
++/**
++ * quota_build_bhlist - create the buffers that will make up the ondisk part of a transaction
++ * @sdp: the filesystem
++ * @tr: the transaction
++ *
++ */
++
++static void
++quota_build_bhlist(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head;
++ struct gfs_log_element *le;
++ struct gfs_quota_le *ql;
++ struct gfs_log_descriptor desc;
++ struct gfs_quota_tag tag;
++ struct gfs_log_buf *lb;
++ unsigned int offset = sizeof(struct gfs_log_descriptor), entries = 0;
++
++ if (!tr->tr_num_q && !(tr->tr_flags & TRF_LOG_DUMP))
++ return;
++
++ lb = gfs_log_get_buf(sdp, tr);
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = GFS_LOG_DESC_Q;
++ desc.ld_length = gfs_struct2blk(sdp, tr->tr_num_q,
++ sizeof(struct gfs_quota_tag));
++ desc.ld_data1 = tr->tr_num_q;
++ desc.ld_data2 = (tr->tr_flags & TRF_LOG_DUMP) ? TRUE : FALSE;
++ gfs_desc_out(&desc, lb->lb_bh.b_data);
++
++ for (head = &tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ if (le->le_ops != &gfs_quota_lops)
++ continue;
++
++ ql = container_of(le, struct gfs_quota_le, ql_le);
++
++ if (offset + sizeof(struct gfs_quota_tag) >
++ sdp->sd_sb.sb_bsize) {
++ offset = 0;
++ lb = gfs_log_get_buf(sdp, tr);
++ }
++
++ memset(&tag, 0, sizeof(struct gfs_quota_tag));
++ tag.qt_change = ql->ql_change;
++ tag.qt_flags = (test_bit(QDF_USER, &ql->ql_data->qd_flags)) ?
++ GFS_QTF_USER : 0;
++ tag.qt_id = ql->ql_data->qd_id;
++
++ gfs_quota_tag_out(&tag, lb->lb_bh.b_data + offset);
++
++ offset += sizeof(struct gfs_quota_tag);
++ entries++;
++ }
++
++ GFS_ASSERT_SBD(entries == tr->tr_num_q, sdp,);
++}
++
++/**
++ * quota_dump_size - compute how much space the LE class takes up in a log dump
++ * @sdp: the filesystem
++ * @elements: the number of log elements in the dump
++ * @blocks: the number of blocks in the dump
++ * @bmem: the number of buffer-sized chunks of memory we need
++ *
++ */
++
++static void
++quota_dump_size(struct gfs_sbd *sdp, unsigned int *elements,
++ unsigned int *blocks, unsigned int *bmem)
++{
++ unsigned int c = atomic_read(&sdp->sd_quota_od_count);
++ unsigned int b = gfs_struct2blk(sdp, c, sizeof(struct gfs_quota_tag));
++
++ if (elements)
++ *elements += c;
++ if (blocks)
++ *blocks += b;
++ if (bmem)
++ *bmem += b;
++}
++
++/**
++ * quota_build_dump - create a transaction that represents a log dump for this LE class
++ * @sdp: the filesystem
++ * @tr: the transaction to fill
++ *
++ */
++
++static void
++quota_build_dump(struct gfs_sbd *sdp, struct gfs_trans *tr)
++{
++ struct list_head *tmp, *head;
++ struct gfs_quota_data *qd;
++ struct gfs_quota_le *ql;
++ unsigned int x = 0;
++
++ tr->tr_num_q = atomic_read(&sdp->sd_quota_od_count);
++
++ spin_lock(&sdp->sd_quota_lock);
++
++ for (head = &sdp->sd_quota_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ qd = list_entry(tmp, struct gfs_quota_data, qd_list);
++ if (!test_bit(QDF_OD_LIST, &qd->qd_flags))
++ continue;
++
++ ql = &qd->qd_ondisk_ql;
++
++ ql->ql_le.le_ops = &gfs_quota_lops;
++ GFS_ASSERT_SBD(!ql->ql_le.le_trans, sdp,);
++ ql->ql_le.le_trans = tr;
++ list_add(&ql->ql_le.le_list, &tr->tr_elements);
++
++ ql->ql_data = qd;
++ ql->ql_change = qd->qd_change_od;
++
++ x++;
++ }
++
++ spin_unlock(&sdp->sd_quota_lock);
++
++ GFS_ASSERT_SBD(x == atomic_read(&sdp->sd_quota_od_count), sdp,);
++}
++
++/**
++ * quota_before_scan - called before a log dump is recovered
++ * @sdp: the filesystem
++ * @jid: the journal ID about to be scanned
++ * @head: the current head of the log
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++quota_before_scan(struct gfs_sbd *sdp, unsigned int jid,
++ struct gfs_log_header *head, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_B1)
++ clear_bit(SDF_FOUND_Q_DUMP, &sdp->sd_flags);
++}
++
++/**
++ * quota_scan_elements - scan quota change tags from the journal
++ * @sdp: the filesystem
++ * @jdesc: the struct gfs_jindex structure for the journal being scanned
++ * @gl: the journal's glock
++ * @start: the starting block of the descriptor
++ * @desc: the descriptor structure
++ * @pass: the pass through the journal
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++quota_scan_elements(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t start,
++ struct gfs_log_descriptor *desc, unsigned int pass)
++{
++ struct gfs_quota_tag tag;
++ struct buffer_head *bh;
++ unsigned int num_tags = desc->ld_data1;
++ unsigned int offset = sizeof(struct gfs_log_descriptor);
++ unsigned int x;
++ int error;
++
++ if (pass != GFS_RECPASS_B1)
++ return 0;
++ if (desc->ld_type != GFS_LOG_DESC_Q)
++ return 0;
++
++ if (test_bit(SDF_FOUND_Q_DUMP, &sdp->sd_flags))
++ GFS_ASSERT_SBD(!desc->ld_data2, sdp,);
++ else {
++ GFS_ASSERT_SBD(desc->ld_data2, sdp,);
++ set_bit(SDF_FOUND_Q_DUMP, &sdp->sd_flags);
++ }
++
++ if (!num_tags)
++ return 0;
++
++ for (x = 0; x < desc->ld_length; x++) {
++ error = gfs_dread(sdp, start, gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++
++ while (offset + sizeof(struct gfs_quota_tag) <=
++ sdp->sd_sb.sb_bsize) {
++ gfs_quota_tag_in(&tag, bh->b_data + offset);
++
++ error = gfs_quota_merge(sdp, &tag);
++ if (error)
++ goto out_drelse;
++
++ if (!--num_tags)
++ goto out_drelse;
++
++ offset += sizeof(struct gfs_quota_tag);
++ }
++
++ brelse(bh);
++
++ error = gfs_increment_blkno(sdp, jdesc, gl, &start, TRUE);
++ if (error)
++ return error;
++
++ offset = 0;
++ }
++
++ return 0;
++
++ out_drelse:
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * quota_after_scan - called after a log dump is recovered
++ * @sdp: the filesystem
++ * @jid: the journal ID that was just scanned
++ * @pass: the pass through the journal
++ *
++ */
++
++static void
++quota_after_scan(struct gfs_sbd *sdp, unsigned int jid, unsigned int pass)
++{
++ if (pass == GFS_RECPASS_B1) {
++ GFS_ASSERT_SBD(!sdp->sd_sb.sb_quota_di.no_formal_ino ||
++ test_bit(SDF_FOUND_Q_DUMP, &sdp->sd_flags),
++ sdp,);
++ printk("GFS: fsid=%s: Found quota changes for %d IDs\n",
++ sdp->sd_fsname, atomic_read(&sdp->sd_quota_od_count));
++ }
++}
++
++struct gfs_log_operations gfs_glock_lops = {
++ .lo_add = generic_le_add,
++ .lo_trans_end = glock_trans_end,
++ .lo_print = glock_print,
++ .lo_overlap_trans = glock_overlap_trans,
++ .lo_incore_commit = glock_incore_commit,
++ .lo_add_to_ail = glock_add_to_ail,
++ .lo_trans_combine = glock_trans_combine,
++ .lo_name = "glock"
++};
++
++struct gfs_log_operations gfs_buf_lops = {
++ .lo_add = generic_le_add,
++ .lo_print = buf_print,
++ .lo_incore_commit = buf_incore_commit,
++ .lo_add_to_ail = buf_add_to_ail,
++ .lo_trans_size = buf_trans_size,
++ .lo_trans_combine = buf_trans_combine,
++ .lo_build_bhlist = buf_build_bhlist,
++ .lo_before_scan = buf_before_scan,
++ .lo_scan_elements = buf_scan_elements,
++ .lo_after_scan = buf_after_scan,
++ .lo_name = "buf"
++};
++
++struct gfs_log_operations gfs_unlinked_lops = {
++ .lo_add = generic_le_add,
++ .lo_print = unlinked_print,
++ .lo_incore_commit = unlinked_incore_commit,
++ .lo_add_to_ail = unlinked_add_to_ail,
++ .lo_clean_dump = unlinked_clean_dump,
++ .lo_trans_size = unlinked_trans_size,
++ .lo_trans_combine = unlinked_trans_combine,
++ .lo_build_bhlist = unlinked_build_bhlist,
++ .lo_dump_size = unlinked_dump_size,
++ .lo_build_dump = unlinked_build_dump,
++ .lo_before_scan = unlinked_before_scan,
++ .lo_scan_elements = unlinked_scan_elements,
++ .lo_after_scan = unlinked_after_scan,
++ .lo_name = "unlinked"
++};
++
++struct gfs_log_operations gfs_quota_lops = {
++ .lo_add = generic_le_add,
++ .lo_print = quota_print,
++ .lo_incore_commit = quota_incore_commit,
++ .lo_add_to_ail = quota_add_to_ail,
++ .lo_clean_dump = quota_clean_dump,
++ .lo_trans_size = quota_trans_size,
++ .lo_trans_combine = quota_trans_combine,
++ .lo_build_bhlist = quota_build_bhlist,
++ .lo_dump_size = quota_dump_size,
++ .lo_build_dump = quota_build_dump,
++ .lo_before_scan = quota_before_scan,
++ .lo_scan_elements = quota_scan_elements,
++ .lo_after_scan = quota_after_scan,
++ .lo_name = "quota"
++};
++
++struct gfs_log_operations *gfs_log_ops[] = {
++ &gfs_glock_lops,
++ &gfs_buf_lops,
++ &gfs_unlinked_lops,
++ &gfs_quota_lops,
++ NULL
++};
+diff -urN linux-orig/fs/gfs/lops.h linux-patched/fs/gfs/lops.h
+--- linux-orig/fs/gfs/lops.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/lops.h 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,179 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LOPS_DOT_H__
++#define __LOPS_DOT_H__
++
++extern struct gfs_log_operations gfs_glock_lops;
++extern struct gfs_log_operations gfs_buf_lops;
++extern struct gfs_log_operations gfs_unlinked_lops;
++extern struct gfs_log_operations gfs_quota_lops;
++
++extern struct gfs_log_operations *gfs_log_ops[];
++
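++/* The LO_* wrappers below come in two flavors: per-element ones that
++   dispatch through a single log element's ops vector, and class-wide
++   ones that walk every LE class registered in gfs_log_ops[]. */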
++#define INIT_LE(le, lops) \
++do \
++{ \
++ (le)->le_ops = (lops); \
++ (le)->le_trans = NULL; \
++ INIT_LIST_HEAD(&(le)->le_list); \
++} \
++while (0)
++
++#define LO_ADD(sdp, le) \
++do \
++{ \
++ if ((le)->le_ops->lo_add) \
++ (le)->le_ops->lo_add((sdp), (le)); \
++} \
++while (0)
++
++#define LO_TRANS_END(sdp, le) \
++do \
++{ \
++ if ((le)->le_ops->lo_trans_end) \
++ (le)->le_ops->lo_trans_end((sdp), (le)); \
++} \
++while (0)
++
++#define LO_PRINT(sdp, le, where) \
++do \
++{ \
++ if ((le)->le_ops->lo_print) \
++ (le)->le_ops->lo_print((sdp), (le), (where)); \
++} \
++while (0)
++
++static __inline__ struct gfs_trans *
++LO_OVERLAP_TRANS(struct gfs_sbd *sdp, struct gfs_log_element *le)
++{
++ if (le->le_ops->lo_overlap_trans)
++ return le->le_ops->lo_overlap_trans(sdp, le);
++ else
++ return NULL;
++}
++
++#define LO_INCORE_COMMIT(sdp, tr, le) \
++do \
++{ \
++ if ((le)->le_ops->lo_incore_commit) \
++ (le)->le_ops->lo_incore_commit((sdp), (tr), (le)); \
++} \
++while (0)
++
++#define LO_ADD_TO_AIL(sdp, le) \
++do \
++{ \
++ if ((le)->le_ops->lo_add_to_ail) \
++ (le)->le_ops->lo_add_to_ail((sdp), (le)); \
++} \
++while (0)
++
++#define LO_CLEAN_DUMP(sdp, le) \
++do \
++{ \
++ if ((le)->le_ops->lo_clean_dump) \
++ (le)->le_ops->lo_clean_dump((sdp), (le)); \
++} \
++while (0)
++
++#define LO_TRANS_SIZE(sdp, tr, mblks, eblks, blocks, bmem) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_trans_size) \
++ gfs_log_ops[__lops_x]->lo_trans_size((sdp), (tr), (mblks), (eblks), (blocks), (bmem)); \
++} \
++while (0)
++
++#define LO_TRANS_COMBINE(sdp, tr, new_tr) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_trans_combine) \
++ gfs_log_ops[__lops_x]->lo_trans_combine((sdp), (tr), (new_tr)); \
++} \
++while (0)
++
++#define LO_BUILD_BHLIST(sdp, tr) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_build_bhlist) \
++ gfs_log_ops[__lops_x]->lo_build_bhlist((sdp), (tr)); \
++} \
++while (0)
++
++#define LO_DUMP_SIZE(sdp, elements, blocks, bmem) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_dump_size) \
++ gfs_log_ops[__lops_x]->lo_dump_size((sdp), (elements), (blocks), (bmem)); \
++} \
++while (0)
++
++#define LO_BUILD_DUMP(sdp, tr) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_build_dump) \
++ gfs_log_ops[__lops_x]->lo_build_dump((sdp), (tr)); \
++} \
++while (0)
++
++#define LO_BEFORE_SCAN(sdp, jid, head, pass) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_before_scan) \
++ gfs_log_ops[__lops_x]->lo_before_scan((sdp), (jid), (head), (pass)); \
++} \
++while (0)
++
++static __inline__ int
++LO_SCAN_ELEMENTS(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t start,
++ struct gfs_log_descriptor *desc, unsigned int pass)
++{
++ int x;
++ int error;
++
++ for (x = 0; gfs_log_ops[x]; x++)
++ if (gfs_log_ops[x]->lo_scan_elements) {
++ error = gfs_log_ops[x]->lo_scan_elements(sdp, jdesc, gl,
++ start, desc, pass);
++ if (error)
++ return error;
++ }
++
++ return 0;
++}
++
++#define LO_AFTER_SCAN(sdp, jid, pass) \
++do \
++{ \
++ int __lops_x; \
++ for (__lops_x = 0; gfs_log_ops[__lops_x]; __lops_x++) \
++ if (gfs_log_ops[__lops_x]->lo_after_scan) \
++ gfs_log_ops[__lops_x]->lo_after_scan((sdp), (jid), (pass)); \
++} \
++while (0)
++
++#endif /* __LOPS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/lvb.c linux-patched/fs/gfs/lvb.c
+--- linux-orig/fs/gfs/lvb.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/lvb.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,148 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++
++#define pv(struct, member, fmt) printk(" "#member" = "fmt"\n", struct->member);
++
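++/* CPIN_* copy a field from the on-wire lvb into the cpu-order
++   structure, byte-swapping as needed; CPOUT_* go the other way. */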
++#define CPIN_08(s1, s2, member, count) {memcpy((s1->member), (s2->member), (count));}
++#define CPOUT_08(s1, s2, member, count) {memcpy((s2->member), (s1->member), (count));}
++#define CPIN_16(s1, s2, member) {(s1->member) = gfs16_to_cpu((s2->member));}
++#define CPOUT_16(s1, s2, member) {(s2->member) = cpu_to_gfs16((s1->member));}
++#define CPIN_32(s1, s2, member) {(s1->member) = gfs32_to_cpu((s2->member));}
++#define CPOUT_32(s1, s2, member) {(s2->member) = cpu_to_gfs32((s1->member));}
++#define CPIN_64(s1, s2, member) {(s1->member) = gfs64_to_cpu((s2->member));}
++#define CPOUT_64(s1, s2, member) {(s2->member) = cpu_to_gfs64((s1->member));}
++
++/**
++ * gfs_rgrp_lvb_in - Read in rgrp data
++ * @rb: the cpu-order structure
++ * @lvb: the lvb
++ *
++ */
++
++void
++gfs_rgrp_lvb_in(struct gfs_rgrp_lvb *rb, char *lvb)
++{
++ struct gfs_rgrp_lvb *str = (struct gfs_rgrp_lvb *)lvb;
++
++ CPIN_32(rb, str, rb_magic);
++ CPIN_32(rb, str, rb_free);
++ CPIN_32(rb, str, rb_useddi);
++ CPIN_32(rb, str, rb_freedi);
++ CPIN_32(rb, str, rb_usedmeta);
++ CPIN_32(rb, str, rb_freemeta);
++}
++
++/**
++ * gfs_rgrp_lvb_out - Write out rgrp data
++ * @rb: the cpu-order structure
++ * @lvb: the lvb
++ *
++ */
++
++void
++gfs_rgrp_lvb_out(struct gfs_rgrp_lvb *rb, char *lvb)
++{
++ struct gfs_rgrp_lvb *str = (struct gfs_rgrp_lvb *)lvb;
++
++ CPOUT_32(rb, str, rb_magic);
++ CPOUT_32(rb, str, rb_free);
++ CPOUT_32(rb, str, rb_useddi);
++ CPOUT_32(rb, str, rb_freedi);
++ CPOUT_32(rb, str, rb_usedmeta);
++ CPOUT_32(rb, str, rb_freemeta);
++}
++
++/**
++ * gfs_rgrp_lvb_print - Print out rgrp data
++ * @rb: the cpu-order structure
++ *
++ */
++
++void
++gfs_rgrp_lvb_print(struct gfs_rgrp_lvb *rb)
++{
++ pv(rb, rb_magic, "%u");
++ pv(rb, rb_free, "%u");
++ pv(rb, rb_useddi, "%u");
++ pv(rb, rb_freedi, "%u");
++ pv(rb, rb_usedmeta, "%u");
++ pv(rb, rb_freemeta, "%u");
++}
++
++/**
++ * gfs_quota_lvb_in - Read in quota data
++ * @qb: the cpu-order structure
++ * @lvb: the lvb
++ *
++ */
++
++void
++gfs_quota_lvb_in(struct gfs_quota_lvb *qb, char *lvb)
++{
++ struct gfs_quota_lvb *str = (struct gfs_quota_lvb *)lvb;
++
++ CPIN_32(qb, str, qb_magic);
++ CPIN_32(qb, str, qb_pad);
++ CPIN_64(qb, str, qb_limit);
++ CPIN_64(qb, str, qb_warn);
++ CPIN_64(qb, str, qb_value);
++}
++
++/**
++ * gfs_quota_lvb_out - Write out quota data
++ * @qb: the cpu-order structure
++ * @lvb: the lvb
++ *
++ */
++
++void
++gfs_quota_lvb_out(struct gfs_quota_lvb *qb, char *lvb)
++{
++ struct gfs_quota_lvb *str = (struct gfs_quota_lvb *)lvb;
++
++ CPOUT_32(qb, str, qb_magic);
++ CPOUT_32(qb, str, qb_pad);
++ CPOUT_64(qb, str, qb_limit);
++ CPOUT_64(qb, str, qb_warn);
++ CPOUT_64(qb, str, qb_value);
++}
++
++/**
++ * gfs_quota_lvb_print - Print out quota data
++ * @qb: the cpu-order structure
++ *
++ */
++
++void
++gfs_quota_lvb_print(struct gfs_quota_lvb *qb)
++{
++ pv(qb, qb_magic, "%u");
++ pv(qb, qb_pad, "%u");
++ pv(qb, qb_limit, "%"PRIu64);
++ pv(qb, qb_warn, "%"PRIu64);
++ pv(qb, qb_value, "%"PRId64);
++}
+diff -urN linux-orig/fs/gfs/lvb.h linux-patched/fs/gfs/lvb.h
+--- linux-orig/fs/gfs/lvb.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/lvb.h 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,48 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __LVB_DOT_H__
++#define __LVB_DOT_H__
++
++#define GFS_MIN_LVB_SIZE (32)
++
++struct gfs_rgrp_lvb {
++ uint32_t rb_magic;
++ uint32_t rb_free;
++ uint32_t rb_useddi;
++ uint32_t rb_freedi;
++ uint32_t rb_usedmeta;
++ uint32_t rb_freemeta;
++};
++
++struct gfs_quota_lvb {
++ uint32_t qb_magic;
++ uint32_t qb_pad;
++ uint64_t qb_limit;
++ uint64_t qb_warn;
++ int64_t qb_value;
++};
++
++/* Translation functions */
++
++void gfs_rgrp_lvb_in(struct gfs_rgrp_lvb *rb, char *lvb);
++void gfs_rgrp_lvb_out(struct gfs_rgrp_lvb *rb, char *lvb);
++void gfs_quota_lvb_in(struct gfs_quota_lvb *qb, char *lvb);
++void gfs_quota_lvb_out(struct gfs_quota_lvb *qb, char *lvb);
++
++/* Printing functions */
++
++void gfs_rgrp_lvb_print(struct gfs_rgrp_lvb *rb);
++void gfs_quota_lvb_print(struct gfs_quota_lvb *qb);
++
++#endif /* __LVB_DOT_H__ */
+diff -urN linux-orig/fs/gfs/main.c linux-patched/fs/gfs/main.c
+--- linux-orig/fs/gfs/main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/main.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,142 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/proc_fs.h>
++#include <linux/module.h>
++#include <linux/init.h>
++
++#include "gfs.h"
++#include "mount.h"
++#include "ops_fstype.h"
++
++struct proc_dir_entry *gfs_proc_entry = NULL;
++
++/**
++ * init_gfs_fs - Register GFS as a filesystem
++ *
++ * Returns: 0 on success, error code on failure
++ */
++
++int __init
++init_gfs_fs(void)
++{
++ int error = 0;
++
++ init_MUTEX(&gfs_mount_args_lock);
++
++ gfs_proc_entry = create_proc_read_entry("fs/gfs", S_IFREG | 0200, NULL, NULL, NULL);
++ if (!gfs_proc_entry) {
++ printk("GFS: can't register /proc/fs/gfs\n");
++ error = -EINVAL;
++ goto fail;
++ }
++ gfs_proc_entry->write_proc = gfs_proc_write;
++
++ gfs_random_number = xtime.tv_nsec;
++
++ gfs_glock_cachep = kmem_cache_create("gfs_glock", sizeof(struct gfs_glock),
++ 0, 0,
++ NULL, NULL);
++ if (!gfs_glock_cachep)
++ goto fail2;
++
++ gfs_inode_cachep = kmem_cache_create("gfs_inode", sizeof(struct gfs_inode),
++ 0, 0,
++ NULL, NULL);
++ if (!gfs_inode_cachep)
++ goto fail2;
++
++ gfs_bufdata_cachep = kmem_cache_create("gfs_bufdata", sizeof(struct gfs_bufdata),
++ 0, 0,
++ NULL, NULL);
++ if (!gfs_bufdata_cachep)
++ goto fail2;
++
++ gfs_mhc_cachep = kmem_cache_create("gfs_meta_header_cache", sizeof(struct gfs_meta_header_cache),
++ 0, 0,
++ NULL, NULL);
++ if (!gfs_mhc_cachep)
++ goto fail2;
++
++ error = register_filesystem(&gfs_fs_type);
++ if (error)
++ goto fail2;
++
++ printk("GFS %s (built %s %s) installed\n",
++ GFS_RELEASE_NAME, __DATE__, __TIME__);
++
++ return 0;
++
++ fail2:
++ if (gfs_mhc_cachep)
++ kmem_cache_destroy(gfs_mhc_cachep);
++
++ if (gfs_bufdata_cachep)
++ kmem_cache_destroy(gfs_bufdata_cachep);
++
++ if (gfs_inode_cachep)
++ kmem_cache_destroy(gfs_inode_cachep);
++
++ if (gfs_glock_cachep)
++ kmem_cache_destroy(gfs_glock_cachep);
++
++ down(&gfs_mount_args_lock);
++ if (gfs_mount_args) {
++ kfree(gfs_mount_args);
++ gfs_mount_args = NULL;
++ }
++ up(&gfs_mount_args_lock);
++ remove_proc_entry("fs/gfs", NULL);
++
++ fail:
++ return error;
++}
++
++/**
++ * exit_gfs_fs - Unregister the file system
++ *
++ */
++
++void __exit
++exit_gfs_fs(void)
++{
++ unregister_filesystem(&gfs_fs_type);
++
++ kmem_cache_destroy(gfs_mhc_cachep);
++ kmem_cache_destroy(gfs_bufdata_cachep);
++ kmem_cache_destroy(gfs_inode_cachep);
++ kmem_cache_destroy(gfs_glock_cachep);
++
++ down(&gfs_mount_args_lock);
++ if (gfs_mount_args) {
++ kfree(gfs_mount_args);
++ gfs_mount_args = NULL;
++ }
++ up(&gfs_mount_args_lock);
++ remove_proc_entry("fs/gfs", NULL);
++}
++
++MODULE_DESCRIPTION("Global File System " GFS_RELEASE_NAME);
++MODULE_AUTHOR("Red Hat, Inc.");
++MODULE_LICENSE("GPL");
++
++module_init(init_gfs_fs);
++module_exit(exit_gfs_fs);
++
+diff -urN linux-orig/fs/gfs/mount.c linux-patched/fs/gfs/mount.c
+--- linux-orig/fs/gfs/mount.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/mount.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,212 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/module.h>
++#include <asm/uaccess.h>
++
++#include "gfs.h"
++#include "mount.h"
++
++char *gfs_mount_args = NULL;
++struct semaphore gfs_mount_args_lock;
++
++/**
++ * gfs_make_args - Parse mount arguments
++ * @data: the comma-separated mount option string passed to mount
++ * @args: the gfs_args structure to fill in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_make_args(char *data, struct gfs_args *args)
++{
++ char *options, *x, *y;
++ int do_free = FALSE;
++ int error = 0;
++
++ /* If someone preloaded options, use those instead */
++
++ down(&gfs_mount_args_lock);
++ if (gfs_mount_args) {
++ data = gfs_mount_args;
++ gfs_mount_args = NULL;
++ do_free = TRUE;
++ }
++ up(&gfs_mount_args_lock);
++
++ /* Set some defaults */
++
++ memset(args, 0, sizeof(struct gfs_args));
++ args->ar_num_glockd = GFS_GLOCKD_DEFAULT;
++
++ /* Split the options into tokens with the "," character and
++ process them */
++
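++ /* An illustrative option string (values are examples only):
++ "lockproto=lock_dlm,locktable=alpha:gfs1,num_glockd=8,acl" */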
++ for (options = data; (x = strsep(&options, ",")); ) {
++ if (!*x)
++ continue;
++
++ y = strchr(x, '=');
++ if (y)
++ *y++ = 0;
++
++ if (!strcmp(x, "lockproto")) {
++ if (!y) {
++ printk("GFS: need argument to lockproto\n");
++ error = -EINVAL;
++ break;
++ }
++ strncpy(args->ar_lockproto, y, 256);
++ args->ar_lockproto[255] = 0;
++ }
++
++ else if (!strcmp(x, "locktable")) {
++ if (!y) {
++ printk("GFS: need argument to locktable\n");
++ error = -EINVAL;
++ break;
++ }
++ strncpy(args->ar_locktable, y, 256);
++ args->ar_locktable[255] = 0;
++ }
++
++ else if (!strcmp(x, "hostdata")) {
++ if (!y) {
++ printk("GFS: need argument to hostdata\n");
++ error = -EINVAL;
++ break;
++ }
++ strncpy(args->ar_hostdata, y, 256);
++ args->ar_hostdata[255] = 0;
++ }
++
++ else if (!strcmp(x, "ignore_local_fs"))
++ args->ar_ignore_local_fs = TRUE;
++
++ else if (!strcmp(x, "localflocks"))
++ args->ar_localflocks = TRUE;
++
++ else if (!strcmp(x, "localcaching"))
++ args->ar_localcaching = TRUE;
++
++ else if (!strcmp(x, "upgrade"))
++ args->ar_upgrade = TRUE;
++
++ else if (!strcmp(x, "num_glockd")) {
++ if (!y) {
++ printk("GFS: need argument to num_glockd\n");
++ error = -EINVAL;
++ break;
++ }
++ sscanf(y, "%u", &args->ar_num_glockd);
++ if (!args->ar_num_glockd || args->ar_num_glockd > GFS_GLOCKD_MAX) {
++ printk("GFS: 0 < num_glockd <= %u (not %u)\n",
++ GFS_GLOCKD_MAX, args->ar_num_glockd);
++ error = -EINVAL;
++ break;
++ }
++ }
++
++ else if (!strcmp(x, "acl"))
++ args->ar_posixacls = TRUE;
++
++ /* Unknown */
++
++ else {
++ printk("GFS: unknown option: %s\n", x);
++ error = -EINVAL;
++ break;
++ }
++ }
++
++ if (error)
++ printk("GFS: invalid mount option(s)\n");
++
++ if (do_free)
++ kfree(data);
++
++ return error;
++}
++
++/**
++ * gfs_proc_write - Read in some mount options
++ * @file: unused
++ * @buffer: a buffer of mount options
++ * @count: the length of the mount options
++ * @data: unused
++ *
++ * Called when someone writes to /proc/fs/gfs.
++ * It allows you to specify mount options when you can't do it
++ * from mount(8), e.g. from an initial ramdisk.
++ *
++ * Returns: count on success, -EXXX on failure
++ */
++
++int
++gfs_proc_write(struct file *file,
++ const char *buffer, unsigned long count,
++ void *data)
++{
++ int error;
++ char *p;
++
++ if (!try_module_get(THIS_MODULE))
++ return -EAGAIN; /* Huh!?! */
++ down(&gfs_mount_args_lock);
++
++ if (gfs_mount_args) {
++ kfree(gfs_mount_args);
++ gfs_mount_args = NULL;
++ }
++
++ if (!count) {
++ error = 0;
++ goto fail;
++ }
++
++ gfs_mount_args = gmalloc(count + 1);
++
++ error = -EFAULT;
++ if (copy_from_user(gfs_mount_args, buffer, count))
++ goto fail_free;
++
++ gfs_mount_args[count] = 0;
++
++ /* Get rid of extra newlines */
++
++ for (p = gfs_mount_args; *p; p++)
++ if (*p == '\n')
++ *p = 0;
++
++ up(&gfs_mount_args_lock);
++ module_put(THIS_MODULE);
++
++ return count;
++
++ fail_free:
++ kfree(gfs_mount_args);
++ gfs_mount_args = NULL;
++
++ fail:
++ up(&gfs_mount_args_lock);
++ module_put(THIS_MODULE);
++ return error;
++}
+diff -urN linux-orig/fs/gfs/mount.h linux-patched/fs/gfs/mount.h
+--- linux-orig/fs/gfs/mount.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/mount.h 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,27 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __MOUNT_DOT_H__
++#define __MOUNT_DOT_H__
++
++int gfs_make_args(char *data, struct gfs_args *args);
++
++/* Allow args to be passed to GFS when using an initial ram disk */
++
++extern char *gfs_mount_args;
++extern struct semaphore gfs_mount_args_lock;
++
++int gfs_proc_write(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++
++#endif /* __MOUNT_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ondisk.c linux-patched/fs/gfs/ondisk.c
+--- linux-orig/fs/gfs/ondisk.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ondisk.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,28 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++
++#define pv(struct, member, fmt) printk(" "#member" = "fmt"\n", struct->member);
++
++#define WANT_GFS_CONVERSION_FUNCTIONS
++#include <linux/gfs_ondisk.h>
++
+diff -urN linux-orig/fs/gfs/ops_address.c linux-patched/fs/gfs/ops_address.c
+--- linux-orig/fs/gfs/ops_address.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_address.c 2004-06-20 22:48:17.951945841 -0500
+@@ -0,0 +1,476 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/pagemap.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "dio.h"
++#include "file.h"
++#include "glock.h"
++#include "inode.h"
++#include "ops_address.h"
++#include "page.h"
++#include "quota.h"
++#include "trans.h"
++
++/**
++ * get_block - Fills in a buffer head with details about a block
++ * @inode: The inode
++ * @lblock: The block number to look up
++ * @bh_result: The buffer head to return the result in
++ * @create: Non-zero if we may add a block to the file
++ *
++ * Returns: errno
++ */
++
++static int
++get_block(struct inode *inode, sector_t lblock,
++ struct buffer_head *bh_result, int create)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ int new = create;
++ uint64_t dblock;
++ int error;
++
++ error = gfs_block_map(ip, lblock, &new, &dblock, NULL);
++ if (error)
++ return error;
++
++ GFS_ASSERT_INODE(dblock || !create, ip,);
++
++ if (!dblock)
++ return 0;
++
++ map_bh(bh_result, inode->i_sb, dblock);
++ if (new)
++ set_buffer_new(bh_result);
++
++ return 0;
++}
++
++/**
++ * get_block_noalloc - Fills in a buffer head with details about a block
++ * @inode: The inode
++ * @lblock: The block number to look up
++ * @bh_result: The buffer head to return the result in
++ * @create: Non-zero if we may add a block to the file
++ *
++ * Returns: errno
++ */
++
++static int
++get_block_noalloc(struct inode *inode, sector_t lblock,
++ struct buffer_head *bh_result, int create)
++{
++ int error;
++
++ error = get_block(inode, lblock, bh_result, FALSE);
++
++ GFS_ASSERT_INODE(!create || buffer_mapped(bh_result),
++ vn2ip(inode),);
++
++ return error;
++}
++
++/**
++ * get_blocks - Fills in buffer heads with details about an extent of blocks
++ * @inode: The inode
++ * @lblock: The first block number to look up
++ * @max_blocks: The maximum number of blocks to map
++ * @bh_result: The buffer head to return the result in
++ * @create: Non-zero if we may add a block to the file
++ *
++ * Returns: errno
++ */
++
++static int
++get_blocks(struct inode *inode, sector_t lblock,
++ unsigned long max_blocks,
++ struct buffer_head *bh_result, int create)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ int new = create;
++ uint64_t dblock;
++ uint32_t extlen;
++ int error;
++
++ error = gfs_block_map(ip, lblock, &new, &dblock, &extlen);
++ if (error)
++ return error;
++
++ GFS_ASSERT_INODE(dblock || !create, ip,);
++
++ if (!dblock)
++ return 0;
++
++ map_bh(bh_result, inode->i_sb, dblock);
++ if (new)
++ set_buffer_new(bh_result);
++
++ if (extlen > max_blocks)
++ extlen = max_blocks;
++ bh_result->b_size = extlen << inode->i_blkbits;
++
++ return 0;
++}
++
++/**
++ * get_blocks_noalloc - Fills in buffer heads; asserts nothing needs allocating
++ * @inode: The inode
++ * @lblock: The first block number to look up
++ * @max_blocks: The maximum number of blocks to map
++ * @bh_result: The buffer head to return the result in
++ * @create: Non-zero if we may add a block to the file
++ *
++ * Returns: errno
++ */
++
++static int
++get_blocks_noalloc(struct inode *inode, sector_t lblock,
++ unsigned long max_blocks,
++ struct buffer_head *bh_result, int create)
++{
++ int error;
++
++ error = get_blocks(inode, lblock, max_blocks, bh_result, FALSE);
++
++ GFS_ASSERT_INODE(!create || buffer_mapped(bh_result),
++ vn2ip(inode),);
++
++ return error;
++}
++
++/**
++ * gfs_writepage - Write complete page
++ * @page: Page to write
++ * @wbc: the writeback control for this page
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_writepage(struct page *page, struct writeback_control *wbc)
++{
++ struct gfs_inode *ip = vn2ip(page->mapping->host);
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_address);
++
++ GFS_ASSERT_INODE(gfs_glock_is_held_excl(ip->i_gl) &&
++ !gfs_is_stuffed(ip), ip,);
++
++ error = block_write_full_page(page, get_block_noalloc, wbc);
++
++ gfs_flush_meta_cache(ip);
++
++ if (error == -EIO)
++ gfs_io_error_inode(ip);
++
++ return error;
++}
++
++/**
++ * stuffed_readpage - Fill in a Linux page with stuffed file data
++ * @ip: the inode
++ * @page: the page
++ *
++ * Returns: errno
++ */
++
++static int
++stuffed_readpage(struct gfs_inode *ip, struct page *page)
++{
++ struct buffer_head *dibh;
++ void *kaddr;
++ int error;
++
++ GFS_ASSERT_INODE(PageLocked(page), ip,);
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (!error) {
++ kaddr = kmap(page);
++ memcpy((char *)kaddr,
++ dibh->b_data + sizeof(struct gfs_dinode),
++ ip->i_di.di_size);
++ memset((char *)kaddr + ip->i_di.di_size,
++ 0,
++ PAGE_CACHE_SIZE - ip->i_di.di_size);
++ kunmap(page);
++
++ brelse(dibh);
++
++ SetPageUptodate(page);
++ }
++
++ return error;
++}
++
++/**
++ * readi_readpage - readpage that goes through gfs_internal_read()
++ * @page: The page to read
++ *
++ * Returns: errno
++ */
++
++static int
++readi_readpage(struct page *page)
++{
++ struct gfs_inode *ip = vn2ip(page->mapping->host);
++ void *kaddr;
++ int ret;
++
++ kaddr = kmap(page);
++
++ ret = gfs_internal_read(ip, kaddr,
++ (uint64_t)page->index << PAGE_CACHE_SHIFT,
++ PAGE_CACHE_SIZE);
++ if (ret >= 0) {
++ if (ret < PAGE_CACHE_SIZE)
++ memset(kaddr + ret, 0, PAGE_CACHE_SIZE - ret);
++ SetPageUptodate(page);
++ ret = 0;
++ }
++
++ kunmap(page);
++
++ unlock_page(page);
++
++ return ret;
++}
++
++/**
++ * gfs_readpage - readpage with locking
++ * @file: The file to read a page for
++ * @page: The page to read
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_readpage(struct file *file, struct page *page)
++{
++ struct gfs_inode *ip = vn2ip(page->mapping->host);
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_address);
++
++ if (!gfs_glock_is_locked_by_me(ip->i_gl)) {
++ unlock_page(page);
++ bitch_about(ip->i_sbd, &ip->i_sbd->sd_last_unlocked_aop,
++ "unlocked readpage request");
++ return -ENOSYS;
++ }
++
++ if (!gfs_is_jdata(ip)) {
++ if (gfs_is_stuffed(ip) && !page->index) {
++ error = stuffed_readpage(ip, page);
++ unlock_page(page);
++ } else
++ error = block_read_full_page(page, get_block);
++ } else
++ error = readi_readpage(page);
++
++ if (error == -EIO)
++ gfs_io_error_inode(ip);
++
++ return error;
++}
++
++/**
++ * gfs_prepare_write - Prepare to write to a file
++ * @file: The file to write to
++ * @page: The page which is to be prepared for writing
++ * @from: From (byte range within page)
++ * @to: To (byte range within page)
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_prepare_write(struct file *file, struct page *page,
++ unsigned from, unsigned to)
++{
++ struct gfs_inode *ip = vn2ip(page->mapping->host);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_address);
++
++ if (!gfs_glock_is_locked_by_me(ip->i_gl)) {
++ bitch_about(sdp, &sdp->sd_last_unlocked_aop,
++ "unlocked prepare_write request");
++ return -ENOSYS;
++ }
++
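++ /* A write that would overflow the space available in the dinode
++ unstuffs the file into real data blocks first; otherwise the
++ stuffed contents are simply pulled into the page. */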
++ if (gfs_is_stuffed(ip)) {
++ uint64_t file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to;
++
++ if (file_size > sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode)) {
++ error = gfs_unstuff_dinode(ip, gfs_unstuffer_page, page);
++ if (!error)
++ error = block_prepare_write(page, from, to, get_block);
++ } else if (!PageUptodate(page))
++ error = stuffed_readpage(ip, page);
++ } else
++ error = block_prepare_write(page, from, to, get_block);
++
++ if (error == -EIO)
++ gfs_io_error_inode(ip);
++
++ return error;
++}
++
++/**
++ * gfs_commit_write - Commit write to a file
++ * @file: The file to write to
++ * @page: The page containing the data
++ * @from: From (byte range within page)
++ * @to: To (byte range within page)
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_commit_write(struct file *file, struct page *page,
++ unsigned from, unsigned to)
++{
++ struct inode *inode = page->mapping->host;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_address);
++
++ if (gfs_is_stuffed(ip)) {
++ struct buffer_head *dibh;
++ uint64_t file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to;
++ void *kaddr;
++
++ GFS_ASSERT_INODE(PageLocked(page), ip,);
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail;
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++
++ kaddr = kmap(page);
++ memcpy(dibh->b_data + sizeof(struct gfs_dinode) + from,
++ (char *)kaddr + from,
++ to - from);
++ kunmap(page);
++
++ brelse(dibh);
++
++ SetPageUptodate(page);
++
++ if (inode->i_size < file_size)
++ i_size_write(inode, file_size);
++ } else {
++ error = generic_commit_write(file, page, from, to);
++ if (error)
++ goto fail;
++ }
++
++ return 0;
++
++ fail:
++ ClearPageUptodate(page);
++
++ return error;
++}
++
++/**
++ * gfs_bmap - Block map function
++ * @mapping: Address space info
++ * @lblock: The block to map
++ *
++ * Returns: The disk address for the block or 0 on hole or error
++ */
++
++static sector_t
++gfs_bmap(struct address_space *mapping, sector_t lblock)
++{
++ struct gfs_inode *ip = vn2ip(mapping->host);
++ struct gfs_holder i_gh;
++	sector_t dblock = 0;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_address);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (error)
++ return 0;
++
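++	/* A stuffed file has no data blocks to map, so leave dblock at 0. */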
++ if (!gfs_is_stuffed(ip))
++ dblock = generic_block_bmap(mapping, lblock, get_block);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return dblock;
++}
++
++/**
++ * gfs_direct_IO - direct (uncached) I/O to/from a file
++ * @rw: READ or WRITE
++ * @iocb: the kernel I/O control block
++ * @iov: the vector of user buffers
++ * @offset: the file offset
++ * @nr_segs: the number of segments in @iov
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
++ loff_t offset, unsigned long nr_segs)
++{
++ struct file *file = iocb->ki_filp;
++ struct inode *inode = file->f_mapping->host;
++ struct gfs_inode *ip = vn2ip(inode);
++ get_blocks_t *gb = get_blocks;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_address);
++
++ GFS_ASSERT_INODE(gfs_glock_is_locked_by_me(ip->i_gl), ip,);
++ GFS_ASSERT_INODE(!gfs_is_stuffed(ip), ip,);
++
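++	/* A write done outside of a transaction must not allocate blocks
++	   here; the allocating write path starts its own transaction first. */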
++ if (rw == WRITE && !current_transaction)
++ gb = get_blocks_noalloc;
++
++ error = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
++ offset, nr_segs, gb, NULL);
++
++ if (error == -EIO)
++ gfs_io_error_inode(ip);
++
++ return error;
++}
++
++struct address_space_operations gfs_file_aops = {
++ .writepage = gfs_writepage,
++ .readpage = gfs_readpage,
++ .sync_page = block_sync_page,
++ .prepare_write = gfs_prepare_write,
++ .commit_write = gfs_commit_write,
++ .bmap = gfs_bmap,
++ .direct_IO = gfs_direct_IO,
++};
+diff -urN linux-orig/fs/gfs/ops_address.h linux-patched/fs/gfs/ops_address.h
+--- linux-orig/fs/gfs/ops_address.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_address.h 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_ADDRESS_DOT_H__
++#define __OPS_ADDRESS_DOT_H__
++
++extern struct address_space_operations gfs_file_aops;
++
++#endif /* __OPS_ADDRESS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_dentry.c linux-patched/fs/gfs/ops_dentry.c
+--- linux-orig/fs/gfs/ops_dentry.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_dentry.c 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,124 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dir.h"
++#include "glock.h"
++#include "ops_dentry.h"
++
++/**
++ * gfs_drevalidate - Check directory lookup consistency
++ * @dentry: the dentry to revalidate
++ * @nd: the nameidata from the lookup (unused)
++ *
++ * Check to make sure the lookup necessary to arrive at this inode from its
++ * parent is still good.
++ *
++ * Returns: 1 if the dentry is ok, 0 if it isn't
++ */
++
++static int
++gfs_drevalidate(struct dentry *dentry, struct nameidata *nd)
++{
++ struct dentry *parent = dget_parent(dentry);
++ struct gfs_inode *dip;
++ struct inode *inode;
++ struct gfs_holder d_gh;
++ struct gfs_inode *ip;
++ struct gfs_inum inum;
++ unsigned int type;
++ int error;
++
++ lock_kernel();
++
++ dip = vn2ip(parent->d_inode);
++ GFS_ASSERT(dip,);
++
++ atomic_inc(&dip->i_sbd->sd_ops_dentry);
++
++ if (dip->i_sbd->sd_args.ar_localcaching)
++ goto valid;
++
++ inode = dentry->d_inode;
++ if (inode && is_bad_inode(inode))
++ goto invalid;
++
++ error = gfs_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
++ if (error)
++ goto fail;
++
++ error = gfs_dir_search(dip, &dentry->d_name, &inum, &type);
++ switch (error) {
++ case 0:
++ if (!inode)
++ goto invalid_gunlock;
++ break;
++ case -ENOENT:
++ if (!inode)
++ goto valid_gunlock;
++ goto invalid_gunlock;
++ default:
++ goto fail_gunlock;
++ }
++
++ ip = vn2ip(inode);
++ GFS_ASSERT_SBD(ip, dip->i_sbd,);
++
++ if (ip->i_num.no_formal_ino != inum.no_formal_ino)
++ goto invalid_gunlock;
++
++ GFS_ASSERT_INODE(ip->i_di.di_type == type, ip,);
++
++ valid_gunlock:
++ gfs_glock_dq_uninit(&d_gh);
++
++ valid:
++ unlock_kernel();
++ dput(parent);
++ return 1;
++
++ invalid_gunlock:
++ gfs_glock_dq_uninit(&d_gh);
++
++ invalid:
++ if (inode && S_ISDIR(inode->i_mode)) {
++ if (have_submounts(dentry))
++ goto valid;
++ shrink_dcache_parent(dentry);
++ }
++ d_drop(dentry);
++
++ unlock_kernel();
++ dput(parent);
++ return 0;
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&d_gh);
++
++ fail:
++ unlock_kernel();
++ dput(parent);
++ return 0;
++}
++
++struct dentry_operations gfs_dops = {
++ .d_revalidate = gfs_drevalidate,
++};
+diff -urN linux-orig/fs/gfs/ops_dentry.h linux-patched/fs/gfs/ops_dentry.h
+--- linux-orig/fs/gfs/ops_dentry.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_dentry.h 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_DENTRY_DOT_H__
++#define __OPS_DENTRY_DOT_H__
++
++extern struct dentry_operations gfs_dops;
++
++#endif /* __OPS_DENTRY_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_export.c linux-patched/fs/gfs/ops_export.c
+--- linux-orig/fs/gfs/ops_export.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_export.c 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,415 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "dir.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "ops_export.h"
++#include "rgrp.h"
++
++struct inode_cookie
++{
++ uint64_t formal_ino;
++ uint32_t gen;
++ int gen_valid;
++};
++
++struct get_name_filldir
++{
++ uint64_t formal_ino;
++ char *name;
++};
++
++/**
++ * gfs_decode_fh - Decode an NFS file handle into a dentry
++ * @sb: the filesystem's superblock
++ * @fh: the file handle data
++ * @fh_len: the length of @fh, in 32-bit words
++ * @fh_type: the type of handle (3, 5, or 6 words)
++ * @acceptable: test whether a candidate dentry is acceptable
++ * @context: opaque data passed through to @acceptable
++ *
++ * Returns: the dentry on success, NULL on failure
++ */
++
++struct dentry *
++gfs_decode_fh(struct super_block *sb, __u32 *fh, int fh_len, int fh_type,
++ int (*acceptable)(void *context, struct dentry *dentry),
++ void *context)
++{
++ struct inode_cookie this, parent;
++
++ atomic_inc(&vfs2sdp(sb)->sd_ops_export);
++
++ if (fh_type != fh_len)
++ return NULL;
++
++ memset(&parent, 0, sizeof(struct inode_cookie));
++
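++	/* The cases below fall through intentionally: a connectable
++	   (5- or 6-word) handle also contains the 3-word "this" part. */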
++ switch (fh_type) {
++ case 6:
++ parent.gen_valid = TRUE;
++		parent.gen = gfs32_to_cpu(fh[5]);
++ case 5:
++ parent.formal_ino = ((uint64_t)gfs32_to_cpu(fh[3])) << 32;
++ parent.formal_ino |= (uint64_t)gfs32_to_cpu(fh[4]);
++ case 3:
++ this.gen_valid = TRUE;
++ this.gen = gfs32_to_cpu(fh[2]);
++ this.formal_ino = ((uint64_t)gfs32_to_cpu(fh[0])) << 32;
++ this.formal_ino |= (uint64_t)gfs32_to_cpu(fh[1]);
++ break;
++ default:
++ return NULL;
++ }
++
++ return gfs_export_ops.find_exported_dentry(sb, &this, &parent,
++ acceptable, context);
++}
++
++/**
++ * gfs_encode_fh - Encode a dentry into an NFS file handle
++ * @dentry: the dentry to encode
++ * @fh: the buffer to fill in
++ * @len: in: the size of @fh in 32-bit words; out: the words used
++ * @connectable: TRUE if the parent directory should be encoded too
++ *
++ * Returns: the handle type on success, 255 if the buffer is too small
++ */
++
++int
++gfs_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
++ int connectable)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ int maxlen = *len;
++
++ atomic_inc(&ip->i_sbd->sd_ops_export);
++
++ if (maxlen < 3)
++ return 255;
++
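++	/* Words 0-1 hold this inode's formal number and word 2 its
++	   generation; connectable handles append the parent's number
++	   (words 3-4) and generation (word 5). */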
++ fh[0] = cpu_to_gfs32((uint32_t)(ip->i_num.no_formal_ino >> 32));
++ fh[1] = cpu_to_gfs32((uint32_t)(ip->i_num.no_formal_ino & 0xFFFFFFFF));
++ fh[2] = cpu_to_gfs32(inode->i_generation);
++ *len = 3;
++
++ if (maxlen < 5 || !connectable)
++ return 3;
++
++ spin_lock(&dentry->d_lock);
++
++ inode = dentry->d_parent->d_inode;
++ ip = vn2ip(inode);
++
++ fh[3] = cpu_to_gfs32((uint32_t)(ip->i_num.no_formal_ino >> 32));
++ fh[4] = cpu_to_gfs32((uint32_t)(ip->i_num.no_formal_ino & 0xFFFFFFFF));
++ *len = 5;
++
++ if (maxlen < 6) {
++ spin_unlock(&dentry->d_lock);
++ return 5;
++ }
++
++ fh[5] = cpu_to_gfs32(inode->i_generation);
++
++ spin_unlock(&dentry->d_lock);
++
++ *len = 6;
++
++ return 6;
++}
++
++/**
++ * get_name_filldir - filldir callback used by gfs_get_name()
++ * @opaque: the struct get_name_filldir being filled in
++ * @name: the name of the current directory entry
++ * @length: the length of the name
++ * @offset: the entry's offset in the directory
++ * @inum: the inode number the entry points to
++ * @type: the type of inode the entry points to
++ *
++ * Returns: 0 to keep scanning, 1 once the sought inode has been found
++ */
++
++static int
++get_name_filldir(void *opaque,
++ const char *name, unsigned int length,
++ uint64_t offset,
++ struct gfs_inum *inum, unsigned int type)
++{
++ struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
++
++ if (inum->no_formal_ino != gnfd->formal_ino)
++ return 0;
++
++ memcpy(gnfd->name, name, length);
++ gnfd->name[length] = 0;
++
++ return 1;
++}
++
++/**
++ * gfs_get_name - find the name of a directory entry by inode number
++ * @parent: the directory to search
++ * @name: the buffer to fill with the name
++ * @child: the dentry whose inode is being searched for
++ *
++ * Scans @parent for an entry that points to @child's inode.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int gfs_get_name(struct dentry *parent, char *name,
++ struct dentry *child)
++{
++ struct inode *dir = parent->d_inode;
++ struct inode *inode = child->d_inode;
++ struct gfs_inode *dip, *ip;
++ struct get_name_filldir gnfd;
++ struct gfs_holder gh;
++ uint64_t offset = 0;
++ int error;
++
++ if (!dir)
++ return -EINVAL;
++
++ atomic_inc(&vfs2sdp(dir->i_sb)->sd_ops_export);
++
++ if (!S_ISDIR(dir->i_mode) || !inode)
++ return -EINVAL;
++
++ dip = vn2ip(dir);
++ ip = vn2ip(inode);
++
++ *name = 0;
++ gnfd.formal_ino = ip->i_num.no_formal_ino;
++ gnfd.name = name;
++
++ error = gfs_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
++ if (error)
++ return error;
++
++ error = gfs_dir_read(dip, &offset, &gnfd, get_name_filldir);
++
++ gfs_glock_dq_uninit(&gh);
++
++	if (!error && !*name)
++ error = -ENOENT;
++
++ return error;
++}
++
++/**
++ * gfs_get_parent - get the parent directory of a dentry
++ * @child: the dentry whose parent is wanted
++ *
++ * Looks up ".." in @child's inode and returns an anonymous
++ * dentry for the result.
++ *
++ * Returns: the parent's dentry, or ERR_PTR() on failure
++ */
++
++struct dentry *
++gfs_get_parent(struct dentry *child)
++{
++ struct gfs_inode *dip = vn2ip(child->d_inode);
++ struct gfs_holder d_gh, i_gh;
++ struct qstr dotdot = { .name = "..", .len = 2 };
++ struct gfs_inode *ip;
++ struct inode *inode;
++ struct dentry *dentry;
++ int error;
++
++ atomic_inc(&dip->i_sbd->sd_ops_export);
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++ error = gfs_lookupi(&d_gh, &dotdot, TRUE, &i_gh);
++ if (error)
++ goto fail;
++
++ error = -ENOENT;
++ if (!i_gh.gh_gl)
++ goto fail;
++
++ ip = gl2ip(i_gh.gh_gl);
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return ERR_PTR(-ENOMEM);
++
++ dentry = d_alloc_anon(inode);
++ if (!dentry) {
++ iput(inode);
++ return ERR_PTR(-ENOMEM);
++ }
++
++ return dentry;
++
++ fail:
++ gfs_holder_uninit(&d_gh);
++ return ERR_PTR(error);
++}
++
++/**
++ * gfs_get_dentry - look up an inode from a file handle cookie
++ * @sb: the filesystem's superblock
++ * @inump: the struct inode_cookie decoded from the file handle
++ *
++ * If the inode isn't in memory, the block is checked on disk to
++ * make sure it's still an in-use dinode.
++ *
++ * Returns: the dentry, or ERR_PTR() on failure
++ */
++
++struct dentry *
++gfs_get_dentry(struct super_block *sb, void *inump)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++ struct inode_cookie *cookie = (struct inode_cookie *)inump;
++ struct gfs_inum inum;
++ struct gfs_holder i_gh, ri_gh, rgd_gh;
++ struct gfs_rgrpd *rgd;
++ struct buffer_head *bh;
++ struct gfs_dinode *di;
++ struct gfs_inode *ip;
++ struct inode *inode;
++ struct dentry *dentry;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_export);
++
++ if (!cookie->formal_ino ||
++ cookie->formal_ino == sdp->sd_jiinode->i_num.no_formal_ino ||
++ cookie->formal_ino == sdp->sd_riinode->i_num.no_formal_ino ||
++ cookie->formal_ino == sdp->sd_qinode->i_num.no_formal_ino ||
++ cookie->formal_ino == sdp->sd_linode->i_num.no_formal_ino)
++ return ERR_PTR(-EINVAL);
++
++ inum.no_formal_ino = cookie->formal_ino;
++ inum.no_addr = cookie->formal_ino;
++
++ error = gfs_glock_nq_num(sdp,
++ inum.no_formal_ino, &gfs_inode_glops,
++ LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
++ &i_gh);
++ if (error)
++ return ERR_PTR(error);
++
++ error = gfs_inode_get(i_gh.gh_gl, &inum, NO_CREATE, &ip);
++ if (error)
++ goto fail;
++ if (ip)
++ goto out;
++
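++	/* The inode isn't in memory, so verify that the block is still
++	   an in-use dinode before instantiating it; this is what catches
++	   stale file handles and turns them into -ESTALE. */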
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ goto fail;
++
++ error = -EINVAL;
++ rgd = gfs_blk2rgrpd(sdp, inum.no_addr);
++ if (!rgd)
++ goto fail_rindex;
++
++ error = gfs_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
++ if (error)
++ goto fail_rindex;
++
++ error = -ESTALE;
++ if (gfs_get_block_type(rgd, inum.no_addr) != GFS_BLKST_USEDMETA)
++ goto fail_rgd;
++
++ error = gfs_dread(sdp, inum.no_addr, i_gh.gh_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto fail_rgd;
++
++ di = (struct gfs_dinode *)bh->b_data;
++
++ error = -ESTALE;
++ if (gfs32_to_cpu(di->di_header.mh_magic) != GFS_MAGIC ||
++ gfs32_to_cpu(di->di_header.mh_type) != GFS_METATYPE_DI ||
++ (gfs32_to_cpu(di->di_flags) & GFS_DIF_UNUSED))
++ goto fail_relse;
++
++ brelse(bh);
++ gfs_glock_dq_uninit(&rgd_gh);
++ gfs_glock_dq_uninit(&ri_gh);
++
++ error = gfs_inode_get(i_gh.gh_gl, &inum, CREATE, &ip);
++ if (error)
++ goto fail;
++
++ atomic_inc(&sdp->sd_fh2dentry_misses);
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return ERR_PTR(-ENOMEM);
++
++ if (cookie->gen_valid && cookie->gen != inode->i_generation) {
++ iput(inode);
++ return ERR_PTR(-ESTALE);
++ }
++
++ dentry = d_alloc_anon(inode);
++ if (!dentry) {
++ iput(inode);
++ return ERR_PTR(-ENOMEM);
++ }
++
++ return dentry;
++
++ fail_relse:
++ brelse(bh);
++
++ fail_rgd:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ fail_rindex:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ fail:
++ gfs_glock_dq_uninit(&i_gh);
++ return ERR_PTR(error);
++}
++
++struct export_operations gfs_export_ops = {
++ .decode_fh = gfs_decode_fh,
++ .encode_fh = gfs_encode_fh,
++ .get_name = gfs_get_name,
++ .get_parent = gfs_get_parent,
++ .get_dentry = gfs_get_dentry,
++};
++
+diff -urN linux-orig/fs/gfs/ops_export.h linux-patched/fs/gfs/ops_export.h
+--- linux-orig/fs/gfs/ops_export.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_export.h 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_EXPORT_DOT_H__
++#define __OPS_EXPORT_DOT_H__
++
++extern struct export_operations gfs_export_ops;
++
++#endif /* __OPS_EXPORT_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_file.c linux-patched/fs/gfs/ops_file.c
+--- linux-orig/fs/gfs/ops_file.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_file.c 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,1552 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <asm/uaccess.h>
++#include <linux/pagemap.h>
++#include <linux/uio.h>
++#include <linux/blkdev.h>
++#include <linux/mm.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "dio.h"
++#include "dir.h"
++#include "file.h"
++#include "flock.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "ioctl.h"
++#include "log.h"
++#include "ops_file.h"
++#include "ops_vm.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++
++struct filldir_bad_entry {
++ char *fbe_name;
++ unsigned int fbe_length;
++ uint64_t fbe_offset;
++ struct gfs_inum fbe_inum;
++ unsigned int fbe_type;
++};
++
++struct filldir_bad {
++ struct gfs_sbd *fdb_sbd;
++ int fdb_prefetch;
++
++ struct filldir_bad_entry *fdb_entry;
++ unsigned int fdb_entry_num;
++ unsigned int fdb_entry_off;
++
++ char *fdb_name;
++ unsigned int fdb_name_size;
++ unsigned int fdb_name_off;
++};
++
++struct filldir_reg {
++ struct gfs_sbd *fdr_sbd;
++ int fdr_prefetch;
++
++ filldir_t fdr_filldir;
++ void *fdr_opaque;
++};
++
++typedef ssize_t (*do_rw_t) (struct file *file,
++			    char *buf,
++			    size_t size, loff_t *offset,
++			    unsigned int num_gh, struct gfs_holder *ghs);
++
++/**
++ * gfs_llseek - seek to a location in a file
++ * @file: the file
++ * @offset: the offset
++ * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
++ *
++ * SEEK_END requires the glock for the file because it references the
++ * file's size.
++ *
++ * Returns: The new offset, or -EXXX on error
++ */
++
++static loff_t
++gfs_llseek(struct file *file, loff_t offset, int origin)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ struct gfs_holder i_gh;
++ loff_t error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++
++	if (origin == 2) { /* SEEK_END */
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (!error) {
++ error = remote_llseek(file, offset, origin);
++ gfs_glock_dq_uninit(&i_gh);
++ }
++ } else
++ error = remote_llseek(file, offset, origin);
++
++ return error;
++}
++
++#define vma2state(vma) \
++((((vma)->vm_flags & (VM_MAYWRITE | VM_MAYSHARE)) == \
++ (VM_MAYWRITE | VM_MAYSHARE)) ? \
++  LM_ST_EXCLUSIVE : LM_ST_SHARED)
++
++/**
++ * walk_vm_hard - acquire a glock for each GFS mapping under a buffer
++ * @file: the file to read/write from/to
++ * @buf: the userspace buffer
++ * @size: the amount of data requested
++ * @offset: the current file offset
++ * @operation: the read or write worker function
++ *
++ * Builds a holder for every GFS file mapped into @buf so that
++ * @operation can acquire all of the locks in the correct order.
++ *
++ * Returns: the number of bytes read/written, -EXXX on failure
++ */
++
++static ssize_t
++walk_vm_hard(struct file *file, char *buf, size_t size, loff_t *offset,
++ do_rw_t operation)
++{
++ struct gfs_holder *ghs;
++ unsigned int num_gh = 0;
++ ssize_t count;
++
++ {
++ struct super_block *sb = file->f_dentry->d_inode->i_sb;
++ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
++ unsigned long start = (unsigned long)buf;
++ unsigned long end = start + size;
++ int dumping = (current->flags & PF_DUMPCORE);
++ unsigned int x = 0;
++
++ for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
++ if (end <= vma->vm_start)
++ break;
++ if (vma->vm_file &&
++ vma->vm_file->f_dentry->d_inode->i_sb == sb) {
++ num_gh++;
++ }
++ }
++
++ ghs = kmalloc((num_gh + 1) * sizeof(struct gfs_holder), GFP_KERNEL);
++ if (!ghs) {
++ if (!dumping)
++ up_read(&mm->mmap_sem);
++ return -ENOMEM;
++ }
++
++ for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
++ if (end <= vma->vm_start)
++ break;
++ if (vma->vm_file) {
++ struct inode *inode = vma->vm_file->f_dentry->d_inode;
++ if (inode->i_sb == sb)
++ gfs_holder_init(vn2ip(inode)->i_gl,
++ vma2state(vma),
++ 0, &ghs[x++]);
++ }
++ }
++
++ if (!dumping)
++ up_read(&mm->mmap_sem);
++
++ GFS_ASSERT_SBD(x == num_gh, vfs2sdp(sb),);
++ }
++
++ count = operation(file, buf, size, offset, num_gh, ghs);
++
++ while (num_gh--)
++ gfs_holder_uninit(&ghs[num_gh]);
++ kfree(ghs);
++
++ return count;
++}
++
++/**
++ * walk_vm - Walk the vmas associated with a buffer for read or write.
++ * If any of them are gfs, pass the gfs inode down to the read/write
++ * worker function so that locks can be acquired in the correct order.
++ * @file: The file to read/write from/to
++ * @buf: The buffer to copy to/from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ * @operation: The read or write worker function
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -errno on failure
++ */
++
++static ssize_t
++walk_vm(struct file *file, char *buf, size_t size, loff_t *offset,
++ do_rw_t operation)
++{
++ if (current->mm) {
++ struct super_block *sb = file->f_dentry->d_inode->i_sb;
++ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
++ unsigned long start = (unsigned long)buf;
++ unsigned long end = start + size;
++ int dumping = (current->flags & PF_DUMPCORE);
++
++ if (!dumping)
++ down_read(&mm->mmap_sem);
++
++ for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
++ if (end <= vma->vm_start)
++ break;
++ if (vma->vm_file &&
++ vma->vm_file->f_dentry->d_inode->i_sb == sb)
++ goto do_locks;
++ }
++
++ if (!dumping)
++ up_read(&mm->mmap_sem);
++ }
++
++ {
++ struct gfs_holder gh;
++ return operation(file, buf, size, offset, 0, &gh);
++ }
++
++ do_locks:
++ return walk_vm_hard(file, buf, size, offset, operation);
++}
++
++/**
++ * do_read_readi - read from a file through gfs_readi()
++ * @file: the file to read from
++ * @buf: the buffer to copy into
++ * @size: the amount of data requested
++ * @offset: the current file offset
++ *
++ * Used for journaled-data and stuffed files, which bypass the page cache.
++ *
++ * Returns: the number of bytes read, -EXXX on failure
++ */
++
++static ssize_t
++do_read_readi(struct file *file, char *buf, size_t size, loff_t *offset)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ ssize_t count = 0;
++
++ if (*offset < 0)
++ return -EINVAL;
++ if (!access_ok(VERIFY_WRITE, buf, size))
++ return -EFAULT;
++
++ if (!(file->f_flags & O_LARGEFILE)) {
++ if (*offset >= 0x7FFFFFFFull)
++ return -EFBIG;
++ if (*offset + size > 0x7FFFFFFFull)
++ size = 0x7FFFFFFFull - *offset;
++ }
++
++ count = gfs_readi(ip, buf, *offset, size, gfs_copy2user);
++
++ if (count > 0)
++ *offset += count;
++
++ return count;
++}
++
++/**
++ * do_read_direct - Read bytes from a file
++ * @file: The file to read from
++ * @buf: The buffer to copy into
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ * @num_gh: The number of other locks we need to do the read
++ * @ghs: the locks we need plus one for our lock
++ *
++ * Outputs: Offset - updated according to number of bytes read
++ *
++ * Returns: The number of bytes read, -EXXX on failure
++ */
++
++static ssize_t
++do_read_direct(struct file *file, char *buf, size_t size, loff_t *offset,
++ unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct inode *inode = file->f_mapping->host;
++ struct gfs_inode *ip = vn2ip(inode);
++ unsigned int state = LM_ST_DEFERRED;
++ int flags = 0;
++ unsigned int x;
++ ssize_t count = 0;
++ int error;
++
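++	/* Direct I/O normally uses the deferred state, which lets all
++	   nodes do uncached I/O concurrently while keeping cached
++	   (buffered) holders out. If this inode's glock is already in
++	   the holder set, fall back to shared and locally exclusive. */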
++ for (x = 0; x < num_gh; x++)
++ if (ghs[x].gh_gl == ip->i_gl) {
++ state = LM_ST_SHARED;
++ flags |= GL_LOCAL_EXCL;
++ break;
++ }
++
++ gfs_holder_init(ip->i_gl, state, flags, &ghs[num_gh]);
++
++ error = gfs_glock_nq_m(num_gh + 1, ghs);
++ if (error)
++ goto out;
++
++ error = -EINVAL;
++ if (gfs_is_jdata(ip))
++ goto out_gunlock;
++
++ if (gfs_is_stuffed(ip)) {
++ size_t mask = bdev_hardsect_size(inode->i_sb->s_bdev) - 1;
++
++ if (((*offset) & mask) || (((unsigned long)buf) & mask))
++ goto out_gunlock;
++
++ count = do_read_readi(file, buf, size & ~mask, offset);
++	} else
++ count = generic_file_read(file, buf, size, offset);
++
++ error = 0;
++
++ out_gunlock:
++ gfs_glock_dq_m(num_gh + 1, ghs);
++
++ out:
++ gfs_holder_uninit(&ghs[num_gh]);
++
++ return (count) ? count : error;
++}
++
++/**
++ * do_read_buf - Read bytes from a file
++ * @file: The file to read from
++ * @buf: The buffer to copy into
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ * @num_gh: The number of other locks we need to do the read
++ * @ghs: the locks we need plus one for our lock
++ *
++ * Outputs: Offset - updated according to number of bytes read
++ *
++ * Returns: The number of bytes read, -EXXX on failure
++ */
++
++static ssize_t
++do_read_buf(struct file *file, char *buf, size_t size, loff_t *offset,
++ unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ ssize_t count = 0;
++ int error;
++
++ gfs_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &ghs[num_gh]);
++
++ error = gfs_glock_nq_m_atime(num_gh + 1, ghs);
++ if (error)
++ goto out;
++
++ if (gfs_is_jdata(ip) ||
++ (gfs_is_stuffed(ip) && !test_bit(GIF_PAGED, &ip->i_flags)))
++ count = do_read_readi(file, buf, size, offset);
++ else
++ count = generic_file_read(file, buf, size, offset);
++
++ gfs_glock_dq_m(num_gh + 1, ghs);
++
++ out:
++ gfs_holder_uninit(&ghs[num_gh]);
++
++ return (count) ? count : error;
++}
++
++/**
++ * gfs_read - Read bytes from a file
++ * @file: The file to read from
++ * @buf: The buffer to copy into
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ *
++ * Outputs: Offset - updated according to number of bytes read
++ *
++ * Returns: The number of bytes read, -EXXX on failure
++ */
++
++static ssize_t
++gfs_read(struct file *file, char *buf, size_t size, loff_t *offset)
++{
++ atomic_inc(&vfs2sdp(file->f_mapping->host->i_sb)->sd_ops_file);
++
++ if (file->f_flags & O_DIRECT)
++ return walk_vm(file, buf, size, offset, do_read_direct);
++ else
++ return walk_vm(file, buf, size, offset, do_read_buf);
++}
++
++/**
++ * grope_mapping - feel up a mapping that needs to be written
++ * @buf: the start of the memory to be written
++ * @size: the size of the memory to be written
++ *
++ * We do this after acquiring the locks on the mapping,
++ * but before starting the write transaction. We need to make
++ * sure that we don't cause recursive transactions if blocks
++ * need to be allocated to the file backing the mapping.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++grope_mapping(char *buf, size_t size)
++{
++ unsigned long start = (unsigned long)buf;
++ unsigned long stop = start + size;
++ char c;
++
++ while (start < stop) {
++ if (copy_from_user(&c, (char *)start, 1))
++ return -EFAULT;
++
++ start += PAGE_CACHE_SIZE;
++ start &= PAGE_CACHE_MASK;
++ }
++
++ return 0;
++}
++
++/**
++ * do_write_direct_alloc - Write bytes to a file
++ * @file: The file to write to
++ * @buf: The buffer to copy from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -EXXX on failure
++ */
++
++static ssize_t
++do_write_direct_alloc(struct file *file, char *buf, size_t size, loff_t *offset)
++{
++ struct inode *inode = file->f_mapping->host;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = NULL;
++ struct iovec local_iov = { .iov_base = buf, .iov_len = size };
++ struct buffer_head *dibh;
++ unsigned int data_blocks, ind_blocks;
++ ssize_t count;
++ int error;
++
++ gfs_write_calc_reserv(ip, size, &data_blocks, &ind_blocks);
++
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail;
++
++ error = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ al->al_requested_meta = ind_blocks;
++ al->al_requested_data = data_blocks;
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ All blocks for a RG bitmap, whatever indirect blocks we
++ need, a modified dinode, and a quota change. */
++
++ error = gfs_trans_begin(sdp,
++ 1 + al->al_rgd->rd_ri.ri_length + ind_blocks,
++ 1);
++ if (error)
++ goto fail_ipres;
++
++ if ((ip->i_di.di_mode & (S_ISUID | S_ISGID)) && !capable(CAP_FSETID)) {
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ ip->i_di.di_mode &= (ip->i_di.di_mode & S_IXGRP) ? (~(S_ISUID | S_ISGID)) : (~S_ISUID);
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++ }
++
++ if (gfs_is_stuffed(ip)) {
++ error = gfs_unstuff_dinode(ip, gfs_unstuffer_sync, NULL);
++ if (error)
++ goto fail_end_trans;
++ }
++
++ count = generic_file_write_nolock(file, &local_iov, 1, offset);
++ if (count < 0) {
++ error = count;
++ goto fail_end_trans;
++ }
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ if (ip->i_di.di_size < inode->i_size)
++ ip->i_di.di_size = inode->i_size;
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++
++ if (file->f_flags & O_SYNC)
++ gfs_log_flush_glock(ip->i_gl);
++
++ gfs_inplace_release(ip);
++ gfs_quota_unlock_m(ip);
++ gfs_alloc_put(ip);
++
++ return count;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ gfs_inplace_release(ip);
++
++ fail_gunlock_q:
++ gfs_quota_unlock_m(ip);
++
++ fail:
++ gfs_alloc_put(ip);
++
++ return error;
++}
++
++/**
++ * do_write_direct - Write bytes to a file
++ * @file: The file to write to
++ * @buf: The buffer to copy from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ * @num_gh: The number of other locks we need to do the write
++ * @ghs: the locks we need plus one for our lock
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -EXXX on failure
++ */
++
++static ssize_t
++do_write_direct(struct file *file, char *buf, size_t size, loff_t *offset,
++ unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_file *fp = vf2fp(file);
++ unsigned int state = LM_ST_DEFERRED;
++ int alloc_required;
++ unsigned int x;
++ size_t s;
++ ssize_t count = 0;
++ int error;
++
++ if (test_bit(GFF_DID_DIRECT_ALLOC, &fp->f_flags))
++ state = LM_ST_EXCLUSIVE;
++ else
++ for (x = 0; x < num_gh; x++)
++ if (ghs[x].gh_gl == ip->i_gl) {
++ state = LM_ST_EXCLUSIVE;
++ break;
++ }
++
++ restart:
++ gfs_holder_init(ip->i_gl, state, 0, &ghs[num_gh]);
++
++ error = gfs_glock_nq_m(num_gh + 1, ghs);
++ if (error)
++ goto out;
++
++ error = -EINVAL;
++ if (gfs_is_jdata(ip))
++ goto out_gunlock;
++
++ if (num_gh) {
++ error = grope_mapping(buf, size);
++ if (error)
++ goto out_gunlock;
++ }
++
++ if (file->f_flags & O_APPEND)
++ *offset = ip->i_di.di_size;
++
++ if (!(file->f_flags & O_LARGEFILE)) {
++ error = -EFBIG;
++ if (*offset >= 0x7FFFFFFFull)
++ goto out_gunlock;
++ if (*offset + size > 0x7FFFFFFFull)
++ size = 0x7FFFFFFFull - *offset;
++ }
++
++ if (gfs_is_stuffed(ip) ||
++ *offset + size > ip->i_di.di_size ||
++ ((ip->i_di.di_mode & (S_ISUID | S_ISGID)) && !capable(CAP_FSETID)))
++ alloc_required = TRUE;
++ else {
++ error = gfs_write_alloc_required(ip, *offset, size,
++ &alloc_required);
++ if (error)
++ goto out_gunlock;
++ }
++
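++	/* Allocation requires an exclusive glock; drop all of the locks
++	   and retry the whole sequence with the stronger state. */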
++ if (alloc_required && state != LM_ST_EXCLUSIVE) {
++ gfs_glock_dq_m(num_gh + 1, ghs);
++ gfs_holder_uninit(&ghs[num_gh]);
++ state = LM_ST_EXCLUSIVE;
++ goto restart;
++ }
++
++ if (alloc_required) {
++ set_bit(GFF_DID_DIRECT_ALLOC, &fp->f_flags);
++
++ while (size) {
++ s = sdp->sd_tune.gt_max_atomic_write;
++ if (s > size)
++ s = size;
++
++ error = do_write_direct_alloc(file, buf, s, offset);
++ if (error < 0)
++ goto out_gunlock;
++
++ buf += error;
++ size -= error;
++ count += error;
++ }
++ } else {
++ struct iovec local_iov = { .iov_base = buf, .iov_len = size };
++ struct gfs_holder t_gh;
++
++ clear_bit(GFF_DID_DIRECT_ALLOC, &fp->f_flags);
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
++ if (error)
++ goto out_gunlock;
++
++ count = generic_file_write_nolock(file, &local_iov, 1, offset);
++
++ gfs_glock_dq_uninit(&t_gh);
++ }
++
++ error = 0;
++
++ out_gunlock:
++ gfs_glock_dq_m(num_gh + 1, ghs);
++
++ out:
++ gfs_holder_uninit(&ghs[num_gh]);
++
++ return (count) ? count : error;
++}
++
++/**
++ * do_do_write_buf - Write bytes to a file
++ * @file: The file to write to
++ * @buf: The buffer to copy from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -EXXX on failure
++ */
++
++static ssize_t
++do_do_write_buf(struct file *file, char *buf, size_t size, loff_t *offset)
++{
++ struct inode *inode = file->f_mapping->host;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = NULL;
++ struct buffer_head *dibh;
++ unsigned int data_blocks, ind_blocks;
++ int alloc_required, journaled;
++ ssize_t count;
++ int error;
++
++ journaled = gfs_is_jdata(ip);
++
++ gfs_write_calc_reserv(ip, size, &data_blocks, &ind_blocks);
++
++ error = gfs_write_alloc_required(ip, *offset, size, &alloc_required);
++ if (error)
++ return error;
++
++ if (alloc_required) {
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail;
++
++ error = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ if (journaled)
++ al->al_requested_meta = ind_blocks + data_blocks;
++ else {
++ al->al_requested_meta = ind_blocks;
++ al->al_requested_data = data_blocks;
++ }
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ All blocks for a RG bitmap, whatever indirect blocks we
++ need, a modified dinode, and a quota change. */
++
++ error = gfs_trans_begin(sdp,
++ 1 + al->al_rgd->rd_ri.ri_length +
++ ind_blocks +
++ ((journaled) ? data_blocks : 0), 1);
++ if (error)
++ goto fail_ipres;
++ } else {
++ /* Trans may require:
++ A modified dinode. */
++
++ error = gfs_trans_begin(sdp,
++ 1 + ((journaled) ? data_blocks : 0), 0);
++ if (error)
++ goto fail_ipres;
++ }
++
++ if ((ip->i_di.di_mode & (S_ISUID | S_ISGID)) && !capable(CAP_FSETID)) {
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ ip->i_di.di_mode &= (ip->i_di.di_mode & S_IXGRP) ? (~(S_ISUID | S_ISGID)) : (~S_ISUID);
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++ }
++
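++	/* Journaled data, and small writes that still fit in the stuffed
++	   dinode of an unmapped file, go through gfs_writei(); everything
++	   else goes through the page cache. */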
++ if (journaled ||
++ (gfs_is_stuffed(ip) && !test_bit(GIF_PAGED, &ip->i_flags) &&
++ *offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode))) {
++
++ count = gfs_writei(ip, buf, *offset, size, gfs_copy_from_user);
++ if (count < 0) {
++ error = count;
++ goto fail_end_trans;
++ }
++
++ *offset += count;
++ } else {
++ struct iovec local_iov = { .iov_base = buf, .iov_len = size };
++
++ count = generic_file_write_nolock(file, &local_iov, 1, offset);
++ if (count < 0) {
++ error = count;
++ goto fail_end_trans;
++ }
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ if (ip->i_di.di_size < inode->i_size)
++ ip->i_di.di_size = inode->i_size;
++ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++ }
++
++ gfs_trans_end(sdp);
++
++ if (file->f_flags & O_SYNC)
++ gfs_log_flush_glock(ip->i_gl);
++
++ if (alloc_required) {
++ GFS_ASSERT_INODE(count != size ||
++ al->al_alloced_meta ||
++ al->al_alloced_data, ip,);
++ gfs_inplace_release(ip);
++ gfs_quota_unlock_m(ip);
++ gfs_alloc_put(ip);
++ }
++
++ return count;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ if (alloc_required)
++ gfs_inplace_release(ip);
++
++ fail_gunlock_q:
++ if (alloc_required)
++ gfs_quota_unlock_m(ip);
++
++ fail:
++ if (alloc_required)
++ gfs_alloc_put(ip);
++
++ return error;
++}
++
++/**
++ * do_write_buf - Write bytes to a file
++ * @file: The file to write to
++ * @buf: The buffer to copy from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ * @num_gh: The number of other locks we need to do the write
++ * @ghs: the locks we need plus one for our lock
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -EXXX on failure
++ */
++
++static ssize_t
++do_write_buf(struct file *file,
++ char *buf, size_t size, loff_t *offset,
++ unsigned int num_gh, struct gfs_holder *ghs)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ size_t s;
++ ssize_t count = 0;
++ int error;
++
++ gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[num_gh]);
++
++ error = gfs_glock_nq_m(num_gh + 1, ghs);
++ if (error)
++ goto out;
++
++ if (num_gh) {
++ error = grope_mapping(buf, size);
++ if (error)
++ goto out_gunlock;
++ }
++
++ if (file->f_flags & O_APPEND)
++ *offset = ip->i_di.di_size;
++
++ if (!(file->f_flags & O_LARGEFILE)) {
++ error = -EFBIG;
++ if (*offset >= 0x7FFFFFFFull)
++ goto out_gunlock;
++ if (*offset + size > 0x7FFFFFFFull)
++ size = 0x7FFFFFFFull - *offset;
++ }
++
++ while (size) {
++ s = sdp->sd_tune.gt_max_atomic_write;
++ if (s > size)
++ s = size;
++
++ error = do_do_write_buf(file, buf, s, offset);
++ if (error < 0)
++ goto out_gunlock;
++
++ buf += error;
++ size -= error;
++ count += error;
++ }
++
++ error = 0;
++
++ out_gunlock:
++ gfs_glock_dq_m(num_gh + 1, ghs);
++
++ out:
++ gfs_holder_uninit(&ghs[num_gh]);
++
++ return (count) ? count : error;
++}
++
++/**
++ * gfs_write - Write bytes to a file
++ * @file: The file to write to
++ * @buf: The buffer to copy from
++ * @size: The amount of data requested
++ * @offset: The current file offset
++ *
++ * Outputs: Offset - updated according to number of bytes written
++ *
++ * Returns: The number of bytes written, -EXXX on failure
++ */
++
++static ssize_t
++gfs_write(struct file *file, const char *buf, size_t size, loff_t *offset)
++{
++ struct inode *inode = file->f_mapping->host;
++ ssize_t count;
++
++ atomic_inc(&vfs2sdp(inode->i_sb)->sd_ops_file);
++
++ if (*offset < 0)
++ return -EINVAL;
++ if (!access_ok(VERIFY_READ, buf, size))
++ return -EFAULT;
++
++ down(&inode->i_sem);
++ if (file->f_flags & O_DIRECT)
++ count = walk_vm(file, (char *)buf, size, offset, do_write_direct);
++ else
++ count = walk_vm(file, (char *)buf, size, offset, do_write_buf);
++ up(&inode->i_sem);
++
++ return count;
++}
++
++/**
++ * filldir_reg_func - Report a directory entry to the caller of gfs_dir_read()
++ * @opaque: opaque data used by the function
++ * @name: the name of the directory entry
++ * @length: the length of the name
++ * @offset: the entry's offset in the directory
++ * @inum: the inode number the entry points to
++ * @type: the type of inode the entry points to
++ *
++ * Returns: 0 on success, 1 if buffer full
++ */
++
++static int
++filldir_reg_func(void *opaque,
++ const char *name, unsigned int length,
++ uint64_t offset,
++ struct gfs_inum *inum, unsigned int type)
++{
++ struct filldir_reg *fdr = (struct filldir_reg *)opaque;
++ struct gfs_sbd *sdp = fdr->fdr_sbd;
++ unsigned int vfs_type;
++ int error;
++
++ switch (type) {
++ case GFS_FILE_NON:
++ vfs_type = DT_UNKNOWN;
++ break;
++ case GFS_FILE_REG:
++ vfs_type = DT_REG;
++ break;
++ case GFS_FILE_DIR:
++ vfs_type = DT_DIR;
++ break;
++ case GFS_FILE_LNK:
++ vfs_type = DT_LNK;
++ break;
++ case GFS_FILE_BLK:
++ vfs_type = DT_BLK;
++ break;
++ case GFS_FILE_CHR:
++ vfs_type = DT_CHR;
++ break;
++ case GFS_FILE_FIFO:
++ vfs_type = DT_FIFO;
++ break;
++ case GFS_FILE_SOCK:
++ vfs_type = DT_SOCK;
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,
++ printk("type = %u\n", type););
++ }
++
++ error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
++ inum->no_formal_ino, vfs_type);
++ if (error)
++ return 1;
++
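++	/* Prefetch each entry's inode and iopen glocks (skipping ".")
++	   so later lookups and stats don't stall on the lock manager. */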
++ if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
++ gfs_glock_prefetch_num(sdp,
++ inum->no_formal_ino, &gfs_inode_glops,
++ LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
++ gfs_glock_prefetch_num(sdp,
++ inum->no_addr, &gfs_iopen_glops,
++ LM_ST_SHARED, LM_FLAG_TRY);
++ }
++
++ return 0;
++}
++
++/**
++ * readdir_reg - Read directory entries from a directory
++ * @file: The directory to read from
++ * @dirent: Buffer for dirents
++ * @filldir: Function used to do the copying
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++readdir_reg(struct file *file, void *dirent, filldir_t filldir)
++{
++ struct gfs_inode *dip = vn2ip(file->f_mapping->host);
++ struct filldir_reg fdr;
++ struct gfs_holder d_gh;
++ uint64_t offset = file->f_pos;
++ int error;
++
++ fdr.fdr_sbd = dip->i_sbd;
++ fdr.fdr_prefetch = GFS_ASYNC_LM(dip->i_sbd);
++ fdr.fdr_filldir = filldir;
++ fdr.fdr_opaque = dirent;
++
++ gfs_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
++ error = gfs_glock_nq_atime(&d_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++
++ error = gfs_dir_read(dip, &offset, &fdr, filldir_reg_func);
++
++ gfs_glock_dq_uninit(&d_gh);
++
++ file->f_pos = offset;
++
++ return error;
++}
++
++/**
++ * filldir_bad_func - Report a directory entry to the caller of gfs_dir_read()
++ * @opaque: opaque data used by the function
++ * @name: the name of the directory entry
++ * @length: the length of the name
++ * @offset: the entry's offset in the directory
++ * @inum: the inode number the entry points to
++ * @type: the type of inode the entry points to
++ *
++ * Returns: 0 on success, 1 if buffer full
++ */
++
++static int
++filldir_bad_func(void *opaque,
++ const char *name, unsigned int length,
++ uint64_t offset,
++ struct gfs_inum *inum, unsigned int type)
++{
++ struct filldir_bad *fdb = (struct filldir_bad *)opaque;
++ struct gfs_sbd *sdp = fdb->fdb_sbd;
++ struct filldir_bad_entry *fbe;
++
++ if (fdb->fdb_entry_off == fdb->fdb_entry_num ||
++ fdb->fdb_name_off + length > fdb->fdb_name_size)
++ return 1;
++
++ fbe = &fdb->fdb_entry[fdb->fdb_entry_off];
++ fbe->fbe_name = fdb->fdb_name + fdb->fdb_name_off;
++ memcpy(fbe->fbe_name, name, length);
++ fbe->fbe_length = length;
++ fbe->fbe_offset = offset;
++ fbe->fbe_inum = *inum;
++ fbe->fbe_type = type;
++
++ fdb->fdb_entry_off++;
++ fdb->fdb_name_off += length;
++
++ if (fdb->fdb_prefetch && !(length == 1 && *name == '.')) {
++ gfs_glock_prefetch_num(sdp,
++ inum->no_formal_ino, &gfs_inode_glops,
++ LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
++ gfs_glock_prefetch_num(sdp,
++ inum->no_addr, &gfs_iopen_glops,
++ LM_ST_SHARED, LM_FLAG_TRY);
++ }
++
++ return 0;
++}
++
++/**
++ * readdir_bad - Read directory entries from a directory
++ * @file: The directory to read from
++ * @dirent: Buffer for dirents
++ * @filldir: Function used to do the copying
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++readdir_bad(struct file *file, void *dirent, filldir_t filldir)
++{
++ struct gfs_inode *dip = vn2ip(file->f_mapping->host);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct filldir_reg fdr;
++ unsigned int entries, size;
++ struct filldir_bad *fdb;
++ struct gfs_holder d_gh;
++ uint64_t offset = file->f_pos;
++ unsigned int x;
++ struct filldir_bad_entry *fbe;
++ int error;
++
++ entries = sdp->sd_tune.gt_entries_per_readdir;
++ size = sizeof(struct filldir_bad) +
++ entries * (sizeof(struct filldir_bad_entry) + GFS_FAST_NAME_SIZE);
++
++ fdb = gmalloc(size);
++ memset(fdb, 0, size);
++
++ fdb->fdb_sbd = sdp;
++ fdb->fdb_prefetch = GFS_ASYNC_LM(sdp);
++ fdb->fdb_entry = (struct filldir_bad_entry *)(fdb + 1);
++ fdb->fdb_entry_num = entries;
++ fdb->fdb_name = ((char *)fdb) + sizeof(struct filldir_bad) +
++ entries * sizeof(struct filldir_bad_entry);
++ fdb->fdb_name_size = entries * GFS_FAST_NAME_SIZE;
++
++ gfs_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
++ error = gfs_glock_nq_atime(&d_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ goto out;
++ }
++
++ error = gfs_dir_read(dip, &offset, fdb, filldir_bad_func);
++
++ gfs_glock_dq_uninit(&d_gh);
++
++ fdr.fdr_sbd = sdp;
++ fdr.fdr_prefetch = FALSE;
++ fdr.fdr_filldir = filldir;
++ fdr.fdr_opaque = dirent;
++
++ for (x = 0; x < fdb->fdb_entry_off; x++) {
++ fbe = &fdb->fdb_entry[x];
++
++ error = filldir_reg_func(&fdr,
++ fbe->fbe_name, fbe->fbe_length,
++ fbe->fbe_offset,
++ &fbe->fbe_inum, fbe->fbe_type);
++ if (error) {
++ file->f_pos = fbe->fbe_offset;
++ error = 0;
++ goto out;
++ }
++ }
++
++ file->f_pos = offset;
++
++ out:
++ kfree(fdb);
++
++ return error;
++}
++
++/**
++ * gfs_readdir - Read directory entries from a directory
++ * @file: The directory to read from
++ * @dirent: Buffer for dirents
++ * @filldir: Function used to do the copying
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_readdir(struct file *file, void *dirent, filldir_t filldir)
++{
++ int error;
++
++ atomic_inc(&vfs2sdp(file->f_mapping->host->i_sb)->sd_ops_file);
++
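++	/* nfsd's filldir may call back into the filesystem, which would
++	   deadlock on the directory glock held across gfs_dir_read(), so
++	   buffer the entries first and deliver them with the lock dropped. */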
++ if (strcmp(current->comm, "nfsd") != 0)
++ error = readdir_reg(file, dirent, filldir);
++ else
++ error = readdir_bad(file, dirent, filldir);
++
++ return error;
++}
++
++/**
++ * gfs_ioctl - do an ioctl on a file
++ * @inode: the inode
++ * @file: the file pointer
++ * @cmd: the ioctl command
++ * @arg: the argument
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_ioctl(struct inode *inode, struct file *file,
++ unsigned int cmd, unsigned long arg)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++ return gfs_ioctli(ip, cmd, (void *)arg);
++}
++
++/**
++ * gfs_open - open a file
++ * @inode: the inode to open
++ * @file: the struct file for this opening
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_open(struct inode *inode, struct file *file)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_holder i_gh;
++ struct gfs_file *fp;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++
++ fp = gmalloc(sizeof(struct gfs_file));
++ memset(fp, 0, sizeof(struct gfs_file));
++
++ init_MUTEX(&fp->f_fl_lock);
++
++ fp->f_inode = ip;
++ fp->f_vfile = file;
++
++ GFS_ASSERT_INODE(!vf2fp(file), ip,);
++ vf2fp(file) = fp;
++
++ if (ip->i_di.di_type == GFS_FILE_REG) {
++ error = gfs_glock_nq_init(ip->i_gl,
++ LM_ST_SHARED, LM_FLAG_ANY,
++ &i_gh);
++ if (error)
++ goto fail;
++
++ if (!(file->f_flags & O_LARGEFILE) &&
++ ip->i_di.di_size > 0x7FFFFFFFull) {
++ error = -EFBIG;
++ goto fail_gunlock;
++ }
++
++ /* If this is an exclusive create, make sure our gfs_create()
++ says we created the file. The O_EXCL flag isn't passed
++ to gfs_create(), so we have to check it here. */
++
++ if (file->f_flags & O_CREAT) {
++ if (ip->i_creat_task == current &&
++ ip->i_creat_pid == current->pid) {
++ ip->i_creat_task = NULL;
++ ip->i_creat_pid = 0;
++ } else if (file->f_flags & O_EXCL) {
++ error = -EEXIST;
++ goto fail_gunlock;
++ }
++ }
++
++ /* Listen to the Direct I/O flag */
++
++ if (ip->i_di.di_flags & GFS_DIF_DIRECTIO)
++ file->f_flags |= O_DIRECT;
++
++ /* Don't let the user open O_DIRECT on a jdata file */
++
++ if ((file->f_flags & O_DIRECT) && gfs_is_jdata(ip)) {
++ error = -EINVAL;
++ goto fail_gunlock;
++ }
++
++ gfs_glock_dq_uninit(&i_gh);
++ }
++
++ return 0;
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&i_gh);
++
++ fail:
++ vf2fp(file) = NULL;
++ kfree(fp);
++
++ return error;
++}
++
++/**
++ * gfs_close - called to close a struct file
++ * @inode: the inode the struct file belongs to
++ * @file: the struct file being closed
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_close(struct inode *inode, struct file *file)
++{
++ struct gfs_file *fp;
++
++ atomic_inc(&vfs2sdp(inode->i_sb)->sd_ops_file);
++
++ fp = vf2fp(file);
++ vf2fp(file) = NULL;
++
++ GFS_ASSERT(fp,);
++
++ kfree(fp);
++
++ return 0;
++}
++
++/**
++ * gfs_fsync - sync the dirty data for a file (across the cluster)
++ * @file: the file to sync
++ * @dentry: the dentry that points to the inode to sync
++ * @datasync: TRUE to sync only the file's data, not its metadata
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_fsync(struct file *file, struct dentry *dentry, int datasync)
++{
++ struct gfs_inode *ip = vn2ip(dentry->d_inode);
++ struct gfs_holder i_gh;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
++ if (error)
++ return error;
++
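++	/* Journaled data is made safe by flushing the log; otherwise,
++	   marking the holder GL_SYNC should cause the dirty data to be
++	   flushed when the glock is released below. */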
++ if (gfs_is_jdata(ip))
++ gfs_log_flush_glock(ip->i_gl);
++ else
++ i_gh.gh_flags |= GL_SYNC;
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_lock - acquire/release a flock or posix lock on a file
++ * @file: the file pointer
++ * @cmd: either modify or retrieve lock state, possibly wait
++ * @fl: type and range of lock
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_lock(struct file *file, int cmd, struct file_lock *fl)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct lm_lockname name;
++ uint64_t start = fl->fl_start, end = fl->fl_end;
++ pid_t pid = fl->fl_pid;
++ int plock = (fl->fl_flags & FL_POSIX);
++ int flock = (fl->fl_flags & FL_FLOCK);
++ int get, set, wait, ex, sh, un;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_file);
++
++ if (sdp->sd_args.ar_localflocks)
++ return LOCK_USE_CLNT;
++
++ if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
++ return -ENOLCK;
++
++ if (!flock && !plock)
++ return -ENOLCK;
++
++ get = (IS_GETLK(cmd)) ? TRUE : FALSE;
++ set = (IS_SETLK(cmd)) ? TRUE : FALSE;
++ wait = (IS_SETLKW(cmd)) ? TRUE : FALSE;
++
++ if ((flock && (get || (!set && !wait))) ||
++ (plock && (!get && !set && !wait)))
++ return -EINVAL;
++
++ ex = (fl->fl_type == F_WRLCK) ? TRUE : FALSE;
++ sh = (fl->fl_type == F_RDLCK) ? TRUE : FALSE;
++ un = (fl->fl_type == F_UNLCK) ? TRUE : FALSE;
++
++ if (!ex && !sh && !un)
++ return -EINVAL;
++
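++	/* flocks are implemented within GFS itself; posix locks are
++	   handed to the lock module so they are visible cluster-wide. */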
++ if (flock) {
++ struct gfs_file *fp = vf2fp(file);
++ GFS_ASSERT(fp,);
++
++ if (un)
++ error = gfs_funlock(fp);
++ else
++ error = gfs_flock(fp, ex, wait);
++ } else {
++ name.ln_number = ip->i_num.no_formal_ino;
++ name.ln_type = LM_TYPE_PLOCK;
++ if (get) {
++ error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
++ sdp->sd_lockstruct.ls_lockspace,
++ &name, (unsigned long)fl->fl_owner,
++ &start, &end, &ex, (unsigned long*)&pid);
++ if (error < 0)
++ return error;
++
++ fl->fl_type = F_UNLCK;
++ if (!error)
++ return error;
++
++ fl->fl_start = start;
++ fl->fl_end = end;
++ fl->fl_pid = pid;
++ fl->fl_type = (ex) ? F_WRLCK : F_RDLCK;
++
++ error = 0;
++ } else if (un)
++ error = sdp->sd_lockstruct.ls_ops->lm_punlock(
++ sdp->sd_lockstruct.ls_lockspace,
++ &name, (unsigned long)fl->fl_owner,
++ start, end);
++ else
++ error = sdp->sd_lockstruct.ls_ops->lm_plock(
++ sdp->sd_lockstruct.ls_lockspace,
++ &name, (unsigned long)fl->fl_owner,
++ wait, ex, start, end);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_sendfile - Send bytes to a file or socket
++ * @in_file: The file to read from
++ * @offset: The beginning file offset
++ * @count: The amount of data
++ * @actor: The function that does the copying
++ * @target: The opaque destination passed to @actor
++ *
++ * Outputs: offset - updated according to number of bytes read
++ *
++ * Returns: The number of bytes sent, -EXXX on failure
++ */
++
++static ssize_t
++gfs_sendfile(struct file *in_file, loff_t *offset, size_t count, read_actor_t actor, void __user *target)
++{
++ struct gfs_inode *ip = vn2ip(in_file->f_mapping->host);
++ struct gfs_holder gh;
++ ssize_t retval;
++
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++
++ gfs_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
++
++ retval = gfs_glock_nq_atime(&gh);
++ if (retval)
++ goto out;
++
++ if (gfs_is_jdata(ip))
++ retval = -ENOSYS;
++ else
++ retval = generic_file_sendfile(in_file, offset, count, actor, target);
++
++ gfs_glock_dq(&gh);
++
++ out:
++ gfs_holder_uninit(&gh);
++
++ return retval;
++}
++
++/**
++ * gfs_mmap - set up a memory mapping of a file
++ * @file: The file to map
++ * @vma: The VMA which described the mapping
++ *
++ * Returns: 0 or error code
++ */
++
++static int
++gfs_mmap(struct file *file, struct vm_area_struct *vma)
++{
++ struct gfs_inode *ip = vn2ip(file->f_mapping->host);
++ struct gfs_holder i_gh;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_file);
++
++ gfs_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
++ error = gfs_glock_nq_atime(&i_gh);
++ if (error) {
++ gfs_holder_uninit(&i_gh);
++ return error;
++ }
++
++ if (gfs_is_jdata(ip)) {
++ if (vma->vm_flags & VM_MAYSHARE)
++ error = -ENOSYS;
++ else
++ vma->vm_ops = &gfs_vm_ops_private;
++ } else {
++ /* This is VM_MAYWRITE instead of VM_WRITE because a call
++ to mprotect() can turn on VM_WRITE later. */
++
++ if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) == (VM_MAYSHARE | VM_MAYWRITE))
++ vma->vm_ops = &gfs_vm_ops_sharewrite;
++ else
++ vma->vm_ops = &gfs_vm_ops_private;
++ }
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++struct file_operations gfs_file_fops = {
++ .llseek = gfs_llseek,
++ .read = gfs_read,
++ .write = gfs_write,
++ .ioctl = gfs_ioctl,
++ .mmap = gfs_mmap,
++ .open = gfs_open,
++ .release = gfs_close,
++ .fsync = gfs_fsync,
++ .lock = gfs_lock,
++ .sendfile = gfs_sendfile,
++};
++
++struct file_operations gfs_dir_fops = {
++ .readdir = gfs_readdir,
++ .ioctl = gfs_ioctl,
++ .open = gfs_open,
++ .release = gfs_close,
++ .fsync = gfs_fsync,
++ .lock = gfs_lock,
++};
+diff -urN linux-orig/fs/gfs/ops_file.h linux-patched/fs/gfs/ops_file.h
+--- linux-orig/fs/gfs/ops_file.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_file.h 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_FILE_DOT_H__
++#define __OPS_FILE_DOT_H__
++
++extern struct file_operations gfs_file_fops;
++extern struct file_operations gfs_dir_fops;
++
++#endif /* __OPS_FILE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_fstype.c linux-patched/fs/gfs/ops_fstype.c
+--- linux-orig/fs/gfs/ops_fstype.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_fstype.c 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,626 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/vmalloc.h>
++#include <linux/blkdev.h>
++
++#include "gfs.h"
++#include "daemon.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "locking.h"
++#include "mount.h"
++#include "ops_export.h"
++#include "ops_fstype.h"
++#include "ops_super.h"
++#include "quota.h"
++#include "recovery.h"
++#include "rgrp.h"
++#include "super.h"
++#include "unlinked.h"
++
++/**
++ * fill_super - Read in the superblock and set up the VFS super_block
++ * @sb: The VFS superblock
++ * @data: Mount options
++ * @silent: Don't complain if it's not a GFS filesystem
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++fill_super(struct super_block *sb, void *data, int silent)
++{
++ struct gfs_sbd *sdp;
++ struct gfs_holder mount_gh, sb_gh, ji_gh;
++ struct inode *inode;
++ int super = TRUE, jindex = TRUE;
++ unsigned int x;
++ int error;
++
++ error = -ENOMEM;
++ sdp = vmalloc(sizeof(struct gfs_sbd));
++ if (!sdp)
++ goto fail;
++
++ memset(sdp, 0, sizeof(struct gfs_sbd));
++
++ vfs2sdp(sb) = sdp;
++ sdp->sd_vfs = sb;
++
++ /* Init rgrp variables */
++
++ INIT_LIST_HEAD(&sdp->sd_rglist);
++ init_MUTEX(&sdp->sd_rindex_lock);
++ INIT_LIST_HEAD(&sdp->sd_rg_mru_list);
++ spin_lock_init(&sdp->sd_rg_mru_lock);
++ INIT_LIST_HEAD(&sdp->sd_rg_recent);
++ spin_lock_init(&sdp->sd_rg_recent_lock);
++ spin_lock_init(&sdp->sd_rg_forward_lock);
++
++ for (x = 0; x < GFS_GL_HASH_SIZE; x++) {
++ sdp->sd_gl_hash[x].hb_lock = RW_LOCK_UNLOCKED;
++ INIT_LIST_HEAD(&sdp->sd_gl_hash[x].hb_list);
++ }
++
++ INIT_LIST_HEAD(&sdp->sd_reclaim_list);
++ spin_lock_init(&sdp->sd_reclaim_lock);
++ init_waitqueue_head(&sdp->sd_reclaim_wchan);
++
++ for (x = 0; x < GFS_MHC_HASH_SIZE; x++)
++ INIT_LIST_HEAD(&sdp->sd_mhc[x]);
++ INIT_LIST_HEAD(&sdp->sd_mhc_single);
++ spin_lock_init(&sdp->sd_mhc_lock);
++
++ for (x = 0; x < GFS_DEPEND_HASH_SIZE; x++)
++ INIT_LIST_HEAD(&sdp->sd_depend[x]);
++ spin_lock_init(&sdp->sd_depend_lock);
++
++ init_MUTEX(&sdp->sd_freeze_lock);
++
++ init_MUTEX(&sdp->sd_thread_lock);
++ init_completion(&sdp->sd_thread_completion);
++
++ spin_lock_init(&sdp->sd_log_seg_lock);
++ INIT_LIST_HEAD(&sdp->sd_log_seg_list);
++ init_waitqueue_head(&sdp->sd_log_seg_wait);
++ INIT_LIST_HEAD(&sdp->sd_log_ail);
++ INIT_LIST_HEAD(&sdp->sd_log_incore);
++ init_MUTEX(&sdp->sd_log_lock);
++ INIT_LIST_HEAD(&sdp->sd_unlinked_list);
++ spin_lock_init(&sdp->sd_unlinked_lock);
++ INIT_LIST_HEAD(&sdp->sd_quota_list);
++ spin_lock_init(&sdp->sd_quota_lock);
++
++ INIT_LIST_HEAD(&sdp->sd_dirty_j);
++ spin_lock_init(&sdp->sd_dirty_j_lock);
++
++ spin_lock_init(&sdp->sd_ail_lock);
++ INIT_LIST_HEAD(&sdp->sd_recovery_bufs);
++
++ gfs_init_tune_data(sdp);
++
++ error = gfs_make_args((char *)data, &sdp->sd_args);
++ if (error) {
++ printk("GFS: can't parse mount arguments\n");
++ goto fail_vfree;
++ }
++
++ /* Copy out mount flags */
++
++ if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
++ set_bit(SDF_NOATIME, &sdp->sd_flags);
++ if (sb->s_flags & MS_RDONLY)
++ set_bit(SDF_ROFS, &sdp->sd_flags);
++
++	/* Set up the virtual superblock */
++
++ sb->s_magic = GFS_MAGIC;
++ sb->s_op = &gfs_super_ops;
++ sb->s_export_op = &gfs_export_ops;
++ sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
++ sb->s_maxbytes = ~0ULL;
++
++ if (sdp->sd_args.ar_posixacls)
++ sb->s_flags |= MS_POSIXACL;
++
++ /* Set up the buffer cache and fill in some fake values
++ to allow us to read in the superblock. */
++
++ sdp->sd_sb.sb_bsize = bdev_hardsect_size(sb->s_bdev);
++ if (sdp->sd_sb.sb_bsize < GFS_BASIC_BLOCK)
++ sdp->sd_sb.sb_bsize = GFS_BASIC_BLOCK;
++ sdp->sd_sb.sb_bsize_shift = ffs(sdp->sd_sb.sb_bsize) - 1;
++ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS_BASIC_BLOCK_SHIFT;
++ sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
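++	/* Worked example, assuming the usual GFS_BASIC_BLOCK of 512 bytes
++	   (GFS_BASIC_BLOCK_SHIFT == 9): a device with 512-byte sectors
++	   gives sb_bsize = 512 and sb_bsize_shift = ffs(512) - 1 = 9, so
++	   sd_fsb2bb_shift = 0 and sd_fsb2bb = 1 -- one basic block per FS
++	   block until the real superblock is read in below. */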
++
++ GFS_ASSERT_SBD(sizeof(struct gfs_sb) <= sdp->sd_sb.sb_bsize, sdp,);
++
++ set_blocksize(sb->s_bdev, sdp->sd_sb.sb_bsize);
++ sb->s_blocksize = sdp->sd_sb.sb_bsize;
++ sb->s_blocksize_bits = sdp->sd_sb.sb_bsize_shift;
++
++ error = gfs_mount_lockproto(sdp, silent);
++ if (error)
++ goto fail_vfree;
++
++ printk("GFS: fsid=%s: Joined cluster. Now mounting FS...\n",
++ sdp->sd_fsname);
++
++ if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
++ !sdp->sd_args.ar_ignore_local_fs) {
++ /* Force local [p|f]locks */
++ sdp->sd_args.ar_localflocks = TRUE;
++
++ /* Force local read ahead and caching */
++ sdp->sd_args.ar_localcaching = TRUE;
++ }
++
++ /* Start up the scand thread */
++
++ error = kernel_thread(gfs_scand, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start scand thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_lockproto;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Start up the glockd thread */
++
++ for (sdp->sd_glockd_num = 0;
++ sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
++ sdp->sd_glockd_num++) {
++ error = kernel_thread(gfs_glockd, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start glockd thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_glockd;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++ }
++
++ error = gfs_glock_nq_num(sdp,
++ GFS_MOUNT_LOCK, &gfs_nondisk_glops,
++ LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
++ &mount_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't acquire mount glock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_glockd;
++ }
++
++ error = gfs_glock_nq_num(sdp,
++ GFS_LIVE_LOCK, &gfs_nondisk_glops,
++ LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT,
++ &sdp->sd_live_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't acquire live glock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_mount;
++ }
++
++ sdp->sd_live_gh.gh_owner = NULL;
++
++ error = gfs_glock_nq_num(sdp,
++ GFS_SB_LOCK, &gfs_meta_glops,
++ (sdp->sd_args.ar_upgrade) ? LM_ST_EXCLUSIVE : LM_ST_SHARED,
++ 0, &sb_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't acquire superblock glock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_live;
++ }
++
++ error = gfs_read_sb(sdp, sb_gh.gh_gl, silent);
++ if (error) {
++ printk("GFS: fsid=%s: can't read superblock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_sb;
++ }
++
++ /* Set up the buffer cache and SB for real */
++
++ error = -EINVAL;
++ if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
++ printk("GFS: fsid=%s: FS block size (%u) is too small for device block size (%u)\n",
++ sdp->sd_fsname, sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
++ goto fail_gunlock_sb;
++ }
++ if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
++ printk("GFS: fsid=%s: FS block size (%u) is too big for machine page size (%u)\n",
++ sdp->sd_fsname, sdp->sd_sb.sb_bsize,
++ (unsigned int)PAGE_SIZE);
++ goto fail_gunlock_sb;
++ }
++
++ /* Get rid of buffers from the original block size */
++ sb_gh.gh_gl->gl_ops->go_inval(sb_gh.gh_gl, DIO_METADATA | DIO_DATA);
++ sb_gh.gh_gl->gl_aspace->i_blkbits = sdp->sd_sb.sb_bsize_shift;
++
++ set_blocksize(sb->s_bdev, sdp->sd_sb.sb_bsize);
++ sb->s_blocksize = sdp->sd_sb.sb_bsize;
++ sb->s_blocksize_bits = sdp->sd_sb.sb_bsize_shift;
++
++ /* Read in journal index inode */
++
++ error = gfs_get_jiinode(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't get journal index inode: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_sb;
++ }
++
++ init_MUTEX(&sdp->sd_jindex_lock);
++
++ /* Get a handle on the transaction glock */
++
++ error = gfs_glock_get(sdp, GFS_TRANS_LOCK, &gfs_trans_glops,
++ CREATE, &sdp->sd_trans_gl);
++ if (error)
++ goto fail_ji_free;
++ set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
++
++ /* Upgrade version numbers if we need to */
++
++ if (sdp->sd_args.ar_upgrade) {
++ error = gfs_do_upgrade(sdp, sb_gh.gh_gl);
++ if (error)
++ goto fail_trans_gl;
++ }
++
++ /* Load in the journal index */
++
++ error = gfs_jindex_hold(sdp, &ji_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't read journal index: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_trans_gl;
++ }
++
++ error = -EINVAL;
++ if (sdp->sd_lockstruct.ls_jid >= sdp->sd_journals) {
++ printk("GFS: fsid=%s: can't mount journal #%u\n",
++ sdp->sd_fsname, sdp->sd_lockstruct.ls_jid);
++ printk("GFS: fsid=%s: there are only %u journals (0 - %u)\n",
++ sdp->sd_fsname, sdp->sd_journals, sdp->sd_journals - 1);
++ goto fail_gunlock_ji;
++ }
++ sdp->sd_jdesc = sdp->sd_jindex[sdp->sd_lockstruct.ls_jid];
++ sdp->sd_log_seg_free = sdp->sd_jdesc.ji_nsegment - 1;
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_jdesc.ji_addr, &gfs_meta_glops,
++ LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
++ &sdp->sd_journal_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't acquire the journal glock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_ji;
++ }
++
++ if (sdp->sd_lockstruct.ls_first) {
++ for (x = 0; x < sdp->sd_journals; x++) {
++ error = gfs_recover_journal(sdp,
++ x, sdp->sd_jindex + x,
++ TRUE);
++ if (error) {
++ printk("GFS: fsid=%s: error recovering journal %u: %d\n",
++ sdp->sd_fsname, x, error);
++ goto fail_gunlock_journal;
++ }
++ }
++
++ sdp->sd_lockstruct.ls_ops->lm_others_may_mount(sdp->sd_lockstruct.ls_lockspace);
++ sdp->sd_lockstruct.ls_first = FALSE;
++ } else {
++ error = gfs_recover_journal(sdp,
++ sdp->sd_lockstruct.ls_jid, &sdp->sd_jdesc,
++ TRUE);
++ if (error) {
++ printk("GFS: fsid=%s: error recovering my journal: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_journal;
++ }
++ }
++
++ gfs_glock_dq_uninit(&ji_gh);
++ jindex = FALSE;
++
++ /* Disown my Journal glock */
++
++ sdp->sd_journal_gh.gh_owner = NULL;
++
++ /* Drop our cache and reread all the things we read before the replay. */
++
++ error = gfs_read_sb(sdp, sb_gh.gh_gl, FALSE);
++ if (error) {
++ printk("GFS: fsid=%s: can't read superblock: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_journal;
++ }
++
++ gfs_glock_force_drop(sdp->sd_jiinode->i_gl);
++
++ error = gfs_jindex_hold(sdp, &ji_gh);
++ if (error) {
++ printk("GFS: fsid=%s: can't read journal index: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_journal;
++ }
++ gfs_glock_dq_uninit(&ji_gh);
++
++ /* Make the FS read/write */
++
++ if (!test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ error = gfs_make_fs_rw(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't make FS RW: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_gunlock_journal;
++ }
++ }
++
++ /* Start up the recover thread */
++
++ error = kernel_thread(gfs_recoverd, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start recoverd thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_recover_dump;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Read in the resource index inode */
++
++ error = gfs_get_riinode(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't get resource index inode: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_recoverd;
++ }
++
++ /* Get the root inode */
++
++ error = gfs_get_rootinode(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't read in root inode: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_ri_free;
++ }
++
++ /* Read in the quota inode */
++
++ error = gfs_get_qinode(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't get quota file inode: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_root_free;
++ }
++
++ /* Read in the license inode */
++
++ error = gfs_get_linode(sdp);
++ if (error) {
++ printk("GFS: fsid=%s: can't get license file inode: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_qi_free;
++ }
++
++ /* We're through with the superblock lock */
++
++ gfs_glock_dq_uninit(&sb_gh);
++ super = FALSE;
++
++ /* Get the inode/dentry */
++
++ inode = gfs_iget(sdp->sd_rooti, CREATE);
++ if (!inode) {
++ printk("GFS: fsid=%s: can't get root inode\n", sdp->sd_fsname);
++ error = -ENOMEM;
++ goto fail_li_free;
++ }
++
++ sb->s_root = d_alloc_root(inode);
++ if (!sb->s_root) {
++ iput(inode);
++ printk("GFS: fsid=%s: can't get root dentry\n", sdp->sd_fsname);
++ error = -ENOMEM;
++ goto fail_li_free;
++ }
++
++ /* Start up the logd thread */
++
++ sdp->sd_jindex_refresh_time = jiffies;
++
++ error = kernel_thread(gfs_logd, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start logd thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_dput;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Start up the quotad thread */
++
++ error = kernel_thread(gfs_quotad, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start quotad thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_logd;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Start up the inoded thread */
++
++ error = kernel_thread(gfs_inoded, sdp, 0);
++ if (error < 0) {
++ printk("GFS: fsid=%s: can't start inoded thread: %d\n",
++ sdp->sd_fsname, error);
++ goto fail_quotad;
++ }
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Get a handle on the rename lock */
++
++ error = gfs_glock_get(sdp, GFS_RENAME_LOCK, &gfs_nondisk_glops,
++ CREATE, &sdp->sd_rename_gl);
++ if (error)
++ goto fail_inoded;
++
++ gfs_glock_dq_uninit(&mount_gh);
++
++ return 0;
++
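++	/* Error unwinding: each fail_* label below undoes one of the setup
++	   steps above and falls through to the next label, so teardown
++	   happens in exactly the reverse order of setup. */
++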
++ fail_inoded:
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_INODED_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_inoded_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ fail_quotad:
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_QUOTAD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_quotad_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ fail_logd:
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_LOGD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_logd_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ fail_dput:
++ dput(sb->s_root);
++
++ fail_li_free:
++ gfs_inode_put(sdp->sd_linode);
++
++ fail_qi_free:
++ gfs_inode_put(sdp->sd_qinode);
++
++ fail_root_free:
++ gfs_inode_put(sdp->sd_rooti);
++
++ fail_ri_free:
++ gfs_inode_put(sdp->sd_riinode);
++ gfs_clear_rgrpd(sdp);
++
++ fail_recoverd:
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_RECOVERD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_recoverd_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ fail_recover_dump:
++ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
++ gfs_unlinked_cleanup(sdp);
++ gfs_quota_cleanup(sdp);
++
++ fail_gunlock_journal:
++ gfs_glock_dq_uninit(&sdp->sd_journal_gh);
++
++ fail_gunlock_ji:
++ if (jindex)
++ gfs_glock_dq_uninit(&ji_gh);
++
++ fail_trans_gl:
++ gfs_glock_put(sdp->sd_trans_gl);
++
++ fail_ji_free:
++ gfs_inode_put(sdp->sd_jiinode);
++ gfs_clear_journals(sdp);
++
++ fail_gunlock_sb:
++ if (super)
++ gfs_glock_dq_uninit(&sb_gh);
++
++ fail_gunlock_live:
++ gfs_glock_dq_uninit(&sdp->sd_live_gh);
++
++ fail_gunlock_mount:
++ gfs_glock_dq_uninit(&mount_gh);
++
++ fail_glockd:
++ clear_bit(SDF_GLOCKD_RUN, &sdp->sd_flags);
++ wake_up(&sdp->sd_reclaim_wchan);
++ while (sdp->sd_glockd_num--)
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_SCAND_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_scand_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ fail_lockproto:
++ gfs_gl_hash_clear(sdp, TRUE);
++ gfs_unmount_lockproto(sdp);
++ gfs_clear_dirty_j(sdp);
++ while (invalidate_inodes(sb))
++ yield();
++
++ fail_vfree:
++ vfree(sdp);
++
++ fail:
++ vfs2sdp(sb) = NULL;
++ return error;
++}
++
++/**
++ * gfs_get_sb - Get a superblock for the "gfs" filesystem type
++ * @fs_type: describes the filesystem ("gfs")
++ * @flags: mount flags
++ * @dev_name: path to the block device to mount
++ * @data: mount options string
++ *
++ * Returns: the new superblock
++ */
++
++struct super_block *gfs_get_sb(struct file_system_type *fs_type, int flags,
++ const char *dev_name, void *data)
++{
++ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super);
++}
++
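++/*
++ * Mount-time flow, for reference: mount(2) with type "gfs" leads the VFS
++ * to call gfs_get_sb(); get_sb_bdev() opens the block device and calls
++ * fill_super() above on a fresh super_block. From userspace that is
++ * simply (options shown are illustrative):
++ *
++ *	mount -t gfs -o lockproto=lock_dlm /dev/sda1 /mnt/gfs
++ */
++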
++/**
++ * gfs_kill_sb - Tear down a superblock at unmount time
++ * @sb: the superblock being killed
++ *
++ */
++
++void gfs_kill_sb(struct super_block *sb)
++{
++ kill_block_super(sb);
++}
++
++struct file_system_type gfs_fs_type = {
++ .name = "gfs",
++ .fs_flags = FS_REQUIRES_DEV /*| FS_REVAL_DOT*/,
++ .get_sb = gfs_get_sb,
++ .kill_sb = gfs_kill_sb,
++ .owner = THIS_MODULE,
++};
+diff -urN linux-orig/fs/gfs/ops_fstype.h linux-patched/fs/gfs/ops_fstype.h
+--- linux-orig/fs/gfs/ops_fstype.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_fstype.h 2004-06-20 22:48:17.952945559 -0500
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_FSTYPE_DOT_H__
++#define __OPS_FSTYPE_DOT_H__
++
++extern struct file_system_type gfs_fs_type;
++
++#endif /* __OPS_FSTYPE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_inode.c linux-patched/fs/gfs/ops_inode.c
+--- linux-orig/fs/gfs/ops_inode.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_inode.c 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,1723 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/namei.h>
++#include <linux/utsname.h>
++#include <asm/uaccess.h>
++#include <linux/xattr.h>
++#include <linux/mm.h>
++#include <linux/posix_acl.h>
++
++#include "gfs.h"
++#include "acl.h"
++#include "bmap.h"
++#include "dio.h"
++#include "dir.h"
++#include "eattr.h"
++#include "glock.h"
++#include "inode.h"
++#include "ops_dentry.h"
++#include "ops_inode.h"
++#include "page.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++#include "unlinked.h"
++
++/**
++ * gfs_create - Create a file
++ * @dir: The directory in which to create the file
++ * @dentry: The dentry of the new file
++ * @mode: The mode of the new file
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_create(struct inode *dir, struct dentry *dentry,
++ int mode, struct nameidata *nd)
++{
++ struct gfs_inode *dip = vn2ip(dir), *ip;
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_holder d_gh, i_gh;
++ struct inode *inode;
++ int new = TRUE;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++
++ for (;;) {
++ error = gfs_createi(&d_gh, &dentry->d_name,
++ GFS_FILE_REG, mode,
++ &i_gh);
++ if (!error)
++ break;
++ else if (error != -EEXIST) {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++
++ error = gfs_lookupi(&d_gh, &dentry->d_name,
++ FALSE, &i_gh);
++ if (!error) {
++ if (i_gh.gh_gl) {
++ new = FALSE;
++ break;
++ }
++ } else {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++ }
++
++ GFS_ASSERT_SBD(i_gh.gh_gl, sdp,);
++ ip = gl2ip(i_gh.gh_gl);
++
++ if (new) {
++ gfs_trans_end(sdp);
++ if (dip->i_alloc->al_rgd)
++ gfs_inplace_release(dip);
++ gfs_quota_unlock_m(dip);
++ gfs_unlinked_unlock(sdp, dip->i_alloc->al_ul);
++ gfs_alloc_put(dip);
++
++ ip->i_creat_task = current;
++ ip->i_creat_pid = current->pid;
++ }
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return -ENOMEM;
++
++ d_instantiate(dentry, inode);
++ if (new)
++ mark_inode_dirty(inode);
++
++ return 0;
++}
++
++/**
++ * lookup_cdpn_sub_at - Maybe look up a Context Dependent Pathname
++ * @sdp: the filesystem
++ * @dentry: the original dentry to lookup
++ * @new_dentry: the new dentry, if this was a substitutable path.
++ *
++ */
++
++static void
++lookup_cdpn_sub_at(struct gfs_sbd *sdp, struct dentry *dentry,
++ struct dentry **new_dentry)
++{
++ struct dentry *parent = dget_parent(dentry);
++ char *buf = gmalloc(2 * __NEW_UTS_LEN + 2);
++
++ if (gfs_filecmp(&dentry->d_name, "@hostname", 9))
++ *new_dentry = lookup_one_len(system_utsname.nodename,
++ parent,
++ strlen(system_utsname.nodename));
++ else if (gfs_filecmp(&dentry->d_name, "@mach", 5))
++ *new_dentry = lookup_one_len(system_utsname.machine,
++ parent,
++ strlen(system_utsname.machine));
++ else if (gfs_filecmp(&dentry->d_name, "@os", 3))
++ *new_dentry = lookup_one_len(system_utsname.sysname,
++ parent,
++ strlen(system_utsname.sysname));
++ else if (gfs_filecmp(&dentry->d_name, "@uid", 4))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u", current->fsuid));
++ else if (gfs_filecmp(&dentry->d_name, "@gid", 4))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u", current->fsgid));
++ else if (gfs_filecmp(&dentry->d_name, "@sys", 4))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%s_%s",
++ system_utsname.machine,
++ system_utsname.sysname));
++ else if (gfs_filecmp(&dentry->d_name, "@jid", 4))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u",
++ sdp->sd_lockstruct.ls_jid));
++
++ kfree(buf);
++ dput(parent);
++}
++
++/**
++ * lookup_cdpn_sub_brace - Maybe look up a Context Dependent Pathname
++ * @sdp: the filesystem
++ * @dentry: the original dentry to lookup
++ * @new_dentry: the new dentry, if this was a substitutable path.
++ *
++ */
++
++static void
++lookup_cdpn_sub_brace(struct gfs_sbd *sdp, struct dentry *dentry,
++ struct dentry **new_dentry)
++{
++ struct dentry *parent = dget_parent(dentry);
++ char *buf = gmalloc(2 * __NEW_UTS_LEN + 2);
++
++ if (gfs_filecmp(&dentry->d_name, "{hostname}", 10))
++ *new_dentry = lookup_one_len(system_utsname.nodename,
++ parent,
++ strlen(system_utsname.nodename));
++ else if (gfs_filecmp(&dentry->d_name, "{mach}", 6))
++ *new_dentry = lookup_one_len(system_utsname.machine,
++ parent,
++ strlen(system_utsname.machine));
++ else if (gfs_filecmp(&dentry->d_name, "{os}", 4))
++ *new_dentry = lookup_one_len(system_utsname.sysname,
++ parent,
++ strlen(system_utsname.sysname));
++ else if (gfs_filecmp(&dentry->d_name, "{uid}", 5))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u", current->fsuid));
++ else if (gfs_filecmp(&dentry->d_name, "{gid}", 5))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u", current->fsgid));
++ else if (gfs_filecmp(&dentry->d_name, "{sys}", 5))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%s_%s",
++ system_utsname.machine,
++ system_utsname.sysname));
++ else if (gfs_filecmp(&dentry->d_name, "{jid}", 5))
++ *new_dentry = lookup_one_len(buf,
++ parent,
++ sprintf(buf, "%u",
++ sdp->sd_lockstruct.ls_jid));
++
++ kfree(buf);
++ dput(parent);
++}
++
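++/*
++ * Example of CDPN substitution (illustrative): an administrator creates
++ *
++ *	ln -s @hostname /gfs/log
++ *
++ * along with per-node directories /gfs/node1, /gfs/node2, ... When the
++ * symlink is followed, the lookup of the "@hostname" component lands in
++ * gfs_lookup() below, which substitutes the node's own utsname nodename,
++ * so every node transparently sees a private directory through the one
++ * shared path. The "{...}" forms work the same way.
++ */
++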
++/**
++ * gfs_lookup - Look up a filename in a directory and return its inode
++ * @dir: The directory inode
++ * @dentry: The dentry of the new inode
++ *
++ * Called by the VFS layer. Lock dir and call gfs_lookupi()
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static struct dentry *
++gfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
++{
++ struct gfs_inode *dip = vn2ip(dir), *ip;
++ struct gfs_holder d_gh, i_gh;
++ struct inode *inode = NULL;
++ int error;
++
++ atomic_inc(&dip->i_sbd->sd_ops_inode);
++
++ /* Do Context Dependent Path Name expansion */
++
++ if (*dentry->d_name.name == '@' && dentry->d_name.len > 1) {
++ struct dentry *new_dentry = NULL;
++ lookup_cdpn_sub_at(dip->i_sbd, dentry, &new_dentry);
++ if (new_dentry)
++ return new_dentry;
++ } else if (*dentry->d_name.name == '{' && dentry->d_name.len > 2) {
++ struct dentry *new_dentry = NULL;
++ lookup_cdpn_sub_brace(dip->i_sbd, dentry, &new_dentry);
++ if (new_dentry)
++ return new_dentry;
++ }
++
++ dentry->d_op = &gfs_dops;
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++
++ error = gfs_lookupi(&d_gh, &dentry->d_name, FALSE, &i_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ return ERR_PTR(error);
++ }
++
++ if (i_gh.gh_gl) {
++ ip = gl2ip(i_gh.gh_gl);
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return ERR_PTR(-ENOMEM);
++ } else
++ gfs_holder_uninit(&d_gh);
++
++ if (inode)
++ return d_splice_alias(inode, dentry);
++ d_add(dentry, inode);
++ return NULL;
++}
++
++/**
++ * gfs_link - Link to a file
++ * @old_dentry: The inode to link
++ * @dir: Add link to this directory
++ * @dentry: The name of the link
++ *
++ * Link the inode in "old_dentry" into the directory "dir" with the
++ * name in "dentry".
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
++{
++ struct gfs_inode *dip = vn2ip(dir);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct inode *inode = old_dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_alloc *al = NULL;
++ struct gfs_holder ghs[2];
++ int alloc_required;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ if (ip->i_di.di_type == GFS_FILE_DIR)
++ return -EPERM;
++
++ gfs_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[0]);
++ gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[1]);
++
++ error = gfs_glock_nq_m(2, ghs);
++ if (error)
++ goto fail;
++
++ error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
++ if (error)
++ goto fail_gunlock;
++
++ error = gfs_dir_search(dip, &dentry->d_name, NULL, NULL);
++ switch (error) {
++ case -ENOENT:
++ break;
++ case 0:
++ error = -EEXIST;
++ default:
++ goto fail_gunlock;
++ }
++
++ if (!dip->i_di.di_nlink) {
++ error = -EINVAL;
++ goto fail_gunlock;
++ }
++ if (dip->i_di.di_entries == (uint32_t)-1) {
++ error = -EFBIG;
++ goto fail_gunlock;
++ }
++ if (!ip->i_di.di_nlink) {
++ error = -EINVAL;
++ goto fail_gunlock;
++ }
++ if (ip->i_di.di_nlink == (uint32_t)-1) {
++ error = -EMLINK;
++ goto fail_gunlock;
++ }
++
++ error = gfs_diradd_alloc_required(dip, &dentry->d_name, &alloc_required);
++ if (error)
++ goto fail_gunlock;
++
++ if (alloc_required) {
++ al = gfs_alloc_get(dip);
++
++ error = gfs_quota_lock_m(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail_alloc;
++
++ error = gfs_quota_check(dip, dip->i_di.di_uid, dip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ al->al_requested_meta = sdp->sd_max_dirres;
++
++ error = gfs_inplace_reserve(dip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ two dinode blocks, directory modifications to add an entry,
++ RG bitmap blocks to allocate from, and quota change */
++
++ error = gfs_trans_begin(sdp,
++ 2 + sdp->sd_max_dirres +
++ al->al_rgd->rd_ri.ri_length,
++ 1);
++ if (error)
++ goto fail_ipres;
++ } else {
++ /* Trans may require:
++ Two dinode blocks and a leaf block. */
++
++ error = gfs_trans_begin(sdp, 3, 0);
++ if (error)
++ goto fail_ipres;
++ }
++
++ error = gfs_dir_add(dip, &dentry->d_name, &ip->i_num, ip->i_di.di_type);
++ if (error)
++ goto fail_end_trans;
++
++ error = gfs_change_nlink(ip, +1);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_end(sdp);
++
++ if (alloc_required) {
++ GFS_ASSERT_INODE(al->al_alloced_meta, dip,);
++ gfs_inplace_release(dip);
++ gfs_quota_unlock_m(dip);
++ gfs_alloc_put(dip);
++ }
++
++ gfs_glock_dq_m(2, ghs);
++
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ atomic_inc(&inode->i_count);
++
++ d_instantiate(dentry, inode);
++ mark_inode_dirty(inode);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ if (alloc_required)
++ gfs_inplace_release(dip);
++
++ fail_gunlock_q:
++ if (alloc_required)
++ gfs_quota_unlock_m(dip);
++
++ fail_alloc:
++ if (alloc_required)
++ gfs_alloc_put(dip);
++
++ fail_gunlock:
++ gfs_glock_dq_m(2, ghs);
++
++ fail:
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ return error;
++}
++
++/**
++ * gfs_unlink - Unlink a file
++ * @dir: The inode of the directory containing the file to unlink
++ * @dentry: The file itself
++ *
++ * Unlink a file. Call gfs_unlinki()
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_unlink(struct inode *dir, struct dentry *dentry)
++{
++ struct gfs_inode *dip = vn2ip(dir);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_inode *ip = vn2ip(dentry->d_inode);
++ struct gfs_holder ghs[2];
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ gfs_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[0]);
++ gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[1]);
++
++ error = gfs_glock_nq_m(2, ghs);
++ if (error)
++ goto fail;
++
++ error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
++ if (error)
++ goto fail_gunlock;
++
++ if ((dip->i_di.di_mode & S_ISVTX) &&
++ dip->i_di.di_uid != current->fsuid &&
++ ip->i_di.di_uid != current->fsuid &&
++ !capable(CAP_FOWNER)) {
++ error = -EPERM;
++ goto fail_gunlock;
++ }
++
++ error = gfs_revalidate(dip, &dentry->d_name, ip);
++ if (error)
++ goto fail_gunlock;
++
++ /* Trans may require:
++ Two dinode blocks and one modified directory leaf block
++ and one unlinked tag. */
++
++ error = gfs_trans_begin(sdp, 3, 1);
++ if (error)
++ goto fail_gunlock;
++
++ error = gfs_unlinki(dip, &dentry->d_name, ip);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_m(2, ghs);
++
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_gunlock:
++ gfs_glock_dq_m(2, ghs);
++
++ fail:
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ return error;
++}
++
++/**
++ * gfs_symlink - Create a symlink
++ * @dir: The directory to create the symlink in
++ * @dentry: The dentry to put the symlink in
++ * @symname: The thing which the link points to
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
++{
++ struct gfs_inode *dip = vn2ip(dir), *ip;
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_holder d_gh, i_gh;
++ struct inode *inode;
++ struct buffer_head *dibh;
++ int size;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ /* Must be stuffed with a null terminator for gfs_follow_link() */
++ size = strlen(symname);
++ if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode) - 1)
++ return -ENAMETOOLONG;
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++
++ error = gfs_createi(&d_gh, &dentry->d_name,
++ GFS_FILE_LNK, 0777,
++ &i_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++
++ GFS_ASSERT_SBD(i_gh.gh_gl, sdp,);
++ ip = gl2ip(i_gh.gh_gl);
++
++ ip->i_di.di_size = size;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ GFS_ASSERT_INODE(!error, ip,);
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ memcpy(dibh->b_data + sizeof(struct gfs_dinode), symname, size);
++
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++ if (dip->i_alloc->al_rgd)
++ gfs_inplace_release(dip);
++ gfs_quota_unlock_m(dip);
++ gfs_unlinked_unlock(sdp, dip->i_alloc->al_ul);
++ gfs_alloc_put(dip);
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return -ENOMEM;
++
++ d_instantiate(dentry, inode);
++ mark_inode_dirty(inode);
++
++ return 0;
++}
++
++/**
++ * gfs_mkdir - Make a directory
++ * @dir: The parent directory of the new one
++ * @dentry: The dentry of the new directory
++ * @mode: The mode of the new directory
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
++{
++ struct gfs_inode *dip = vn2ip(dir), *ip;
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_holder d_gh, i_gh;
++ struct inode *inode;
++ struct buffer_head *dibh;
++ struct gfs_dinode *di;
++ struct gfs_dirent *dent;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++
++ error = gfs_createi(&d_gh, &dentry->d_name,
++ GFS_FILE_DIR, mode,
++ &i_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++
++ GFS_ASSERT_SBD(i_gh.gh_gl, sdp,);
++ ip = gl2ip(i_gh.gh_gl);
++
++ ip->i_di.di_nlink = 2;
++ ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode);
++ ip->i_di.di_flags |= GFS_DIF_JDATA;
++ ip->i_di.di_payload_format = GFS_FORMAT_DE;
++ ip->i_di.di_entries = 2;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ GFS_ASSERT_INODE(!error, ip,);
++
++ di = (struct gfs_dinode *)dibh->b_data;
++
++ error = gfs_dirent_alloc(ip, dibh, 1, &dent);
++ GFS_ASSERT_INODE(!error, ip,); /* This should never fail */
++
++ dent->de_inum = di->di_num; /* already GFS endian */
++ dent->de_hash = gfs_dir_hash(".", 1);
++ dent->de_hash = cpu_to_gfs32(dent->de_hash);
++ dent->de_type = cpu_to_gfs16(GFS_FILE_DIR);
++ memcpy((char *) (dent + 1), ".", 1);
++ di->di_entries = cpu_to_gfs32(1);
++
++ error = gfs_dirent_alloc(ip, dibh, 2, &dent);
++ GFS_ASSERT_INODE(!error, ip,); /* This should never fail */
++
++ gfs_inum_out(&dip->i_num, (char *) &dent->de_inum);
++ dent->de_hash = gfs_dir_hash("..", 2);
++ dent->de_hash = cpu_to_gfs32(dent->de_hash);
++ dent->de_type = cpu_to_gfs16(GFS_FILE_DIR);
++ memcpy((char *) (dent + 1), "..", 2);
++
++ gfs_dinode_out(&ip->i_di, (char *)di);
++
++ brelse(dibh);
++
++ error = gfs_change_nlink(dip, +1);
++ GFS_ASSERT_INODE(!error, dip,); /* dip already pinned */
++
++ gfs_trans_end(sdp);
++ if (dip->i_alloc->al_rgd)
++ gfs_inplace_release(dip);
++ gfs_quota_unlock_m(dip);
++ gfs_unlinked_unlock(sdp, dip->i_alloc->al_ul);
++ gfs_alloc_put(dip);
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return -ENOMEM;
++
++ d_instantiate(dentry, inode);
++ mark_inode_dirty(inode);
++
++ return 0;
++}
++
++/**
++ * gfs_rmdir - Remove a directory
++ * @dir: The parent directory of the directory to be removed
++ * @dentry: The dentry of the directory to remove
++ *
++ * Remove a directory. Call gfs_rmdiri()
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_rmdir(struct inode *dir, struct dentry *dentry)
++{
++ struct gfs_inode *dip = vn2ip(dir);
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_inode *ip = vn2ip(dentry->d_inode);
++ struct gfs_holder ghs[2];
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ gfs_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[0]);
++ gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[1]);
++
++ error = gfs_glock_nq_m(2, ghs);
++ if (error)
++ goto fail;
++
++ error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
++ if (error)
++ goto fail_gunlock;
++
++ if ((dip->i_di.di_mode & S_ISVTX) &&
++ dip->i_di.di_uid != current->fsuid &&
++ ip->i_di.di_uid != current->fsuid &&
++ !capable(CAP_FOWNER)) {
++ error = -EPERM;
++ goto fail_gunlock;
++ }
++
++ error = gfs_revalidate(dip, &dentry->d_name, ip);
++ if (error)
++ goto fail_gunlock;
++
++ GFS_ASSERT_INODE(ip->i_di.di_entries >= 2, ip,
++ gfs_dinode_print(&ip->i_di););
++
++ if (ip->i_di.di_entries > 2) {
++ error = -ENOTEMPTY;
++ goto fail_gunlock;
++ }
++
++ /* Trans may require:
++ Two dinode blocks, one directory leaf block containing the
++ entry to be rmdired, two leaf blocks containing . and .. of
++ the directory being rmdired, and one unlinked tag */
++
++ error = gfs_trans_begin(sdp, 5, 1);
++ if (error)
++ goto fail_gunlock;
++
++ error = gfs_rmdiri(dip, &dentry->d_name, ip);
++ if (error)
++ goto fail_end_trans;
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_m(2, ghs);
++
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_gunlock:
++ gfs_glock_dq_m(2, ghs);
++
++ fail:
++ gfs_holder_uninit(&ghs[0]);
++ gfs_holder_uninit(&ghs[1]);
++
++ return error;
++}
++
++/**
++ * gfs_mknod - Make a special file
++ * @dir: The directory in which the special file will reside
++ * @dentry: The dentry of the special file
++ * @mode: The mode of the special file
++ * @dev: The device number of the special file
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
++{
++ struct gfs_inode *dip = vn2ip(dir), *ip;
++ struct gfs_sbd *sdp = dip->i_sbd;
++ struct gfs_holder d_gh, i_gh;
++ struct inode *inode;
++ struct buffer_head *dibh;
++ uint16_t type = 0;
++ uint32_t major = 0, minor = 0;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ switch (mode & S_IFMT) {
++ case S_IFBLK:
++ type = GFS_FILE_BLK;
++ major = MAJOR(dev);
++ minor = MINOR(dev);
++ break;
++ case S_IFCHR:
++ type = GFS_FILE_CHR;
++ major = MAJOR(dev);
++ minor = MINOR(dev);
++ break;
++ case S_IFIFO:
++ type = GFS_FILE_FIFO;
++ break;
++ case S_IFSOCK:
++ type = GFS_FILE_SOCK;
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,
++ printk("mode = %d\n", mode););
++ break;
++	}
++
++ gfs_holder_init(dip->i_gl, 0, 0, &d_gh);
++
++ error = gfs_createi(&d_gh, &dentry->d_name,
++ type, mode,
++ &i_gh);
++ if (error) {
++ gfs_holder_uninit(&d_gh);
++ return error;
++ }
++
++ GFS_ASSERT_SBD(i_gh.gh_gl, sdp,);
++ ip = gl2ip(i_gh.gh_gl);
++
++ ip->i_di.di_major = major;
++ ip->i_di.di_minor = minor;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ GFS_ASSERT_INODE(!error, ip,);
++
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++ if (dip->i_alloc->al_rgd)
++ gfs_inplace_release(dip);
++ gfs_quota_unlock_m(dip);
++ gfs_unlinked_unlock(sdp, dip->i_alloc->al_ul);
++ gfs_alloc_put(dip);
++
++ gfs_glock_dq_uninit(&d_gh);
++ gfs_glock_dq_uninit(&i_gh);
++
++ inode = gfs_iget(ip, CREATE);
++ gfs_inode_put(ip);
++
++ if (!inode)
++ return -ENOMEM;
++
++ d_instantiate(dentry, inode);
++ mark_inode_dirty(inode);
++
++ return 0;
++}
++
++/**
++ * gfs_rename - Rename a file
++ * @odir: Parent directory of old file name
++ * @odentry: The old dentry of the file
++ * @ndir: Parent directory of new file name
++ * @ndentry: The new dentry of the file
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_rename(struct inode *odir, struct dentry *odentry,
++ struct inode *ndir, struct dentry *ndentry)
++{
++ struct gfs_inode *odip = vn2ip(odir);
++ struct gfs_inode *ndip = vn2ip(ndir);
++ struct gfs_inode *ip = vn2ip(odentry->d_inode);
++ struct gfs_inode *nip = NULL;
++ struct gfs_sbd *sdp = odip->i_sbd;
++ struct qstr name;
++ struct gfs_alloc *al;
++ struct gfs_holder ghs[4], r_gh;
++ unsigned int num_gh;
++ int dir_rename = FALSE;
++ int alloc_required;
++ unsigned int x;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ gfs_unlinked_limit(sdp);
++
++ if (ndentry->d_inode) {
++ nip = vn2ip(ndentry->d_inode);
++ if (ip == nip)
++ return 0;
++ }
++
++	/* Make sure we aren't trying to move a directory into one of its own subdirectories */
++
++ if (ip->i_di.di_type == GFS_FILE_DIR && odip != ndip) {
++ dir_rename = TRUE;
++
++ error = gfs_glock_nq_init(sdp->sd_rename_gl,
++ LM_ST_EXCLUSIVE, 0,
++ &r_gh);
++ if (error)
++ return error;
++
++ error = gfs_ok_to_move(ip, ndip);
++ if (error)
++ goto fail;
++ }
++
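++	/* sd_rename_gl is a single filesystem-wide glock, so holding it
++	   exclusively serializes every cross-directory directory rename in
++	   the cluster; gfs_ok_to_move()'s walk up the destination's
++	   ancestors therefore can't race with another node moving a
++	   directory underneath it. */
++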
++ gfs_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[0]);
++ gfs_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[1]);
++ num_gh = 2;
++
++ if (nip)
++ gfs_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[num_gh++]);
++
++ if (dir_rename)
++ gfs_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[num_gh++]);
++
++ error = gfs_glock_nq_m(num_gh, ghs);
++ if (error)
++ goto fail_uninit;
++
++ /* Check out the old directory */
++
++ error = permission(odir, MAY_WRITE | MAY_EXEC, NULL);
++ if (error)
++ goto fail_gunlock;
++
++ if ((odip->i_di.di_mode & S_ISVTX) &&
++ odip->i_di.di_uid != current->fsuid &&
++ ip->i_di.di_uid != current->fsuid &&
++ !capable(CAP_FOWNER)) {
++ error = -EPERM;
++ goto fail_gunlock;
++ }
++
++ error = gfs_revalidate(odip, &odentry->d_name, ip);
++ if (error)
++ goto fail_gunlock;
++
++ /* Check out the new directory */
++
++ error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
++ if (error)
++ goto fail_gunlock;
++
++ if (nip) {
++ if ((ndip->i_di.di_mode & S_ISVTX) &&
++ ndip->i_di.di_uid != current->fsuid &&
++ nip->i_di.di_uid != current->fsuid &&
++ !capable(CAP_FOWNER)) {
++ error = -EPERM;
++ goto fail_gunlock;
++ }
++
++ error = gfs_revalidate(ndip, &ndentry->d_name, nip);
++ if (error)
++ goto fail_gunlock;
++
++ if (nip->i_di.di_type == GFS_FILE_DIR) {
++ GFS_ASSERT_INODE(nip->i_di.di_entries >= 2, ip,
++ gfs_dinode_print(&nip->i_di););
++ if (nip->i_di.di_entries > 2) {
++ error = -ENOTEMPTY;
++ goto fail_gunlock;
++ }
++ }
++ } else {
++ error = gfs_dir_search(ndip, &ndentry->d_name, NULL, NULL);
++ switch (error) {
++ case -ENOENT:
++ error = 0;
++ break;
++ case 0:
++ error = -EEXIST;
++ default:
++ goto fail_gunlock;
++		}
++
++ if (odip != ndip) {
++ if (!ndip->i_di.di_nlink) {
++ error = -EINVAL;
++ goto fail_gunlock;
++ }
++ if (ndip->i_di.di_entries == (uint32_t)-1) {
++ error = -EFBIG;
++ goto fail_gunlock;
++ }
++ if (ip->i_di.di_type == GFS_FILE_DIR &&
++ ndip->i_di.di_nlink == (uint32_t)-1) {
++ error = -EMLINK;
++ goto fail_gunlock;
++ }
++ }
++ }
++
++ error = gfs_diradd_alloc_required(ndip, &ndentry->d_name, &alloc_required);
++ if (error)
++ goto fail_gunlock;
++
++ if (alloc_required) {
++ al = gfs_alloc_get(ndip);
++
++ error = gfs_quota_lock_m(ndip,
++ NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail_alloc;
++
++ error = gfs_quota_check(ndip, ndip->i_di.di_uid, ndip->i_di.di_gid);
++ if (error)
++ goto fail_gunlock_q;
++
++ al->al_requested_meta = sdp->sd_max_dirres;
++
++ error = gfs_inplace_reserve(ndip);
++ if (error)
++ goto fail_gunlock_q;
++
++ /* Trans may require:
++ Dinodes for the srcdir, srcino, dstdir, dstino. Blocks for
++ adding the entry to dstdir. RG bitmaps for that allocation.
++ One leaf block in the srcdir for removal of the entry.
++ One leaf block for changing .. in srcino (if it's a directory).
++ Two leaf blocks for removing . and .. from dstino (if it exists
++ and it's a directory), one unlinked tag, and one quota block. */
++
++ error = gfs_trans_begin(sdp,
++ 8 + sdp->sd_max_dirres +
++ al->al_rgd->rd_ri.ri_length,
++ 2);
++ if (error)
++ goto fail_ipres;
++ } else {
++ /* Trans may require:
++ Dinodes for the srcdir, srcino, dstdir, dstino. One block for
++ adding the entry to dstdir.
++ One leaf block in the srcdir for removal of the entry.
++ One leaf block for changing .. in srcino (if it's a directory).
++ Two leaf blocks for removing . and .. from dstino (if it exists
++ and it's a directory), and one unlinked tag. */
++
++ error = gfs_trans_begin(sdp, 9, 1);
++ if (error)
++ goto fail_ipres;
++ }
++
++ /* Remove the target file, if it exists */
++
++ if (nip) {
++ if (nip->i_di.di_type == GFS_FILE_DIR)
++ error = gfs_rmdiri(ndip, &ndentry->d_name, nip);
++ else
++ error = gfs_unlinki(ndip, &ndentry->d_name, nip);
++
++ if (error)
++ goto fail_end_trans;
++ }
++
++ if (dir_rename) {
++ error = gfs_change_nlink(ndip, +1);
++ if (error)
++ goto fail_end_trans;
++ error = gfs_change_nlink(odip, -1);
++ if (error)
++ goto fail_end_trans;
++
++ name.len = 2;
++ name.name = "..";
++
++ error = gfs_dir_mvino(ip, &name, &ndip->i_num, GFS_FILE_DIR);
++ if (error)
++ goto fail_end_trans;
++ }
++
++ error = gfs_dir_del(odip, &odentry->d_name);
++ if (error)
++ goto fail_end_trans;
++
++ error = gfs_dir_add(ndip, &ndentry->d_name, &ip->i_num, ip->i_di.di_type);
++ if (error)
++ goto fail_end_trans;
++
++ if (dir_rename)
++ gfs_trans_add_gl(sdp->sd_rename_gl);
++
++ gfs_trans_end(sdp);
++
++ if (alloc_required) {
++ /* Don't check al->al_alloced_meta and friends. */
++ gfs_inplace_release(ndip);
++ gfs_quota_unlock_m(ndip);
++ gfs_alloc_put(ndip);
++ }
++
++ gfs_glock_dq_m(num_gh, ghs);
++
++ for (x = 0; x < num_gh; x++)
++ gfs_holder_uninit(&ghs[x]);
++
++ if (dir_rename)
++ gfs_glock_dq_uninit(&r_gh);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ if (alloc_required)
++ gfs_inplace_release(ndip);
++
++ fail_gunlock_q:
++ if (alloc_required)
++ gfs_quota_unlock_m(ndip);
++
++ fail_alloc:
++ if (alloc_required)
++ gfs_alloc_put(ndip);
++
++ fail_gunlock:
++ gfs_glock_dq_m(num_gh, ghs);
++
++ fail_uninit:
++ for (x = 0; x < num_gh; x++)
++ gfs_holder_uninit(&ghs[x]);
++
++ fail:
++ if (dir_rename)
++ gfs_glock_dq_uninit(&r_gh);
++
++ return error;
++}
++
++/**
++ * gfs_readlink - Read the value of a symlink
++ * @dentry: the symlink
++ * @user_buf: the user buffer to read the symlink data into
++ * @user_size: the size of the buffer
++ *
++ * Returns: the number of bytes copied to the user, -EXXX on failure
++ */
++
++static int
++gfs_readlink(struct dentry *dentry, char *user_buf, int user_size)
++{
++ struct gfs_inode *ip = vn2ip(dentry->d_inode);
++ char array[GFS_FAST_NAME_SIZE], *buf = array;
++ unsigned int len = GFS_FAST_NAME_SIZE;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_inode);
++
++ error = gfs_readlinki(ip, &buf, &len);
++ if (error)
++ return error;
++
++ GFS_ASSERT_INODE(len, ip,);
++
++ if (user_size > len - 1)
++ user_size = len - 1;
++
++ if (copy_to_user(user_buf, buf, user_size))
++ error = -EFAULT;
++ else
++ error = user_size;
++
++ if (buf != array)
++ kfree(buf);
++
++ return error;
++}
++
++/**
++ * gfs_follow_link - Follow a symbolic link
++ * @dentry: The dentry of the link
++ * @nd: Data that we pass to vfs_follow_link()
++ *
++ * This can handle symlinks of any size. It is optimised for symlinks
++ * under GFS_FAST_NAME_SIZE.
++ *
++ * Returns: 0 on success or error code
++ */
++
++static int
++gfs_follow_link(struct dentry *dentry, struct nameidata *nd)
++{
++ struct gfs_inode *ip = vn2ip(dentry->d_inode);
++ char array[GFS_FAST_NAME_SIZE], *buf = array;
++ unsigned int len = GFS_FAST_NAME_SIZE;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_inode);
++
++ error = gfs_readlinki(ip, &buf, &len);
++ if (!error) {
++ error = vfs_follow_link(nd, buf);
++ if (buf != array)
++ kfree(buf);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_permission -
++ * @inode:
++ * @mask:
++ * @nd:
++ *
++ * Returns: errno
++ */
++
++static int
++gfs_permission(struct inode *inode, int mask, struct nameidata *nd)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_holder i_gh;
++ struct posix_acl *acl;
++ umode_t mode = inode->i_mode;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_inode);
++
++ error = gfs_glock_nq_init(ip->i_gl,
++ LM_ST_SHARED, LM_FLAG_ANY,
++ &i_gh);
++ if (error)
++ return error;
++
++ if (mask & MAY_WRITE) {
++ if (IS_RDONLY(inode) &&
++ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
++ error = -EROFS;
++ goto out;
++ }
++ if (IS_IMMUTABLE(inode)) {
++ error = -EACCES;
++ goto out;
++ }
++ }
++
++ if (capable(CAP_DAC_OVERRIDE))
++ if (!(mask & MAY_EXEC) || (mode & S_IXUGO))
++ goto out;
++
++ if (capable(CAP_DAC_READ_SEARCH) &&
++ (mask == MAY_READ ||
++ (!(mask & MAY_WRITE) && S_ISDIR(mode))))
++ goto out;
++
++ if (inode->i_uid == current->fsuid) {
++ if ((mask & (mode >> 6)) != mask)
++ error = -EACCES;
++ goto out;
++ }
++
++ if ((mask & (mode >> 3)) == mask) {
++ error = gfs_getacl(inode, TRUE, &acl);
++ if (acl) {
++ error = posix_acl_permission(inode, acl, mask);
++ goto out;
++ } else if (error && error != -ENODATA)
++ goto out;
++ error = 0;
++ if (in_group_p(inode->i_gid)) {
++ error = 0;
++ goto out;
++ }
++ } else if (in_group_p(inode->i_gid)) {
++ error = -EACCES;
++ goto out;
++ }
++
++ if ((mask & mode) == mask)
++ goto out;
++
++ error = -EACCES;
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
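++/*
++ * Order of the checks above, for reference: write access to a read-only
++ * or immutable inode fails first; CAP_DAC_OVERRIDE and
++ * CAP_DAC_READ_SEARCH short-circuit to success; otherwise the owner is
++ * checked against the "user" mode bits, group access against the "group"
++ * bits (consulting a POSIX ACL if one exists), and everyone else against
++ * the "other" bits.
++ */
++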
++/**
++ * gfs_setattr - Change attributes on an inode
++ * @dentry: The dentry which is changing
++ * @attr: The structure describing the change
++ *
++ * The VFS layer wants to change one or more of an inode's attributes. Write
++ * that change out to disk.
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_setattr(struct dentry *dentry, struct iattr *attr)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_holder i_gh;
++ struct gfs_alloc *al;
++ struct buffer_head *dibh;
++ uint32_t ouid, ogid, nuid, ngid;
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (error)
++ return error;
++
++ error = inode_change_ok(inode, attr);
++ if (error)
++ goto fail;
++
++ if (attr->ia_valid & ATTR_SIZE) {
++ error = permission(inode, MAY_WRITE, NULL);
++ if (error)
++ goto fail;
++
++ if (attr->ia_size != ip->i_di.di_size) {
++ error = vmtruncate(inode, attr->ia_size);
++ if (error)
++ goto fail;
++ }
++
++ error = gfs_truncatei(ip, attr->ia_size, gfs_truncator_page);
++ if (error)
++ goto fail;
++
++ if ((sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) &&
++ !gfs_is_jdata(ip))
++ i_gh.gh_flags |= GL_SYNC;
++ }
++
++ else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) {
++ ouid = ip->i_di.di_uid;
++ ogid = ip->i_di.di_gid;
++ nuid = attr->ia_uid;
++ ngid = attr->ia_gid;
++
++ if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
++ ouid = nuid = NO_QUOTA_CHANGE;
++ if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
++ ogid = ngid = NO_QUOTA_CHANGE;
++
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_lock_m(ip, nuid, ngid);
++ if (error)
++ goto fail_alloc;
++
++ if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
++ error = gfs_quota_check(ip, nuid, ngid);
++ if (error)
++ goto fail_gunlock_q;
++ }
++
++ /* Trans may require:
++ one dinode block and one quota change block */
++
++ error = gfs_trans_begin(sdp, 1, 1);
++ if (error)
++ goto fail_gunlock_q;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (error)
++ goto fail_end_trans;
++
++ if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
++ gfs_trans_add_quota(sdp, -ip->i_di.di_blocks,
++ ouid, ogid);
++ gfs_trans_add_quota(sdp, ip->i_di.di_blocks,
++ nuid, ngid);
++ }
++
++ inode_setattr(inode, attr);
++ gfs_inode_attr_out(ip);
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++
++ gfs_trans_end(sdp);
++
++ gfs_quota_unlock_m(ip);
++ gfs_alloc_put(ip);
++ }
++
++ else {
++ /* Trans may require:
++ one dinode block plus changes for acl. */
++
++ error = gfs_trans_begin(sdp,
++ 1 + GFS_MAX_EA_ACL_BLKS, 0);
++ if (error)
++ goto fail;
++
++ error = gfs_get_inode_buffer(ip, &dibh);
++ if (!error) {
++ inode_setattr(inode, attr);
++ gfs_inode_attr_out(ip);
++
++ if (attr->ia_valid & ATTR_MODE)
++ error = gfs_acl_setattr(inode);
++
++ gfs_trans_add_bh(ip->i_gl, dibh);
++ gfs_dinode_out(&ip->i_di, dibh->b_data);
++ brelse(dibh);
++ }
++
++ gfs_trans_end(sdp);
++ }
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ mark_inode_dirty(inode);
++
++ return error;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_gunlock_q:
++ gfs_quota_unlock_m(ip);
++
++ fail_alloc:
++ gfs_alloc_put(ip);
++
++ fail:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_getattr - Read out an inode's attributes
++ * @mnt: the vfsmount the dentry was found on (unused)
++ * @dentry: The dentry to stat
++ * @stat: The inode's stats
++ *
++ * Returns: 0 on success, -EXXXX on failure
++ */
++
++static int
++gfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_holder gh;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_inode);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
++ if (!error)
++ {
++ generic_fillattr(inode, stat);
++ gfs_glock_dq_uninit(&gh);
++ }
++
++ return error;
++}
++
++/**
++ * get_eatype - get the type of an EA, and strip the type prefix from its name
++ * @name: EA name, possibly with a type prefix ("system." or "user.")
++ * @truncated_name: set to the unprefixed name (NULL if the prefix is unknown)
++ *
++ * Returns: GFS_EATYPE_XXX
++ */
++
++int
++get_eatype(const char *name, char **truncated_name)
++{
++ int type;
++
++ if (strncmp(name, "system.", 7) == 0) {
++ type = GFS_EATYPE_SYS;
++ *truncated_name = strchr(name, '.') + 1;
++ } else if (strncmp(name, "user.", 5) == 0) {
++ type = GFS_EATYPE_USR;
++ *truncated_name = strchr(name, '.') + 1;
++ } else {
++ type = GFS_EATYPE_UNUSED;
++ *truncated_name = NULL;
++ }
++
++ return type;
++}
++
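++/*
++ * For example: get_eatype("user.foo", &p) returns GFS_EATYPE_USR with p
++ * pointing at "foo", while an unprefixed name such as "foo" returns
++ * GFS_EATYPE_UNUSED and sets p to NULL; the callers below turn that into
++ * -EOPNOTSUPP.
++ */
++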
++/**
++ * gfs_setxattr - Set (or create or replace) an inode's extended attribute
++ * @dentry: inode's dentry
++ * @name: name of the extended attribute
++ * @data: the value of the extended attribute
++ * @size: the size of data
++ * @flags: used to specify create or replace actions
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_setxattr(struct dentry *dentry, const char *name,
++ const void *data, size_t size,
++ int flags)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_easet_io req;
++ char *truncated_name;
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ req.es_type = get_eatype(name, &truncated_name);
++
++ if (req.es_type == GFS_EATYPE_UNUSED)
++ error = -EOPNOTSUPP;
++ else {
++ req.es_data = data;
++ req.es_name = truncated_name;
++ req.es_data_len = size;
++ req.es_name_len = strlen(truncated_name);
++ if (flags & XATTR_CREATE)
++ req.es_cmd = GFS_EACMD_CREATE;
++ else if (flags & XATTR_REPLACE)
++ req.es_cmd = GFS_EACMD_REPLACE;
++ else
++ req.es_cmd = GFS_EACMD_SET;
++ error = gfs_set_eattr(sdp, ip, &req);
++ }
++
++ return error;
++}
++
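++/*
++ * The XATTR_* flags map onto the GFS EA commands as follows: XATTR_CREATE
++ * (fail if the attribute already exists) becomes GFS_EACMD_CREATE,
++ * XATTR_REPLACE (fail if it doesn't exist) becomes GFS_EACMD_REPLACE, and
++ * no flag at all becomes GFS_EACMD_SET. From userspace (illustrative):
++ *
++ *	setfattr -n user.color -v blue /gfs/file	-> GFS_EACMD_SET
++ */
++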
++/**
++ * gfs_getxattr - Get the value of an inode's extended attribute
++ * @dentry: inode's dentry
++ * @name: name of the extended attribute
++ * @data: buffer to copy the value into
++ * @size: the size of data
++ *
++ * Returns: the size of the value on success, -EXXX on error
++ */
++
++ssize_t
++gfs_getxattr(struct dentry *dentry, const char *name,
++ void *data, size_t size)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_eaget_io req;
++ char *truncated_name;
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ req.eg_type = get_eatype(name, &truncated_name);
++
++ if (req.eg_type == GFS_EATYPE_UNUSED)
++ error = -EOPNOTSUPP;
++ else {
++ req.eg_name = truncated_name;
++ req.eg_name_len = strlen(truncated_name);
++ req.eg_data = data;
++ req.eg_data_len = size;
++ req.eg_len = NULL;
++ error = gfs_get_eattr(sdp, ip, &req, gfs_ea_memcpy);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_listxattr - List the names of an inode's extended attributes
++ * @dentry: inode's dentry
++ * @buffer: buffer in which to return the list of names
++ * @size: the size of @buffer
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++ssize_t
++gfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_eaget_io req;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ req.eg_type = 0;
++ req.eg_name = NULL;
++ req.eg_name_len = 0;
++ req.eg_data = buffer;
++ req.eg_data_len = size;
++ req.eg_len = NULL;
++
++ return gfs_get_eattr(sdp, ip, &req, gfs_ea_memcpy);
++}
++
++/**
++ * gfs_removexattr - Remove an inode's extended attribute
++ * @dentry: inode's dentry
++ * @name: name of the extended attribute to remove
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_removexattr(struct dentry *dentry, const char *name)
++{
++ struct inode *inode = dentry->d_inode;
++ struct gfs_inode *ip = vn2ip(inode);
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_easet_io req;
++ char *truncated_name;
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_inode);
++
++ req.es_type = get_eatype(name, &truncated_name);
++
++ if (req.es_type == GFS_EATYPE_UNUSED)
++ error = -EOPNOTSUPP;
++ else {
++ req.es_name = truncated_name;
++ req.es_data = NULL;
++ req.es_data_len = 0;
++ req.es_name_len = strlen(truncated_name);
++ req.es_cmd = GFS_EACMD_REMOVE;
++ error = gfs_set_eattr(sdp, ip, &req);
++ }
++
++ return error;
++}
++
++struct inode_operations gfs_file_iops = {
++ .permission = gfs_permission,
++ .setattr = gfs_setattr,
++ .getattr = gfs_getattr,
++ .setxattr = gfs_setxattr,
++ .getxattr = gfs_getxattr,
++ .listxattr = gfs_listxattr,
++ .removexattr = gfs_removexattr,
++};
++
++struct inode_operations gfs_dev_iops = {
++ .permission = gfs_permission,
++ .setattr = gfs_setattr,
++ .getattr = gfs_getattr,
++ .setxattr = gfs_setxattr,
++ .getxattr = gfs_getxattr,
++ .listxattr = gfs_listxattr,
++ .removexattr = gfs_removexattr,
++};
++
++struct inode_operations gfs_dir_iops = {
++ .create = gfs_create,
++ .lookup = gfs_lookup,
++ .link = gfs_link,
++ .unlink = gfs_unlink,
++ .symlink = gfs_symlink,
++ .mkdir = gfs_mkdir,
++ .rmdir = gfs_rmdir,
++ .mknod = gfs_mknod,
++ .rename = gfs_rename,
++ .permission = gfs_permission,
++ .setattr = gfs_setattr,
++ .getattr = gfs_getattr,
++ .setxattr = gfs_setxattr,
++ .getxattr = gfs_getxattr,
++ .listxattr = gfs_listxattr,
++ .removexattr = gfs_removexattr,
++};
++
++struct inode_operations gfs_symlink_iops = {
++ .readlink = gfs_readlink,
++ .follow_link = gfs_follow_link,
++ .permission = gfs_permission,
++ .setattr = gfs_setattr,
++ .getattr = gfs_getattr,
++ .setxattr = gfs_setxattr,
++ .getxattr = gfs_getxattr,
++ .listxattr = gfs_listxattr,
++ .removexattr = gfs_removexattr,
++};
++
+diff -urN linux-orig/fs/gfs/ops_inode.h linux-patched/fs/gfs/ops_inode.h
+--- linux-orig/fs/gfs/ops_inode.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_inode.h 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_INODE_DOT_H__
++#define __OPS_INODE_DOT_H__
++
++extern struct inode_operations gfs_file_iops;
++extern struct inode_operations gfs_dir_iops;
++extern struct inode_operations gfs_symlink_iops;
++extern struct inode_operations gfs_dev_iops;
++
++#endif /* __OPS_INODE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_super.c linux-patched/fs/gfs/ops_super.c
+--- linux-orig/fs/gfs/ops_super.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_super.c 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,416 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/vmalloc.h>
++#include <linux/statfs.h>
++#include <linux/seq_file.h>
++#include <linux/mount.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "inode.h"
++#include "locking.h"
++#include "log.h"
++#include "ops_super.h"
++#include "page.h"
++#include "quota.h"
++#include "recovery.h"
++#include "rgrp.h"
++#include "super.h"
++
++/**
++ * gfs_write_inode - Make sure the inode is stable on the disk
++ * @inode: The inode
++ * @sync: synchronous write flag
++ *
++ */
++
++static void
++gfs_write_inode(struct inode *inode, int sync)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++
++ if (!ip)
++ return;
++
++ atomic_inc(&ip->i_sbd->sd_ops_super);
++
++ if (sync && !gfs_in_panic)
++ gfs_log_flush_glock(ip->i_gl);
++}
++
++/**
++ * gfs_put_inode - put an inode
++ * @inode: The inode
++ *
++ * If i_nlink is zero, any dirty data for the inode is thrown away.
++ * If a process on another machine has the file open, it may need that
++ * data. So, sync it out.
++ */
++
++static void
++gfs_put_inode(struct inode *inode)
++{
++ struct gfs_sbd *sdp = vfs2sdp(inode->i_sb);
++ struct gfs_inode *ip = vn2ip(inode);
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ if (ip &&
++ !inode->i_nlink &&
++ S_ISREG(inode->i_mode) &&
++ !sdp->sd_args.ar_localcaching)
++ gfs_sync_page_i(inode, DIO_START | DIO_WAIT);
++}
++
++/**
++ * gfs_put_super - Unmount the filesystem
++ * @sb: The VFS superblock
++ *
++ */
++
++static void
++gfs_put_super(struct super_block *sb)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++ int error;
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ /* Unfreeze the filesystem, if we need to */
++
++ down(&sdp->sd_freeze_lock);
++ if (sdp->sd_freeze_count)
++ gfs_glock_dq_uninit(&sdp->sd_freeze_gh);
++ up(&sdp->sd_freeze_lock);
++
++ /* Kill off the inode thread */
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_INODED_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_inoded_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Kill off the quota thread */
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_QUOTAD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_quotad_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Kill off the log thread */
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_LOGD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_logd_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Kill off the recoverd thread */
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_RECOVERD_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_recoverd_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Kill off the glockd threads */
++ clear_bit(SDF_GLOCKD_RUN, &sdp->sd_flags);
++ wake_up(&sdp->sd_reclaim_wchan);
++ while (sdp->sd_glockd_num--)
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ /* Kill off the scand thread */
++ down(&sdp->sd_thread_lock);
++ clear_bit(SDF_SCAND_RUN, &sdp->sd_flags);
++ wake_up_process(sdp->sd_scand_process);
++ up(&sdp->sd_thread_lock);
++ wait_for_completion(&sdp->sd_thread_completion);
++
++ if (!test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ gfs_log_flush(sdp);
++ gfs_quota_sync(sdp);
++ gfs_quota_sync(sdp);
++
++ error = gfs_make_fs_ro(sdp);
++ if (error)
++ gfs_io_error(sdp);
++ }
++
++ /* At this point, we're through modifying the disk */
++
++ /* Release stuff */
++
++ gfs_inode_put(sdp->sd_riinode);
++ gfs_inode_put(sdp->sd_jiinode);
++ gfs_inode_put(sdp->sd_rooti);
++ gfs_inode_put(sdp->sd_qinode);
++ gfs_inode_put(sdp->sd_linode);
++
++ gfs_glock_put(sdp->sd_trans_gl);
++ gfs_glock_put(sdp->sd_rename_gl);
++
++ gfs_glock_dq_uninit(&sdp->sd_journal_gh);
++
++ gfs_glock_dq_uninit(&sdp->sd_live_gh);
++
++ /* Get rid of rgrp bitmap structures */
++ gfs_clear_rgrpd(sdp);
++ gfs_clear_journals(sdp);
++
++ /* Take apart glock structures and buffer lists */
++ gfs_gl_hash_clear(sdp, TRUE);
++
++ /* Unmount the locking protocol */
++ gfs_unmount_lockproto(sdp);
++
++ /* At this point, we're through participating in the lockspace */
++
++ gfs_clear_dirty_j(sdp);
++
++ /* Get rid of any extra inodes */
++ while (invalidate_inodes(sb))
++ yield();
++
++ vfree(sdp);
++
++ vfs2sdp(sb) = NULL;
++}
++
++/**
++ * gfs_write_super - disk commit all incore transactions
++ * @sb: the filesystem
++ *
++ * This function is called every time sync(2) is called.
++ * After this exits, all dirty buffers are synced.
++ */
++
++static void
++gfs_write_super(struct super_block *sb)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ if (!gfs_in_panic)
++ gfs_log_flush(sdp);
++}
++
++/**
++ * gfs_write_super_lockfs - prevent further writes to the filesystem
++ * @sb: the VFS structure for the filesystem
++ *
++ */
++
++static void
++gfs_write_super_lockfs(struct super_block *sb)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++ int error;
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ for (;;) {
++ error = gfs_freeze_fs(sdp);
++ if (!error)
++ break;
++
++ switch (error) {
++ case -EBUSY:
++ printk("GFS: fsid=%s: waiting for recovery before freeze\n",
++ sdp->sd_fsname);
++ break;
++
++ default:
++ printk("GFS: fsid=%s: error freezing FS: %d\n",
++ sdp->sd_fsname, error);
++ break;
++ }
++
++ printk("GFS: fsid=%s: retrying...\n", sdp->sd_fsname);
++
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ schedule_timeout(HZ);
++ }
++}
++
++/**
++ * gfs_unlockfs - reallow writes to the filesystem
++ * @sb: the VFS structure for the filesystem
++ *
++ */
++
++static void
++gfs_unlockfs(struct super_block *sb)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ gfs_unfreeze_fs(sdp);
++}
++
++/**
++ * gfs_statfs - Gather and return stats about the filesystem
++ * @sb: The superblock
++ * @buf: The buffer to fill in
++ *
++ * Returns: 0 on success or error code
++ */
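++/*
++ * GFS has no static inode table; any free block can be allocated as a
++ * dinode. The f_files/f_ffree values below therefore count blocks that do
++ * or could hold dinodes rather than fixed inode slots: with 1000 used
++ * dinodes and 5000 free blocks of all kinds, f_files is 6000 and f_ffree
++ * is 5000.
++ */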
++
++static int
++gfs_statfs(struct super_block *sb, struct kstatfs *buf)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++ struct gfs_usage usage;
++ int error;
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ error = gfs_stat_gfs(sdp, &usage, TRUE);
++ if (error)
++ return error;
++
++ memset(buf, 0, sizeof(struct kstatfs));
++
++ buf->f_type = GFS_MAGIC;
++ buf->f_bsize = usage.gu_block_size;
++ buf->f_blocks = usage.gu_total_blocks;
++ buf->f_bfree = usage.gu_free + usage.gu_free_dinode + usage.gu_free_meta;
++ buf->f_bavail = usage.gu_free + usage.gu_free_dinode + usage.gu_free_meta;
++ buf->f_files = usage.gu_used_dinode + usage.gu_free_dinode + usage.gu_free_meta + usage.gu_free;
++ buf->f_ffree = usage.gu_free_dinode + usage.gu_free_meta + usage.gu_free;
++ buf->f_namelen = GFS_FNAMESIZE;
++
++ return 0;
++}
++
++/**
++ * gfs_remount_fs - called when the FS is remounted
++ * @sb: the filesystem
++ * @flags: the remount flags
++ * @data: extra data passed in (not used right now)
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++gfs_remount_fs(struct super_block *sb, int *flags, char *data)
++{
++ struct gfs_sbd *sdp = vfs2sdp(sb);
++ int error = 0;
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ if (*flags & (MS_NOATIME | MS_NODIRATIME))
++ set_bit(SDF_NOATIME, &sdp->sd_flags);
++ else
++ clear_bit(SDF_NOATIME, &sdp->sd_flags);
++
++ if (*flags & MS_RDONLY) {
++ if (!test_bit(SDF_ROFS, &sdp->sd_flags))
++ error = gfs_make_fs_ro(sdp);
++ } else if (!(*flags & MS_RDONLY) &&
++ test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ error = gfs_make_fs_rw(sdp);
++ }
++
++ /* Don't let the VFS update atimes. */
++ *flags |= MS_NOATIME | MS_NODIRATIME;
++
++ return error;
++}
++
++/**
++ * gfs_clear_inode - Deallocate an inode when VFS is done with it
++ * @inode: The VFS inode
++ *
++ */
++
++static void
++gfs_clear_inode(struct inode *inode)
++{
++ struct gfs_inode *ip = vn2ip(inode);
++
++ atomic_inc(&vfs2sdp(inode->i_sb)->sd_ops_super);
++
++ if (ip) {
++ spin_lock(&ip->i_lock);
++ ip->i_vnode = NULL;
++ vn2ip(inode) = NULL;
++ spin_unlock(&ip->i_lock);
++
++ gfs_glock_schedule_for_reclaim(ip->i_gl);
++ gfs_inode_put(ip);
++ }
++}
++
++/**
++ * gfs_show_options - Show mount options for /proc/mounts
++ * @s: seq_file structure
++ * @mnt: vfsmount
++ *
++ * Returns: 0 on success or error code
++ */
++
++static int
++gfs_show_options(struct seq_file *s, struct vfsmount *mnt)
++{
++ struct gfs_sbd *sdp = vfs2sdp(mnt->mnt_sb);
++ struct gfs_args *args = &sdp->sd_args;
++
++ atomic_inc(&sdp->sd_ops_super);
++
++ if (args->ar_lockproto[0]) {
++ seq_printf(s, ",lockproto=");
++ seq_puts(s, args->ar_lockproto);
++ }
++ if (args->ar_locktable[0]) {
++ seq_printf(s, ",locktable=");
++ seq_puts(s, args->ar_locktable);
++ }
++ if (args->ar_hostdata[0]) {
++ seq_printf(s, ",hostdata=");
++ seq_puts(s, args->ar_hostdata);
++ }
++ if (args->ar_ignore_local_fs)
++ seq_printf(s, ",ignore_local_fs");
++ if (args->ar_localflocks)
++ seq_printf(s, ",localflocks");
++ if (args->ar_localcaching)
++ seq_printf(s, ",localcaching");
++ if (args->ar_upgrade)
++ seq_printf(s, ",upgrade");
++ if (args->ar_num_glockd != GFS_GLOCKD_DEFAULT)
++ seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
++ if (args->ar_posixacls)
++ seq_printf(s, ",acl");
++
++ return 0;
++}
++
++struct super_operations gfs_super_ops = {
++ .write_inode = gfs_write_inode,
++ .put_inode = gfs_put_inode,
++ .put_super = gfs_put_super,
++ .write_super = gfs_write_super,
++ .write_super_lockfs = gfs_write_super_lockfs,
++ .unlockfs = gfs_unlockfs,
++ .statfs = gfs_statfs,
++ .remount_fs = gfs_remount_fs,
++ .clear_inode = gfs_clear_inode,
++ .show_options = gfs_show_options,
++};
+diff -urN linux-orig/fs/gfs/ops_super.h linux-patched/fs/gfs/ops_super.h
+--- linux-orig/fs/gfs/ops_super.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_super.h 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,19 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_SUPER_DOT_H__
++#define __OPS_SUPER_DOT_H__
++
++extern struct super_operations gfs_super_ops;
++
++#endif /* __OPS_SUPER_DOT_H__ */
+diff -urN linux-orig/fs/gfs/ops_vm.c linux-patched/fs/gfs/ops_vm.c
+--- linux-orig/fs/gfs/ops_vm.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_vm.c 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,212 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/mm.h>
++#include <linux/pagemap.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "glock.h"
++#include "inode.h"
++#include "ops_vm.h"
++#include "page.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "trans.h"
++
++/**
++ * gfs_private_nopage - fault in a page for a private mapping
++ * @area: the VM area the fault occurred in
++ * @address: the faulting address
++ * @type: the type of fault, filled in by filemap_nopage()
++ *
++ * Returns: the page, or NULL on error
++ */
++
++static struct page *
++gfs_private_nopage(struct vm_area_struct *area,
++ unsigned long address, int *type)
++{
++ struct gfs_inode *ip = vn2ip(area->vm_file->f_mapping->host);
++ struct gfs_holder i_gh;
++ struct page *result;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_vm);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
++ if (error)
++ return NULL;
++
++ set_bit(GIF_PAGED, &ip->i_flags);
++
++ result = filemap_nopage(area, address, type);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return result;
++}
++
++/**
++ * alloc_page_backing - allocate the disk blocks backing a page
++ * @ip: the inode
++ * @index: the index of the page within the file
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
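++/*
++ * Block arithmetic used below: with 4096-byte pages and a 1024-byte
++ * filesystem block (sb_bsize_shift == 10), each page covers blocks == 4
++ * filesystem blocks, and page index N starts at logical block
++ * lblock == N * 4. Since gfs_block_map() can map a multi-block extent in
++ * one call, the mapping loop advances by extlen rather than one block at
++ * a time.
++ */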
++
++static int
++alloc_page_backing(struct gfs_inode *ip, unsigned long index)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ uint64_t lblock = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift);
++ unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
++ struct gfs_alloc *al;
++ unsigned int x;
++ int error;
++
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_lock_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto out;
++
++ error = gfs_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
++ if (error)
++ goto out_gunlock_q;
++
++ gfs_write_calc_reserv(ip, PAGE_CACHE_SIZE,
++ &al->al_requested_data, &al->al_requested_meta);
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto out_gunlock_q;
++
++ /* Trans may require:
++ a dinode block, RG bitmaps to allocate from,
++ indirect blocks, and a quota block */
++
++ error = gfs_trans_begin(sdp,
++ 1 + al->al_rgd->rd_ri.ri_length +
++ al->al_requested_meta, 1);
++ if (error)
++ goto out_ipres;
++
++ if (gfs_is_stuffed(ip)) {
++ error = gfs_unstuff_dinode(ip, gfs_unstuffer_page, NULL);
++ if (error)
++ goto out_trans;
++ }
++
++ for (x = 0; x < blocks; ) {
++ uint64_t dblock;
++ unsigned int extlen;
++ int new = TRUE;
++
++ error = gfs_block_map(ip, lblock, &new, &dblock, &extlen);
++ if (error)
++ goto out_trans;
++ GFS_ASSERT_INODE(dblock, ip,);
++
++ lblock += extlen;
++ x += extlen;
++ }
++
++ GFS_ASSERT_INODE(al->al_alloced_meta || al->al_alloced_data, ip,);
++
++ out_trans:
++ gfs_trans_end(sdp);
++
++ out_ipres:
++ gfs_inplace_release(ip);
++
++ out_gunlock_q:
++ gfs_quota_unlock_m(ip);
++
++ out:
++ gfs_alloc_put(ip);
++
++ return error;
++}
++
++/**
++ * gfs_sharewrite_nopage - fault in a page for a shared-writable mapping
++ * @area: the VM area the fault occurred in
++ * @address: the faulting address
++ * @type: the type of fault, filled in by filemap_nopage()
++ *
++ * Returns: the page, or NULL on error
++ */
++
++static struct page *
++gfs_sharewrite_nopage(struct vm_area_struct *area,
++ unsigned long address, int *type)
++{
++ struct gfs_inode *ip = vn2ip(area->vm_file->f_mapping->host);
++ struct gfs_holder i_gh;
++ struct page *result = NULL;
++ unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
++ int alloc_required;
++ int error;
++
++ atomic_inc(&ip->i_sbd->sd_ops_vm);
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (error)
++ return NULL;
++
++ if (gfs_is_jdata(ip))
++ goto out;
++
++ set_bit(GIF_PAGED, &ip->i_flags);
++ set_bit(GIF_SW_PAGED, &ip->i_flags);
++
++ error = gfs_write_alloc_required(ip, (uint64_t)index << PAGE_CACHE_SHIFT,
++ PAGE_CACHE_SIZE, &alloc_required);
++ if (error)
++ goto out;
++
++ result = filemap_nopage(area, address, type);
++ if (!result || result == NOPAGE_OOM)
++ goto out;
++
++ if (alloc_required) {
++ error = alloc_page_backing(ip, index);
++ if (error) {
++ page_cache_release(result);
++ result = NULL;
++ goto out;
++ }
++ set_page_dirty(result);
++ }
++
++ out:
++ gfs_glock_dq_uninit(&i_gh);
++
++ return result;
++}
++
++struct vm_operations_struct gfs_vm_ops_private = {
++ .nopage = gfs_private_nopage,
++};
++
++struct vm_operations_struct gfs_vm_ops_sharewrite = {
++ .nopage = gfs_sharewrite_nopage,
++};
++
+diff -urN linux-orig/fs/gfs/ops_vm.h linux-patched/fs/gfs/ops_vm.h
+--- linux-orig/fs/gfs/ops_vm.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/ops_vm.h 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,20 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __OPS_VM_DOT_H__
++#define __OPS_VM_DOT_H__
++
++extern struct vm_operations_struct gfs_vm_ops_private;
++extern struct vm_operations_struct gfs_vm_ops_sharewrite;
++
++#endif /* __OPS_VM_DOT_H__ */
+diff -urN linux-orig/fs/gfs/page.c linux-patched/fs/gfs/page.c
+--- linux-orig/fs/gfs/page.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/page.c 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,276 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/pagemap.h>
++#include <linux/mm.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "inode.h"
++#include "page.h"
++
++/**
++ * gfs_inval_pte - Sync and invalidate all PTEs associated with a glock
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_inval_pte(struct gfs_glock *gl)
++{
++ struct gfs_inode *ip;
++ struct inode *inode;
++
++ ip = gl2ip(gl);
++ if (!ip ||
++ ip->i_di.di_type != GFS_FILE_REG)
++ return;
++
++ if (!test_bit(GIF_PAGED, &ip->i_flags))
++ return;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (inode) {
++ unmap_shared_mapping_range(inode->i_mapping, 0, 0);
++ iput(inode);
++
++ if (test_bit(GIF_SW_PAGED, &ip->i_flags))
++ set_bit(GLF_DIRTY, &gl->gl_flags);
++ }
++
++ clear_bit(GIF_SW_PAGED, &ip->i_flags);
++}
++
++/**
++ * gfs_inval_page - Invalidate all pages associated with a glock
++ * @gl: the glock
++ *
++ */
++
++void
++gfs_inval_page(struct gfs_glock *gl)
++{
++ struct gfs_inode *ip;
++ struct inode *inode;
++
++ ip = gl2ip(gl);
++ if (!ip ||
++ ip->i_di.di_type != GFS_FILE_REG)
++ return;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (inode) {
++ struct address_space *mapping = inode->i_mapping;
++
++ truncate_inode_pages(mapping, 0);
++ GFS_ASSERT_INODE(!mapping->nrpages, ip,);
++
++ iput(inode);
++ }
++
++ clear_bit(GIF_PAGED, &ip->i_flags);
++}
++
++/**
++ * gfs_sync_page_i - Sync the pages for a struct inode
++ * @inode: the inode
++ * @flags: DIO_START | DIO_WAIT
++ *
++ */
++
++void
++gfs_sync_page_i(struct inode *inode, int flags)
++{
++ struct address_space *mapping = inode->i_mapping;
++ int error = 0;
++
++ if (flags & DIO_START)
++ error = filemap_fdatawrite(mapping);
++ if (!error && (flags & DIO_WAIT))
++ filemap_fdatawait(mapping);
++
++ if (error)
++ gfs_io_error_inode(vn2ip(inode));
++}
++
++/**
++ * gfs_sync_page - sync the pages associated with a glock
++ * @gl: the glock
++ * @flags: DIO_START | DIO_WAIT
++ *
++ */
++
++void
++gfs_sync_page(struct gfs_glock *gl, int flags)
++{
++ struct gfs_inode *ip;
++ struct inode *inode;
++
++ ip = gl2ip(gl);
++ if (!ip ||
++ ip->i_di.di_type != GFS_FILE_REG)
++ return;
++
++ inode = gfs_iget(ip, NO_CREATE);
++ if (inode) {
++ gfs_sync_page_i(inode, flags);
++ iput(inode);
++ }
++}
++
++/**
++ * gfs_unstuffer_page - unstuff a stuffed inode into a block cached by a page
++ * @ip: the inode
++ * @dibh: the dinode buffer
++ * @block: the block number that was allocated
++ * @private: any locked page held by the caller process
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
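++/*
++ * A "stuffed" dinode holds its file data in the dinode block itself,
++ * directly after the struct gfs_dinode header. Unstuffing copies those
++ * bytes into page 0 of the inode's address space, zeroes the rest of the
++ * page, and maps the page's buffer to the newly allocated block @block so
++ * that writeback lands in the right place.
++ */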
++
++int
++gfs_unstuffer_page(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private)
++{
++ struct inode *inode = ip->i_vnode;
++ struct page *page = (struct page *)private;
++ struct buffer_head *bh;
++ int release = FALSE;
++
++ if (!page || page->index) {
++ RETRY_MALLOC(page = grab_cache_page(inode->i_mapping, 0), page);
++ release = TRUE;
++ }
++
++ GFS_ASSERT_INODE(PageLocked(page), ip,);
++
++ if (!PageUptodate(page)) {
++ void *kaddr = kmap(page);
++
++ memcpy(kaddr,
++ dibh->b_data + sizeof(struct gfs_dinode),
++ ip->i_di.di_size);
++ memset(kaddr + ip->i_di.di_size,
++ 0,
++ PAGE_CACHE_SIZE - ip->i_di.di_size);
++ kunmap(page);
++
++ SetPageUptodate(page);
++ }
++
++ if (!page_has_buffers(page))
++ create_empty_buffers(page, 1 << inode->i_blkbits,
++ (1 << BH_Uptodate));
++
++ bh = page_buffers(page);
++
++ if (!buffer_mapped(bh))
++ map_bh(bh, inode->i_sb, block);
++ else
++ GFS_ASSERT_INODE(bh->b_bdev == inode->i_sb->s_bdev &&
++ bh->b_blocknr == block,
++ ip,);
++
++ set_buffer_uptodate(bh);
++ mark_buffer_dirty(bh);
++
++ if (release) {
++ unlock_page(page);
++ page_cache_release(page);
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_truncator_page - truncate a partial data block in the page cache
++ * @ip: the inode
++ * @size: the size the file should be
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_truncator_page(struct gfs_inode *ip, uint64_t size)
++{
++ struct inode *inode = ip->i_vnode;
++ struct page *page;
++ struct buffer_head *bh;
++ void *kaddr;
++ uint64_t lbn, dbn;
++ unsigned long index;
++ unsigned int offset;
++ unsigned int bufnum;
++ int not_new = 0;
++ int error;
++
++ lbn = size >> inode->i_blkbits;
++ error = gfs_block_map(ip,
++ lbn, &not_new,
++ &dbn, NULL);
++ if (error || !dbn)
++ return error;
++
++ index = size >> PAGE_CACHE_SHIFT;
++ offset = size & (PAGE_CACHE_SIZE - 1);
++ bufnum = lbn - (index << (PAGE_CACHE_SHIFT - inode->i_blkbits));
++
++ /* Not in a transaction here -- a non-disk-I/O error is ok. */
++
++ page = read_cache_page(inode->i_mapping, index,
++ (filler_t *)inode->i_mapping->a_ops->readpage,
++ NULL);
++ if (IS_ERR(page))
++ return PTR_ERR(page);
++
++ lock_page(page);
++
++ if (!PageUptodate(page) || PageError(page)) {
++ error = -EIO;
++ goto out;
++ }
++
++ kaddr = kmap(page);
++ memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
++ kunmap(page);
++
++ if (!page_has_buffers(page))
++ create_empty_buffers(page, 1 << inode->i_blkbits,
++ (1 << BH_Uptodate));
++
++ for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
++ /* Do nothing */;
++
++ if (!buffer_mapped(bh))
++ map_bh(bh, inode->i_sb, dbn);
++ else
++ GFS_ASSERT_INODE(bh->b_bdev == inode->i_sb->s_bdev &&
++ bh->b_blocknr == dbn,
++ ip,);
++
++ set_buffer_uptodate(bh);
++ mark_buffer_dirty(bh);
++
++ out:
++ unlock_page(page);
++ page_cache_release(page);
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/page.h linux-patched/fs/gfs/page.h
+--- linux-orig/fs/gfs/page.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/page.h 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,26 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __PAGE_DOT_H__
++#define __PAGE_DOT_H__
++
++void gfs_inval_pte(struct gfs_glock *gl);
++void gfs_inval_page(struct gfs_glock *gl);
++void gfs_sync_page_i(struct inode *inode, int flags);
++void gfs_sync_page(struct gfs_glock *gl, int flags);
++
++int gfs_unstuffer_page(struct gfs_inode *ip, struct buffer_head *dibh,
++ uint64_t block, void *private);
++int gfs_truncator_page(struct gfs_inode *ip, uint64_t size);
++
++#endif /* __PAGE_DOT_H__ */
+diff -urN linux-orig/fs/gfs/quota.c linux-patched/fs/gfs/quota.c
+--- linux-orig/fs/gfs/quota.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/quota.c 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,1146 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++#include <linux/tty.h>
++#include <asm/uaccess.h>
++
++#include "gfs.h"
++#include "bmap.h"
++#include "file.h"
++#include "glock.h"
++#include "glops.h"
++#include "log.h"
++#include "quota.h"
++#include "rgrp.h"
++#include "super.h"
++#include "trans.h"
++
++/**
++ * gfs_quota_get - Get a structure to represent a quota change
++ * @sdp: the filesystem
++ * @user: TRUE if this is a user quota
++ * @id: the uid or gid
++ * @create: if TRUE, create the structure, otherwise return NULL
++ * @qdp: the returned quota structure
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
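++/*
++ * The loop below exists because the new structure must be allocated
++ * outside sd_quota_lock (gmalloc and gfs_glock_get can sleep). After
++ * allocating new_qd, the list is searched again; if another process added
++ * the same ID in the meantime, the existing entry wins and new_qd is
++ * freed, otherwise new_qd is inserted on the next pass.
++ */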
++
++int
++gfs_quota_get(struct gfs_sbd *sdp, int user, uint32_t id, int create,
++ struct gfs_quota_data **qdp)
++{
++ struct gfs_quota_data *qd = NULL, *new_qd = NULL;
++ struct list_head *tmp, *head;
++ int error = 0;
++
++ for (;;) {
++ spin_lock(&sdp->sd_quota_lock);
++
++ for (head = &sdp->sd_quota_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ qd = list_entry(tmp, struct gfs_quota_data, qd_list);
++ if (qd->qd_id == id &&
++ !test_bit(QDF_USER, &qd->qd_flags) == !user) {
++ qd->qd_count++;
++ break;
++ }
++ }
++
++ if (tmp == head)
++ qd = NULL;
++
++ if (!qd && new_qd) {
++ qd = new_qd;
++ list_add(&qd->qd_list, &sdp->sd_quota_list);
++ new_qd = NULL;
++ }
++
++ spin_unlock(&sdp->sd_quota_lock);
++
++ if (qd || !create) {
++ if (new_qd) {
++ gfs_lvb_unhold(new_qd->qd_gl);
++ kfree(new_qd);
++ atomic_dec(&sdp->sd_quota_count);
++ }
++ goto out;
++ }
++
++ new_qd = gmalloc(sizeof(struct gfs_quota_data));
++ memset(new_qd, 0, sizeof(struct gfs_quota_data));
++
++ new_qd->qd_count = 1;
++
++ new_qd->qd_id = id;
++ if (user)
++ set_bit(QDF_USER, &new_qd->qd_flags);
++
++ INIT_LIST_HEAD(&new_qd->qd_le_list);
++
++ error = gfs_glock_get(sdp, 2 * (uint64_t)id + ((user) ? 0 : 1),
++ &gfs_quota_glops, CREATE,
++ &new_qd->qd_gl);
++ if (error) {
++ kfree(new_qd);
++ goto out;
++ }
++
++ error = gfs_lvb_hold(new_qd->qd_gl);
++
++ gfs_glock_put(new_qd->qd_gl);
++
++ if (error) {
++ kfree(new_qd);
++ goto out;
++ }
++
++ atomic_inc(&sdp->sd_quota_count);
++ }
++
++ out:
++ *qdp = qd;
++
++ return error;
++}
++
++/**
++ * gfs_quota_hold - increment the usage count on a struct gfs_quota_data
++ * @sdp: the filesystem
++ * @qd: the structure
++ *
++ */
++
++void
++gfs_quota_hold(struct gfs_sbd *sdp, struct gfs_quota_data *qd)
++{
++ spin_lock(&sdp->sd_quota_lock);
++ qd->qd_count++;
++ spin_unlock(&sdp->sd_quota_lock);
++}
++
++/**
++ * gfs_quota_put - decrement the usage count on a struct gfs_quota_data
++ * @sdp: the filesystem
++ * @qd: the structure
++ *
++ * Free the structure if its reference count hits zero.
++ *
++ */
++
++void
++gfs_quota_put(struct gfs_sbd *sdp, struct gfs_quota_data *qd)
++{
++ spin_lock(&sdp->sd_quota_lock);
++ GFS_ASSERT_SBD(qd->qd_count, sdp,);
++ qd->qd_count--;
++ spin_unlock(&sdp->sd_quota_lock);
++}
++
++/**
++ * quota_find - Find a quota change to sync to the quota file
++ * @sdp: the filesystem
++ *
++ * The returned structure is locked and needs to be unlocked
++ * with quota_unlock().
++ *
++ * Returns: A quota structure, or NULL
++ */
++
++static struct gfs_quota_data *
++quota_find(struct gfs_sbd *sdp)
++{
++ struct list_head *tmp, *head;
++ struct gfs_quota_data *qd = NULL;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags))
++ return NULL;
++
++ gfs_log_lock(sdp);
++ spin_lock(&sdp->sd_quota_lock);
++
++ if (!atomic_read(&sdp->sd_quota_od_count))
++ goto out;
++
++ for (head = &sdp->sd_quota_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ qd = list_entry(tmp, struct gfs_quota_data, qd_list);
++
++ if (test_bit(QDF_LOCK, &qd->qd_flags))
++ continue;
++ if (!test_bit(QDF_OD_LIST, &qd->qd_flags))
++ continue;
++ if (qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
++ continue;
++
++ list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
++
++ set_bit(QDF_LOCK, &qd->qd_flags);
++ qd->qd_count++;
++ qd->qd_change_sync = qd->qd_change_od;
++
++ goto out;
++ }
++
++ qd = NULL;
++
++ out:
++ spin_unlock(&sdp->sd_quota_lock);
++ gfs_log_unlock(sdp);
++
++ return qd;
++}
++
++/**
++ * quota_trylock - Try to lock a given quota entry
++ * @sdp: the filesystem
++ * @qd: the quota data structure
++ *
++ * Returns: TRUE if the lock was successful, FALSE otherwise
++ */
++
++static int
++quota_trylock(struct gfs_sbd *sdp, struct gfs_quota_data *qd)
++{
++ int ret = FALSE;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags))
++ return FALSE;
++
++ gfs_log_lock(sdp);
++ spin_lock(&sdp->sd_quota_lock);
++
++ if (test_bit(QDF_LOCK, &qd->qd_flags))
++ goto out;
++ if (!test_bit(QDF_OD_LIST, &qd->qd_flags))
++ goto out;
++
++ list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
++
++ set_bit(QDF_LOCK, &qd->qd_flags);
++ qd->qd_count++;
++ qd->qd_change_sync = qd->qd_change_od;
++
++ ret = TRUE;
++
++ out:
++ spin_unlock(&sdp->sd_quota_lock);
++ gfs_log_unlock(sdp);
++
++ return ret;
++}
++
++/**
++ * quota_unlock - unlock and drop a reference on a quota structure
++ * @sdp: the filesystem
++ * @qd: the quota data structure
++ *
++ */
++
++static void
++quota_unlock(struct gfs_sbd *sdp, struct gfs_quota_data *qd)
++{
++ spin_lock(&sdp->sd_quota_lock);
++
++ GFS_ASSERT_SBD(test_bit(QDF_LOCK, &qd->qd_flags), sdp,);
++ clear_bit(QDF_LOCK, &qd->qd_flags);
++
++ GFS_ASSERT_SBD(qd->qd_count, sdp,);
++ qd->qd_count--;
++
++ spin_unlock(&sdp->sd_quota_lock);
++}
++
++/**
++ * gfs_quota_merge - add/remove a quota change from the in-memory list
++ * @sdp: the filesystem
++ * @tag: the quota change tag
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
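++/*
++ * A quota_data with a nonzero on-disk change (qd_change_od) must remain
++ * visible to the sync code, so it sits on the on-disk (OD) list and holds
++ * an extra reference. When a merge brings qd_change_od back to zero, the
++ * entry leaves the list and that reference is dropped.
++ */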
++
++int
++gfs_quota_merge(struct gfs_sbd *sdp, struct gfs_quota_tag *tag)
++{
++ struct gfs_quota_data *qd;
++ int error;
++
++ error = gfs_quota_get(sdp,
++ tag->qt_flags & GFS_QTF_USER, tag->qt_id,
++ CREATE, &qd);
++ if (error)
++ return error;
++
++ GFS_ASSERT_SBD(qd->qd_change_ic == qd->qd_change_od, sdp,);
++
++ gfs_log_lock(sdp);
++
++ qd->qd_change_ic += tag->qt_change;
++ qd->qd_change_od += tag->qt_change;
++
++ if (qd->qd_change_od) {
++ if (!test_bit(QDF_OD_LIST, &qd->qd_flags)) {
++ gfs_quota_hold(sdp, qd);
++ set_bit(QDF_OD_LIST, &qd->qd_flags);
++ atomic_inc(&sdp->sd_quota_od_count);
++ }
++ } else {
++ GFS_ASSERT_SBD(test_bit(QDF_OD_LIST, &qd->qd_flags), sdp,);
++ clear_bit(QDF_OD_LIST, &qd->qd_flags);
++ gfs_quota_put(sdp, qd);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_quota_od_count), sdp,);
++ atomic_dec(&sdp->sd_quota_od_count);
++ }
++
++ gfs_log_unlock(sdp);
++
++ gfs_quota_put(sdp, qd);
++
++ return 0;
++}
++
++/**
++ * gfs_quota_scan - Look for unused struct gfs_quota_data structures to throw away
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_quota_scan(struct gfs_sbd *sdp)
++{
++ struct list_head *head, *tmp, *next;
++ struct gfs_quota_data *qd;
++ LIST_HEAD(dead);
++
++ spin_lock(&sdp->sd_quota_lock);
++
++ for (head = &sdp->sd_quota_list, tmp = head->next, next = tmp->next;
++ tmp != head;
++ tmp = next, next = next->next) {
++ qd = list_entry(tmp, struct gfs_quota_data, qd_list);
++ if (!qd->qd_count)
++ list_move(&qd->qd_list, &dead);
++ }
++
++ spin_unlock(&sdp->sd_quota_lock);
++
++ while (!list_empty(&dead)) {
++ qd = list_entry(dead.next, struct gfs_quota_data, qd_list);
++
++ GFS_ASSERT_SBD(!qd->qd_count, sdp,);
++ GFS_ASSERT_SBD(!test_bit(QDF_OD_LIST, &qd->qd_flags) &&
++ !test_bit(QDF_LOCK, &qd->qd_flags), sdp,);
++ GFS_ASSERT_SBD(!qd->qd_change_new && !qd->qd_change_ic &&
++ !qd->qd_change_od, sdp,);
++
++ list_del(&qd->qd_list);
++ gfs_lvb_unhold(qd->qd_gl);
++ kfree(qd);
++ atomic_dec(&sdp->sd_quota_count);
++ }
++}
++
++/**
++ * gfs_quota_cleanup - get rid of any extra struct gfs_quota_data structures
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_quota_cleanup(struct gfs_sbd *sdp)
++{
++ struct gfs_quota_data *qd;
++
++ restart:
++ gfs_log_lock(sdp);
++
++ spin_lock(&sdp->sd_quota_lock);
++
++ while (!list_empty(&sdp->sd_quota_list)) {
++ qd = list_entry(sdp->sd_quota_list.next,
++ struct gfs_quota_data,
++ qd_list);
++
++ if (qd->qd_count > 1) {
++ spin_unlock(&sdp->sd_quota_lock);
++ gfs_log_unlock(sdp);
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ schedule_timeout(HZ);
++ goto restart;
++
++ } else if (qd->qd_count) {
++ GFS_ASSERT_SBD(test_bit(QDF_OD_LIST, &qd->qd_flags) &&
++ !test_bit(QDF_LOCK, &qd->qd_flags),
++ sdp,);
++ GFS_ASSERT_SBD(qd->qd_change_od &&
++ qd->qd_change_od == qd->qd_change_ic,
++ sdp,);
++ GFS_ASSERT_SBD(!qd->qd_change_new, sdp,);
++
++ list_del(&qd->qd_list);
++ atomic_dec(&sdp->sd_quota_od_count);
++
++ spin_unlock(&sdp->sd_quota_lock);
++ gfs_lvb_unhold(qd->qd_gl);
++ kfree(qd);
++ atomic_dec(&sdp->sd_quota_count);
++ spin_lock(&sdp->sd_quota_lock);
++
++ } else {
++ GFS_ASSERT_SBD(!test_bit(QDF_OD_LIST, &qd->qd_flags) &&
++ !test_bit(QDF_LOCK, &qd->qd_flags), sdp,);
++ GFS_ASSERT_SBD(!qd->qd_change_new &&
++ !qd->qd_change_ic &&
++ !qd->qd_change_od, sdp,);
++
++ list_del(&qd->qd_list);
++
++ spin_unlock(&sdp->sd_quota_lock);
++ gfs_lvb_unhold(qd->qd_gl);
++ kfree(qd);
++ atomic_dec(&sdp->sd_quota_count);
++ spin_lock(&sdp->sd_quota_lock);
++ }
++ }
++
++ spin_unlock(&sdp->sd_quota_lock);
++
++ GFS_ASSERT_SBD(!atomic_read(&sdp->sd_quota_od_count), sdp,);
++
++ gfs_log_unlock(sdp);
++}
++
++/**
++ * sort_qd - figure out the order between two quota data structures
++ * @a: first quota data structure
++ * @b: second quota data structure
++ *
++ * Returns: -1 if @a comes before @b, 0 if @a equals @b, 1 if @b comes before @a
++ */
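++/*
++ * The resulting order puts all user quotas ahead of all group quotas,
++ * with ties broken by ascending ID, e.g.
++ * (user, 0) < (user, 500) < (group, 0) < (group, 500).
++ * Sorting into one canonical order before acquiring the glocks means
++ * every process locks quota entries in the same sequence, avoiding
++ * deadlock.
++ */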
++
++static int
++sort_qd(void *a, void *b)
++{
++ struct gfs_quota_data *qd_a = *(struct gfs_quota_data **)a;
++ struct gfs_quota_data *qd_b = *(struct gfs_quota_data **)b;
++ int ret = 0;
++
++ if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
++ !test_bit(QDF_USER, &qd_b->qd_flags)) {
++ if (test_bit(QDF_USER, &qd_a->qd_flags))
++ ret = -1;
++ else
++ ret = 1;
++ } else {
++ if (qd_a->qd_id < qd_b->qd_id)
++ ret = -1;
++ else if (qd_a->qd_id > qd_b->qd_id)
++ ret = 1;
++ }
++
++ return ret;
++}
++
++/**
++ * do_quota_sync - Sync a bunch of quota changes to the quota file
++ * @sdp: the filesystem
++ * @qda: an array of struct gfs_quota_data structures to be synced
++ * @num_qd: the number of elements in @qda
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++do_quota_sync(struct gfs_sbd *sdp, struct gfs_quota_data **qda,
++ unsigned int num_qd)
++{
++ struct gfs_inode *ip = sdp->sd_qinode;
++ struct gfs_alloc *al = NULL;
++ struct gfs_holder i_gh, *ghs;
++ struct gfs_quota q;
++ char buf[sizeof(struct gfs_quota)];
++ uint64_t offset;
++ unsigned int qx, x;
++ int ar;
++ unsigned int nalloc = 0;
++ unsigned int data_blocks, ind_blocks;
++ int error;
++
++ gfs_write_calc_reserv(ip, sizeof(struct gfs_quota), &data_blocks,
++ &ind_blocks);
++
++ ghs = gmalloc(num_qd * sizeof(struct gfs_holder));
++
++ gfs_sort(qda, num_qd, sizeof (struct gfs_quota_data *), sort_qd);
++ for (qx = 0; qx < num_qd; qx++) {
++ error = gfs_glock_nq_init(qda[qx]->qd_gl,
++ LM_ST_EXCLUSIVE,
++ GL_NOCACHE, &ghs[qx]);
++ if (error)
++ goto fail;
++ }
++
++ error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
++ if (error)
++ goto fail;
++
++ for (x = 0; x < num_qd; x++) {
++ offset = (2 * (uint64_t)qda[x]->qd_id +
++ ((test_bit(QDF_USER, &qda[x]->qd_flags)) ? 0 : 1)) *
++ sizeof(struct gfs_quota);
++
++ error = gfs_write_alloc_required(ip, offset,
++ sizeof(struct gfs_quota),
++ &ar);
++ if (error)
++ goto fail_gunlock;
++
++ if (ar)
++ nalloc++;
++ }
++
++ if (nalloc) {
++ al = gfs_alloc_get(ip);
++
++ error = gfs_quota_hold_m(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
++ if (error)
++ goto fail_alloc;
++
++ al->al_requested_meta = nalloc * (data_blocks + ind_blocks);
++
++ error = gfs_inplace_reserve(ip);
++ if (error)
++ goto fail_qs;
++
++ /* Trans may require:
++ two (journaled) data blocks, a dinode block, RG bitmaps to allocate from,
++ indirect blocks, and a quota block */
++
++ error = gfs_trans_begin(sdp,
++ 1 + al->al_rgd->rd_ri.ri_length +
++ num_qd * data_blocks +
++ nalloc * ind_blocks,
++ gfs_struct2blk(sdp, num_qd + 2,
++ sizeof(struct gfs_quota_tag)));
++ if (error)
++ goto fail_ipres;
++ } else {
++ /* Trans may require:
++ Data blocks, a dinode block, and quota blocks */
++
++ error = gfs_trans_begin(sdp,
++ 1 + data_blocks * num_qd,
++ gfs_struct2blk(sdp, num_qd,
++ sizeof(struct gfs_quota_tag)));
++ if (error)
++ goto fail_gunlock;
++ }
++
++ for (x = 0; x < num_qd; x++) {
++ offset = (2 * (uint64_t)qda[x]->qd_id +
++ ((test_bit(QDF_USER, &qda[x]->qd_flags)) ? 0 : 1)) *
++ sizeof(struct gfs_quota);
++
++ /* The quota file may not be a multiple of sizeof(struct gfs_quota) bytes. */
++ memset(buf, 0, sizeof(struct gfs_quota));
++
++ error = gfs_internal_read(ip, buf, offset,
++ sizeof(struct gfs_quota));
++ if (error < 0)
++ goto fail_end_trans;
++
++ gfs_quota_in(&q, buf);
++ q.qu_value += qda[x]->qd_change_sync;
++ gfs_quota_out(&q, buf);
++
++ error = gfs_internal_write(ip, buf, offset,
++ sizeof(struct gfs_quota));
++ if (error < 0)
++ goto fail_end_trans;
++ else if (error != sizeof(struct gfs_quota)) {
++ error = -EIO;
++ goto fail_end_trans;
++ }
++
++ if (test_bit(QDF_USER, &qda[x]->qd_flags))
++ gfs_trans_add_quota(sdp, -qda[x]->qd_change_sync,
++ qda[x]->qd_id, NO_QUOTA_CHANGE);
++ else
++ gfs_trans_add_quota(sdp, -qda[x]->qd_change_sync,
++ NO_QUOTA_CHANGE, qda[x]->qd_id);
++
++ memset(&qda[x]->qd_qb, 0, sizeof(struct gfs_quota_lvb));
++ qda[x]->qd_qb.qb_magic = GFS_MAGIC;
++ qda[x]->qd_qb.qb_limit = q.qu_limit;
++ qda[x]->qd_qb.qb_warn = q.qu_warn;
++ qda[x]->qd_qb.qb_value = q.qu_value;
++
++ gfs_quota_lvb_out(&qda[x]->qd_qb, qda[x]->qd_gl->gl_lvb);
++ clear_bit(GLF_LVB_INVALID, &qda[x]->qd_gl->gl_flags);
++ }
++
++ gfs_trans_end(sdp);
++
++ if (nalloc) {
++ GFS_ASSERT_SBD(al->al_alloced_meta, sdp,);
++ gfs_inplace_release(ip);
++ gfs_quota_unhold_m(ip);
++ gfs_alloc_put(ip);
++ }
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ for (x = 0; x < num_qd; x++)
++ gfs_glock_dq_uninit(&ghs[x]);
++
++ kfree(ghs);
++
++ gfs_log_flush_glock(ip->i_gl);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_ipres:
++ if (nalloc)
++ gfs_inplace_release(ip);
++
++ fail_qs:
++ if (nalloc)
++ gfs_quota_unhold_m(ip);
++
++ fail_alloc:
++ if (nalloc)
++ gfs_alloc_put(ip);
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&i_gh);
++
++ fail:
++ while (qx--)
++ gfs_glock_dq_uninit(&ghs[qx]);
++
++ kfree(ghs);
++
++ return error;
++}
++
++/**
++ * glock_q - Acquire a lock for a quota entry
++ * @sdp: the filesystem
++ * @qd: the quota data structure to glock
++ * @force_refresh: If TRUE, always read from the quota file
++ * @q_gh: the glock holder for the quota lock
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
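++/*
++ * Locking pattern used below: the lock value block (LVB) caches the
++ * limit/warn/value triple cluster-wide, so the common case is a shared
++ * lock plus an LVB read. If the LVB is stale (wrong magic, marked
++ * invalid, or @force_refresh), the lock is upgraded to exclusive, the
++ * authoritative values are read from the quota file and published back
++ * into the LVB, and the function loops to retry the cheap shared path.
++ */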
++
++static int
++glock_q(struct gfs_sbd *sdp, struct gfs_quota_data *qd, int force_refresh,
++ struct gfs_holder *q_gh)
++{
++ struct gfs_holder i_gh;
++ struct gfs_quota q;
++ char buf[sizeof(struct gfs_quota)];
++ int error;
++
++ restart:
++ error = gfs_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
++ if (error)
++ return error;
++
++ gfs_quota_lvb_in(&qd->qd_qb, qd->qd_gl->gl_lvb);
++
++ if (force_refresh ||
++ qd->qd_qb.qb_magic != GFS_MAGIC ||
++ test_bit(GLF_LVB_INVALID, &qd->qd_gl->gl_flags)) {
++ gfs_glock_dq_uninit(q_gh);
++ error = gfs_glock_nq_init(qd->qd_gl,
++ LM_ST_EXCLUSIVE, GL_NOCACHE,
++ q_gh);
++ if (error)
++ return error;
++
++ error = gfs_glock_nq_init(sdp->sd_qinode->i_gl,
++ LM_ST_SHARED, 0,
++ &i_gh);
++ if (error)
++ goto fail;
++
++ memset(buf, 0, sizeof(struct gfs_quota));
++
++ error = gfs_internal_read(sdp->sd_qinode, buf,
++ (2 * (uint64_t)qd->qd_id +
++ ((test_bit(QDF_USER, &qd->qd_flags)) ? 0 : 1)) *
++ sizeof(struct gfs_quota),
++ sizeof(struct gfs_quota));
++ if (error < 0)
++ goto fail_gunlock;
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ gfs_quota_in(&q, buf);
++
++ memset(&qd->qd_qb, 0, sizeof(struct gfs_quota_lvb));
++ qd->qd_qb.qb_magic = GFS_MAGIC;
++ qd->qd_qb.qb_limit = q.qu_limit;
++ qd->qd_qb.qb_warn = q.qu_warn;
++ qd->qd_qb.qb_value = q.qu_value;
++
++ gfs_quota_lvb_out(&qd->qd_qb, qd->qd_gl->gl_lvb);
++ clear_bit(GLF_LVB_INVALID, &qd->qd_gl->gl_flags);
++
++ gfs_glock_dq_uninit(q_gh);
++ force_refresh = FALSE;
++ goto restart;
++ }
++
++ return 0;
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&i_gh);
++
++ fail:
++ gfs_glock_dq_uninit(q_gh);
++
++ return error;
++}
++
++/**
++ * gfs_quota_hold_m - Hold the quota structures for up to 4 IDs
++ * @ip: Two of the IDs are the UID and GID from this file
++ * @uid: a UID or the constant NO_QUOTA_CHANGE
++ * @gid: a GID or the constant NO_QUOTA_CHANGE
++ *
++ * The struct gfs_quota_data structures representing the locks are
++ * stored in the ip->i_alloc->al_qd array.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_quota_hold_m(struct gfs_inode *ip, uint32_t uid, uint32_t gid)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ unsigned int x = 0;
++ int error;
++
++ GFS_ASSERT_INODE(al && !al->al_qd_num &&
++ !test_bit(GIF_QD_LOCKED, &ip->i_flags), ip,);
++
++ if (!sdp->sd_tune.gt_quota_account)
++ return 0;
++
++ error = gfs_quota_get(sdp, TRUE, ip->i_di.di_uid,
++ CREATE, &al->al_qd[x]);
++ if (error)
++ goto fail;
++ x++;
++
++ error = gfs_quota_get(sdp, FALSE, ip->i_di.di_gid,
++ CREATE, &al->al_qd[x]);
++ if (error)
++ goto fail;
++ x++;
++
++ if (uid != NO_QUOTA_CHANGE) {
++ error = gfs_quota_get(sdp, TRUE, uid,
++ CREATE, &al->al_qd[x]);
++ if (error)
++ goto fail;
++ x++;
++ }
++
++ if (gid != NO_QUOTA_CHANGE) {
++ error = gfs_quota_get(sdp, FALSE, gid,
++ CREATE, &al->al_qd[x]);
++ if (error)
++ goto fail;
++ x++;
++ }
++
++ al->al_qd_num = x;
++
++ return 0;
++
++ fail:
++ if (x) {
++ al->al_qd_num = x;
++ gfs_quota_unhold_m(ip);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_quota_unhold_m - throw away some quota locks
++ * @ip: the inode whose ip->i_alloc->al_qd array holds the structures
++ *
++ */
++
++void
++gfs_quota_unhold_m(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ unsigned int x;
++
++ GFS_ASSERT_INODE(al &&
++ !test_bit(GIF_QD_LOCKED, &ip->i_flags), ip,);
++
++ for (x = 0; x < al->al_qd_num; x++) {
++ gfs_quota_put(sdp, al->al_qd[x]);
++ al->al_qd[x] = NULL;
++ }
++ al->al_qd_num = 0;
++}
++
++/**
++ * gfs_quota_lock_m - Acquire the quota locks for up to 4 IDs
++ * @ip: Two of the IDs are the UID and GID from this file
++ * @uid: a UID or the constant NO_QUOTA_CHANGE
++ * @gid: a GID or the constant NO_QUOTA_CHANGE
++ *
++ * The struct gfs_quota_data structures representing the locks are
++ * stored in the ip->i_alloc->al_qd array.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_quota_lock_m(struct gfs_inode *ip, uint32_t uid, uint32_t gid)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ unsigned int x;
++ int error;
++
++ error = gfs_quota_hold_m(ip, uid, gid);
++ if (error)
++ return error;
++
++ if (!sdp->sd_tune.gt_quota_enforce)
++ return 0;
++ if (capable(CAP_SYS_RESOURCE))
++ return 0;
++
++ gfs_sort(al->al_qd, al->al_qd_num,
++ sizeof(struct gfs_quota_data *), sort_qd);
++
++ for (x = 0; x < al->al_qd_num; x++) {
++ error = glock_q(sdp, al->al_qd[x], FALSE, &al->al_qd_ghs[x]);
++ if (error)
++ goto fail;
++ }
++
++ set_bit(GIF_QD_LOCKED, &ip->i_flags);
++
++ return 0;
++
++ fail:
++ while (x--)
++ gfs_glock_dq_uninit(&al->al_qd_ghs[x]);
++
++ return error;
++}
++
++/**
++ * gfs_quota_unlock_m - drop some quota locks
++ * @ip: the inode whose ip->i_alloc->al_qd array holds the locks
++ *
++ */
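++/*
++ * The do_sync heuristic below estimates the cluster-wide usage as
++ *
++ * v = qb_value + local_change * num_journals * scale_num / scale_den
++ *
++ * and syncs this node's change to the quota file once v reaches the
++ * limit. For example: limit 1000, qb_value 900, a local change of 30,
++ * 4 journals, and a 1/1 scale give v = 900 + 30 * 4 = 1020 >= 1000, so
++ * the change is synced; with a single journal, v = 930 and the sync is
++ * deferred.
++ */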
++
++void
++gfs_quota_unlock_m(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ struct gfs_quota_data *qd, *qda[4];
++ int64_t value;
++ unsigned int count = 0;
++ unsigned int x;
++ int do_sync;
++
++ if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
++ goto out;
++
++ for (x = 0; x < al->al_qd_num; x++) {
++ qd = al->al_qd[x];
++
++ spin_lock(&sdp->sd_quota_lock);
++ value = qd->qd_change_new + qd->qd_change_ic;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ do_sync = TRUE;
++ if (!qd->qd_qb.qb_limit)
++ do_sync = FALSE;
++ else if (qd->qd_qb.qb_value >= (int64_t)qd->qd_qb.qb_limit)
++ do_sync = FALSE;
++ else {
++ int64_t v;
++ v = value * gfs_num_journals(sdp) * sdp->sd_tune.gt_quota_scale_num;
++ do_div(v, sdp->sd_tune.gt_quota_scale_den);
++ v += qd->qd_qb.qb_value;
++ if (v < (int64_t)qd->qd_qb.qb_limit)
++ do_sync = FALSE;
++ }
++
++ gfs_glock_dq_uninit(&al->al_qd_ghs[x]);
++
++ if (do_sync) {
++ gfs_log_flush(sdp);
++ if (quota_trylock(sdp, qd))
++ qda[count++] = qd;
++ }
++ }
++
++ if (count) {
++ do_quota_sync(sdp, qda, count);
++
++ for (x = 0; x < count; x++)
++ quota_unlock(sdp, qda[x]);
++ }
++
++ out:
++ gfs_quota_unhold_m(ip);
++}
++
++/**
++ * print_quota_message - print a message to the user's tty about quotas
++ * @sdp: the filesystem
++ * @qd: the quota ID that the message is about
++ * @type: the type of message ("exceeded" or "warning")
++ *
++ */
++
++static void
++print_quota_message(struct gfs_sbd *sdp, struct gfs_quota_data *qd, char *type)
++{
++ char *line = gmalloc(256);
++ int len;
++ struct tty_struct *tty;
++
++ len = snprintf(line, 256, "GFS: fsid=%s: quota %s for %s %u\r\n",
++ sdp->sd_fsname, type,
++ (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
++ qd->qd_id);
++
++ if (current->signal) {
++ tty = current->signal->tty;
++ if (tty && tty->driver->write)
++ tty->driver->write(tty, 0, line, len);
++ }
++
++ kfree(line);
++}
++
++/**
++ * gfs_quota_check - Check to see if a block allocation is possible
++ * @ip: the inode whose ip->i_alloc->al_qd array holds the quota locks
++ * @uid: the UID the block is allocated for
++ * @gid: the GID the block is allocated for
++ *
++ * Returns: 0 if the allocation is permitted, -EDQUOT if a quota is exceeded
++ */
++
++int
++gfs_quota_check(struct gfs_inode *ip, uint32_t uid, uint32_t gid)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ struct gfs_quota_data *qd;
++ int64_t value;
++ unsigned int x;
++ int error = 0;
++
++ if (!al)
++ return 0;
++
++ for (x = 0; x < al->al_qd_num; x++) {
++ qd = al->al_qd[x];
++
++ if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
++ (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
++ continue;
++
++ spin_lock(&sdp->sd_quota_lock);
++ value = qd->qd_change_new + qd->qd_change_ic;
++ spin_unlock(&sdp->sd_quota_lock);
++ value += qd->qd_qb.qb_value;
++
++ if (qd->qd_qb.qb_limit && (int64_t)qd->qd_qb.qb_limit < value) {
++ print_quota_message(sdp, qd, "exceeded");
++ error = -EDQUOT;
++ break;
++ } else if (qd->qd_qb.qb_warn &&
++ (int64_t)qd->qd_qb.qb_warn < value &&
++ time_after_eq(jiffies,
++ qd->qd_last_warn +
++ sdp->sd_tune.gt_quota_warn_period * HZ)) {
++ print_quota_message(sdp, qd, "warning");
++ qd->qd_last_warn = jiffies;
++ }
++ }
++
++ return error;
++}
++
++/**
++ * gfs_quota_sync - Sync quota changes to the quota file
++ * @sdp: the filesystem
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_quota_sync(struct gfs_sbd *sdp)
++{
++ struct gfs_quota_data **qda;
++ unsigned int max_qd = sdp->sd_tune.gt_quota_simul_sync;
++ unsigned int num_qd;
++ unsigned int x;
++ int error = 0;
++
++ sdp->sd_quota_sync_gen++;
++
++ qda = gmalloc(max_qd * sizeof(struct gfs_quota_data *));
++
++ memset(qda, 0, max_qd * sizeof(struct gfs_quota_data *));
++
++ do {
++ num_qd = 0;
++
++ for (;;) {
++ qda[num_qd] = quota_find(sdp);
++ if (!qda[num_qd])
++ break;
++
++ if (++num_qd == max_qd)
++ break;
++ }
++
++ if (num_qd) {
++ error = do_quota_sync(sdp, qda, num_qd);
++ if (!error)
++ for (x = 0; x < num_qd; x++)
++ qda[x]->qd_sync_gen =
++ sdp->sd_quota_sync_gen;
++
++ for (x = 0; x < num_qd; x++)
++ quota_unlock(sdp, qda[x]);
++ }
++ } while (!error && num_qd == max_qd);
++
++ kfree(qda);
++
++ return error;
++}
++
++/**
++ * gfs_quota_refresh - Refresh the LVB for a given quota ID
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_quota_name in user space
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_quota_refresh(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_quota_name qn;
++ struct gfs_quota_data *qd;
++ struct gfs_holder q_gh;
++ int error;
++
++ if (copy_from_user(&qn, arg, sizeof(struct gfs_quota_name)))
++ return -EFAULT;
++
++ error = gfs_quota_get(sdp, qn.qn_user, qn.qn_id, CREATE, &qd);
++ if (error)
++ return error;
++
++ error = glock_q(sdp, qd, TRUE, &q_gh);
++ if (!error)
++ gfs_glock_dq_uninit(&q_gh);
++
++ gfs_quota_put(sdp, qd);
++
++ return error;
++}
++
++/**
++ * gfs_quota_read - Read the info for a given quota ID
++ * @sdp: the filesystem
++ * @arg: a pointer to a struct gfs_quota_name in user space, followed by
++ *       space for the struct gfs_quota to be filled in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_quota_read(struct gfs_sbd *sdp, void *arg)
++{
++ struct gfs_quota_name qn;
++ struct gfs_quota_data *qd;
++ struct gfs_holder q_gh;
++ struct gfs_quota q;
++ int error;
++
++ if (copy_from_user(&qn, arg, sizeof(struct gfs_quota_name)))
++ return -EFAULT;
++
++ if (((qn.qn_user) ?
++ (qn.qn_id != current->fsuid) :
++ (!in_group_p(qn.qn_id))) &&
++ !capable(CAP_SYS_ADMIN))
++ return -EACCES;
++
++ error = gfs_quota_get(sdp, qn.qn_user, qn.qn_id, CREATE, &qd);
++ if (error)
++ return error;
++
++ error = glock_q(sdp, qd, FALSE, &q_gh);
++ if (error)
++ goto out;
++
++ memset(&q, 0, sizeof(struct gfs_quota));
++ q.qu_limit = qd->qd_qb.qb_limit;
++ q.qu_warn = qd->qd_qb.qb_warn;
++ q.qu_value = qd->qd_qb.qb_value;
++
++ spin_lock(&sdp->sd_quota_lock);
++ q.qu_value += qd->qd_change_new + qd->qd_change_ic;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ gfs_glock_dq_uninit(&q_gh);
++
++ out:
++ gfs_quota_put(sdp, qd);
++
++ if (!error &&
++ copy_to_user((char *)arg + sizeof(struct gfs_quota_name),
++ &q, sizeof(struct gfs_quota)))
++ error = -EFAULT;
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/quota.h linux-patched/fs/gfs/quota.h
+--- linux-orig/fs/gfs/quota.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/quota.h 2004-06-20 22:48:17.953945277 -0500
+@@ -0,0 +1,40 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __QUOTA_DOT_H__
++#define __QUOTA_DOT_H__
++
++#define NO_QUOTA_CHANGE ((uint32_t)-1)
++
++int gfs_quota_get(struct gfs_sbd *sdp, int user, uint32_t id, int create,
++ struct gfs_quota_data **qdp);
++void gfs_quota_hold(struct gfs_sbd *sdp, struct gfs_quota_data *qd);
++void gfs_quota_put(struct gfs_sbd *sdp, struct gfs_quota_data *qd);
++
++int gfs_quota_merge(struct gfs_sbd *sdp, struct gfs_quota_tag *tag);
++void gfs_quota_scan(struct gfs_sbd *sdp);
++void gfs_quota_cleanup(struct gfs_sbd *sdp);
++
++int gfs_quota_hold_m(struct gfs_inode *ip, uint32_t uid, uint32_t gid);
++void gfs_quota_unhold_m(struct gfs_inode *ip);
++
++int gfs_quota_lock_m(struct gfs_inode *ip, uint32_t uid, uint32_t gid);
++void gfs_quota_unlock_m(struct gfs_inode *ip);
++
++int gfs_quota_check(struct gfs_inode *ip, uint32_t uid, uint32_t gid);
++
++int gfs_quota_sync(struct gfs_sbd *sdp);
++int gfs_quota_refresh(struct gfs_sbd *sdp, void *arg);
++int gfs_quota_read(struct gfs_sbd *sdp, void *arg);
++
++#endif /* __QUOTA_DOT_H__ */
+diff -urN linux-orig/fs/gfs/recovery.c linux-patched/fs/gfs/recovery.c
+--- linux-orig/fs/gfs/recovery.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/recovery.c 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,749 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "glops.h"
++#include "lops.h"
++#include "recovery.h"
++
++#define bn2seg(bn) (((uint32_t)((bn) - jdesc->ji_addr)) / sdp->sd_sb.sb_seg_size)
++#define seg2bn(seg) ((seg) * sdp->sd_sb.sb_seg_size + jdesc->ji_addr)
++
++struct dirty_j {
++ struct list_head dj_list;
++ unsigned int dj_jid;
++ struct gfs_jindex dj_desc;
++};
++
++/**
++ * gfs_add_dirty_j - add a jid to the list of dirty journals
++ * @sdp: the filesystem
++ * @jid: the journal ID number
++ *
++ */
++
++void
++gfs_add_dirty_j(struct gfs_sbd *sdp, unsigned int jid)
++{
++ struct dirty_j *dj;
++
++ dj = gmalloc(sizeof(struct dirty_j));
++ memset(dj, 0, sizeof(struct dirty_j));
++
++ dj->dj_jid = jid;
++
++ spin_lock(&sdp->sd_dirty_j_lock);
++ list_add(&dj->dj_list, &sdp->sd_dirty_j);
++ spin_unlock(&sdp->sd_dirty_j_lock);
++}
++
++/**
++ * get_dirty_j - return a dirty journal from the list
++ * @sdp: the filesystem
++ *
++ * Returns: a struct dirty_j or NULL
++ */
++
++static struct dirty_j *
++get_dirty_j(struct gfs_sbd *sdp)
++{
++ struct dirty_j *dj = NULL;
++
++ spin_lock(&sdp->sd_dirty_j_lock);
++ if (!list_empty(&sdp->sd_dirty_j)) {
++ dj = list_entry(sdp->sd_dirty_j.prev, struct dirty_j, dj_list);
++ list_del(&dj->dj_list);
++ }
++ spin_unlock(&sdp->sd_dirty_j_lock);
++
++ return dj;
++}
++
++/**
++ * gfs_clear_dirty_j - destroy the list of dirty journals
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_clear_dirty_j(struct gfs_sbd *sdp)
++{
++ struct dirty_j *dj;
++ for (;;) {
++ dj = get_dirty_j(sdp);
++ if (!dj)
++ break;
++ kfree(dj);
++ }
++}
++
++/**
++ * get_log_header - read the log header for a given segment
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @seg: the segment to look at
++ * @lh: the log header to return
++ *
++ * Read the log header for a given segment in a given journal.  Do a few
++ * sanity checks on it.
++ *
++ * Returns: 0 on success, 1 if the header was invalid or incomplete, -EXXX on error
++ */
++
++static int
++get_log_header(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint32_t seg, struct gfs_log_header *lh)
++{
++ struct buffer_head *bh;
++ struct gfs_log_header lh2;
++ int error;
++
++ error = gfs_dread(sdp, seg2bn(seg), gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++
++ gfs_log_header_in(lh, bh->b_data);
++ gfs_log_header_in(&lh2,
++ bh->b_data + GFS_BASIC_BLOCK -
++ sizeof(struct gfs_log_header));
++
++ brelse(bh);
++
++ if (memcmp(lh, &lh2, sizeof(struct gfs_log_header)) != 0 ||
++ lh->lh_header.mh_magic != GFS_MAGIC ||
++ lh->lh_header.mh_type != GFS_METATYPE_LH)
++ error = 1;
++
++ return error;
++}
++
++/**
++ * find_good_lh - find a good log header
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @seg: the segment to start searching from (also filled in with a new value)
++ * @lh: the log header to fill in
++ * @forward: if true search forward in the log, else search backward
++ *
++ * Call get_log_header() to get a log header for a segment, but if the
++ * segment is bad, either scan forward or backward until we find a good one.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++find_good_lh(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint32_t *seg, struct gfs_log_header *lh,
++ int forward)
++{
++ int error;
++ uint32_t orig_seg = *seg;
++
++ for (;;) {
++ error = get_log_header(sdp, jdesc, gl, *seg, lh);
++ if (error <= 0)
++ return error;
++
++ if (forward) {
++ if (++*seg == jdesc->ji_nsegment)
++ *seg = 0;
++ } else {
++ if (*seg-- == 0)
++ *seg = jdesc->ji_nsegment - 1;
++ }
++
++ GFS_ASSERT_SBD(*seg != orig_seg, sdp,);
++ }
++}
++
++/**
++ * verify_jhead - make sure we've found the head of the log
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @head: this is filled in with the log descriptor of the head
++ *
++ * At this point, seg and lh should be either the head of the log or just
++ * before. Scan forward until we find the head.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++verify_jhead(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, struct gfs_log_header *head)
++{
++ struct gfs_log_header lh;
++ uint32_t seg;
++ int error;
++
++ seg = bn2seg(head->lh_first);
++
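++ /* Scan forward from the candidate head.  Any valid header with a
++    higher sequence number becomes the new head; the first header
++    with a lower sequence number means we've passed the true head. */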
++ for (;;) {
++ if (++seg == jdesc->ji_nsegment)
++ seg = 0;
++
++ error = get_log_header(sdp, jdesc, gl, seg, &lh);
++ if (error < 0)
++ return error;
++
++ if (error == 1)
++ continue;
++ if (lh.lh_sequence == head->lh_sequence)
++ continue;
++
++ if (lh.lh_sequence < head->lh_sequence)
++ break;
++
++ memcpy(head, &lh, sizeof(struct gfs_log_header));
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_find_jhead - find the head of a log
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @head: the log descriptor for the head of the log is returned here
++ *
++ * Do a binary search of a journal and find the valid log entry with the
++ * highest sequence number. (i.e. the log head)
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_find_jhead(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, struct gfs_log_header *head)
++{
++ struct gfs_log_header lh1, lh_m;
++ uint32_t seg1, seg2, seg_m;
++ int error;
++
++ seg1 = 0;
++ seg2 = jdesc->ji_nsegment - 1;
++
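++ /* Binary search: sequence numbers increase from the log's tail to
++    its head and then drop at the wrap point.  Keep seg1 at or before
++    the head and seg2 after it until they converge. */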
++ for (;;) {
++ seg_m = (seg1 + seg2) / 2;
++
++ error = find_good_lh(sdp, jdesc, gl, &seg1, &lh1, TRUE);
++ if (error)
++ break;
++
++ if (seg1 == seg_m) {
++ error = verify_jhead(sdp, jdesc, gl, &lh1);
++ memcpy(head, &lh1, sizeof(struct gfs_log_header));
++ break;
++ }
++
++ error = find_good_lh(sdp, jdesc, gl, &seg_m, &lh_m, FALSE);
++ if (error)
++ break;
++
++ if (lh1.lh_sequence <= lh_m.lh_sequence)
++ seg1 = seg_m;
++ else
++ seg2 = seg_m;
++ }
++
++ return error;
++}
++
++/**
++ * gfs_increment_blkno - move to the next block in a journal
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @addr: the block number to increment
++ * @skip_headers: if this is TRUE, skip log headers
++ *
++ * Replace @addr with the location of the next block in the log.
++ * Take care of journal wrap and skip of log header if necessary.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_increment_blkno(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t *addr, int skip_headers)
++{
++ struct gfs_log_header header;
++ int error;
++
++ (*addr)++;
++
++ /* Handle journal wrap */
++
++ if (*addr == seg2bn(jdesc->ji_nsegment))
++ *addr -= jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size;
++
++ gfs_start_ra(gl, *addr,
++ jdesc->ji_addr +
++ jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size - *addr);
++
++ /* Handle landing on a header block */
++
++ if (skip_headers && !do_mod(*addr, sdp->sd_sb.sb_seg_size)) {
++ error = get_log_header(sdp, jdesc, gl, bn2seg(*addr), &header);
++ if (error < 0)
++ return error;
++
++ GFS_ASSERT_SBD(!error, sdp,); /* Corrupt headers here are bad */
++ GFS_ASSERT_SBD(header.lh_first != *addr, sdp,
++ gfs_log_header_print(&header);
++ printk("*addr = %"PRIu64"\n", *addr););
++
++ (*addr)++;
++ /* Can't wrap here */
++ }
++
++ return 0;
++}
++
++/**
++ * foreach_descriptor - go through the active part of the log
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @start: the first log header in the active region
++ * @end: the last log header (don't process the contents of this entry)
++ * @pass: the recovery pass
++ *
++ * Call a given function once for every log descriptor in the active
++ * portion of the log.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++foreach_descriptor(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t start, uint64_t end,
++ unsigned int pass)
++{
++ struct gfs_log_header header;
++ struct gfs_log_descriptor desc;
++ struct buffer_head *bh;
++ int error = 0;
++
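++ /* Each pass through this loop consumes one log header and the
++    chain of descriptors that follows it; the LAST descriptor marks
++    the end of that transaction and leaves us at the next header. */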
++ while (start != end) {
++ GFS_ASSERT_SBD(!do_mod(start, sdp->sd_sb.sb_seg_size), sdp,);
++
++ error = get_log_header(sdp, jdesc, gl, bn2seg(start), &header);
++ if (error < 0)
++ return error;
++
++ GFS_ASSERT_SBD(!error, sdp,); /* Corrupt headers are bad */
++ GFS_ASSERT_SBD(header.lh_first == start, sdp,
++ gfs_log_header_print(&header);
++ printk("start = %"PRIu64"\n", start););
++
++ start++;
++
++ for (;;) {
++ error = gfs_dread(sdp, start, gl, DIO_START | DIO_WAIT, &bh);
++ if (error)
++ return error;
++
++ gfs_metatype_check(sdp, bh, GFS_METATYPE_LD);
++ gfs_desc_in(&desc, bh->b_data);
++
++ brelse(bh);
++
++ if (desc.ld_type != GFS_LOG_DESC_LAST) {
++ error = LO_SCAN_ELEMENTS(sdp, jdesc, gl, start,
++ &desc, pass);
++ if (error)
++ return error;
++
++ while (desc.ld_length--) {
++ error = gfs_increment_blkno(sdp, jdesc, gl,
++ &start, TRUE);
++ if (error)
++ return error;
++ }
++ } else {
++ while (desc.ld_length--) {
++ error = gfs_increment_blkno(sdp, jdesc, gl,
++ &start,
++ !!desc.ld_length);
++ if (error)
++ return error;
++ }
++
++ break;
++ }
++ }
++ }
++
++ return error;
++}
++
++/**
++ * clean_journal - mark a dirty journal as being clean
++ * @sdp: the filesystem
++ * @jdesc: the journal
++ * @gl: the journal's glock
++ * @head: the current head of the log to start from
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++clean_journal(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, struct gfs_log_header *head)
++{
++ struct gfs_log_header lh;
++ struct gfs_log_descriptor desc;
++ struct buffer_head *bh;
++ uint32_t seg;
++ uint64_t blkno;
++ int error;
++
++ seg = bn2seg(head->lh_first);
++
++ for (;;) {
++ if (++seg == jdesc->ji_nsegment)
++ seg = 0;
++
++ error = get_log_header(sdp, jdesc, gl, seg, &lh);
++ if (error < 0)
++ return error;
++
++ /* Rewrite corrupt header blocks */
++
++ if (error == 1) {
++ bh = gfs_dgetblk(sdp, seg2bn(seg), gl);
++
++ gfs_prep_new_buffer(bh);
++ gfs_buffer_clear(bh);
++ gfs_log_header_out(head, bh->b_data);
++ gfs_log_header_out(head,
++ bh->b_data + GFS_BASIC_BLOCK -
++ sizeof(struct gfs_log_header));
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY | DIO_START | DIO_WAIT);
++ brelse(bh);
++ if (error)
++ return error;
++ }
++
++ /* Stop when we get to the end of the log. */
++
++ if (lh.lh_sequence < head->lh_sequence)
++ break;
++ }
++
++ /* Build a "last" descriptor for the transaction we are
++ about to commit by writing the shutdown header. */
++
++ memset(&desc, 0, sizeof(struct gfs_log_descriptor));
++ desc.ld_header.mh_magic = GFS_MAGIC;
++ desc.ld_header.mh_type = GFS_METATYPE_LD;
++ desc.ld_header.mh_format = GFS_FORMAT_LD;
++ desc.ld_type = GFS_LOG_DESC_LAST;
++ desc.ld_length = 0;
++
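++ /* Count the non-header blocks between the old head and the new
++    one; that count becomes the length of the "last" descriptor. */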
++ for (blkno = head->lh_first + 1; blkno != seg2bn(seg);) {
++ if (do_mod(blkno, sdp->sd_sb.sb_seg_size))
++ desc.ld_length++;
++ if (++blkno == seg2bn(jdesc->ji_nsegment))
++ blkno -= jdesc->ji_nsegment * sdp->sd_sb.sb_seg_size;
++ }
++
++ /* Write the descriptor */
++
++ bh = gfs_dgetblk(sdp, head->lh_first + 1, gl);
++
++ gfs_prep_new_buffer(bh);
++ gfs_buffer_clear(bh);
++ gfs_desc_out(&desc, bh->b_data);
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY | DIO_START | DIO_WAIT);
++ brelse(bh);
++ if (error)
++ return error;
++
++ /* Build a log header that says the journal is clean */
++
++ memset(&lh, 0, sizeof(struct gfs_log_header));
++ lh.lh_header.mh_magic = GFS_MAGIC;
++ lh.lh_header.mh_type = GFS_METATYPE_LH;
++ lh.lh_header.mh_format = GFS_FORMAT_LH;
++ lh.lh_flags = GFS_LOG_HEAD_UNMOUNT;
++ lh.lh_first = seg2bn(seg);
++ lh.lh_sequence = head->lh_sequence + 1;
++ /* Don't care about tail */
++ lh.lh_last_dump = head->lh_last_dump;
++
++ /* Write the header */
++
++ bh = gfs_dgetblk(sdp, lh.lh_first, gl);
++
++ gfs_prep_new_buffer(bh);
++ gfs_buffer_clear(bh);
++ gfs_log_header_out(&lh, bh->b_data);
++ gfs_log_header_out(&lh,
++ bh->b_data + GFS_BASIC_BLOCK -
++ sizeof(struct gfs_log_header));
++
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY | DIO_START | DIO_WAIT);
++ brelse(bh);
++
++ return error;
++}
++
++/**
++ * gfs_recover_journal - recover a given journal
++ * @sdp: the filesystem
++ * @jid: the number of the journal to recover
++ * @jdesc: the struct gfs_jindex describing the journal
++ * @wait: Don't return until the journal is clean (or an error is encountered)
++ *
++ * Acquire the journal's lock, check whether the journal is clean, and
++ * do recovery if necessary.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_recover_journal(struct gfs_sbd *sdp,
++ unsigned int jid, struct gfs_jindex *jdesc,
++ int wait)
++{
++ struct gfs_log_header head;
++ struct gfs_holder j_gh, t_gh;
++ unsigned long t;
++ int error;
++
++ printk("GFS: fsid=%s: jid=%u: Trying to acquire journal lock...\n",
++ sdp->sd_fsname, jid);
++
++ /* Acquire the journal lock so we can do recovery */
++
++ error = gfs_glock_nq_num(sdp,
++ jdesc->ji_addr, &gfs_meta_glops,
++ LM_ST_EXCLUSIVE,
++ LM_FLAG_NOEXP |
++ ((wait) ? 0 : LM_FLAG_TRY) |
++ GL_NOCACHE, &j_gh);
++ switch (error) {
++ case 0:
++ break;
++
++ case GLR_TRYFAILED:
++ GFS_ASSERT_SBD(!wait, sdp,);
++ printk("GFS: fsid=%s: jid=%u: Busy\n", sdp->sd_fsname, jid);
++ error = 0;
++
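++ /* fall through */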
++ default:
++ goto fail;
++ }
++
++ printk("GFS: fsid=%s: jid=%u: Looking at journal...\n",
++ sdp->sd_fsname, jid);
++
++ error = gfs_find_jhead(sdp, jdesc, j_gh.gh_gl, &head);
++ if (error)
++ goto fail_gunlock;
++
++ if (!(head.lh_flags & GFS_LOG_HEAD_UNMOUNT)) {
++ if (test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ printk("GFS: fsid=%s: jid=%u: Can't replay: read-only FS\n",
++ sdp->sd_fsname, jid);
++ error = -EROFS;
++ goto fail_gunlock;
++ }
++
++ printk("GFS: fsid=%s: jid=%u: Acquiring the transaction lock...\n",
++ sdp->sd_fsname, jid);
++
++ t = jiffies;
++
++ /* Acquire an exclusive hold on the transaction lock */
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl,
++ LM_ST_EXCLUSIVE,
++ LM_FLAG_NOEXP |
++ LM_FLAG_PRIORITY |
++ GL_NOCACHE,
++ &t_gh);
++ if (error)
++ goto fail_gunlock;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ printk("GFS: fsid=%s: jid=%u: Can't replay: read-only FS\n",
++ sdp->sd_fsname, jid);
++ error = -EROFS;
++ goto fail_gunlock_tr;
++ }
++
++ printk("GFS: fsid=%s: jid=%u: Replaying journal...\n",
++ sdp->sd_fsname, jid);
++
++ set_bit(GLF_DIRTY, &j_gh.gh_gl->gl_flags);
++
++ LO_BEFORE_SCAN(sdp, jid, &head, GFS_RECPASS_A1);
++
++ error = foreach_descriptor(sdp, jdesc, j_gh.gh_gl,
++ head.lh_tail, head.lh_first,
++ GFS_RECPASS_A1);
++ if (error)
++ goto fail_gunlock_tr;
++
++ LO_AFTER_SCAN(sdp, jid, GFS_RECPASS_A1);
++
++ gfs_replay_wait(sdp);
++
++ error = clean_journal(sdp, jdesc, j_gh.gh_gl, &head);
++ if (error)
++ goto fail_gunlock_tr;
++
++ gfs_glock_dq_uninit(&t_gh);
++
++ t = DIV_RU(jiffies - t, HZ);
++
++ printk("GFS: fsid=%s: jid=%u: Journal replayed in %lus\n",
++ sdp->sd_fsname, jid, t);
++ }
++
++ sdp->sd_lockstruct.ls_ops->lm_recovery_done(sdp->sd_lockstruct.ls_lockspace,
++ jid,
++ LM_RD_SUCCESS);
++
++ gfs_glock_dq_uninit(&j_gh);
++
++ printk("GFS: fsid=%s: jid=%u: Done\n", sdp->sd_fsname, jid);
++
++ return 0;
++
++ fail_gunlock_tr:
++ gfs_replay_wait(sdp);
++ gfs_glock_dq_uninit(&t_gh);
++
++ fail_gunlock:
++ gfs_glock_dq_uninit(&j_gh);
++
++ printk("GFS: fsid=%s: jid=%u: %s\n",
++ sdp->sd_fsname, jid, (error) ? "Failed" : "Done");
++
++ fail:
++ sdp->sd_lockstruct.ls_ops->lm_recovery_done(sdp->sd_lockstruct.ls_lockspace,
++ jid,
++ LM_RD_GAVEUP);
++
++ return error;
++}
++
++/**
++ * gfs_check_journals - Recover any dirty journals
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_check_journals(struct gfs_sbd *sdp)
++{
++ struct dirty_j *dj;
++
++ for (;;) {
++ dj = get_dirty_j(sdp);
++ if (!dj)
++ break;
++
++ down(&sdp->sd_jindex_lock);
++
++ if (dj->dj_jid != sdp->sd_lockstruct.ls_jid &&
++ dj->dj_jid < sdp->sd_journals) {
++ memcpy(&dj->dj_desc,
++ sdp->sd_jindex + dj->dj_jid,
++ sizeof(struct gfs_jindex));
++ up(&sdp->sd_jindex_lock);
++
++ gfs_recover_journal(sdp,
++ dj->dj_jid, &dj->dj_desc,
++ FALSE);
++
++ } else {
++ up(&sdp->sd_jindex_lock);
++ sdp->sd_lockstruct.ls_ops->lm_recovery_done(sdp->sd_lockstruct.ls_lockspace,
++ dj->dj_jid, LM_RD_GAVEUP);
++ }
++
++ kfree(dj);
++ }
++}
++
++/**
++ * gfs_recover_dump - recover the log elements in this machine's journal
++ * @sdp: the filesystem
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_recover_dump(struct gfs_sbd *sdp)
++{
++ struct gfs_log_header head;
++ int error;
++
++ error = gfs_find_jhead(sdp, &sdp->sd_jdesc, sdp->sd_journal_gh.gh_gl,
++ &head);
++ if (error)
++ goto fail;
++
++ GFS_ASSERT_SBD(head.lh_flags & GFS_LOG_HEAD_UNMOUNT, sdp,);
++ if (!head.lh_last_dump)
++ return error;
++
++ printk("GFS: fsid=%s: Scanning for log elements...\n",
++ sdp->sd_fsname);
++
++ LO_BEFORE_SCAN(sdp, sdp->sd_lockstruct.ls_jid, &head, GFS_RECPASS_B1);
++
++ error = foreach_descriptor(sdp, &sdp->sd_jdesc, sdp->sd_journal_gh.gh_gl,
++ head.lh_last_dump, head.lh_first,
++ GFS_RECPASS_B1);
++ if (error)
++ goto fail;
++
++ LO_AFTER_SCAN(sdp, sdp->sd_lockstruct.ls_jid, GFS_RECPASS_B1);
++
++ /* If we crash during the next log dump, all the intermediate headers
++ in that transaction must point to the last log dump before the
++ one we're making, so that we don't lose it. */
++
++ sdp->sd_log_dump_last = head.lh_last_dump;
++
++ printk("GFS: fsid=%s: Done\n", sdp->sd_fsname);
++
++ return 0;
++
++ fail:
++ printk("GFS: fsid=%s: Failed\n", sdp->sd_fsname);
++
++ return error;
++}
+diff -urN linux-orig/fs/gfs/recovery.h linux-patched/fs/gfs/recovery.h
+--- linux-orig/fs/gfs/recovery.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/recovery.h 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,36 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RECOVERY_DOT_H__
++#define __RECOVERY_DOT_H__
++
++#define GFS_RECPASS_A1 (12)
++#define GFS_RECPASS_B1 (14)
++
++void gfs_add_dirty_j(struct gfs_sbd *sdp, unsigned int jid);
++void gfs_clear_dirty_j(struct gfs_sbd *sdp);
++
++int gfs_find_jhead(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, struct gfs_log_header *head);
++int gfs_increment_blkno(struct gfs_sbd *sdp, struct gfs_jindex *jdesc,
++ struct gfs_glock *gl, uint64_t *addr,
++ int skip_headers);
++
++int gfs_recover_journal(struct gfs_sbd *sdp,
++ unsigned int jid, struct gfs_jindex *jdesc,
++ int wait);
++void gfs_check_journals(struct gfs_sbd *sdp);
++
++int gfs_recover_dump(struct gfs_sbd *sdp);
++
++#endif /* __RECOVERY_DOT_H__ */
+diff -urN linux-orig/fs/gfs/rgrp.c linux-patched/fs/gfs/rgrp.c
+--- linux-orig/fs/gfs/rgrp.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/rgrp.c 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,1932 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "bits.h"
++#include "dio.h"
++#include "file.h"
++#include "glock.h"
++#include "glops.h"
++#include "rgrp.h"
++#include "super.h"
++#include "trans.h"
++
++/**
++ * mhc_hash: find the mhc hash bucket for a buffer
++ * @bh: the buffer
++ *
++ * Returns: The bucket number
++ */
++
++static unsigned int
++mhc_hash(struct buffer_head *bh)
++{
++ uint64_t blkno;
++ unsigned int h;
++
++ blkno = bh->b_blocknr;
++ h = gfs_hash(&blkno, sizeof(uint64_t)) & GFS_MHC_HASH_MASK;
++
++ return h;
++}
++
++/**
++ * mhc_trim - throw away cached metadata headers until at most @max remain
++ * @sdp: the filesystem
++ * @max: the maximum number of cached headers to keep
++ *
++ */
++
++static void
++mhc_trim(struct gfs_sbd *sdp, unsigned int max)
++{
++ struct gfs_meta_header_cache *mc;
++
++ for (;;) {
++ spin_lock(&sdp->sd_mhc_lock);
++ if (list_empty(&sdp->sd_mhc_single)) {
++ spin_unlock(&sdp->sd_mhc_lock);
++ return;
++ } else {
++ mc = list_entry(sdp->sd_mhc_single.prev,
++ struct gfs_meta_header_cache,
++ mc_list_single);
++ list_del(&mc->mc_list_hash);
++ list_del(&mc->mc_list_single);
++ list_del(&mc->mc_list_rgd);
++ spin_unlock(&sdp->sd_mhc_lock);
++
++ kmem_cache_free(gfs_mhc_cachep, mc);
++ atomic_dec(&sdp->sd_mhc_count);
++
++ if (atomic_read(&sdp->sd_mhc_count) <= max)
++ return;
++ }
++ }
++}
++
++/**
++ * gfs_mhc_add - add buffers to the cache of metadata
++ * @rgd: a RG
++ * @bh: an array of buffers
++ * @num: the number of buffers in the array
++ *
++ */
++
++void
++gfs_mhc_add(struct gfs_rgrpd *rgd,
++ struct buffer_head **bh, unsigned int num)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_meta_header_cache *mc;
++ unsigned int x;
++ uint64_t gen;
++ struct list_head *head;
++
++ for (x = 0; x < num; x++) {
++ gfs_meta_check(sdp, bh[x]);
++
++ RETRY_MALLOC(mc = kmem_cache_alloc(gfs_mhc_cachep, GFP_KERNEL), mc);
++ memset(mc, 0, sizeof(struct gfs_meta_header_cache));
++
++ mc->mc_block = bh[x]->b_blocknr;
++ memcpy(&mc->mc_mh, bh[x]->b_data,
++ sizeof(struct gfs_meta_header));
++
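++ /* Cache the header with its generation bumped by two, so that a
++    block rebuilt from this cache (see gfs_mhc_fish) carries a newer
++    generation number than the copy on disk. */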
++ gen = gfs64_to_cpu(mc->mc_mh.mh_generation) + 2;
++ mc->mc_mh.mh_generation = cpu_to_gfs64(gen);
++
++ head = &sdp->sd_mhc[mhc_hash(bh[x])];
++
++ spin_lock(&sdp->sd_mhc_lock);
++ list_add(&mc->mc_list_hash, head);
++ list_add(&mc->mc_list_single, &sdp->sd_mhc_single);
++ list_add(&mc->mc_list_rgd, &rgd->rd_mhc);
++ spin_unlock(&sdp->sd_mhc_lock);
++
++ atomic_inc(&sdp->sd_mhc_count);
++ }
++
++ if (atomic_read(&sdp->sd_mhc_count) > sdp->sd_tune.gt_max_mhc)
++ mhc_trim(sdp, sdp->sd_tune.gt_max_mhc);
++}
++
++/**
++ * gfs_mhc_fish - Try to fill in a buffer with data from the cache
++ * @sdp: the filesystem
++ * @bh: the buffer to fill in
++ *
++ * Returns: TRUE if the buffer was cached, FALSE otherwise
++ */
++
++int
++gfs_mhc_fish(struct gfs_sbd *sdp, struct buffer_head *bh)
++{
++ struct list_head *tmp, *head;
++ struct gfs_meta_header_cache *mc;
++
++ head = &sdp->sd_mhc[mhc_hash(bh)];
++
++ spin_lock(&sdp->sd_mhc_lock);
++
++ for (tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ mc = list_entry(tmp, struct gfs_meta_header_cache, mc_list_hash);
++ if (mc->mc_block != bh->b_blocknr)
++ continue;
++
++ list_del(&mc->mc_list_hash);
++ list_del(&mc->mc_list_single);
++ list_del(&mc->mc_list_rgd);
++ spin_unlock(&sdp->sd_mhc_lock);
++
++ gfs_prep_new_buffer(bh);
++ memcpy(bh->b_data, &mc->mc_mh,
++ sizeof(struct gfs_meta_header));
++
++ kmem_cache_free(gfs_mhc_cachep, mc);
++ atomic_dec(&sdp->sd_mhc_count);
++
++ return TRUE;
++ }
++
++ spin_unlock(&sdp->sd_mhc_lock);
++
++ return FALSE;
++}
++
++/**
++ * gfs_mhc_zap - Get rid of the data in the cache of metadata headers
++ * @rgd: a RG
++ *
++ */
++
++void
++gfs_mhc_zap(struct gfs_rgrpd *rgd)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_meta_header_cache *mc;
++
++ spin_lock(&sdp->sd_mhc_lock);
++
++ while (!list_empty(&rgd->rd_mhc)) {
++ mc = list_entry(rgd->rd_mhc.next,
++ struct gfs_meta_header_cache,
++ mc_list_rgd);
++
++ list_del(&mc->mc_list_hash);
++ list_del(&mc->mc_list_single);
++ list_del(&mc->mc_list_rgd);
++ spin_unlock(&sdp->sd_mhc_lock);
++
++ kmem_cache_free(gfs_mhc_cachep, mc);
++ atomic_dec(&sdp->sd_mhc_count);
++
++ spin_lock(&sdp->sd_mhc_lock);
++ }
++
++ spin_unlock(&sdp->sd_mhc_lock);
++}
++
++/**
++ * depend_hash - Turn a formal inode number into a hash bucket number
++ * @formal_ino: the formal inode number
++ *
++ * Returns: The number of the corresponding hash bucket
++ */
++
++static unsigned int
++depend_hash(uint64_t formal_ino)
++{
++ unsigned int h;
++
++ h = gfs_hash(&formal_ino, sizeof(uint64_t));
++ h &= GFS_DEPEND_HASH_MASK;
++
++ return h;
++}
++
++/**
++ * depend_sync_one - sync the inode behind a dependency and discard it
++ * @sdp: the filesystem
++ * @gd: the dependency
++ *
++ */
++
++static void
++depend_sync_one(struct gfs_sbd *sdp, struct gfs_depend *gd)
++{
++ struct gfs_glock *gl;
++
++ spin_lock(&sdp->sd_depend_lock);
++ list_del(&gd->gd_list_hash);
++ spin_unlock(&sdp->sd_depend_lock);
++ list_del(&gd->gd_list_rgd);
++
++ gl = gfs_glock_find(sdp,
++ &(struct lm_lockname){gd->gd_formal_ino,
++ LM_TYPE_INODE});
++ if (gl) {
++ if (gl->gl_ops->go_sync)
++ gl->gl_ops->go_sync(gl,
++ DIO_METADATA |
++ DIO_INVISIBLE);
++ gfs_glock_put(gl);
++ }
++
++ kfree(gd);
++ atomic_dec(&sdp->sd_depend_count);
++}
++
++/**
++ * depend_sync_old - sync dependencies that have passed their timeout
++ * @rgd: the resource group
++ *
++ */
++
++static void
++depend_sync_old(struct gfs_rgrpd *rgd)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_depend *gd;
++
++ for (;;) {
++ gd = list_entry(rgd->rd_depend.prev,
++ struct gfs_depend,
++ gd_list_rgd);
++
++ if (time_before(jiffies,
++ gd->gd_time +
++ sdp->sd_tune.gt_depend_secs * HZ))
++ return;
++
++ depend_sync_one(sdp, gd);
++ }
++}
++
++/**
++ * gfs_depend_add - add a dependency on an inode to a resource group
++ * @rgd: the resource group
++ * @formal_ino: the formal inode number of the inode
++ *
++ */
++
++void
++gfs_depend_add(struct gfs_rgrpd *rgd, uint64_t formal_ino)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct list_head *head, *tmp;
++ struct gfs_depend *gd;
++
++ head = &sdp->sd_depend[depend_hash(formal_ino)];
++
++ spin_lock(&sdp->sd_depend_lock);
++
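++ /* If this inode already has a dependency on this RG, just move it
++    to the front of both lists and refresh its timestamp. */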
++ for (tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ gd = list_entry(tmp, struct gfs_depend, gd_list_hash);
++ if (gd->gd_rgd == rgd &&
++ gd->gd_formal_ino == formal_ino) {
++ list_move(&gd->gd_list_hash, head);
++ spin_unlock(&sdp->sd_depend_lock);
++ list_move(&gd->gd_list_rgd, &rgd->rd_depend);
++ gd->gd_time = jiffies;
++ return;
++ }
++ }
++
++ spin_unlock(&sdp->sd_depend_lock);
++
++ gd = gmalloc(sizeof(struct gfs_depend));
++ memset(gd, 0, sizeof(struct gfs_depend));
++
++ gd->gd_rgd = rgd;
++ gd->gd_formal_ino = formal_ino;
++ gd->gd_time = jiffies;
++
++ spin_lock(&sdp->sd_depend_lock);
++ list_add(&gd->gd_list_hash, head);
++ spin_unlock(&sdp->sd_depend_lock);
++ list_add(&gd->gd_list_rgd, &rgd->rd_depend);
++
++ atomic_inc(&sdp->sd_depend_count);
++
++ depend_sync_old(rgd);
++}
++
++/**
++ * gfs_depend_sync - sync all dependencies for a resource group
++ * @rgd: the resource group
++ *
++ */
++
++void
++gfs_depend_sync(struct gfs_rgrpd *rgd)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_depend *gd;
++
++ while (!list_empty(&rgd->rd_depend)) {
++ gd = list_entry(rgd->rd_depend.next,
++ struct gfs_depend,
++ gd_list_rgd);
++ depend_sync_one(sdp, gd);
++ }
++}
++
++/**
++ * rgrp_verify - Verify that a resource group is consistent
++ * @rgd: the rgrp
++ *
++ * The caller should already be holding the rgrp's glock.
++ */
++
++static void
++rgrp_verify(struct gfs_rgrpd *rgd)
++{
++ struct gfs_bitmap *bits = NULL;
++ uint32_t length = rgd->rd_ri.ri_length;
++ uint32_t count[4], tmp;
++ int buf, x;
++
++ memset(count, 0, 4 * sizeof(uint32_t));
++
++ for (buf = 0; buf < length; buf++) {
++ bits = &rgd->rd_bits[buf];
++ for (x = 0; x < 4; x++)
++ count[x] += gfs_bitcount(rgd,
++ rgd->rd_bh[buf]->b_data +
++ bits->bi_offset,
++ bits->bi_len, x);
++ }
++
++ GFS_ASSERT_RGRPD(count[0] == rgd->rd_rg.rg_free, rgd,
++ printk("free data mismatch: %u != %u\n",
++ count[0], rgd->rd_rg.rg_free););
++
++ tmp = rgd->rd_ri.ri_data -
++ (rgd->rd_rg.rg_usedmeta + rgd->rd_rg.rg_freemeta) -
++ (rgd->rd_rg.rg_useddi + rgd->rd_rg.rg_freedi) -
++ rgd->rd_rg.rg_free;
++ GFS_ASSERT_RGRPD(count[1] == tmp, rgd,
++ printk("used data mismatch: %u != %u\n",
++ count[1], tmp););
++
++ GFS_ASSERT_RGRPD(count[2] == rgd->rd_rg.rg_freemeta, rgd,
++ printk("free metadata mismatch: %u != %u\n",
++ count[2], rgd->rd_rg.rg_freemeta););
++
++ tmp = rgd->rd_rg.rg_usedmeta +
++ (rgd->rd_rg.rg_useddi + rgd->rd_rg.rg_freedi);
++ GFS_ASSERT_RGRPD(count[3] == tmp, rgd,
++ printk("used metadata mismatch: %u != %u\n",
++ count[3], tmp););
++}
++
++/**
++ * gfs_blk2rgrpd - Find resource group for a given data block number
++ * @sdp: The GFS superblock
++ * @blk: The data block number
++ *
++ * Returns: The resource group, or NULL if not found
++ */
++
++struct gfs_rgrpd *
++gfs_blk2rgrpd(struct gfs_sbd *sdp, uint64_t blk)
++{
++ struct list_head *tmp, *head;
++ struct gfs_rgrpd *rgd = NULL;
++ struct gfs_rindex *ri;
++
++ spin_lock(&sdp->sd_rg_mru_lock);
++
++ for (head = &sdp->sd_rg_mru_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ rgd = list_entry(tmp, struct gfs_rgrpd, rd_list_mru);
++ ri = &rgd->rd_ri;
++
++ if (ri->ri_data1 <= blk && blk < ri->ri_data1 + ri->ri_data) {
++ list_move(&rgd->rd_list_mru, &sdp->sd_rg_mru_list);
++ spin_unlock(&sdp->sd_rg_mru_lock);
++ return rgd;
++ }
++ }
++
++ spin_unlock(&sdp->sd_rg_mru_lock);
++
++ return NULL;
++}
++
++/**
++ * gfs_rgrpd_get_first - get the first RG
++ * @sdp: The GFS superblock
++ *
++ * Returns: The first rgrp in the filesystem
++ */
++
++struct gfs_rgrpd *
++gfs_rgrpd_get_first(struct gfs_sbd *sdp)
++{
++ GFS_ASSERT_SBD(!list_empty(&sdp->sd_rglist), sdp,);
++ return list_entry(sdp->sd_rglist.next, struct gfs_rgrpd, rd_list);
++}
++
++/**
++ * gfs_rgrpd_get_next - get the next RG
++ * @rgd: A RG
++ *
++ * Returns: The next rgrp
++ */
++
++struct gfs_rgrpd *
++gfs_rgrpd_get_next(struct gfs_rgrpd *rgd)
++{
++ if (rgd->rd_list.next == &rgd->rd_sbd->sd_rglist)
++ return NULL;
++ return list_entry(rgd->rd_list.next, struct gfs_rgrpd, rd_list);
++}
++
++/**
++ * clear_rgrpdi - Clear up rgrps
++ * @sdp: The GFS superblock
++ *
++ */
++
++void
++clear_rgrpdi(struct gfs_sbd *sdp)
++{
++ struct gfs_rgrpd *rgd;
++ struct gfs_glock *gl;
++
++ sdp->sd_rg_forward = NULL;
++
++ while (!list_empty(&sdp->sd_rg_recent)) {
++ rgd = list_entry(sdp->sd_rg_recent.next,
++ struct gfs_rgrpd, rd_recent);
++ list_del(&rgd->rd_recent);
++ }
++
++ while (!list_empty(&sdp->sd_rglist)) {
++ rgd = list_entry(sdp->sd_rglist.next,
++ struct gfs_rgrpd, rd_list);
++ gl = rgd->rd_gl;
++
++ list_del(&rgd->rd_list);
++ list_del(&rgd->rd_list_mru);
++
++ if (gl) {
++ gfs_glock_force_drop(gl);
++ if (atomic_read(&gl->gl_lvb_count))
++ gfs_lvb_unhold(gl);
++ gl2rgd(gl) = NULL;
++ gfs_glock_put(gl);
++ }
++
++ if (rgd->rd_bits)
++ kfree(rgd->rd_bits);
++ if (rgd->rd_bh)
++ kfree(rgd->rd_bh);
++
++ kfree(rgd);
++ }
++}
++
++/**
++ * gfs_clear_rgrpd - Clear up rgrps
++ * @sdp: The GFS superblock
++ *
++ */
++
++void
++gfs_clear_rgrpd(struct gfs_sbd *sdp)
++{
++ down(&sdp->sd_rindex_lock);
++ clear_rgrpdi(sdp);
++ up(&sdp->sd_rindex_lock);
++}
++
++/**
++ * compute_bitstructs - Compute the bitmap sizes
++ * @rgd: The resource group descriptor
++ *
++ */
++
++static void
++compute_bitstructs(struct gfs_rgrpd *rgd)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_bitmap *bits;
++ uint32_t length = rgd->rd_ri.ri_length;
++ uint32_t bytes_left, bytes;
++ int x;
++
++ rgd->rd_bits = gmalloc(length * sizeof(struct gfs_bitmap));
++ memset(rgd->rd_bits, 0, length * sizeof(struct gfs_bitmap));
++
++ bytes_left = rgd->rd_ri.ri_bitbytes;
++
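++ /* Block 0 of the RG starts with a struct gfs_rgrp; the remaining
++    bitmap blocks start with a plain struct gfs_meta_header; the
++    final block holds whatever bitmap bytes are left over. */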
++ for (x = 0; x < length; x++) {
++ bits = &rgd->rd_bits[x];
++
++ if (length == 1) {
++ bytes = bytes_left;
++ bits->bi_offset = sizeof(struct gfs_rgrp);
++ bits->bi_start = 0;
++ bits->bi_len = bytes;
++ } else if (x == 0) {
++ bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs_rgrp);
++ bits->bi_offset = sizeof(struct gfs_rgrp);
++ bits->bi_start = 0;
++ bits->bi_len = bytes;
++ } else if (x + 1 == length) {
++ bytes = bytes_left;
++ bits->bi_offset = sizeof(struct gfs_meta_header);
++ bits->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
++ bits->bi_len = bytes;
++ } else {
++ bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs_meta_header);
++ bits->bi_offset = sizeof(struct gfs_meta_header);
++ bits->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
++ bits->bi_len = bytes;
++ }
++
++ bytes_left -= bytes;
++ }
++
++ GFS_ASSERT_RGRPD(!bytes_left, rgd,);
++ GFS_ASSERT_RGRPD((rgd->rd_bits[length - 1].bi_start +
++ rgd->rd_bits[length - 1].bi_len) * GFS_NBBY ==
++ rgd->rd_ri.ri_data, rgd,
++ printk("start=%u len=%u offset=%u\n",
++ rgd->rd_bits[length - 1].bi_start,
++ rgd->rd_bits[length - 1].bi_len,
++ rgd->rd_bits[length - 1].bi_offset);
++ gfs_rindex_print(&rgd->rd_ri););
++
++ rgd->rd_bh = gmalloc(length * sizeof(struct buffer_head *));
++ memset(rgd->rd_bh, 0, length * sizeof(struct buffer_head *));
++}
++
++/**
++ * gfs_ri_update - Pull in a new resource index from the disk
++ * @ip: the rindex inode
++ *
++ * Returns: 0 on successful update, error code otherwise
++ */
++
++static int
++gfs_ri_update(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_rgrpd *rgd;
++ char buf[sizeof(struct gfs_rindex)];
++ int error;
++
++ GFS_ASSERT_SBD(!do_mod(ip->i_di.di_size, sizeof(struct gfs_rindex)),
++ sdp,);
++
++ clear_rgrpdi(sdp);
++
++ for (sdp->sd_rgcount = 0;; sdp->sd_rgcount++) {
++ error = gfs_internal_read(ip, buf,
++ sdp->sd_rgcount *
++ sizeof(struct gfs_rindex),
++ sizeof(struct gfs_rindex));
++ if (!error)
++ break;
++ if (error != sizeof(struct gfs_rindex)) {
++ if (error > 0)
++ error = -EIO;
++ goto fail;
++ }
++
++ rgd = gmalloc(sizeof(struct gfs_rgrpd));
++ memset(rgd, 0, sizeof(struct gfs_rgrpd));
++
++ INIT_LIST_HEAD(&rgd->rd_mhc);
++ INIT_LIST_HEAD(&rgd->rd_depend);
++ rgd->rd_sbd = sdp;
++
++ list_add_tail(&rgd->rd_list, &sdp->sd_rglist);
++ list_add_tail(&rgd->rd_list_mru, &sdp->sd_rg_mru_list);
++
++ gfs_rindex_in(&rgd->rd_ri, buf);
++
++ compute_bitstructs(rgd);
++
++ error = gfs_glock_get(sdp, rgd->rd_ri.ri_addr, &gfs_rgrp_glops,
++ CREATE, &rgd->rd_gl);
++ if (error)
++ goto fail;
++
++ error = gfs_lvb_hold(rgd->rd_gl);
++ if (error)
++ goto fail;
++
++ gl2rgd(rgd->rd_gl) = rgd;
++ rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
++ }
++
++ sdp->sd_riinode_vn = ip->i_gl->gl_vn;
++
++ return 0;
++
++ fail:
++ clear_rgrpdi(sdp);
++
++ return error;
++}
++
++/**
++ * gfs_rindex_hold - Grab a lock on the rindex
++ * @sdp: The GFS superblock
++ * @ri_gh: the glock holder
++ *
++ * We grab a lock on the rindex inode to make sure that it doesn't
++ * change whilst we are performing an operation. We keep this lock
++ * for quite long periods of time compared to other locks. This
++ * doesn't matter, since it's shared and it is very, very rarely
++ * accessed in the exclusive mode.
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_rindex_hold(struct gfs_sbd *sdp, struct gfs_holder *ri_gh)
++{
++ struct gfs_inode *ip = sdp->sd_riinode;
++ struct gfs_glock *gl = ip->i_gl;
++ int error;
++
++ error = gfs_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
++ if (error)
++ return error;
++
++ if (sdp->sd_riinode_vn != gl->gl_vn) {
++ down(&sdp->sd_rindex_lock);
++ if (sdp->sd_riinode_vn != gl->gl_vn) {
++ error = gfs_ri_update(ip);
++ if (error)
++ gfs_glock_dq_uninit(ri_gh);
++ }
++ up(&sdp->sd_rindex_lock);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_rgrp_read - Read in a RG's bitmaps
++ * @rgd: the struct gfs_rgrpd describing the RG to read in
++ *
++ * Read in RG bitmaps. Must call gfs_rgrp_relse() to free the bitmaps.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_rgrp_read(struct gfs_rgrpd *rgd)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_glock *gl = rgd->rd_gl;
++ unsigned int x, length = rgd->rd_ri.ri_length;
++ int error;
++
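++ /* Grab all the buffers first, then start all the reads at once,
++    and only then wait on them, so the I/O can overlap. */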
++ for (x = 0; x < length; x++) {
++ GFS_ASSERT_RGRPD(!rgd->rd_bh[x], rgd,);
++ rgd->rd_bh[x] = gfs_dgetblk(sdp, rgd->rd_ri.ri_addr + x, gl);
++ }
++
++ for (x = 0; x < length; x++) {
++ error = gfs_dreread(sdp, rgd->rd_bh[x], DIO_START);
++ if (error)
++ goto fail;
++ }
++
++ for (x = length; x--;) {
++ error = gfs_dreread(sdp, rgd->rd_bh[x], DIO_WAIT);
++ if (error)
++ goto fail;
++ gfs_metatype_check(sdp, rgd->rd_bh[x],
++ (x) ? GFS_METATYPE_RB : GFS_METATYPE_RG);
++ }
++
++ if (rgd->rd_rg_vn != gl->gl_vn) {
++ gfs_rgrp_in(&rgd->rd_rg, (rgd->rd_bh[0])->b_data);
++ rgd->rd_rg_vn = gl->gl_vn;
++ }
++
++ return 0;
++
++ fail:
++ for (x = 0; x < length; x++) {
++ brelse(rgd->rd_bh[x]);
++ rgd->rd_bh[x] = NULL;
++ }
++
++ return error;
++}
++
++/**
++ * gfs_rgrp_relse - Release RG bitmaps read in with gfs_rgrp_read()
++ * @rgd: the struct gfs_rgrpd describing the RG to read in
++ *
++ */
++
++void
++gfs_rgrp_relse(struct gfs_rgrpd *rgd)
++{
++ int x, length = rgd->rd_ri.ri_length;
++
++ for (x = 0; x < length; x++) {
++ brelse(rgd->rd_bh[x]);
++ rgd->rd_bh[x] = NULL;
++ }
++}
++
++/**
++ * gfs_rgrp_lvb_fill - copy RG usage data out of the struct gfs_rgrp into the struct gfs_rgrp_lvb
++ * @rgd: the resource group data structure
++ *
++ */
++
++void
++gfs_rgrp_lvb_fill(struct gfs_rgrpd *rgd)
++{
++ struct gfs_rgrp *rg = &rgd->rd_rg;
++ struct gfs_rgrp_lvb *rb = (struct gfs_rgrp_lvb *)rgd->rd_gl->gl_lvb;
++
++ rb->rb_magic = cpu_to_gfs32(GFS_MAGIC);
++ rb->rb_free = cpu_to_gfs32(rg->rg_free);
++ rb->rb_useddi = cpu_to_gfs32(rg->rg_useddi);
++ rb->rb_freedi = cpu_to_gfs32(rg->rg_freedi);
++ rb->rb_usedmeta = cpu_to_gfs32(rg->rg_usedmeta);
++ rb->rb_freemeta = cpu_to_gfs32(rg->rg_freemeta);
++
++ clear_bit(GLF_LVB_INVALID, &rgd->rd_gl->gl_flags);
++}
++
++/**
++ * gfs_rgrp_lvb_init - Init the data of a RG LVB
++ * @rgd: the resource group data structure
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_rgrp_lvb_init(struct gfs_rgrpd *rgd)
++{
++ struct gfs_glock *gl = rgd->rd_gl;
++ struct gfs_holder rgd_gh;
++ int error;
++
++ error = gfs_glock_nq_init(gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
++ if (!error) {
++ gfs_rgrp_lvb_fill(rgd);
++ gfs_glock_dq_uninit(&rgd_gh);
++ }
++
++ return error;
++}
++
++/**
++ * gfs_alloc_get - allocate a struct gfs_alloc structure for an inode
++ * @ip: the inode
++ *
++ * Returns: the struct gfs_alloc
++ */
++
++struct gfs_alloc *
++gfs_alloc_get(struct gfs_inode *ip)
++{
++ struct gfs_alloc *al = ip->i_alloc;
++
++ GFS_ASSERT_INODE(!al, ip,);
++
++ al = gmalloc(sizeof(struct gfs_alloc));
++ memset(al, 0, sizeof(struct gfs_alloc));
++
++ ip->i_alloc = al;
++
++ return al;
++}
++
++/**
++ * gfs_alloc_put - throw away the struct gfs_alloc for an inode
++ * @ip: the inode
++ *
++ */
++
++void
++gfs_alloc_put(struct gfs_inode *ip)
++{
++ struct gfs_alloc *al = ip->i_alloc;
++
++ GFS_ASSERT_INODE(al, ip,);
++
++ ip->i_alloc = NULL;
++ kfree(al);
++}
++
++/**
++ * try_rgrp_fit - See if a given reservation will fit in a given RG
++ * @rgd: the RG data
++ * @al: the struct gfs_alloc structure describing the reservation
++ *
++ * Sets al_rgd, al_reserved_data, and al_reserved_meta in @al on success.
++ *
++ * Returns: 1 on success, 0 on failure
++ */
++
++static int
++try_rgrp_fit(struct gfs_rgrpd *rgd, struct gfs_alloc *al)
++{
++ uint32_t freeblks = rgd->rd_rg.rg_free;
++ uint32_t freemeta = rgd->rd_rg.rg_freemeta;
++ uint32_t metares = al->al_requested_meta;
++ uint32_t datares = al->al_requested_data;
++
++ /* First take care of the data blocks required */
++
++ if (freeblks < al->al_requested_data)
++ return 0;
++
++ freeblks -= al->al_requested_data;
++
++ /* Then take care of the dinodes */
++
++ metares += al->al_requested_di;
++
++ /* Then take care of the metadata blocks */
++
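++ /* Metadata is carved out of free data blocks in fixed-size clumps;
++    reserve enough whole clumps to cover the metadata shortfall. */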
++ while (freemeta < metares) {
++ if (freeblks < GFS_META_CLUMP)
++ return 0;
++
++ freeblks -= GFS_META_CLUMP;
++ freemeta += GFS_META_CLUMP;
++
++ datares += GFS_META_CLUMP;
++ }
++
++ al->al_rgd = rgd;
++ al->al_reserved_meta = metares;
++ al->al_reserved_data = datares;
++
++ return 1;
++}
++
++/**
++ * recent_rgrp_first - get first RG from recent list
++ * @sdp: The GFS superblock
++ * @rglast: address of the rgrp used last
++ *
++ * Returns: The first rgrp in the recent list
++ */
++
++static struct gfs_rgrpd *
++recent_rgrp_first(struct gfs_sbd *sdp, uint64_t rglast)
++{
++ struct list_head *tmp, *head;
++ struct gfs_rgrpd *rgd = NULL;
++
++ spin_lock(&sdp->sd_rg_recent_lock);
++
++ if (list_empty(&sdp->sd_rg_recent))
++ goto out;
++
++ if (!rglast)
++ goto first;
++
++ for (head = &sdp->sd_rg_recent, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ rgd = list_entry(tmp, struct gfs_rgrpd, rd_recent);
++ if (rgd->rd_ri.ri_addr == rglast)
++ goto out;
++ }
++
++ first:
++ rgd = list_entry(sdp->sd_rg_recent.next, struct gfs_rgrpd, rd_recent);
++
++ out:
++ spin_unlock(&sdp->sd_rg_recent_lock);
++
++ return rgd;
++}
++
++/**
++ * recent_rgrp_next - get next RG from recent list
++ * @cur_rgd: current rgrp
++ *
++ * Returns: The next rgrp in the recent list
++ */
++
++static struct gfs_rgrpd *
++recent_rgrp_next(struct gfs_rgrpd *cur_rgd)
++{
++ struct gfs_sbd *sdp = cur_rgd->rd_sbd;
++ struct list_head *tmp, *head;
++ struct gfs_rgrpd *rgd;
++
++ spin_lock(&sdp->sd_rg_recent_lock);
++
++ for (head = &sdp->sd_rg_recent, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ rgd = list_entry(tmp, struct gfs_rgrpd, rd_recent);
++ if (rgd == cur_rgd) {
++ if (cur_rgd->rd_recent.next != &sdp->sd_rg_recent)
++ rgd = list_entry(cur_rgd->rd_recent.next,
++ struct gfs_rgrpd, rd_recent);
++ else
++ rgd = NULL;
++
++ goto out;
++ }
++ }
++
++ rgd = NULL;
++
++ out:
++ spin_unlock(&sdp->sd_rg_recent_lock);
++
++ return rgd;
++}
++
++/**
++ * recent_rgrp_remove - remove an RG from recent list
++ * @rgd: The rgrp to remove
++ *
++ */
++
++static void
++recent_rgrp_remove(struct gfs_rgrpd *rgd)
++{
++ spin_lock(&rgd->rd_sbd->sd_rg_recent_lock);
++ list_del(&rgd->rd_recent);
++ spin_unlock(&rgd->rd_sbd->sd_rg_recent_lock);
++}
++
++/**
++ * recent_rgrp_add - add an RG to recent list
++ * @new_rgd: The rgrp to add
++ *
++ */
++
++static void
++recent_rgrp_add(struct gfs_rgrpd *new_rgd)
++{
++ struct gfs_sbd *sdp = new_rgd->rd_sbd;
++ struct list_head *tmp, *head;
++ struct gfs_rgrpd *rgd = NULL;
++ unsigned int count = 0;
++ unsigned int max = sdp->sd_rgcount / gfs_num_journals(sdp);
++
++ spin_lock(&sdp->sd_rg_recent_lock);
++
++ for (head = &sdp->sd_rg_recent, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ rgd = list_entry(tmp, struct gfs_rgrpd, rd_recent);
++ if (rgd == new_rgd)
++ goto out;
++
++ if (++count >= max)
++ goto out;
++ }
++ list_add_tail(&new_rgd->rd_recent, &sdp->sd_rg_recent);
++
++ out:
++ spin_unlock(&sdp->sd_rg_recent_lock);
++}
++
++/**
++ * forward_rgrp_get - get an rgrp to try next from full list
++ * @sdp: The GFS superblock
++ *
++ * Returns: The rgrp to try next
++ */
++
++static struct gfs_rgrpd *
++forward_rgrp_get(struct gfs_sbd *sdp)
++{
++ struct gfs_rgrpd *rgd;
++ unsigned int journals = gfs_num_journals(sdp);
++ unsigned int rg = 0, x;
++
++ spin_lock(&sdp->sd_rg_forward_lock);
++
++ rgd = sdp->sd_rg_forward;
++ if (!rgd) {
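++ /* No forward pointer yet: start this node a journal-dependent
++    fraction of the way through the RG list, so different nodes
++    begin their searches in different places. */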
++ if (sdp->sd_rgcount >= journals)
++ rg = sdp->sd_rgcount *
++ sdp->sd_lockstruct.ls_jid /
++ journals;
++
++ for (x = 0, rgd = gfs_rgrpd_get_first(sdp);
++ x < rg;
++ x++, rgd = gfs_rgrpd_get_next(rgd))
++ /* Do Nothing */;
++
++ sdp->sd_rg_forward = rgd;
++ }
++
++ spin_unlock(&sdp->sd_rg_forward_lock);
++
++ return rgd;
++}
++
++/**
++ * forward_rgrp_set - set the forward rgrp pointer
++ * @sdp: the filesystem
++ * @rgd: The new forward rgrp
++ *
++ */
++
++static void
++forward_rgrp_set(struct gfs_sbd *sdp, struct gfs_rgrpd *rgd)
++{
++ spin_lock(&sdp->sd_rg_forward_lock);
++ sdp->sd_rg_forward = rgd;
++ spin_unlock(&sdp->sd_rg_forward_lock);
++}
++
++/**
++ * get_local_rgrp - Choose and lock a rgrp for allocation
++ * @ip: the inode to reserve space for
++ *
++ * Try to acquire an rgrp in a way which avoids contending with other
++ * nodes; the chosen and locked rgrp is set in @ip's allocation structure.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++get_local_rgrp(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_rgrpd *rgd, *begin, *next = NULL;
++ struct gfs_alloc *al = ip->i_alloc;
++ int flags = LM_FLAG_TRY;
++ int error = 0;
++ int skipped = 0;
++ int loops = 0;
++ int update_recent = FALSE;
++
++ /* Try recently successful rgrps */
++
++ rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
++
++ while (rgd) {
++ error = gfs_glock_nq_init(rgd->rd_gl,
++ LM_ST_EXCLUSIVE, LM_FLAG_TRY,
++ &al->al_rgd_gh);
++ switch (error) {
++ case 0:
++ if (try_rgrp_fit(rgd, al))
++ goto out;
++
++ next = recent_rgrp_next(rgd);
++ recent_rgrp_remove(rgd);
++ gfs_glock_dq_uninit(&al->al_rgd_gh);
++ rgd = next;
++ break;
++
++ case GLR_TRYFAILED:
++ rgd = recent_rgrp_next(rgd);
++ break;
++
++ default:
++ GFS_ASSERT_RGRPD(error < 0, rgd,);
++ return error;
++ }
++ }
++
++ /* Go through full list of rgrps */
++
++ update_recent = TRUE;
++ begin = rgd = forward_rgrp_get(sdp);
++
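++ /* First time around, LM_FLAG_TRY lets us skip RGs that other nodes
++    hold; if we get all the way around having skipped some, go around
++    again without TRY and wait for the locks. */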
++ for (;;) {
++ error = gfs_glock_nq_init(rgd->rd_gl,
++ LM_ST_EXCLUSIVE, flags,
++ &al->al_rgd_gh);
++ switch (error) {
++ case 0:
++ if (try_rgrp_fit(rgd, al))
++ goto out;
++ gfs_glock_dq_uninit(&al->al_rgd_gh);
++ break;
++
++ case GLR_TRYFAILED:
++ GFS_ASSERT_RGRPD(flags == LM_FLAG_TRY, rgd,);
++ skipped++;
++ break;
++
++ default:
++ GFS_ASSERT_RGRPD(error < 0, rgd,);
++ return error;
++ }
++
++ rgd = gfs_rgrpd_get_next(rgd);
++ if (!rgd)
++ rgd = gfs_rgrpd_get_first(sdp);
++
++ if (rgd == begin) {
++ if (++loops >= 2 || !skipped) {
++ return -ENOSPC;
++ }
++ flags = 0;
++ }
++ }
++
++ out:
++ ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
++
++ if (update_recent) {
++ recent_rgrp_add(rgd);
++ rgd = gfs_rgrpd_get_next(rgd);
++ forward_rgrp_set(sdp, rgd);
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_inplace_reserve_i - Reserve space in the filesystem
++ * @ip: the inode to reserve space for
++ *
++ * Acquire resource group locks to allow for the maximum allocation
++ * described by "res".
++ *
++ * This should probably become more complex again, but for now, let's go
++ * for simple (one resource group) reservations.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_inplace_reserve_i(struct gfs_inode *ip,
++ char *file, unsigned int line)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ int error;
++
++ GFS_ASSERT_INODE(al->al_requested_di ||
++ al->al_requested_data ||
++ al->al_requested_meta, ip,);
++
++ error = gfs_rindex_hold(sdp, &al->al_ri_gh);
++ if (error)
++ return error;
++
++ error = get_local_rgrp(ip);
++ if (error) {
++ gfs_glock_dq_uninit(&al->al_ri_gh);
++ return error;
++ }
++
++ gfs_depend_sync(al->al_rgd);
++
++ al->al_file = file;
++ al->al_line = line;
++
++ return 0;
++}
++
++/**
++ * gfs_inplace_release - release an inplace reservation
++ * @ip: the inode the reservation was taken out on
++ *
++ * Release a reservation made by gfs_inplace_reserve().
++ */
++
++void
++gfs_inplace_release(struct gfs_inode *ip)
++{
++ struct gfs_alloc *al = ip->i_alloc;
++
++ GFS_ASSERT_INODE(al->al_alloced_di <= al->al_requested_di, ip,
++ printk("al_alloced_di = %u, al_requested_di = %u\n",
++ al->al_alloced_di, al->al_requested_di);
++ printk("al_file = %s, al_line = %u\n",
++ al->al_file, al->al_line););
++ GFS_ASSERT_INODE(al->al_alloced_meta <= al->al_reserved_meta, ip,
++ printk("al_alloced_meta = %u, al_reserved_meta = %u\n",
++ al->al_alloced_meta, al->al_reserved_meta);
++ printk("al_file = %s, al_line = %u\n",
++ al->al_file, al->al_line););
++ GFS_ASSERT_INODE(al->al_alloced_data <= al->al_reserved_data, ip,
++ printk("al_alloced_data = %u, al_reserved_data = %u\n",
++ al->al_alloced_data, al->al_reserved_data);
++ printk("al_file = %s, al_line = %u\n",
++ al->al_file, al->al_line););
++
++ al->al_rgd = NULL;
++ gfs_glock_dq_uninit(&al->al_rgd_gh);
++ gfs_glock_dq_uninit(&al->al_ri_gh);
++}
++
++/**
++ * gfs_get_block_type - Determine the type of a given block in an RG
++ * @rgd: the resource group holding the block
++ * @block: the block number
++ *
++ * Returns: The block type (GFS_BLKST_*)
++ */
++
++unsigned char
++gfs_get_block_type(struct gfs_rgrpd *rgd, uint64_t block)
++{
++ struct gfs_bitmap *bits = NULL;
++ uint32_t length, rgrp_block, buf_block;
++ unsigned int buf;
++ unsigned char type;
++
++ length = rgd->rd_ri.ri_length;
++ rgrp_block = block - rgd->rd_ri.ri_data1;
++
++ for (buf = 0; buf < length; buf++) {
++ bits = &rgd->rd_bits[buf];
++ if (rgrp_block < (bits->bi_start + bits->bi_len) * GFS_NBBY)
++ break;
++ }
++
++ GFS_ASSERT_RGRPD(buf < length, rgd,);
++ buf_block = rgrp_block - bits->bi_start * GFS_NBBY;
++
++ type = gfs_testbit(rgd,
++ rgd->rd_bh[buf]->b_data + bits->bi_offset,
++ bits->bi_len, buf_block);
++
++ return type;
++}
++
++/**
++ * blkalloc_internal - allocate a single block
++ * @rgd: the resource group descriptor
++ * @goal: the goal block in the RG
++ * @old_state: the type of block to find
++ * @new_state: the resulting block type
++ *
++ * This function never fails.
++ *
++ * Returns: returns the block allocated
++ */
++
++static uint32_t
++blkalloc_internal(struct gfs_rgrpd *rgd,
++ uint32_t goal,
++ unsigned char old_state, unsigned char new_state)
++{
++ struct gfs_bitmap *bits = NULL;
++ uint32_t length = rgd->rd_ri.ri_length;
++ uint32_t blk = 0;
++ unsigned int buf, x;
++
++ for (buf = 0; buf < length; buf++) {
++ bits = &rgd->rd_bits[buf];
++ if (goal < (bits->bi_start + bits->bi_len) * GFS_NBBY)
++ break;
++ }
++
++ GFS_ASSERT_RGRPD(buf < length, rgd,);
++ goal -= bits->bi_start * GFS_NBBY;
++
++ /* "x <= length" because we're skipping over some of the first
++ buffer when the goal is non-zero. */
++
++ for (x = 0; x <= length; x++) {
++ blk = gfs_bitfit(rgd,
++ rgd->rd_bh[buf]->b_data + bits->bi_offset,
++ bits->bi_len, goal, old_state);
++ if (blk != BFITNOENT)
++ break;
++
++ buf = (buf + 1) % length;
++ bits = &rgd->rd_bits[buf];
++ goal = 0;
++ }
++
++ GFS_ASSERT_RGRPD(x <= length, rgd,);
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[buf]);
++ gfs_setbit(rgd,
++ rgd->rd_bh[buf]->b_data + bits->bi_offset,
++ bits->bi_len, blk, new_state);
++
++ return bits->bi_start * GFS_NBBY + blk;
++}
++
++/**
++ * blkfree_internal - Free a block
++ * @sdp: the filesystem
++ * @bstart: the start of a run of blocks to free
++ * @blen: the length of the block run
++ * @new_state: the new state of the block
++ *
++ * Returns: the rgrp containing the blocks
++ */
++
++static struct gfs_rgrpd *
++blkfree_internal(struct gfs_sbd *sdp, uint64_t bstart, uint32_t blen,
++ unsigned char new_state)
++{
++ struct gfs_rgrpd *rgd;
++ struct gfs_bitmap *bits = NULL;
++ uint32_t length, rgrp_blk, buf_blk;
++ unsigned int buf;
++
++ rgd = gfs_blk2rgrpd(sdp, bstart);
++ GFS_ASSERT_SBD(rgd, sdp,
++ printk("block = %"PRIu64"\n", bstart););
++
++ length = rgd->rd_ri.ri_length;
++ rgrp_blk = bstart - rgd->rd_ri.ri_data1;
++
++ while (blen--) {
++ for (buf = 0; buf < length; buf++) {
++ bits = &rgd->rd_bits[buf];
++ if (rgrp_blk < (bits->bi_start + bits->bi_len) * GFS_NBBY)
++ break;
++ }
++
++ GFS_ASSERT_RGRPD(buf < length, rgd,);
++ buf_blk = rgrp_blk - bits->bi_start * GFS_NBBY;
++ rgrp_blk++;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[buf]);
++ gfs_setbit(rgd,
++ rgd->rd_bh[buf]->b_data + bits->bi_offset,
++ bits->bi_len, buf_blk, new_state);
++ }
++
++ return rgd;
++}
++
++/**
++ * clump_alloc - Allocate a clump of metadata
++ * @rgd: the resource group descriptor
++ * @first: returns the first block allocated
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++clump_alloc(struct gfs_rgrpd *rgd, uint32_t *first)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_meta_header mh;
++ struct buffer_head **bh;
++ uint32_t goal, blk;
++ unsigned int x;
++ int error = 0;
++
++ memset(&mh, 0, sizeof(struct gfs_meta_header));
++ mh.mh_magic = GFS_MAGIC;
++ mh.mh_type = GFS_METATYPE_NONE;
++
++ bh = gmalloc(GFS_META_CLUMP * sizeof(struct buffer_head *));
++ memset(bh, 0, GFS_META_CLUMP * sizeof(struct buffer_head *));
++
++ goal = rgd->rd_last_alloc_data;
++
++ for (x = 0; x < GFS_META_CLUMP; x++) {
++ blk = blkalloc_internal(rgd, goal, GFS_BLKST_FREE,
++ GFS_BLKST_FREEMETA);
++ if (!x)
++ *first = blk;
++
++ bh[x] = gfs_dgetblk(sdp, rgd->rd_ri.ri_data1 + blk, rgd->rd_gl);
++
++ gfs_prep_new_buffer(bh[x]);
++
++ gfs_meta_header_out(&mh, bh[x]->b_data);
++ ((struct gfs_meta_header *)bh[x]->b_data)->mh_generation = 0;
++
++ error = gfs_dwrite(sdp, bh[x], DIO_DIRTY | DIO_START);
++ if (error)
++ goto out;
++
++ goal = blk;
++ }
++
++ rgd->rd_last_alloc_data = goal;
++
++ for (x = 0; x < GFS_META_CLUMP; x++) {
++ error = gfs_dwrite(sdp, bh[x], DIO_WAIT);
++ if (error)
++ goto out;
++ }
++
++ gfs_mhc_add(rgd, bh, GFS_META_CLUMP);
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_free >= GFS_META_CLUMP, rgd,);
++ rgd->rd_rg.rg_free -= GFS_META_CLUMP;
++ rgd->rd_rg.rg_freemeta += GFS_META_CLUMP;
++
++ out:
++ for (x = 0; x < GFS_META_CLUMP; x++)
++ if (bh[x]) {
++ gfs_dwrite(sdp, bh[x], DIO_WAIT);
++ brelse(bh[x]);
++ }
++ kfree(bh);
++
++ return error;
++}
++
++/**
++ * gfs_blkalloc - Allocate a data block
++ * @ip: the inode to allocate the data block for
++ * @block: the block allocated
++ *
++ */
++
++void
++gfs_blkalloc(struct gfs_inode *ip, uint64_t *block)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ struct gfs_rgrpd *rgd = al->al_rgd;
++ uint32_t goal, blk;
++ int same;
++
++ GFS_ASSERT_INODE(rgd, ip,);
++
++ same = (rgd->rd_ri.ri_addr == ip->i_di.di_goal_rgrp);
++ goal = (same) ? ip->i_di.di_goal_dblk : rgd->rd_last_alloc_data;
++
++ blk = blkalloc_internal(rgd, goal,
++ GFS_BLKST_FREE, GFS_BLKST_USED);
++ rgd->rd_last_alloc_data = blk;
++
++ if (!same) {
++ ip->i_di.di_goal_rgrp = rgd->rd_ri.ri_addr;
++ ip->i_di.di_goal_mblk = 0;
++ }
++ ip->i_di.di_goal_dblk = blk;
++
++ *block = rgd->rd_ri.ri_data1 + blk;
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_free, rgd,);
++ rgd->rd_rg.rg_free--;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++
++ al->al_alloced_data++;
++
++ gfs_trans_add_quota(sdp, +1, ip->i_di.di_uid, ip->i_di.di_gid);
++}
++
++/**
++ * gfs_metaalloc - Allocate a metadata block to a file
++ * @ip: the file
++ * @block: the block allocated
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_metaalloc(struct gfs_inode *ip, uint64_t *block)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_alloc *al = ip->i_alloc;
++ struct gfs_rgrpd *rgd = al->al_rgd;
++ uint32_t goal, blk;
++ int same;
++ int error;
++
++ GFS_ASSERT_INODE(rgd, ip,);
++
++ same = (rgd->rd_ri.ri_addr == ip->i_di.di_goal_rgrp);
++
++ if (!rgd->rd_rg.rg_freemeta) {
++ error = clump_alloc(rgd, &goal);
++ if (error)
++ return error;
++
++ al->al_alloced_data += GFS_META_CLUMP;
++ } else
++ goal = (same) ? ip->i_di.di_goal_mblk : rgd->rd_last_alloc_meta;
++
++ blk = blkalloc_internal(rgd, goal,
++ GFS_BLKST_FREEMETA, GFS_BLKST_USEDMETA);
++ rgd->rd_last_alloc_meta = blk;
++
++ if (!same) {
++ ip->i_di.di_goal_rgrp = rgd->rd_ri.ri_addr;
++ ip->i_di.di_goal_dblk = 0;
++ }
++ ip->i_di.di_goal_mblk = blk;
++
++ *block = rgd->rd_ri.ri_data1 + blk;
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_freemeta, rgd,);
++ rgd->rd_rg.rg_freemeta--;
++ rgd->rd_rg.rg_usedmeta++;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++
++ al->al_alloced_meta++;
++
++ gfs_trans_add_quota(sdp, +1, ip->i_di.di_uid, ip->i_di.di_gid);
++
++ return 0;
++}
++
++/**
++ * gfs_dialloc - Allocate a dinode
++ * @dip: the directory that the inode is going in
++ * @block: the block
++ *
++ * Returns: errno
++ */
++
++int
++gfs_dialloc(struct gfs_inode *dip, uint64_t *block)
++{
++ struct gfs_alloc *al = dip->i_alloc;
++ struct gfs_rgrpd *rgd = al->al_rgd;
++ uint32_t goal, blk;
++ int error = 0;
++
++ GFS_ASSERT_INODE(rgd, dip,);
++
++ if (rgd->rd_rg.rg_freemeta)
++ goal = rgd->rd_last_alloc_meta;
++ else {
++ error = clump_alloc(rgd, &goal);
++ if (error)
++ return error;
++
++ al->al_alloced_data += GFS_META_CLUMP;
++ }
++
++ blk = blkalloc_internal(rgd, goal,
++ GFS_BLKST_FREEMETA, GFS_BLKST_USEDMETA);
++ rgd->rd_last_alloc_meta = blk;
++
++ *block = rgd->rd_ri.ri_data1 + blk;
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_freemeta, rgd,);
++ rgd->rd_rg.rg_freemeta--;
++ rgd->rd_rg.rg_useddi++;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++
++ al->al_alloced_di++;
++ al->al_alloced_meta++;
++
++ return error;
++}
++
++/**
++ * gfs_blkfree - free a piece of data
++ * @ip: the inode these blocks are being freed from
++ * @bstart: the start of a run of blocks to free
++ * @blen: the length of the block run
++ *
++ */
++
++void
++gfs_blkfree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_rgrpd *rgd;
++
++ rgd = blkfree_internal(sdp, bstart, blen, GFS_BLKST_FREE);
++
++ rgd->rd_rg.rg_free += blen;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++
++ gfs_trans_add_quota(sdp, -(int64_t)blen,
++ ip->i_di.di_uid,
++ ip->i_di.di_gid);
++}
++
++/**
++ * gfs_metafree - free a piece of metadata
++ * @ip: the inode these blocks are being freed from
++ * @bstart: the start of a run of blocks to free
++ * @blen: the length of the block run
++ *
++ */
++
++void
++gfs_metafree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ struct gfs_rgrpd *rgd;
++
++ rgd = blkfree_internal(sdp, bstart, blen, GFS_BLKST_FREEMETA);
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_usedmeta >= blen, rgd,);
++ rgd->rd_rg.rg_usedmeta -= blen;
++ rgd->rd_rg.rg_freemeta += blen;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++
++ gfs_trans_add_quota(sdp, -(int64_t)blen,
++ ip->i_di.di_uid,
++ ip->i_di.di_gid);
++ gfs_wipe_buffers(ip, rgd, bstart, blen);
++}
++
++/**
++ * gfs_difree_uninit - free a dinode given only its block address
++ * @rgd: the resource group that contains the dinode
++ * @addr: the dinode address
++ *
++ */
++
++void
++gfs_difree_uninit(struct gfs_rgrpd *rgd, uint64_t addr)
++{
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ struct gfs_rgrpd *tmp_rgd;
++
++ tmp_rgd = blkfree_internal(sdp, addr, 1,
++ GFS_BLKST_FREEMETA);
++ GFS_ASSERT_RGRPD(rgd == tmp_rgd, rgd,);
++
++ GFS_ASSERT_RGRPD(rgd->rd_rg.rg_useddi, rgd,);
++ rgd->rd_rg.rg_useddi--;
++ rgd->rd_rg.rg_freemeta++;
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
++}
++
++/**
++ * gfs_difree - free a dinode
++ * @rgd: the resource group that contains the dinode
++ * @ip: the inode representing the dinode to free
++ *
++ */
++
++void
++gfs_difree(struct gfs_rgrpd *rgd, struct gfs_inode *ip)
++{
++ gfs_difree_uninit(rgd, ip->i_num.no_addr);
++
++ gfs_trans_add_quota(ip->i_sbd, -1, ip->i_di.di_uid, ip->i_di.di_gid);
++ gfs_wipe_buffers(ip, rgd, ip->i_num.no_addr, 1);
++}
++
++/**
++ * gfs_rlist_add - add an RG to a list of RGs
++ * @sdp: the filesystem
++ * @rlist: the list of resource groups
++ * @block: the block
++ *
++ * Figure out what RG a block belongs to and add that RG to the list
++ *
++ */
++
++void
++gfs_rlist_add(struct gfs_sbd *sdp, struct gfs_rgrp_list *rlist, uint64_t block)
++{
++ struct gfs_rgrpd *rgd;
++ struct gfs_rgrpd **tmp;
++ unsigned int new_space;
++ unsigned int x;
++
++ GFS_ASSERT_SBD(rlist->rl_rgrps <= rlist->rl_space, sdp,);
++ GFS_ASSERT_SBD(!rlist->rl_ghs, sdp,);
++
++ rgd = gfs_blk2rgrpd(sdp, block);
++ GFS_ASSERT_SBD(rgd, sdp,
++ printk("block = %"PRIu64"\n", block););
++
++ for (x = 0; x < rlist->rl_rgrps; x++)
++ if (rlist->rl_rgd[x] == rgd)
++ return;
++
++ if (rlist->rl_rgrps == rlist->rl_space) {
++ new_space = rlist->rl_space + 10;
++
++ tmp = gmalloc(new_space * sizeof(struct gfs_rgrpd *));
++
++ if (rlist->rl_rgd) {
++ memcpy(tmp, rlist->rl_rgd,
++ rlist->rl_space * sizeof(struct gfs_rgrpd *));
++ kfree(rlist->rl_rgd);
++ }
++
++ rlist->rl_space = new_space;
++ rlist->rl_rgd = tmp;
++ }
++
++ rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
++}
++
++/**
++ * gfs_rlist_alloc - allocate holders for the RGs that have been added to the rlist
++ * @rlist: the list of resource groups
++ * @state: the lock state to acquire the RG lock in
++ * @flags: the modifier flags for the holder structures
++ *
++ */
++
++void
++gfs_rlist_alloc(struct gfs_rgrp_list *rlist, unsigned int state, int flags)
++{
++ unsigned int x;
++
++ rlist->rl_ghs = gmalloc(rlist->rl_rgrps * sizeof(struct gfs_holder));
++ for (x = 0; x < rlist->rl_rgrps; x++)
++ gfs_holder_init(rlist->rl_rgd[x]->rd_gl,
++ state, flags,
++ &rlist->rl_ghs[x]);
++}
++
++/**
++ * gfs_rlist_free - free a resource group list
++ * @list: the list of resource groups
++ *
++ */
++
++void
++gfs_rlist_free(struct gfs_rgrp_list *rlist)
++{
++ unsigned int x;
++
++ if (rlist->rl_rgd)
++ kfree(rlist->rl_rgd);
++
++ if (rlist->rl_ghs) {
++ for (x = 0; x < rlist->rl_rgrps; x++)
++ gfs_holder_uninit(&rlist->rl_ghs[x]);
++ kfree(rlist->rl_ghs);
++ }
++}
++
++/**
++ * gfs_reclaim_metadata - reclaims unused metadata
++ * @sdp: the file system
++ * @stats: stats on the reclamation
++ *
++ * This function will look through the resource groups and
++ * free the unused metadata.
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_reclaim_metadata(struct gfs_sbd *sdp, struct gfs_reclaim_stats *stats)
++{
++ struct gfs_holder ji_gh, ri_gh, rgd_gh, t_gh;
++ struct gfs_rgrpd *rgd;
++ struct gfs_rgrp *rg;
++ struct gfs_dinode *di;
++ struct gfs_inum next;
++ struct buffer_head *bh;
++ uint32_t flags;
++ uint32_t goal;
++ unsigned int x;
++ int error = 0;
++
++ /* Acquire the jindex lock here so we don't deadlock with a
++ process writing the jindex inode. :-( */
++
++ error = gfs_jindex_hold(sdp, &ji_gh);
++ if (error)
++ goto fail;
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ goto fail_jindex_relse;
++
++ for (rgd = gfs_rgrpd_get_first(sdp);
++ rgd;
++ rgd = gfs_rgrpd_get_next(rgd)) {
++ error = gfs_glock_nq_init(rgd->rd_gl,
++ LM_ST_EXCLUSIVE, GL_NOCACHE,
++ &rgd_gh);
++ if (error)
++ goto fail_rindex_relse;
++
++ rgrp_verify(rgd);
++
++ rg = &rgd->rd_rg;
++
++ if (!rg->rg_freedi && !rg->rg_freemeta) {
++ gfs_glock_dq_uninit(&rgd_gh);
++ continue;
++ }
++
++ gfs_mhc_zap(rgd);
++ gfs_depend_sync(rgd);
++
++ error = gfs_lock_fs_check_clean(sdp, LM_ST_EXCLUSIVE, &t_gh);
++ if (error)
++ goto fail_gunlock_rg;
++
++ error = gfs_trans_begin(sdp, rgd->rd_ri.ri_length, 0);
++ if (error)
++ goto fail_unlock_fs;
++
++ next = rg->rg_freedi_list;
++
++ for (x = rg->rg_freedi; x--;) {
++ GFS_ASSERT_RGRPD(next.no_formal_ino &&
++ next.no_addr, rgd,);
++
++ blkfree_internal(sdp, next.no_addr, 1, GFS_BLKST_FREE);
++
++ error = gfs_dread(sdp, next.no_addr, rgd->rd_gl,
++ DIO_FORCE | DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto fail_end_trans;
++
++ di = (struct gfs_dinode *)bh->b_data;
++ flags = di->di_flags;
++ flags = gfs32_to_cpu(flags);
++ GFS_ASSERT_RGRPD(flags & GFS_DIF_UNUSED, rgd,);
++
++ gfs_inum_in(&next, (char *)&di->di_next_unused);
++
++ brelse(bh);
++
++ rg->rg_freedi--;
++ rg->rg_free++;
++ stats->rc_inodes++;
++ }
++
++ GFS_ASSERT_RGRPD(!next.no_formal_ino && !next.no_addr, rgd,);
++ rg->rg_freedi_list = next;
++
++ goal = 0;
++ for (x = rg->rg_freemeta; x--;) {
++ goal = blkalloc_internal(rgd, goal,
++ GFS_BLKST_FREEMETA, GFS_BLKST_FREE);
++ rg->rg_freemeta--;
++ rg->rg_free++;
++ stats->rc_metadata++;
++ }
++
++ gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
++ gfs_rgrp_out(rg, rgd->rd_bh[0]->b_data);
++
++ gfs_trans_end(sdp);
++
++ gfs_glock_dq_uninit(&t_gh);
++
++ gfs_glock_dq_uninit(&rgd_gh);
++ }
++
++ gfs_glock_dq_uninit(&ri_gh);
++
++ gfs_glock_dq_uninit(&ji_gh);
++
++ return 0;
++
++ fail_end_trans:
++ gfs_trans_end(sdp);
++
++ fail_unlock_fs:
++ gfs_glock_dq_uninit(&t_gh);
++
++ fail_gunlock_rg:
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ fail_rindex_relse:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ fail_jindex_relse:
++ gfs_glock_dq_uninit(&ji_gh);
++
++ fail:
++ return error;
++}
+diff -urN linux-orig/fs/gfs/rgrp.h linux-patched/fs/gfs/rgrp.h
+--- linux-orig/fs/gfs/rgrp.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/rgrp.h 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,75 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __RGRP_DOT_H__
++#define __RGRP_DOT_H__
++
++void gfs_mhc_add(struct gfs_rgrpd *rgd, struct buffer_head **bh,
++ unsigned int num);
++int gfs_mhc_fish(struct gfs_sbd *sdp, struct buffer_head *bh);
++void gfs_mhc_zap(struct gfs_rgrpd *rgd);
++
++void gfs_depend_add(struct gfs_rgrpd *rgd, uint64_t formal_ino);
++void gfs_depend_sync(struct gfs_rgrpd *rgd);
++
++struct gfs_rgrpd *gfs_blk2rgrpd(struct gfs_sbd *sdp, uint64_t blk);
++struct gfs_rgrpd *gfs_rgrpd_get_first(struct gfs_sbd *sdp);
++struct gfs_rgrpd *gfs_rgrpd_get_next(struct gfs_rgrpd *rgd);
++
++void gfs_clear_rgrpd(struct gfs_sbd *sdp);
++
++int gfs_rindex_hold(struct gfs_sbd *sdp, struct gfs_holder *ri_gh);
++
++int gfs_rgrp_read(struct gfs_rgrpd *rgd);
++void gfs_rgrp_relse(struct gfs_rgrpd *rgd);
++
++void gfs_rgrp_lvb_fill(struct gfs_rgrpd *rgd);
++int gfs_rgrp_lvb_init(struct gfs_rgrpd *rgd);
++
++struct gfs_alloc *gfs_alloc_get(struct gfs_inode *ip);
++void gfs_alloc_put(struct gfs_inode *ip);
++
++int gfs_inplace_reserve_i(struct gfs_inode *ip,
++ char *file, unsigned int line);
++#define gfs_inplace_reserve(ip) \
++gfs_inplace_reserve_i((ip), __FILE__, __LINE__)
++
++void gfs_inplace_release(struct gfs_inode *ip);
++
++unsigned char gfs_get_block_type(struct gfs_rgrpd *rgd, uint64_t block);
++
++void gfs_blkalloc(struct gfs_inode *ip, uint64_t *block);
++int gfs_metaalloc(struct gfs_inode *ip, uint64_t *block);
++int gfs_dialloc(struct gfs_inode *dip, uint64_t *block);
++
++void gfs_blkfree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen);
++void gfs_metafree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen);
++void gfs_difree_uninit(struct gfs_rgrpd *rgd, uint64_t addr);
++void gfs_difree(struct gfs_rgrpd *rgd, struct gfs_inode *ip);
++
++struct gfs_rgrp_list {
++ unsigned int rl_rgrps;
++ unsigned int rl_space;
++ struct gfs_rgrpd **rl_rgd;
++ struct gfs_holder *rl_ghs;
++};
++
++void gfs_rlist_add(struct gfs_sbd *sdp, struct gfs_rgrp_list *rlist,
++ uint64_t block);
++void gfs_rlist_alloc(struct gfs_rgrp_list *rlist, unsigned int state,
++ int flags);
++void gfs_rlist_free(struct gfs_rgrp_list *rlist);
++
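++/*
++ * Sketch of the intended rlist life cycle (illustrative only; how the
++ * rl_ghs holders are actually enqueued is left to the caller):
++ *
++ *	struct gfs_rgrp_list rlist;
++ *	memset(&rlist, 0, sizeof(struct gfs_rgrp_list));
++ *	gfs_rlist_add(sdp, &rlist, block);   (once per block of interest)
++ *	gfs_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
++ *	(acquire rlist.rl_ghs[0 .. rl_rgrps - 1], do the work, release)
++ *	gfs_rlist_free(&rlist);
++ */
++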
++int gfs_reclaim_metadata(struct gfs_sbd *sdp, struct gfs_reclaim_stats *stats);
++
++#endif /* __RGRP_DOT_H__ */
+diff -urN linux-orig/fs/gfs/super.c linux-patched/fs/gfs/super.c
+--- linux-orig/fs/gfs/super.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/super.c 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,1035 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "file.h"
++#include "format.h"
++#include "glock.h"
++#include "glops.h"
++#include "inode.h"
++#include "log.h"
++#include "quota.h"
++#include "recovery.h"
++#include "rgrp.h"
++#include "super.h"
++#include "unlinked.h"
++
++/**
++ * gfs_init_tune_data - Fill in the struct gfs_tune (sd_tune) in the struct gfs_sbd.
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_init_tune_data(struct gfs_sbd *sdp)
++{
++ struct gfs_tune *gt = &sdp->sd_tune;
++
++ gt->gt_tune_version = GFS_TUNE_VERSION;
++
++ gt->gt_ilimit1 = 100;
++ gt->gt_ilimit1_tries = 3;
++ gt->gt_ilimit1_min = 1;
++ gt->gt_ilimit2 = 500;
++ gt->gt_ilimit2_tries = 10;
++ gt->gt_ilimit2_min = 3;
++ gt->gt_demote_secs = 300;
++ gt->gt_incore_log_blocks = 1024;
++ gt->gt_jindex_refresh_secs = 60;
++ gt->gt_depend_secs = 60;
++ gt->gt_scand_secs = 5;
++ gt->gt_recoverd_secs = 60;
++ gt->gt_logd_secs = 1;
++ gt->gt_quotad_secs = 5;
++ gt->gt_inoded_secs = 15;
++ gt->gt_quota_simul_sync = 64;
++ gt->gt_quota_warn_period = 10;
++ gt->gt_atime_quantum = 3600;
++ gt->gt_quota_quantum = 60;
++ gt->gt_quota_scale_num = 1;
++ gt->gt_quota_scale_den = 1;
++ gt->gt_quota_enforce = 1;
++ gt->gt_quota_account = 1;
++ gt->gt_new_files_jdata = 0;
++ gt->gt_new_files_directio = 0;
++ gt->gt_max_atomic_write = 4 << 20;
++ gt->gt_max_readahead = 1 << 18;
++ gt->gt_lockdump_size = 131072;
++ gt->gt_stall_secs = 600;
++ gt->gt_complain_secs = 10;
++ gt->gt_reclaim_limit = 5000;
++ gt->gt_entries_per_readdir = 32;
++ gt->gt_prefetch_secs = 10;
++ gt->gt_statfs_slots = 64;
++ gt->gt_max_mhc = 10000;
++}
++
++/**
++ * gfs_check_sb - Check superblock
++ * @sdp: the filesystem
++ * @sb: The superblock
++ * @silent: Don't print a message if the check fails
++ *
++ * Checks that the version code of the FS is one that we understand how
++ * to read and that the sizes of the various on-disk structures have not
++ * changed.
++ */
++
++int
++gfs_check_sb(struct gfs_sbd *sdp, struct gfs_sb *sb, int silent)
++{
++ unsigned int x;
++
++ if (sb->sb_header.mh_magic != GFS_MAGIC ||
++ sb->sb_header.mh_type != GFS_METATYPE_SB) {
++ if (!silent)
++ printk("GFS: not a GFS filesystem\n");
++ return -EINVAL;
++ }
++
++ /* If format numbers match exactly, we're done. */
++
++ if (sb->sb_fs_format == GFS_FORMAT_FS &&
++ sb->sb_multihost_format == GFS_FORMAT_MULTI)
++ return 0;
++
++ if (sb->sb_fs_format != GFS_FORMAT_FS) {
++ for (x = 0; gfs_old_fs_formats[x]; x++)
++ if (gfs_old_fs_formats[x] == sb->sb_fs_format)
++ break;
++
++ if (!gfs_old_fs_formats[x]) {
++ printk("GFS: code version (%u, %u) is incompatible with ondisk format (%u, %u)\n",
++ GFS_FORMAT_FS, GFS_FORMAT_MULTI,
++ sb->sb_fs_format, sb->sb_multihost_format);
++ printk("GFS: I don't know how to upgrade this FS\n");
++ return -EINVAL;
++ }
++ }
++
++ if (sb->sb_multihost_format != GFS_FORMAT_MULTI) {
++ for (x = 0; gfs_old_multihost_formats[x]; x++)
++ if (gfs_old_multihost_formats[x] == sb->sb_multihost_format)
++ break;
++
++ if (!gfs_old_multihost_formats[x]) {
++ printk("GFS: code version (%u, %u) is incompatible with ondisk format (%u, %u)\n",
++ GFS_FORMAT_FS, GFS_FORMAT_MULTI,
++ sb->sb_fs_format, sb->sb_multihost_format);
++ printk("GFS: I don't know how to upgrade this FS\n");
++ return -EINVAL;
++ }
++ }
++
++ if (!sdp->sd_args.ar_upgrade) {
++ printk("GFS: code version (%u, %u) is incompatible with ondisk format (%u, %u)\n",
++ GFS_FORMAT_FS, GFS_FORMAT_MULTI,
++ sb->sb_fs_format, sb->sb_multihost_format);
++ printk("GFS: Use the \"upgrade\" mount option to upgrade the FS\n");
++ printk("GFS: See the manual for more details\n");
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++/**
++ * gfs_read_sb - Read super block
++ * @sdp: The GFS superblock
++ * @gl: the glock for the superblock (assumed to be held)
++ * @silent: Don't print message if mount fails
++ *
++ */
++
++int
++gfs_read_sb(struct gfs_sbd *sdp, struct gfs_glock *gl, int silent)
++{
++ struct buffer_head *bh;
++ uint32_t hash_blocks, ind_blocks, leaf_blocks;
++ uint32_t tmp_blocks;
++ uint64_t space = 0;
++ unsigned int x;
++ int error;
++
++ error = gfs_dread(sdp, GFS_SB_ADDR >> sdp->sd_fsb2bb_shift,
++ gl, DIO_FORCE | DIO_START | DIO_WAIT, &bh);
++ if (error) {
++ if (!silent)
++ printk("GFS: fsid=%s: can't read superblock\n",
++ sdp->sd_fsname);
++ return error;
++ }
++
++ GFS_ASSERT_SBD(sizeof(struct gfs_sb) <= bh->b_size, sdp,);
++
++ gfs_sb_in(&sdp->sd_sb, bh->b_data);
++
++ brelse(bh);
++
++ error = gfs_check_sb(sdp, &sdp->sd_sb, silent);
++ if (error)
++ return error;
++
++ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
++ GFS_BASIC_BLOCK_SHIFT;
++ sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
++ sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode)) /
++ sizeof(uint64_t);
++ sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs_indirect)) /
++ sizeof(uint64_t);
++ sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs_meta_header);
++ sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
++ sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
++ sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t);
++
++ /* Compute maximum reservation required to add an entry to a directory */
++
++ hash_blocks = DIV_RU(sizeof(uint64_t) * (1 << GFS_DIR_MAX_DEPTH),
++ sdp->sd_jbsize);
++
++ ind_blocks = 0;
++ for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
++ tmp_blocks = DIV_RU(tmp_blocks, sdp->sd_inptrs);
++ ind_blocks += tmp_blocks;
++ }
++
++ leaf_blocks = 2 + GFS_DIR_MAX_DEPTH;
++
++ sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
++
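++ /* Compute the maximum file size representable at each metadata tree
++    height; stop once the multiplication wraps 64 bits, which the
++    do_div() consistency check below detects. */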
++ sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode);
++ sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
++ for (x = 2;; x++) {
++ uint64_t d;
++ uint32_t m;
++ space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
++ d = space;
++ m = do_div(d, sdp->sd_inptrs);
++
++ if (d != sdp->sd_heightsize[x - 1] || m)
++ break;
++ sdp->sd_heightsize[x] = space;
++ }
++ sdp->sd_max_height = x;
++ GFS_ASSERT_SBD(sdp->sd_max_height <= GFS_MAX_META_HEIGHT, sdp,);
++
++ sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode);
++ sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
++ for (x = 2;; x++) {
++ uint64_t d;
++ uint32_t m;
++ space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
++ d = space;
++ m = do_div(d, sdp->sd_inptrs);
++
++ if (d != sdp->sd_jheightsize[x - 1] || m)
++ break;
++ sdp->sd_jheightsize[x] = space;
++ }
++ sdp->sd_max_jheight = x;
++ GFS_ASSERT_SBD(sdp->sd_max_jheight <= GFS_MAX_META_HEIGHT, sdp,);
++
++ return 0;
++}
++
++/**
++ * gfs_do_upgrade - upgrade a filesystem
++ * @sdp: The GFS superblock
++ * @sb_gl: the glock for the superblock
++ *
++ */
++
++int
++gfs_do_upgrade(struct gfs_sbd *sdp, struct gfs_glock *sb_gl)
++{
++ struct gfs_holder ji_gh, t_gh, j_gh;
++ struct gfs_log_header lh;
++ struct buffer_head *bh;
++ unsigned int x;
++ int error;
++
++ /* If format numbers match exactly, we're done. */
++
++ if (sdp->sd_sb.sb_fs_format == GFS_FORMAT_FS &&
++ sdp->sd_sb.sb_multihost_format == GFS_FORMAT_MULTI) {
++ printk("GFS: fsid=%s: no upgrade necessary\n",
++ sdp->sd_fsname);
++ sdp->sd_args.ar_upgrade = FALSE;
++ return 0;
++ }
++
++ error = gfs_jindex_hold(sdp, &ji_gh);
++ if (error)
++ goto fail;
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl,
++ LM_ST_EXCLUSIVE, GL_NOCACHE,
++ &t_gh);
++ if (error)
++ goto fail_ji_relse;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ printk("GFS: fsid=%s: can't upgrade: read-only FS\n",
++ sdp->sd_fsname);
++ error = -EROFS;
++ goto fail_gunlock_tr;
++ }
++
++ for (x = 0; x < sdp->sd_journals; x++) {
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_jindex[x].ji_addr,
++ &gfs_meta_glops, LM_ST_SHARED,
++ LM_FLAG_TRY | GL_NOCACHE, &j_gh);
++ switch (error) {
++ case 0:
++ break;
++
++ case GLR_TRYFAILED:
++ printk("GFS: fsid=%s: journal %u is busy\n",
++ sdp->sd_fsname, x);
++ error = -EBUSY;
++
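++ /* fall through */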
++ default:
++ goto fail_gunlock_tr;
++ }
++
++ error = gfs_find_jhead(sdp, &sdp->sd_jindex[x],
++ j_gh.gh_gl, &lh);
++
++ gfs_glock_dq_uninit(&j_gh);
++
++ if (error)
++ goto fail_gunlock_tr;
++
++ if (!(lh.lh_flags & GFS_LOG_HEAD_UNMOUNT) || lh.lh_last_dump) {
++ printk("GFS: fsid=%s: journal %u is busy\n",
++ sdp->sd_fsname, x);
++ error = -EBUSY;
++ goto fail_gunlock_tr;
++ }
++ }
++
++ /* We don't need to journal this change because we're changing
++ only one sector of one block. We definitely don't want to have
++ the journaling code running at this point. */
++
++ error = gfs_dread(sdp, GFS_SB_ADDR >> sdp->sd_fsb2bb_shift, sb_gl,
++ DIO_START | DIO_WAIT, &bh);
++ if (error)
++ goto fail_gunlock_tr;
++
++ gfs_sb_in(&sdp->sd_sb, bh->b_data);
++
++ error = gfs_check_sb(sdp, &sdp->sd_sb, FALSE);
++ GFS_ASSERT_SBD(!error, sdp,);
++
++ sdp->sd_sb.sb_fs_format = GFS_FORMAT_FS;
++ sdp->sd_sb.sb_multihost_format = GFS_FORMAT_MULTI;
++
++ gfs_sb_out(&sdp->sd_sb, bh->b_data);
++
++ set_bit(GLF_DIRTY, &sb_gl->gl_flags);
++ error = gfs_dwrite(sdp, bh, DIO_DIRTY | DIO_START | DIO_WAIT);
++
++ brelse(bh);
++
++ gfs_glock_dq_uninit(&t_gh);
++
++ gfs_glock_dq_uninit(&ji_gh);
++
++ if (!error) {
++ printk("GFS: fsid=%s: upgrade successful\n",
++ sdp->sd_fsname);
++ sdp->sd_args.ar_upgrade = FALSE;
++ }
++
++ return error;
++
++ fail_gunlock_tr:
++ gfs_glock_dq_uninit(&t_gh);
++
++ fail_ji_relse:
++ gfs_glock_dq_uninit(&ji_gh);
++
++ fail:
++ if (error == -EBUSY)
++ printk("GFS: fsid=%s: can't upgrade: the FS is still busy or contains dirty journals\n",
++ sdp->sd_fsname);
++ else
++ printk("GFS: fsid=%s: can't upgrade: %d\n",
++ sdp->sd_fsname, error);
++
++ return error;
++}
++
++/**
++ * clear_journalsi - Clear all the journal index information (without locking)
++ * @sdp: The GFS superblock
++ *
++ */
++
++static void
++clear_journalsi(struct gfs_sbd *sdp)
++{
++ if (sdp->sd_jindex) {
++ kfree(sdp->sd_jindex);
++ sdp->sd_jindex = NULL;
++ }
++ sdp->sd_journals = 0;
++}
++
++/**
++ * gfs_clear_journals - Clear all the journal index information
++ * @sdp: The GFS superblock
++ *
++ */
++
++void
++gfs_clear_journals(struct gfs_sbd *sdp)
++{
++ down(&sdp->sd_jindex_lock);
++ clear_journalsi(sdp);
++ up(&sdp->sd_jindex_lock);
++}
++
++/**
++ * gfs_ji_update - Update the journal index information
++ * @ip: The journal index inode
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++static int
++gfs_ji_update(struct gfs_inode *ip)
++{
++ struct gfs_sbd *sdp = ip->i_sbd;
++ char buf[sizeof(struct gfs_jindex)];
++ unsigned int j;
++ int error;
++
++ GFS_ASSERT_SBD(!do_mod(ip->i_di.di_size, sizeof(struct gfs_jindex)),
++ sdp,);
++
++ clear_journalsi(sdp);
++
++ sdp->sd_jindex = gmalloc(ip->i_di.di_size);
++ memset(sdp->sd_jindex, 0, ip->i_di.di_size);
++
++ for (j = 0;; j++) {
++ error = gfs_internal_read(ip, buf,
++ j * sizeof(struct gfs_jindex),
++ sizeof(struct gfs_jindex));
++ if (!error)
++ break;
++ if (error != sizeof(struct gfs_jindex)) {
++ if (error > 0)
++ error = -EIO;
++ goto fail;
++ }
++
++ gfs_jindex_in(sdp->sd_jindex + j, buf);
++ }
++
++ GFS_ASSERT_SBD(j * sizeof(struct gfs_jindex) == ip->i_di.di_size,
++ sdp,);
++
++ sdp->sd_journals = j;
++ sdp->sd_jiinode_vn = ip->i_gl->gl_vn;
++
++ return 0;
++
++ fail:
++ clear_journalsi(sdp);
++ return error;
++}
++
++/**
++ * gfs_jindex_hold - Grab a lock on the jindex
++ * @sdp: The GFS superblock
++ * @ji_gh: the holder for the jindex glock
++ *
++ * This is very similar to gfs_rindex_hold(), except that in general we
++ * hold the jindex lock for longer periods of time and grab it far less
++ * frequently than the rgrp lock.
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_jindex_hold(struct gfs_sbd *sdp, struct gfs_holder *ji_gh)
++{
++ struct gfs_inode *ip = sdp->sd_jiinode;
++ struct gfs_glock *gl = ip->i_gl;
++ int error;
++
++ error = gfs_glock_nq_init(gl, LM_ST_SHARED, 0, ji_gh);
++ if (error)
++ return error;
++
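++ /* The version number is checked twice: once without the semaphore as
++    a fast path, then again under sd_jindex_lock before re-reading. */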
++ if (sdp->sd_jiinode_vn != gl->gl_vn) {
++ down(&sdp->sd_jindex_lock);
++ if (sdp->sd_jiinode_vn != gl->gl_vn)
++ error = gfs_ji_update(ip);
++ up(&sdp->sd_jindex_lock);
++ }
++
++ if (error)
++ gfs_glock_dq_uninit(ji_gh);
++
++ return error;
++}
++
++/**
++ * gfs_get_jiinode - Read in the jindex inode for the superblock
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_get_jiinode(struct gfs_sbd *sdp)
++{
++ struct gfs_holder ji_gh;
++ int error;
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_sb.sb_jindex_di.no_formal_ino,
++ &gfs_inode_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL,
++ &ji_gh);
++ if (error)
++ return error;
++
++ error = gfs_inode_get(ji_gh.gh_gl, &sdp->sd_sb.sb_jindex_di,
++ CREATE, &sdp->sd_jiinode);
++ if (!error) {
++ sdp->sd_jiinode_vn = ji_gh.gh_gl->gl_vn - 1;
++ set_bit(GLF_STICKY, &ji_gh.gh_gl->gl_flags);
++ }
++
++ gfs_glock_dq_uninit(&ji_gh);
++
++ return error;
++}
++
++/**
++ * gfs_get_riinode - Read in the rindex inode for the superblock
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_get_riinode(struct gfs_sbd *sdp)
++{
++ struct gfs_holder ri_gh;
++ int error;
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_sb.sb_rindex_di.no_formal_ino,
++ &gfs_inode_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL,
++ &ri_gh);
++ if (error)
++ return error;
++
++ error = gfs_inode_get(ri_gh.gh_gl, &sdp->sd_sb.sb_rindex_di,
++ CREATE, &sdp->sd_riinode);
++ if (!error) {
++ sdp->sd_riinode_vn = ri_gh.gh_gl->gl_vn - 1;
++ set_bit(GLF_STICKY, &ri_gh.gh_gl->gl_flags);
++ }
++
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return error;
++}
++
++/**
++ * gfs_get_rootinode - Read in the root inode
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_get_rootinode(struct gfs_sbd *sdp)
++{
++ struct gfs_holder i_gh;
++ int error;
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_sb.sb_root_di.no_formal_ino,
++ &gfs_inode_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL,
++ &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_inode_get(i_gh.gh_gl, &sdp->sd_sb.sb_root_di,
++ CREATE, &sdp->sd_rooti);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_get_qinode - Read in the quota inode
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_get_qinode(struct gfs_sbd *sdp)
++{
++ struct gfs_holder i_gh;
++ int error;
++
++ if (!sdp->sd_sb.sb_quota_di.no_formal_ino) {
++ error = gfs_alloc_qinode(sdp);
++ if (error)
++ return error;
++ }
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_sb.sb_quota_di.no_formal_ino,
++ &gfs_inode_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL,
++ &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_inode_get(i_gh.gh_gl, &sdp->sd_sb.sb_quota_di,
++ CREATE, &sdp->sd_qinode);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_get_linode - Read in the license inode
++ * @sdp: The GFS superblock
++ *
++ * Returns: 0 on success, error code otherwise
++ */
++
++int
++gfs_get_linode(struct gfs_sbd *sdp)
++{
++ struct gfs_holder i_gh;
++ int error;
++
++ if (!sdp->sd_sb.sb_license_di.no_formal_ino) {
++ error = gfs_alloc_linode(sdp);
++ if (error)
++ return error;
++ }
++
++ error = gfs_glock_nq_num(sdp,
++ sdp->sd_sb.sb_license_di.no_formal_ino,
++ &gfs_inode_glops,
++ LM_ST_SHARED, GL_LOCAL_EXCL,
++ &i_gh);
++ if (error)
++ return error;
++
++ error = gfs_inode_get(i_gh.gh_gl, &sdp->sd_sb.sb_license_di,
++ CREATE, &sdp->sd_linode);
++
++ gfs_glock_dq_uninit(&i_gh);
++
++ return error;
++}
++
++/**
++ * gfs_make_fs_rw - Turn a RO FS into a RW one
++ * @sdp: the filesystem
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_make_fs_rw(struct gfs_sbd *sdp)
++{
++ struct gfs_glock *j_gl = sdp->sd_journal_gh.gh_gl;
++ struct gfs_holder t_gh;
++ struct gfs_log_header head;
++ int error;
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl,
++ LM_ST_SHARED,
++ GL_LOCAL_EXCL | GL_EXACT,
++ &t_gh);
++ if (error)
++ return error;
++
++ j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
++
++ error = gfs_find_jhead(sdp, &sdp->sd_jdesc, j_gl, &head);
++ if (error)
++ goto fail;
++
++ GFS_ASSERT_SBD(head.lh_flags & GFS_LOG_HEAD_UNMOUNT, sdp,);
++
++ /* Initialize the head-of-the-log state */
++ sdp->sd_sequence = head.lh_sequence;
++ sdp->sd_log_head = head.lh_first + 1;
++
++ error = gfs_recover_dump(sdp);
++ if (error)
++ goto fail;
++
++ set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
++ clear_bit(SDF_ROFS, &sdp->sd_flags);
++
++ set_bit(GLF_DIRTY, &j_gl->gl_flags);
++ gfs_log_dump(sdp, TRUE);
++
++ gfs_glock_dq_uninit(&t_gh);
++
++ return 0;
++
++ fail:
++ t_gh.gh_flags |= GL_NOCACHE;
++ gfs_glock_dq_uninit(&t_gh);
++
++ return error;
++}
++
++/**
++ * gfs_make_fs_ro - Turn a RW FS into a RO one
++ * @sdp: the filesystem
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_make_fs_ro(struct gfs_sbd *sdp)
++{
++ struct gfs_holder t_gh;
++ int error;
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl,
++ LM_ST_SHARED,
++ GL_LOCAL_EXCL | GL_EXACT | GL_NOCACHE,
++ &t_gh);
++ if (error)
++ return error;
++
++ gfs_sync_meta(sdp);
++ gfs_log_dump(sdp, TRUE);
++
++ error = gfs_log_shutdown(sdp);
++ if (error)
++ gfs_io_error(sdp);
++
++ set_bit(SDF_ROFS, &sdp->sd_flags);
++ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
++
++ gfs_glock_dq_uninit(&t_gh);
++
++ gfs_unlinked_cleanup(sdp);
++ gfs_quota_cleanup(sdp);
++
++ return error;
++}
++
++/**
++ * stat_gfs_async - Stat a filesystem using asynchronous locking
++ * @sdp: the filesystem
++ * @usage: the usage info that will be returned
++ * @interruptible: TRUE if we should look for signals.
++ *
++ * Any error (other than a signal) will cause this routine to fall back
++ * to the synchronous version.
++ *
++ * This really shouldn't busy wait like this.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++stat_gfs_async(struct gfs_sbd *sdp, struct gfs_usage *usage, int interruptible)
++{
++ struct gfs_rgrpd *rgd_next = gfs_rgrpd_get_first(sdp), *rgd;
++ struct gfs_holder *gha, *gh;
++ struct gfs_rgrp_lvb *rb;
++ unsigned int slots = sdp->sd_tune.gt_statfs_slots;
++ unsigned int x;
++ int done;
++ int error = 0, err;
++
++ gha = gmalloc(slots * sizeof(struct gfs_holder));
++ memset(gha, 0, slots * sizeof(struct gfs_holder));
++
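++ /* Keep up to "slots" asynchronous lock requests in flight at once,
++    harvesting each glock as its acquisition completes. */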
++ for (;;) {
++ done = TRUE;
++
++ for (x = 0; x < slots; x++) {
++ gh = gha + x;
++
++ if (gh->gh_gl && gfs_glock_poll(gh)) {
++ err = gfs_glock_wait(gh);
++ if (err) {
++ gfs_holder_uninit(gh);
++ error = err;
++ } else {
++ rgd = gl2rgd(gh->gh_gl);
++
++ rb = (struct gfs_rgrp_lvb *)rgd->rd_gl->gl_lvb;
++ if (gfs32_to_cpu(rb->rb_magic) == GFS_MAGIC &&
++ !test_bit(GLF_LVB_INVALID, &rgd->rd_gl->gl_flags)) {
++ usage->gu_total_blocks += rgd->rd_ri.ri_data;
++ usage->gu_free += gfs32_to_cpu(rb->rb_free);
++ usage->gu_used_dinode += gfs32_to_cpu(rb->rb_useddi);
++ usage->gu_free_dinode += gfs32_to_cpu(rb->rb_freedi);
++ usage->gu_used_meta += gfs32_to_cpu(rb->rb_usedmeta);
++ usage->gu_free_meta += gfs32_to_cpu(rb->rb_freemeta);
++ } else
++ error = -EINVAL;
++
++ gfs_glock_dq_uninit(gh);
++ }
++ }
++
++ if (gh->gh_gl)
++ done = FALSE;
++ else if (rgd_next && !error) {
++ gfs_glock_nq_init(rgd_next->rd_gl,
++ LM_ST_SHARED,
++ GL_LOCAL_EXCL | GL_SKIP | GL_ASYNC,
++ gh);
++ rgd_next = gfs_rgrpd_get_next(rgd_next);
++ done = FALSE;
++ }
++
++ if (interruptible && signal_pending(current))
++ error = -ERESTARTSYS;
++ }
++
++ if (done)
++ break;
++
++ yield();
++ }
++
++ kfree(gha);
++
++ return error;
++}
++
++/**
++ * gfs_stat_gfs - Do a statfs
++ * @sdp: the filesystem
++ * @usage: the usage structure
++ * @interruptible: Stop if there is a signal pending
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_stat_gfs(struct gfs_sbd *sdp, struct gfs_usage *usage, int interruptible)
++{
++ struct gfs_holder ri_gh, rgd_gh;
++ struct gfs_rgrpd *rgd;
++ struct gfs_rgrp_lvb *rb;
++ int error;
++
++ memset(usage, 0, sizeof(struct gfs_usage));
++ usage->gu_block_size = sdp->sd_sb.sb_bsize;
++
++ error = gfs_rindex_hold(sdp, &ri_gh);
++ if (error)
++ return error;
++
++ if (GFS_ASYNC_LM(sdp)) {
++ error = stat_gfs_async(sdp, usage, interruptible);
++ if (!error || error == -ERESTARTSYS)
++ goto out;
++
++ memset(usage, 0, sizeof(struct gfs_usage));
++ usage->gu_block_size = sdp->sd_sb.sb_bsize;
++ }
++
++ for (rgd = gfs_rgrpd_get_first(sdp);
++ rgd;
++ rgd = gfs_rgrpd_get_next(rgd)) {
++ for (;;) {
++ error = gfs_glock_nq_init(rgd->rd_gl,
++ LM_ST_SHARED,
++ GL_LOCAL_EXCL | GL_SKIP,
++ &rgd_gh);
++ if (error)
++ goto out;
++
++ rb = (struct gfs_rgrp_lvb *)rgd->rd_gl->gl_lvb;
++ if (gfs32_to_cpu(rb->rb_magic) == GFS_MAGIC &&
++ !test_bit(GLF_LVB_INVALID, &rgd->rd_gl->gl_flags)) {
++ usage->gu_total_blocks += rgd->rd_ri.ri_data;
++ usage->gu_free += gfs32_to_cpu(rb->rb_free);
++ usage->gu_used_dinode += gfs32_to_cpu(rb->rb_useddi);
++ usage->gu_free_dinode += gfs32_to_cpu(rb->rb_freedi);
++ usage->gu_used_meta += gfs32_to_cpu(rb->rb_usedmeta);
++ usage->gu_free_meta += gfs32_to_cpu(rb->rb_freemeta);
++
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ break;
++ } else {
++ gfs_glock_dq_uninit(&rgd_gh);
++
++ error = gfs_rgrp_lvb_init(rgd);
++ if (error)
++ goto out;
++ }
++ }
++
++ if (interruptible && signal_pending(current)) {
++ error = -ERESTARTSYS;
++ goto out;
++ }
++ }
++
++ out:
++ gfs_glock_dq_uninit(&ri_gh);
++
++ return error;
++}
++
++/**
++ * gfs_lock_fs_check_clean - Stop all writes to the FS and check that all journals are clean
++ * @sdp: the file system
++ * @state: the state to put the transaction lock into
++ * @t_gh: the hold on the transaction lock
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_lock_fs_check_clean(struct gfs_sbd *sdp, unsigned int state,
++ struct gfs_holder *t_gh)
++{
++ struct gfs_holder ji_gh, cl_gh;
++ struct gfs_log_header lh;
++ unsigned int x;
++ int error;
++
++ error = gfs_jindex_hold(sdp, &ji_gh);
++ if (error)
++ return error;
++
++ error = gfs_glock_nq_num(sdp,
++ GFS_CRAP_LOCK, &gfs_meta_glops,
++ LM_ST_SHARED, GL_NOCACHE,
++ &cl_gh);
++ if (error)
++ goto fail;
++
++ error = gfs_glock_nq_init(sdp->sd_trans_gl, state,
++ LM_FLAG_PRIORITY | GL_EXACT | GL_NOCACHE,
++ t_gh);
++ if (error)
++ goto fail_gunlock_craplock;
++
++ for (x = 0; x < sdp->sd_journals; x++) {
++ error = gfs_find_jhead(sdp, &sdp->sd_jindex[x],
++ cl_gh.gh_gl, &lh);
++ if (error)
++ goto fail_gunlock_trans;
++
++ if (!(lh.lh_flags & GFS_LOG_HEAD_UNMOUNT)) {
++ error = -EBUSY;
++ goto fail_gunlock_trans;
++ }
++ }
++
++ gfs_glock_dq_uninit(&cl_gh);
++ gfs_glock_dq_uninit(&ji_gh);
++
++ return 0;
++
++ fail_gunlock_trans:
++ gfs_glock_dq_uninit(t_gh);
++
++ fail_gunlock_craplock:
++ gfs_glock_dq_uninit(&cl_gh);
++
++ fail:
++ gfs_glock_dq_uninit(&ji_gh);
++
++ return error;
++}
++
++/**
++ * gfs_freeze_fs - freezes the file system
++ * @sdp: the file system
++ *
++ * This function flushes data and metadata for all machines by
++ * acquiring the transaction lock exclusively. All journals are
++ * ensured to be in a clean state as well.
++ *
++ * Returns: 0 on success, -EXXX on error
++ */
++
++int
++gfs_freeze_fs(struct gfs_sbd *sdp)
++{
++ int error = 0;
++
++ down(&sdp->sd_freeze_lock);
++
++ if (!sdp->sd_freeze_count++) {
++ error = gfs_lock_fs_check_clean(sdp, LM_ST_DEFERRED,
++ &sdp->sd_freeze_gh);
++ if (error)
++ sdp->sd_freeze_count--;
++ else
++ sdp->sd_freeze_gh.gh_owner = NULL;
++ }
++
++ up(&sdp->sd_freeze_lock);
++
++ return error;
++}
++
++/**
++ * gfs_unfreeze_fs - unfreezes the file system
++ * @sdp: the file system
++ *
++ * This function allows the file system to proceed by unlocking
++ * the exclusively held transaction lock. Other GFS nodes are
++ * now free to acquire the lock shared and go on with their lives.
++ *
++ */
++
++void
++gfs_unfreeze_fs(struct gfs_sbd *sdp)
++{
++ down(&sdp->sd_freeze_lock);
++
++ if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
++ gfs_glock_dq_uninit(&sdp->sd_freeze_gh);
++
++ up(&sdp->sd_freeze_lock);
++}
+diff -urN linux-orig/fs/gfs/super.h linux-patched/fs/gfs/super.h
+--- linux-orig/fs/gfs/super.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/super.h 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,53 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __SUPER_DOT_H__
++#define __SUPER_DOT_H__
++
++void gfs_init_tune_data(struct gfs_sbd *sdp);
++
++int gfs_check_sb(struct gfs_sbd *sdp, struct gfs_sb *sb, int silent);
++int gfs_read_sb(struct gfs_sbd *sdp, struct gfs_glock *gl, int silent);
++int gfs_do_upgrade(struct gfs_sbd *sdp, struct gfs_glock *sb_gl);
++
++static __inline__ unsigned int
++gfs_num_journals(struct gfs_sbd *sdp)
++{
++ unsigned int num;
++ down(&sdp->sd_jindex_lock);
++ num = sdp->sd_journals;
++ up(&sdp->sd_jindex_lock);
++ return num;
++}
++
++int gfs_jindex_hold(struct gfs_sbd *sdp, struct gfs_holder *ji_gh);
++void gfs_clear_journals(struct gfs_sbd *sdp);
++
++int gfs_get_jiinode(struct gfs_sbd *sdp);
++int gfs_get_riinode(struct gfs_sbd *sdp);
++int gfs_get_rootinode(struct gfs_sbd *sdp);
++int gfs_get_qinode(struct gfs_sbd *sdp);
++int gfs_get_linode(struct gfs_sbd *sdp);
++
++int gfs_make_fs_rw(struct gfs_sbd *sdp);
++int gfs_make_fs_ro(struct gfs_sbd *sdp);
++
++int gfs_stat_gfs(struct gfs_sbd *sdp, struct gfs_usage *usage,
++ int interruptible);
++
++int gfs_lock_fs_check_clean(struct gfs_sbd *sdp, unsigned int state,
++ struct gfs_holder *t_gh);
++int gfs_freeze_fs(struct gfs_sbd *sdp);
++void gfs_unfreeze_fs(struct gfs_sbd *sdp);
++
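++/*
++ * Freezes nest: every successful gfs_freeze_fs() must be paired with a
++ * gfs_unfreeze_fs().  Illustrative sketch (the snapshot helper is
++ * hypothetical):
++ *
++ *	if (!gfs_freeze_fs(sdp)) {
++ *		snapshot_quiesced_fs(sdp);
++ *		gfs_unfreeze_fs(sdp);
++ *	}
++ */
++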
++#endif /* __SUPER_DOT_H__ */
+diff -urN linux-orig/fs/gfs/trans.c linux-patched/fs/gfs/trans.c
+--- linux-orig/fs/gfs/trans.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/trans.c 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,410 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "dio.h"
++#include "glock.h"
++#include "log.h"
++#include "lops.h"
++#include "quota.h"
++#include "trans.h"
++#include "unlinked.h"
++
++/**
++ * gfs_trans_print - Print a transaction to the console
++ * @sdp: the filesystem
++ * @tr: The GFS transaction
++ * @where: situation of the transaction (passed through to LO_PRINT())
++ *
++ */
++
++void
++gfs_trans_print(struct gfs_sbd *sdp, struct gfs_trans *tr, unsigned int where)
++{
++ struct gfs_log_element *le;
++ struct list_head *tmp, *head;
++ unsigned int mblks = 0, eblks = 0;
++
++ LO_TRANS_SIZE(sdp, tr, &mblks, &eblks, NULL, NULL);
++
++ printk("Transaction: (%s, %u)\n", tr->tr_file, tr->tr_line);
++ printk(" tr_mblks_asked = %u, tr_eblks_asked = %u, tr_seg_reserved = %u\n",
++ tr->tr_mblks_asked, tr->tr_eblks_asked, tr->tr_seg_reserved);
++ printk(" mblks = %u, eblks = %u\n", mblks, eblks);
++ printk(" tr_flags = 0x%.8X\n", tr->tr_flags);
++
++ for (head = &tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ LO_PRINT(sdp, le, where);
++ }
++
++ printk("End Trans\n");
++}
++
++/**
++ * gfs_trans_begin_i - Prepare to start a transaction
++ * @sdp: The GFS superblock
++ * @meta_blocks: Reserve this many metadata blocks in the log
++ * @extra_blocks: Number of non-metadata blocks to reserve
++ *
++ * Allocate the struct gfs_trans. Do in-place and
++ * log reservations.
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++gfs_trans_begin_i(struct gfs_sbd *sdp,
++ unsigned int meta_blocks, unsigned int extra_blocks,
++ char *file, unsigned int line)
++{
++ struct gfs_trans *tr;
++ unsigned int blocks;
++ int error;
++
++ tr = gmalloc(sizeof(struct gfs_trans));
++ memset(tr, 0, sizeof(struct gfs_trans));
++
++ INIT_LIST_HEAD(&tr->tr_elements);
++ INIT_LIST_HEAD(&tr->tr_free_bufs);
++ INIT_LIST_HEAD(&tr->tr_free_bmem);
++ INIT_LIST_HEAD(&tr->tr_bufs);
++ INIT_LIST_HEAD(&tr->tr_ail_bufs);
++
++ tr->tr_file = file;
++ tr->tr_line = line;
++ tr->tr_t_gh = gfs_holder_get(sdp->sd_trans_gl, LM_ST_SHARED, 0);
++
++ error = gfs_glock_nq(tr->tr_t_gh);
++ if (error)
++ goto fail;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags)) {
++ tr->tr_t_gh->gh_flags |= GL_NOCACHE;
++ error = -EROFS;
++ goto fail_gunlock;
++ }
++
++ /* Do log reservation */
++
++ tr->tr_mblks_asked = meta_blocks;
++ tr->tr_eblks_asked = extra_blocks;
++
++ blocks = 1;
++ if (meta_blocks)
++ blocks += gfs_struct2blk(sdp, meta_blocks,
++ sizeof(struct gfs_block_tag)) +
++ meta_blocks;
++ blocks += extra_blocks;
++ tr->tr_seg_reserved = gfs_blk2seg(sdp, blocks);
++
++ error = gfs_log_reserve(sdp, tr->tr_seg_reserved, FALSE);
++ if (error)
++ goto fail_gunlock;
++
++ GFS_ASSERT_SBD(!current_transaction, sdp,);
++ current_transaction = tr;
++
++ return 0;
++
++ fail_gunlock:
++ gfs_glock_dq(tr->tr_t_gh);
++
++ fail:
++ gfs_holder_put(tr->tr_t_gh);
++ kfree(tr);
++
++ return error;
++}
++
++/**
++ * gfs_trans_end - End a transaction
++ * @sdp: The GFS superblock
++ *
++ * If buffers were actually added to the transaction,
++ * commit it.
++ */
++
++void
++gfs_trans_end(struct gfs_sbd *sdp)
++{
++ struct gfs_trans *tr;
++ struct gfs_holder *t_gh;
++ struct list_head *tmp, *head;
++ struct gfs_log_element *le;
++
++ tr = current_transaction;
++ GFS_ASSERT_SBD(tr, sdp,);
++ current_transaction = NULL;
++
++ t_gh = tr->tr_t_gh;
++ tr->tr_t_gh = NULL;
++
++ if (list_empty(&tr->tr_elements)) {
++ gfs_log_release(sdp, tr->tr_seg_reserved);
++ kfree(tr);
++
++ gfs_glock_dq(t_gh);
++ gfs_holder_put(t_gh);
++
++ return;
++ }
++
++ for (head = &tr->tr_elements, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ LO_TRANS_END(sdp, le);
++ }
++
++ gfs_log_commit(sdp, tr);
++
++ gfs_glock_dq(t_gh);
++ gfs_holder_put(t_gh);
++
++ if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
++ gfs_log_flush(sdp);
++}
++
++/**
++ * gfs_trans_add_gl - Add a glock to a transaction
++ * @gl: the glock
++ *
++ * Add the given glock to this process's transaction
++ */
++
++void
++gfs_trans_add_gl(struct gfs_glock *gl)
++{
++ if (!gl->gl_new_le.le_trans) {
++ GFS_ASSERT_GLOCK(gfs_glock_is_locked_by_me(gl) &&
++ gfs_glock_is_held_excl(gl), gl,);
++ gfs_glock_hold(gl); /* Released in glock_trans_end() */
++
++ set_bit(GLF_DIRTY, &gl->gl_flags);
++
++ LO_ADD(gl->gl_sbd, &gl->gl_new_le);
++ gl->gl_new_le.le_trans->tr_num_gl++;
++ }
++}
++
++/**
++ * gfs_trans_add_bh - Add a buffer to the current transaction
++ * @gl: the glock the buffer belongs to
++ * @bh: The buffer to add
++ *
++ * Add a buffer to the current transaction. The glock for the buffer
++ * should be held. This pins the buffer as well.
++ *
++ * Call this as many times as you want during transaction formation.
++ * It only does its work once.
++ *
++ */
++
++void
++gfs_trans_add_bh(struct gfs_glock *gl, struct buffer_head *bh)
++{
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ struct gfs_bufdata *bd;
++
++ bd = bh2bd(bh);
++ if (!bd) {
++ gfs_attach_bufdata(bh, gl);
++ bd = bh2bd(bh);
++ }
++
++ if (bd->bd_new_le.le_trans)
++ return;
++
++ gfs_meta_check(sdp, bh);
++
++ GFS_ASSERT_GLOCK(bd->bd_gl == gl, gl,);
++
++ if (!gl->gl_new_le.le_trans)
++ gfs_trans_add_gl(gl);
++
++ gfs_dpin(sdp, bh);
++
++ LO_ADD(sdp, &bd->bd_new_le);
++ bd->bd_new_le.le_trans->tr_num_buf++;
++}
++
++/**
++ * gfs_trans_add_unlinked - Add an unlinked/dealloced tag to the current transaction
++ * @sdp: the filesystem
++ * @type: the type of entry
++ * @inum: the inode number
++ *
++ * Returns: the unlinked structure
++ */
++
++struct gfs_unlinked *
++gfs_trans_add_unlinked(struct gfs_sbd *sdp, unsigned int type,
++ struct gfs_inum *inum)
++{
++ struct gfs_unlinked *ul;
++
++ ul = gfs_unlinked_get(sdp, inum, CREATE);
++
++ LO_ADD(sdp, &ul->ul_new_le);
++
++ switch (type) {
++ case GFS_LOG_DESC_IUL:
++ set_bit(ULF_NEW_UL, &ul->ul_flags);
++ ul->ul_new_le.le_trans->tr_num_iul++;
++ break;
++ case GFS_LOG_DESC_IDA:
++ clear_bit(ULF_NEW_UL, &ul->ul_flags);
++ ul->ul_new_le.le_trans->tr_num_ida++;
++ break;
++ default:
++ GFS_ASSERT_SBD(FALSE, sdp,);
++ break;
++ }
++
++ return ul;
++}
++
++/**
++ * gfs_trans_add_quota - Add quota changes to a transaction
++ * @sdp: the filesystem
++ * @change: The number of blocks allocated (positive) or freed (negative)
++ * @uid: the user ID doing the change
++ * @gid: the group ID doing the change
++ *
++ */
++
++void
++gfs_trans_add_quota(struct gfs_sbd *sdp, int64_t change,
++ uint32_t uid, uint32_t gid)
++{
++ struct gfs_trans *tr;
++ struct list_head *tmp, *head, *next;
++ struct gfs_log_element *le;
++ struct gfs_quota_le *ql;
++ int found_uid, found_gid;
++ int error;
++
++ if (!sdp->sd_tune.gt_quota_account)
++ return;
++
++ GFS_ASSERT_SBD(change, sdp,);
++
++ found_uid = (uid == NO_QUOTA_CHANGE);
++ found_gid = (gid == NO_QUOTA_CHANGE);
++
++ GFS_ASSERT_SBD(!found_uid || !found_gid, sdp,);
++
++ tr = current_transaction;
++ GFS_ASSERT_SBD(tr, sdp,);
++
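++ /* First try to merge the change into a quota log element already in
++    this transaction; anything still unmatched gets a fresh element in
++    the loop that follows. */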
++ for (head = &tr->tr_elements, tmp = head->next, next = tmp->next;
++ tmp != head;
++ tmp = next, next = next->next) {
++ le = list_entry(tmp, struct gfs_log_element, le_list);
++ if (le->le_ops != &gfs_quota_lops)
++ continue;
++
++ ql = container_of(le, struct gfs_quota_le, ql_le);
++
++ if (test_bit(QDF_USER, &ql->ql_data->qd_flags)) {
++ if (ql->ql_data->qd_id == uid) {
++ ql->ql_change += change;
++
++ spin_lock(&sdp->sd_quota_lock);
++ ql->ql_data->qd_change_new += change;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ list_del(&le->le_list);
++
++ if (ql->ql_change)
++ list_add(&le->le_list,
++ &tr->tr_elements);
++ else {
++ gfs_quota_put(sdp, ql->ql_data);
++ kfree(ql);
++ tr->tr_num_q--;
++ }
++
++ GFS_ASSERT_SBD(!found_uid, sdp,);
++ found_uid = TRUE;
++ if (found_gid)
++ break;
++ }
++ } else {
++ if (ql->ql_data->qd_id == gid) {
++ ql->ql_change += change;
++
++ spin_lock(&sdp->sd_quota_lock);
++ ql->ql_data->qd_change_new += change;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ list_del(&le->le_list);
++
++ if (ql->ql_change)
++ list_add(&le->le_list,
++ &tr->tr_elements);
++ else {
++ gfs_quota_put(sdp, ql->ql_data);
++ kfree(ql);
++ tr->tr_num_q--;
++ }
++
++ GFS_ASSERT_SBD(!found_gid, sdp,);
++ found_gid = TRUE;
++ if (found_uid)
++ break;
++ }
++ }
++ }
++
++ while (!found_uid || !found_gid) {
++ ql = gmalloc(sizeof(struct gfs_quota_le));
++ memset(ql, 0, sizeof(struct gfs_quota_le));
++
++ INIT_LE(&ql->ql_le, &gfs_quota_lops);
++
++ if (found_uid) {
++ error = gfs_quota_get(sdp, FALSE, gid,
++ NO_CREATE,
++ &ql->ql_data);
++ found_gid = TRUE;
++ } else {
++ error = gfs_quota_get(sdp, TRUE, uid,
++ NO_CREATE,
++ &ql->ql_data);
++ found_uid = TRUE;
++ }
++
++ GFS_ASSERT_SBD(!error && ql->ql_data, sdp,);
++
++ ql->ql_change = change;
++
++ spin_lock(&sdp->sd_quota_lock);
++ ql->ql_data->qd_change_new += change;
++ spin_unlock(&sdp->sd_quota_lock);
++
++ LO_ADD(sdp, &ql->ql_le);
++ tr->tr_num_q++;
++ }
++}
+diff -urN linux-orig/fs/gfs/trans.h linux-patched/fs/gfs/trans.h
+--- linux-orig/fs/gfs/trans.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/trans.h 2004-06-20 22:48:17.954944996 -0500
+@@ -0,0 +1,37 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __TRANS_DOT_H__
++#define __TRANS_DOT_H__
++
++#define TRANS_IS_NEW (53)
++#define TRANS_IS_INCORE (54)
++void gfs_trans_print(struct gfs_sbd *sdp, struct gfs_trans *tr,
++ unsigned int where);
++
++int gfs_trans_begin_i(struct gfs_sbd *sdp,
++ unsigned int meta_blocks, unsigned int extra_blocks,
++ char *file, unsigned int line);
++#define gfs_trans_begin(sdp, mb, eb) \
++gfs_trans_begin_i((sdp), (mb), (eb), __FILE__, __LINE__)
++
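++/*
++ * Illustrative call pattern (a sketch, not normative):
++ *
++ *	error = gfs_trans_begin(sdp, meta_blocks, extra_blocks);
++ *	if (error)
++ *		return error;
++ *	gfs_trans_add_bh(gl, bh);   (pin each buffer being modified)
++ *	gfs_trans_end(sdp);         (commits only if buffers were added)
++ */
++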
++void gfs_trans_end(struct gfs_sbd *sdp);
++
++void gfs_trans_add_gl(struct gfs_glock *gl);
++void gfs_trans_add_bh(struct gfs_glock *gl, struct buffer_head *bh);
++struct gfs_unlinked *gfs_trans_add_unlinked(struct gfs_sbd *sdp, unsigned int type,
++ struct gfs_inum *inum);
++void gfs_trans_add_quota(struct gfs_sbd *sdp, int64_t change, uint32_t uid,
++ uint32_t gid);
++
++#endif /* __TRANS_DOT_H__ */
+diff -urN linux-orig/fs/gfs/unlinked.c linux-patched/fs/gfs/unlinked.c
+--- linux-orig/fs/gfs/unlinked.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/unlinked.c 2004-06-20 22:48:17.955944714 -0500
+@@ -0,0 +1,427 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "inode.h"
++#include "log.h"
++#include "lops.h"
++#include "unlinked.h"
++
++/**
++ * gfs_unlinked_get - Get a structure to represent an unlinked inode
++ * @sdp: the filesystem
++ * @inum: the inode that's unlinked
++ * @create: if TRUE, create the structure, otherwise return NULL
++ *
++ * Returns: the structure, or NULL
++ */
++
++struct gfs_unlinked *
++gfs_unlinked_get(struct gfs_sbd *sdp, struct gfs_inum *inum, int create)
++{
++ struct gfs_unlinked *ul = NULL, *new_ul = NULL;
++ struct list_head *tmp, *head;
++
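++ /* Search under the spinlock; on a miss, drop the lock, allocate a
++    candidate outside it, and retry so insertion cannot race with
++    another CPU adding the same inum. */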
++ for (;;) {
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ for (head = &sdp->sd_unlinked_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ ul = list_entry(tmp, struct gfs_unlinked, ul_list);
++ if (gfs_inum_equal(&ul->ul_inum, inum)) {
++ ul->ul_count++;
++ break;
++ }
++ }
++
++ if (tmp == head)
++ ul = NULL;
++
++ if (!ul && new_ul) {
++ ul = new_ul;
++ list_add(&ul->ul_list, &sdp->sd_unlinked_list);
++ new_ul = NULL;
++ }
++
++ spin_unlock(&sdp->sd_unlinked_lock);
++
++ if (ul || !create) {
++ if (new_ul)
++ kfree(new_ul);
++ return ul;
++ }
++
++ new_ul = gmalloc(sizeof(struct gfs_unlinked));
++ memset(new_ul, 0, sizeof(struct gfs_unlinked));
++
++ new_ul->ul_count = 1;
++ new_ul->ul_inum = *inum;
++
++ INIT_LE(&new_ul->ul_new_le, &gfs_unlinked_lops);
++ INIT_LE(&new_ul->ul_incore_le, &gfs_unlinked_lops);
++ INIT_LE(&new_ul->ul_ondisk_le, &gfs_unlinked_lops);
++ }
++}
++
++/**
++ * gfs_unlinked_hold - increment the usage count on a struct gfs_unlinked
++ * @sdp: the filesystem
++ * @ul: the structure
++ *
++ */
++
++void
++gfs_unlinked_hold(struct gfs_sbd *sdp, struct gfs_unlinked *ul)
++{
++ spin_lock(&sdp->sd_unlinked_lock);
++ ul->ul_count++;
++ spin_unlock(&sdp->sd_unlinked_lock);
++}
++
++/**
++ * gfs_unlinked_put - decrement the usage count on a struct gfs_unlinked
++ * @sdp: the filesystem
++ * @ul: the structure
++ *
++ * Free the structure if its reference count hits zero.
++ *
++ */
++
++void
++gfs_unlinked_put(struct gfs_sbd *sdp, struct gfs_unlinked *ul)
++{
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ GFS_ASSERT_SBD(ul->ul_count, sdp,);
++ ul->ul_count--;
++
++ if (!ul->ul_count) {
++ GFS_ASSERT_SBD(!test_bit(ULF_IC_LIST, &ul->ul_flags) &&
++ !test_bit(ULF_OD_LIST, &ul->ul_flags) &&
++ !test_bit(ULF_LOCK, &ul->ul_flags),
++ sdp,);
++ list_del(&ul->ul_list);
++ spin_unlock(&sdp->sd_unlinked_lock);
++ kfree(ul);
++ } else
++ spin_unlock(&sdp->sd_unlinked_lock);
++}
++
++/**
++ * unlinked_find - Find an inode to try to deallocate
++ * @sdp: the filesystem
++ *
++ * The returned structure is locked and needs to be unlocked
++ * with gfs_unlinked_unlock().
++ *
++ * Returns: An unlinked structure, or NULL
++ */
++
++struct gfs_unlinked *
++unlinked_find(struct gfs_sbd *sdp)
++{
++ struct list_head *tmp, *head;
++ struct gfs_unlinked *ul = NULL;
++
++ if (test_bit(SDF_ROFS, &sdp->sd_flags))
++ return NULL;
++
++ gfs_log_lock(sdp);
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ if (!atomic_read(&sdp->sd_unlinked_ic_count))
++ goto out;
++
++ for (head = &sdp->sd_unlinked_list, tmp = head->next;
++ tmp != head;
++ tmp = tmp->next) {
++ ul = list_entry(tmp, struct gfs_unlinked, ul_list);
++
++ if (test_bit(ULF_LOCK, &ul->ul_flags))
++ continue;
++ if (!test_bit(ULF_IC_LIST, &ul->ul_flags))
++ continue;
++
++ list_move_tail(&ul->ul_list, &sdp->sd_unlinked_list);
++
++ set_bit(ULF_LOCK, &ul->ul_flags);
++ ul->ul_count++;
++
++ goto out;
++ }
++
++ ul = NULL;
++
++ out:
++ spin_unlock(&sdp->sd_unlinked_lock);
++ gfs_log_unlock(sdp);
++
++ return ul;
++}
++
++/**
++ * gfs_unlinked_lock - lock an unlinked structure
++ * @sdp: the filesystem
++ * @ul: the unlinked inode structure
++ *
++ */
++
++void
++gfs_unlinked_lock(struct gfs_sbd *sdp, struct gfs_unlinked *ul)
++{
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ GFS_ASSERT_SBD(!test_bit(ULF_LOCK, &ul->ul_flags), sdp,);
++ set_bit(ULF_LOCK, &ul->ul_flags);
++
++ ul->ul_count++;
++
++ spin_unlock(&sdp->sd_unlinked_lock);
++}
++
++/**
++ * gfs_unlinked_unlock - unlock and drop a reference on an unlinked structure
++ * @sdp: the filesystem
++ * @ul: the unlinked inode structure
++ *
++ */
++
++void
++gfs_unlinked_unlock(struct gfs_sbd *sdp, struct gfs_unlinked *ul)
++{
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ GFS_ASSERT_SBD(test_bit(ULF_LOCK, &ul->ul_flags), sdp,);
++ clear_bit(ULF_LOCK, &ul->ul_flags);
++
++ GFS_ASSERT_SBD(ul->ul_count, sdp,);
++ ul->ul_count--;
++
++ if (!ul->ul_count) {
++ GFS_ASSERT_SBD(!test_bit(ULF_IC_LIST, &ul->ul_flags) &&
++ !test_bit(ULF_OD_LIST, &ul->ul_flags), sdp,);
++ list_del(&ul->ul_list);
++ spin_unlock(&sdp->sd_unlinked_lock);
++ kfree(ul);
++ } else
++ spin_unlock(&sdp->sd_unlinked_lock);
++}
++
++/**
++ * gfs_unlinked_merge - add/remove an unlinked inode from the in-memory list
++ * @sdp: the filesystem
++ * @type: whether this is an unlink tag or a dealloc tag
++ * @inum: the inode number
++ *
++ */
++
++void
++gfs_unlinked_merge(struct gfs_sbd *sdp, unsigned int type,
++ struct gfs_inum *inum)
++{
++ struct gfs_unlinked *ul;
++
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_ic_count) ==
++ atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++
++ ul = gfs_unlinked_get(sdp, inum, CREATE);
++
++ gfs_log_lock(sdp);
++
++ switch (type) {
++ case GFS_LOG_DESC_IUL:
++ gfs_unlinked_hold(sdp, ul);
++ gfs_unlinked_hold(sdp, ul);
++ GFS_ASSERT_SBD(!test_bit(ULF_IC_LIST, &ul->ul_flags) &&
++ !test_bit(ULF_OD_LIST, &ul->ul_flags), sdp,);
++ set_bit(ULF_IC_LIST, &ul->ul_flags);
++ set_bit(ULF_OD_LIST, &ul->ul_flags);
++ atomic_inc(&sdp->sd_unlinked_ic_count);
++ atomic_inc(&sdp->sd_unlinked_od_count);
++
++ break;
++
++ case GFS_LOG_DESC_IDA:
++ GFS_ASSERT_SBD(test_bit(ULF_IC_LIST, &ul->ul_flags) &&
++ test_bit(ULF_OD_LIST, &ul->ul_flags), sdp,);
++ clear_bit(ULF_IC_LIST, &ul->ul_flags);
++ clear_bit(ULF_OD_LIST, &ul->ul_flags);
++ gfs_unlinked_put(sdp, ul);
++ gfs_unlinked_put(sdp, ul);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_ic_count), sdp,);
++ atomic_dec(&sdp->sd_unlinked_ic_count);
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++ atomic_dec(&sdp->sd_unlinked_od_count);
++
++ break;
++ }
++
++ gfs_log_unlock(sdp);
++
++ gfs_unlinked_put(sdp, ul);
++}
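++
++/*
++ * Reference-count bookkeeping in gfs_unlinked_merge(), spelled out:
++ * an IUL tag takes two extra holds, one for each of the ULF_IC_LIST
++ * and ULF_OD_LIST memberships; an IDA tag clears both flags and puts
++ * those two holds back.  The get/put pair bracketing the switch only
++ * covers the lookup itself.
++ */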
++
++/**
++ * gfs_unlinked_cleanup - get rid of any extra struct gfs_unlinked structures
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_unlinked_cleanup(struct gfs_sbd *sdp)
++{
++ struct gfs_unlinked *ul;
++
++ restart:
++ gfs_log_lock(sdp);
++
++ GFS_ASSERT_SBD(atomic_read(&sdp->sd_unlinked_ic_count) ==
++ atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++
++ spin_lock(&sdp->sd_unlinked_lock);
++
++ while (!list_empty(&sdp->sd_unlinked_list)) {
++ ul = list_entry(sdp->sd_unlinked_list.next,
++ struct gfs_unlinked, ul_list);
++
++ if (ul->ul_count > 2) {
++ spin_unlock(&sdp->sd_unlinked_lock);
++ gfs_log_unlock(sdp);
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ goto restart;
++ }
++ GFS_ASSERT_SBD(ul->ul_count == 2, sdp,);
++
++ GFS_ASSERT_SBD(test_bit(ULF_IC_LIST, &ul->ul_flags) &&
++ test_bit(ULF_OD_LIST, &ul->ul_flags) &&
++ !test_bit(ULF_LOCK, &ul->ul_flags), sdp,);
++
++ list_del(&ul->ul_list);
++
++ atomic_dec(&sdp->sd_unlinked_ic_count);
++ atomic_dec(&sdp->sd_unlinked_od_count);
++
++ spin_unlock(&sdp->sd_unlinked_lock);
++ kfree(ul);
++ spin_lock(&sdp->sd_unlinked_lock);
++ }
++
++ spin_unlock(&sdp->sd_unlinked_lock);
++
++ GFS_ASSERT_SBD(!atomic_read(&sdp->sd_unlinked_ic_count) &&
++ !atomic_read(&sdp->sd_unlinked_od_count), sdp,);
++
++ gfs_log_unlock(sdp);
++}
++
++/**
++ * gfs_unlinked_limit - limit the number of inodes waiting to be deallocated
++ * @sdp: the filesystem
++ *
++ */
++
++void
++gfs_unlinked_limit(struct gfs_sbd *sdp)
++{
++ unsigned int tries = 0, min = 0;
++ int error;
++
++ if (atomic_read(&sdp->sd_unlinked_ic_count) >=
++ sdp->sd_tune.gt_ilimit2) {
++ tries = sdp->sd_tune.gt_ilimit2_tries;
++ min = sdp->sd_tune.gt_ilimit2_min;
++ } else if (atomic_read(&sdp->sd_unlinked_ic_count) >=
++ sdp->sd_tune.gt_ilimit1) {
++ tries = sdp->sd_tune.gt_ilimit1_tries;
++ min = sdp->sd_tune.gt_ilimit1_min;
++ }
++
++ while (tries--) {
++ struct gfs_unlinked *ul = unlinked_find(sdp);
++ if (!ul)
++ break;
++
++ error = gfs_inode_dealloc(sdp, &ul->ul_inum);
++
++ gfs_unlinked_unlock(sdp, ul);
++
++ if (!error) {
++ if (!--min)
++ break;
++ } else if (error != 1)
++ break;
++ }
++}
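++
++/*
++ * The two thresholds above act as soft and hard watermarks: crossing
++ * gt_ilimit1 starts a mild pass (up to gt_ilimit1_tries attempts,
++ * stopping after gt_ilimit1_min successes), while crossing gt_ilimit2
++ * starts a more aggressive one.  With hypothetical tunables
++ * ilimit1 = 100 (tries 3, min 1) and ilimit2 = 500 (tries 10, min 3),
++ * an in-core count of 120 makes at most 3 attempts and stops after the
++ * first success; a count of 600 makes at most 10, stopping after 3.
++ */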
++
++/**
++ * gfs_unlinked_dealloc - Go through the list of inodes to be deallocated
++ * @sdp: the filesystem
++ *
++ * Any error other than -EROFS is reported via printk().
++ */
++
++void
++gfs_unlinked_dealloc(struct gfs_sbd *sdp)
++{
++ unsigned int hits, strikes;
++ int error;
++
++ for (;;) {
++ hits = 0;
++ strikes = 0;
++
++ for (;;) {
++ struct gfs_unlinked *ul = unlinked_find(sdp);
++ if (!ul)
++ return;
++
++ error = gfs_inode_dealloc(sdp, &ul->ul_inum);
++
++ gfs_unlinked_unlock(sdp, ul);
++
++ if (!error) {
++ hits++;
++ if (strikes)
++ strikes--;
++ } else if (error == 1) {
++ strikes++;
++ if (strikes >= atomic_read(&sdp->sd_unlinked_ic_count)) {
++ error = 0;
++ break;
++ }
++ } else
++ goto out;
++ }
++
++ if (!hits || !test_bit(SDF_INODED_RUN, &sdp->sd_flags))
++ break;
++
++ cond_resched();
++ }
++
++ out:
++ if (error && error != -EROFS)
++ printk("GFS: fsid=%s: error deallocating inodes: %d\n",
++ sdp->sd_fsname, error);
++}
+diff -urN linux-orig/fs/gfs/unlinked.h linux-patched/fs/gfs/unlinked.h
+--- linux-orig/fs/gfs/unlinked.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/unlinked.h 2004-06-20 22:48:17.955944714 -0500
+@@ -0,0 +1,32 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __UNLINKED_DOT_H__
++#define __UNLINKED_DOT_H__
++
++struct gfs_unlinked *gfs_unlinked_get(struct gfs_sbd *sdp,
++ struct gfs_inum *inum, int create);
++void gfs_unlinked_hold(struct gfs_sbd *sdp, struct gfs_unlinked *ul);
++void gfs_unlinked_put(struct gfs_sbd *sdp, struct gfs_unlinked *ul);
++
++void gfs_unlinked_lock(struct gfs_sbd *sdp, struct gfs_unlinked *ul);
++void gfs_unlinked_unlock(struct gfs_sbd *sdp, struct gfs_unlinked *ul);
++
++void gfs_unlinked_merge(struct gfs_sbd *sdp, unsigned int type,
++ struct gfs_inum *inum);
++void gfs_unlinked_cleanup(struct gfs_sbd *sdp);
++
++void gfs_unlinked_limit(struct gfs_sbd *sdp);
++void gfs_unlinked_dealloc(struct gfs_sbd *sdp);
++
++#endif /* __UNLINKED_DOT_H__ */
+diff -urN linux-orig/fs/gfs/util.c linux-patched/fs/gfs/util.c
+--- linux-orig/fs/gfs/util.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/util.c 2004-06-20 22:48:17.955944714 -0500
+@@ -0,0 +1,317 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/smp_lock.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/completion.h>
++#include <linux/buffer_head.h>
++
++#include "gfs.h"
++#include "glock.h"
++
++uint32_t gfs_random_number;
++
++volatile int gfs_in_panic = FALSE;
++
++kmem_cache_t *gfs_glock_cachep = NULL;
++kmem_cache_t *gfs_inode_cachep = NULL;
++kmem_cache_t *gfs_bufdata_cachep = NULL;
++kmem_cache_t *gfs_mhc_cachep = NULL;
++
++/**
++ * gfs_random - Generate a random 32-bit number
++ *
++ * Generate a semi-crappy 32-bit pseudo-random number without using
++ * floating point.
++ *
++ * The PRNG is from "Numerical Recipes in C" (second edition), page 284.
++ *
++ * Returns: a 32-bit random number
++ */
++
++uint32_t
++gfs_random(void)
++{
++ gfs_random_number = 0x0019660D * gfs_random_number + 0x3C6EF35F;
++ return gfs_random_number;
++}
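++
++/*
++ * For reference, the constants above are the "Numerical Recipes" LCG
++ * written in hex: 0x0019660D = 1664525 and 0x3C6EF35F = 1013904223, so
++ * the recurrence is
++ *
++ *	x[n+1] = (1664525 * x[n] + 1013904223) mod 2^32
++ *
++ * with the mod 2^32 falling out of the uint32_t arithmetic for free.
++ */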
++
++/**
++ * hash_more_internal - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ * @hash: the hash from a previous call
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * This is the 32-bit FNV-1a hash from:
++ * http://www.isthe.com/chongo/tech/comp/fnv/
++ *
++ * This is the shared core used by gfs_hash() and gfs_hash_more().
++ *
++ * Returns: the hash
++ */
++
++static __inline__ uint32_t
++hash_more_internal(const void *data, unsigned int len, uint32_t hash)
++{
++ unsigned char *p = (unsigned char *)data;
++ unsigned char *e = p + len;
++ uint32_t h = hash;
++
++ while (p < e) {
++ h ^= (uint32_t)(*p++);
++ h *= 0x01000193;
++ }
++
++ return h;
++}
++
++/**
++ * gfs_hash - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * This is the 32-bit FNV-1a hash from:
++ * http://www.isthe.com/chongo/tech/comp/fnv/
++ *
++ * Returns: the hash
++ */
++
++uint32_t
++gfs_hash(const void *data, unsigned int len)
++{
++ uint32_t h = 0x811C9DC5;
++ h = hash_more_internal(data, len, h);
++ return h;
++}
++
++/**
++ * gfs_hash_more - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ * @hash: the hash from a previous call
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * This is the 32-bit FNV-1a hash from:
++ * http://www.isthe.com/chongo/tech/comp/fnv/
++ *
++ * This version lets you hash together discontinuous regions.
++ * For example, to compute the combined hash of the memory in
++ * (data1, len1), (data2, len2), and (data3, len3) you:
++ *
++ * h = gfs_hash(data1, len1);
++ * h = gfs_hash_more(data2, len2, h);
++ * h = gfs_hash_more(data3, len3, h);
++ *
++ * Returns: the hash
++ */
++
++uint32_t
++gfs_hash_more(const void *data, unsigned int len, uint32_t hash)
++{
++ uint32_t h;
++ h = hash_more_internal(data, len, hash);
++ return h;
++}
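++
++/*
++ * A quick sanity sketch of the property documented above (userspace
++ * pseudo-test, not part of the kernel build): hashing a buffer in two
++ * pieces must give the same answer as hashing it in one go, because
++ * FNV-1a is a plain left-to-right fold over the bytes.
++ *
++ *	char buf[] = "abcdefgh";
++ *	uint32_t h1 = gfs_hash(buf, 8);
++ *	uint32_t h2 = gfs_hash_more(buf + 4, 4, gfs_hash(buf, 4));
++ *	assert(h1 == h2);
++ */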
++
++/**
++ * gfs_sort - Sort an array using the bubble sort algorithm
++ * @array: the input array
++ * @num: number of elements in array
++ * @size: size of each element in array
++ * @compare: function to compare array elements (returns negative for
++ *           less-than, 0 for equal, and positive for greater-than)
++ *
++ * Sorts the array passed in, using the compare function to order
++ * elements with the bubble sort algorithm.
++ */
++
++void
++gfs_sort(void *array, unsigned int num, unsigned int size,
++ int (*compare) (void *, void *))
++{
++ char buf[size];
++ char *p1, *p2;
++ int changed;
++ unsigned int x;
++
++ if (num <= 1)
++ return;
++
++ do {
++ changed = FALSE;
++ p1 = (char *)array;
++ p2 = (char *)array + size;
++
++ for (x = num - 1; x--;) {
++ if (compare(p1, p2) > 0) {
++ memcpy(buf, p1, size);
++ memcpy(p1, p2, size);
++ memcpy(p2, buf, size);
++ changed = TRUE;
++ }
++
++ p1 = p2;
++ p2 += size;
++ }
++ }
++ while (changed);
++}
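++
++/*
++ * An illustrative comparator for gfs_sort() (hypothetical, not used
++ * elsewhere in this patch): sorting an array of uint64_t block numbers
++ * into ascending order.
++ *
++ *	static int compare_blkno(void *a, void *b)
++ *	{
++ *		uint64_t x = *(uint64_t *)a, y = *(uint64_t *)b;
++ *		return (x < y) ? -1 : ((x > y) ? 1 : 0);
++ *	}
++ *
++ *	gfs_sort(blknos, count, sizeof(uint64_t), compare_blkno);
++ */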
++
++/**
++ * bitch_about - complain about something, but no more than once per gt_complain_secs
++ * @sdp: the filesystem
++ * @last: the last time we bitched
++ * @about: what to complain about
++ *
++ */
++
++void
++bitch_about(struct gfs_sbd *sdp, unsigned long *last, char *about)
++{
++ if (time_after_eq(jiffies, *last + sdp->sd_tune.gt_complain_secs * HZ)) {
++ printk("GFS: fsid=%s: %s by program \"%s\"\n",
++ sdp->sd_fsname, about, current->comm);
++ *last = jiffies;
++ }
++}
++
++/**
++ * gfs_assert_i - Stop the machine
++ * @assertion: the assertion that failed
++ * @type: the type of structure involved (GFS_ASSERT_TYPE_...)
++ * @ptr: the structure itself (NULL for GFS_ASSERT_TYPE_NONE)
++ * @file: the file that called us
++ * @line: the line number of the file that called us
++ *
++ * Don't do ENTER() and EXIT() here.
++ *
++ */
++
++void
++gfs_assert_i(char *assertion,
++ unsigned int type, void *ptr,
++ char *file, unsigned int line)
++{
++ gfs_in_panic = TRUE;
++
++ printk("\nGFS: Assertion failed on line %d of file %s\n"
++ "GFS: assertion: \"%s\"\n"
++ "GFS: time = %lu\n",
++ line, file, assertion, get_seconds());
++
++ switch (type) {
++ case GFS_ASSERT_TYPE_SBD:
++ {
++ struct gfs_sbd *sdp = (struct gfs_sbd *)ptr;
++ printk("GFS: fsid=%s\n", sdp->sd_fsname);
++ }
++ break;
++
++ case GFS_ASSERT_TYPE_GLOCK:
++ {
++ struct gfs_glock *gl = (struct gfs_glock *)ptr;
++ struct gfs_sbd *sdp = gl->gl_sbd;
++ printk("GFS: fsid=%s: glock = (%u, %"PRIu64")\n",
++ sdp->sd_fsname,
++ gl->gl_name.ln_type,
++ gl->gl_name.ln_number);
++ }
++ break;
++
++ case GFS_ASSERT_TYPE_INODE:
++ {
++ struct gfs_inode *ip = (struct gfs_inode *)ptr;
++ struct gfs_sbd *sdp = ip->i_sbd;
++ printk("GFS: fsid=%s: inode = %"PRIu64"/%"PRIu64"\n",
++ sdp->sd_fsname,
++ ip->i_num.no_formal_ino, ip->i_num.no_addr);
++ }
++ break;
++
++ case GFS_ASSERT_TYPE_RGRPD:
++ {
++ struct gfs_rgrpd *rgd = (struct gfs_rgrpd *)ptr;
++ struct gfs_sbd *sdp = rgd->rd_sbd;
++ printk("GFS: fsid=%s: rgroup = %"PRIu64"\n",
++ sdp->sd_fsname, rgd->rd_ri.ri_addr);
++ }
++ break;
++ }
++
++ printk("\n");
++#if 0
++ printk("GFS: Record message above and reboot.\n");
++ BUG();
++#endif
++ panic("GFS: Record message above and reboot.\n");
++}
++
++/**
++ * gfs_io_error_i - handle an I/O error
++ * @sdp: the filesystem
++ * @type: the type of structure the error happened on (GFS_IO_ERROR_TYPE_...)
++ * @ptr: the structure itself (can be NULL)
++ *
++ * This will do something other than panic, eventually.
++ *
++ */
++
++void gfs_io_error_i(struct gfs_sbd *sdp,
++ unsigned int type, void *ptr,
++ char *file, unsigned int line)
++{
++ switch (type) {
++ case GFS_IO_ERROR_TYPE_BH:
++ {
++ struct buffer_head *bh = (struct buffer_head *)ptr;
++ printk("GFS: fsid=%s: I/O error on block %"PRIu64"\n",
++ sdp->sd_fsname, (uint64_t)bh->b_blocknr);
++ }
++ break;
++
++ case GFS_IO_ERROR_TYPE_INODE:
++ {
++ struct gfs_inode *ip = (struct gfs_inode *)ptr;
++ printk("GFS: fsid=%s: I/O error in inode %"PRIu64"/%"PRIu64"\n",
++ sdp->sd_fsname,
++ ip->i_num.no_formal_ino, ip->i_num.no_addr);
++ }
++ break;
++
++ default:
++ printk("GFS: fsid=%s: I/O error\n", sdp->sd_fsname);
++ break;
++ }
++
++ GFS_ASSERT_SBD(FALSE, sdp,);
++}
++
++/**
++ * gmalloc - malloc a small amount of memory
++ * @size: the number of bytes to malloc
++ *
++ * Returns: the memory
++ */
++
++void *
++gmalloc(unsigned int size)
++{
++ void *p;
++ RETRY_MALLOC(p = kmalloc(size, GFP_KERNEL), p);
++ return p;
++}
++
+diff -urN linux-orig/fs/gfs/util.h linux-patched/fs/gfs/util.h
+--- linux-orig/fs/gfs/util.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs/util.h 2004-06-20 22:48:17.955944714 -0500
+@@ -0,0 +1,156 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __UTIL_DOT_H__
++#define __UTIL_DOT_H__
++
++
++/* Utility functions */
++
++extern uint32_t gfs_random_number;
++uint32_t gfs_random(void);
++
++uint32_t gfs_hash(const void *data, unsigned int len);
++uint32_t gfs_hash_more(const void *data, unsigned int len, uint32_t hash);
++
++void gfs_sort(void *array, unsigned int num, unsigned int size,
++ int (*compare) (void *, void *));
++
++void bitch_about(struct gfs_sbd *sdp, unsigned long *last, char *about);
++
++
++
++/* Assertion stuff */
++
++#define GFS_ASSERT_TYPE_NONE (18)
++#define GFS_ASSERT_TYPE_SBD (19)
++#define GFS_ASSERT_TYPE_GLOCK (20)
++#define GFS_ASSERT_TYPE_INODE (21)
++#define GFS_ASSERT_TYPE_RGRPD (22)
++
++#define GFS_ASSERT(x, todo) \
++do \
++{ \
++ if (!(x)) \
++ { \
++ {todo} \
++ gfs_assert_i(#x, GFS_ASSERT_TYPE_NONE, NULL, __FILE__, __LINE__); \
++ } \
++} \
++while (0)
++
++#define GFS_ASSERT_SBD(x, sdp, todo) \
++do \
++{ \
++ if (!(x)) \
++ { \
++ struct gfs_sbd *gfs_assert_sbd = (sdp); \
++ {todo} \
++ gfs_assert_i(#x, GFS_ASSERT_TYPE_SBD, gfs_assert_sbd, __FILE__, __LINE__); \
++ } \
++} \
++while (0)
++
++#define GFS_ASSERT_GLOCK(x, gl, todo) \
++do \
++{ \
++ if (!(x)) \
++ { \
++ struct gfs_glock *gfs_assert_glock = (gl); \
++ {todo} \
++ gfs_assert_i(#x, GFS_ASSERT_TYPE_GLOCK, gfs_assert_glock, __FILE__, __LINE__); \
++ } \
++} \
++while (0)
++
++#define GFS_ASSERT_INODE(x, ip, todo) \
++do \
++{ \
++ if (!(x)) \
++ { \
++ struct gfs_inode *gfs_assert_inode = (ip); \
++ {todo} \
++ gfs_assert_i(#x, GFS_ASSERT_TYPE_INODE, gfs_assert_inode, __FILE__, __LINE__); \
++ } \
++} \
++while (0)
++
++#define GFS_ASSERT_RGRPD(x, rgd, todo) \
++do \
++{ \
++ if (!(x)) \
++ { \
++ struct gfs_rgrpd *gfs_assert_rgrpd = (rgd); \
++ {todo} \
++ gfs_assert_i(#x, GFS_ASSERT_TYPE_RGRPD, gfs_assert_rgrpd, __FILE__, __LINE__); \
++ } \
++} \
++while (0)
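++
++/*
++ * Usage sketch for the assertion macros: the "todo" argument is a
++ * statement block run before gfs_assert_i() stops the machine.  It is
++ * usually empty, as in the callers in this patch:
++ *
++ *	GFS_ASSERT_SBD(ul->ul_count, sdp,);
++ *
++ * but it can carry a last-gasp action (dump_state() here is hypothetical):
++ *
++ *	GFS_ASSERT_SBD(error != -EIO, sdp, dump_state(sdp););
++ */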
++
++extern volatile int gfs_in_panic;
++void gfs_assert_i(char *assertion,
++ unsigned int type, void *ptr,
++ char *file, unsigned int line) __attribute__ ((noreturn));
++
++
++/* I/O error stuff */
++
++#define GFS_IO_ERROR_TYPE_NONE (118)
++#define GFS_IO_ERROR_TYPE_BH (119)
++#define GFS_IO_ERROR_TYPE_INODE (120)
++
++#define gfs_io_error(sdp) \
++gfs_io_error_i((sdp), GFS_IO_ERROR_TYPE_NONE, NULL, __FILE__, __LINE__)
++
++#define gfs_io_error_bh(sdp, bh) \
++do \
++{ \
++ struct buffer_head *gfs_io_error_bh = (bh); \
++ gfs_io_error_i((sdp), GFS_IO_ERROR_TYPE_BH, gfs_io_error_bh, __FILE__, __LINE__); \
++} \
++while (0)
++
++#define gfs_io_error_inode(ip) \
++do \
++{ \
++ struct gfs_inode *gfs_io_error_inode = (ip); \
++ gfs_io_error_i((ip)->i_sbd, GFS_IO_ERROR_TYPE_INODE, gfs_io_error_inode, __FILE__, __LINE__); \
++} \
++while (0)
++
++void gfs_io_error_i(struct gfs_sbd *sdp,
++ unsigned int type, void *ptr,
++ char *file, unsigned int line);
++
++
++/* Memory stuff */
++
++#define RETRY_MALLOC(do_this, until_this) \
++for (;;) \
++{ \
++ do { do_this; } while (0); \
++ if (until_this) \
++ break; \
++ printk("GFS: out of memory: %s, %u\n", __FILE__, __LINE__); \
++ yield();\
++}
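++
++/*
++ * RETRY_MALLOC expands to an endless retry loop: run "do_this", and if
++ * "until_this" is still false, complain, yield(), and try again.  The
++ * canonical user is gmalloc() in util.c:
++ *
++ *	RETRY_MALLOC(p = kmalloc(size, GFP_KERNEL), p);
++ *
++ * which loops until kmalloc() finally succeeds.
++ */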
++
++extern kmem_cache_t *gfs_glock_cachep;
++extern kmem_cache_t *gfs_inode_cachep;
++extern kmem_cache_t *gfs_bufdata_cachep;
++extern kmem_cache_t *gfs_mhc_cachep;
++
++void *gmalloc(unsigned int size);
++
++
++#endif /* __UTIL_DOT_H__ */
+diff -urN linux-orig/include/linux/gfs_ioctl.h linux-patched/include/linux/gfs_ioctl.h
+--- linux-orig/include/linux/gfs_ioctl.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/include/linux/gfs_ioctl.h 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,218 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __GFS_IOCTL_DOT_H__
++#define __GFS_IOCTL_DOT_H__
++
++#define GFS_IOCTL_VERSION (0)
++
++#define _GFSC_(x) (('G' << 8) | (x))
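++
++/* For example, _GFSC_(40) == ('G' << 8) | 40 == (0x47 << 8) | 0x28 == 0x4728. */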
++
++/*
++ Ioctls implemented
++
++ Reserved Ioctls: 3, 7, 8, 9, 10, 4, 13
++ Next Ioctl: 44
++ */
++
++#define GFS_STACK_PRINT _GFSC_(40)
++
++#define GFS_GET_META _GFSC_(31)
++#define GFS_FILE_STAT _GFSC_(30)
++
++#define GFS_SHRINK _GFSC_(5)
++
++#define GFS_GET_ARGS _GFSC_(29)
++#define GFS_GET_LOCKSTRUCT _GFSC_(39)
++#define GFS_GET_SUPER _GFSC_(19)
++#define GFS_JREAD _GFSC_(23)
++#define GFS_JWRITE _GFSC_(24)
++#define GFS_JSTAT _GFSC_(20)
++#define GFS_JTRUNC _GFSC_(33)
++
++#define GFS_LOCK_DUMP _GFSC_(11)
++
++#define GFS_STATGFS _GFSC_(12)
++
++#define GFS_FREEZE _GFSC_(14)
++#define GFS_UNFREEZE _GFSC_(15)
++
++#define GFS_RECLAIM_METADATA _GFSC_(16)
++
++#define GFS_QUOTA_SYNC _GFSC_(17)
++#define GFS_QUOTA_REFRESH _GFSC_(18)
++#define GFS_QUOTA_READ _GFSC_(32)
++
++#define GFS_GET_TUNE _GFSC_(21)
++#define GFS_SET_TUNE _GFSC_(22)
++
++#define GFS_EATTR_GET _GFSC_(26)
++#define GFS_EATTR_SET _GFSC_(27)
++
++#define GFS_WHERE_ARE_YOU _GFSC_(35)
++
++#define GFS_SET_FLAG _GFSC_(36)
++#define GFS_CLEAR_FLAG _GFSC_(37)
++
++#define GFS_GET_COUNTERS _GFSC_(43)
++
++#define GFS_FILE_FLUSH _GFSC_(42)
++
++struct gfs_user_buffer {
++ char *ub_data;
++ unsigned int ub_size;
++ unsigned int ub_count;
++};
++
++/* Structure for jread/jwrite */
++
++#define GFS_HIDDEN_JINDEX (0x10342345)
++#define GFS_HIDDEN_RINDEX (0x10342346)
++#define GFS_HIDDEN_QUOTA (0x10342347)
++#define GFS_HIDDEN_LICENSE (0x10342348)
++
++struct gfs_jio {
++ unsigned int jio_file;
++
++ uint32_t jio_size;
++ uint64_t jio_offset;
++ char *jio_data;
++
++ uint32_t jio_count;
++};
++
++/* Structure for better GFS-specific df */
++
++struct gfs_usage {
++ unsigned int gu_block_size;
++ uint64_t gu_total_blocks;
++ uint64_t gu_free;
++ uint64_t gu_used_dinode;
++ uint64_t gu_free_dinode;
++ uint64_t gu_used_meta;
++ uint64_t gu_free_meta;
++};
++
++struct gfs_reclaim_stats {
++ uint64_t rc_inodes;
++ uint64_t rc_metadata;
++};
++
++struct gfs_quota_name {
++ int qn_user;
++ uint32_t qn_id;
++};
++
++/*
++ * You can tune a filesystem, but you can't tune a yak.
++ */
++
++#define GFS_TUNE_VERSION ((GFS_IOCTL_VERSION << 16) | (138))
++
++struct gfs_tune {
++ unsigned int gt_tune_version;
++
++ unsigned int gt_ilimit1;
++ unsigned int gt_ilimit1_tries;
++ unsigned int gt_ilimit1_min;
++ unsigned int gt_ilimit2;
++ unsigned int gt_ilimit2_tries;
++ unsigned int gt_ilimit2_min;
++ unsigned int gt_demote_secs;
++ unsigned int gt_incore_log_blocks;
++ unsigned int gt_jindex_refresh_secs;
++ unsigned int gt_depend_secs;
++ unsigned int gt_scand_secs;
++ unsigned int gt_recoverd_secs;
++ unsigned int gt_logd_secs;
++ unsigned int gt_quotad_secs;
++ unsigned int gt_inoded_secs;
++ unsigned int gt_quota_simul_sync;
++ unsigned int gt_quota_warn_period;
++ unsigned int gt_atime_quantum;
++ unsigned int gt_quota_quantum;
++ unsigned int gt_quota_scale_num;
++ unsigned int gt_quota_scale_den;
++ unsigned int gt_quota_enforce;
++ unsigned int gt_quota_account;
++ unsigned int gt_new_files_jdata;
++ unsigned int gt_new_files_directio;
++ unsigned int gt_max_atomic_write;
++ unsigned int gt_max_readahead;
++ unsigned int gt_lockdump_size;
++ unsigned int gt_stall_secs;
++ unsigned int gt_complain_secs;
++ unsigned int gt_reclaim_limit;
++ unsigned int gt_entries_per_readdir;
++ unsigned int gt_prefetch_secs;
++ unsigned int gt_statfs_slots;
++ unsigned int gt_max_mhc;
++};
++
++/*
++ * Extended Attribute Ioctl structures
++ *
++ * Note: The name_len does not include a null character.
++ *
++ * Getting and setting EAs return the following errors that aren't
++ * what they seem
++ *
++ * ENODATA - No such extended attribute
++ * ERANGE - Extended attribute data is too large for the buffer
++ * ENOSPC - No space left for extended attributes
++ * EEXIST - Extended attribute already exists
++ */
++
++#define GFS_EACMD_SET (0)
++#define GFS_EACMD_CREATE (1)
++#define GFS_EACMD_REPLACE (2)
++#define GFS_EACMD_REMOVE (3)
++
++struct gfs_eaget_io {
++ char *eg_data;
++ char *eg_name;
++ char *eg_len;
++ uint32_t eg_data_len;
++ uint8_t eg_name_len;
++ uint8_t eg_type; /* GFS_EATYPE_... */
++};
++
++struct gfs_easet_io {
++ const char *es_data;
++ char *es_name;
++ uint16_t es_data_len;
++ uint8_t es_name_len; /* not counting the NULL */
++ uint8_t es_cmd; /* GFS_EACMD_... */
++ uint8_t es_type; /* GFS_EATYPE_... */
++};
++
++#define GFS_GLOCKD_DEFAULT (1)
++#define GFS_GLOCKD_MAX (32)
++
++struct gfs_args {
++ char ar_lockproto[256]; /* The name of the Lock Protocol */
++ char ar_locktable[256]; /* The name of the Lock Table */
++ char ar_hostdata[256]; /* The host specific data */
++
++ int ar_ignore_local_fs; /* Ignore the local_fs field in the struct lm_lockops */
++ int ar_localflocks; /* let the VFS do flock|fcntl locks for us */
++ int ar_localcaching; /* Local-style caching (dangerous on mulithost) */
++
++ int ar_upgrade; /* Upgrade ondisk/multihost format */
++
++ unsigned int ar_num_glockd;
++
++ int ar_posixacls; /* Enable posix acls */
++};
++
++#endif /* __GFS_IOCTL_DOT_H__ */
+diff -urN linux-orig/include/linux/gfs_ondisk.h linux-patched/include/linux/gfs_ondisk.h
+--- linux-orig/include/linux/gfs_ondisk.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/include/linux/gfs_ondisk.h 2004-06-20 22:48:17.949946404 -0500
+@@ -0,0 +1,1720 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++* NOTE:
++* If you add 8 byte fields to these structures, they must be 8 byte
++* aligned. 4 byte field must be 4 byte aligned, etc...
++*
++* All structures must be a multiple of 8 bytes long.
++*
++* GRIPES:
++* We should have forgotten about supporting 512B FS block sizes
++* and made the di_reserved field in the struct gfs_dinode structure
++* much bigger.
++*
++* de_rec_len in struct gfs_dirent should really have been a 32-bit value
++* as it now limits us to a 64k FS block size (with the current code
++* in dir.c).
++*/
++
++#ifndef __GFS_ONDISK_DOT_H__
++#define __GFS_ONDISK_DOT_H__
++
++#define GFS_MAGIC (0x01161970)
++#define GFS_BASIC_BLOCK (512)
++#define GFS_BASIC_BLOCK_SHIFT (9)
++#define GFS_DUMPS_PER_LOG (4)
++
++/* Lock numbers of the LM_TYPE_NONDISK type */
++
++#define GFS_MOUNT_LOCK (0)
++#define GFS_LIVE_LOCK (1)
++#define GFS_TRANS_LOCK (2)
++#define GFS_RENAME_LOCK (3)
++
++/* Format numbers for various metadata types */
++
++#define GFS_FORMAT_SB (100)
++#define GFS_FORMAT_RG (200)
++#define GFS_FORMAT_RB (300)
++#define GFS_FORMAT_DI (400)
++#define GFS_FORMAT_IN (500)
++#define GFS_FORMAT_LF (600)
++#define GFS_FORMAT_JD (700)
++#define GFS_FORMAT_LH (800)
++#define GFS_FORMAT_LD (900)
++/* These don't have actual struct gfs_meta_header structures to go with them */
++#define GFS_FORMAT_JI (1000)
++#define GFS_FORMAT_RI (1100)
++#define GFS_FORMAT_DE (1200)
++#define GFS_FORMAT_QU (1500)
++#define GFS_FORMAT_EA (1600)
++/* These are part of the superblock */
++#define GFS_FORMAT_FS (1309)
++#define GFS_FORMAT_MULTI (1401)
++
++/*
++ * An on-disk inode number
++ */
++
++#define gfs_inum_equal(ino1, ino2) \
++(((ino1)->no_formal_ino == (ino2)->no_formal_ino) && \
++ ((ino1)->no_addr == (ino2)->no_addr))
++
++struct gfs_inum {
++ uint64_t no_formal_ino;
++ uint64_t no_addr;
++};
++
++/*
++ * Generic metadata head structure
++ *
++ * Every inplace buffer logged in the journal must start with this.
++ */
++
++#define GFS_METATYPE_NONE (0)
++#define GFS_METATYPE_SB (1)
++#define GFS_METATYPE_RG (2)
++#define GFS_METATYPE_RB (3)
++#define GFS_METATYPE_DI (4)
++#define GFS_METATYPE_IN (5)
++#define GFS_METATYPE_LF (6)
++#define GFS_METATYPE_JD (7)
++#define GFS_METATYPE_LH (8)
++#define GFS_METATYPE_LD (9)
++#define GFS_METATYPE_EA (10)
++
++#define GFS_META_CLUMP (64)
++
++struct gfs_meta_header {
++ uint32_t mh_magic; /* Magic number */
++ uint32_t mh_type; /* GFS_METATYPE_XX */
++ uint64_t mh_generation; /* Generation number */
++ uint32_t mh_format; /* GFS_FORMAT_XX */
++ uint32_t mh_incarn;
++};
++
++/*
++ * super-block structure
++ *
++ * It's probably good if SIZEOF_SB <= GFS_BASIC_BLOCK
++ */
++
++/* Address of SuperBlock in GFS basic blocks */
++#define GFS_SB_ADDR (128)
++/* The lock number for the superblock (must be zero) */
++#define GFS_SB_LOCK (0)
++#define GFS_CRAP_LOCK (1)
++
++/* Requirement: GFS_LOCKNAME_LEN % 8 == 0
++ Includes: the fencing zero at the end */
++#define GFS_LOCKNAME_LEN (64)
++
++struct gfs_sb {
++ /* Order is important */
++ struct gfs_meta_header sb_header;
++
++ uint32_t sb_fs_format;
++ uint32_t sb_multihost_format;
++ uint32_t sb_flags;
++
++ /* Important information */
++ uint32_t sb_bsize; /* fundamental fs block size in bytes */
++ uint32_t sb_bsize_shift; /* log2(sb_bsize) */
++ uint32_t sb_seg_size; /* Journal segment size in FS blocks */
++
++ struct gfs_inum sb_jindex_di; /* journal index inode number (GFS_SB_LOCK) */
++ struct gfs_inum sb_rindex_di; /* resource index inode number (GFS_SB_LOCK) */
++ struct gfs_inum sb_root_di; /* root directory inode number (GFS_ROOT_LOCK) */
++
++ char sb_lockproto[GFS_LOCKNAME_LEN]; /* Type of locking this FS uses */
++ char sb_locktable[GFS_LOCKNAME_LEN]; /* Name of lock table for this FS */
++
++ struct gfs_inum sb_quota_di;
++ struct gfs_inum sb_license_di;
++
++ char sb_reserved[96];
++};
++
++/*
++ * journal index structure
++ */
++
++struct gfs_jindex {
++ uint64_t ji_addr; /* starting block of the journal */
++ uint32_t ji_nsegment; /* number of segments in journal */
++ uint32_t ji_pad;
++
++ char ji_reserved[64];
++};
++
++/*
++ * resource index structure
++ */
++
++struct gfs_rindex {
++ uint64_t ri_addr; /* rgrp block disk address */
++ uint32_t ri_length; /* length of rgrp header in fs blocks */
++ uint32_t ri_pad;
++
++ uint64_t ri_data1; /* first data location */
++ uint32_t ri_data; /* num of data blocks in rgrp */
++
++ uint32_t ri_bitbytes; /* number of bytes in data bitmaps */
++
++ char ri_reserved[64];
++};
++
++/*
++ * resource group header structure
++ *
++ */
++
++/* Number of blocks per byte in rgrp */
++#define GFS_NBBY (4)
++#define GFS_BIT_SIZE (2)
++#define GFS_BIT_MASK (0x00000003)
++
++#define GFS_BLKST_FREE (0)
++#define GFS_BLKST_USED (1)
++#define GFS_BLKST_FREEMETA (2)
++#define GFS_BLKST_USEDMETA (3)
++
++struct gfs_rgrp {
++ struct gfs_meta_header rg_header;
++
++ uint32_t rg_flags; /* flags */
++
++ uint32_t rg_free; /* number of free data blocks */
++
++ uint32_t rg_useddi; /* number of dinodes */
++ uint32_t rg_freedi; /* number of unused dinodes */
++ struct gfs_inum rg_freedi_list; /* list of free dinodes */
++
++ uint32_t rg_usedmeta; /* number of used metadata blocks (not including dinodes) */
++ uint32_t rg_freemeta; /* number of unused metadata blocks */
++
++ char rg_reserved[64];
++};
++
++/*
++ * Quota Structures
++ */
++
++struct gfs_quota {
++ uint64_t qu_limit;
++ uint64_t qu_warn;
++ int64_t qu_value;
++
++ char qu_reserved[64];
++};
++
++/*
++ * dinode structure
++ */
++
++#define GFS_MAX_META_HEIGHT (10)
++#define GFS_DIR_MAX_DEPTH (17)
++
++/* Dinode types */
++#define GFS_FILE_NON (0)
++#define GFS_FILE_REG (1)
++#define GFS_FILE_DIR (2)
++#define GFS_FILE_LNK (5)
++#define GFS_FILE_BLK (7)
++#define GFS_FILE_CHR (8)
++#define GFS_FILE_FIFO (101)
++#define GFS_FILE_SOCK (102)
++
++/* Dinode flags */
++#define GFS_DIF_JDATA (0x00000001)
++#define GFS_DIF_EXHASH (0x00000002)
++#define GFS_DIF_UNUSED (0x00000004)
++#define GFS_DIF_EA_INDIRECT (0x00000008)
++#define GFS_DIF_DIRECTIO (0x00000010)
++#define GFS_DIF_IMMUTABLE (0x00000020)
++#define GFS_DIF_APPENDONLY (0x00000040)
++#define GFS_DIF_NOATIME (0x00000080)
++#define GFS_DIF_SYNC (0x00000100)
++#define GFS_DIF_INHERIT_DIRECTIO (0x40000000)
++#define GFS_DIF_INHERIT_JDATA (0x80000000)
++
++struct gfs_dinode {
++ struct gfs_meta_header di_header;
++
++ struct gfs_inum di_num;
++
++ uint32_t di_mode; /* mode of file */
++ uint32_t di_uid; /* owner's user id */
++ uint32_t di_gid; /* owner's group id */
++ uint32_t di_nlink; /* number of links to this file */
++ uint64_t di_size; /* number of bytes in file */
++ uint64_t di_blocks; /* number of blocks in file */
++ int64_t di_atime; /* time last accessed */
++ int64_t di_mtime; /* time last modified */
++ int64_t di_ctime; /* time last changed */
++ uint32_t di_major; /* device major number */
++ uint32_t di_minor; /* device minor number */
++
++ uint64_t di_rgrp; /* dinode rgrp block number */
++ uint64_t di_goal_rgrp; /* rgrp to alloc from next */
++ uint32_t di_goal_dblk; /* data block goal */
++ uint32_t di_goal_mblk; /* metadata block goal */
++ uint32_t di_flags; /* flags */
++ uint32_t di_payload_format; /* struct gfs_rindex, struct gfs_jindex, or struct gfs_dirent */
++ uint16_t di_type; /* type of file */
++ uint16_t di_height; /* height of metadata */
++ uint32_t di_incarn; /* incarnation number */
++ uint16_t di_pad;
++
++ /* These only apply to directories */
++ uint16_t di_depth; /* Number of bits in the table */
++ uint32_t di_entries; /* The number of entries in the directory */
++
++ /* This only applies to unused inodes */
++ struct gfs_inum di_next_unused;
++
++ uint64_t di_eattr; /* extended attribute block number */
++
++ char di_reserved[56];
++};
++
++/*
++ * indirect block header
++ */
++
++struct gfs_indirect {
++ struct gfs_meta_header in_header;
++
++ char in_reserved[64];
++};
++
++/*
++ * directory structure - many of these per directory file
++ */
++
++#define GFS_FNAMESIZE (255)
++#define GFS_DIRENT_SIZE(name_len) ((sizeof(struct gfs_dirent) + (name_len) + 7) & ~7)
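++
++/*
++ * GFS_DIRENT_SIZE rounds the dirent-plus-name length up to the next
++ * multiple of 8.  With the 40-byte struct gfs_dirent below and a
++ * 5-character name, that is (40 + 5 + 7) & ~7 = 48 bytes.
++ */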
++
++struct gfs_dirent {
++ struct gfs_inum de_inum; /* Inode number */
++ uint32_t de_hash; /* hash of the filename */
++ uint16_t de_rec_len; /* the length of the dirent */
++ uint16_t de_name_len; /* the length of the name */
++ uint16_t de_type; /* type of dinode this points to */
++
++ char de_reserved[14];
++};
++
++/*
++ * Header of leaf directory nodes
++ */
++
++struct gfs_leaf {
++ struct gfs_meta_header lf_header;
++
++ uint16_t lf_depth; /* Depth of leaf */
++ uint16_t lf_entries; /* Number of dirents in leaf */
++ uint32_t lf_dirent_format; /* Format of the dirents */
++ uint64_t lf_next; /* Next leaf, if overflow */
++
++ char lf_reserved[64];
++};
++
++/*
++ * Log header structure
++ */
++
++#define GFS_LOG_HEAD_UNMOUNT (0x00000001)
++
++struct gfs_log_header {
++ struct gfs_meta_header lh_header;
++
++ uint32_t lh_flags; /* Flags */
++ uint32_t lh_pad;
++
++ uint64_t lh_first; /* Block number of first header in this trans */
++ uint64_t lh_sequence; /* Sequence number of this transaction */
++
++ uint64_t lh_tail; /* Block number of log tail */
++ uint64_t lh_last_dump; /* block number of last dump */
++
++ char lh_reserved[64];
++};
++
++/*
++ * Log type descriptor
++ */
++
++#define GFS_LOG_DESC_METADATA (300)
++/* ld_data1 is the number of metadata blocks in the descriptor.
++ ld_data2 is unused.
++ */
++
++#define GFS_LOG_DESC_IUL (400)
++/* ld_data1 is TRUE if this is a dump.
++ ld_data2 is unused.
++ FixMe!!! ld_data1 should be the number of entries.
++ ld_data2 should be "TRUE if this is a dump".
++ */
++
++#define GFS_LOG_DESC_IDA (401)
++/* ld_data1 is unused.
++ ld_data2 is unused.
++ FixMe!!! ld_data1 should be the number of entries.
++ */
++
++#define GFS_LOG_DESC_Q (402)
++/* ld_data1 is the number of quota changes in the descriptor.
++ ld_data2 is TRUE if this is a dump.
++ */
++
++#define GFS_LOG_DESC_LAST (500)
++/* ld_data1 is unused.
++ ld_data2 is unused.
++ */
++
++struct gfs_log_descriptor {
++ struct gfs_meta_header ld_header;
++
++ uint32_t ld_type; /* Type of data in this log chunk */
++ uint32_t ld_length; /* Number of buffers in this chunk */
++ uint32_t ld_data1; /* descriptor specific field */
++ uint32_t ld_data2; /* descriptor specific field */
++
++ char ld_reserved[64];
++};
++
++/*
++ * Metadata block tags
++ */
++
++struct gfs_block_tag {
++ uint64_t bt_blkno; /* inplace block number */
++ uint32_t bt_flags; /* flags */
++ uint32_t bt_pad;
++};
++
++/*
++ * Quota Journal Tag
++ */
++
++#define GFS_QTF_USER (0x00000001)
++
++struct gfs_quota_tag {
++ int64_t qt_change;
++ uint32_t qt_flags;
++ uint32_t qt_id;
++};
++
++/*
++ * Extended attribute header format
++ */
++
++#define GFS_EA_MAX_NAME_LEN (255)
++#define GFS_EA_MAX_DATA_LEN (65535)
++
++#define GFS_EATYPE_LAST (2)
++
++#define GFS_EATYPE_UNUSED (0)
++#define GFS_EATYPE_USR (1)
++#define GFS_EATYPE_SYS (2)
++#define GFS_EATYPE_VALID(x) ((x) && (x) <= GFS_EATYPE_LAST) /* this is only
++ for requests */
++
++#define GFS_EAFLAG_LAST (0x01) /* last ea in block */
++
++struct gfs_ea_header {
++ uint32_t ea_rec_len;
++ uint32_t ea_data_len;
++ uint8_t ea_name_len; /* the name is not NUL-terminated */
++ uint8_t ea_type; /* GFS_EATYPE_... */
++ uint8_t ea_flags;
++ uint8_t ea_num_ptrs;
++ uint32_t ea_pad;
++};
++
++/* Endian functions */
++
++#define GFS_ENDIAN_BIG
++
++#ifdef GFS_ENDIAN_BIG
++
++#define gfs16_to_cpu be16_to_cpu
++#define gfs32_to_cpu be32_to_cpu
++#define gfs64_to_cpu be64_to_cpu
++
++#define cpu_to_gfs16 cpu_to_be16
++#define cpu_to_gfs32 cpu_to_be32
++#define cpu_to_gfs64 cpu_to_be64
++
++#else /* GFS_ENDIAN_BIG */
++
++#define gfs16_to_cpu le16_to_cpu
++#define gfs32_to_cpu le32_to_cpu
++#define gfs64_to_cpu le64_to_cpu
++
++#define cpu_to_gfs16 cpu_to_le16
++#define cpu_to_gfs32 cpu_to_le32
++#define cpu_to_gfs64 cpu_to_le64
++
++#endif /* GFS_ENDIAN_BIG */
++
++/* Translation functions */
++
++void gfs_inum_in(struct gfs_inum *no, char *buf);
++void gfs_inum_out(struct gfs_inum *no, char *buf);
++void gfs_meta_header_in(struct gfs_meta_header *mh, char *buf);
++void gfs_meta_header_out(struct gfs_meta_header *mh, char *buf);
++void gfs_sb_in(struct gfs_sb *sb, char *buf);
++void gfs_sb_out(struct gfs_sb *sb, char *buf);
++void gfs_jindex_in(struct gfs_jindex *jindex, char *buf);
++void gfs_jindex_out(struct gfs_jindex *jindex, char *buf);
++void gfs_rindex_in(struct gfs_rindex *rindex, char *buf);
++void gfs_rindex_out(struct gfs_rindex *rindex, char *buf);
++void gfs_rgrp_in(struct gfs_rgrp *rgrp, char *buf);
++void gfs_rgrp_out(struct gfs_rgrp *rgrp, char *buf);
++void gfs_quota_in(struct gfs_quota *quota, char *buf);
++void gfs_quota_out(struct gfs_quota *quota, char *buf);
++void gfs_dinode_in(struct gfs_dinode *dinode, char *buf);
++void gfs_dinode_out(struct gfs_dinode *dinode, char *buf);
++void gfs_indirect_in(struct gfs_indirect *indirect, char *buf);
++void gfs_indirect_out(struct gfs_indirect *indirect, char *buf);
++void gfs_dirent_in(struct gfs_dirent *dirent, char *buf);
++void gfs_dirent_out(struct gfs_dirent *dirent, char *buf);
++void gfs_leaf_in(struct gfs_leaf *leaf, char *buf);
++void gfs_leaf_out(struct gfs_leaf *leaf, char *buf);
++void gfs_log_header_in(struct gfs_log_header *head, char *buf);
++void gfs_log_header_out(struct gfs_log_header *head, char *buf);
++void gfs_desc_in(struct gfs_log_descriptor *desc, char *buf);
++void gfs_desc_out(struct gfs_log_descriptor *desc, char *buf);
++void gfs_block_tag_in(struct gfs_block_tag *btag, char *buf);
++void gfs_block_tag_out(struct gfs_block_tag *btag, char *buf);
++void gfs_quota_tag_in(struct gfs_quota_tag *qtag, char *buf);
++void gfs_quota_tag_out(struct gfs_quota_tag *qtag, char *buf);
++void gfs_ea_header_in(struct gfs_ea_header *qtag, char *buf);
++void gfs_ea_header_out(struct gfs_ea_header *qtag, char *buf);
++
++/* Printing functions */
++
++void gfs_inum_print(struct gfs_inum *no);
++void gfs_meta_header_print(struct gfs_meta_header *mh);
++void gfs_sb_print(struct gfs_sb *sb);
++void gfs_jindex_print(struct gfs_jindex *jindex);
++void gfs_rindex_print(struct gfs_rindex *rindex);
++void gfs_rgrp_print(struct gfs_rgrp *rgrp);
++void gfs_quota_print(struct gfs_quota *quota);
++void gfs_dinode_print(struct gfs_dinode *dinode);
++void gfs_indirect_print(struct gfs_indirect *indirect);
++void gfs_dirent_print(struct gfs_dirent *dirent, char *name);
++void gfs_leaf_print(struct gfs_leaf *leaf);
++void gfs_log_header_print(struct gfs_log_header *head);
++void gfs_desc_print(struct gfs_log_descriptor *desc);
++void gfs_block_tag_print(struct gfs_block_tag *tag);
++void gfs_quota_tag_print(struct gfs_quota_tag *tag);
++void gfs_ea_header_print(struct gfs_ea_header *tag);
++
++/* The hash function for ExHash directories */
++
++uint32_t gfs_dir_hash(const char *data, int len);
++
++#endif /* __GFS_ONDISK_DOT_H__ */
++
++
++
++#ifdef WANT_GFS_CONVERSION_FUNCTIONS
++
++#define CPIN_08(s1, s2, member, count) {memcpy((s1->member), (s2->member), (count));}
++#define CPOUT_08(s1, s2, member, count) {memcpy((s2->member), (s1->member), (count));}
++#define CPIN_16(s1, s2, member) {(s1->member) = gfs16_to_cpu((s2->member));}
++#define CPOUT_16(s1, s2, member) {(s2->member) = cpu_to_gfs16((s1->member));}
++#define CPIN_32(s1, s2, member) {(s1->member) = gfs32_to_cpu((s2->member));}
++#define CPOUT_32(s1, s2, member) {(s2->member) = cpu_to_gfs32((s1->member));}
++#define CPIN_64(s1, s2, member) {(s1->member) = gfs64_to_cpu((s2->member));}
++#define CPOUT_64(s1, s2, member) {(s2->member) = cpu_to_gfs64((s1->member));}
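++
++/*
++ * These copy-in/copy-out macros expand to plain endian-converted member
++ * copies; for example CPIN_32(sb, str, sb_bsize) becomes
++ *
++ *	{(sb->sb_bsize) = gfs32_to_cpu((str->sb_bsize));}
++ */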
++
++#define pa(struct, member, count) print_array(#member, struct->member, count);
++
++/**
++ * print_array - Print out an array of bytes
++ * @title: what to print before the array
++ * @buf: the array
++ * @count: the number of bytes
++ *
++ */
++
++static void
++print_array(char *title, char *buf, int count)
++{
++ int x;
++
++ printk(" %s =\n", title);
++ for (x = 0; x < count; x++) {
++ printk("%.2X ", (unsigned char)buf[x]);
++ if (x % 16 == 15)
++ printk("\n");
++ }
++ if (x % 16)
++ printk("\n");
++}
++
++/**
++ * gfs_inum_in - Read in an inode number
++ * @no: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_inum_in(struct gfs_inum *no, char *buf)
++{
++ struct gfs_inum *str = (struct gfs_inum *)buf;
++
++ CPIN_64(no, str, no_formal_ino);
++ CPIN_64(no, str, no_addr);
++}
++
++/**
++ * gfs_inum_out - Write out an inode number
++ * @no: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_inum_out(struct gfs_inum *no, char *buf)
++{
++ struct gfs_inum *str = (struct gfs_inum *)buf;
++
++ CPOUT_64(no, str, no_formal_ino);
++ CPOUT_64(no, str, no_addr);
++}
++
++/**
++ * gfs_inum_print - Print out an inode number
++ * @no: the cpu-order buffer
++ *
++ */
++
++void
++gfs_inum_print(struct gfs_inum *no)
++{
++ pv(no, no_formal_ino, "%"PRIu64);
++ pv(no, no_addr, "%"PRIu64);
++}
++
++/**
++ * gfs_meta_header_in - Read in a metadata header
++ * @mh: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_meta_header_in(struct gfs_meta_header *mh, char *buf)
++{
++ struct gfs_meta_header *str = (struct gfs_meta_header *)buf;
++
++ CPIN_32(mh, str, mh_magic);
++ CPIN_32(mh, str, mh_type);
++ CPIN_64(mh, str, mh_generation);
++ CPIN_32(mh, str, mh_format);
++ CPIN_32(mh, str, mh_incarn);
++}
++
++/**
++ * gfs_meta_header_out - Write out a metadata header
++ * @mh: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ * Don't ever change the generation number in this routine.
++ * It's done manually in increment_generation().
++ */
++
++void
++gfs_meta_header_out(struct gfs_meta_header *mh, char *buf)
++{
++ struct gfs_meta_header *str = (struct gfs_meta_header *)buf;
++
++ CPOUT_32(mh, str, mh_magic);
++ CPOUT_32(mh, str, mh_type);
++#if 0
++ /* Don't do this!
++ mh_generation should only be changed manually. */
++ CPOUT_64(mh, str, mh_generation);
++#endif
++ CPOUT_32(mh, str, mh_format);
++ CPOUT_32(mh, str, mh_incarn);
++}
++
++/**
++ * gfs_meta_header_print - Print out a metadata header
++ * @mh: the cpu-order buffer
++ *
++ */
++
++void
++gfs_meta_header_print(struct gfs_meta_header *mh)
++{
++ pv(mh, mh_magic, "0x%.8X");
++ pv(mh, mh_type, "%u");
++ pv(mh, mh_generation, "%"PRIu64);
++ pv(mh, mh_format, "%u");
++ pv(mh, mh_incarn, "%u");
++}
++
++/**
++ * gfs_sb_in - Read in a superblock
++ * @sb: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_sb_in(struct gfs_sb *sb, char *buf)
++{
++ struct gfs_sb *str = (struct gfs_sb *)buf;
++
++ gfs_meta_header_in(&sb->sb_header, buf);
++
++ CPIN_32(sb, str, sb_fs_format);
++ CPIN_32(sb, str, sb_multihost_format);
++ CPIN_32(sb, str, sb_flags);
++
++ CPIN_32(sb, str, sb_bsize);
++ CPIN_32(sb, str, sb_bsize_shift);
++ CPIN_32(sb, str, sb_seg_size);
++
++ gfs_inum_in(&sb->sb_jindex_di, (char *)&str->sb_jindex_di);
++ gfs_inum_in(&sb->sb_rindex_di, (char *)&str->sb_rindex_di);
++ gfs_inum_in(&sb->sb_root_di, (char *)&str->sb_root_di);
++
++ CPIN_08(sb, str, sb_lockproto, GFS_LOCKNAME_LEN);
++ CPIN_08(sb, str, sb_locktable, GFS_LOCKNAME_LEN);
++
++ gfs_inum_in(&sb->sb_quota_di, (char *)&str->sb_quota_di);
++ gfs_inum_in(&sb->sb_license_di, (char *)&str->sb_license_di);
++
++ CPIN_08(sb, str, sb_reserved, 96);
++}
++
++/**
++ * gfs_sb_out - Write out a superblock
++ * @sb: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_sb_out(struct gfs_sb *sb, char *buf)
++{
++ struct gfs_sb *str = (struct gfs_sb *)buf;
++
++ gfs_meta_header_out(&sb->sb_header, buf);
++
++ CPOUT_32(sb, str, sb_fs_format);
++ CPOUT_32(sb, str, sb_multihost_format);
++ CPOUT_32(sb, str, sb_flags);
++
++ CPOUT_32(sb, str, sb_bsize);
++ CPOUT_32(sb, str, sb_bsize_shift);
++ CPOUT_32(sb, str, sb_seg_size);
++
++ gfs_inum_out(&sb->sb_jindex_di, (char *)&str->sb_jindex_di);
++ gfs_inum_out(&sb->sb_rindex_di, (char *)&str->sb_rindex_di);
++ gfs_inum_out(&sb->sb_root_di, (char *)&str->sb_root_di);
++
++ CPOUT_08(sb, str, sb_lockproto, GFS_LOCKNAME_LEN);
++ CPOUT_08(sb, str, sb_locktable, GFS_LOCKNAME_LEN);
++
++ gfs_inum_out(&sb->sb_quota_di, (char *)&str->sb_quota_di);
++ gfs_inum_out(&sb->sb_license_di, (char *)&str->sb_license_di);
++
++ CPOUT_08(sb, str, sb_reserved, 96);
++}
++
++/**
++ * gfs_sb_print - Print out a superblock
++ * @sb: the cpu-order buffer
++ *
++ */
++
++void
++gfs_sb_print(struct gfs_sb *sb)
++{
++ gfs_meta_header_print(&sb->sb_header);
++
++ pv(sb, sb_fs_format, "%u");
++ pv(sb, sb_multihost_format, "%u");
++ pv(sb, sb_flags, "%u");
++
++ pv(sb, sb_bsize, "%u");
++ pv(sb, sb_bsize_shift, "%u");
++ pv(sb, sb_seg_size, "%u");
++
++ gfs_inum_print(&sb->sb_jindex_di);
++ gfs_inum_print(&sb->sb_rindex_di);
++ gfs_inum_print(&sb->sb_root_di);
++
++ pv(sb, sb_lockproto, "%s");
++ pv(sb, sb_locktable, "%s");
++
++ gfs_inum_print(&sb->sb_quota_di);
++ gfs_inum_print(&sb->sb_license_di);
++
++ pa(sb, sb_reserved, 96);
++}
++
++/**
++ * gfs_jindex_in - Read in a journal index structure
++ * @jindex: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_jindex_in(struct gfs_jindex *jindex, char *buf)
++{
++ struct gfs_jindex *str = (struct gfs_jindex *)buf;
++
++ CPIN_64(jindex, str, ji_addr);
++ CPIN_32(jindex, str, ji_nsegment);
++ CPIN_32(jindex, str, ji_pad);
++
++ CPIN_08(jindex, str, ji_reserved, 64);
++}
++
++/**
++ * gfs_jindex_out - Write out a journal index structure
++ * @jindex: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_jindex_out(struct gfs_jindex *jindex, char *buf)
++{
++ struct gfs_jindex *str = (struct gfs_jindex *)buf;
++
++ CPOUT_64(jindex, str, ji_addr);
++ CPOUT_32(jindex, str, ji_nsegment);
++ CPOUT_32(jindex, str, ji_pad);
++
++ CPOUT_08(jindex, str, ji_reserved, 64);
++}
++
++/**
++ * gfs_jindex_print - Print out a journal index structure
++ * @ji: the cpu-order buffer
++ *
++ */
++
++void
++gfs_jindex_print(struct gfs_jindex *ji)
++{
++ pv(ji, ji_addr, "%"PRIu64);
++ pv(ji, ji_nsegment, "%u");
++ pv(ji, ji_pad, "%u");
++
++ pa(ji, ji_reserved, 64);
++}
++
++/**
++ * gfs_rindex_in - Read in a resource index structure
++ * @rindex: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_rindex_in(struct gfs_rindex *rindex, char *buf)
++{
++ struct gfs_rindex *str = (struct gfs_rindex *)buf;
++
++ CPIN_64(rindex, str, ri_addr);
++ CPIN_32(rindex, str, ri_length);
++ CPIN_32(rindex, str, ri_pad);
++
++ CPIN_64(rindex, str, ri_data1);
++ CPIN_32(rindex, str, ri_data);
++
++ CPIN_32(rindex, str, ri_bitbytes);
++
++ CPIN_08(rindex, str, ri_reserved, 64);
++}
++
++/**
++ * gfs_rindex_out - Write out a resource index structure
++ * @rindex: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_rindex_out(struct gfs_rindex *rindex, char *buf)
++{
++ struct gfs_rindex *str = (struct gfs_rindex *)buf;
++
++ CPOUT_64(rindex, str, ri_addr);
++ CPOUT_32(rindex, str, ri_length);
++ CPOUT_32(rindex, str, ri_pad);
++
++ CPOUT_64(rindex, str, ri_data1);
++ CPOUT_32(rindex, str, ri_data);
++
++ CPOUT_32(rindex, str, ri_bitbytes);
++
++ CPOUT_08(rindex, str, ri_reserved, 64);
++}
++
++/**
++ * gfs_rindex_print - Print out a resource index structure
++ * @ri: the cpu-order buffer
++ *
++ */
++
++void
++gfs_rindex_print(struct gfs_rindex *ri)
++{
++ pv(ri, ri_addr, "%"PRIu64);
++ pv(ri, ri_length, "%u");
++ pv(ri, ri_pad, "%u");
++
++ pv(ri, ri_data1, "%"PRIu64);
++ pv(ri, ri_data, "%u");
++
++ pv(ri, ri_bitbytes, "%u");
++
++ pa(ri, ri_reserved, 64);
++}
++
++/**
++ * gfs_rgrp_in - Read in a resource group header
++ * @rgrp: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_rgrp_in(struct gfs_rgrp *rgrp, char *buf)
++{
++ struct gfs_rgrp *str = (struct gfs_rgrp *)buf;
++
++ gfs_meta_header_in(&rgrp->rg_header, buf);
++
++ CPIN_32(rgrp, str, rg_flags);
++
++ CPIN_32(rgrp, str, rg_free);
++
++ CPIN_32(rgrp, str, rg_useddi);
++ CPIN_32(rgrp, str, rg_freedi);
++ gfs_inum_in(&rgrp->rg_freedi_list, (char *)&str->rg_freedi_list);
++
++ CPIN_32(rgrp, str, rg_usedmeta);
++ CPIN_32(rgrp, str, rg_freemeta);
++
++ CPIN_08(rgrp, str, rg_reserved, 64);
++}
++
++/**
++ * gfs_rgrp_out - Write out a resource group header
++ * @rgrp: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_rgrp_out(struct gfs_rgrp *rgrp, char *buf)
++{
++ struct gfs_rgrp *str = (struct gfs_rgrp *)buf;
++
++ gfs_meta_header_out(&rgrp->rg_header, buf);
++
++ CPOUT_32(rgrp, str, rg_flags);
++
++ CPOUT_32(rgrp, str, rg_free);
++
++ CPOUT_32(rgrp, str, rg_useddi);
++ CPOUT_32(rgrp, str, rg_freedi);
++ gfs_inum_out(&rgrp->rg_freedi_list, (char *)&str->rg_freedi_list);
++
++ CPOUT_32(rgrp, str, rg_usedmeta);
++ CPOUT_32(rgrp, str, rg_freemeta);
++
++ CPOUT_08(rgrp, str, rg_reserved, 64);
++}
++
++/**
++ * gfs_rgrp_print - Print out a resource group header
++ * @rg: the cpu-order buffer
++ *
++ */
++
++void
++gfs_rgrp_print(struct gfs_rgrp *rg)
++{
++ gfs_meta_header_print(&rg->rg_header);
++
++ pv(rg, rg_flags, "%u");
++
++ pv(rg, rg_free, "%u");
++
++ pv(rg, rg_useddi, "%u");
++ pv(rg, rg_freedi, "%u");
++ gfs_inum_print(&rg->rg_freedi_list);
++
++ pv(rg, rg_usedmeta, "%u");
++ pv(rg, rg_freemeta, "%u");
++
++ pa(rg, rg_reserved, 64);
++}
++
++/**
++ * gfs_quota_in - Read in a quota structure
++ * @quota: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_quota_in(struct gfs_quota *quota, char *buf)
++{
++ struct gfs_quota *str = (struct gfs_quota *)buf;
++
++ CPIN_64(quota, str, qu_limit);
++ CPIN_64(quota, str, qu_warn);
++ CPIN_64(quota, str, qu_value);
++
++ CPIN_08(quota, str, qu_reserved, 64);
++}
++
++/**
++ * gfs_quota_out - Write out a quota structure
++ * @quota: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_quota_out(struct gfs_quota *quota, char *buf)
++{
++ struct gfs_quota *str = (struct gfs_quota *)buf;
++
++ CPOUT_64(quota, str, qu_limit);
++ CPOUT_64(quota, str, qu_warn);
++ CPOUT_64(quota, str, qu_value);
++
++ CPOUT_08(quota, str, qu_reserved, 64);
++}
++
++/**
++ * gfs_quota_print - Print out a quota structure
++ * @quota: the cpu-order buffer
++ *
++ */
++
++void
++gfs_quota_print(struct gfs_quota *quota)
++{
++ pv(quota, qu_limit, "%"PRIu64);
++ pv(quota, qu_warn, "%"PRIu64);
++ pv(quota, qu_value, "%"PRId64);
++
++ pa(quota, qu_reserved, 64);
++}
++
++/**
++ * gfs_dinode_in - Read in a dinode
++ * @dinode: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_dinode_in(struct gfs_dinode *dinode, char *buf)
++{
++ struct gfs_dinode *str = (struct gfs_dinode *)buf;
++
++ gfs_meta_header_in(&dinode->di_header, buf);
++
++ gfs_inum_in(&dinode->di_num, (char *)&str->di_num);
++
++ CPIN_32(dinode, str, di_mode);
++ CPIN_32(dinode, str, di_uid);
++ CPIN_32(dinode, str, di_gid);
++ CPIN_32(dinode, str, di_nlink);
++ CPIN_64(dinode, str, di_size);
++ CPIN_64(dinode, str, di_blocks);
++ CPIN_64(dinode, str, di_atime);
++ CPIN_64(dinode, str, di_mtime);
++ CPIN_64(dinode, str, di_ctime);
++ CPIN_32(dinode, str, di_major);
++ CPIN_32(dinode, str, di_minor);
++
++ CPIN_64(dinode, str, di_rgrp);
++ CPIN_64(dinode, str, di_goal_rgrp);
++ CPIN_32(dinode, str, di_goal_dblk);
++ CPIN_32(dinode, str, di_goal_mblk);
++ CPIN_32(dinode, str, di_flags);
++ CPIN_32(dinode, str, di_payload_format);
++ CPIN_16(dinode, str, di_type);
++ CPIN_16(dinode, str, di_height);
++ CPIN_32(dinode, str, di_incarn);
++ CPIN_16(dinode, str, di_pad);
++
++ CPIN_16(dinode, str, di_depth);
++ CPIN_32(dinode, str, di_entries);
++
++ gfs_inum_in(&dinode->di_next_unused, (char *)&str->di_next_unused);
++
++ CPIN_64(dinode, str, di_eattr);
++
++ CPIN_08(dinode, str, di_reserved, 56);
++}
++
++/**
++ * gfs_dinode_out - Write out a dinode
++ * @dinode: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_dinode_out(struct gfs_dinode *dinode, char *buf)
++{
++ struct gfs_dinode *str = (struct gfs_dinode *)buf;
++
++ gfs_meta_header_out(&dinode->di_header, buf);
++
++ gfs_inum_out(&dinode->di_num, (char *)&str->di_num);
++
++ CPOUT_32(dinode, str, di_mode);
++ CPOUT_32(dinode, str, di_uid);
++ CPOUT_32(dinode, str, di_gid);
++ CPOUT_32(dinode, str, di_nlink);
++ CPOUT_64(dinode, str, di_size);
++ CPOUT_64(dinode, str, di_blocks);
++ CPOUT_64(dinode, str, di_atime);
++ CPOUT_64(dinode, str, di_mtime);
++ CPOUT_64(dinode, str, di_ctime);
++ CPOUT_32(dinode, str, di_major);
++ CPOUT_32(dinode, str, di_minor);
++
++ CPOUT_64(dinode, str, di_rgrp);
++ CPOUT_64(dinode, str, di_goal_rgrp);
++ CPOUT_32(dinode, str, di_goal_dblk);
++ CPOUT_32(dinode, str, di_goal_mblk);
++ CPOUT_32(dinode, str, di_flags);
++ CPOUT_32(dinode, str, di_payload_format);
++ CPOUT_16(dinode, str, di_type);
++ CPOUT_16(dinode, str, di_height);
++ CPOUT_32(dinode, str, di_incarn);
++ CPOUT_16(dinode, str, di_pad);
++
++ CPOUT_16(dinode, str, di_depth);
++ CPOUT_32(dinode, str, di_entries);
++
++ gfs_inum_out(&dinode->di_next_unused, (char *)&str->di_next_unused);
++
++ CPOUT_64(dinode, str, di_eattr);
++
++ CPOUT_08(dinode, str, di_reserved, 56);
++}
++
++/**
++ * gfs_dinode_print - Print out a dinode
++ * @di: the cpu-order buffer
++ *
++ */
++
++void
++gfs_dinode_print(struct gfs_dinode *di)
++{
++ gfs_meta_header_print(&di->di_header);
++
++ gfs_inum_print(&di->di_num);
++
++ pv(di, di_mode, "0%o");
++ pv(di, di_uid, "%u");
++ pv(di, di_gid, "%u");
++ pv(di, di_nlink, "%u");
++ pv(di, di_size, "%"PRIu64);
++ pv(di, di_blocks, "%"PRIu64);
++ pv(di, di_atime, "%"PRId64);
++ pv(di, di_mtime, "%"PRId64);
++ pv(di, di_ctime, "%"PRId64);
++ pv(di, di_major, "%u");
++ pv(di, di_minor, "%u");
++
++ pv(di, di_rgrp, "%"PRIu64);
++ pv(di, di_goal_rgrp, "%"PRIu64);
++ pv(di, di_goal_dblk, "%u");
++ pv(di, di_goal_mblk, "%u");
++ pv(di, di_flags, "0x%.8X");
++ pv(di, di_payload_format, "%u");
++ pv(di, di_type, "%u");
++ pv(di, di_height, "%u");
++ pv(di, di_incarn, "%u");
++ pv(di, di_pad, "%u");
++
++ pv(di, di_depth, "%u");
++ pv(di, di_entries, "%u");
++
++ gfs_inum_print(&di->di_next_unused);
++
++ pv(di, di_eattr, "%"PRIu64);
++
++ pa(di, di_reserved, 56);
++}
++
++/**
++ * gfs_indirect_in - copy in the header of an indirect block
++ * @indirect: the in memory copy
++ * @buf: the buffer copy
++ *
++ */
++
++void
++gfs_indirect_in(struct gfs_indirect *indirect, char *buf)
++{
++ struct gfs_indirect *str = (struct gfs_indirect *)buf;
++
++ gfs_meta_header_in(&indirect->in_header, buf);
++
++ CPIN_08(indirect, str, in_reserved, 64);
++}
++
++/**
++ * gfs_indirect_out - copy out the header of an indirect block
++ * @indirect: the in memory copy
++ * @buf: the buffer copy
++ *
++ */
++
++void
++gfs_indirect_out(struct gfs_indirect *indirect, char *buf)
++{
++ struct gfs_indirect *str = (struct gfs_indirect *)buf;
++
++ gfs_meta_header_out(&indirect->in_header, buf);
++
++ CPOUT_08(indirect, str, in_reserved, 64);
++}
++
++/**
++ * gfs_indirect_print - Print out an indirect block header
++ * @indirect: the cpu-order buffer
++ *
++ */
++
++void
++gfs_indirect_print(struct gfs_indirect *indirect)
++{
++ gfs_meta_header_print(&indirect->in_header);
++
++ pa(indirect, in_reserved, 64);
++}
++
++/**
++ * gfs_dirent_in - Read in a directory entry
++ * @dirent: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_dirent_in(struct gfs_dirent *dirent, char *buf)
++{
++ struct gfs_dirent *str = (struct gfs_dirent *)buf;
++
++ gfs_inum_in(&dirent->de_inum, (char *)&str->de_inum);
++ CPIN_32(dirent, str, de_hash);
++ CPIN_16(dirent, str, de_rec_len);
++ CPIN_16(dirent, str, de_name_len);
++ CPIN_16(dirent, str, de_type);
++
++ CPIN_08(dirent, str, de_reserved, 14);
++}
++
++/**
++ * gfs_dirent_out - Write out a directory entry
++ * @dirent: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_dirent_out(struct gfs_dirent *dirent, char *buf)
++{
++ struct gfs_dirent *str = (struct gfs_dirent *)buf;
++
++ gfs_inum_out(&dirent->de_inum, (char *)&str->de_inum);
++ CPOUT_32(dirent, str, de_hash);
++ CPOUT_16(dirent, str, de_rec_len);
++ CPOUT_16(dirent, str, de_name_len);
++ CPOUT_16(dirent, str, de_type);
++
++ CPOUT_08(dirent, str, de_reserved, 14);
++}
++
++/**
++ * gfs_dirent_print - Print out a directory entry
++ * @de: the cpu-order buffer
++ * @name: the filename
++ *
++ */
++
++void
++gfs_dirent_print(struct gfs_dirent *de, char *name)
++{
++ char buf[GFS_FNAMESIZE + 1];
++
++ gfs_inum_print(&de->de_inum);
++ pv(de, de_hash, "0x%.8X");
++ pv(de, de_rec_len, "%u");
++ pv(de, de_name_len, "%u");
++ pv(de, de_type, "%u");
++
++ pa(de, de_reserved, 14);
++
++ memset(buf, 0, GFS_FNAMESIZE + 1);
++ memcpy(buf, name, de->de_name_len);
++ printk(" name = %s\n", buf);
++}
++
++/**
++ * gfs_leaf_in - Read in a directory leaf header
++ * @leaf: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_leaf_in(struct gfs_leaf *leaf, char *buf)
++{
++ struct gfs_leaf *str = (struct gfs_leaf *)buf;
++
++ gfs_meta_header_in(&leaf->lf_header, buf);
++
++ CPIN_16(leaf, str, lf_depth);
++ CPIN_16(leaf, str, lf_entries);
++ CPIN_32(leaf, str, lf_dirent_format);
++ CPIN_64(leaf, str, lf_next);
++
++ CPIN_08(leaf, str, lf_reserved, 64);
++}
++
++/**
++ * gfs_leaf_out - Write out a directory leaf header
++ * @leaf: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_leaf_out(struct gfs_leaf *leaf, char *buf)
++{
++ struct gfs_leaf *str = (struct gfs_leaf *)buf;
++
++ gfs_meta_header_out(&leaf->lf_header, buf);
++
++ CPOUT_16(leaf, str, lf_depth);
++ CPOUT_16(leaf, str, lf_entries);
++ CPOUT_32(leaf, str, lf_dirent_format);
++ CPOUT_64(leaf, str, lf_next);
++
++ CPOUT_08(leaf, str, lf_reserved, 64);
++}
++
++/**
++ * gfs_leaf_print - Print out a directory leaf header
++ * @lf: the cpu-order buffer
++ *
++ */
++
++void
++gfs_leaf_print(struct gfs_leaf *lf)
++{
++ gfs_meta_header_print(&lf->lf_header);
++
++ pv(lf, lf_depth, "%u");
++ pv(lf, lf_entries, "%u");
++ pv(lf, lf_dirent_format, "%u");
++ pv(lf, lf_next, "%"PRIu64);
++
++ pa(lf, lf_reserved, 64);
++}
++
++/**
++ * gfs_log_header_in - Read in a log header
++ * @head: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_log_header_in(struct gfs_log_header *head, char *buf)
++{
++ struct gfs_log_header *str = (struct gfs_log_header *)buf;
++
++ gfs_meta_header_in(&head->lh_header, buf);
++
++ CPIN_32(head, str, lh_flags);
++ CPIN_32(head, str, lh_pad);
++
++ CPIN_64(head, str, lh_first);
++ CPIN_64(head, str, lh_sequence);
++
++ CPIN_64(head, str, lh_tail);
++ CPIN_64(head, str, lh_last_dump);
++
++ CPIN_08(head, str, lh_reserved, 64);
++}
++
++/**
++ * gfs_log_header_out - Write out a log header
++ * @head: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_log_header_out(struct gfs_log_header *head, char *buf)
++{
++ struct gfs_log_header *str = (struct gfs_log_header *)buf;
++
++ gfs_meta_header_out(&head->lh_header, buf);
++
++ CPOUT_32(head, str, lh_flags);
++ CPOUT_32(head, str, lh_pad);
++
++ CPOUT_64(head, str, lh_first);
++ CPOUT_64(head, str, lh_sequence);
++
++ CPOUT_64(head, str, lh_tail);
++ CPOUT_64(head, str, lh_last_dump);
++
++ CPOUT_08(head, str, lh_reserved, 64);
++}
++
++/**
++ * gfs_log_header_print - Print out a log header
++ * @lh: the cpu-order buffer
++ *
++ */
++
++void
++gfs_log_header_print(struct gfs_log_header *lh)
++{
++ gfs_meta_header_print(&lh->lh_header);
++
++ pv(lh, lh_flags, "0x%.8X");
++ pv(lh, lh_pad, "%u");
++
++ pv(lh, lh_first, "%"PRIu64);
++ pv(lh, lh_sequence, "%"PRIu64);
++
++ pv(lh, lh_tail, "%"PRIu64);
++ pv(lh, lh_last_dump, "%"PRIu64);
++
++ pa(lh, lh_reserved, 64);
++}
++
++/**
++ * gfs_desc_in - Read in a log descriptor
++ * @desc: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_desc_in(struct gfs_log_descriptor *desc, char *buf)
++{
++ struct gfs_log_descriptor *str = (struct gfs_log_descriptor *)buf;
++
++ gfs_meta_header_in(&desc->ld_header, buf);
++
++ CPIN_32(desc, str, ld_type);
++ CPIN_32(desc, str, ld_length);
++ CPIN_32(desc, str, ld_data1);
++ CPIN_32(desc, str, ld_data2);
++
++ CPIN_08(desc, str, ld_reserved, 64);
++}
++
++/**
++ * gfs_desc_out - Write out a log descriptor
++ * @desc: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_desc_out(struct gfs_log_descriptor *desc, char *buf)
++{
++ struct gfs_log_descriptor *str = (struct gfs_log_descriptor *)buf;
++
++ gfs_meta_header_out(&desc->ld_header, buf);
++
++ CPOUT_32(desc, str, ld_type);
++ CPOUT_32(desc, str, ld_length);
++ CPOUT_32(desc, str, ld_data1);
++ CPOUT_32(desc, str, ld_data2);
++
++ CPOUT_08(desc, str, ld_reserved, 64);
++}
++
++/**
++ * gfs_desc_print - Print out a log descriptor
++ * @ld: the cpu-order buffer
++ *
++ */
++
++void
++gfs_desc_print(struct gfs_log_descriptor *ld)
++{
++ gfs_meta_header_print(&ld->ld_header);
++
++ pv(ld, ld_type, "%u");
++ pv(ld, ld_length, "%u");
++ pv(ld, ld_data1, "%u");
++ pv(ld, ld_data2, "%u");
++
++ pa(ld, ld_reserved, 64);
++}
++
++/**
++ * gfs_block_tag_in - Read in a block tag
++ * @tag: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_block_tag_in(struct gfs_block_tag *tag, char *buf)
++{
++ struct gfs_block_tag *str = (struct gfs_block_tag *)buf;
++
++ CPIN_64(tag, str, bt_blkno);
++ CPIN_32(tag, str, bt_flags);
++ CPIN_32(tag, str, bt_pad);
++}
++
++/**
++ * gfs_block_tag_out - Write out a block tag
++ * @tag: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_block_tag_out(struct gfs_block_tag *tag, char *buf)
++{
++ struct gfs_block_tag *str = (struct gfs_block_tag *)buf;
++
++ CPOUT_64(tag, str, bt_blkno);
++ CPOUT_32(tag, str, bt_flags);
++ CPOUT_32(tag, str, bt_pad);
++}
++
++/**
++ * gfs_block_tag_print - Print out a block tag
++ * @tag: the cpu-order buffer
++ *
++ */
++
++void
++gfs_block_tag_print(struct gfs_block_tag *tag)
++{
++ pv(tag, bt_blkno, "%"PRIu64);
++ pv(tag, bt_flags, "%u");
++ pv(tag, bt_pad, "%u");
++}
++
++/**
++ * gfs_quota_tag_in - Read in a quota tag
++ * @tag: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_quota_tag_in(struct gfs_quota_tag *tag, char *buf)
++{
++ struct gfs_quota_tag *str = (struct gfs_quota_tag *)buf;
++
++ CPIN_64(tag, str, qt_change);
++ CPIN_32(tag, str, qt_flags);
++ CPIN_32(tag, str, qt_id);
++}
++
++/**
++ * gfs_quota_tag_out - Write out a quota tag
++ * @tag: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_quota_tag_out(struct gfs_quota_tag *tag, char *buf)
++{
++ struct gfs_quota_tag *str = (struct gfs_quota_tag *)buf;
++
++ CPOUT_64(tag, str, qt_change);
++ CPOUT_32(tag, str, qt_flags);
++ CPOUT_32(tag, str, qt_id);
++}
++
++/**
++ * gfs_quota_tag_print - Print out a quota tag
++ * @tag: the cpu-order buffer
++ *
++ */
++
++void
++gfs_quota_tag_print(struct gfs_quota_tag *tag)
++{
++ pv(tag, qt_change, "%"PRId64);
++ pv(tag, qt_flags, "0x%.8X");
++ pv(tag, qt_id, "%u");
++}
++
++/**
++ * gfs_ea_header_in - Read in an Extended Attribute header
++ * @ea: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_ea_header_in(struct gfs_ea_header *ea, char *buf)
++{
++ struct gfs_ea_header *str = (struct gfs_ea_header *)buf;
++
++ CPIN_32(ea, str, ea_rec_len);
++ CPIN_32(ea, str, ea_data_len);
++ ea->ea_name_len = str->ea_name_len;
++ ea->ea_type = str->ea_type;
++ ea->ea_flags = str->ea_flags;
++ ea->ea_num_ptrs = str->ea_num_ptrs;
++ CPIN_32(ea, str, ea_pad);
++}
++
++/**
++ * gfs_ea_header_out - Write out an Extended Attribute header
++ * @ea: the cpu-order structure
++ * @buf: the disk-order buffer
++ *
++ */
++
++void
++gfs_ea_header_out(struct gfs_ea_header *ea, char *buf)
++{
++ struct gfs_ea_header *str = (struct gfs_ea_header *)buf;
++
++ CPOUT_32(ea, str, ea_rec_len);
++ CPOUT_32(ea, str, ea_data_len);
++ str->ea_name_len = ea->ea_name_len;
++ str->ea_type = ea->ea_type;
++ str->ea_flags = ea->ea_flags;
++ str->ea_num_ptrs = ea->ea_num_ptrs;
++ CPOUT_32(ea, str, ea_pad);
++}
++
++/**
++ * gfs_ea_header_print - Print out an Extended Attribute header
++ * @ea: the cpu-order buffer
++ *
++ */
++
++void
++gfs_ea_header_print(struct gfs_ea_header *ea)
++{
++ pv(ea, ea_rec_len, "%u");
++ pv(ea, ea_data_len, "%u");
++ pv(ea, ea_name_len, "%u");
++ pv(ea, ea_type, "%u");
++ pv(ea, ea_flags, "%u");
++ pv(ea, ea_num_ptrs, "%u");
++ pv(ea, ea_pad, "%u");
++}
++
++static const uint32_t crc_32_tab[] =
++{
++ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
++ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
++ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
++ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
++ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
++ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
++ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
++ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
++ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
++ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
++ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
++ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
++ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
++ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
++ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
++ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
++ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
++ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
++ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
++ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
++ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
++ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
++ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
++ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
++ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
++ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
++ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
++ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
++ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
++ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
++ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
++ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
++};
++
++/**
++ * gfs_dir_hash - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * The hash function is a 32-bit CRC of the data. The algorithm uses
++ * the crc_32_tab table above.
++ *
++ * This may not be the fastest hash function, but it does a fair bit better
++ * at providing uniform results than the others I've looked at. That's
++ * really important for efficient directories.
++ *
++ * Returns: the hash
++ */
++
++uint32_t
++gfs_dir_hash(const char *data, int len)
++{
++ uint32_t hash = 0xFFFFFFFF;
++
++ for (; len--; data++)
++ hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
++
++ hash = ~hash;
++
++ return hash;
++}
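++
++/*
++ * Illustrative use (a sketch, not part of the patch): a dirent's
++ * de_hash is this CRC of its name, and the hash-table slot is taken
++ * from the top di_depth bits, assumed along the lines of:
++ *
++ *	uint32_t hash = gfs_dir_hash(name, len);
++ *	uint32_t slot = hash >> (32 - di_depth);
++ */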
++
++#endif /* WANT_GFS_CONVERSION_FUNCTIONS */
++
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/group.c linux-patched/fs/gfs_locking/lock_dlm/group.c
+--- linux-orig/fs/gfs_locking/lock_dlm/group.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/group.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,776 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/socket.h>
++#include <net/sock.h>
++
++#include "lock_dlm.h"
++#include <cluster/cnxman.h>
++#include <cluster/service.h>
++
++
++struct kcl_service_ops mg_ops;
++
++/*
++ * Get the node struct for a given nodeid.
++ */
++
++static dlm_node_t *find_node_by_nodeid(dlm_t *dlm, uint32_t nodeid)
++{
++ dlm_node_t *node;
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ if (node->nodeid == nodeid)
++ return node;
++ }
++ return NULL;
++}
++
++/*
++ * Get the node struct for a given journalid.
++ */
++
++static dlm_node_t *find_node_by_jid(dlm_t *dlm, uint32_t jid)
++{
++ dlm_node_t *node;
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ if (node->jid == jid)
++ return node;
++ }
++ return NULL;
++}
++
++/*
++ * If the given ID is clear, get it, setting to the given VALUE. The ID is a
++ * journalid, the VALUE is our nodeid. When successful, the held ID-lock is
++ * returned (in shared mode). As long as this ID-lock is held, the journalid
++ * is owned.
++ */
++
++static int id_test_and_set(dlm_t *dlm, uint32_t id, uint32_t val,
++ dlm_lock_t **lp_set)
++{
++ dlm_lock_t *lp = NULL;
++ struct lm_lockname name;
++ lm_lock_t *lock;
++ char *lvb;
++ uint32_t exist_val, beval;
++ int error;
++
++ name.ln_type = LM_TYPE_JID;
++ name.ln_number = id;
++
++ error = lm_dlm_get_lock(dlm, &name, &lock);
++ if (error)
++ goto fail;
++
++ error = lm_dlm_hold_lvb(lock, &lvb);
++ if (error)
++ goto fail_put;
++
++ lp = (dlm_lock_t *) lock;
++ set_bit(LFL_IDLOCK, &lp->flags);
++
++ retry:
++
++ error = lm_dlm_lock_sync(lock, LM_ST_UNLOCKED, LM_ST_SHARED,
++ LM_FLAG_TRY | LM_FLAG_NOEXP);
++ if (error == -EAGAIN) {
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ goto retry;
++ }
++ if (error)
++ goto fail_unhold;
++
++ memcpy(&beval, lvb, sizeof(beval));
++ exist_val = be32_to_cpu(beval);
++
++ if (!exist_val) {
++ /*
++ * This id is unused. Attempt to claim it by getting EX mode
++ * and writing our nodeid into the lvb.
++ */
++ error = lm_dlm_lock_sync(lock, LM_ST_SHARED, LM_ST_EXCLUSIVE,
++ LM_FLAG_TRY | LM_FLAG_NOEXP);
++ if (error == -EAGAIN) {
++ lm_dlm_unlock_sync(lock, LM_ST_SHARED);
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ goto retry;
++ }
++ if (error)
++ goto fail_unlock;
++
++ beval = cpu_to_be32(val);
++ memcpy(lvb, &beval, sizeof(beval));
++
++ error = lm_dlm_lock_sync(lock, LM_ST_EXCLUSIVE, LM_ST_SHARED,
++ LM_FLAG_NOEXP);
++ DLM_ASSERT(!error,);
++
++ *lp_set = lp;
++ error = 0;
++ } else {
++ /*
++ * This id is already used. It has a non-zero nodeid in the lvb.
++ */
++ lm_dlm_unlock_sync(lock, LM_ST_SHARED);
++ lm_dlm_unhold_lvb(lock, lvb);
++ lm_dlm_put_lock(lock);
++ error = exist_val;
++ }
++
++ return error;
++
++ fail_unlock:
++ lm_dlm_unlock_sync(lock, LM_ST_SHARED);
++
++ fail_unhold:
++ lm_dlm_unhold_lvb(lock, lvb);
++
++ fail_put:
++ lm_dlm_put_lock(lock);
++
++ fail:
++ return error;
++}
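++
++/*
++ * Call pattern for id_test_and_set() (claim_jid() below is the real
++ * caller; this sketch just spells out the three-way return):
++ *
++ *	error = id_test_and_set(dlm, id, dlm->our_nodeid, &lp);
++ *	if (error < 0)	... a lock operation failed ...
++ *	if (error > 0)	... id already taken; error is the owner's nodeid ...
++ *	if (error == 0)	... id claimed; lp holds the shared ID-lock ...
++ */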
++
++/*
++ * Release a held ID-lock clearing its VALUE. We have to acquire the lock in
++ * EX again so we can write out a zeroed lvb.
++ */
++
++static void id_clear(dlm_t *dlm, dlm_lock_t *lp)
++{
++ lm_lock_t *lock = (lm_lock_t *) lp;
++ int error;
++
++ /*
++ * This flag means that DLM_LKF_CONVDEADLK should not be used.
++ */
++ set_bit(LFL_FORCE_PROMOTE, &lp->flags);
++
++ retry:
++
++ error = lm_dlm_lock_sync(lock, LM_ST_SHARED, LM_ST_EXCLUSIVE,
++ LM_FLAG_TRY | LM_FLAG_NOEXP);
++ if (error == -EAGAIN) {
++ schedule();
++ goto retry;
++ }
++ if (error)
++ goto end;
++
++ memset(lp->lvb, 0, DLM_LVB_LEN);
++ lm_dlm_unlock_sync(lock, LM_ST_EXCLUSIVE);
++
++ end:
++ lm_dlm_unhold_lvb(lock, lp->lvb);
++ lm_dlm_put_lock(lock);
++}
++
++/*
++ * Get the VALUE for a given ID. The ID is a journalid, the VALUE is a nodeid.
++ */
++
++static int id_value(dlm_t *dlm, uint32_t id, uint32_t *val)
++{
++ dlm_lock_t *lp = NULL;
++ struct lm_lockname name;
++ lm_lock_t *lock;
++ char *lvb;
++ uint32_t beval;
++ int error;
++
++ name.ln_type = LM_TYPE_JID;
++ name.ln_number = id;
++
++ error = lm_dlm_get_lock(dlm, &name, &lock);
++ if (error)
++ goto out;
++
++ error = lm_dlm_hold_lvb(lock, &lvb);
++ if (error)
++ goto out_put;
++
++ lp = (dlm_lock_t *) lock;
++ set_bit(LFL_IDLOCK, &lp->flags);
++
++ retry:
++
++ error = lm_dlm_lock_sync(lock, LM_ST_UNLOCKED, LM_ST_SHARED,
++ LM_FLAG_TRY | LM_FLAG_NOEXP);
++ if (error == -EAGAIN) {
++ current->state = TASK_UNINTERRUPTIBLE;
++ schedule_timeout(HZ);
++ goto retry;
++ }
++ if (error)
++ goto out_unhold;
++
++ memcpy(&beval, lvb, sizeof(beval));
++ *val = be32_to_cpu(beval);
++
++ lm_dlm_unlock_sync(lock, LM_ST_SHARED);
++
++ error = 0;
++
++ out_unhold:
++ lm_dlm_unhold_lvb(lock, lvb);
++
++ out_put:
++ lm_dlm_put_lock(lock);
++
++ out:
++ return error;
++}
++
++/*
++ * Find an ID with a given VALUE. The ID is a journalid, the VALUE is a
++ * nodeid.
++ */
++
++static int id_find(dlm_t *dlm, uint32_t value, uint32_t *id_out)
++{
++ uint32_t val, id;
++ int error = 0, found = FALSE;
++
++ for (id = 0; id < dlm->max_nodes; id++) {
++ error = id_value(dlm, id, &val);
++ if (error)
++ break;
++
++ if (val == value) {
++ *id_out = id;
++ error = 0;
++ found = TRUE;
++ break;
++ }
++ }
++
++ if (!error && !found)
++ error = -ENOENT;
++
++ return error;
++}
++
++/*
++ * Get a journalid to use. The journalid must be owned exclusively as long as
++ * this fs is mounted. Other nodes must be able to discover our nodeid as the
++ * owner of the journalid. The journalid we claim should have the lowest value
++ * of all unused journalids.
++ */
++
++static int claim_jid(dlm_t *dlm)
++{
++ dlm_node_t *node;
++ uint32_t id;
++ int error = 0;
++
++ DLM_ASSERT(dlm->our_nodeid,);
++
++ /*
++ * Search an arbitrary number (8) of ids past max_nodes so we're sure
++ * to find a free one; if the id we end up with is out of range, GFS
++ * will handle the "too big jid" error and fail the mount.
++ */
++
++ for (id = 0; id < dlm->max_nodes + 8; id++) {
++ error = id_test_and_set(dlm, id, dlm->our_nodeid, &dlm->jid_lock);
++ if (error < 0)
++ break;
++ if (error > 0)
++ continue;
++
++ dlm->jid = id;
++ node = find_node_by_nodeid(dlm, dlm->our_nodeid);
++ node->jid = id;
++ set_bit(NFL_HAVE_JID, &node->flags);
++ break;
++ }
++
++ /*
++ * If we have a problem getting a jid, pick a bogus one which should
++ * cause GFS to complain and fail to mount.
++ */
++
++ if (error) {
++ printk("lock_dlm: %s: no journal id available (%d)\n",
++ dlm->fsname, error);
++ dlm->jid = dlm->max_nodes + dlm->our_nodeid;
++ }
++
++ log_debug("claim_jid %u", dlm->jid);
++ return 0;
++}
++
++/*
++ * Release our journalid, allowing it to be used by a node subsequently
++ * mounting the fs.
++ */
++
++static void release_jid(dlm_t *dlm)
++{
++ id_clear(dlm, dlm->jid_lock);
++ dlm->jid_lock = NULL;
++}
++
++/*
++ * For all nodes in the mountgroup, find the journalid being used by each.
++ */
++
++static int discover_jids(dlm_t *dlm)
++{
++ dlm_node_t *node;
++ uint32_t id;
++ int error, notfound = 0;
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ if (test_bit(NFL_HAVE_JID, &node->flags))
++ continue;
++
++ error = id_find(dlm, node->nodeid, &id);
++ if (error) {
++ log_debug("jid for node %d not found", node->nodeid);
++ notfound++;
++ continue;
++ }
++
++ node->jid = id;
++ set_bit(NFL_HAVE_JID, &node->flags);
++ }
++
++ return notfound;
++}
++
++/*
++ * Discover the nodeid that we've been assigned by the cluster manager.
++ */
++
++static int get_our_nodeid(dlm_t *dlm)
++{
++ LIST_HEAD(cur_memb);
++ struct kcl_cluster_node *cur_node;
++
++ kcl_get_members(&cur_memb);
++
++ list_for_each_entry(cur_node, &cur_memb, list) {
++ if (cur_node->us) {
++ dlm->our_nodeid = cur_node->node_id;
++ break;
++ }
++ }
++
++ while (!list_empty(&cur_memb)) {
++ cur_node = list_entry(cur_memb.next, struct kcl_cluster_node,
++ list);
++ list_del(&cur_node->list);
++ kfree(cur_node);
++ }
++
++ return 0;
++}
++
++/*
++ * Run in dlm_async thread
++ */
++
++void process_start(dlm_t *dlm, dlm_start_t *ds)
++{
++ dlm_node_t *node;
++ uint32_t nodeid;
++ int last_stop, last_start, error, i, new = FALSE, found;
++
++
++ log_debug("start c %d type %d e %d", ds->count, ds->type, ds->event_id);
++
++ /*
++ * gfs won't do journal recoveries once it's sent us an unmount
++ */
++
++ if (test_bit(DFL_UMOUNT, &dlm->flags)) {
++ log_debug("process_start %d skip for umount", ds->event_id);
++ kcl_start_done(dlm->mg_local_id, ds->event_id);
++ goto out;
++ }
++
++ /*
++ * check if first start
++ */
++
++ if (!test_and_set_bit(DFL_GOT_NODEID, &dlm->flags)) {
++ get_our_nodeid(dlm);
++ if (ds->count == 1)
++ set_bit(DFL_FIRST_MOUNT, &dlm->flags);
++ }
++
++ down(&dlm->mg_nodes_lock);
++
++ /*
++ * find nodes which are gone
++ */
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ found = FALSE;
++ for (i = 0; i < ds->count; i++) {
++ if (node->nodeid != ds->nodeids[i])
++ continue;
++ found = TRUE;
++ break;
++ }
++
++ /* node is still a member */
++ if (found)
++ continue;
++
++ set_bit(NFL_NOT_MEMBER, &node->flags);
++
++ /* no gfs recovery needed for nodes that left cleanly */
++ if (ds->type != SERVICE_NODE_FAILED)
++ continue;
++
++ /* callbacks sent only for nodes in last completed MG */
++ if (!test_bit(NFL_LAST_FINISH, &node->flags))
++ continue;
++
++ /* only send a single callback per node */
++ if (test_and_set_bit(NFL_SENT_CB, &node->flags))
++ continue;
++
++ dlm->fscb(dlm->fsdata, LM_CB_NEED_RECOVERY, &node->jid);
++ set_bit(DFL_NEED_STARTDONE, &dlm->flags);
++ log_debug("cb_need_recovery jid %u", node->jid);
++ }
++
++ /*
++ * add new nodes
++ */
++
++ for (i = 0; i < ds->count; i++) {
++ nodeid = ds->nodeids[i];
++
++ node = find_node_by_nodeid(dlm, nodeid);
++ if (node)
++ continue;
++
++ DLM_RETRY(node = kmalloc(sizeof(dlm_node_t), GFP_KERNEL), node);
++
++ memset(node, 0, sizeof(dlm_node_t));
++
++ node->nodeid = nodeid;
++ list_add(&node->list, &dlm->mg_nodes);
++ new = TRUE;
++ }
++
++ up(&dlm->mg_nodes_lock);
++
++ /*
++ * get a jid for ourself when started for first time
++ */
++
++ if (!test_and_set_bit(DFL_HAVE_JID, &dlm->flags))
++ claim_jid(dlm);
++ else if (new) {
++ /* give new nodes a little time to claim a jid */
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(HZ);
++ }
++
++ /*
++ * find jid's of new nodes
++ */
++
++ for (;;) {
++ /* we don't need to do these jid lookups if this start has been
++ followed by a stop event (and thus cancelled) */
++
++ spin_lock(&dlm->async_lock);
++ last_stop = dlm->mg_last_stop;
++ last_start = dlm->mg_last_start;
++ spin_unlock(&dlm->async_lock);
++
++ if (last_stop >= ds->event_id)
++ break;
++
++ error = discover_jids(dlm);
++ if (error) {
++ /* Not all jids were found. Wait for a time to let all
++ new nodes claim_jid, then try to scan for jids
++ again. */
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(HZ);
++ continue;
++ }
++ break;
++ }
++
++ /*
++ * tell SM we're done if there are no GFS recoveries to wait for
++ */
++
++ if (last_start > last_stop) {
++ error = 0;
++ down(&dlm->mg_nodes_lock);
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ if (!test_bit(NFL_SENT_CB, &node->flags))
++ continue;
++ error = 1;
++ break;
++ }
++ up(&dlm->mg_nodes_lock);
++
++ if (!error)
++ kcl_start_done(dlm->mg_local_id, ds->event_id);
++ }
++
++ out:
++ kfree(ds->nodeids);
++ kfree(ds);
++}
++
++void process_finish(dlm_t *dlm)
++{
++ struct list_head *tmp, *tmpsafe;
++ dlm_node_t *node;
++ dlm_lock_t *lp;
++
++ spin_lock(&dlm->async_lock);
++ clear_bit(DFL_BLOCK_LOCKS, &dlm->flags);
++
++ list_for_each_safe(tmp, tmpsafe, &dlm->delayed) {
++ lp = list_entry(tmp, dlm_lock_t, dlist);
++
++ if (lp->type != QUEUE_LOCKS_BLOCKED)
++ continue;
++
++ lp->type = 0;
++ list_del(&lp->dlist);
++ list_add_tail(&lp->slist, &dlm->submit);
++
++ clear_bit(LFL_DLIST, &lp->flags);
++ set_bit(LFL_SLIST, &lp->flags);
++ }
++ spin_unlock(&dlm->async_lock);
++
++ down(&dlm->mg_nodes_lock);
++
++ list_for_each_safe(tmp, tmpsafe, &dlm->mg_nodes) {
++ node = list_entry(tmp, dlm_node_t, list);
++
++ if (test_bit(NFL_NOT_MEMBER, &node->flags)) {
++ list_del(&node->list);
++ kfree(node);
++ } else
++ set_bit(NFL_LAST_FINISH, &node->flags);
++ }
++ up(&dlm->mg_nodes_lock);
++
++ wake_up(&dlm->wait);
++}
++
++/*
++ * Run in user process
++ */
++
++int init_mountgroup(dlm_t *dlm)
++{
++ int error;
++ int id;
++
++ error = kcl_register_service(dlm->fsname, dlm->fnlen, SERVICE_LEVEL_GFS,
++ &mg_ops, TRUE, (void *) dlm, &id);
++ if (error)
++ goto out;
++
++ dlm->mg_local_id = id;
++
++ /* BLOCK_LOCKS is cleared when the join is finished */
++ set_bit(DFL_BLOCK_LOCKS, &dlm->flags);
++
++ error = kcl_join_service(id);
++ if (error)
++ goto out_unreg;
++
++ if (test_bit(DFL_START_ERROR, &dlm->flags))
++ goto out_leave;
++
++ return 0;
++
++ out_leave:
++ kcl_leave_service(dlm->mg_local_id);
++
++ out_unreg:
++ kcl_unregister_service(id);
++
++ out:
++ printk("lock_dlm: service error %d\n", error);
++ return error;
++}
++
++void release_mountgroup(dlm_t *dlm)
++{
++ int last_start, last_stop;
++
++ /* this flag causes a kcl_start_done() to be sent right away for
++ any start callbacks we get from SM */
++
++ log_debug("umount flags %lx", dlm->flags);
++ set_bit(DFL_UMOUNT, &dlm->flags);
++
++ /* gfs has done an unmount and will no longer call
++ jid_recovery_done(), so make any necessary kcl_start_done()
++ calls here to let kcl_leave_service() complete */
++
++ spin_lock(&dlm->async_lock);
++ last_start = dlm->mg_last_start;
++ last_stop = dlm->mg_last_stop;
++ spin_unlock(&dlm->async_lock);
++
++ if ((last_start > last_stop) &&
++ test_and_clear_bit(DFL_NEED_STARTDONE, &dlm->flags)) {
++ log_debug("umount doing start_done %d", last_start);
++ kcl_start_done(dlm->mg_local_id, last_start);
++ }
++
++ kcl_leave_service(dlm->mg_local_id);
++ kcl_unregister_service(dlm->mg_local_id);
++ release_jid(dlm);
++}
++
++/*
++ * Run in GFS thread
++ */
++
++void jid_recovery_done(dlm_t *dlm, unsigned int jid, unsigned int message)
++{
++ dlm_node_t *node;
++ int last_start, last_stop;
++ int remain = 0;
++
++ log_debug("recovery_done jid %u msg %u", jid, message);
++
++ node = find_node_by_jid(dlm, jid);
++ if (!node)
++ goto out;
++
++ log_debug("recovery_done %u,%u f %lx", jid, node->nodeid, node->flags);
++
++ if (!test_bit(NFL_SENT_CB, &node->flags))
++ goto out;
++
++ if (!test_bit(NFL_NOT_MEMBER, &node->flags))
++ goto out;
++
++ set_bit(NFL_RECOVERY_DONE, &node->flags);
++
++ /*
++ * when recovery is done for all nodes, we're done with the start
++ */
++
++ down(&dlm->mg_nodes_lock);
++
++ list_for_each_entry(node, &dlm->mg_nodes, list) {
++ if (test_bit(NFL_SENT_CB, &node->flags) &&
++ !test_bit(NFL_RECOVERY_DONE, &node->flags))
++ remain++;
++ }
++ up(&dlm->mg_nodes_lock);
++
++ if (!remain) {
++ /* don't send a start_done if there's since been a stop which
++ * cancels this start */
++
++ spin_lock(&dlm->async_lock);
++ last_start = dlm->mg_last_start;
++ last_stop = dlm->mg_last_stop;
++ spin_unlock(&dlm->async_lock);
++
++ if (last_start > last_stop) {
++ log_debug("recovery_done start_done %d", last_start);
++ kcl_start_done(dlm->mg_local_id, last_start);
++ clear_bit(DFL_NEED_STARTDONE, &dlm->flags);
++ }
++ }
++
++ out:
++ return;
++}
++
++/*
++ * Run in CMAN SM thread
++ */
++
++static void queue_start(dlm_t *dlm, uint32_t *nodeids, int count,
++ int event_id, int type)
++{
++ dlm_start_t *ds;
++
++ DLM_RETRY(ds = kmalloc(sizeof(dlm_start_t), GFP_KERNEL), ds);
++
++ memset(ds, 0, sizeof(dlm_start_t));
++
++ ds->nodeids = nodeids;
++ ds->count = count;
++ ds->event_id = event_id;
++ ds->type = type;
++
++ spin_lock(&dlm->async_lock);
++ dlm->mg_last_start = event_id;
++ list_add_tail(&ds->list, &dlm->starts);
++ spin_unlock(&dlm->async_lock);
++
++ wake_up(&dlm->wait);
++}
++
++static int mg_stop(void *data)
++{
++ dlm_t *dlm = (dlm_t *) data;
++
++ spin_lock(&dlm->async_lock);
++ set_bit(DFL_BLOCK_LOCKS, &dlm->flags);
++ dlm->mg_last_stop = dlm->mg_last_start;
++ spin_unlock(&dlm->async_lock);
++
++ return 0;
++}
++
++static int mg_start(void *data, uint32_t *nodeids, int count, int event_id,
++ int type)
++{
++ dlm_t *dlm = (dlm_t *) data;
++
++ queue_start(dlm, nodeids, count, event_id, type);
++
++ return 0;
++}
++
++static void mg_finish(void *data, int event_id)
++{
++ dlm_t *dlm = (dlm_t *) data;
++
++ spin_lock(&dlm->async_lock);
++ dlm->mg_last_finish = event_id;
++ set_bit(DFL_MG_FINISH, &dlm->flags);
++ spin_unlock(&dlm->async_lock);
++
++ wake_up(&dlm->wait);
++}
++
++struct kcl_service_ops mg_ops = {
++ .stop = mg_stop,
++ .start = mg_start,
++ .finish = mg_finish
++};
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/lock.c linux-patched/fs/gfs_locking/lock_dlm/lock.c
+--- linux-orig/fs/gfs_locking/lock_dlm/lock.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/lock.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,561 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "lock_dlm.h"
++
++/*
++ * Run in DLM thread
++ */
++
++static void queue_complete(dlm_lock_t *lp)
++{
++ dlm_t *dlm = lp->dlm;
++
++ clear_bit(LFL_WAIT_COMPLETE, &lp->flags);
++
++ spin_lock(&dlm->async_lock);
++ list_add_tail(&lp->clist, &dlm->complete);
++ set_bit(LFL_CLIST, &lp->flags);
++ spin_unlock(&dlm->async_lock);
++ wake_up(&dlm->wait);
++}
++
++static void queue_blocking(dlm_lock_t *lp, int mode)
++{
++ dlm_t *dlm = lp->dlm;
++
++ if (test_bit(LFL_WAIT_COMPLETE, &lp->flags)) {
++ /* We often receive basts for EX while we're promoting
++ from SH to EX. */
++ /* printk("lock_dlm: bast before complete %x,%"PRIx64" "
++ "gr=%d rq=%d bast=%d\n", lp->lockname.ln_type,
++ lp->lockname.ln_number, lp->cur, lp->req, mode); */
++ return;
++ }
++
++ spin_lock(&dlm->async_lock);
++
++ if (!lp->bast_mode) {
++ list_add_tail(&lp->blist, &dlm->blocking);
++ set_bit(LFL_BLIST, &lp->flags);
++ lp->bast_mode = mode;
++ } else if (lp->bast_mode < mode)
++ lp->bast_mode = mode;
++
++ spin_unlock(&dlm->async_lock);
++ wake_up(&dlm->wait);
++}
++
++static __inline__ void lock_ast(void *astargs)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) astargs;
++ queue_complete(lp);
++}
++
++static __inline__ void lock_bast(void *astargs, int mode)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) astargs;
++ queue_blocking(lp, mode);
++}
++
++/*
++ * Run in GFS or user thread
++ */
++
++/**
++ * queue_delayed - add request to queue to be submitted later
++ * @lp: DLM lock
++ * @type: the reason the lock is blocked
++ *
++ * Queue of locks which need submitting sometime later. Locks queued
++ * here due to QUEUE_LOCKS_BLOCKED are moved to the submit queue when
++ * recovery is done. Locks queued here due to an ERROR are resubmitted
++ * after some delay. This could also be called from the dlm_async thread.
++ */
++
++void queue_delayed(dlm_lock_t *lp, int type)
++{
++ dlm_t *dlm = lp->dlm;
++
++ lp->type = type;
++
++ spin_lock(&dlm->async_lock);
++ list_add_tail(&lp->dlist, &dlm->delayed);
++ set_bit(LFL_DLIST, &lp->flags);
++ spin_unlock(&dlm->async_lock);
++}
++
++/**
++ * make_mode - convert to DLM_LOCK_
++ * @lmstate: GFS lock state
++ *
++ * Returns: DLM lock mode
++ */
++
++static int16_t make_mode(int16_t lmstate)
++{
++ switch (lmstate) {
++ case LM_ST_UNLOCKED:
++ return DLM_LOCK_NL;
++ case LM_ST_EXCLUSIVE:
++ return DLM_LOCK_EX;
++ case LM_ST_DEFERRED:
++ return DLM_LOCK_CW;
++ case LM_ST_SHARED:
++ return DLM_LOCK_PR;
++ default:
++ DLM_ASSERT(0, printk("unknown LM state %d\n", lmstate););
++ }
++}
++
++/**
++ * make_lmstate - convert to LM_ST_
++ * @dlmmode: DLM lock mode
++ *
++ * Returns: GFS lock state
++ */
++
++int16_t make_lmstate(int16_t dlmmode)
++{
++ switch (dlmmode) {
++ case DLM_LOCK_IV:
++ case DLM_LOCK_NL:
++ return LM_ST_UNLOCKED;
++ case DLM_LOCK_EX:
++ return LM_ST_EXCLUSIVE;
++ case DLM_LOCK_CW:
++ return LM_ST_DEFERRED;
++ case DLM_LOCK_PR:
++ return LM_ST_SHARED;
++ default:
++ DLM_ASSERT(0, printk("unknown DLM mode %d\n", dlmmode););
++ }
++}
++
++/**
++ * check_cur_state - verify agreement with GFS on the current lock state
++ * @lp: the DLM lock
++ * @cur_state: the current lock state from GFS
++ *
++ * NB: DLM_LOCK_NL and DLM_LOCK_IV are both considered
++ * LM_ST_UNLOCKED by GFS.
++ *
++ */
++
++static void check_cur_state(dlm_lock_t *lp, unsigned int cur_state)
++{
++ int16_t cur = make_mode(cur_state);
++ if (lp->cur != DLM_LOCK_IV)
++ DLM_ASSERT(lp->cur == cur, printk("%d, %d\n", lp->cur, cur););
++}
++
++/**
++ * make_flags - put together necessary DLM flags
++ * @lp: DLM lock
++ * @gfs_flags: GFS flags
++ * @cur: current DLM lock mode
++ * @req: requested DLM lock mode
++ *
++ * Returns: DLM flags
++ */
++
++static unsigned int make_flags(dlm_lock_t *lp, unsigned int gfs_flags,
++ int16_t cur, int16_t req)
++{
++ unsigned int lkf = 0;
++
++ if (gfs_flags & LM_FLAG_TRY)
++ lkf |= DLM_LKF_NOQUEUE;
++
++ if (gfs_flags & LM_FLAG_TRY_1CB) {
++ lkf |= DLM_LKF_NOQUEUE;
++ lkf |= DLM_LKF_NOQUEUEBAST;
++ }
++
++ if (lp->lksb.sb_lkid != 0) {
++ lkf |= DLM_LKF_CONVERT;
++
++ if (gfs_flags & LM_FLAG_PRIORITY)
++ lkf |= DLM_LKF_EXPEDITE;
++ else if (req > cur)
++ lkf |= DLM_LKF_QUECVT;
++
++ /* Conversion deadlock avoidance by DLM */
++
++ if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
++ cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
++ lkf |= DLM_LKF_CONVDEADLK;
++ }
++
++ if (lp->lvb)
++ lkf |= DLM_LKF_VALBLK;
++
++ return lkf;
++}
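++
++/*
++ * Worked example (derived from the logic above): converting an existing
++ * lock (sb_lkid != 0) from DLM_LOCK_PR up to DLM_LOCK_EX with no GFS
++ * flags yields
++ *
++ *	lkf = DLM_LKF_CONVERT | DLM_LKF_QUECVT | DLM_LKF_CONVDEADLK;
++ *
++ * plus DLM_LKF_VALBLK when an lvb is attached, and without CONVDEADLK
++ * when LFL_FORCE_PROMOTE is set (as id_clear() does in group.c).
++ */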
++
++/**
++ * make_strname - convert GFS lock numbers to string
++ * @lockname: the lock type/number
++ * @str: the lock string/length
++ *
++ */
++
++static __inline__ void make_strname(struct lm_lockname *lockname,
++ strname_t *str)
++{
++ sprintf(str->name, "%8x%16"PRIx64, lockname->ln_type,
++ lockname->ln_number);
++ str->namelen = LOCK_DLM_STRNAME_BYTES;
++}
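++
++/*
++ * For example, lock type 0x3, number 0x17 becomes the 24-character
++ * string "       3              17": 8 hex digits (space-padded) for
++ * the type followed by 16 for the number.
++ */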
++
++int create_lp(dlm_t *dlm, struct lm_lockname *name, dlm_lock_t **lpp)
++{
++ dlm_lock_t *lp;
++
++ lp = kmalloc(sizeof(dlm_lock_t), GFP_KERNEL);
++ if (!lp)
++ return -ENOMEM;
++
++ memset(lp, 0, sizeof(dlm_lock_t));
++ lp->lockname = *name;
++ lp->dlm = dlm;
++ lp->cur = DLM_LOCK_IV;
++ init_completion(&lp->uast_wait);
++ *lpp = lp;
++ return 0;
++}
++
++/**
++ * dlm_get_lock - get an lm_lock_t given a description of the lock
++ * @lockspace: the lockspace the lock lives in
++ * @name: the name of the lock
++ * @lockp: return the lm_lock_t here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int lm_dlm_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ lm_lock_t **lockp)
++{
++ dlm_lock_t *lp;
++ int error;
++
++ error = create_lp((dlm_t *) lockspace, name, &lp);
++
++ *lockp = (lm_lock_t *) lp;
++ return error;
++}
++
++int do_unlock(dlm_lock_t *lp)
++{
++ int error;
++
++ init_completion(&lp->uast_wait);
++
++ set_bit(LFL_DLM_UNLOCK, &lp->flags);
++
++ error = dlm_unlock(lp->dlm->gdlm_lsp, lp->lksb.sb_lkid, 0, &lp->lksb,
++ (void *) lp);
++
++ DLM_ASSERT(!error, printk("%s: error=%d num=%x,%"PRIx64"\n",
++ lp->dlm->fsname, error, lp->lockname.ln_type,
++ lp->lockname.ln_number););
++
++ wait_for_completion(&lp->uast_wait);
++
++ spin_lock(&lp->dlm->async_lock);
++ if (test_bit(LFL_CLIST, &lp->flags)) {
++ printk("lock_dlm: dlm_put_lock lp on clist num=%x,%"PRIx64"\n", lp->lockname.ln_type, lp->lockname.ln_number);
++ list_del(&lp->clist);
++ }
++ if (test_bit(LFL_BLIST, &lp->flags)) {
++ printk("lock_dlm: dlm_put_lock lp on blist num=%x,%"PRIx64"\n",
++ lp->lockname.ln_type, lp->lockname.ln_number);
++ list_del(&lp->blist);
++ }
++ if (test_bit(LFL_DLIST, &lp->flags)) {
++ printk("lock_dlm: dlm_put_lock lp on dlist num=%x,%"PRIx64"\n",
++ lp->lockname.ln_type, lp->lockname.ln_number);
++ list_del(&lp->dlist);
++ }
++ if (test_bit(LFL_SLIST, &lp->flags)) {
++ printk("lock_dlm: dlm_put_lock lp on slist num=%x,%"PRIx64"\n",
++ lp->lockname.ln_type, lp->lockname.ln_number);
++ list_del(&lp->slist);
++ }
++ spin_unlock(&lp->dlm->async_lock);
++
++ return 0;
++}
++
++/**
++ * dlm_put_lock - get rid of a lock structure
++ * @lock: the lock to throw away
++ *
++ */
++
++void lm_dlm_put_lock(lm_lock_t *lock)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ /* only issue a DLM unlock if the lock was ever granted, but free
++ the structure in either case so a never-locked lock doesn't leak */
++ if (lp->cur != DLM_LOCK_IV)
++ do_unlock(lp);
++ kfree(lp);
++}
++
++/**
++ * do_lock - acquire a lock
++ * @lp: the DLM lock
++ * @range: optional range
++ */
++
++void do_lock(dlm_lock_t *lp, struct dlm_range *range)
++{
++ dlm_t *dlm = lp->dlm;
++ strname_t str;
++ int error;
++
++ /*
++ * When recovery is in progress, delay lock requests and submit them
++ * once recovery is done. Requests for recovery (NOEXP) and unlocks
++ * can pass.
++ */
++
++ if (test_bit(DFL_BLOCK_LOCKS, &dlm->flags) &&
++ !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
++ queue_delayed(lp, QUEUE_LOCKS_BLOCKED);
++ return;
++ }
++
++ /*
++ * Submit the actual lock request.
++ */
++
++ make_strname(&lp->lockname, &str);
++
++ set_bit(LFL_WAIT_COMPLETE, &lp->flags);
++
++ error = dlm_lock(dlm->gdlm_lsp, lp->req, &lp->lksb, lp->lkf, str.name,
++ str.namelen, 0, lock_ast, (void *) lp,
++ lp->posix ? NULL : lock_bast, range);
++
++ if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
++ lp->lksb.sb_status = -EAGAIN;
++ queue_complete(lp);
++ error = 0;
++ }
++
++ DLM_ASSERT(!error,
++ printk("%s: num=%x,%"PRIx64" err=%d cur=%d req=%d lkf=%x\n",
++ dlm->fsname, lp->lockname.ln_type,
++ lp->lockname.ln_number, error, lp->cur, lp->req,
++ lp->lkf););
++}
++
++/**
++ * lm_dlm_lock - acquire a lock
++ * @lock: the lock to manipulate
++ * @cur_state: the current state
++ * @req_state: the requested state
++ * @flags: modifier flags
++ *
++ * Returns: A bitmap of LM_OUT_* on success, -EXXX on failure
++ */
++
++unsigned int lm_dlm_lock(lm_lock_t *lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ if (flags & LM_FLAG_NOEXP)
++ set_bit(LFL_NOBLOCK, &lp->flags);
++
++ check_cur_state(lp, cur_state);
++ lp->req = make_mode(req_state);
++ lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
++
++ do_lock(lp, NULL);
++ return LM_OUT_ASYNC;
++}
++
++int lm_dlm_lock_sync(lm_lock_t *lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ init_completion(&lp->uast_wait);
++ lm_dlm_lock(lock, cur_state, req_state, flags);
++ wait_for_completion(&lp->uast_wait);
++
++ return lp->lksb.sb_status;
++}
++
++/**
++ * lm_dlm_unlock - unlock a lock
++ * @lock: the lock to manipulate
++ * @cur_state: the current state
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++unsigned int lm_dlm_unlock(lm_lock_t *lock, unsigned int cur_state)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ check_cur_state(lp, cur_state);
++ lp->req = DLM_LOCK_NL;
++ lp->lkf = make_flags(lp, 0, lp->cur, lp->req);
++
++ do_lock(lp, NULL);
++
++ return LM_OUT_ASYNC;
++}
++
++void lm_dlm_unlock_sync(lm_lock_t *lock, unsigned int cur_state)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ init_completion(&lp->uast_wait);
++ lm_dlm_unlock(lock, cur_state);
++ wait_for_completion(&lp->uast_wait);
++}
++
++/**
++ * dlm_cancel - cancel a request that is blocked due to DFL_BLOCK_LOCKS
++ * @lock: the lock to cancel request for
++ *
++ */
++
++void lm_dlm_cancel(lm_lock_t *lock)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++ int dlist = FALSE;
++
++ printk("lock_dlm: cancel num=%x,%"PRIx64"\n",
++ lp->lockname.ln_type, lp->lockname.ln_number);
++
++ spin_lock(&lp->dlm->async_lock);
++ if (test_and_clear_bit(LFL_DLIST, &lp->flags)) {
++ list_del(&lp->dlist);
++ lp->type = 0;
++ dlist = TRUE;
++ }
++ spin_unlock(&lp->dlm->async_lock);
++
++ if (dlist) {
++ set_bit(LFL_CANCEL, &lp->flags);
++ queue_complete(lp);
++ }
++}
++
++/**
++ * dlm_hold_lvb - hold on to a lock value block
++ * @lock: the lock the LVB is associated with
++ * @lvbp: return the lvb memory here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int lm_dlm_hold_lvb(lm_lock_t *lock, char **lvbp)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++ char *lvb;
++
++ lvb = kmalloc(DLM_LVB_SIZE, GFP_KERNEL);
++ if (!lvb)
++ return -ENOMEM;
++
++ memset(lvb, 0, DLM_LVB_SIZE);
++
++ lp->lksb.sb_lvbptr = lvb;
++ lp->lvb = lvb;
++ *lvbp = lvb;
++
++ return 0;
++}
++
++/**
++ * dlm_unhold_lvb - release a LVB
++ * @lock: the lock the LVB is associated with
++ * @lvb: the lock value block
++ *
++ */
++
++void lm_dlm_unhold_lvb(lm_lock_t *lock, char *lvb)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++ kfree(lvb);
++ lp->lvb = NULL;
++ lp->lksb.sb_lvbptr = NULL;
++}
++
++/**
++ * dlm_sync_lvb - sync out the value of an lvb
++ * @lock: the lock the LVB is associated with
++ * @lvb: the lock value block
++ *
++ */
++
++void lm_dlm_sync_lvb(lm_lock_t *lock, char *lvb)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) lock;
++
++ if (lp->cur != DLM_LOCK_EX)
++ return;
++
++ init_completion(&lp->uast_wait);
++ set_bit(LFL_SYNC_LVB, &lp->flags);
++
++ lp->req = DLM_LOCK_EX;
++ lp->lkf = make_flags(lp, 0, lp->cur, lp->req);
++
++ do_lock(lp, NULL);
++ wait_for_completion(&lp->uast_wait);
++}
++
++/**
++ * dlm_recovery_done - reset the expired locks for a given jid
++ * @lockspace: the lockspace
++ * @jid: the jid
++ *
++ */
++
++void lm_dlm_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
++ unsigned int message)
++{
++ jid_recovery_done((dlm_t *) lockspace, jid, message);
++}
++
++/*
++ * Run in dlm_async
++ */
++
++/**
++ * process_submit - make DLM lock requests from dlm_async thread
++ * @lp: DLM Lock
++ *
++ */
++
++void process_submit(dlm_lock_t *lp)
++{
++ struct dlm_range range, *r = NULL;
++
++ if (lp->posix) {
++ range.ra_start = lp->posix->start;
++ range.ra_end = lp->posix->end;
++ r = &range;
++ }
++
++ do_lock(lp, r);
++}
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/lock_dlm.h linux-patched/fs/gfs_locking/lock_dlm/lock_dlm.h
+--- linux-orig/fs/gfs_locking/lock_dlm/lock_dlm.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/lock_dlm.h 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,323 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef LOCK_DLM_DOT_H
++#define LOCK_DLM_DOT_H
++
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/spinlock.h>
++#include <linux/module.h>
++#include <linux/types.h>
++#include <linux/string.h>
++#include <linux/list.h>
++#include <linux/lm_interface.h>
++#include <cluster/dlm.h>
++
++/* We take a shortcut and use lm_lockname structs for internal locks. This
++ means we must be careful to keep these types different from those used in
++ lm_interface.h. */
++
++#define LM_TYPE_JID (0x10)
++#define LM_TYPE_PLOCK_UPDATE (0x11)
++
++#define DLM_LVB_SIZE (DLM_LVB_LEN)
++
++/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
++ We sprintf these numbers into a 24 byte string of hex values to make them
++ human-readable (to make debugging simpler). */
++
++#define LOCK_DLM_STRNAME_BYTES (24)
++
++#define LOCK_DLM_MAX_NODES (128)
++
++struct dlm;
++struct dlm_lock;
++struct dlm_node;
++struct dlm_start;
++struct strname;
++
++typedef struct dlm dlm_t;
++typedef struct dlm_lock dlm_lock_t;
++typedef struct dlm_node dlm_node_t;
++typedef struct dlm_start dlm_start_t;
++typedef struct strname strname_t;
++
++#define DFL_FIRST_MOUNT 0
++#define DFL_THREAD_STOP 1
++#define DFL_GOT_NODEID 2
++#define DFL_MG_FINISH 3
++#define DFL_HAVE_JID 4
++#define DFL_BLOCK_LOCKS 5
++#define DFL_START_ERROR 6
++#define DFL_UMOUNT 7
++#define DFL_NEED_STARTDONE 8
++
++struct dlm {
++ uint32_t jid;
++ uint32_t our_nodeid;
++ unsigned long flags;
++
++ int cnlen;
++ char * clustername;
++ int fnlen;
++ char * fsname;
++ int max_nodes;
++
++ dlm_lockspace_t * gdlm_lsp;
++
++ lm_callback_t fscb;
++ lm_fsdata_t * fsdata;
++ dlm_lock_t * jid_lock;
++
++ spinlock_t async_lock;
++ struct list_head complete;
++ struct list_head blocking;
++ struct list_head delayed;
++ struct list_head submit;
++ struct list_head starts;
++
++ wait_queue_head_t wait;
++ atomic_t threads;
++
++ int mg_local_id;
++ int mg_last_start;
++ int mg_last_stop;
++ int mg_last_finish;
++ struct list_head mg_nodes;
++ struct semaphore mg_nodes_lock;
++
++ struct list_head resources;
++ struct semaphore res_lock;
++};
++
++struct dlm_resource {
++ dlm_t * dlm;
++ struct list_head list; /* list of resources */
++ struct lm_lockname name; /* the resource name */
++ struct semaphore sema;
++ struct list_head locks; /* one lock for each range */
++ int count;
++ dlm_lock_t * update;
++ struct list_head async_locks;
++ spinlock_t async_spin;
++};
++
++struct posix_lock {
++ struct list_head list; /* resource locks list */
++ struct list_head async_list; /* resource async_locks list */
++ struct dlm_resource * resource;
++ dlm_lock_t * lp;
++ unsigned long owner;
++ uint64_t start;
++ uint64_t end;
++ int count;
++ int ex;
++};
++
++#define LFL_NOBLOCK 0
++#define LFL_NOCACHE 1
++#define LFL_UNLOCK_RECOVERY 2
++#define LFL_DLM_UNLOCK 3
++#define LFL_TRYFAILED 4
++#define LFL_SYNC_LVB 5
++#define LFL_FORCE_PROMOTE 6
++#define LFL_REREQUEST 7
++#define LFL_WAIT_COMPLETE 8
++#define LFL_CLIST 9
++#define LFL_BLIST 10
++#define LFL_DLIST 11
++#define LFL_SLIST 12
++#define LFL_IDLOCK 13
++#define LFL_CANCEL 14
++
++struct dlm_lock {
++ dlm_t * dlm;
++ struct lm_lockname lockname;
++ char * lvb;
++ struct dlm_lksb lksb;
++
++ int16_t cur;
++ int16_t req;
++ int16_t prev_req;
++ unsigned int lkf;
++ unsigned int type;
++ unsigned long flags;
++
++ int bast_mode; /* protected by async_lock */
++ struct completion uast_wait;
++
++ struct list_head clist; /* complete */
++ struct list_head blist; /* blocking */
++ struct list_head dlist; /* delayed */
++ struct list_head slist; /* submit */
++
++ struct posix_lock * posix;
++};
++
++#define NFL_SENT_CB 0
++#define NFL_NOT_MEMBER 1
++#define NFL_RECOVERY_DONE 2
++#define NFL_LAST_FINISH 3
++#define NFL_HAVE_JID 4
++
++struct dlm_node {
++ uint32_t nodeid;
++ uint32_t jid;
++ unsigned long flags;
++ struct list_head list;
++};
++
++#define QUEUE_LOCKS_BLOCKED 1
++#define QUEUE_ERROR_UNLOCK 2
++#define QUEUE_ERROR_LOCK 3
++#define QUEUE_ERROR_RETRY 4
++
++struct strname {
++ unsigned char name[LOCK_DLM_STRNAME_BYTES];
++ unsigned short namelen;
++};
++
++struct dlm_start {
++ uint32_t * nodeids;
++ int count;
++ int type;
++ int event_id;
++ struct list_head list;
++};
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#if (BITS_PER_LONG == 64)
++#define PRIu64 "lu"
++#define PRId64 "ld"
++#define PRIo64 "lo"
++#define PRIx64 "lx"
++#define PRIX64 "lX"
++#define SCNu64 "lu"
++#define SCNd64 "ld"
++#define SCNo64 "lo"
++#define SCNx64 "lx"
++#define SCNX64 "lX"
++#else
++#define PRIu64 "Lu"
++#define PRId64 "Ld"
++#define PRIo64 "Lo"
++#define PRIx64 "Lx"
++#define PRIX64 "LX"
++#define SCNu64 "Lu"
++#define SCNd64 "Ld"
++#define SCNo64 "Lo"
++#define SCNx64 "Lx"
++#define SCNX64 "LX"
++#endif
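++
++/* These mirror userspace <inttypes.h>, which is unavailable in the
++   kernel; e.g. printk("%"PRIu64"\n", v) expands to "%lu" on 64-bit
++   and "%Lu" on 32-bit kernels. */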
++
++extern struct lm_lockops lock_dlm_ops;
++
++/* group.c */
++
++int init_mountgroup(dlm_t * dlm);
++void release_mountgroup(dlm_t * dlm);
++void process_start(dlm_t * dlm, dlm_start_t * ds);
++void process_finish(dlm_t * dlm);
++void jid_recovery_done(dlm_t * dlm, unsigned int jid, unsigned int message);
++
++/* thread.c */
++
++int init_async_thread(dlm_t * dlm);
++void release_async_thread(dlm_t * dlm);
++
++/* lock.c */
++
++int16_t make_lmstate(int16_t dlmmode);
++void queue_delayed(dlm_lock_t * lp, int type);
++void process_submit(dlm_lock_t * lp);
++int create_lp(dlm_t *dlm, struct lm_lockname *name, dlm_lock_t **lpp);
++void do_lock(dlm_lock_t *lp, struct dlm_range *range);
++int do_unlock(dlm_lock_t *lp);
++
++int lm_dlm_get_lock(lm_lockspace_t * lockspace, struct lm_lockname * name,
++ lm_lock_t ** lockp);
++void lm_dlm_put_lock(lm_lock_t * lock);
++unsigned int lm_dlm_lock(lm_lock_t * lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags);
++int lm_dlm_lock_sync(lm_lock_t * lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags);
++unsigned int lm_dlm_unlock(lm_lock_t * lock, unsigned int cur_state);
++void lm_dlm_unlock_sync(lm_lock_t * lock, unsigned int cur_state);
++void lm_dlm_cancel(lm_lock_t * lock);
++int lm_dlm_hold_lvb(lm_lock_t * lock, char **lvbp);
++void lm_dlm_unhold_lvb(lm_lock_t * lock, char *lvb);
++void lm_dlm_sync_lvb(lm_lock_t * lock, char *lvb);
++void lm_dlm_recovery_done(lm_lockspace_t * lockspace, unsigned int jid,
++ unsigned int message);
++
++/* plock.c */
++
++int lm_dlm_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end);
++
++int lm_dlm_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, uint64_t start, uint64_t end);
++
++int lm_dlm_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, uint64_t *start, uint64_t *end,
++ int *ex, unsigned long *rowner);
++
++/* main.c */
++
++void lock_dlm_debug_log(const char *fmt, ...);
++void lock_dlm_debug_dump(void);
++
++
++#define LOCK_DLM_DEBUG
++
++#ifdef LOCK_DLM_DEBUG
++#define log_debug(fmt, args...) lock_dlm_debug_log(fmt, ##args)
++#else
++#define log_debug(fmt, args...)
++#endif
++
++#define DLM_ASSERT(x, do) \
++{ \
++ if (!(x)) \
++ { \
++ lock_dlm_debug_dump(); \
++ printk("\nlock_dlm: Assertion failed on line %d of file %s\n" \
++ "lock_dlm: assertion: \"%s\"\n" \
++ "lock_dlm: time = %lu\n", \
++ __LINE__, __FILE__, #x, jiffies); \
++ {do} \
++ printk("\n"); \
++ panic("lock_dlm: Record message above and reboot.\n"); \
++ } \
++}
++
++#define DLM_RETRY(do_this, until_this) \
++for (;;) \
++{ \
++ do { do_this; } while (0); \
++ if (until_this) \
++ break; \
++ printk("lock_dlm: out of memory: %s, %u\n", __FILE__, __LINE__); \
++ schedule();\
++}
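++
++/*
++ * Example (matching the uses in group.c): retry an allocation until it
++ * succeeds, yielding the CPU between attempts:
++ *
++ *	DLM_RETRY(node = kmalloc(sizeof(dlm_node_t), GFP_KERNEL), node);
++ */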
++
++#endif
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/main.c linux-patched/fs/gfs_locking/lock_dlm/main.c
+--- linux-orig/fs/gfs_locking/lock_dlm/main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/main.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,192 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "lock_dlm.h"
++#include <linux/init.h>
++#include <linux/proc_fs.h>
++
++#if defined(LOCK_DLM_DEBUG)
++#define LOCK_DLM_DEBUG_SIZE (1024)
++#define MAX_DEBUG_MSG_LEN (64)
++#else
++#define LOCK_DLM_DEBUG_SIZE (0)
++#define MAX_DEBUG_MSG_LEN (0)
++#endif
++
++static char * debug_buf;
++static unsigned int debug_size;
++static unsigned int debug_point;
++static int debug_wrap;
++static spinlock_t debug_lock;
++static struct proc_dir_entry * debug_proc_entry = NULL;
++
++
++void lock_dlm_debug_log(const char *fmt, ...)
++{
++ va_list va;
++ int i, n, size, len;
++ char buf[MAX_DEBUG_MSG_LEN+1];
++
++ spin_lock(&debug_lock);
++
++ if (!debug_buf)
++ goto out;
++
++ size = MAX_DEBUG_MSG_LEN;
++ memset(buf, 0, size+1);
++
++ n = 0;
++ /* n = snprintf(buf, size, "%s ", dlm->fsname); */
++ size -= n;
++
++ va_start(va, fmt);
++ vsnprintf(buf+n, size, fmt, va);
++ va_end(va);
++
++ len = strlen(buf);
++ if (len > MAX_DEBUG_MSG_LEN-1)
++ len = MAX_DEBUG_MSG_LEN-1;
++ buf[len] = '\n';
++ buf[len+1] = '\0';
++
++ for (i = 0; i < strlen(buf); i++) {
++ debug_buf[debug_point++] = buf[i];
++
++ if (debug_point == debug_size) {
++ debug_point = 0;
++ debug_wrap = 1;
++ }
++ }
++ out:
++ spin_unlock(&debug_lock);
++}
++
++static void debug_setup(int size)
++{
++ char *b = NULL;
++
++ if (size > PAGE_SIZE)
++ size = PAGE_SIZE;
++ if (size)
++ b = kmalloc(size, GFP_KERNEL);
++
++ spin_lock(&debug_lock);
++ kfree(debug_buf); /* kfree(NULL) is a no-op */
++ debug_buf = NULL; /* don't leave a dangling pointer if we bail */
++ debug_size = 0;
++ if (!size || !b)
++ goto out;
++ debug_size = size;
++ debug_point = 0;
++ debug_wrap = 0;
++ debug_buf = b;
++ memset(debug_buf, 0, debug_size);
++ out:
++ spin_unlock(&debug_lock);
++}
++
++static void debug_init(void)
++{
++ debug_buf = NULL;
++ debug_size = 0;
++ debug_point = 0;
++ debug_wrap = 0;
++ spin_lock_init(&debug_lock);
++ debug_setup(LOCK_DLM_DEBUG_SIZE);
++}
++
++void lock_dlm_debug_dump(void)
++{
++ int i;
++
++ spin_lock(&debug_lock);
++
++ if (debug_wrap) {
++ for (i = debug_point; i < debug_size; i++)
++ printk("%c", debug_buf[i]);
++ }
++ for (i = 0; i < debug_point; i++)
++ printk("%c", debug_buf[i]);
++
++ spin_unlock(&debug_lock);
++}
++
++#ifdef CONFIG_PROC_FS
++int lock_dlm_debug_info(char *b, char **start, off_t offset, int length)
++{
++ int i, n = 0;
++
++ spin_lock(&debug_lock);
++
++ if (debug_wrap) {
++ for (i = debug_point; i < debug_size; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++ }
++ for (i = 0; i < debug_point; i++)
++ n += sprintf(b + n, "%c", debug_buf[i]);
++
++ spin_unlock(&debug_lock);
++
++ return n;
++}
++#endif
++
++/**
++ * init_lock_dlm - initialize the lock_dlm module
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int __init init_lock_dlm(void)
++{
++ int error;
++
++ error = lm_register_proto(&lock_dlm_ops);
++ if (error) {
++ printk("lock_dlm: can't register protocol: (%d)\n", error);
++ return error;
++ }
++
++#ifdef CONFIG_PROC_FS
++ debug_proc_entry = create_proc_entry("cluster/lock_dlm_debug", S_IRUGO,
++ NULL);
++ if (debug_proc_entry)
++ debug_proc_entry->get_info = &lock_dlm_debug_info;
++#endif
++ debug_init();
++
++ printk("Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
++ return 0;
++}
++
++/**
++ * exit_lock_dlm - clean up the lock_dlm module
++ *
++ */
++
++void __exit exit_lock_dlm(void)
++{
++ lm_unregister_proto(&lock_dlm_ops);
++
++#ifdef CONFIG_PROC_FS
++ if (debug_proc_entry)
++ remove_proc_entry("cluster/lock_dlm_debug", NULL);
++#endif
++ debug_setup(0);
++}
++
++module_init(init_lock_dlm);
++module_exit(exit_lock_dlm);
++
++MODULE_DESCRIPTION("GFS DLM Locking Module");
++MODULE_AUTHOR("Red Hat, Inc.");
++MODULE_LICENSE("GPL");
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/mount.c linux-patched/fs/gfs_locking/lock_dlm/mount.c
+--- linux-orig/fs/gfs_locking/lock_dlm/mount.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/mount.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,335 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/socket.h>
++#include <net/sock.h>
++
++#include "lock_dlm.h"
++#include <cluster/cnxman.h>
++#include <cluster/service.h>
++
++static int init_cman(dlm_t *dlm)
++{
++ int error = -1;
++ char *name = NULL;
++
++ if (!dlm->clustername)
++ goto fail;
++
++ error = kcl_addref_cluster();
++ if (error) {
++ printk("lock_dlm: cannot get cman reference %d\n", error);
++ goto fail;
++ }
++
++ error = kcl_cluster_name(&name);
++ if (error) {
++ printk("lock_dlm: cannot get cman cluster name %d\n", error);
++ goto fail_ref;
++ }
++
++ if (strcmp(name, dlm->clustername)) {
++ error = -1;
++ printk("lock_dlm: cman cluster name \"%s\" does not match "
++ "file system cluster name \"%s\"\n",
++ name, dlm->clustername);
++ goto fail_ref;
++ }
++
++ kfree(name);
++ return 0;
++
++ fail_ref:
++ kcl_releaseref_cluster();
++ fail:
++ if (name)
++ kfree(name);
++ return error;
++}
++
++static int release_cman(dlm_t *dlm)
++{
++ return kcl_releaseref_cluster();
++}
++
++static int init_cluster(dlm_t *dlm, char *table_name)
++{
++ char *buf, *c, *clname, *fsname;
++ int len, error = -1;
++
++ /*
++ * Parse superblock lock table <clustername>:<fsname>
++ */
++
++ len = strlen(table_name) + 1;
++ buf = kmalloc(len, GFP_KERNEL);
++ if (!buf)
++ goto out;
++ memset(buf, 0, len);
++ memcpy(buf, table_name, strlen(table_name));
++
++ c = strchr(buf, ':');
++ if (!c)
++ goto out_buf;
++
++ *c = '\0';
++ clname = buf;
++ fsname = ++c;
++
++ dlm->max_nodes = LOCK_DLM_MAX_NODES;
++
++ len = strlen(clname) + 1;
++ c = kmalloc(len, GFP_KERNEL);
++ if (!c)
++ goto out_buf;
++ memset(c, 0, len);
++ memcpy(c, clname, len-1);
++ dlm->cnlen = len-1;
++ dlm->clustername = c;
++
++ len = strlen(fsname) + 1;
++ c = kmalloc(len, GFP_KERNEL);
++ if (!c)
++ goto out_cn;
++ memset(c, 0, len);
++ memcpy(c, fsname, len-1);
++ dlm->fnlen = len-1;
++ dlm->fsname = c;
++
++ error = init_cman(dlm);
++ if (error)
++ goto out_fn;
++
++ kfree(buf);
++ return 0;
++
++ out_fn:
++ kfree(dlm->fsname);
++ out_cn:
++ kfree(dlm->clustername);
++ out_buf:
++ kfree(buf);
++ out:
++ printk("lock_dlm: init_cluster error %d\n", error);
++ return error;
++}
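++
++/*
++ * Sketch (illustrative; the table name is hypothetical): for a
++ * superblock lock table of "alpha:gfs1", init_cluster() leaves
++ * dlm->clustername set to "alpha" and dlm->fsname set to "gfs1", and
++ * fails if cman was started with a different cluster name.
++ */
++#if 0
++static int example_init_cluster(dlm_t *dlm)
++{
++ return init_cluster(dlm, "alpha:gfs1");
++}
++#endif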
++
++static int release_cluster(dlm_t *dlm)
++{
++ release_cman(dlm);
++ kfree(dlm->clustername);
++ kfree(dlm->fsname);
++ return 0;
++}
++
++static int init_fence(dlm_t *dlm)
++{
++ LIST_HEAD(head);
++ struct kcl_service *s, *safe;
++ int error, found = FALSE;
++
++ error = kcl_get_services(&head, SERVICE_LEVEL_FENCE);
++ if (error < 0)
++ goto out;
++
++ list_for_each_entry_safe(s, safe, &head, list) {
++ list_del(&s->list);
++ if (!found && !strcmp(s->name, "default"))
++ found = TRUE;
++ kfree(s);
++ }
++
++ if (found)
++ return 0;
++
++ error = -1;
++ out:
++ printk("lock_dlm: init_fence error %d\n", error);
++ return error;
++}
++
++static int release_fence(dlm_t *dlm)
++{
++ return 0;
++}
++
++static int init_gdlm(dlm_t *dlm)
++{
++ int error;
++
++ error = dlm_new_lockspace(dlm->fsname, dlm->fnlen, &dlm->gdlm_lsp,
++ DLM_LSF_NOTIMERS);
++ if (error)
++ printk("lock_dlm: new lockspace error %d\n", error);
++
++ return error;
++}
++
++static int release_gdlm(dlm_t *dlm)
++{
++ dlm_release_lockspace(dlm->gdlm_lsp, 1);
++ return 0;
++}
++
++static dlm_t *init_dlm(lm_callback_t cb, lm_fsdata_t *fsdata)
++{
++ dlm_t *dlm;
++
++ dlm = kmalloc(sizeof(dlm_t), GFP_KERNEL);
++ if (!dlm)
++ return NULL;
++
++ memset(dlm, 0, sizeof(dlm_t));
++
++ dlm->fscb = cb;
++ dlm->fsdata = fsdata;
++
++ spin_lock_init(&dlm->async_lock);
++
++ INIT_LIST_HEAD(&dlm->complete);
++ INIT_LIST_HEAD(&dlm->blocking);
++ INIT_LIST_HEAD(&dlm->delayed);
++ INIT_LIST_HEAD(&dlm->submit);
++ INIT_LIST_HEAD(&dlm->starts);
++ INIT_LIST_HEAD(&dlm->resources);
++
++ init_waitqueue_head(&dlm->wait);
++
++ INIT_LIST_HEAD(&dlm->mg_nodes);
++ init_MUTEX(&dlm->mg_nodes_lock);
++ init_MUTEX(&dlm->res_lock);
++
++ return dlm;
++}
++
++/**
++ * lm_dlm_mount - mount a dlm lockspace
++ * @table_name: the name of the space to mount, "<clustername>:<fsname>"
++ * @host_data: host specific data
++ * @cb: the file system callback
++ * @fsdata: private file system data passed back through @cb
++ * @min_lvb_size: the minimum LVB size the file system requires
++ * @lockstruct: the lockstruct to fill in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int lm_dlm_mount(char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t *fsdata,
++ unsigned int min_lvb_size,
++ struct lm_lockstruct *lockstruct)
++{
++ dlm_t *dlm;
++ int error = -ENOMEM;
++
++ if (min_lvb_size > DLM_LVB_SIZE)
++ goto out;
++
++ dlm = init_dlm(cb, fsdata);
++ if (!dlm)
++ goto out;
++
++ error = init_cluster(dlm, table_name);
++ if (error)
++ goto out_free;
++
++ error = init_fence(dlm);
++ if (error)
++ goto out_cluster;
++
++ error = init_gdlm(dlm);
++ if (error)
++ goto out_fence;
++
++ error = init_async_thread(dlm);
++ if (error)
++ goto out_gdlm;
++
++ error = init_mountgroup(dlm);
++ if (error)
++ goto out_thread;
++
++ lockstruct->ls_jid = dlm->jid;
++ lockstruct->ls_first = test_bit(DFL_FIRST_MOUNT, &dlm->flags);
++ lockstruct->ls_lockspace = dlm;
++ lockstruct->ls_ops = &lock_dlm_ops;
++ lockstruct->ls_flags = LM_LSFLAG_ASYNC;
++ lockstruct->ls_lvb_size = DLM_LVB_SIZE;
++ return 0;
++
++ out_thread:
++ release_async_thread(dlm);
++
++ out_gdlm:
++ release_gdlm(dlm);
++
++ out_fence:
++ release_fence(dlm);
++
++ out_cluster:
++ release_cluster(dlm);
++
++ out_free:
++ kfree(dlm);
++
++ out:
++ return error;
++}
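++
++/*
++ * Mount sketch (illustrative only; my_cb and the table name are
++ * hypothetical, the callback signature assumes lm_interface's
++ * (fsdata, type, data) form, and GFS normally reaches lm_dlm_mount()
++ * through the lock harness rather than calling it directly).
++ */
++#if 0
++static void my_cb(lm_fsdata_t *fsdata, unsigned int type, void *data)
++{
++ /* GFS would handle LM_CB_ASYNC, LM_CB_NEED_E, etc. here */
++}
++
++static int example_mount(struct lm_lockstruct *ls)
++{
++ /* 32 is a hypothetical minimum LVB size for this sketch */
++ return lm_dlm_mount("alpha:gfs1", NULL, my_cb, NULL, 32, ls);
++}
++#endif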
++
++/**
++ * lm_dlm_others_may_mount
++ * @lockspace: the lockspace
++ *
++ */
++
++static void lm_dlm_others_may_mount(lm_lockspace_t *lockspace)
++{
++ /* Do nothing. The first node to join the Mount Group will complete
++ * before Service Manager allows another node to join. */
++}
++
++/**
++ * lm_dlm_unmount - unmount a lock space
++ * @lockspace: the lockspace to unmount
++ *
++ */
++
++static void lm_dlm_unmount(lm_lockspace_t *lockspace)
++{
++ dlm_t *dlm = (dlm_t *) lockspace;
++
++ release_mountgroup(dlm);
++ release_async_thread(dlm);
++ release_gdlm(dlm);
++ release_fence(dlm);
++ release_cluster(dlm);
++ kfree(dlm);
++}
++
++struct lm_lockops lock_dlm_ops = {
++ .lm_proto_name = "lock_dlm",
++ .lm_mount = lm_dlm_mount,
++ .lm_others_may_mount = lm_dlm_others_may_mount,
++ .lm_unmount = lm_dlm_unmount,
++ .lm_get_lock = lm_dlm_get_lock,
++ .lm_put_lock = lm_dlm_put_lock,
++ .lm_lock = lm_dlm_lock,
++ .lm_unlock = lm_dlm_unlock,
++ .lm_plock = lm_dlm_plock,
++ .lm_punlock = lm_dlm_punlock,
++ .lm_plock_get = lm_dlm_plock_get,
++ .lm_cancel = lm_dlm_cancel,
++ .lm_hold_lvb = lm_dlm_hold_lvb,
++ .lm_unhold_lvb = lm_dlm_unhold_lvb,
++ .lm_sync_lvb = lm_dlm_sync_lvb,
++ .lm_recovery_done = lm_dlm_recovery_done,
++ .lm_owner = THIS_MODULE,
++};
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/plock.c linux-patched/fs/gfs_locking/lock_dlm/plock.c
+--- linux-orig/fs/gfs_locking/lock_dlm/plock.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/plock.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,1037 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "lock_dlm.h"
++
++#define MIN(a,b) (((a) <= (b)) ? (a) : (b))
++#define MAX(a,b) (((a) >= (b)) ? (a) : (b))
++
++#define CREATE 1
++#define NO_CREATE 0
++
++#define WAIT 1
++#define NO_WAIT 0
++#define X_WAIT -1
++
++#define EX 1
++#define NO_EX 0
++#define SH NO_EX
++
++
++static int check_conflict(dlm_t *dlm, struct dlm_resource *r,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t start, uint64_t end, int ex);
++
++
++static int lock_resource(struct dlm_resource *r)
++{
++ dlm_lock_t *lp;
++ struct lm_lockname name;
++ int error;
++
++ name.ln_type = LM_TYPE_PLOCK_UPDATE;
++ name.ln_number = r->name.ln_number;
++
++ error = create_lp(r->dlm, &name, &lp);
++ if (error)
++ return error;
++
++ set_bit(LFL_IDLOCK, &lp->flags);
++ lp->req = DLM_LOCK_EX;
++ do_lock(lp, NULL);
++ wait_for_completion(&lp->uast_wait);
++
++ error = lp->lksb.sb_status;
++ if (error) {
++ kfree(lp);
++ lp = NULL;
++ }
++
++ r->update = lp;
++ return error;
++}
++
++static void unlock_resource(struct dlm_resource *r)
++{
++ do_unlock(r->update);
++ kfree(r->update);
++}
++
++static struct dlm_resource *search_resource(dlm_t *dlm, struct lm_lockname *name)
++{
++ struct dlm_resource *r;
++
++ list_for_each_entry(r, &dlm->resources, list) {
++ if (lm_name_equal(&r->name, name))
++ return r;
++ }
++ return NULL;
++}
++
++static int get_resource(dlm_t *dlm, struct lm_lockname *name, int create,
++ struct dlm_resource **res)
++{
++ struct dlm_resource *r, *r2;
++ int error = -ENOMEM;
++
++ down(&dlm->res_lock);
++ r = search_resource(dlm, name);
++ if (r)
++ r->count++;
++ up(&dlm->res_lock);
++
++ if (r)
++ goto out;
++
++ if (create == NO_CREATE) {
++ error = -ENOENT;
++ goto fail;
++ }
++
++ r = kmalloc(sizeof(struct dlm_resource), GFP_KERNEL);
++ if (!r)
++ goto fail;
++
++ memset(r, 0, sizeof(struct dlm_resource));
++ r->dlm = dlm;
++ r->name = *name;
++ r->count = 1;
++ INIT_LIST_HEAD(&r->locks);
++ INIT_LIST_HEAD(&r->async_locks);
++ init_MUTEX(&r->sema);
++ spin_lock_init(&r->async_spin);
++
++ down(&dlm->res_lock);
++ r2 = search_resource(dlm, name);
++ if (r2) {
++ r2->count++;
++ up(&dlm->res_lock);
++ kfree(r);
++ r = r2;
++ goto out;
++ }
++
++ list_add_tail(&r->list, &dlm->resources);
++ up(&dlm->res_lock);
++
++ out:
++ *res = r;
++ return 0;
++ fail:
++ return error;
++}
++
++static void put_resource(struct dlm_resource *r)
++{
++ dlm_t *dlm = r->dlm;
++
++ down(&dlm->res_lock);
++ r->count--;
++ if (r->count == 0) {
++ DLM_ASSERT(list_empty(&r->locks), );
++ DLM_ASSERT(list_empty(&r->async_locks), );
++ list_del(&r->list);
++ kfree(r);
++ }
++ up(&dlm->res_lock);
++}
++
++static inline void hold_resource(struct dlm_resource *r)
++{
++ down(&r->dlm->res_lock);
++ r->count++;
++ up(&r->dlm->res_lock);
++}
++
++static inline int ranges_overlap(uint64_t start1, uint64_t end1,
++ uint64_t start2, uint64_t end2)
++{
++ if (end1 < start2 || start1 > end2)
++ return FALSE;
++ return TRUE;
++}
++
++/**
++ * overlap_type - returns a value based on the type of overlap
++ * @s1 - start of new lock range
++ * @e1 - end of new lock range
++ * @s2 - start of existing lock range
++ * @e2 - end of existing lock range
++ *
++ */
++
++static int overlap_type(uint64_t s1, uint64_t e1, uint64_t s2, uint64_t e2)
++{
++ int ret;
++
++ /*
++ * ---r1---
++ * ---r2---
++ */
++
++ if (s1 == s2 && e1 == e2)
++ ret = 0;
++
++ /*
++ * --r1--
++ * ---r2---
++ */
++
++ else if (s1 == s2 && e1 < e2)
++ ret = 1;
++
++ /*
++ * --r1--
++ * ---r2---
++ */
++
++ else if (s1 > s2 && e1 == e2)
++ ret = 1;
++
++ /*
++ * --r1--
++ * ---r2---
++ */
++
++ else if (s1 > s2 && e1 < e2)
++ ret = 2;
++
++ /*
++ * ---r1--- or ---r1--- or ---r1---
++ * --r2-- --r2-- --r2--
++ */
++
++ else if (s1 <= s2 && e1 >= e2)
++ ret = 3;
++
++ /*
++ * ---r1---
++ * ---r2---
++ */
++
++ else if (s1 > s2 && e1 > e2)
++ ret = 4;
++
++ /*
++ * ---r1---
++ * ---r2---
++ */
++
++ else if (s1 < s2 && e1 < e2)
++ ret = 4;
++
++ else
++ ret = -1;
++
++ return ret;
++}
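++
++/*
++ * Worked example (illustrative): against an existing lock at 10-20,
++ * overlap_type() classifies a new request as
++ * (10,20) -> 0 identical range
++ * (10,15) -> 1 inside, sharing a boundary
++ * (12,18) -> 2 strictly inside
++ * (5,25) -> 3 covers the existing range
++ * (15,30) -> 4 partial overlap off one end
++ */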
++
++/* shrink the range start2:end2 by the partially overlapping start:end */
++
++static int shrink_range2(uint64_t *start2, uint64_t *end2,
++ uint64_t start, uint64_t end)
++{
++ int error = 0;
++
++ if (*start2 < start)
++ *end2 = start - 1;
++ else if (*end2 > end)
++ *start2 = end + 1;
++ else
++ error = -1;
++ return error;
++}
++
++static int shrink_range(struct posix_lock *po, uint64_t start, uint64_t end)
++{
++ return shrink_range2(&po->start, &po->end, start, end);
++}
++
++static void put_lock(dlm_lock_t *lp)
++{
++ struct posix_lock *po = lp->posix;
++
++ po->count--;
++ if (po->count == 0) {
++ kfree(po);
++ kfree(lp);
++ }
++}
++
++static int create_lock(struct dlm_resource *r, unsigned long owner, int ex,
++ uint64_t start, uint64_t end, dlm_lock_t **lpp)
++{
++ dlm_lock_t *lp;
++ struct posix_lock *po;
++ int error;
++
++ error = create_lp(r->dlm, &r->name, &lp);
++ if (error)
++ return error;
++
++ po = kmalloc(sizeof(struct posix_lock), GFP_KERNEL);
++ if (!po) {
++ kfree(lp);
++ return -ENOMEM;
++ }
++ memset(po, 0, sizeof(struct posix_lock));
++
++ lp->posix = po;
++ po->lp = lp;
++ po->resource = r;
++ po->count = 1;
++ po->start = start;
++ po->end = end;
++ po->owner = owner;
++ po->ex = ex;
++ list_add_tail(&po->list, &r->locks);
++
++ *lpp = lp;
++ return 0;
++}
++
++static unsigned int make_flags_posix(dlm_lock_t *lp, int wait)
++{
++ unsigned int lkf = 0;
++
++ if (wait == NO_WAIT || wait == X_WAIT)
++ lkf |= DLM_LKF_NOQUEUE;
++
++ if (lp->lksb.sb_lkid != 0) {
++ lkf |= DLM_LKF_CONVERT;
++ if (wait == WAIT)
++ lkf |= DLM_LKF_EXPEDITE;
++ }
++ return lkf;
++}
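++
++/*
++ * For example (illustrative): a fresh request with wait == NO_WAIT maps
++ * to DLM_LKF_NOQUEUE, while converting an existing lock (sb_lkid != 0)
++ * with wait == WAIT maps to DLM_LKF_CONVERT | DLM_LKF_EXPEDITE.
++ */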
++
++static void do_range_lock(dlm_lock_t *lp)
++{
++ struct dlm_range range = { lp->posix->start, lp->posix->end };
++ do_lock(lp, &range);
++}
++
++static void request_lock(dlm_lock_t *lp, int wait)
++{
++ log_debug("req %x,%"PRIx64" %s %"PRIx64"-%"PRIx64" %u w %u",
++ lp->lockname.ln_type, lp->lockname.ln_number,
++ lp->posix->ex ? "ex" : "sh", lp->posix->start,
++ lp->posix->end, current->pid, wait);
++
++ set_bit(LFL_IDLOCK, &lp->flags);
++ lp->req = lp->posix->ex ? DLM_LOCK_EX : DLM_LOCK_PR;
++ lp->lkf = make_flags_posix(lp, wait);
++
++ do_range_lock(lp);
++}
++
++static void add_async(struct posix_lock *po, struct dlm_resource *r)
++{
++ spin_lock(&r->async_spin);
++ list_add_tail(&po->async_list, &r->async_locks);
++ spin_unlock(&r->async_spin);
++}
++
++static void del_async(struct posix_lock *po, struct dlm_resource *r)
++{
++ spin_lock(&r->async_spin);
++ list_del(&po->async_list);
++ spin_unlock(&r->async_spin);
++}
++
++static int wait_async(dlm_lock_t *lp)
++{
++ wait_for_completion(&lp->uast_wait);
++ del_async(lp->posix, lp->posix->resource);
++ return lp->lksb.sb_status;
++}
++
++static void wait_async_list(struct dlm_resource *r, unsigned long owner)
++{
++ struct posix_lock *po;
++ int error, found;
++
++ restart:
++ found = FALSE;
++ spin_lock(&r->async_spin);
++ list_for_each_entry(po, &r->async_locks, async_list) {
++ if (po->owner != owner)
++ continue;
++ found = TRUE;
++ break;
++ }
++ spin_unlock(&r->async_spin);
++
++ if (found) {
++ DLM_ASSERT(po->lp, );
++ error = wait_async(po->lp);
++ DLM_ASSERT(!error, );
++ goto restart;
++ }
++}
++
++static void update_lock(dlm_lock_t *lp, int wait)
++{
++ request_lock(lp, wait);
++ add_async(lp->posix, lp->posix->resource);
++
++ if (wait == NO_WAIT || wait == X_WAIT) {
++ int error = wait_async(lp);
++ DLM_ASSERT(!error, printk("error=%d\n", error););
++ }
++}
++
++static void add_lock(struct dlm_resource *r, unsigned long owner, int wait,
++ int ex, uint64_t start, uint64_t end)
++{
++ dlm_lock_t *lp;
++ int error;
++
++ error = create_lock(r, owner, ex, start, end, &lp);
++ DLM_ASSERT(!error, );
++
++ hold_resource(r);
++ update_lock(lp, wait);
++}
++
++static int remove_lock(dlm_lock_t *lp)
++{
++ struct dlm_resource *r = lp->posix->resource;
++
++ log_debug("remove %x,%"PRIx64" %u",
++ r->name.ln_type, r->name.ln_number, current->pid);
++
++ do_unlock(lp);
++ put_lock(lp);
++ put_resource(r);
++ return 0;
++}
++
++/* RN within RE (and starts or ends on RE boundary)
++ 1. add new lock for non-overlap area of RE, orig mode
++ 2. convert RE to RN range and mode */
++
++static int lock_case1(struct posix_lock *po, struct dlm_resource *r,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end)
++{
++ uint64_t start2, end2;
++
++ /* non-overlapping area start2:end2 */
++ start2 = po->start;
++ end2 = po->end;
++ shrink_range2(&start2, &end2, start, end);
++
++ po->start = start;
++ po->end = end;
++ po->ex = ex;
++
++ if (ex) {
++ add_lock(r, owner, X_WAIT, SH, start2, end2);
++ update_lock(po->lp, wait);
++ } else {
++ add_lock(r, owner, WAIT, EX, start2, end2);
++ update_lock(po->lp, X_WAIT);
++ }
++ return 0;
++}
++
++/* RN within RE (RE overlaps RN on both sides)
++ 1. add new lock for front fragment, orig mode
++ 2. add new lock for back fragment, orig mode
++ 3. convert RE to RN range and mode */
++
++static int lock_case2(struct posix_lock *po, struct dlm_resource *r,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end)
++{
++ if (ex) {
++ add_lock(r, owner, X_WAIT, SH, po->start, start-1);
++ add_lock(r, owner, X_WAIT, SH, end+1, po->end);
++
++ po->start = start;
++ po->end = end;
++ po->ex = ex;
++
++ update_lock(po->lp, wait);
++ } else {
++ add_lock(r, owner, WAIT, EX, po->start, start-1);
++ add_lock(r, owner, WAIT, EX, end+1, po->end);
++
++ po->start = start;
++ po->end = end;
++ po->ex = ex;
++
++ update_lock(po->lp, X_WAIT);
++ }
++ return 0;
++}
++
++/* returns ranges from exist list in order of their start values */
++
++static int next_exist(struct list_head *exist, uint64_t *start, uint64_t *end)
++{
++ struct posix_lock *po;
++ int first = TRUE, first_call = FALSE;
++
++ if (!*start && !*end)
++ first_call = TRUE;
++
++ list_for_each_entry(po, exist, list) {
++ if (!first_call && (po->start <= *start))
++ continue;
++
++ if (first) {
++ *start = po->start;
++ *end = po->end;
++ first = FALSE;
++ } else if (po->start < *start) {
++ *start = po->start;
++ *end = po->end;
++ }
++ }
++
++ return (first ? -1 : 0);
++}
++
++/* adds locks in gaps between existing locks from start to end */
++
++static int fill_gaps(struct list_head *exist, struct dlm_resource *r,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end)
++{
++ uint64_t exist_start = 0, exist_end = 0;
++
++ /* cover gaps in front of each existing lock */
++ for (;;) {
++ if (next_exist(exist, &exist_start, &exist_end))
++ break;
++ if (start < exist_start)
++ add_lock(r, owner, wait, ex, start, exist_start-1);
++ start = exist_end + 1;
++ }
++
++ /* cover gap after last existing lock */
++ if (exist_end < end)
++ add_lock(r, owner, wait, ex, exist_end+1, end);
++
++ return 0;
++}
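++
++/*
++ * Worked example (illustrative): filling 0-100 around existing locks at
++ * 10-20 and 40-50 adds new locks covering 0-9, 21-39 and 51-100.
++ */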
++
++/* RE within RN (possibly more than one RE lock, all within RN) */
++
++static int lock_case3(struct list_head *exist, struct dlm_resource *r,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end)
++{
++ struct posix_lock *po, *safe;
++
++ fill_gaps(exist, r, owner, wait, ex, start, end);
++
++ if (!ex)
++ wait = X_WAIT;
++
++ /* update existing locks to new mode and put back in locks list */
++ list_for_each_entry_safe(po, safe, exist, list) {
++ list_move_tail(&po->list, &r->locks);
++ if (po->ex == ex)
++ continue;
++ po->ex = ex;
++ update_lock(po->lp, wait);
++ }
++
++ return 0;
++}
++
++/* RE within RN (possibly more than one RE lock, one RE partially overlaps RN)
++ 1. add new locks with new mode for RN gaps not covered by RE's
++ 2. convert RE locks' mode to new mode
++ other steps deal with the partial-overlap fragment and depend on whether
++ the request is sh->ex or ex->sh */
++
++static int lock_case4(struct posix_lock *opo, struct list_head *exist,
++ struct dlm_resource *r, unsigned long owner, int wait,
++ int ex, uint64_t start, uint64_t end)
++{
++ struct posix_lock *po, *safe;
++ uint64_t over_start = 0, over_end = 0;
++ uint64_t frag_start = 0, frag_end = 0;
++
++ /* fragment (non-overlap) range of opo */
++ if (opo->start < start) {
++ frag_start = opo->start;
++ frag_end = start - 1;
++ } else {
++ frag_start = end + 1;
++ frag_end = opo->end;
++ }
++
++ /* overlap range of opo */
++ if (opo->start < start) {
++ over_start = start;
++ over_end = opo->end;
++ } else {
++ over_start = opo->start;
++ over_end = end;
++ }
++
++ /* cut off the non-overlap portion of opo so fill_gaps will work */
++ opo->start = over_start;
++ opo->end = over_end;
++
++ fill_gaps(exist, r, owner, wait, ex, start, end);
++
++ /* update existing locks to new mode and put back in locks list */
++ list_for_each_entry_safe(po, safe, exist, list) {
++ list_move_tail(&po->list, &r->locks);
++ if (po == opo)
++ continue;
++ if (po->ex == ex)
++ continue;
++ po->ex = ex;
++ update_lock(po->lp, wait);
++ }
++
++ /* deal with the RE that partially overlaps the requested range */
++
++ if (ex == opo->ex)
++ return 0;
++
++ if (ex) {
++ /* 1. add a shared lock in the non-overlap range
++ 2. convert RE to overlap range and requested mode */
++
++ add_lock(r, owner, X_WAIT, SH, frag_start, frag_end);
++
++ opo->start = over_start;
++ opo->end = over_end;
++ opo->ex = ex;
++
++ update_lock(opo->lp, wait);
++ } else {
++ /* 1. request a shared lock in the overlap range
++ 2. convert RE to non-overlap range
++ 3. wait for shared lock to complete */
++
++ add_lock(r, owner, WAIT, SH, over_start, over_end);
++
++ opo->start = frag_start;
++ opo->end = frag_end;
++
++ update_lock(opo->lp, X_WAIT);
++ }
++
++ return 0;
++}
++
++/* go through r->locks to find what needs to be done to extend,
++ shrink, shift, split, etc. existing locks (this often involves adding
++ new locks in addition to modifying existing locks). */
++
++static int plock_internal(struct dlm_resource *r, unsigned long owner,
++ int wait, int ex, uint64_t start, uint64_t end)
++{
++ LIST_HEAD(exist);
++ struct posix_lock *po, *safe, *case4_po = NULL;
++ int error = 0;
++
++ list_for_each_entry_safe(po, safe, &r->locks, list) {
++ if (po->owner != owner)
++ continue;
++ if (!ranges_overlap(po->start, po->end, start, end))
++ continue;
++
++ /* existing range (RE) overlaps new range (RN) */
++
++ switch(overlap_type(start, end, po->start, po->end)) {
++
++ case 0:
++ if (po->ex == ex)
++ goto out;
++
++ /* ranges the same - just update the existing lock */
++ po->ex = ex;
++ update_lock(po->lp, wait);
++ goto out;
++
++ case 1:
++ if (po->ex == ex)
++ goto out;
++
++ error = lock_case1(po, r, owner, wait, ex, start, end);
++ goto out;
++
++ case 2:
++ if (po->ex == ex)
++ goto out;
++
++ error = lock_case2(po, r, owner, wait, ex, start, end);
++ goto out;
++
++ case 3:
++ list_move_tail(&po->list, &exist);
++ break;
++
++ case 4:
++ DLM_ASSERT(!case4_po, );
++ case4_po = po;
++ list_move_tail(&po->list, &exist);
++ break;
++
++ default:
++ error = -1;
++ goto out;
++ }
++ }
++
++ if (case4_po)
++ error = lock_case4(case4_po, &exist, r, owner, wait, ex,
++ start, end);
++ else if (!list_empty(&exist))
++ error = lock_case3(&exist, r, owner, wait, ex, start, end);
++ else
++ add_lock(r, owner, wait, ex, start, end);
++
++ out:
++ return error;
++}
++
++static int punlock_internal(struct dlm_resource *r, unsigned long owner,
++ uint64_t start, uint64_t end)
++{
++ struct posix_lock *po, *safe;
++ int error = 0;
++
++ list_for_each_entry_safe(po, safe, &r->locks, list) {
++ if (po->owner != owner)
++ continue;
++ if (!ranges_overlap(po->start, po->end, start, end))
++ continue;
++
++ /* existing range (RE) overlaps new range (RN) */
++
++ switch(overlap_type(start, end, po->start, po->end)) {
++
++ case 0:
++ /* ranges the same - just remove the existing lock */
++
++ list_del(&po->list);
++ remove_lock(po->lp);
++ goto out;
++
++ case 1:
++ /* RN within RE and starts or ends on RE boundary -
++ * shrink and update RE */
++
++ shrink_range(po, start, end);
++ update_lock(po->lp, X_WAIT);
++ goto out;
++
++ case 2:
++ /* RN within RE - shrink and update RE to be front
++ * fragment, and add a new lock for back fragment */
++
++ add_lock(r, owner, po->ex ? WAIT : X_WAIT, po->ex,
++ end+1, po->end);
++
++ po->end = start - 1;
++ update_lock(po->lp, X_WAIT);
++ goto out;
++
++ case 3:
++ /* RE within RN - remove RE, then continue checking
++ * because RN could cover other locks */
++
++ list_del(&po->list);
++ remove_lock(po->lp);
++ continue;
++
++ case 4:
++ /* front of RE in RN, or end of RE in RN - shrink and
++ * update RE, then continue because RN could cover
++ * other locks */
++
++ shrink_range(po, start, end);
++ update_lock(po->lp, X_WAIT);
++ continue;
++
++ default:
++ error = -1;
++ goto out;
++ }
++ }
++
++ out:
++ return error;
++}
++
++int lm_dlm_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, int wait, int ex, uint64_t start,
++ uint64_t end)
++{
++ dlm_t *dlm = (dlm_t *) lockspace;
++ struct dlm_resource *r;
++ int error;
++
++ log_debug("en plock %u %x,%"PRIx64"", current->pid,
++ name->ln_type, name->ln_number);
++
++ error = get_resource(dlm, name, CREATE, &r);
++ if (error)
++ goto out;
++
++#if 0
++ /* Wait, without holding any locks, until this plock request is not
++ blocked by plocks of *other* *local* processes. Then, none of the
++ dlm requests below will wait on a lock from a local process.
++
++ This should not be necessary since we wait for completion after
++ up(). This means a local process p1 can unlock lkb X while local p2
++ is waiting for X (in wait_async_list). */
++ error = wait_local(r, owner, wait, ex, start, end);
++ if (error)
++ goto out_put;
++#endif
++
++ down(&r->sema);
++ error = lock_resource(r);
++ if (error)
++ goto out_up;
++
++ /* check_conflict() checks for conflicts with plocks from other local
++ processes and other nodes. */
++
++ if (!wait && check_conflict(dlm, r, name, owner, start, end, ex)) {
++ error = -1;
++ unlock_resource(r);
++ goto out_up;
++ }
++
++ /* If NO_WAIT all requests should return immediately.
++ If WAIT all requests go on r->async_locks which we wait on in
++ wait_async_list(). This means DLM should not return -EAGAIN and we
++ should never block waiting for a plock to be released (by a local or
++ remote process) until we call wait_async_list(). */
++
++ error = plock_internal(r, owner, wait, ex, start, end);
++ unlock_resource(r);
++
++ /* wait_async_list() must follow the up() because we must be able
++ to punlock a range on this resource while there's a blocked plock
++ request to prevent deadlock between nodes (and processes). */
++
++ out_up:
++ up(&r->sema);
++ wait_async_list(r, owner);
++ put_resource(r);
++ out:
++ log_debug("ex plock %u error %d", current->pid, error);
++ return error;
++}
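++
++/*
++ * Usage sketch (illustrative; the lockspace and name would come from a
++ * real mount, and the owner token here is hypothetical): take an
++ * exclusive lock on bytes 0-4095, blocking if needed, then drop it.
++ */
++#if 0
++static int example_plock(lm_lockspace_t *ls, struct lm_lockname *name)
++{
++ unsigned long owner = (unsigned long) current;
++ int error;
++
++ error = lm_dlm_plock(ls, name, owner, WAIT, EX, 0, 4095);
++ if (!error)
++ error = lm_dlm_punlock(ls, name, owner, 0, 4095);
++ return error;
++}
++#endif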
++
++int lm_dlm_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, uint64_t start, uint64_t end)
++{
++ dlm_t *dlm = (dlm_t *) lockspace;
++ struct dlm_resource *r;
++ int error;
++
++ log_debug("en punlock %u %x,%"PRIx64"", current->pid,
++ name->ln_type, name->ln_number);
++
++ error = get_resource(dlm, name, NO_CREATE, &r);
++ if (error)
++ goto out;
++
++ down(&r->sema);
++ error = lock_resource(r);
++ if (error)
++ goto out_up;
++
++ error = punlock_internal(r, owner, start, end);
++ unlock_resource(r);
++
++ out_up:
++ up(&r->sema);
++ wait_async_list(r, owner);
++ put_resource(r);
++ out:
++ log_debug("ex punlock %u error %d", current->pid, error);
++ return error;
++}
++
++static void query_ast(void *astargs)
++{
++ dlm_lock_t *lp = (dlm_lock_t *) astargs;
++ complete(&lp->uast_wait);
++}
++
++static int get_conflict_global(dlm_t *dlm, struct lm_lockname *name,
++ unsigned long owner, uint64_t *start,
++ uint64_t *end, int *ex, unsigned long *rowner)
++{
++ dlm_lock_t *lp;
++ struct dlm_queryinfo qinfo;
++ struct dlm_lockinfo *lki;
++ int query = 0, s, error;
++
++ /* acquire a null lock on which to base the query */
++
++ error = create_lp(dlm, name, &lp);
++ if (error)
++ goto ret;
++
++ lp->req = DLM_LOCK_NL;
++ set_bit(LFL_IDLOCK, &lp->flags);
++ do_lock(lp, NULL);
++ wait_for_completion(&lp->uast_wait);
++
++ /* do query, repeating if insufficient space */
++
++ query = DLM_LOCK_THIS | DLM_QUERY_QUEUE_GRANTED |
++ DLM_QUERY_LOCKS_HIGHER;
++
++ for (s = 16; s < dlm->max_nodes + 1; s += 16) {
++
++ lki = kmalloc(s * sizeof(struct dlm_lockinfo), GFP_KERNEL);
++ if (!lki) {
++ error = -ENOMEM;
++ goto out;
++ }
++ memset(lki, 0, s * sizeof(struct dlm_lockinfo));
++ memset(&qinfo, 0, sizeof(qinfo));
++ qinfo.gqi_locksize = s;
++ qinfo.gqi_lockinfo = lki;
++
++ init_completion(&lp->uast_wait);
++ error = dlm_query(dlm->gdlm_lsp, &lp->lksb, query, &qinfo,
++ query_ast, (void *) lp);
++ if (error) {
++ kfree(lki);
++ goto out;
++ }
++ wait_for_completion(&lp->uast_wait);
++ error = lp->lksb.sb_status;
++
++ if (!error)
++ break;
++ kfree(lki);
++ if (error != -E2BIG)
++ goto out;
++ }
++
++ /* check query results for blocking locks */
++
++ for (s = 0; s < qinfo.gqi_lockcount; s++) {
++
++ lki = &qinfo.gqi_lockinfo[s];
++
++ if (!ranges_overlap(*start, *end, lki->lki_grrange.ra_start,
++ lki->lki_grrange.ra_end))
++ continue;
++
++ if (lki->lki_node == dlm->our_nodeid)
++ continue;
++
++ if (lki->lki_grmode == DLM_LOCK_EX || *ex) {
++ *start = lki->lki_grrange.ra_start;
++ *end = lki->lki_grrange.ra_end;
++ *ex = (lki->lki_grmode == DLM_LOCK_EX) ? 1 : 0;
++ *rowner = lki->lki_node;
++ error = -EAGAIN;
++ break;
++ }
++ }
++
++ kfree(qinfo.gqi_lockinfo);
++
++ out:
++ do_unlock(lp);
++ kfree(lp);
++ ret:
++ return error;
++}
++
++static int get_conflict_local(dlm_t *dlm, struct dlm_resource *r,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t *start, uint64_t *end, int *ex,
++ unsigned long *rowner)
++{
++ struct posix_lock *po;
++ int found = FALSE;
++
++ list_for_each_entry(po, &r->locks, list) {
++ if (po->owner == owner)
++ continue;
++ if (!ranges_overlap(po->start, po->end, *start, *end))
++ continue;
++
++ if (*ex || po->ex) {
++ *start = po->start;
++ *end = po->end;
++ *ex = po->ex;
++ *rowner = po->owner;
++ found = TRUE;
++ break;
++ }
++ }
++ return found;
++}
++
++int lm_dlm_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ unsigned long owner, uint64_t *start, uint64_t *end,
++ int *ex, unsigned long *rowner)
++{
++ dlm_t *dlm = (dlm_t *) lockspace;
++ struct dlm_resource *r;
++ int error, found;
++
++ error = get_resource(dlm, name, NO_CREATE, &r);
++ if (!error) {
++ down(&r->sema);
++ found = get_conflict_local(dlm, r, name, owner, start, end, ex,
++ rowner);
++ up(&r->sema);
++ put_resource(r);
++ if (found)
++ goto out;
++ }
++
++ error = get_conflict_global(dlm, name, owner, start, end, ex, rowner);
++ out:
++ return error;
++}
++
++static int check_conflict(dlm_t *dlm, struct dlm_resource *r,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t start, uint64_t end, int ex)
++{
++ uint64_t get_start = start, get_end = end;
++ unsigned long get_owner = 0;
++ int get_ex = ex, error;
++
++ error = get_conflict_local(dlm, r, name, owner,
++ &get_start, &get_end, &get_ex, &get_owner);
++ if (error)
++ goto out;
++
++ error = get_conflict_global(dlm, name, owner,
++ &get_start, &get_end, &get_ex, &get_owner);
++ out:
++ log_debug("check_conflict %d %"PRIx64"-%"PRIx64" %"PRIx64"-%"PRIx64" "
++ "ex %d %d own %lu %lu pid %u", error, start, end,
++ get_start, get_end, ex, get_ex, owner, get_owner,
++ current->pid);
++ return error;
++}
++
+diff -urN linux-orig/fs/gfs_locking/lock_dlm/thread.c linux-patched/fs/gfs_locking/lock_dlm/thread.c
+--- linux-orig/fs/gfs_locking/lock_dlm/thread.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_dlm/thread.c 2004-06-16 12:03:17.967822065 -0500
+@@ -0,0 +1,388 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "lock_dlm.h"
++
++/*
++ * Run in dlm_async thread
++ */
++
++/**
++ * queue_submit - add lock request to queue for dlm_async thread
++ * @lp: DLM lock
++ *
++ * A lock placed on this queue is re-submitted to DLM as soon as
++ * dlm_async thread gets to it.
++ */
++
++static void queue_submit(dlm_lock_t *lp)
++{
++ dlm_t *dlm = lp->dlm;
++
++ spin_lock(&dlm->async_lock);
++ list_add_tail(&lp->slist, &dlm->submit);
++ set_bit(LFL_SLIST, &lp->flags);
++ spin_unlock(&dlm->async_lock);
++ wake_up(&dlm->wait);
++}
++
++/**
++ * process_blocking - processing of blocking callback
++ * @lp: DLM lock
++ *
++ */
++
++static void process_blocking(dlm_lock_t *lp, int bast_mode)
++{
++ dlm_t *dlm = lp->dlm;
++ unsigned int cb;
++
++ switch (make_lmstate(bast_mode)) {
++ case LM_ST_EXCLUSIVE:
++ cb = LM_CB_NEED_E;
++ break;
++ case LM_ST_DEFERRED:
++ cb = LM_CB_NEED_D;
++ break;
++ case LM_ST_SHARED:
++ cb = LM_CB_NEED_S;
++ break;
++ default:
++ DLM_ASSERT(0, printk("unknown bast mode %u\n", bast_mode););
++ }
++
++ dlm->fscb(dlm->fsdata, cb, &lp->lockname);
++}
++
++/**
++ * process_complete - processing of completion callback for a lock request
++ * @lp: DLM lock
++ *
++ */
++
++static void process_complete(dlm_lock_t *lp)
++{
++ dlm_t *dlm = lp->dlm;
++ struct lm_async_cb acb;
++ int16_t prev_mode = lp->cur;
++
++ memset(&acb, 0, sizeof(acb));
++
++ /*
++ * This is an AST for an unlock.
++ */
++
++ if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
++
++ /* FIXME: Add an assertion to catch NOFAIL promotions from
++ * non-NL modes? */
++
++ if (lp->lksb.sb_status == -DLM_ECANCEL) {
++
++ /* lp->cur remains the same, is there anything to clear
++ * or reset to put this lp into an "ordinary" state? */
++
++ printk("lock_dlm: -DLM_ECANCEL num=%x,%"PRIx64"\n",
++ lp->lockname.ln_type, lp->lockname.ln_number);
++ } else {
++ DLM_ASSERT(lp->lksb.sb_status == -DLM_EUNLOCK,
++ printk("num=%x,%"PRIx64" status=%d\n",
++ lp->lockname.ln_type,
++ lp->lockname.ln_number,
++ lp->lksb.sb_status););
++ lp->cur = DLM_LOCK_IV;
++ }
++
++ complete(&lp->uast_wait);
++ return;
++ }
++
++ /*
++ * A canceled lock request. The lock was just taken off the delayed
++ * list and was never even submitted to dlm.
++ */
++
++ if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
++ lp->req = lp->cur;
++ acb.lc_ret |= LM_OUT_CANCELED;
++ goto out;
++ }
++
++ /*
++ * An error occurred.
++ */
++
++ if (lp->lksb.sb_status) {
++ lp->req = lp->cur;
++ if (lp->cur == DLM_LOCK_IV)
++ lp->lksb.sb_lkid = 0;
++
++ if ((lp->lksb.sb_status == -EAGAIN) &&
++ (lp->lkf & DLM_LKF_NOQUEUE)) {
++ /* a "normal" error */
++ } else
++ printk("lock_dlm: process_complete error id=%x "
++ "status=%d\n", lp->lksb.sb_lkid,
++ lp->lksb.sb_status);
++ goto out;
++ }
++
++ /*
++ * This is an AST for an EX->EX conversion for sync_lvb from GFS.
++ */
++
++ if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
++ complete(&lp->uast_wait);
++ return;
++ }
++
++ /*
++ * A lock has been demoted to NL because it initially completed during
++ * BLOCK_LOCKS. Now it must be requested in the originally requested
++ * mode.
++ */
++
++ if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
++
++ DLM_ASSERT(lp->req == DLM_LOCK_NL,);
++ DLM_ASSERT(lp->prev_req > DLM_LOCK_NL,);
++
++ lp->cur = DLM_LOCK_NL;
++ lp->req = lp->prev_req;
++ lp->prev_req = DLM_LOCK_IV;
++ lp->lkf &= ~DLM_LKF_CONVDEADLK;
++ lp->lkf |= DLM_LKF_QUECVT;
++
++ set_bit(LFL_NOCACHE, &lp->flags);
++
++ if (test_bit(DFL_BLOCK_LOCKS, &dlm->flags) &&
++ !test_bit(LFL_NOBLOCK, &lp->flags))
++ queue_delayed(lp, QUEUE_LOCKS_BLOCKED);
++ else
++ queue_submit(lp);
++ return;
++ }
++
++ /*
++ * A request is granted during dlm recovery. It may be granted
++ * because the locks of a failed node were cleared. In that case,
++ * there may be inconsistent data beneath this lock and we must wait
++ * for recovery to complete to use it. When gfs recovery is done this
++ * granted lock will be converted to NL and then reacquired in this
++ * granted state.
++ */
++
++ if (test_bit(DFL_BLOCK_LOCKS, &dlm->flags) &&
++ !test_bit(LFL_NOBLOCK, &lp->flags) &&
++ lp->req != DLM_LOCK_NL) {
++
++ lp->cur = lp->req;
++ lp->prev_req = lp->req;
++ lp->req = DLM_LOCK_NL;
++ lp->lkf |= DLM_LKF_CONVERT;
++ lp->lkf &= ~DLM_LKF_CONVDEADLK;
++ lp->lkf &= ~DLM_LKF_QUECVT;
++
++ set_bit(LFL_REREQUEST, &lp->flags);
++ queue_submit(lp);
++ return;
++ }
++
++ /*
++ * DLM demoted the lock to NL before it was granted so GFS must be
++ * told it cannot cache data for this lock.
++ */
++
++ if (lp->lksb.sb_flags == DLM_SBF_DEMOTED)
++ set_bit(LFL_NOCACHE, &lp->flags);
++
++ out:
++
++ /*
++ * This is an internal lock_dlm lock used for managing JIDs.
++ */
++
++ if (test_bit(LFL_IDLOCK, &lp->flags)) {
++ clear_bit(LFL_NOBLOCK, &lp->flags);
++ lp->cur = lp->req;
++ complete(&lp->uast_wait);
++ return;
++ }
++
++ /*
++ * Normal completion of a lock request. Tell GFS it now has the lock.
++ */
++
++ clear_bit(LFL_NOBLOCK, &lp->flags);
++ lp->cur = lp->req;
++
++ acb.lc_name = lp->lockname;
++ acb.lc_ret |= make_lmstate(lp->cur);
++
++ if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
++ (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
++ acb.lc_ret |= LM_OUT_CACHEABLE;
++
++ dlm->fscb(dlm->fsdata, LM_CB_ASYNC, &acb);
++}
++
++/**
++ * no_work - determine if there's work for the dlm_async thread
++ * @dlm:
++ *
++ * Returns: 1 if no work, 0 otherwise
++ */
++
++static __inline__ int no_work(dlm_t * dlm)
++{
++ int ret;
++
++ spin_lock(&dlm->async_lock);
++
++ ret = list_empty(&dlm->complete) &&
++ list_empty(&dlm->blocking) &&
++ list_empty(&dlm->submit) &&
++ list_empty(&dlm->starts) && !test_bit(DFL_MG_FINISH, &dlm->flags);
++
++ spin_unlock(&dlm->async_lock);
++
++ return ret;
++}
++
++/**
++ * dlm_async - thread for a variety of asynchronous processing
++ * @data:
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int dlm_async(void *data)
++{
++ dlm_t *dlm = (dlm_t *) data;
++ dlm_lock_t *lp = NULL;
++ dlm_start_t *ds = NULL;
++ uint8_t complete, blocking, submit, start, finish;
++ DECLARE_WAITQUEUE(wait, current);
++
++ daemonize("lock_dlm");
++ atomic_inc(&dlm->threads);
++
++ do {
++ current->state = TASK_INTERRUPTIBLE;
++ add_wait_queue(&dlm->wait, &wait);
++ if (no_work(dlm))
++ schedule();
++ remove_wait_queue(&dlm->wait, &wait);
++ current->state = TASK_RUNNING;
++
++ complete = blocking = submit = start = finish = 0;
++
++ spin_lock(&dlm->async_lock);
++
++ if (!list_empty(&dlm->complete)) {
++ lp = list_entry(dlm->complete.next, dlm_lock_t, clist);
++ list_del(&lp->clist);
++ clear_bit(LFL_CLIST, &lp->flags);
++ complete = 1;
++ } else if (!list_empty(&dlm->blocking)) {
++ lp = list_entry(dlm->blocking.next, dlm_lock_t, blist);
++ list_del(&lp->blist);
++ clear_bit(LFL_BLIST, &lp->flags);
++ blocking = lp->bast_mode;
++ lp->bast_mode = 0;
++ } else if (!list_empty(&dlm->submit)) {
++ lp = list_entry(dlm->submit.next, dlm_lock_t, slist);
++ list_del(&lp->slist);
++ clear_bit(LFL_SLIST, &lp->flags);
++ submit = 1;
++ } else if (!list_empty(&dlm->starts)) {
++ ds = list_entry(dlm->starts.next, dlm_start_t, list);
++ list_del(&ds->list);
++ start = 1;
++ } else if (test_and_clear_bit(DFL_MG_FINISH, &dlm->flags)) {
++ finish = 1;
++ }
++
++ spin_unlock(&dlm->async_lock);
++
++ if (complete)
++ process_complete(lp);
++
++ else if (blocking)
++ process_blocking(lp, blocking);
++
++ else if (submit)
++ process_submit(lp);
++
++ else if (start)
++ process_start(dlm, ds);
++
++ else if (finish)
++ process_finish(dlm);
++
++ schedule();
++ }
++ while (!test_bit(DFL_THREAD_STOP, &dlm->flags));
++
++ atomic_dec(&dlm->threads);
++ return 0;
++}
++
++/**
++ * init_async_thread
++ * @dlm:
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int init_async_thread(dlm_t * dlm)
++{
++ int error;
++
++ clear_bit(DFL_THREAD_STOP, &dlm->flags);
++ atomic_set(&dlm->threads, 0);
++
++ error = kernel_thread(dlm_async, dlm, 0);
++ if (error < 0)
++ goto out;
++
++ error = kernel_thread(dlm_async, dlm, 0);
++ if (error < 0) {
++ release_async_thread(dlm);
++ goto out;
++ }
++
++ while (atomic_read(&dlm->threads) != 2)
++ schedule();
++ error = 0;
++
++ out:
++ if (error)
++ printk("lock_dlm: can't start async thread %d\n", error);
++ return error;
++}
++
++/**
++ * release_async_thread
++ * @dlm:
++ *
++ */
++
++void release_async_thread(dlm_t * dlm)
++{
++ set_bit(DFL_THREAD_STOP, &dlm->flags);
++ while (atomic_read(&dlm->threads)) {
++ wake_up(&dlm->wait);
++ schedule();
++ }
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gio_wiretypes.h linux-patched/fs/gfs_locking/lock_gulm/gio_wiretypes.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gio_wiretypes.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gio_wiretypes.h 2004-06-16 12:03:21.956895230 -0500
+@@ -0,0 +1,404 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++#ifndef __gio_wiretypes_h__
++#define __gio_wiretypes_h__
++
++/* An attempt to do something about tracking changes to the protocol
++ * over the wires.
++ * If I were really cute, this would effectively be a checksum of this
++ * file.
++ */
++#define GIO_WIREPROT_VERS (0x67000010)
++
++/***************** Error codes.
++ * Everyone uses these same error codes.
++ */
++#define gio_Err_Ok (0)
++#define gio_Err_BadLogin (1001)
++#define gio_Err_BadCluster (1003)
++#define gio_Err_BadConfig (1004)
++#define gio_Err_BadGeneration (1005)
++#define gio_Err_BadWireProto (1019)
++
++#define gio_Err_NotAllowed (1006)
++#define gio_Err_Unknown_Cs (1007)
++#define gio_Err_BadStateChg (1008)
++#define gio_Err_MemoryIssues (1009)
++
++#define gio_Err_PushQu (1010) /* client should never see this one */
++#define gio_Err_TryFailed (1011)
++#define gio_Err_AlreadyPend (1013)
++#define gio_Err_Canceled (1015)
++
++#define gio_Err_NoSuchFS (1016)
++#define gio_Err_NoSuchJID (1017)
++#define gio_Err_NoSuchName (1018)
++
++/* next free error code: 1002 1012 1014 1020 */
++
++/*
++ * Error: just sort of a generic error code thing.
++ * uint32: gERR
++ * uint32: opcode that this is in reply to. (can be zeros)
++ * uint32: error code
++ */
++#define gulm_err_reply (0x67455252) /* gERR */
++
++#define gulm_nop (0x674e4f50) /* gNOP */
++
++/********************* Core *****************/
++/*
++ * login request
++ * uint32: gCL0
++ * uint32: proto version
++ * string: cluster ID
++ * string: My Name
++ * uint64: generation number
++ * uint32: config CRC
++ * uint32: rank
++ * login reply
++ * uint32: gCL1
++ * uint64: generation number
++ * uint32: error code
++ * uint32: rank
++ * uint8: ama
++ * If I am the Master or Arbitrating, there are no errors, and a
++ * client or slave (not a resource) is connecting, a serialization
++ * of the current nodelist follows.
++ *
++ * logout request:
++ * uint32: gCL2
++ * string: node name
++ * uint8: S/P/A/M/R
++ * logout reply: we don't seem to use this.
++ * uint32: gCL3
++ * uint32: error code
++ *
++ * resource login request:
++ * uint32: gCL4
++ * uint32: proto version
++ * string: cluster ID
++ * string: resource name
++ * uint32: options
++ * login reply (gCL1) is sent in return.
++ *
++ * beat req
++ * uint32: gCB0
++ * string: My Name
++ * beat rpl
++ * uint32: gCB1
++ * uint32: error code
++ *
++ * Membership Request
++ * uint32: gCMA
++ * string: node name
++ *
++ * Membership update
++ * uint32: gCMU
++ * string: node name
++ * IPv6: IP
++ * uint8: Current State
++ *
++ * Membership list request info.
++ * uint32: gCMl
++ *
++ * Membership list info.
++ * uint32: gCML
++ * list_start_marker
++ * string: node name
++ * IPv6: IP
++ * uint8: state
++ * uint8: laststate
++ * uint8: mode (S/P/A/M/C)
++ * uint32: missed beats
++ * uint64: last beat
++ * uint64: delay avg
++ * uint64: max delay
++ * list_stop_marker
++ *
++ * Request Resource info
++ * uint32: gCR0
++ *
++ * Resource list info
++ * uint32: gCR1
++ * list_start_marker
++ * string: name
++ * list_stop_marker
++ *
++ * Force node into Expired:
++ * uint32: gCFE
++ * string: node name
++ *
++ * Core state request:
++ * uint32: gCSR
++ *
++ * Core state changes:
++ * uint32: gCSC
++ * uint8: state (slave, pending, arbitrating, master)
++ * If state == Slave, then the next two will follow.
++ * IPv6: MasterIP
++ * string: MasterName
++ *
++ * Core shutdown req:
++ * uint32: gCSD
++ *
++ * Switch core from current state into Pending:
++ * uint32: gCSP
++ *
++ */
++#define gulm_core_login_req (0x67434c00) /* gCL0 */
++#define gulm_core_login_rpl (0x67434c01) /* gCL1 */
++#define gulm_core_logout_req (0x67434c02) /* gCL2 */
++#define gulm_core_logout_rpl (0x67434c03) /* gCL3 */
++#define gulm_core_reslgn_req (0x67434c04) /* gCL4 */
++#define gulm_core_beat_req (0x67434200) /* gCB0 */
++#define gulm_core_beat_rpl (0x67434201) /* gCB1 */
++#define gulm_core_mbr_req (0x67434d41) /* gCMA */
++#define gulm_core_mbr_updt (0x67434d55) /* gCMU */
++#define gulm_core_mbr_lstreq (0x67434d6c) /* gCMl */
++#define gulm_core_mbr_lstrpl (0x67434d4c) /* gCML */
++#define gulm_core_mbr_force (0x67434645) /* gCFE */
++#define gulm_core_res_req (0x67435200) /* gCR0 */
++#define gulm_core_res_list (0x67435201) /* gCR1 */
++#define gulm_core_state_req (0x67435352) /* gCSR */
++#define gulm_core_state_chgs (0x67435343) /* gCSC */
++#define gulm_core_shutdown (0x67435344) /* gCSD */
++#define gulm_core_forcepend (0x67435350) /* gCSP */
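++
++/* Each opcode packs its mnemonic into a uint32: alphabetic positions
++ * use the ASCII value ('g' == 0x67) while trailing digits use the raw
++ * number (the "0" in gCL0 is 0x00, not '0'). A hypothetical helper for
++ * the all-ASCII codes (not part of this patch):
++ */
++#if 0
++#define GIO_OP(a, b, c, d) \
++ (((uint32_t)(a) << 24) | ((b) << 16) | ((c) << 8) | (d))
++/* gulm_err_reply == GIO_OP('g', 'E', 'R', 'R') */
++#endif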
++
++/* in the st field */
++#define gio_Mbr_Logged_in (0x05)
++#define gio_Mbr_Logged_out (0x06)
++#define gio_Mbr_Expired (0x07)
++#define gio_Mbr_Killed (0x08)
++#define gio_Mbr_OM_lgin (0x09)
++
++/* in the ama field */
++#define gio_Mbr_ama_Slave (0x01)
++#define gio_Mbr_ama_Master (0x02)
++#define gio_Mbr_ama_Pending (0x03)
++#define gio_Mbr_ama_Arbitrating (0x04)
++#define gio_Mbr_ama_Resource (0x05)
++#define gio_Mbr_ama_Client (0x06)
++/* the Client entry is ONLY for mode tracking.
++ * The nodelist reply is the only place it is used.
++ */
++
++/* options that affect behaviors of services (resources) */
++#define gulm_svc_opt_important (0x00000001)
++
++/********************* Info Traffic *****************
++ *
++ * Note that for many of these, they can be sent to all of the servers and
++ * will get sane replies. Some of these can only be sent to specific
++ * servers.
++ *
++ * stats req:
++ * uint32: gIS0
++ * stats rpl:
++ * uint32: gIS1
++ * list start:
++ * string: key
++ * string: value
++ * list stop:
++ * Notes:
++ * The stats reply is a set of string pairs. This way the server can send
++ * whatever things it wants, and the same client code will work for
++ * anything.
++ *
++ * set verbosity:
++ * uint32: gIV0
++ * string: verb flags (with -/+) to [un]set
++ * Note:
++ * We don't bother with a reply for this. If the server got it, it works.
++ * If it didn't, it cannot send an error back anyway.
++ *
++ * close socket:
++ * uint32: gSC0
++ * Note:
++ * Tells the server to close this connection cleanly. We're done with
++ * it. This is *not* the same as logging out. You must log in before
++ * you can log out. And many commands sent from gulm_tool happen without
++ * logging in. These commands would be useful for clients in many cases,
++ * so I don't want to put a close at the end of them, but if I don't,
++ * there will be error messages printed on the console when gulm_tool
++ * calls them.
++ * So we need a way to close a connection cleanly that has not been
++ * logged in.
++ *
++ * request slave list:
++ * uint32: gIL0
++ * slave list reply:
++ * uint32: gIL1
++ * list start:
++ * string: name
++ * uint32: poller idx
++ * list stop:
++ */
++#define gulm_info_stats_req (0x67495300) /* gIS0 */
++#define gulm_info_stats_rpl (0x67495301) /* gIS1 */
++#define gulm_info_set_verbosity (0x67495600) /* gIV0 */
++#define gulm_socket_close (0x67534300) /* gSC0 */
++#define gulm_info_slave_list_req (0x67494c00) /* gIL0 */
++#define gulm_info_slave_list_rpl (0x67494c01) /* gIL1 */
++
++/********************* Lock Traffic *****************
++ * All lock traffic.
++ *
++ * login req:
++ * uint32: gLL0
++ * uint32: proto version
++ * string: node name
++ * uint8: Client/Slave
++ * login rpl:
++ * uint32: gLL1
++ * uint32: error code
++ * uint8: Slave/Master
++ * xdr of current lock state if no errors and master sending reply
++ * and you're a slave.
++ *
++ * logout req:
++ * uint32: gLL2
++ * logout rpl:
++ * uint32: gLL3
++ *
++ * select lockspace:
++ * uint32: gLS0
++ * raw: usually just four bytes for lockspace name.
++ * but can be most anything.
++ *
++ * lock req:
++ * uint32: gLR0
++ * raw: key
++ * uint8: state
++ * uint32: flags
++ * raw: lvb -- Only exists if hasLVB flag is true.
++ * lock rpl:
++ * uint32: gLR1
++ * raw: key
++ * uint8: state
++ * uint32: flags
++ * uint32: error code
++ * raw: lvb -- Only exists if hasLVB flag is true.
++ *
++ * lock state update:
++ * uint32: gLRU
++ * string: node name
++ * raw: key
++ * uint8: state
++ * uint32: flags
++ * raw: lvb -- Only exists if hasLVB flag is true.
++ *
++ * Action req:
++ * uint32: gLA0
++ * raw: key
++ * uint8: action
++ * raw: lvb -- Only exists if action is SyncLVB
++ * Action Rpl:
++ * uint32: gLA1
++ * raw: key
++ * uint8: action
++ * uint32: error code
++ *
++ * Action update:
++ * uint32: gLAU
++ * string: node name
++ * raw: key
++ * uint8: action
++ * raw: lvb -- Only exists if action is SyncLVB
++ *
++ * Slave Update Rply: -- for both actions and requests.
++ * uint32: gLUR
++ * raw: key
++ *
++ * Drop lock Callback:
++ * uint32: gLC0
++ * raw: key
++ * uint8: state
++ *
++ * Drop all locks callback: This is the highwater locks thing
++ * uint32: gLC2
++ *
++ * Drop expired locks:
++ * uint32: gLEO
++ * string: node name; if NULL, then drop all exp for mask.
++ * raw: keymask; if keymask & key == key, then dropexp on this lock.
++ *
++ * Lock list req:
++ * uint32: gLD0
++ * Lock list rpl:
++ * uint32: gLD1
++ * list start mark
++ * uint8: key length
++ * raw: key
++ * uint8: state
++ * uint8: lvb length
++ * if lvb length > 0, raw: LVB
++ * uint32: Holder count
++ * list start mark
++ * string: holders
++ * list stop mark
++ * uint32: LVB holder count
++ * list start mark
++ * string: LVB Holders
++ * list stop mark
++ * uint32: Expired holder count
++ * list start mark
++ * string: ExpHolders
++ * list stop mark
++ * list stop mark
++ *
++ */
++#define gulm_lock_login_req (0x674C4C00) /* gLL0 */
++#define gulm_lock_login_rpl (0x674C4C01) /* gLL1 */
++#define gulm_lock_logout_req (0x674C4C02) /* gLL2 */
++#define gulm_lock_logout_rpl (0x674C4C03) /* gLL3 */
++#define gulm_lock_sel_lckspc (0x674C5300) /* gLS0 */
++#define gulm_lock_state_req (0x674C5200) /* gLR0 */
++#define gulm_lock_state_rpl (0x674C5201) /* gLR1 */
++#define gulm_lock_state_updt (0x674C5255) /* gLRU */
++#define gulm_lock_action_req (0x674C4100) /* gLA0 */
++#define gulm_lock_action_rpl (0x674C4101) /* gLA1 */
++#define gulm_lock_action_updt (0x674C4155) /* gLAU */
++#define gulm_lock_update_rpl (0x674c5552) /* gLUR */
++#define gulm_lock_cb_state (0x674C4300) /* gLC0 */
++#define gulm_lock_cb_dropall (0x674C4302) /* gLC2 */
++#define gulm_lock_drop_exp (0x674C454F) /* gLEO */
++#define gulm_lock_dump_req (0x674c4400) /* gLD0 */
++#define gulm_lock_dump_rpl (0x674c4401) /* gLD1 */
++#define gulm_lock_rerunqueues (0x674c5152) /* gLQR */
++
++/* marks for the login */
++#define gio_lck_st_Slave (0x00)
++#define gio_lck_st_Client (0x01)
++
++/* state change requests */
++#define gio_lck_st_Unlock (0x00)
++#define gio_lck_st_Exclusive (0x01)
++#define gio_lck_st_Deferred (0x02)
++#define gio_lck_st_Shared (0x03)
++/* actions */
++#define gio_lck_st_Cancel (0x09)
++#define gio_lck_st_HoldLVB (0x0b)
++#define gio_lck_st_UnHoldLVB (0x0c)
++#define gio_lck_st_SyncLVB (0x0d)
++
++/* flags */
++#define gio_lck_fg_Do_CB (0x00000001)
++#define gio_lck_fg_Try (0x00000002)
++#define gio_lck_fg_Any (0x00000004)
++#define gio_lck_fg_NoExp (0x00000008)
++#define gio_lck_fg_hasLVB (0x00000010)
++#define gio_lck_fg_Cachable (0x00000020)
++#define gio_lck_fg_Piority (0x00000040)
++
++#endif /*__gio_wiretypes_h__*/
++/* vim: set ai cin et sw=3 ts=3 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm.h linux-patched/fs/gfs_locking/lock_gulm/gulm.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm.h 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,288 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef GULM_DOT_H
++#define GULM_DOT_H
++
++#define GULM_RELEASE_NAME "v6.0.0"
++
++#ifdef MODVERSIONS
++#include <linux/modversions.h>
++#endif /* MODVERSIONS */
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <asm/uaccess.h>
++#include <linux/spinlock.h>
++#include <asm/atomic.h>
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/smp_lock.h>
++#include <linux/ctype.h>
++#include <linux/string.h>
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#if (BITS_PER_LONG == 64)
++#define PRIu64 "lu"
++#define PRId64 "ld"
++#define PRIo64 "lo"
++#define PRIx64 "lx"
++#define PRIX64 "lX"
++#define SCNu64 "lu"
++#define SCNd64 "ld"
++#define SCNo64 "lo"
++#define SCNx64 "lx"
++#define SCNX64 "lX"
++#else
++#define PRIu64 "Lu"
++#define PRId64 "Ld"
++#define PRIo64 "Lo"
++#define PRIx64 "Lx"
++#define PRIX64 "LX"
++#define SCNu64 "Lu"
++#define SCNd64 "Ld"
++#define SCNo64 "Lo"
++#define SCNx64 "Lx"
++#define SCNX64 "LX"
++#endif
++
++#include <linux/list.h>
++
++#undef MAX
++#define MAX(a,b) (((a)>(b))?(a):(b))
++
++#undef MIN
++#define MIN(a,b) (((a)<(b))?(a):(b))
++
++/* Extern Macro */
++
++#ifndef EXTERN
++#define EXTERN extern
++#define INIT(X)
++#else
++#undef EXTERN
++#define EXTERN
++#define INIT(X) =X
++#endif
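++/* That is, exactly one .c file does:
++ * #define EXTERN
++ * #include "gulm.h"
++ * making it the translation unit that actually defines the globals
++ * (INIT(X) expands to "=X" there); everywhere else EXTERN expands to
++ * "extern" and INIT(X) to nothing, giving plain declarations.
++ */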
++
++/* Static Macro */
++#ifndef DEBUG_SYMBOLS
++#define STATIC static
++#else
++#define STATIC
++#endif
++
++/* Divide x by y. Round up if there is a remainder. */
++#define DIV_RU(x, y) (((x) + (y) - 1) / (y))
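++/* e.g. DIV_RU(10, 3) == 4 */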
++
++#include <linux/lm_interface.h>
++
++#include "gulm_prints.h"
++
++#include "libgulm.h"
++
++#include "handler.h"
++
++/* Some fixed length constants.
++ * Some of these should be made dynamic in size in the future.
++ */
++#define GIO_KEY_SIZE (46)
++#define GIO_LVB_SIZE (32)
++#define GIO_NAME_SIZE (32)
++#define GIO_NAME_LEN (GIO_NAME_SIZE-1)
++
++/* What we know about this filesystem */
++struct gulm_fs_s {
++ struct list_head fs_list;
++ char fs_name[GIO_NAME_SIZE]; /* lock table name */
++
++ lm_callback_t cb; /* file system callback function */
++ lm_fsdata_t *fsdata; /* private file system data */
++
++ callback_qu_t cq;
++
++ uint32_t fsJID;
++ uint32_t lvb_size;
++
++ struct semaphore get_lock; /* I am not 100% sure this is needed.
++ * But if it is useless, it only
++ * hurts performance, not
++ * correctness. Sometime post52,
++ * need to investigate.
++ */
++
++ /* Stuff for the first mounter lock and state */
++ int firstmounting;
++ /* the recovery done func needs to behave slightly differently when
++ * we are the first node in an fs.
++ */
++
++ void *mountlock; /* this lock holds the Firstmounter state of the FS */
++ /* This is because all lock traffic is async, and at this point we
++ * really want sync behavior, so I'm left with doing something to
++ * achieve that.
++ *
++ * This works, but it is crufty; still, I don't want to build a huge
++ * queuing system for one lock that we touch twice at the beginning and
++ * once at the end.
++ *
++ * I should change the firstmounter lock to work like the journal locks
++ * and the node locks do. Things are a lot cleaner now with the libgulm
++ * interface than before (when the firstmounter lock code was written).
++ */
++ struct completion sleep;
++
++ /* Stuff for JID mapping locks */
++ uint32_t JIDcount; /* how many JID locks are there. */
++};
++typedef struct gulm_fs_s gulm_fs_t;
++
++/* What we know about each locktable.
++ * Only one nowadays (the LTPX).
++ * */
++typedef struct lock_table_s {
++ uint32_t magic_one;
++
++ int running;
++ struct task_struct *recver_task;
++ struct completion startup;
++ struct semaphore sender;
++
++ struct task_struct *sender_task;
++ wait_queue_head_t send_wchan;
++ spinlock_t queue_sender;
++ struct list_head to_be_sent;
++
++ int hashbuckets;
++ spinlock_t *hshlk;
++ struct list_head *lkhsh;
++
++ /* stats
++ * it may be wise to make some of these into atomic numbers.
++ * or something. or not.
++ * */
++ uint32_t locks_total;
++ uint32_t locks_unl;
++ uint32_t locks_exl;
++ uint32_t locks_shd;
++ uint32_t locks_dfr;
++ uint32_t locks_lvbs;
++ atomic_t locks_pending;
++ /* cannot count expired locks here; clients don't know about them */
++
++ uint32_t lops; /* just incr on each op */
++
++} lock_table_t;
++
++typedef struct gulm_cm_s {
++ uint8_t myName[64];
++ uint8_t clusterID[256]; /* doesn't need to be 256. */
++ uint8_t loaded; /* True|False whether we grabbed the config data */
++ uint8_t starts;
++
++ uint32_t handler_threads; /* how many to have */
++ uint32_t verbosity;
++
++ uint64_t GenerationID;
++
++ lock_table_t ltpx;
++
++ gulm_interface_p hookup;
++
++} gulm_cm_t;
++
++/* things about each lock. */
++typedef struct gulm_lock_s {
++ struct list_head gl_list;
++ atomic_t count;
++
++ uint32_t magic_one;
++ gulm_fs_t *fs; /* which filesystem we belong to. */
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++ uint8_t last_suc_state; /* last state we successfully got. */
++ char *lvb;
++
++ /* this is true when there is a lock request sent out for this lock.
++ * All it really means is that if we've lost the master, and reconnect
++ * to another, this lock needs to have its request resent.
++ *
++ * This now has two stages, since a lock could be pending but still in
++ * the send queue, and we don't want to resend requests that haven't
++ * been sent yet.
++ *
++ * We don't handle master losses here any more; LTPX does that for
++ * us. Should consider removing the duplicated code then.
++ */
++ int actuallypending; /* may need to be atomic */
++ int in_to_be_sent;
++
++ enum { glck_nothing, glck_action, glck_state } req_type;
++ /* these three for the lock req. We save them here so we can rebuild
++ * the lock request if there was a server failover. (?still needed?)
++ */
++ unsigned int cur_state;
++ unsigned int req_state;
++ unsigned int flags;
++
++ /* these three for actions. First is the action, next is result, last is
++ * what threads wait on for the reply.
++ */
++ int action;
++ int result; /* ok, both are using this. */
++ struct completion actsleep;
++
++} gulm_lock_t;
++
++/*****************************************************************************/
++/* cross-pollinate prototypes */
++
++/* from gulm_lt.c */
++void lt_logout (void);
++int lt_login (void);
++int get_mount_lock (gulm_fs_t * fs, int *first);
++int downgrade_mount_lock (gulm_fs_t * fs);
++int drop_mount_lock (gulm_fs_t * fs);
++int send_drop_all_exp (lock_table_t * lt);
++int send_drop_exp (gulm_fs_t * fs, lock_table_t * lt, char *name);
++
++/*from gulm_core.c */
++void cm_logout (void);
++int cm_login (void);
++void delete_ipnames (struct list_head *namelist);
++
++/* from gulm_fs.c */
++void init_gulm_fs (void);
++void request_journal_replay (uint8_t * name);
++void passup_droplocks (void);
++gulm_fs_t *get_fs_by_name (uint8_t * name);
++void dump_internal_lists (void);
++void gulm_recovery_done (lm_lockspace_t * lockspace,
++ unsigned int jid, unsigned int message);
++void gulm_unmount (lm_lockspace_t * lockspace);
++void gulm_others_may_mount (lm_lockspace_t * lockspace);
++int gulm_mount (char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t * fsdata,
++ unsigned int min_lvb_size, struct lm_lockstruct *lockstruct);
++
++extern struct lm_lockops gulm_ops;
++
++#endif /* GULM_DOT_H */
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_core.c linux-patched/fs/gfs_locking/lock_gulm/gulm_core.c
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_core.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_core.c 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,255 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include "util.h"
++#include "utils_tostr.h"
++
++extern gulm_cm_t gulm_cm;
++
++/* private vars. */
++int cm_thd_running;
++struct completion cm_thd_startup;
++struct task_struct *cm_thd_task;
++
++/**
++ */
++int
++gulm_core_login_reply (void *misc, uint64_t gen, uint32_t error,
++ uint32_t rank, uint8_t corestate)
++{
++ if (error != 0) {
++ log_err ("Core returned error %d:%s.\n", error,
++ gio_Err_to_str (error));
++ cm_thd_running = FALSE;
++ return error;
++ }
++
++ if( gulm_cm.GenerationID != 0 ) {
++ GULM_ASSERT(gulm_cm.GenerationID == gen,
++ printk("us: %"PRIu64" them: %"PRIu64"\n",
++ gulm_cm.GenerationID,gen);
++ );
++ }
++ gulm_cm.GenerationID = gen;
++
++ error = lt_login ();
++ if (error != 0) {
++ log_err ("lt_login failed. %d\n", error);
++ lg_core_logout (gulm_cm.hookup); /* XXX is this safe? */
++ return error;
++ }
++
++ log_msg (lgm_Network2, "Logged into local core.\n");
++
++ return 0;
++}
++
++/**
++ * gulm_core_logout_reply -
++ * @misc:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_core_logout_reply (void *misc)
++{
++ log_msg (lgm_Network2, "Logged out of local core.\n");
++ return 0;
++}
++
++/**
++ */
++int
++gulm_core_nodechange (void *misc, char *nodename,
++ struct in6_addr *nodeip, uint8_t nodestate)
++{
++ if (nodestate == lg_core_Fenced) {
++ request_journal_replay (nodename);
++ }
++ /* if it is us and the state is logout, we need to close things out
++ * if we can.
++ */
++ if (gulm_cm.starts && nodestate == lg_core_Logged_out &&
++ strcmp(gulm_cm.myName, nodename) == 0 ) {
++ lt_logout();
++ cm_thd_running = FALSE;
++ lg_core_logout (gulm_cm.hookup);
++ return -1;
++ }
++ return 0;
++}
++
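++/* If the caller passed an int* through @misc, it is set TRUE while the
++ * local core is still neither Slave nor Master (i.e. the caller should
++ * keep waiting), and FALSE once it is one of the two.
++ */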
++int gulm_core_statechange (void *misc, uint8_t corestate,
++ struct in6_addr *masterip, char *mastername)
++{
++ int *cst = (int *)misc;
++ if( misc != NULL ) {
++ if( corestate != lg_core_Slave &&
++ corestate != lg_core_Master ) {
++ *cst = TRUE;
++ }else{
++ *cst = FALSE;
++ }
++ }
++ return 0;
++}
++
++/**
++ */
++int
++gulm_core_error (void *misc, uint32_t err)
++{
++ log_err ("Got error code %d %#x back fome some reason!\n", err, err);
++ return 0;
++}
++
++static lg_core_callbacks_t core_cb = {
++ login_reply:gulm_core_login_reply,
++ logout_reply:gulm_core_logout_reply,
++ nodechange:gulm_core_nodechange,
++ statechange:gulm_core_statechange,
++ error:gulm_core_error
++};
++
++/**
++ * cm_io_recving_thread -
++ * @data:
++ *
++ *
++ * Returns: int
++ */
++int
++cm_io_recving_thread (void *data)
++{
++ int err;
++
++ daemonize ("gulm_res_recvd");
++ cm_thd_task = current;
++ complete (&cm_thd_startup);
++
++ while (cm_thd_running) {
++ err = lg_core_handle_messages (gulm_cm.hookup, &core_cb, NULL);
++ if (err != 0) {
++ log_err
++ ("Got an error in gulm_res_recvd err: %d\n", err);
++ if (!cm_thd_running)
++ break;
++ /*
++ * Pause a bit, then try to log back into the local
++ * lock_gulmd. Keep doing this until an outside force
++ * stops us. (I don't think there is any such force at
++ * this point; forceunmount would be one, if we ever do
++ * that.)
++ *
++ * If we are still in the gulm_mount() function, we
++ * should not retry. We should just exit.
++ */
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout (3 * HZ);
++
++ while ((err =
++ lg_core_login (gulm_cm.hookup, TRUE)) != 0) {
++ log_err
++ ("Got a %d trying to login to lock_gulmd. Is it running?\n",
++ err);
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout (3 * HZ);
++ }
++ }
++ } /* while (cm_thd_running) */
++
++ complete (&cm_thd_startup);
++ return 0;
++}
++
++/**
++ * cm_logout -
++ */
++void
++cm_logout (void)
++{
++
++ if (cm_thd_running) {
++ cm_thd_running = FALSE;
++ lg_core_logout (gulm_cm.hookup);
++
++ /* wait for thread to finish */
++ wait_for_completion (&cm_thd_startup);
++ }
++
++}
++
++/**
++ * cm_login -
++ *
++ * Returns: int
++ */
++int
++cm_login (void)
++{
++ int err = -1;
++ int cst=TRUE;
++
++ cm_thd_running = FALSE;
++ init_completion (&cm_thd_startup);
++
++ err = lg_core_login (gulm_cm.hookup, TRUE);
++ if (err != 0) {
++ log_err
++ ("Got a %d trying to login to lock_gulmd. Is it running?\n",
++ err);
++ goto exit;
++ }
++ /* handle login reply. which will start the lt thread. */
++ err = lg_core_handle_messages (gulm_cm.hookup, &core_cb, NULL);
++ if (err != 0) {
++ goto exit;
++ }
++
++ /* do not pass go until Slave(client) or Master */
++ while(cst) {
++ lg_core_corestate(gulm_cm.hookup);
++ err = lg_core_handle_messages (gulm_cm.hookup, &core_cb, &cst);
++ if (err != 0) {
++ goto exit;
++ }
++ if(cst) {
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout (3 * HZ);
++ /* if interrupted, exit */
++ }
++ }
++
++ /* start recver thread. */
++ cm_thd_running = TRUE;
++ err = kernel_thread (cm_io_recving_thread, NULL, 0);
++ if (err < 0) {
++ log_err ("Failed to start gulm_res_recvd. (%d)\n", err);
++ goto exit;
++ }
++ wait_for_completion (&cm_thd_startup);
++
++ err = 0;
++ exit:
++ return err;
++}
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_fs.c linux-patched/fs/gfs_locking/lock_gulm/gulm_fs.c
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_fs.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_fs.c 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,613 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include "util.h"
++#include "load_info.h"
++#include "handler.h"
++#include "gulm_procinfo.h"
++#include "gulm_jid.h"
++
++/* things about myself */
++extern gulm_cm_t gulm_cm;
++
++/* globals for this file.*/
++uint32_t filesystems_count = 0;
++LIST_HEAD (filesystems_list);
++struct semaphore filesystem_lck; /* we use a sema instead of a spinlock
++ * here because of all the interruptible
++ * things we do while holding it.
++ * If I stop doing nasty things within
++ * it, it doesn't need to be a sema.
++ */
++struct semaphore start_stop_lock;
++atomic_t start_stop_cnt;
++
++/**
++ * init_gulm_fs -
++ */
++void
++init_gulm_fs (void)
++{
++ init_MUTEX (&filesystem_lck);
++ init_MUTEX (&start_stop_lock);
++ atomic_set (&start_stop_cnt, 0);
++}
++
++/*****************************************************************************/
++struct rjrpf_s {
++ gulm_fs_t *fs;
++ uint8_t *name;
++};
++
++void
++request_journal_replay_per_fs (void *d)
++{
++ struct rjrpf_s *rf = (struct rjrpf_s *) d;
++ uint32_t jid;
++ unsigned int ujid;
++
++ /* lookup jid <=> name mapping */
++ if (find_jid_by_name_and_mark_replay (rf->fs, rf->name, &jid) != 0) {
++ log_msg (lgm_JIDMap,
++ "In fs (%s), no jid for name (%s) was found.\n",
++ rf->fs->fs_name, rf->name);
++ } else {
++ log_msg (lgm_JIDMap,
++ "In fs (%s), jid %d was found for name (%s).\n",
++ rf->fs->fs_name, jid, rf->name);
++
++ /* All that the replay-journal callback into gfs does is malloc
++ * some memory and add it to a list, so we really don't need to
++ * queue that action ourselves.
++ *
++ * This will need to change if gfs changes.
++ *
++ * Basically, we assume that the callback is non-blocking.
++ */
++ ujid = jid;
++ rf->fs->cb (rf->fs->fsdata, LM_CB_NEED_RECOVERY, &ujid);
++ }
++
++ kfree (rf->name);
++ kfree (rf);
++
++}
++
++/**
++ * request_journal_replay - give a journal replay request to mounted filesystems
++ * @name: < the name of the node that died.
++ *
++ *
++ * Returns: void
++ */
++void
++request_journal_replay (uint8_t * name)
++{
++ struct list_head *tmp;
++ gulm_fs_t *fs;
++ struct rjrpf_s *rf;
++
++ log_msg (lgm_Always, "Checking for journals for node \"%s\"\n",
++ name);
++
++ down (&filesystem_lck);
++
++ list_for_each (tmp, &filesystems_list) {
++ fs = list_entry (tmp, gulm_fs_t, fs_list);
++
++ /* we don't want to process replay requests when we are
++ * still in the first mounter state. All the journals are
++ * getting replayed anyway, and there could be some issue
++ * with stuff happening twice.
++ */
++ if (fs->firstmounting)
++ continue;
++
++ /* due to the way the new jid mapping code works, we had to
++ * move it out of here.
++ */
++
++ rf = kmalloc (sizeof (struct rjrpf_s), GFP_KERNEL);
++ GULM_ASSERT (rf != NULL,);
++
++ rf->fs = fs;
++ rf->name = kmalloc (strlen (name) + 1, GFP_KERNEL);
++ GULM_ASSERT (rf->name != NULL,);
++ memcpy (rf->name, name, strlen (name) + 1);
++
++ qu_function_call (&fs->cq, request_journal_replay_per_fs, rf);
++
++ }
++ up (&filesystem_lck);
++}
++
++/**
++ * passup_droplocks -
++ */
++void
++passup_droplocks (void)
++{
++ struct list_head *tmp;
++ gulm_fs_t *fs;
++ down (&filesystem_lck);
++ list_for_each (tmp, &filesystems_list) {
++ fs = list_entry (tmp, gulm_fs_t, fs_list);
++ qu_drop_req (&fs->cq, fs->cb, fs->fsdata, LM_CB_DROPLOCKS, 0,
++ 0);
++ /* If this decides to block someday, we need to change this function.
++ */
++ }
++ up (&filesystem_lck);
++}
++
++/**
++ * dump_internal_lists -
++ *
++ */
++void
++dump_internal_lists (void)
++{
++ struct list_head *tmp;
++ gulm_fs_t *fs;
++ down (&filesystem_lck);
++ list_for_each (tmp, &filesystems_list) {
++ fs = list_entry (tmp, gulm_fs_t, fs_list);
++ log_msg (lgm_Always, "Handler queue for %s\n", fs->fs_name);
++ display_handler_queue (&fs->cq);
++ /* other lists? */
++ }
++ up (&filesystem_lck);
++}
++
++/**
++ * get_fs_by_name -
++ * @name:
++ *
++ *
++ * Returns: gulm_fs_t
++ */
++gulm_fs_t *
++get_fs_by_name (uint8_t * name)
++{
++ struct list_head *tmp;
++ gulm_fs_t *fs = NULL;
++ down (&filesystem_lck);
++ list_for_each (tmp, &filesystems_list) {
++ fs = list_entry (tmp, gulm_fs_t, fs_list);
++ if (strcmp (name, fs->fs_name) == 0) {
++ up (&filesystem_lck);
++ return fs;
++ }
++ }
++ up (&filesystem_lck);
++ return NULL;
++}
++
++/*****************************************************************************/
++
++/**
++ * clear_locks -
++ *
++ * quick check to see if there was leaking
++ * should I panic on these? or just complain?
++ *
++ * Returns: void
++ */
++void
++clear_locks (void)
++{
++ int i;
++ lock_table_t *lt = &gulm_cm.ltpx;
++
++ for (i = 0; i < lt->hashbuckets; i++) {
++ struct list_head *lcktmp, *lckfoo;
++ spin_lock (<->hshlk[i]);
++ list_for_each_safe (lcktmp, lckfoo, <->lkhsh[i]) {
++ gulm_lock_t *lck = NULL;
++ lck = list_entry (lcktmp, gulm_lock_t, gl_list);
++ /* need to release it. umm, should any even exist? */
++ log_err ("AH! Rogue lock buffer! refcount:%d\n",
++ atomic_read (&lck->count));
++
++ if (lck->lvb) {
++ log_err ("AH! Rogue lock buffer with LVB!\n");
++ kfree (lck->lvb);
++ }
++
++ list_del (lcktmp);
++ kfree (lck);
++
++ }
++ spin_unlock (<->hshlk[i]);
++ }
++ kfree (lt->hshlk);
++ lt->hshlk = NULL;
++ kfree (lt->lkhsh);
++ lt->lkhsh = NULL;
++}
++
++/*****************************************************************************/
++/**
++ * start_gulm_threads -
++ * @host_data:
++ *
++ *
++ * Returns: int
++ */
++int
++start_gulm_threads (char *csnm, char *host_data)
++{
++ int error = 0;
++
++ down (&start_stop_lock);
++ atomic_inc (&start_stop_cnt);
++ if (atomic_read (&start_stop_cnt) == 1) {
++ /* first one. get stuff going */
++ strncpy (gulm_cm.clusterID, csnm, 255);
++ gulm_cm.clusterID[255] = '\0';
++
++ error = lg_initialize (&gulm_cm.hookup, gulm_cm.clusterID,
++ "GFS Kernel Interface");
++ if (error != 0) {
++ log_err ("lg_initialize failed, %d\n", error);
++ goto fail;
++ }
++ gulm_cm.starts = TRUE;
++
++ error = load_info (host_data);
++ if (error != 0) {
++ log_err ("load_info failed. %d\n", error);
++ goto fail;
++ }
++
++ jid_init ();
++
++ error = cm_login ();
++ if (error != 0) {
++ log_err ("cm_login failed. %d\n", error);
++ goto fail;
++ }
++
++ /* lt_login() is called after the success packet for cm_login()
++ * returns.
++ */
++ }
++ fail:
++ up (&start_stop_lock);
++ return error;
++}
++
++/**
++ * stop_gulm_threads -
++ */
++void
++stop_gulm_threads (void)
++{
++ down (&start_stop_lock);
++ atomic_dec (&start_stop_cnt);
++ if (atomic_read (&start_stop_cnt) == 0) {
++ /* last one, put it all away. */
++ lt_logout ();
++ cm_logout ();
++ clear_locks ();
++ lg_release (gulm_cm.hookup);
++ gulm_cm.hookup = NULL;
++ gulm_cm.loaded = FALSE;
++ gulm_cm.GenerationID = 0;
++ }
++ up (&start_stop_lock);
++}
++
++/*****************************************************************************/
++
++/**
++ * gulm_mount
++ * @table_name: clusterID:FS_Name (e.g. "mycluster:gfs1")
++ * @host_data:
++ * @cb: GFS callback function
++ * @fsdata: opaque GFS handle
++ * @lockstruct: the structure of crap to fill in
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++int
++gulm_mount (char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t * fsdata,
++ unsigned int min_lvb_size, struct lm_lockstruct *lockstruct)
++{
++ gulm_fs_t *gulm;
++ char work[256], *tbln;
++ int first;
++ int error = -1;
++ struct list_head *lltmp;
++
++ strncpy (work, table_name, sizeof (work) - 1);
++ work[sizeof (work) - 1] = '\0'; /* strncpy does not guarantee termination */
++
++ tbln = strstr (work, ":");
++ if (tbln == NULL) {
++ log_err
++ ("Malformed table name. Couldn't find separator ':' between "
++ "clusterID and lockspace name.\n");
++ error = -1;
++ goto fail;
++ }
++ *tbln++ = '\0';
++
++ /* make sure that the cluster name exists. */
++ if (strlen (work) <= 0) {
++ log_err ("Cluster name \"%s\" is too short.\n", work);
++ error = -EPROTO;
++ goto fail;
++ }
++ if (strlen (work) > 16) {
++ log_err ("Cluster name \"%s\" is too long.\n", work);
++ error = -EPROTO;
++ goto fail;
++ }
++
++ /* the second one is an artifact of the way I use the name.
++ * A better fix to this will happen when I actually get dynamic key
++ * lengths working.
++ */
++ if (strlen (tbln) > MIN (GIO_NAME_LEN, (GIO_KEY_SIZE - 13))) {
++ log_err
++ ("Warning! lockspace name (%s) is longer than %d chars!\n",
++ tbln, MIN (GIO_NAME_LEN, (GIO_KEY_SIZE - 13)));
++ error = -EPROTO;
++ goto fail;
++ }
++ if (strlen (tbln) <= 0) {
++ log_err ("Table name \"%s\" is too short.\n", tbln);
++ error = -EPROTO;
++ goto fail;
++ }
++
++ /* Check to make sure this lock table isn't already being used */
++ down (&filesystem_lck);
++ list_for_each (lltmp, &filesystems_list) {
++ gulm = list_entry (lltmp, gulm_fs_t, fs_list);
++ if (!strncmp (gulm->fs_name, tbln, GIO_NAME_LEN)) {
++ log_err ("\"%s\" is already in use\n", tbln);
++ error = -EEXIST;
++ up (&filesystem_lck);
++ goto fail;
++ }
++ }
++ up (&filesystem_lck);
++
++ /* Set up our main structure */
++
++ gulm = kmalloc (sizeof (gulm_fs_t), GFP_KERNEL);
++ if (!gulm) {
++ log_err ("out of memory\n");
++ error = -ENOMEM;
++ goto fail;
++ }
++ memset (gulm, 0, sizeof (gulm_fs_t));
++
++ INIT_LIST_HEAD (&gulm->fs_list);
++
++ strncpy (gulm->fs_name, tbln, GIO_NAME_LEN);
++ gulm->cb = cb;
++ gulm->fsdata = fsdata;
++ gulm->lvb_size = min_lvb_size;
++ init_completion (&gulm->sleep);
++ init_MUTEX (&gulm->get_lock);
++
++ if ((error = start_gulm_threads (work, host_data)) != 0) {
++ log_err ("Got a %d trying to start the threads.\n", error);
++ goto fail_free_gulm;
++ }
++
++ if ((error =
++ start_callback_qu (&gulm->cq, gulm_cm.handler_threads)) < 0) {
++ log_err ("fsid=%s: Failed to start the callback handler.\n",
++ gulm->fs_name);
++ goto fail_free_gulm;
++ }
++
++ /* the mount lock HAS to be the first thing done in the LTs for this fs. */
++ error = get_mount_lock (gulm, &first);
++ if (error != 0) {
++ log_err
++ ("fsid=%s: Error %d while trying to get the mount lock\n",
++ gulm->fs_name, error);
++ goto fail_callback;
++ }
++
++ jid_lockstate_reserve (gulm, first);
++ jid_fs_init (gulm);
++ get_journalID (gulm);
++
++ /* things act a bit different until the first mounter is finished.
++ */
++ if (first)
++ gulm->firstmounting = TRUE;
++
++ /* Success */
++ down (&filesystem_lck);
++ list_add (&gulm->fs_list, &filesystems_list);
++ filesystems_count++;
++ up (&filesystem_lck);
++
++ log_msg (lgm_JIDMap, "fsid=%s: We will be using jid %d\n",
++ gulm->fs_name, gulm->fsJID);
++
++ if (add_to_proc (gulm) != 0) {
++ /* ignored for now */
++ }
++
++ lockstruct->ls_jid = gulm->fsJID;
++ lockstruct->ls_first = first;
++ lockstruct->ls_lvb_size = gulm->lvb_size;
++ lockstruct->ls_lockspace = gulm;
++ lockstruct->ls_ops = &gulm_ops;
++#ifdef USE_SYNC_LOCKING
++ lockstruct->ls_flags = 0;
++
++ log_msg (lgm_Network2, "Done: %s, sync mode\n", table_name);
++#else
++ lockstruct->ls_flags = LM_LSFLAG_ASYNC;
++
++ log_msg (lgm_Network2, "Done: %s, async mode\n", table_name);
++#endif
++
++ gulm_cm.starts = FALSE;
++ return 0;
++
++ fail_callback:
++ stop_callback_qu (&gulm->cq);
++
++ fail_free_gulm:
++ kfree (gulm);
++ stop_gulm_threads ();
++
++ fail:
++
++ gulm_cm.starts = FALSE;
++ log_msg (lgm_Always, "fsid=%s: Exiting gulm_mount with errors %d\n",
++ table_name, error);
++ return error;
++}
++
++/**
++ * gulm_others_may_mount
++ * @lockspace: handle to specific lock space
++ *
++ * GFS calls this function if it was the first mounter after it's done
++ * checking all the journals.
++ *
++ */
++void
++gulm_others_may_mount (lm_lockspace_t * lockspace)
++{
++ gulm_fs_t *fs = (gulm_fs_t *) lockspace;
++ int err = 0;
++ lock_table_t *lt = &gulm_cm.ltpx;
++
++ /* first send the drop all exp message.
++ * */
++ err = send_drop_exp (fs, lt, NULL);
++ if (err < 0)
++ log_err
++ ("fsid=%s: Problems sending DropExp request to LTPX: %d\n",
++ fs->fs_name, err);
++
++ /* then move the FirstMountLock to shared so others can mount. */
++ err = downgrade_mount_lock (fs);
++
++ if (err < 0) {
++ log_err ("fsid=%s: error sending Fs_FinMount_Req.(%d)\n",
++ fs->fs_name, err);
++ }
++
++ /* first mounter is all done. let the gulm_recovery_done function
++ * behave as normal now.
++ */
++ fs->firstmounting = FALSE;
++}
++
++/**
++ * gulm_umount
++ * @lockspace: handle to specific lock space
++ *
++ */
++void
++gulm_unmount (lm_lockspace_t * lockspace)
++{
++ gulm_fs_t *gulm_fs = (gulm_fs_t *) lockspace;
++
++ down (&filesystem_lck);
++ list_del (&gulm_fs->fs_list);
++ --filesystems_count;
++ up (&filesystem_lck);
++
++ /* close and release stuff */
++ drop_mount_lock (gulm_fs);
++ put_journalID (gulm_fs);
++ jid_fs_release (gulm_fs);
++ jid_lockstate_release (gulm_fs);
++
++ stop_callback_qu (&gulm_fs->cq);
++
++ remove_from_proc (gulm_fs);
++
++ kfree (gulm_fs);
++
++ stop_gulm_threads ();
++
++}
++
++/**
++ * gulm_recovery_done -
++ * @lockspace:
++ * @jid:
++ *
++ * Returns: void
++ */
++void
++gulm_recovery_done (lm_lockspace_t * lockspace, unsigned int jid,
++ unsigned int message)
++{
++ gulm_fs_t *fs = (gulm_fs_t *) lockspace;
++ int err;
++ uint8_t name[256];
++
++ if (message != LM_RD_SUCCESS) {
++ /* Need to start thinking about how I want to use this... */
++ return;
++ }
++
++ if (jid == fs->fsJID) { /* this may be drifting crud through. */
++ /* hey! it's me! */
++ strncpy (name, gulm_cm.myName, 256);
++ } else if (lookup_name_by_jid (fs, jid, name) != 0) {
++ log_msg (lgm_JIDMap,
++ "fsid=%s: Could not find a client for jid %d\n",
++ fs->fs_name, jid);
++ return;
++ }
++ if (strlen (name) == 0) {
++ log_msg (lgm_JIDMap, "fsid=%s: No one mapped to jid %d\n",
++ fs->fs_name, jid);
++ return;
++ }
++ log_msg (lgm_JIDMap, "fsid=%s: Found %s for jid %d\n",
++ fs->fs_name, name, jid);
++
++ err = send_drop_exp (fs, &gulm_cm.ltpx, name);
++
++ if (jid != fs->fsJID) {
++ /* rather dumb to do this to ourselves right after we mount... */
++ log_msg (lgm_JIDMap,
++ "fsid=%s: Clearing JID %d for use by others\n",
++ fs->fs_name, jid);
++ release_JID (fs, jid, FALSE);
++ }
++
++ /* If someone died while replaying someone else's journal, there will be
++ * stale expired jids.
++ */
++ check_for_stale_expires (fs);
++
++}
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_jid.c linux-patched/fs/gfs_locking/lock_gulm/gulm_jid.c
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_jid.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_jid.c 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,806 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include "util.h"
++
++extern gulm_cm_t gulm_cm;
++
++/****************************************************************************/
++
++/* jid locks:
++ *
++ * Header lock: "JHeader" + \0\0\0 + fsname
++ * lvb: <uint32> :number of JIDs
++ * Mappinglock: "JM" + <uint32> + \0\0\0\0 + fsname
++ * lvb: [012] + <node name>
++ * 0: unused
++ * 1: replaying journal
++ * 2: Mounted
++ * list lock : "JL" + "listlock" + fsname
++ * Node Locks : "JN" + <nodename[8]> + fsname
++ *
++ */
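++/* A worked example (derived from jid_get_lock_name() below): for
++ * fsname "gfs1" and jid 3, the mapping lock key is the bytes
++ * 'J' 'M' 0x03 0x00 0x00 0x00 0x00 0x00 0x00 0x00 'g' 'f' 's' '1' '\0'
++ * -- the jid sits little-endian in bytes 2-5, bytes 6-9 are zero
++ * padding, and the NUL-terminated fsname starts at byte 10. Its lvb
++ * would hold 0x02 "nodename\0" while that node has the journal mounted.
++ */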
++#define jid_header_lvb_size (8)
++
++struct jid_lookup_item_s {
++ struct list_head jp_list;
++ uint8_t *key;
++ uint16_t keylen;
++ uint8_t *lvb;
++ uint16_t lvblen;
++ struct completion waitforit;
++};
++typedef struct jid_lookup_item_s jid_lookup_item_t;
++
++LIST_HEAD (jid_pending_locks);
++spinlock_t jid_pending;
++struct semaphore jid_listlock;
++
++/**
++ * jid_init -
++ */
++void
++jid_init (void)
++{
++ spin_lock_init (&jid_pending);
++ init_MUTEX (&jid_listlock);
++}
++
++/**
++ * jid_get_header_name -
++ * @fs: <
++ * @key: <>
++ * @keylen: <>
++ *
++ * key is buffer to write to, keylen is size of buffer on input, and real
++ * length on output.
++ *
++ * Returns: int
++ */
++int
++jid_get_header_name (uint8_t * fsname, uint8_t * key, uint16_t * keylen)
++{
++ int len;
++ len = strlen (fsname);
++ if ((len + 11) > *keylen)
++ return -EINVAL;
++ memcpy (key, "JHeader\0\0\0", 10);
++ memcpy (&key[10], fsname, len + 1);
++ *keylen = len + 11;
++ return 0;
++}
++
++int
++jid_get_listlock_name (uint8_t * fsname, uint8_t * key, uint16_t * keylen)
++{
++ int len;
++ len = strlen (fsname);
++ if ((len + 11) > *keylen)
++ return -EINVAL;
++ memcpy (key, "JLlistlock", 10);
++ memcpy (&key[10], fsname, len + 1);
++ *keylen = len + 11;
++ return 0;
++}
++
++/**
++ * jid_get_lock_name -
++ * @fs: <
++ * @jid: <
++ * @key: <>
++ * @keylen: <>
++ *
++ * key is buffer to write to, keylen is size of buffer on input, and real
++ * length on output.
++ *
++ * Returns: int
++ */
++int
++jid_get_lock_name (uint8_t * fsname, uint32_t jid, uint8_t * key,
++ uint16_t * keylen)
++{
++ int len;
++ len = strlen (fsname);
++ if ((len + 11) > *keylen)
++ return -EINVAL;
++ key[0] = 'J';
++ key[1] = 'M';
++ key[5] = (jid >> 24) & 0xff;
++ key[4] = (jid >> 16) & 0xff;
++ key[3] = (jid >> 8) & 0xff;
++ key[2] = (jid >> 0) & 0xff;
++ key[6] = 0;
++ key[7] = 0;
++ key[8] = 0;
++ key[9] = 0;
++ memcpy (&key[10], fsname, len + 1);
++ *keylen = len + 11;
++ return 0;
++}
++
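++/* The helpers below (jid_hold_lvb, jid_unhold_lvb, jid_sync_lvb, and
++ * jid_get_lock_state_inr) all follow the same pattern to make the async
++ * libgulm calls synchronous: queue a completion on jid_pending_locks
++ * keyed by the lock name, fire off the request, and sleep until
++ * jid_action_reply()/jid_state_reply() matches the key and completes us.
++ */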
++/**
++ * jid_hold_lvb -
++ * @key:
++ * @keylen:
++ *
++ *
++ */
++void
++jid_hold_lvb (uint8_t * key, uint16_t keylen)
++{
++ jid_lookup_item_t jp;
++ GULM_ASSERT (keylen > 6,);
++ jp.key = key;
++ jp.keylen = keylen;
++ jp.lvb = NULL;
++ jp.lvblen = 0;
++ INIT_LIST_HEAD (&jp.jp_list);
++ init_completion (&jp.waitforit);
++
++ spin_lock (&jid_pending);
++ list_add (&jp.jp_list, &jid_pending_locks);
++ spin_unlock (&jid_pending);
++
++ lg_lock_action_req (gulm_cm.hookup, key, keylen, lg_lock_act_HoldLVB,
++ NULL, 0);
++
++ wait_for_completion (&jp.waitforit);
++}
++
++void
++jid_unhold_lvb (uint8_t * key, uint16_t keylen)
++{
++ jid_lookup_item_t jp;
++ GULM_ASSERT (keylen > 6,);
++ jp.key = key;
++ jp.keylen = keylen;
++ jp.lvb = NULL;
++ jp.lvblen = 0;
++ INIT_LIST_HEAD (&jp.jp_list);
++ init_completion (&jp.waitforit);
++
++ spin_lock (&jid_pending);
++ list_add (&jp.jp_list, &jid_pending_locks);
++ spin_unlock (&jid_pending);
++
++ lg_lock_action_req (gulm_cm.hookup, key, keylen, lg_lock_act_UnHoldLVB,
++ NULL, 0);
++
++ wait_for_completion (&jp.waitforit);
++}
++
++void
++jid_sync_lvb (uint8_t * key, uint16_t keylen, uint8_t * lvb, uint16_t lvblen)
++{
++ jid_lookup_item_t jp;
++ GULM_ASSERT (keylen > 6,);
++ jp.key = key;
++ jp.keylen = keylen;
++ jp.lvb = NULL;
++ jp.lvblen = 0;
++ INIT_LIST_HEAD (&jp.jp_list);
++ init_completion (&jp.waitforit);
++
++ spin_lock (&jid_pending);
++ list_add (&jp.jp_list, &jid_pending_locks);
++ spin_unlock (&jid_pending);
++
++ lg_lock_action_req (gulm_cm.hookup, key, keylen, lg_lock_act_SyncLVB,
++ lvb, lvblen);
++
++ wait_for_completion (&jp.waitforit);
++}
++
++/**
++ * jid_action_reply -
++ * @key:
++ * @keylen:
++ *
++ * called from the lock handler callback.
++ *
++ * Returns: void
++ */
++void
++jid_action_reply (uint8_t * key, uint16_t keylen)
++{
++ struct list_head *tmp, *nxt;
++ jid_lookup_item_t *jp, *fnd = NULL;
++ spin_lock (&jid_pending);
++ list_for_each_safe (tmp, nxt, &jid_pending_locks) {
++ jp = list_entry (tmp, jid_lookup_item_t, jp_list);
++ if (memcmp (key, jp->key, MIN (keylen, jp->keylen)) == 0) {
++ fnd = jp;
++ list_del (tmp);
++ break;
++ }
++ }
++ spin_unlock (&jid_pending);
++
++ if (fnd != NULL)
++ complete (&fnd->waitforit);
++}
++
++/**
++ * jid_get_lock_state_inr -
++ * @key:
++ * @keylen:
++ * @state:
++ * @flags:
++ * @lvb:
++ * @lvblen:
++ *
++ *
++ */
++void
++jid_get_lock_state_inr (uint8_t * key, uint16_t keylen, uint8_t state,
++ uint32_t flags, uint8_t * lvb, uint16_t lvblen)
++{
++ jid_lookup_item_t jp;
++ GULM_ASSERT (keylen > 6,);
++ jp.key = key;
++ jp.keylen = keylen;
++ jp.lvb = lvb;
++ jp.lvblen = lvblen;
++ INIT_LIST_HEAD (&jp.jp_list);
++ init_completion (&jp.waitforit);
++
++ spin_lock (&jid_pending);
++ list_add (&jp.jp_list, &jid_pending_locks);
++ spin_unlock (&jid_pending);
++
++ lg_lock_state_req (gulm_cm.hookup, key, keylen, state, flags, lvb, lvblen);
++
++ wait_for_completion (&jp.waitforit);
++}
++
++/**
++ * jid_get_lock_state_lvb -
++ * @key:
++ * @keylen:
++ * @state:
++ * @lvb:
++ * @lvblen:
++ *
++ *
++ */
++void
++jid_get_lock_state_lvb (uint8_t * key, uint16_t keylen, uint8_t state,
++ uint8_t * lvb, uint16_t lvblen)
++{
++ jid_get_lock_state_inr (key, keylen, state, 0, lvb, lvblen);
++}
++/**
++ * jid_get_lock_state -
++ * @key:
++ * @keylen:
++ * @state:
++ *
++ *
++ */
++void
++jid_get_lock_state (uint8_t * key, uint16_t keylen, uint8_t state)
++{
++ jid_get_lock_state_inr (key, keylen, state, 0, NULL, 0);
++}
++
++/**
++ * jid_state_reply -
++ * @key:
++ * @keylen:
++ * @lvb:
++ * @lvblen:
++ *
++ *
++ */
++void
++jid_state_reply (uint8_t * key, uint16_t keylen, uint8_t * lvb, uint16_t lvblen)
++{
++ struct list_head *tmp, *nxt;
++ jid_lookup_item_t *jp, *fnd = NULL;
++ spin_lock (&jid_pending);
++ list_for_each_safe (tmp, nxt, &jid_pending_locks) {
++ jp = list_entry (tmp, jid_lookup_item_t, jp_list);
++ if (memcmp (key, jp->key, MIN (keylen, jp->keylen)) == 0) {
++ fnd = jp;
++ list_del (tmp);
++ break;
++ }
++ }
++ spin_unlock (&jid_pending);
++
++ if (fnd != NULL) {
++ if (lvb != NULL && fnd->lvb != NULL)
++ memcpy (fnd->lvb, lvb, MIN (fnd->lvblen, lvblen));
++ complete (&fnd->waitforit);
++ }
++}
++
++/****************************************************************************/
++
++/**
++ * jid_hold_list_lock -
++ * @fs:
++ *
++ * only make one call to this per node.
++ *
++ * Returns: void
++ */
++void
++jid_hold_list_lock (gulm_fs_t * fs)
++{
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++
++ down (&jid_listlock);
++
++ keylen = sizeof (key);
++ jid_get_listlock_name (fs->fs_name, key, &keylen);
++ jid_get_lock_state (key, keylen, lg_lock_state_Exclusive);
++
++}
++
++/**
++ * jid_release_list_lock -
++ * @fs:
++ *
++ *
++ * Returns: void
++ */
++void
++jid_release_list_lock (gulm_fs_t * fs)
++{
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++
++ keylen = sizeof (key);
++ jid_get_listlock_name (fs->fs_name, key, &keylen);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ up (&jid_listlock);
++}
++
++/**
++ * jid_rehold_lvbs -
++ * @fs:
++ *
++ *
++ */
++void
++jid_rehold_lvbs (gulm_fs_t * fs)
++{
++ int i;
++ uint32_t oldjcnt;
++ uint8_t key[GIO_KEY_SIZE], lvb[jid_header_lvb_size];
++ uint16_t keylen = GIO_KEY_SIZE;
++
++ oldjcnt = fs->JIDcount;
++
++ jid_get_header_name (fs->fs_name, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Shared, lvb,
++ jid_header_lvb_size);
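++ /* the JID count is stored little-endian in the header lvb */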
++ fs->JIDcount = (uint32_t) (lvb[0]) << 0;
++ fs->JIDcount |= (uint32_t) (lvb[1]) << 8;
++ fs->JIDcount |= (uint32_t) (lvb[2]) << 16;
++ fs->JIDcount |= (uint32_t) (lvb[3]) << 24;
++
++ for (i = oldjcnt; i < fs->JIDcount; i++) {
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, i, key, &keylen);
++ jid_hold_lvb (key, keylen);
++ }
++
++}
++
++void
++jid_grow_space (gulm_fs_t * fs)
++{
++ uint8_t key[GIO_KEY_SIZE], lvb[jid_header_lvb_size];
++ uint16_t keylen = GIO_KEY_SIZE;
++ uint32_t jidc;
++
++ keylen = sizeof (key);
++ jid_get_header_name (fs->fs_name, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Exclusive, lvb,
++ jid_header_lvb_size);
++ jidc = (uint32_t) (lvb[0]) << 0;
++ jidc |= (uint32_t) (lvb[1]) << 8;
++ jidc |= (uint32_t) (lvb[2]) << 16;
++ jidc |= (uint32_t) (lvb[3]) << 24;
++ jidc += 10; /* grow the jid space ten slots at a time */
++ lvb[3] = (jidc >> 24) & 0xff;
++ lvb[2] = (jidc >> 16) & 0xff;
++ lvb[1] = (jidc >> 8) & 0xff;
++ lvb[0] = (jidc >> 0) & 0xff;
++ jid_sync_lvb (key, keylen, lvb, jid_header_lvb_size);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++ /* do an unlock here, so that when rehold grabs it shared, there is no
++ * lvb writing.
++ */
++
++ jid_rehold_lvbs (fs);
++}
++
++/**
++ * lookup_name_by_jid -
++ * @fs:
++ * @jid:
++ * @name:
++ *
++ *
++ * Returns: int
++ */
++int
++lookup_name_by_jid (gulm_fs_t * fs, uint32_t jid, uint8_t * name)
++{
++ uint8_t key[GIO_KEY_SIZE], lvb[64];
++ uint16_t keylen = sizeof (key);
++ int err = 0;
++
++ if (jid >= fs->JIDcount) {
++ err = -1;
++ goto exit;
++ }
++
++ jid_hold_list_lock (fs);
++
++ jid_get_lock_name (fs->fs_name, jid, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Shared, lvb, 64);
++
++ if (lvb[0] != 0) {
++ memcpy (name, &lvb[1], strlen (&lvb[1]) + 1);
++ } else {
++ err = -1;
++ }
++
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ jid_release_list_lock (fs);
++
++ exit:
++ return err;
++}
++
++/**
++ * Release_JID -
++ * @fs:
++ * @jid:
++ *
++ * actually may only need to set the first byte to zero
++ *
++ * Returns: int
++ */
++int
++release_JID (gulm_fs_t * fs, uint32_t jid, int nop)
++{
++ uint8_t key[GIO_KEY_SIZE], lvb[64];
++ uint16_t keylen = sizeof (key);
++
++ /* there is no such, so this becomes a nop. */
++ if (jid >= fs->JIDcount)
++ goto exit;
++
++ jid_hold_list_lock (fs);
++
++ jid_get_lock_name (fs->fs_name, jid, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Exclusive, lvb, 64);
++ lvb[0] = 0;
++ jid_sync_lvb (key, keylen, lvb, strlen (&lvb[1]) + 2);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ jid_release_list_lock (fs);
++
++ exit:
++ return 0;
++}
++
++void
++put_journalID (gulm_fs_t * fs)
++{
++ release_JID (fs, fs->fsJID, TRUE);
++}
++
++/**
++ * get_journalID -
++ * @fs:
++ * @jid:
++ *
++ * This is broken.
++ *
++ * Returns: int
++ */
++void
++get_journalID (gulm_fs_t * fs)
++{
++ uint32_t i = 0;
++ uint8_t key[GIO_KEY_SIZE], lvb[64];
++ uint16_t keylen;
++ int first_clear = -1;
++
++ retry:
++ jid_hold_list_lock (fs);
++
++ /* find an empty space, or ourselves again */
++ for (i = 0; i < fs->JIDcount; i++) {
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, i, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Exclusive,
++ lvb, 64);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++ if (first_clear == -1 && lvb[0] == 0 ) {
++ first_clear = i;
++ } else if (strcmp (gulm_cm.myName, &lvb[1]) == 0) {
++ first_clear = i;
++ break;
++ }
++ }
++ if (first_clear >= 0) {
++ /* take the jid we have found */
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, first_clear, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Exclusive,
++ lvb, 64);
++ lvb[0] = 2;
++ memcpy (&lvb[1], gulm_cm.myName, strlen (gulm_cm.myName) + 1);
++ jid_sync_lvb (key, keylen, lvb, strlen (gulm_cm.myName) + 2);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ fs->fsJID = first_clear;
++ }
++
++ /* unlock the header lock */
++ jid_release_list_lock (fs);
++
++ if (first_clear < 0) {
++ /* nothing found, grow and try again. */
++ jid_grow_space (fs);
++ goto retry;
++ }
++
++}
++
++/**
++ * find_jid_by_name_and_mark_replay -
++ * @fs:
++ * @name:
++ * @jid:
++ *
++ *
++ * Returns: int
++ */
++int
++find_jid_by_name_and_mark_replay (gulm_fs_t * fs, uint8_t * name,
++ uint32_t * jid)
++{
++ uint32_t i, found = -1;
++ uint8_t key[GIO_KEY_SIZE], lvb[64];
++ uint16_t keylen;
++
++ /* grab list lock */
++ jid_hold_list_lock (fs);
++
++ for (i = 0; i < fs->JIDcount; i++) {
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, i, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Exclusive,
++ lvb, 64);
++ if (strcmp (name, &lvb[1]) == 0) {
++ *jid = i;
++ found = 0;
++ lvb[0] = 1;
++ jid_sync_lvb (key, keylen, lvb, strlen (&lvb[1]) + 2);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++ break;
++ }
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ }
++ /* unlock the list lock */
++ jid_release_list_lock (fs);
++
++ return found;
++}
++
++/**
++ * Check_for_replays -
++ * @fs:
++ *
++ *
++ * Returns: int
++ */
++void
++check_for_stale_expires (gulm_fs_t * fs)
++{
++ uint32_t i;
++ uint8_t key[GIO_KEY_SIZE], lvb[64];
++ uint16_t keylen;
++ unsigned int ujid;
++
++ /* grab list lock */
++ jid_hold_list_lock (fs);
++
++ for (i = 0; i < fs->JIDcount; i++) {
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, i, key, &keylen);
++ jid_get_lock_state_lvb (key, keylen, lg_lock_state_Shared, lvb,
++ 64);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++ if (lvb[0] == 1) {
++ log_msg (lgm_JIDMap,
++ "fsid=%s: stale JID %d found\n",
++ fs->fs_name, i);
++ ujid = i;
++ fs->cb (fs->fsdata, LM_CB_NEED_RECOVERY, &ujid);
++ }
++ }
++
++ /* unlock the list lock */
++ jid_release_list_lock (fs);
++}
++
++/**
++ * jid_fs_init -
++ * @fs:
++ *
++ */
++void
++jid_fs_init (gulm_fs_t * fs)
++{
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen = GIO_KEY_SIZE;
++
++ fs->JIDcount = 0;
++
++ jid_get_header_name (fs->fs_name, key, &keylen);
++ jid_hold_lvb (key, keylen);
++ jid_rehold_lvbs (fs);
++}
++
++/**
++ * jid_fs_release -
++ * @fs:
++ *
++ */
++void
++jid_fs_release (gulm_fs_t * fs)
++{
++ uint32_t i;
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++ for (i = 0; i < fs->JIDcount; i++) {
++ keylen = sizeof (key);
++ jid_get_lock_name (fs->fs_name, i, key, &keylen);
++ jid_unhold_lvb (key, keylen);
++ }
++ keylen = sizeof (key);
++ jid_get_header_name (fs->fs_name, key, &keylen);
++ jid_unhold_lvb (key, keylen);
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++}
++
++/**
++ * jid_unlock_callback -
++ * @d:
++ *
++ * *MUST* be called from a Handler thread.
++ *
++ * Returns: int
++ */
++void
++jid_unlock_callback (void *d)
++{
++ gulm_fs_t *fs = (gulm_fs_t *) d;
++ jid_rehold_lvbs (fs);
++}
++
++/**
++ * jid_header_lock_drop -
++ * @key:
++ * @keylen:
++ *
++ * Returns: void
++ */
++void
++jid_header_lock_drop (uint8_t * key, uint16_t keylen)
++{
++ gulm_fs_t *fs;
++ /* make sure this is the header lock.... */
++ if (key[1] == 'H' && (fs = get_fs_by_name (&key[10])) != NULL) {
++ qu_function_call (&fs->cq, jid_unlock_callback, fs);
++ }
++}
++
++/****************************************************************************/
++/**
++ * jid_get_lsresv_name -
++ * @fsname:
++ * @key:
++ * @keylen:
++ *
++ *
++ * Returns: int
++ */
++int
++jid_get_lsresv_name (char *fsname, uint8_t * key, uint16_t * keylen)
++{
++ int len;
++
++ key[0] = 'J';
++ key[1] = 'N';
++ len = strlen (gulm_cm.myName) + 1;
++ memset (&key[2], 0, 8);
++ memcpy ((&key[2]), gulm_cm.myName, MIN (len, 8));
++ /* fsname starts at byte 10 so the dropexp pattern will find it. */
++ memcpy ((&key[10]), fsname, strlen (fsname) + 1);
++
++ *keylen = 10 + strlen (fsname) + 1;
++
++ return 0;
++}
++
++/**
++ * jid_lockstate_reserve -
++ * @fs:
++ *
++ *
++ * Returns: void
++ */
++void
++jid_lockstate_reserve (gulm_fs_t * fs, int first)
++{
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++
++ jid_get_lsresv_name (fs->fs_name, key, &keylen);
++
++ /* if we are expired, this will block until someone else has cleaned our
++ * last mess up.
++ *
++ * We may very well need to put in some kind of timeout; otherwise
++ * this could lock up forever, much like the FirstMounter lock did.
++ */
++ jid_get_lock_state_inr (key, keylen, lg_lock_state_Exclusive,
++ first?lg_lock_flag_IgnoreExp:0, NULL, 0);
++
++}
++
++/**
++ * jid_lockstate_release -
++ * @fs:
++ *
++ *
++ * Returns: void
++ */
++void
++jid_lockstate_release (gulm_fs_t * fs)
++{
++ uint8_t key[GIO_KEY_SIZE];
++ uint16_t keylen;
++
++ jid_get_lsresv_name (fs->fs_name, key, &keylen);
++
++ jid_get_lock_state (key, keylen, lg_lock_state_Unlock);
++
++}
++
++
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_jid.h linux-patched/fs/gfs_locking/lock_gulm/gulm_jid.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_jid.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_jid.h 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,41 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __GULM_JID_H__
++#define __GULM_JID_H__
++#include "gulm.h"
++void jid_init (void);
++void jid_fs_init (gulm_fs_t * fs);
++void jid_fs_release (gulm_fs_t * fs);
++void get_journalID (gulm_fs_t * fs);
++int lookup_jid_by_name (gulm_fs_t * fs, uint8_t * name, uint32_t * injid);
++int lookup_name_by_jid (gulm_fs_t * fs, uint32_t jid, uint8_t * name);
++int release_JID (gulm_fs_t * fs, uint32_t jid, int owner);
++void put_journalID (gulm_fs_t * fs);
++void check_for_stale_expires (gulm_fs_t * fs);
++
++int find_jid_by_name_and_mark_replay (gulm_fs_t * fs, uint8_t * name,
++ uint32_t * jid);
++
++void jid_start_journal_reply (gulm_fs_t * fs, uint32_t jid);
++void jid_finish_journal_reply (gulm_fs_t * fs, uint32_t jid);
++
++void jid_lockstate_reserve (gulm_fs_t * fs, int first);
++void jid_lockstate_release (gulm_fs_t * fs);
++
++/* to be called from the lg_lock callbacks. */
++void jid_state_reply (uint8_t * key, uint16_t keylen, uint8_t * lvb,
++ uint16_t lvblen);
++void jid_action_reply (uint8_t * key, uint16_t keylen);
++void jid_header_lock_drop (uint8_t * key, uint16_t keylen);
++#endif /*__GULM_JID_H__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_log_msg_bits.h linux-patched/fs/gfs_locking/lock_gulm/gulm_log_msg_bits.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_log_msg_bits.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_log_msg_bits.h 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,40 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __gulm_log_msg_bits_h__
++#define __gulm_log_msg_bits_h__
++/* log_msg bit flags
++ * These got their own file so I can easily include them in both user and
++ * kernel space.
++ * */
++#define lgm_Always (0x00000000) /*Print Message no matter what */
++#define lgm_Network (0x00000001)
++#define lgm_Network2 (0x00000002)
++#define lgm_Stomith (0x00000004)
++#define lgm_Heartbeat (0x00000008)
++#define lgm_locking (0x00000010)
++#define lgm_FuncDebug (0x00000020)
++#define lgm_Forking (0x00000040)
++#define lgm_JIDMap (0x00000080)
++#define lgm_Subscribers (0x00000100)
++#define lgm_LockUpdates (0x00000200)
++#define lgm_LoginLoops (0x00000400)
++#define lgm_Network3 (0x00000800)
++#define lgm_JIDUpdates (0x00001000)
++#define lgm_ServerState (0x00002000)
++
++#define lgm_ReallyAll (0xffffffff)
++
++#define lgm_BitFieldSize (32)
++
++#endif /*__gulm_log_msg_bits_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_lt.c linux-patched/fs/gfs_locking/lock_gulm/gulm_lt.c
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_lt.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_lt.c 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,1937 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include "util.h"
++#include "handler.h"
++#include "utils_tostr.h"
++#include "gulm_jid.h"
++
++extern gulm_cm_t gulm_cm;
++
++/****************************************************************************/
++/* A bunch of prints that hopefully contain more information that is
++ * actually useful.
++ *
++ * These are a mess.
++ */
++
++/**
++ * lck_key_to_hex -
++ * @key:
++ * @len:
++ * @workspace: <> place to put string. !! better be 2x len !!
++ *
++ *
++ * Returns: char
++ */
++static char *
++lck_key_to_hex (uint8_t * key, uint16_t len, char *workspace)
++{
++ int i;
++ for (i = 0; i < len; i++)
++ sprintf (&workspace[i * 2], "%02x", (key[i] & 0xff));
++ return workspace;
++}
++
++static void __inline__
++db_lck_entered (gulm_lock_t * lck)
++{
++ char bb[GIO_KEY_SIZE * 2 + 3];
++ lck_key_to_hex (lck->key, lck->keylen, bb);
++ printk ("Started lock 0x%s cur:%#x req:%#x flags:%#x\n", bb,
++ lck->cur_state, lck->req_state, lck->flags);
++}
++static void __inline__
++db_lck_exited (gulm_lock_t * lck)
++{
++ char bb[GIO_KEY_SIZE * 2 + 3];
++ lck_key_to_hex (lck->key, lck->keylen, bb);
++ printk ("Finished lock 0x%s result:%#x\n", bb, lck->result);
++}
++
++static void __inline__
++dump_gulm_lock_t (gulm_lock_t * lck)
++{
++ char bb[GIO_KEY_SIZE * 2 + 3];
++
++ lck_key_to_hex (lck->key, lck->keylen, bb);
++ log_msg (lgm_Always, " key = 0x%s\n", bb);
++ log_msg (lgm_Always, " req_type = %#x\n", lck->req_type);
++ log_msg (lgm_Always, " last_suc_state = %#x\n", lck->last_suc_state);
++ log_msg (lgm_Always, " actuallypending = %d\n", lck->actuallypending);
++ log_msg (lgm_Always, " in_to_be_sent = %d\n", lck->in_to_be_sent);
++ log_msg (lgm_Always, " cur_state = %d\n", lck->cur_state);
++ log_msg (lgm_Always, " req_state = %d\n", lck->req_state);
++ log_msg (lgm_Always, " flags = %#x\n", lck->flags);
++ log_msg (lgm_Always, " action = %d\n", lck->action);
++ log_msg (lgm_Always, " result = %d\n", lck->result);
++}
++
++/* DEBUG_BY_LOCK is gone. I may later add something back if needed.
++ *
++ * I love the idea of being able to log only certain locks, I just cannot
++ * think of an easy way to do it. The best I can come up with is some
++ * pattern (or set of patterns) used to decide which locks get logged. But
++ * that could be expensive if the pattern is checked every time, and won't
++ * behave as expected if only applied in get_lock.
++ * */
++
++/* The old log functions.
++ * These need their own sort of cleanup someday as well.
++ * */
++#define log_msg_lk(key, keylen, fmt, args...) {\
++ uint8_t bb[GIO_KEY_SIZE*2 +3]; \
++ lck_key_to_hex( key, keylen, bb); \
++ printk(PROTO_NAME ": On lock 0x%s " fmt , bb , ## args ); \
++ }
++
++#define log_err_lk(key, keylen, fmt, args...) {\
++ uint8_t bb[GIO_KEY_SIZE*2 +3]; \
++ lck_key_to_hex( key, keylen, bb); \
++ printk(KERN_ERR PROTO_NAME ": ERROR On lock 0x%s " fmt , bb , ## args ); \
++ }
++
++#define log_msg_lck(lck, fmt, args...) {\
++ uint8_t bb[GIO_KEY_SIZE*2 +3]; \
++ lck_key_to_hex( (lck)->key, (lck)->keylen, bb); \
++ printk(PROTO_NAME ": On lock 0x%s " fmt , bb , ## args ); \
++ }
++
++#define log_err_lck(lck, fmt, args...) {\
++ uint8_t bb[GIO_KEY_SIZE*2 +3]; \
++ lck_key_to_hex( (lck)->key, (lck)->keylen, bb); \
++ printk(KERN_ERR PROTO_NAME ": ERROR On lock 0x%s " fmt , bb , ## args ); \
++ }
++
++#ifdef DEBUG_LVB
++static void __inline__
++print_lk_lvb (uint8_t * key, uint8_t * lvb, uint8_t st, uint8_t * dir)
++{
++ uint8_t bk[GIO_KEY_SIZE * 2 + 3];
++ uint8_t bl[GIO_LVB_SIZE * 2 + 3];
++ int i;
++ for (i = 0; i < GIO_KEY_SIZE; i++)
++ sprintf (&bk[(i * 2)], "%02x", (key[i]) & 0xff);
++ for (i = 0; i < GIO_LVB_SIZE; i++)
++ sprintf (&bl[(i * 2)], "%02x", (lvb[i]) & 0xff);
++ printk (PROTO_NAME ": On lock 0x%s with state %d\n\t%s LVB 0x%s\n",
++ bk, st, dir, bl);
++}
++
++#define lvb_log_msg_lk(k, fmt, args...) log_msg_lk( k , fmt , ## args )
++#define lvb_log_msg(fmt, args...) log_msg(lgm_Always , fmt , ## args )
++#else /*DEBUG_LVB */
++#define print_lk_lvb(k,l,s,d)
++#define lvb_log_msg_lk(k, fmt, args...)
++#define lvb_log_msg(fmt, args...)
++#endif /*DEBUG_LVB */
++
++/****************************************************************************/
++/**
++ * find_and_mark_lock -
++ * @key:
++ * @keylen:
++ * @lockp:
++ *
++ * looks for a lock struct of key. If found, marks it.
++ *
++ * Returns: TRUE or FALSE
++ */
++int
++find_and_mark_lock (uint8_t * key, uint8_t keylen, gulm_lock_t ** lockp)
++{
++ int found = FALSE;
++ uint32_t bkt;
++ gulm_lock_t *lck = NULL;
++ struct list_head *tmp;
++
++ /* now find the lock */
++ bkt = hash_lock_key (key, keylen);
++ bkt %= gulm_cm.ltpx.hashbuckets;
++
++ spin_lock (&gulm_cm.ltpx.hshlk[bkt]);
++ list_for_each (tmp, &gulm_cm.ltpx.lkhsh[bkt]) {
++ lck = list_entry (tmp, gulm_lock_t, gl_list);
++ if (memcmp (lck->key, key, keylen) == 0) {
++ found = TRUE;
++ atomic_inc (&lck->count);
++ break;
++ }
++ }
++ spin_unlock (&gulm_cm.ltpx.hshlk[bkt]);
++
++ if (found)
++ *lockp = lck;
++
++ return found;
++}
++
++/**
++ * mark_lock -
++ * @lck:
++ *
++ * like above, but since we have the lock, don't search for it.
++ *
++ * Returns: void
++ */
++void __inline__
++mark_lock (gulm_lock_t * lck)
++{
++ atomic_inc (&lck->count);
++}
++
++/**
++ * unmark_and_release_lock -
++ * @lck:
++ *
++ * decrement the counter on a lock, freeing it if it reaches 0.
++ * (also removes it from the hash table)
++ *
++ * TRUE if lock was freed.
++ *
++ * Returns: TRUE or FALSE
++ */
++int
++unmark_and_release_lock (gulm_lock_t * lck)
++{
++ uint32_t bkt;
++ int deld = FALSE;
++
++ bkt = hash_lock_key (lck->key, lck->keylen);
++ bkt %= gulm_cm.ltpx.hashbuckets;
++ spin_lock (&gulm_cm.ltpx.hshlk[bkt]);
++ if (atomic_dec_and_test (&lck->count)) {
++ list_del (&lck->gl_list);
++ deld = TRUE;
++ }
++ spin_unlock (&gulm_cm.ltpx.hshlk[bkt]);
++ if (deld) {
++ gulm_cm.ltpx.locks_total--;
++ gulm_cm.ltpx.locks_unl--;
++ if (lck->lvb != NULL) {
++ kfree (lck->lvb);
++ }
++ kfree (lck);
++ }
++
++ return deld;
++}
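++
++/* A minimal usage sketch (illustration, not part of the original code)
++ * of the mark/release pair above: look a lock up, read from it, then
++ * drop the reference so the struct can be freed once the count hits
++ * zero. The function name here is made up for the example.
++ */
++#if 0 /* example only */
++static void example_peek_lock (uint8_t * key, uint8_t keylen)
++{
++ gulm_lock_t *lck;
++ if (!find_and_mark_lock (key, keylen, &lck))
++ return; /* no struct for this key */
++ /* the mark (refcount) keeps lck alive while we read it. */
++ log_msg (lgm_Always, "last_suc_state = %#x\n", lck->last_suc_state);
++ unmark_and_release_lock (lck); /* may free lck */
++}
++#endif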
++
++/****************************************************************************/
++
++void
++gulm_key_to_lm_lockname (uint8_t * key, struct lm_lockname *lockname)
++{
++ lockname->ln_number = (u64) (key[9]) << 0;
++ lockname->ln_number |= (u64) (key[8]) << 8;
++ lockname->ln_number |= (u64) (key[7]) << 16;
++ lockname->ln_number |= (u64) (key[6]) << 24;
++ lockname->ln_number |= (u64) (key[5]) << 32;
++ lockname->ln_number |= (u64) (key[4]) << 40;
++ lockname->ln_number |= (u64) (key[3]) << 48;
++ lockname->ln_number |= (u64) (key[2]) << 56;
++ lockname->ln_type = key[1];
++}
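++
++/* Sketch of the inverse packing (illustration only): the lock number
++ * lives big-endian in key[2..9] with the GFS lock type in key[1], so
++ * packing then unpacking round-trips. gulm_get_lock() below performs
++ * this same packing for real requests.
++ */
++#if 0 /* example only */
++static void example_pack_lockname (struct lm_lockname *name,
++ uint8_t key[GIO_KEY_SIZE])
++{
++ int i;
++ key[1] = name->ln_type & 0xff;
++ for (i = 0; i < 8; i++) /* key[2] gets the most significant byte */
++ key[2 + i] = (name->ln_number >> (8 * (7 - i))) & 0xff;
++}
++#endif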
++
++void
++do_drop_lock_req (gulm_fs_t * fs, uint8_t state, uint8_t key[GIO_KEY_SIZE])
++{
++ unsigned int type;
++ struct lm_lockname lockname;
++ /* I might want to shove most of this function into the new lock-callback
++ * handling queue.
++ * later.
++ */
++
++ /* don't do callbacks on the gulm mount lock.
++ * I need to someday come up with a cleaner way of separating the
++ * firstmounter lock and the rest of gfs's locks.
++ * I dunno, this first byte is pretty clean.
++ * */
++ if (key[0] != 'G') {
++ return;
++ }
++
++ switch (state) {
++ case lg_lock_state_Unlock:
++ type = LM_CB_DROPLOCKS;
++ break;
++ case lg_lock_state_Exclusive:
++ type = LM_CB_NEED_E;
++ break;
++ case lg_lock_state_Shared:
++ type = LM_CB_NEED_S;
++ break;
++ case lg_lock_state_Deferred:
++ type = LM_CB_NEED_D;
++ break;
++ default:
++ type = LM_CB_DROPLOCKS;
++ break;
++ }
++ gulm_key_to_lm_lockname (key, &lockname);
++
++ qu_drop_req (&fs->cq, fs->cb, fs->fsdata, type,
++ lockname.ln_type, lockname.ln_number);
++}
++
++/**
++ * send_async_reply -
++ * @lck:
++ *
++ *
++ * Returns: void
++ */
++void
++send_async_reply (gulm_lock_t * lck)
++{
++ gulm_fs_t *fs = lck->fs;
++ struct lm_lockname lockname;
++
++ if (lck->key[0] == 'F') {
++ /* whee! it is the first mounter lock. two things:
++ * A: gfs couldn't care less about this.
++ * B: we need to up the sleeper in the fs. (hack)
++ */
++ complete (&fs->sleep);
++ return;
++ }
++
++ gulm_key_to_lm_lockname (lck->key, &lockname);
++
++ qu_async_rpl (&fs->cq, fs->cb, fs->fsdata, &lockname, lck->result);
++}
++
++/**
++ * send_drop_exp_inter -
++ * @lt:
++ * @name:
++ *
++ *
++ * Returns: int
++ */
++int
++send_drop_exp_inter (gulm_fs_t * fs, lock_table_t * lt, char *name)
++{
++ int err, len;
++ uint8_t mask[GIO_KEY_SIZE];
++
++ memset (mask, 0, GIO_KEY_SIZE);
++ /* pack key mask */
++ mask[0] = 0xff; /* minor lock type. 'G', 'F', 'J'. */
++ mask[1] = 0xff; /* GFS lock type. */
++ mask[2] = 0xff; /* next 8 are lock number */
++ mask[3] = 0xff;
++ mask[4] = 0xff;
++ mask[5] = 0xff;
++ mask[6] = 0xff;
++ mask[7] = 0xff;
++ mask[8] = 0xff;
++ mask[9] = 0xff;
++ /* Now stick the fsname into the remaining space. */
++ len = strlen (fs->fs_name);
++ strncpy (&mask[10], fs->fs_name, GIO_KEY_SIZE - 16);
++ len = MIN (len, GIO_KEY_SIZE - 16);
++ len += 11; /* 10 for the encoded buf, 1 for the '\0' after the fs name */
++
++ err = lg_lock_drop_exp (gulm_cm.hookup, name, mask, len);
++
++ return err;
++}
++
++/**
++ * send_lock_action -
++ * @lck:
++ *
++ *
++ * Returns: int
++ */
++int
++send_lock_action (gulm_lock_t * lck, uint8_t action)
++{
++ int err;
++
++ GULM_ASSERT (lck->req_type == glck_action, dump_gulm_lock_t (lck););
++
++ err = lg_lock_action_req (gulm_cm.hookup, lck->key, lck->keylen, action,
++ lck->lvb, lck->fs->lvb_size);
++ if (err != 0)
++ log_err ("Issues sending action request. %d\n", err);
++
++ return err;
++}
++
++/**
++ * send_lock_req -
++ * @lck:
++ *
++ *
++ * Returns: int
++ */
++int
++send_lock_req (gulm_lock_t * lck)
++{
++ gulm_fs_t *fs = lck->fs;
++ int err;
++ uint32_t flags = 0;
++ uint8_t state;
++
++ GULM_ASSERT (lck->req_type == glck_state, dump_gulm_lock_t (lck););
++
++ switch (lck->req_state) {
++ case LM_ST_EXCLUSIVE:
++ state = lg_lock_state_Exclusive;
++ break;
++ case LM_ST_DEFERRED:
++ state = lg_lock_state_Deferred;
++ break;
++ case LM_ST_SHARED:
++ state = lg_lock_state_Shared;
++ break;
++ case LM_ST_UNLOCKED:
++ state = lg_lock_state_Unlock;
++ break;
++ default:
++ GULM_ASSERT (0, log_err ("fsid=%s: Ain't no lock state %d.\n",
++ fs->fs_name, lck->req_state););
++ break;
++ }
++ if (lck->flags & LM_FLAG_TRY) {
++ flags |= lg_lock_flag_Try;
++ }
++ if (lck->flags & LM_FLAG_TRY_1CB) {
++ flags |= lg_lock_flag_Try | lg_lock_flag_DoCB;
++ }
++ if (lck->flags & LM_FLAG_NOEXP) {
++ flags |= lg_lock_flag_IgnoreExp;
++ }
++ if (lck->flags & LM_FLAG_ANY) {
++ flags |= lg_lock_flag_Any;
++ }
++ if (lck->flags & LM_FLAG_PRIORITY) {
++ flags |= lg_lock_flag_Piority;
++ }
++ if (lck->lvb != NULL) {
++ print_lk_lvb (lck->key, lck->lvb, lck->req_state, "Sending");
++ }
++
++ err = lg_lock_state_req (gulm_cm.hookup, lck->key, lck->keylen,
++ state, flags, lck->lvb, lck->fs->lvb_size);
++ if (err != 0)
++ log_err ("Issues sending state request. %d\n", err);
++
++ return err;
++}
++
++/**
++ * toggle_lock_counters -
++ *
++ * called after a successful request to change lock state. Decrements
++ * counts for what the lock was, and increments for what it is now.
++ */
++void
++toggle_lock_counters (lock_table_t * lt, int old, int new)
++{
++ /* what we had it in */
++ switch (old) {
++ case LM_ST_EXCLUSIVE:
++ lt->locks_exl--;
++ break;
++ case LM_ST_DEFERRED:
++ lt->locks_dfr--;
++ break;
++ case LM_ST_SHARED:
++ lt->locks_shd--;
++ break;
++ case LM_ST_UNLOCKED:
++ lt->locks_unl--;
++ break;
++ }
++ /* what we have it in */
++ switch (new) {
++ case LM_ST_EXCLUSIVE:
++ lt->locks_exl++;
++ break;
++ case LM_ST_DEFERRED:
++ lt->locks_dfr++;
++ break;
++ case LM_ST_SHARED:
++ lt->locks_shd++;
++ break;
++ case LM_ST_UNLOCKED:
++ lt->locks_unl++;
++ break;
++ }
++}
++
++/**
++ * calc_lock_result -
++ * @lck:
++ * @state:
++ * @error:
++ * @flags:
++ *
++ * This calculates the correct result to return for gfs lock requests.
++ *
++ * Returns: int
++ */
++int
++calc_lock_result (gulm_lock_t * lck,
++ uint8_t state, uint32_t error, uint32_t flags)
++{
++ gulm_fs_t *fs = lck->fs;
++ lock_table_t *lt = &gulm_cm.ltpx;
++ int result = -69;
++
++ /* adjust result based on success status. */
++ switch (error) {
++ case lg_err_Ok:
++ /* set result to current lock state. */
++ if (!(lck->flags & LM_FLAG_ANY)) {
++ /* simple case, we got what we asked for. */
++ result = lck->req_state;
++ } else {
++ /* complex case, we got something else, but we said that was ok */
++ switch (state) {
++ case lg_lock_state_Shared:
++ result = LM_ST_SHARED;
++ break;
++ case lg_lock_state_Deferred:
++ result = LM_ST_DEFERRED;
++ break;
++
++ case lg_lock_state_Exclusive:
++ case lg_lock_state_Unlock:
++ GULM_ASSERT (0,
++ dump_gulm_lock_t (lck);
++ log_err
++ ("fsid=%s: lock state %d is invalid on "
++ "ANY flag return\n", fs->fs_name,
++ state);
++ );
++ break;
++
++ default:
++ GULM_ASSERT (0,
++ dump_gulm_lock_t (lck);
++ log_err_lck (lck,
++ "fsid=%s: Anit no lock state %d.\n",
++ fs->fs_name, state);
++ );
++ break;
++ }
++ }
++
++ /* toggle counters.
++ * due to ANY flag, new state may not be req_state.
++ * */
++ toggle_lock_counters (lt, lck->cur_state, result);
++
++ /* if no internal unlocks, it is cachable. */
++ if (result != LM_ST_UNLOCKED && (flags & lg_lock_flag_Cachable))
++ result |= LM_OUT_CACHEABLE;
++
++ /* record and move on
++ * */
++ lck->last_suc_state = result & LM_OUT_ST_MASK;
++ break;
++ case lg_err_Canceled:
++ result = LM_OUT_CANCELED | lck->cur_state;
++ break;
++ case lg_err_TryFailed:
++ result = lck->cur_state; /* if we didn't get it. */
++ break;
++ default:
++ result = -error;
++ break;
++ }
++
++ return result;
++}
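++
++/* Result summary for the switch above (illustrative): on lg_err_Ok the
++ * granted state is returned, possibly ORed with LM_OUT_CACHEABLE; a
++ * canceled request returns LM_OUT_CANCELED | cur_state; a failed Try
++ * returns cur_state unchanged; any other error is returned negated.
++ */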
++
++/**
++ * my_strdup -
++ * @s:
++ *
++ *
++ * Returns: char *
++ */
++char *
++my_strdup (char *s)
++{
++ char *tmp;
++ int len;
++ len = strlen (s) + 1;
++ tmp = kmalloc (len, GFP_KERNEL);
++ if (tmp == NULL)
++ return NULL;
++ memcpy (tmp, s, len);
++ return tmp;
++}
++
++/* Instead of directly calling the send functions below, the callers will
++ * create one of these.
++ * These exist only because I cannot stick the lock_t onto two lists
++ * at once.
++ *
++ * this could use some clean up.
++ */
++typedef struct send_req_s {
++ struct list_head sr_list;
++ enum { sr_lock, sr_act, sr_cancel, sr_drop } type;
++ gulm_lock_t *who;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++ char *name;
++} send_req_t;
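++
++/* Request flow (illustrative summary): callers never write to the
++ * socket themselves. They queue a send_req_t via send_drop_exp() or
++ * add_lock_to_send_req_queue() below, and the single
++ * lt_io_sender_thread() pops entries off lt->to_be_sent and makes the
++ * actual lg_* calls, re-queueing the entry if the send fails.
++ */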
++
++/**
++ * alloc_send_req -
++ *
++ *
++ * Returns: send_req_t *
++ */
++send_req_t *
++alloc_send_req (void)
++{
++ send_req_t *tmp;
++ tmp = kmalloc (sizeof (send_req_t), GFP_KERNEL);
++ GULM_ASSERT (tmp != NULL,); /* so evil.... */
++ return tmp;
++}
++
++/**
++ * send_drop_exp -
++ * @fs:
++ * @lt:
++ * @name:
++ *
++ *
++ * Returns: int
++ */
++int
++send_drop_exp (gulm_fs_t * fs, lock_table_t * lt, char *name)
++{
++ send_req_t *sr;
++
++ sr = alloc_send_req ();
++ INIT_LIST_HEAD (&sr->sr_list);
++ sr->type = sr_drop;
++ sr->who = NULL;
++ sr->fs = fs;
++ sr->lt = lt;
++ if (name != NULL) {
++ sr->name = my_strdup (name);
++ } else {
++ sr->name = NULL;
++ }
++
++ spin_lock (&lt->queue_sender);
++ list_add (&sr->sr_list, &lt->to_be_sent);
++ spin_unlock (&lt->queue_sender);
++
++ wake_up (&lt->send_wchan);
++ return 0;
++}
++
++/**
++ * add_lock_to_send_req_queue -
++ * @lt:
++ * @lck:
++ *
++ *
++ * Returns: void
++ */
++void
++add_lock_to_send_req_queue (lock_table_t * lt, gulm_lock_t * lck, int type)
++{
++ send_req_t *sr;
++
++ sr = alloc_send_req ();
++ INIT_LIST_HEAD (&sr->sr_list);
++ sr->type = type;
++ sr->who = lck;
++ sr->fs = NULL;
++ sr->lt = NULL;
++ sr->name = NULL;
++ if (type != sr_cancel)
++ lck->in_to_be_sent = TRUE;
++
++ mark_lock (lck);
++
++ spin_lock (&lt->queue_sender);
++ list_add (&sr->sr_list, &lt->to_be_sent);
++ spin_unlock (&lt->queue_sender);
++
++ wake_up (&lt->send_wchan);
++}
++
++/**
++ * queue_empty -
++ * @lt:
++ *
++ *
++ * Returns: int
++ */
++static __inline__ int
++queue_empty (lock_table_t * lt)
++{
++ int ret;
++ spin_lock (&lt->queue_sender);
++ ret = list_empty (&lt->to_be_sent);
++ spin_unlock (&lt->queue_sender);
++ return ret;
++}
++
++/**
++ * lt_io_sender_thread -
++ * @data:
++ *
++ * Right now, only gfs lock requests should go through this thread.
++ * Must look; may not even need this.
++ * Well, it is nice to get the socket io off of whatever process the user
++ * is running that is going through gfs and into here. Is it?
++ *
++ *
++ * Returns: int
++ */
++int
++lt_io_sender_thread (void *data)
++{
++ lock_table_t *lt = (lock_table_t *) data;
++ struct list_head *tmp;
++ send_req_t *sr = NULL;
++ int err = 0;
++
++ daemonize ("gulm_LT_sender");
++ lt->sender_task = current;
++ complete (&lt->startup);
++
++ while (lt->running) {
++ do {
++ DECLARE_WAITQUEUE (__wait_chan, current);
++ current->state = TASK_INTERRUPTIBLE;
++ add_wait_queue (&lt->send_wchan, &__wait_chan);
++ if (queue_empty (lt))
++ schedule ();
++ remove_wait_queue (&lt->send_wchan, &__wait_chan);
++ current->state = TASK_RUNNING;
++ } while (0);
++ if (!lt->running)
++ break;
++
++ /* check to make sure socket is ok. */
++ down (&lt->sender);
++
++ /* pop next item to be sent
++ * (it will get pushed back if there were problems.)
++ */
++ spin_lock (&lt->queue_sender);
++ if (list_empty (&lt->to_be_sent)) {
++ spin_unlock (&lt->queue_sender);
++ up (&lt->sender);
++ continue;
++ }
++ tmp = (&lt->to_be_sent)->prev;
++ list_del (tmp);
++ spin_unlock (&lt->queue_sender);
++ sr = list_entry (tmp, send_req_t, sr_list);
++
++ /* send. */
++ if (sr->type == sr_lock) {
++ err = send_lock_req (sr->who);
++ if (err == 0) {
++ sr->who->in_to_be_sent = FALSE;
++ unmark_and_release_lock (sr->who);
++ }
++ } else if (sr->type == sr_act) {
++ err = send_lock_action (sr->who, sr->who->action);
++ if (err == 0) {
++ sr->who->in_to_be_sent = FALSE;
++ unmark_and_release_lock (sr->who);
++ }
++ } else if (sr->type == sr_cancel) {
++ err =
++ lg_lock_cancel_req (gulm_cm.hookup, sr->who->key,
++ sr->who->keylen);
++ if (err == 0)
++ unmark_and_release_lock (sr->who);
++ } else if (sr->type == sr_drop) {
++ /* XXX sr->lt isn't really needed.
++ * just lt should be fine.
++ * look into it someday.
++ */
++ err = send_drop_exp_inter (sr->fs, sr->lt, sr->name);
++ } else {
++ log_err ("Unknown send_req type! %d\n", sr->type);
++ }
++ up (&lt->sender);
++
++ /* if no errors, remove from queue. */
++ if (err == 0) {
++ if (sr->type == sr_drop && sr->name != NULL)
++ kfree (sr->name);
++ kfree (sr);
++ sr = NULL;
++ } else {
++ /* if errors, re-queue.
++ * the send_* funcs already reported the error, so we won't
++ * repeat that.
++ * */
++ spin_lock (&lt->queue_sender);
++ /* reset the pointers. otherwise things get weird. */
++ INIT_LIST_HEAD (&sr->sr_list);
++ list_add_tail (&sr->sr_list, &lt->to_be_sent);
++ spin_unlock (&lt->queue_sender);
++
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout (3 * HZ);
++
++ /* gotta break things up,
++ * else this loops hard and fast.
++ */
++ }
++ } /* while( lt->running ) */
++
++ complete (&lt->startup);
++ return 0;
++}
++
++/**
++ * cancel_pending_sender -
++ * @lck:
++ *
++ * want to cancel a lock request that we haven't sent to the server yet.
++ *
++ * this must skip over unlock requests. (never cancel unlocks)
++ *
++ * Returns: int
++ */
++int
++cancel_pending_sender (gulm_lock_t * lck)
++{
++ lock_table_t *lt = &gulm_cm.ltpx;
++ struct list_head *tmp, *nxt;
++ send_req_t *sr;
++ int found = FALSE;
++
++ spin_lock (&lt->queue_sender);
++
++ list_for_each_safe (tmp, nxt, &lt->to_be_sent) {
++ sr = list_entry (tmp, send_req_t, sr_list);
++ if (sr->who == lck) { /* good enough? */
++ if (sr->type == sr_cancel)
++ continue;
++ if (lck->req_state == LM_ST_UNLOCKED)
++ continue; /* do not cancel unlocks */
++ list_del (tmp);
++ kfree (sr);
++ found = TRUE;
++ lck->in_to_be_sent = FALSE;
++
++ /* Now we need to tell the waiting lock req that it got canceled.
++ * basically, we need to fake a lg_err_Canceled return....
++ */
++ lck->result = LM_OUT_CANCELED | lck->cur_state;
++ lck->actuallypending = FALSE;
++ lck->req_type = glck_nothing;
++ atomic_dec (&lt->locks_pending);
++#ifndef USE_SYNC_LOCKING
++ send_async_reply (lck);
++#else
++ complete (&lck->actsleep);
++#endif
++ unmark_and_release_lock (lck);
++ break;
++ }
++ }
++
++ spin_unlock (&lt->queue_sender);
++ return found;
++}
++
++/**
++ * gulm_lt_login_reply -
++ * @misc:
++ * @error:
++ * @which:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_login_reply (void *misc, uint32_t error, uint8_t which)
++{
++ if (error != 0) {
++ gulm_cm.ltpx.running = FALSE;
++ log_err ("LTPX: Got a %d from the login request.\n", error);
++ } else {
++ log_msg (lgm_Network2, "Logged into local LTPX.\n");
++ }
++ return error;
++}
++
++/**
++ * gulm_lt_logout_reply -
++ * @misc:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_logout_reply (void *misc)
++{
++ gulm_cm.ltpx.running = FALSE;
++ log_msg (lgm_Network2, "Logged out of local LTPX.\n");
++ return 0;
++}
++
++/**
++ * gulm_lt_lock_state -
++ * @misc:
++ * @key:
++ * @keylen:
++ * @state:
++ * @flags:
++ * @error:
++ * @LVB:
++ * @LVBlen:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_lock_state (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t state, uint32_t flags, uint32_t error,
++ uint8_t * LVB, uint16_t LVBlen)
++{
++ gulm_lock_t *lck;
++
++ if (key[0] == 'J') {
++ jid_state_reply (key, keylen, LVB, LVBlen);
++ return 0;
++ }
++
++ if (!find_and_mark_lock (key, keylen, &lck)) {
++ log_err_lk (key, keylen, "Got a lock state reply for a lock "
++ "that we don't know of. state:%#x flags:%#x error:%#x\n",
++ state, flags, error);
++ return 0;
++ }
++
++ lck->result = calc_lock_result (lck, state, error, flags);
++
++ if ((lck->result & LM_OUT_ST_MASK) != LM_ST_UNLOCKED &&
++ lck->lvb != NULL) {
++ memcpy (lck->lvb, LVB, MIN (lck->fs->lvb_size, LVBlen));
++ }
++
++ lck->actuallypending = FALSE;
++ lck->req_type = glck_nothing;
++ atomic_dec (&gulm_cm.ltpx.locks_pending);
++#ifndef USE_SYNC_LOCKING
++ send_async_reply (lck);
++#else
++ complete (&lck->actsleep);
++#endif
++
++ if (error != 0 && error != lg_err_TryFailed && error != lg_err_Canceled)
++ log_msg_lck (lck, "Error: %d:%s (req:%#x rpl:%#x lss:%#x)\n",
++ error, gio_Err_to_str (error),
++ lck->req_state, state, lck->last_suc_state);
++
++ unmark_and_release_lock (lck);
++ return 0;
++}
++
++/**
++ * gulm_lt_lock_action -
++ * @misc:
++ * @key:
++ * @keylen:
++ * @action:
++ * @error:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_lock_action (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t action, uint32_t error)
++{
++ gulm_lock_t *lck;
++
++ if (key[0] == 'J') {
++ jid_action_reply (key, keylen);
++ return 0;
++ }
++
++ if (!find_and_mark_lock (key, keylen, &lck)) {
++ log_err_lk (key, keylen, "Got a lock action reply for a lock "
++ "that we don't know of. action:%#x error:%#x\n",
++ action, error);
++ return 0;
++ }
++
++ if (action == lg_lock_act_HoldLVB ||
++ action == lg_lock_act_UnHoldLVB || action == lg_lock_act_SyncLVB) {
++ /* */
++ lck->result = error;
++ if (error != lg_err_Ok) {
++ log_err ("on action reply act:%d err:%d\n", action,
++ error);
++ }
++ lck->req_type = glck_nothing;
++ lck->actuallypending = FALSE;
++ complete (&lck->actsleep);
++ } else {
++ log_err_lck (lck, "Got strange Action %#x\n", action);
++ }
++ unmark_and_release_lock (lck);
++ return 0;
++}
++
++/**
++ * gulm_lt_drop_lock_req -
++ * @misc:
++ * @key:
++ * @keylen:
++ * @state:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_drop_lock_req (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t state)
++{
++ gulm_lock_t *lck;
++
++ if (key[0] == 'J') {
++ jid_header_lock_drop (key, keylen);
++ return 0;
++ }
++
++ if (!find_and_mark_lock (key, keylen, &lck)) {
++ log_err_lk (key, keylen, "Got a drop lcok request for a lock "
++ "that we don't know of. state:%#x\n", state);
++ return 0;
++ }
++
++ do_drop_lock_req (lck->fs, state, key);
++
++ unmark_and_release_lock (lck);
++ return 0;
++}
++
++/**
++ * gulm_lt_drop_all -
++ * @misc:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_drop_all (void *misc)
++{
++ passup_droplocks ();
++ return 0;
++}
++
++/**
++ * gulm_lt_error -
++ * @misc:
++ * @err:
++ *
++ *
++ * Returns: int
++ */
++int
++gulm_lt_error (void *misc, uint32_t err)
++{
++ log_err ("LTPX: RANDOM ERROR %d\n", err);
++ return err;
++}
++
++static lg_lockspace_callbacks_t lock_cb = {
++ login_reply:gulm_lt_login_reply,
++ logout_reply:gulm_lt_logout_reply,
++ lock_state:gulm_lt_lock_state,
++ lock_action:gulm_lt_lock_action,
++ drop_lock_req:gulm_lt_drop_lock_req,
++ drop_all:gulm_lt_drop_all,
++ error:gulm_lt_error
++};
++
++/**
++ * lt_io_recving_thread -
++ * @data:
++ *
++ *
++ * Returns: int
++ */
++int
++lt_io_recving_thread (void *data)
++{
++ lock_table_t *lt = &gulm_cm.ltpx;
++ int err;
++
++ daemonize ("gulm_LT_recver");
++ lt->recver_task = current;
++ complete (&lt->startup);
++
++ while (lt->running) {
++ err = lg_lock_handle_messages (gulm_cm.hookup, &lock_cb, NULL);
++ if (err != 0) {
++ log_err ("gulm_LT_recver err %d\n", err);
++ lt->running = FALSE; /* should stop the sender thread. */
++ wake_up (&lt->send_wchan);
++ break;
++ }
++ } /* while( lt->running ) */
++
++ complete (&lt->startup);
++ return 0;
++}
++
++/**
++ * lt_logout - log out of all of the lock tables
++ */
++void
++lt_logout (void)
++{
++ lock_table_t *lt = &gulm_cm.ltpx;
++ int err;
++
++ if (lt->running) {
++ lt->running = FALSE;
++
++ /* stop sender thread */
++ wake_up (&lt->send_wchan);
++ wait_for_completion (&lt->startup);
++
++ /* stop recver thread */
++ down (&lt->sender);
++ err = lg_lock_logout (gulm_cm.hookup);
++ up (&lt->sender);
++
++ /* wait for thread to finish */
++ wait_for_completion (&lt->startup);
++ }
++
++}
++
++/**
++ * lt_login - login to lock tables.
++ *
++ * Returns: int
++ */
++int
++lt_login (void)
++{
++ int err;
++ lock_table_t *lt = &gulm_cm.ltpx;
++
++ if (lt->running)
++ log_err
++ ("Trying to log into LTPX when it appears to be logged in!\n");
++
++ err = lg_lock_login (gulm_cm.hookup, "GFS ");
++ if (err != 0) {
++ log_err ("Failed to send login request. %d\n", err);
++ goto fail;
++ }
++
++ /* start recver thread. */
++ lt->running = TRUE;
++ err = kernel_thread (lt_io_recving_thread, lt, 0);
++ if (err < 0) {
++ log_err ("Failed to start gulm_lt_IOd. (%d)\n", err);
++ goto fail;
++ }
++ wait_for_completion (&lt->startup);
++
++ /* start sender thread */
++ err = kernel_thread (lt_io_sender_thread, lt, 0);
++ if (err < 0) {
++ log_err ("Failed to start gulm_LT_sender. (%d)\n", err);
++ goto fail;
++ }
++ wait_for_completion (&lt->startup);
++
++ return 0;
++ fail:
++ lt_logout ();
++ log_msg (lgm_Always, "Exiting lt_login. err:%d\n", err);
++ return err;
++}
++
++/****************************************************************************/
++
++/**
++ * internal_gulm_get_lock -
++ * @fs:
++ * @key:
++ * @keylen:
++ * @lockp:
++ *
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++int
++internal_gulm_get_lock (gulm_fs_t * fs, uint8_t * key, uint8_t keylen,
++ gulm_lock_t ** lockp)
++{
++ int found = FALSE;
++ uint32_t bkt;
++ gulm_lock_t *lck = NULL;
++
++ found = find_and_mark_lock (key, keylen, &lck);
++
++ /* malloc space */
++ if (found) {
++ GULM_ASSERT (lck->magic_one == 0xAAAAAAAA,);
++ } else {
++ lck = kmalloc (sizeof (gulm_lock_t), GFP_KERNEL);
++ if (lck == NULL) {
++ log_err
++ ("fsid=%s: Out of memory for lock struct in get_lock!\n",
++ fs->fs_name);
++ return -ENOMEM;
++ }
++ memset (lck, 0, sizeof (gulm_lock_t));
++ INIT_LIST_HEAD (&lck->gl_list);
++ atomic_set (&lck->count, 1);
++ lck->magic_one = 0xAAAAAAAA;
++ lck->fs = fs;
++ memcpy (lck->key, key, keylen);
++ lck->keylen = keylen;
++ lck->lvb = NULL;
++ init_completion (&lck->actsleep);
++ lck->actuallypending = FALSE;
++ lck->in_to_be_sent = FALSE;
++ lck->result = 0;
++ lck->action = -1;
++ lck->req_type = glck_nothing;
++ lck->last_suc_state = LM_ST_UNLOCKED;
++
++ gulm_cm.ltpx.locks_total++;
++ gulm_cm.ltpx.locks_unl++;
++
++ bkt = hash_lock_key (key, keylen);
++ bkt %= gulm_cm.ltpx.hashbuckets;
++
++ spin_lock (&gulm_cm.ltpx.hshlk[bkt]);
++ list_add (&lck->gl_list, &gulm_cm.ltpx.lkhsh[bkt]);
++ spin_unlock (&gulm_cm.ltpx.hshlk[bkt]);
++ }
++
++ *lockp = lck;
++
++ return 0;
++}
++
++/**
++ * gulm_get_lock -
++ * @lockspace:
++ * @name:
++ * @lockp:
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++int
++gulm_get_lock (lm_lockspace_t * lockspace, struct lm_lockname *name,
++ lm_lock_t ** lockp)
++{
++ int err, len;
++ gulm_fs_t *fs = (gulm_fs_t *) lockspace;
++ uint8_t key[GIO_KEY_SIZE];
++
++ /* I could add a per-fs lock to force only one gulm_get_lock at a time.
++ */
++ down (&fs->get_lock);
++
++ memset (key, 0, GIO_KEY_SIZE);
++ /* pack lockname */
++ key[0] = 'G'; /* G: fs lock, F: First mounter, J: JID mapping lock */
++ key[1] = name->ln_type & 0xff;
++ key[2] = (name->ln_number >> 56) & 0xff;
++ key[3] = (name->ln_number >> 48) & 0xff;
++ key[4] = (name->ln_number >> 40) & 0xff;
++ key[5] = (name->ln_number >> 32) & 0xff;
++ key[6] = (name->ln_number >> 24) & 0xff;
++ key[7] = (name->ln_number >> 16) & 0xff;
++ key[8] = (name->ln_number >> 8) & 0xff;
++ key[9] = (name->ln_number >> 0) & 0xff;
++
++ /* Now stick the fsname into the remaining space. */
++ len = strlen (fs->fs_name);
++ strncpy (&key[10], fs->fs_name, GIO_KEY_SIZE - 16);
++
++ len = MIN (len, GIO_KEY_SIZE - 16);
++ len += 11; /* 10 for the encoded buf, 1 for the '\0' after the fs name */
++ err = internal_gulm_get_lock (fs, key, len, (gulm_lock_t **) lockp);
++
++ up (&fs->get_lock);
++
++ return err;
++}
++
++/**
++ * gulm_put_lock -
++ * @lock:
++ *
++ *
++ * Returns: void
++ */
++void
++gulm_put_lock (lm_lock_t * lock)
++{
++ gulm_lock_t *lck = (gulm_lock_t *) lock;
++ lock_table_t *lt = &gulm_cm.ltpx;
++ gulm_fs_t *fs = lck->fs;
++
++ down (&fs->get_lock);
++
++ GULM_ASSERT (lt != NULL,);
++
++ if (lck->last_suc_state != LM_ST_UNLOCKED) {
++ log_err_lck (lck,
++ "fsid=%s: gulm_put_lock called on a lock that is not unlocked!"
++ " Current state:%#x\n", lck->fs->fs_name,
++ lck->last_suc_state);
++ /* I'm still not sure about this one. We should never see it, so I
++ * don't think it is that big of a deal, but I dunno.
++ *
++ * Maybe should just make it an assertion.
++ *
++ * with the mark/unmark code, is it even a concern?
++ */
++ }
++
++ unmark_and_release_lock (lck);
++ /* lck = NULL; */
++
++ up (&fs->get_lock);
++
++}
++
++static int
++valid_trasition (unsigned int cur, unsigned int req)
++{
++ int lock_state_changes[16] = { /* unl exl def shr */
++ FALSE, TRUE, TRUE, TRUE, /* unl */
++ TRUE, FALSE, TRUE, TRUE, /* exl */
++ TRUE, TRUE, FALSE, TRUE, /* def */
++ TRUE, TRUE, TRUE, FALSE /* shr */
++ };
++ GULM_ASSERT (cur < 4
++ && req < 4, log_err ("cur:%d req:%d\n", cur, req););
++
++ return (lock_state_changes[4 * cur + req]);
++}
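++
++/* Reading the table above (illustration): rows are the current state
++ * and columns the requested state, in the order unl, exl, def, shr,
++ * assuming LM_ST_UNLOCKED..LM_ST_SHARED map to 0..3 as the bounds
++ * check suggests. Only the no-op diagonal is rejected, e.g.:
++ */
++#if 0 /* example only */
++static void example_transitions (void)
++{
++ GULM_ASSERT (valid_trasition (LM_ST_UNLOCKED, LM_ST_EXCLUSIVE),);
++ GULM_ASSERT (!valid_trasition (LM_ST_SHARED, LM_ST_SHARED),);
++}
++#endif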
++
++/**
++ * verify_gulm_lock_t -
++ * @lck:
++ *
++ * wonder if I should add some other checks.
++ *
++ * Returns: int
++ */
++int
++verify_gulm_lock_t (gulm_lock_t * lck)
++{
++ if (lck == NULL) {
++ log_err ("Lock pointer was NULL!\n");
++ return -1;
++ }
++ if (lck->fs == NULL) {
++ log_err ("This lock has no filesystem!!!\n");
++ return -1;
++ }
++ return 0;
++}
++
++/**
++ * gulm_lock -
++ * @lock:
++ * @cur_state:
++ * @req_state:
++ * @flags:
++ *
++ *
++ * Returns: int
++ */
++unsigned int
++gulm_lock (lm_lock_t * lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags)
++{
++ gulm_lock_t *lck = NULL;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++
++ /* verify vars. */
++ lck = (gulm_lock_t *) lock;
++ if (verify_gulm_lock_t (lck) != 0) {
++ return -EINVAL;
++ }
++ lt = &gulm_cm.ltpx;
++ fs = lck->fs;
++
++ GULM_ASSERT (valid_trasition (cur_state, req_state),
++ log_err_lck (lck, "want %d with %s thinks:%d\n", req_state,
++ (LM_FLAG_TRY & flags) ? "try" : (LM_FLAG_NOEXP
++ & flags) ?
++ "noexp" : "no flags", cur_state);
++ );
++
++ GULM_ASSERT (lck->actuallypending == FALSE, dump_gulm_lock_t (lck););
++
++ /* save the details of this request. */
++ lck->req_type = glck_state;
++ lck->result = 0;
++ lck->cur_state = cur_state;
++ lck->req_state = req_state;
++ lck->flags = flags;
++
++ /* moving these here fixes a race on the s390 that ben found.
++ * basically, the request was sent to the server, the server receives
++ * it, the server processes, the server sends a reply, the client
++ * receives the reply, and the client tries to process the reply before
++ * this thread could mark it as actuallypending.
++ * */
++ lck->actuallypending = TRUE;
++ atomic_inc (&lt->locks_pending);
++ add_lock_to_send_req_queue (lt, lck, sr_lock);
++
++ lt->lops++;
++#ifdef USE_SYNC_LOCKING
++ wait_for_completion (&lck->actsleep);
++#endif
++
++#ifdef USE_SYNC_LOCKING
++ return lck->result;
++#else
++ return LM_OUT_ASYNC;
++#endif
++}
++
++/**
++ * gulm_unlock -
++ * @lock:
++ * @cur_state:
++ *
++ *
++ * Returns: int
++ */
++unsigned int
++gulm_unlock (lm_lock_t * lock, unsigned int cur_state)
++{
++ int e;
++ e = gulm_lock (lock, cur_state, LM_ST_UNLOCKED, 0);
++ return e;
++}
++
++/**
++ * gulm_cancel -
++ * @lock:
++ *
++ */
++void
++gulm_cancel (lm_lock_t * lock)
++{
++ gulm_lock_t *lck;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++
++ /* verify vars. */
++ lck = (gulm_lock_t *) lock;
++ if (verify_gulm_lock_t (lck) != 0) {
++ return;
++ }
++ lt = &gulm_cm.ltpx;
++ fs = lck->fs;
++
++ if (lck->actuallypending) {
++ if (lck->in_to_be_sent) {
++ /* this should pull the req out of the send queue and have it
++ * return with a cancel code without going to the server.
++ */
++ cancel_pending_sender (lck);
++ } else {
++ add_lock_to_send_req_queue (lt, lck, sr_cancel);
++ }
++ } else {
++ log_msg_lck (lck, "Cancel called with no pending request.\n");
++ }
++
++}
++
++/**
++ * gulm_hold_lvb -
++ * @lock:
++ * @lvbp:
++ *
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++int
++gulm_hold_lvb (lm_lock_t * lock, char **lvbp)
++{
++ gulm_lock_t *lck;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++ int err = -1;
++
++ /* verify vars. */
++ lck = (gulm_lock_t *) lock;
++ if (verify_gulm_lock_t (lck) != 0) {
++ return -EINVAL;
++ }
++ lt = &gulm_cm.ltpx;
++ fs = lck->fs;
++
++ /* what were these for? */
++ GULM_ASSERT (lck->magic_one == 0xAAAAAAAA,
++ log_msg_lck (lck, "Bad gulm_lock magic.\n"););
++ GULM_ASSERT (lt->magic_one == 0xAAAAAAAA,
++ log_msg_lck (lck, "Bad lock_table magic.\n"););
++
++ lvb_log_msg_lk (lck->key, "Entering gulm_hold_lvb\n");
++
++ GULM_ASSERT (lck->lvb == NULL,
++ log_msg_lck (lck,
++ "fsid=%s: Lvb data wasn't null! must be held "
++ "already.\n", fs->fs_name);
++ );
++
++ GULM_ASSERT (lck->actuallypending == FALSE, dump_gulm_lock_t (lck););
++
++ lck->lvb = kmalloc (fs->lvb_size, GFP_KERNEL);
++ if (lck->lvb == NULL) {
++ err = -ENOMEM;
++ goto fail;
++ }
++ memset (lck->lvb, 0, fs->lvb_size);
++
++ lck->req_type = glck_action;
++ lck->action = lg_lock_act_HoldLVB;
++ lck->result = 0;
++ lck->actuallypending = TRUE;
++ add_lock_to_send_req_queue (lt, lck, sr_act);
++
++ wait_for_completion (&lck->actsleep);
++
++ if (lck->result != lg_err_Ok) {
++ log_err ("fsid=%s: Got error %d on hold lvb request.\n",
++ fs->fs_name, lck->result);
++ kfree (lck->lvb);
++ lck->lvb = NULL;
++ goto fail;
++ }
++
++ lt->locks_lvbs++;
++
++ *lvbp = lck->lvb;
++
++ lvb_log_msg_lk (lck->key, "fsid=%s: Exiting gulm_hold_lvb\n",
++ fs->fs_name);
++ return 0;
++ fail:
++ if (err != 0)
++ log_msg (lgm_Always,
++ "fsid=%s: Exiting gulm_hold_lvb with errors (%d)\n",
++ fs->fs_name, err);
++ return err;
++}
++
++/**
++ * gulm_unhold_lvb -
++ * @lock:
++ * @lvb:
++ *
++ *
++ * Returns: void
++ */
++void
++gulm_unhold_lvb (lm_lock_t * lock, char *lvb)
++{
++ gulm_lock_t *lck = NULL;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++
++ /* verify vars. */
++ lck = (gulm_lock_t *) lock;
++ if (verify_gulm_lock_t (lck) != 0) {
++ return;
++ }
++ lt = &gulm_cm.ltpx;
++ fs = lck->fs;
++
++ GULM_ASSERT (lck->actuallypending == FALSE, dump_gulm_lock_t (lck););
++
++ if (lck->lvb != lvb) {
++ log_err ("fsid=%s: AH! LVB pointer missmatch! %p != %p\n",
++ fs->fs_name, lck->lvb, lvb);
++ goto exit;
++ }
++
++ lvb_log_msg_lk (lck->key, "Entering gulm_unhold_lvb\n");
++
++ lck->req_type = glck_action;
++ lck->action = lg_lock_act_UnHoldLVB;
++ lck->result = 0;
++ lck->actuallypending = TRUE;
++ add_lock_to_send_req_queue (lt, lck, sr_act);
++
++ wait_for_completion (&lck->actsleep);
++
++ /* XXX ummm, is it sane to not free the memory if the command fails?
++ * gfs will still think that the lvb was dropped successfully....
++ * (it assumes it is always successful)
++ * Maybe I should retry the drop request then?
++ */
++ if (lck->result != lg_err_Ok) {
++ log_err ("fsid=%s: Got error %d on unhold LVB request.\n",
++ lck->fs->fs_name, lck->result);
++ } else {
++ if (lck->lvb != NULL)
++ kfree (lck->lvb);
++ lck->lvb = NULL;
++ lt->locks_lvbs--;
++ }
++ exit:
++ lvb_log_msg ("Exiting gulm_unhold_lvb\n");
++}
++
++/**
++ * gulm_sync_lvb -
++ * @lock:
++ * @lvb:
++ *
++ * umm, is this even used anymore? yes.
++ *
++ * Returns: void
++ */
++void
++gulm_sync_lvb (lm_lock_t * lock, char *lvb)
++{
++ gulm_lock_t *lck = NULL;
++ gulm_fs_t *fs;
++ lock_table_t *lt;
++
++ /* verify vars. */
++ lck = (gulm_lock_t *) lock;
++ if (verify_gulm_lock_t (lck) != 0) {
++ return;
++ }
++ lt = &gulm_cm.ltpx;
++ fs = lck->fs;
++
++ GULM_ASSERT (lck->actuallypending == FALSE, dump_gulm_lock_t (lck););
++
++ /* this check is also in the server, so it isn't really needed here. */
++ if (lck->last_suc_state != LM_ST_EXCLUSIVE) {
++ log_err ("sync_lvb: You must hold the lock Exclusive first.\n");
++ goto exit; /*cannot do anything */
++ }
++ if (lck->lvb == NULL) {
++ log_err ("sync_lvb: You forgot to call hold lvb first.\n");
++ goto exit;
++ }
++ if (lck->lvb != lvb) {
++ log_err ("fsid=%s: AH! LVB pointer missmatch! %p != %p\n",
++ fs->fs_name, lck->lvb, lvb);
++ goto exit;
++ }
++
++ lvb_log_msg_lk (lck->key, "Entering gulm_sync_lvb\n");
++
++ lck->req_type = glck_action;
++ lck->action = lg_lock_act_SyncLVB;
++ lck->result = 0;
++ lck->actuallypending = TRUE;
++ add_lock_to_send_req_queue (lt, lck, sr_act);
++
++ wait_for_completion (&lck->actsleep);
++
++ /* XXX? retry if I get an error? */
++ if (lck->result != lg_err_Ok) {
++ log_err_lck (lck,
++ "fsid=%s: Got error %d:%s on Sync LVB request.\n",
++ fs->fs_name, lck->result,
++ gio_Err_to_str (lck->result));
++ }
++ exit:
++ lvb_log_msg ("Exiting gulm_sync_lvb\n");
++}
++
++/*****************************************************************************/
++static int
++gulm_plock_get (lm_lockspace_t * lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t * start, uint64_t * end, int *exclusive,
++ unsigned long *rowner)
++{
++ return -ENOSYS;
++}
++
++static int
++gulm_plock (lm_lockspace_t * lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ int wait, int exclusive, uint64_t start, uint64_t end)
++{
++ return -ENOSYS;
++}
++
++static int
++gulm_punlock (lm_lockspace_t * lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t start, uint64_t end)
++{
++ return -ENOSYS;
++}
++
++/****************************************************************************/
++/****************************************************************************/
++/****************************************************************************/
++/* should move the firstmounter lock stuff into its own file perhaps? */
++/**
++ * get_special_lock -
++ * @fs: <> filesystem we're getting special lock for
++ *
++ * Returns: gulm_lock_t
++ */
++STATIC gulm_lock_t *
++get_special_lock (gulm_fs_t * fs)
++{
++ int err, len;
++ gulm_lock_t *lck = NULL;
++ uint8_t key[GIO_KEY_SIZE];
++
++ /* pack lockname */
++ memset (key, 0, GIO_KEY_SIZE);
++ /* The F at the beginning doesn't clash with the G that prefixes every fs
++ * lock.
++ */
++ memcpy (key, "FirstMount", 10);
++ len = strlen (fs->fs_name);
++ strncpy (&key[10], fs->fs_name, GIO_KEY_SIZE - 21);
++ len = MIN (len, GIO_KEY_SIZE - 21);
++ len += 11;
++
++ err = internal_gulm_get_lock (fs, key, len, &lck);
++
++ /* return pointer */
++ return lck;
++}
++
++/**
++ * do_lock_time_out -
++ * @d:
++ *
++ * after timeout, set cancel request on the handler queue. (since we cannot
++ * call it from within the timer code.)
++ *
++ */
++static void
++do_lock_time_out (unsigned long d)
++{
++ gulm_lock_t *lck = (gulm_lock_t *) d;
++ qu_function_call (&lck->fs->cq, gulm_cancel, lck);
++}
++
++/**
++ * get_mount_lock -
++ * @fs:
++ * @first:
++ *
++ * Get the Firstmount lock.
++ * We try to grab it Exl. If we get that, then we are the first client
++ * mounting this fs. Otherwise we grab it shared to show that there are
++ * clients using this fs.
++ *
++ * Returns: int
++ */
++int
++get_mount_lock (gulm_fs_t * fs, int *first)
++{
++ int err;
++ struct timer_list locktimeout;
++ gulm_lock_t *lck = NULL;
++ /*
++ * first we need to get the lock into the hash.
++ * then we can try to get it Exl with try and noexp.
++ * if the try fails, grab it shared.
++ */
++
++ lck = get_special_lock (fs); /* there is only a mount lock. */
++ if (lck == NULL) {
++ err = -ENOMEM;
++ goto fail;
++ }
++
++ fs->mountlock = lck;
++ try_it_again:
++ *first = FALSE; /* assume we're not first */
++
++ err = gulm_lock (lck, LM_ST_UNLOCKED, LM_ST_EXCLUSIVE,
++ LM_FLAG_TRY | LM_FLAG_NOEXP);
++#ifndef USE_SYNC_LOCKING
++ wait_for_completion (&fs->sleep);
++#endif
++
++ if ((lck->result & LM_OUT_ST_MASK) == LM_ST_EXCLUSIVE) {
++ /* we got the lock, we're the first mounter. */
++ *first = TRUE;
++ log_msg (lgm_locking, "fsid=%s: Got mount lock Exclusive.\n",
++ fs->fs_name);
++ return 0;
++ } else if ((lck->result & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
++ log_msg (lgm_locking,
++ "fsid=%s: Didn't get mount lock Exl, someone else "
++ "was first, trying for shared.\n", fs->fs_name);
++
++ /* the try failed, pick it up shared. */
++ /* There was a case (bug #220) where we could hang here.
++ *
++ * To handle this, we put up a timer for a couple of
++ * minutes. If it trips, it cancels our shared
++ * request, which we then see, so we go back and try the
++ * EXL again. If the Firstmounter is fine and is just
++ * taking a damn long time to do its work, this just ends
++ * back here, no worse for the wear.
++ *
++ * Another way to do this, is to wait for a killed message
++ * for the master. When we get that, && we're pending
++ * shared here, send the gulm_cancel for the mounter lock.
++ * (too bad we are not in the fs list yet at this point.
++ * (well, maybe that *isn't* a bad thing))
++ */
++ init_timer (&locktimeout);
++ locktimeout.function = do_lock_time_out;
++ locktimeout.data = (unsigned long) lck;
++ mod_timer (&locktimeout, jiffies + (120 * HZ));
++ err = gulm_lock (lck, LM_ST_UNLOCKED, LM_ST_SHARED, 0);
++#ifndef USE_SYNC_LOCKING
++ wait_for_completion (&fs->sleep);
++#endif
++ del_timer (&locktimeout);
++
++ if ((lck->result & LM_OUT_ST_MASK) == LM_ST_SHARED) {
++ /* kewl we got it. */
++ log_msg (lgm_locking,
++ "fsid=%s: Got mount lock shared.\n",
++ fs->fs_name);
++ return 0;
++ }
++
++ log_msg (lgm_locking,
++ "fsid=%s: Shared req timed out, trying Exl again.\n",
++ fs->fs_name);
++ goto try_it_again;
++ }
++ fail:
++ log_err ("Exit get_mount_lock err=%d\n", err);
++ return err;
++}
++
++/**
++ * downgrade_mount_lock -
++ * @fs:
++ *
++ * drop the Firstmount lock down to shared. This lets others mount.
++ *
++ * Returns: int
++ */
++int
++downgrade_mount_lock (gulm_fs_t * fs)
++{
++ int err;
++ gulm_lock_t *lck = (gulm_lock_t *) fs->mountlock;
++ /* we were first, so we have it exl.
++ * shift it to shared so others may mount.
++ */
++ err = gulm_lock (lck, LM_ST_EXCLUSIVE, LM_ST_SHARED, LM_FLAG_NOEXP);
++#ifndef USE_SYNC_LOCKING
++ wait_for_completion (&fs->sleep);
++#endif
++
++ if ((lck->result & LM_OUT_ST_MASK) != LM_ST_SHARED) {
++ log_err
++ ("fsid=%s: Couldn't downgrade mount lock to shared!!!!!\n",
++ fs->fs_name);
++ }
++ return 0;
++}
++
++/**
++ * drop_mount_lock - drop our hold on the firstmount lock.
++ * @fs: <> the filesystem pointer.
++ *
++ * Returns: int
++ */
++int
++drop_mount_lock (gulm_fs_t * fs)
++{
++ int err;
++ gulm_lock_t *lck = (gulm_lock_t *) fs->mountlock;
++
++ if (fs->mountlock == NULL) {
++ log_err ("fsid=%s: There's no Mount lock!!!!!\n", fs->fs_name);
++ return -1;
++ }
++ err = gulm_unlock (lck, LM_ST_SHARED);
++#ifndef USE_SYNC_LOCKING
++ wait_for_completion (&fs->sleep);
++#endif
++
++ if (lck->result != LM_ST_UNLOCKED)
++ log_err ("fsid=%s: Couldn't unlock mount lock!!!!!!\n",
++ fs->fs_name);
++ gulm_put_lock (fs->mountlock);
++ fs->mountlock = NULL;
++ return 0;
++}
++
++/*****************************************************************************/
++struct lm_lockops gulm_ops = {
++ lm_proto_name:PROTO_NAME,
++ lm_mount:gulm_mount,
++ lm_others_may_mount:gulm_others_may_mount,
++ lm_unmount:gulm_unmount,
++ lm_get_lock:gulm_get_lock,
++ lm_put_lock:gulm_put_lock,
++ lm_lock:gulm_lock,
++ lm_unlock:gulm_unlock,
++ lm_cancel:gulm_cancel,
++ lm_hold_lvb:gulm_hold_lvb,
++ lm_unhold_lvb:gulm_unhold_lvb,
++ lm_sync_lvb:gulm_sync_lvb,
++ lm_plock_get:gulm_plock_get,
++ lm_plock:gulm_plock,
++ lm_punlock:gulm_punlock,
++ lm_recovery_done:gulm_recovery_done,
++ lm_owner:THIS_MODULE,
++};
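++
++/* Sketch of how the filesystem side drives this table (illustration;
++ * the calling convention is assumed, not defined in this file): GFS
++ * picks the module whose lm_proto_name matches the mount, then calls
++ * through the pointers, roughly:
++ *
++ * gulm_ops.lm_get_lock (lockspace, &name, &lock);
++ * gulm_ops.lm_lock (lock, LM_ST_UNLOCKED, LM_ST_SHARED, 0);
++ * gulm_ops.lm_unlock (lock, LM_ST_SHARED);
++ * gulm_ops.lm_put_lock (lock);
++ */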
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_prints.h linux-patched/fs/gfs_locking/lock_gulm/gulm_prints.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_prints.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_prints.h 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,45 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __gulm_prints_h__
++#define __gulm_prints_h__
++#include "gulm_log_msg_bits.h"
++
++#define PROTO_NAME "lock_gulm"
++
++#ifdef GULM_ASSERT
++#undef GULM_ASSERT
++#endif
++#define GULM_ASSERT(x, do) \
++{ \
++ if (!(x)) \
++ { \
++ printk("\n"PROTO_NAME": Assertion failed on line %d of file %s\n" \
++ PROTO_NAME": assertion: \"%s\"\n", \
++ __LINE__, __FILE__, #x ); \
++ {do} \
++ panic("\n"PROTO_NAME": Record message above and reboot.\n"); \
++ } \
++}
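++
++/* Usage note (illustrative): the second argument is a statement block
++ * run before the panic, handy for dumping state, e.g.
++ *
++ * GULM_ASSERT (lck->actuallypending == FALSE, dump_gulm_lock_t (lck););
++ */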
++
++#define log_msg(v, fmt, args...) if(((v)&gulm_cm.verbosity)==(v)||(v)==lgm_Always) {\
++ printk(PROTO_NAME ": " fmt, ## args); \
++}
++#define log_err(fmt, args...) {\
++ printk(KERN_ERR PROTO_NAME ": ERROR " fmt, ## args); \
++}
++
++#define log_nop(fmt, args...)
++#define TICK printk("TICK==>" PROTO_NAME ": [%s:%d] pid:%ld\n",__FILE__,__LINE__,osi_pid())
++
++#endif /*__gulm_prints_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_procinfo.c linux-patched/fs/gfs_locking/lock_gulm/gulm_procinfo.c
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_procinfo.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_procinfo.c 2004-06-16 12:03:21.957894998 -0500
+@@ -0,0 +1,165 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++#include <linux/kernel.h>
++#include <linux/proc_fs.h>
++#include "util.h"
++
++extern gulm_cm_t gulm_cm;
++
++struct proc_dir_entry *gulm_proc_dir;
++struct proc_dir_entry *gulm_fs_proc_dir;
++
++/* the read operating function. */
++int
++gulm_fs_proc_read (char *buf, char **start, off_t off, int count, int *eof,
++ void *data)
++{
++ gulm_fs_t *fs = (gulm_fs_t *) data;
++ count = 0; /* ignore how much it wants */
++
++ count += sprintf (buf + count, "Filesystem: %s\nJID: %d\n"
++ "handler_queue_cur: %d\n"
++ "handler_queue_max: %d\n",
++ fs->fs_name, fs->fsJID,
++ fs->cq.task_count, fs->cq.task_max);
++
++ *eof = TRUE;
++ if (off >= count)
++ return 0;
++ *start = buf + off;
++ return (count - off);
++}
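++
++/* Note on the read handlers here (illustrative): the whole report is
++ * rebuilt on every call; *eof is set so the kernel stops after one
++ * pass, and *start/off implement the usual partial-read contract for
++ * single-page proc files.
++ */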
++
++/* read the stuff for all */
++int
++gulm_core_proc_read (char *buf, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ count = 0; /* ignore how much it wants */
++
++ count = sprintf (buf,
++ "cluster id: %s\n"
++ "my name: %s\n", gulm_cm.clusterID, gulm_cm.myName);
++
++ *eof = TRUE;
++ if (off >= count)
++ return 0;
++ *start = buf + off;
++ return (count - off);
++}
++
++int
++gulm_lt_proc_read (char *buf, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ lock_table_t *lt = (lock_table_t *) data;
++ count = 0; /* ignore how much it wants */
++
++ count += sprintf (buf + count, "\n"
++ "lock counts:\n"
++ " total: %d\n"
++ " unl: %d\n"
++ " exl: %d\n"
++ " shd: %d\n"
++ " dfr: %d\n"
++ "pending: %d\n"
++ " lvbs: %d\n"
++ " lops: %d\n\n",
++ lt->locks_total,
++ lt->locks_unl,
++ lt->locks_exl,
++ lt->locks_shd,
++ lt->locks_dfr,
++ atomic_read (&lt->locks_pending),
++ lt->locks_lvbs, lt->lops);
++
++ *eof = TRUE;
++ if (off >= count)
++ return 0;
++ *start = buf + off;
++ return (count - off);
++}
++
++/* add entry to our proc folder
++ * call this on mount.
++ * */
++int
++add_to_proc (gulm_fs_t * fs)
++{
++ if (!(create_proc_read_entry (fs->fs_name, S_IFREG | S_IRUGO,
++ gulm_fs_proc_dir, gulm_fs_proc_read,
++ (void *) fs))) {
++ log_err ("couldn't register proc entry for %s\n", fs->fs_name);
++ return -EINVAL;
++ }
++ return 0;
++}
++
++/* get rid of it
++ * call this on umount.
++ * */
++void
++remove_from_proc (gulm_fs_t * fs)
++{
++ remove_proc_entry (fs->fs_name, gulm_fs_proc_dir);
++}
++
++/* create our own root dir.
++ * call this on module init.
++ * */
++int
++init_proc_dir (void)
++{
++ if ((gulm_proc_dir = proc_mkdir ("gulm", &proc_root)) == NULL) {
++ log_err ("cannot create the gulm directory in /proc\n");
++ return -EINVAL;
++ }
++ if (!(create_proc_read_entry ("core", S_IFREG | S_IRUGO, gulm_proc_dir,
++ gulm_core_proc_read, NULL))) {
++ log_err ("couldn't register proc entry for core\n");
++ remove_proc_entry ("gulm", &proc_root);
++ return -EINVAL;
++ }
++ if ((gulm_fs_proc_dir =
++ proc_mkdir ("filesystems", gulm_proc_dir)) == NULL) {
++ log_err
++ ("cannot create the filesystems directory in /proc/gulm\n");
++ remove_proc_entry ("core", gulm_proc_dir);
++ remove_proc_entry ("gulm", &proc_root);
++ return -EINVAL;
++ }
++ if (!(create_proc_read_entry ("lockspace", S_IFREG | S_IRUGO,
++ gulm_proc_dir, gulm_lt_proc_read,
++ (void *) &gulm_cm.ltpx))) {
++ remove_proc_entry ("filesystems", gulm_proc_dir);
++ remove_proc_entry ("core", gulm_proc_dir);
++ remove_proc_entry ("gulm", &proc_root);
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++/* destroy it
++ * close module
++ * */
++void
++remove_proc_dir (void)
++{
++ remove_proc_entry ("lockspace", gulm_proc_dir);
++ remove_proc_entry ("filesystems", gulm_proc_dir);
++ remove_proc_entry ("core", gulm_proc_dir);
++ remove_proc_entry ("gulm", &proc_root);
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/gulm_procinfo.h linux-patched/fs/gfs_locking/lock_gulm/gulm_procinfo.h
+--- linux-orig/fs/gfs_locking/lock_gulm/gulm_procinfo.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/gulm_procinfo.h 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __procinfo_h__
++#define __procinfo_h__
++int add_to_proc (gulm_fs_t * fs);
++void remove_from_proc (gulm_fs_t * fs);
++void remove_locktables_from_proc (void);
++void add_locktables_to_proc (void);
++int init_proc_dir (void);
++void remove_proc_dir (void);
++#endif /*__procinfo_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/handler.c linux-patched/fs/gfs_locking/lock_gulm/handler.c
+--- linux-orig/fs/gfs_locking/lock_gulm/handler.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/handler.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,343 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/smp_lock.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include "handler.h"
++
++/* things about myself
++ * mostly just for verbosity here.
++ * */
++extern gulm_cm_t gulm_cm;
++
++/* the task struct */
++typedef struct runtask_s {
++ struct list_head rt_list;
++
++ gulm_fn fn;
++ lm_callback_t cb;
++ lm_fsdata_t *fsdata;
++ int type;
++ uint64_t lmnum;
++ unsigned int lmtype;
++ int result;
++
++} runtask_t;
++/* ooo crufty. */
++#define LM_CB_GULM_FN 169
++#if LM_CB_GULM_FN == LM_CB_NEED_E || \
++ LM_CB_GULM_FN == LM_CB_NEED_D || \
++ LM_CB_GULM_FN == LM_CB_NEED_S || \
++ LM_CB_GULM_FN == LM_CB_NEED_RECOVERY || \
++ LM_CB_GULM_FN == LM_CB_DROPLOCKS || \
++ LM_CB_GULM_FN == LM_CB_ASYNC
++#error "LM_CB_GULM_FN collision with other LM_CB_*"
++#endif
++
++static __inline__ int
++queue_empty (callback_qu_t * cq)
++{
++ int ret;
++ spin_lock (&cq->list_lock);
++ ret = list_empty (&cq->run_tasks);
++ spin_unlock (&cq->list_lock);
++ return ret;
++}
++
++/**
++ * handler -
++ * @d:
++ *
++ *
++ * Returns: int
++ */
++int
++handler (void *d)
++{
++ callback_qu_t *cq = (callback_qu_t *) d;
++ runtask_t *rt;
++ struct list_head *tmp;
++ struct lm_lockname lockname;
++ struct lm_async_cb acb;
++
++ daemonize ("gulm_Cb_Handler");
++ atomic_inc (&cq->num_threads);
++ complete (&cq->startup);
++
++ while (cq->running) {
++ do {
++ DECLARE_WAITQUEUE (__wait_chan, current);
++ current->state = TASK_INTERRUPTIBLE;
++ add_wait_queue (&cq->waiter, &__wait_chan);
++ if (queue_empty (cq))
++ schedule ();
++ remove_wait_queue (&cq->waiter, &__wait_chan);
++ current->state = TASK_RUNNING;
++ } while (0);
++
++ if (!cq->running)
++ break;
++ /* remove item from list */
++ spin_lock (&cq->list_lock);
++ if (list_empty (&cq->run_tasks)) {
++ spin_unlock (&cq->list_lock);
++ continue; /* nothing here. move on */
++ }
++ /* take items off the end of the list, since we add them to the
++ * beginning.
++ */
++ tmp = (&cq->run_tasks)->prev;
++ list_del (tmp);
++ cq->task_count--;
++ spin_unlock (&cq->list_lock);
++
++ rt = list_entry (tmp, runtask_t, rt_list);
++
++ if (rt->type == LM_CB_ASYNC) {
++ acb.lc_name.ln_number = rt->lmnum;
++ acb.lc_name.ln_type = rt->lmtype;
++ acb.lc_ret = rt->result;
++ rt->cb (rt->fsdata, rt->type, &acb);
++ } else if (rt->type == LM_CB_GULM_FN) {
++ rt->fn (rt->fsdata);
++ } else {
++ lockname.ln_number = rt->lmnum;
++ lockname.ln_type = rt->lmtype;
++ rt->cb (rt->fsdata, rt->type, &lockname);
++ }
++
++ kfree (rt);
++
++ } /*while(running) */
++
++ atomic_dec (&cq->num_threads);
++ complete (&cq->startup);
++ return 0;
++}
++
++/**
++ * display_handler_queue -
++ * @cq:
++ *
++ * remember, items are added to the head, and removed from the tail.
++ * So the last item listed, is the next item to be handled.
++ *
++ */
++void
++display_handler_queue (callback_qu_t * cq)
++{
++ struct list_head *lltmp;
++ runtask_t *rt;
++ int i = 0;
++ log_msg (lgm_Always, "Dumping Handler queue with %d items, max %d\n",
++ cq->task_count, cq->task_max);
++ spin_lock (&cq->list_lock);
++ list_for_each (lltmp, &cq->run_tasks) {
++ rt = list_entry (lltmp, runtask_t, rt_list);
++ if (rt->type == LM_CB_ASYNC) {
++ log_msg (lgm_Always,
++ "%4d ASYNC (%" PRIu64 ", %u) result:%#x\n",
++ i, rt->lmnum, rt->lmtype, rt->result);
++ } else if (rt->type == LM_CB_GULM_FN) {
++ log_msg (lgm_Always, "%4d GULM FN func:%p data:%p\n",
++ i, rt->fn, rt->fsdata);
++ } else { /* callback. */
++ log_msg (lgm_Always,
++ "%4d CALLBACK req:%u (%" PRIu64 ", %u)\n", i,
++ rt->type, rt->lmnum, rt->lmtype);
++ }
++ i++;
++ }
++ spin_unlock (&cq->list_lock);
++}
++
++/**
++ * alloc_runtask -
++ * Returns: runtask_t
++ */
++runtask_t *
++alloc_runtask (void)
++{
++ runtask_t *rt;
++ rt = kmalloc (sizeof (runtask_t), GFP_KERNEL);
++ return rt;
++}
++
++/**
++ * qu_function_call -
++ * @cq:
++ * @fn:
++ * @data:
++ *
++ * Generic function execing on the handler thread. Mostly so I can add
++ * single things quick without having to build all the details into the
++ * handler queues.
++ *
++ * Returns: int
++ */
++int
++qu_function_call (callback_qu_t * cq, gulm_fn fn, void *data)
++{
++ runtask_t *rt;
++ rt = alloc_runtask ();
++ if (rt == NULL)
++ return -ENOMEM;
++ rt->cb = NULL;
++ rt->fn = fn;
++ rt->fsdata = data;
++ rt->type = LM_CB_GULM_FN;
++ rt->lmtype = 0;
++ rt->lmnum = 0;
++ rt->result = 0;
++ INIT_LIST_HEAD (&rt->rt_list);
++ spin_lock (&cq->list_lock);
++ list_add (&rt->rt_list, &cq->run_tasks);
++ cq->task_count++;
++ if (cq->task_count > cq->task_max)
++ cq->task_max = cq->task_count;
++ spin_unlock (&cq->list_lock);
++ wake_up (&cq->waiter);
++ return 0;
++}
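++
++/* Usage sketch (illustration): defer work to a handler thread instead
++ * of running it in the current context, as the lock timeout code does
++ * when it cannot call gulm_cancel() from the timer directly:
++ *
++ * qu_function_call (&lck->fs->cq, gulm_cancel, lck);
++ */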
++
++/**
++ * qu_async_rpl -
++ * @cq:
++ * @cb:
++ * @fsdata:
++ * @lockname:
++ * @result:
++ *
++ *
++ * Returns: int
++ */
++int
++qu_async_rpl (callback_qu_t * cq, lm_callback_t cb, lm_fsdata_t * fsdata,
++ struct lm_lockname *lockname, int result)
++{
++ runtask_t *rt;
++ rt = alloc_runtask ();
++ if (rt == NULL)
++ return -ENOMEM;
++ rt->cb = cb;
++ rt->fsdata = fsdata;
++ rt->type = LM_CB_ASYNC;
++ rt->lmtype = lockname->ln_type;
++ rt->lmnum = lockname->ln_number;
++ rt->result = result;
++ INIT_LIST_HEAD (&rt->rt_list);
++ spin_lock (&cq->list_lock);
++ list_add (&rt->rt_list, &cq->run_tasks);
++ cq->task_count++;
++ if (cq->task_count > cq->task_max)
++ cq->task_max = cq->task_count;
++ spin_unlock (&cq->list_lock);
++ wake_up (&cq->waiter);
++ return 0;
++}
++
++/**
++ * qu_drop_req -
++ *
++ * Returns: <0:Error; =0:Ok
++ */
++int
++qu_drop_req (callback_qu_t * cq, lm_callback_t cb, lm_fsdata_t * fsdata,
++ int type, uint8_t lmtype, uint64_t lmnum)
++{
++ runtask_t *rt;
++ rt = alloc_runtask ();
++ if (rt == NULL)
++ return -ENOMEM;
++ rt->cb = cb;
++ rt->fsdata = fsdata;
++ rt->type = type;
++ rt->lmtype = lmtype;
++ rt->lmnum = lmnum;
++ rt->result = 0;
++ INIT_LIST_HEAD (&rt->rt_list);
++ spin_lock (&cq->list_lock);
++ list_add (&rt->rt_list, &cq->run_tasks);
++ cq->task_count++;
++ if (cq->task_count > cq->task_max)
++ cq->task_max = cq->task_count;
++ spin_unlock (&cq->list_lock);
++ wake_up (&cq->waiter);
++ return 0;
++}
++
++/**
++ * stop_callback_qu - stop the handler thread
++ */
++void
++stop_callback_qu (callback_qu_t * cq)
++{
++ struct list_head *lltmp, *tmp;
++ runtask_t *rt;
++
++ if (cq->running) {
++ cq->running = FALSE;
++ /* make sure all thread stop.
++ * */
++ while (atomic_read (&cq->num_threads) > 0) {
++ wake_up (&cq->waiter);
++ wait_for_completion (&cq->startup);
++ }
++ /* clear out any left overs. */
++ list_for_each_safe (tmp, lltmp, &cq->run_tasks) {
++ rt = list_entry (tmp, runtask_t, rt_list);
++ list_del (tmp);
++ kfree (rt);
++ }
++ }
++}
++
++/**
++ * start_callback_qu -
++ *
++ * Returns: <0:Error, >=0:Ok
++ */
++int
++start_callback_qu (callback_qu_t * cq, int cnt)
++{
++ int err;
++ INIT_LIST_HEAD (&cq->run_tasks);
++ spin_lock_init (&cq->list_lock);
++ init_completion (&cq->startup);
++ init_waitqueue_head (&cq->waiter);
++ atomic_set (&cq->num_threads, 0);
++ cq->running = TRUE;
++ cq->task_count = 0;
++ cq->task_max = 0;
++ if (cnt <= 0)
++ cnt = 2;
++ for (; cnt > 0; cnt--) {
++ err = kernel_thread (handler, cq, 0); /* XXX linux part */
++ if (err < 0) {
++ stop_callback_qu (cq);
++ /* calling stop here might not behave correctly in all error
++ * cases.
++ */
++ return err;
++ }
++ wait_for_completion (&cq->startup);
++ }
++ return 0;
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/handler.h linux-patched/fs/gfs_locking/lock_gulm/handler.h
+--- linux-orig/fs/gfs_locking/lock_gulm/handler.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/handler.h 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,42 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __handler_h__
++#define __handler_h__
++#include <linux/lm_interface.h>
++
++struct callback_qu_s {
++ struct completion startup;
++ int running;
++ int task_count;
++ int task_max;
++ struct list_head run_tasks;
++ spinlock_t list_lock;
++ wait_queue_head_t waiter;
++ atomic_t num_threads;
++};
++typedef struct callback_qu_s callback_qu_t;
++
++/* kind of an excessive overloading */
++typedef void (*gulm_fn) (void *);
++int qu_function_call (callback_qu_t * cq, gulm_fn fn, void *data);
++
++int qu_async_rpl (callback_qu_t * cq, lm_callback_t cb, lm_fsdata_t * fsdata,
++ struct lm_lockname *lockname, int result);
++int qu_drop_req (callback_qu_t * cq, lm_callback_t cb, lm_fsdata_t * fsdata,
++ int type, uint8_t lmtype, uint64_t lmnum);
++int start_callback_qu (callback_qu_t * cq, int cnt);
++void stop_callback_qu (callback_qu_t * cq);
++void display_handler_queue (callback_qu_t * cq);
++
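++/* A rough usage sketch of this queue (illustrative only; my_cb,
++ * my_fsdata and lockname are hypothetical stand-ins, not part of
++ * this patch):
++ *
++ *	callback_qu_t cq;
++ *	if (start_callback_qu (&cq, 2) == 0) {
++ *		qu_async_rpl (&cq, my_cb, my_fsdata, &lockname, 0);
++ *		stop_callback_qu (&cq);
++ *	}
++ */
++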
++#endif /*__handler_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/lg_core.c linux-patched/fs/gfs_locking/lock_gulm/lg_core.c
+--- linux-orig/fs/gfs_locking/lock_gulm/lg_core.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/lg_core.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,736 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* All of the core related functions for services are here. */
++
++#include "lg_priv.h"
++
++/**
++ * lg_core_selector - get the socket used to talk to core
++ * @lgp: the gulm interface handle
++ *
++ *
++ * Returns: xdr_socket
++ */
++xdr_socket
++lg_core_selector (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL || lg->first_magic != LGMAGIC
++ || lg->last_magic != LGMAGIC)
++#ifdef __KERNEL__
++ return NULL;
++#else
++ return -EINVAL;
++#endif
++
++ return lg->core_fd;
++}
++
++/**
++ * lg_core_handle_messages - receive and dispatch one message from core
++ * @lgp: the gulm interface handle
++ * @ccbp: callbacks to dispatch into
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_handle_messages (gulm_interface_p lgp, lg_core_callbacks_t * ccbp,
++ void *misc)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_dec_t *dec;
++ int err = 0;
++ uint64_t x_gen;
++ uint32_t x_code, x_error, x_rank;
++ struct in6_addr x_ip;
++ uint8_t x_state, x_mode;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EBADR;
++
++ down (&lg->core_recver);
++	if (lg->in_core_hm) {
++		up (&lg->core_recver);
++		return -EDEADLK;
++	}
++ lg->in_core_hm = TRUE;
++ up (&lg->core_recver);
++
++ dec = lg->core_dec;
++
++ err = xdr_dec_uint32 (dec, &x_code);
++ if (err != 0)
++ goto exit;
++
++ if (gulm_core_login_rpl == x_code) {
++ do {
++ if ((err = xdr_dec_uint64 (dec, &x_gen)) < 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_error)) < 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_rank)) < 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) < 0)
++ break;
++ } while (0);
++ if (err != 0)
++ goto exit;
++ if (ccbp->login_reply == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->login_reply (misc, x_gen, x_error, x_rank, x_state);
++ goto exit;
++ } else if (gulm_core_logout_rpl == x_code) {
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ goto exit;
++ if (ccbp->logout_reply != NULL) {
++ err = ccbp->logout_reply (misc);
++ }
++
++ xdr_close (&lg->core_fd);
++ xdr_enc_release (lg->core_enc);
++ lg->core_enc = NULL;
++ xdr_dec_release (lg->core_dec);
++ lg->core_dec = NULL;
++
++ goto exit;
++ } else if (gulm_core_mbr_lstrpl == x_code) {
++ if (ccbp->nodelist != NULL) {
++ err = ccbp->nodelist (misc, lglcb_start, NULL, 0, 0);
++ if (err != 0)
++ goto exit;
++ }
++ do {
++ if ((err = xdr_dec_list_start (dec)) != 0)
++ break;
++ while (xdr_dec_list_stop (dec) != 0) {
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfba,
++ &lg->cfba_len)) != 0)
++ break;
++ if ((err = xdr_dec_ipv6 (dec, &x_ip)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_mode)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_mode)) != 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_rank)) != 0)
++ break;
++ if ((err = xdr_dec_uint64 (dec, &x_gen)) != 0)
++ break;
++ if ((err = xdr_dec_uint64 (dec, &x_gen)) != 0)
++ break;
++ if ((err = xdr_dec_uint64 (dec, &x_gen)) != 0)
++ break;
++
++ if (ccbp->nodelist != NULL) {
++ err =
++ ccbp->nodelist (misc, lglcb_item,
++ lg->cfba, &x_ip,
++ x_state);
++ if (err != 0)
++ goto exit;
++ }
++
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (ccbp->nodelist == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->nodelist (misc, lglcb_stop, NULL, 0, 0);
++ goto exit;
++ } else if (gulm_core_state_chgs == x_code) {
++ do {
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ if (x_state == gio_Mbr_ama_Slave) {
++ if ((err = xdr_dec_ipv6 (dec, &x_ip)) != 0)
++ break;
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfba,
++ &lg->cfba_len)) != 0)
++ break;
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (ccbp->statechange == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->statechange (misc, x_state, &x_ip, lg->cfba);
++ goto exit;
++ } else if (gulm_core_mbr_updt == x_code) {
++ do {
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfba,
++ &lg->cfba_len)) != 0)
++ break;
++ if ((err = xdr_dec_ipv6 (dec, &x_ip)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (ccbp->nodechange == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->nodechange (misc, lg->cfba, &x_ip, x_state);
++ goto exit;
++ } else if (gulm_core_res_list == x_code) {
++ if (ccbp->service_list != NULL) {
++ if ((err =
++ ccbp->service_list (misc, lglcb_start, NULL)) != 0)
++ goto exit;
++ }
++ do {
++ if ((err = xdr_dec_list_start (dec)) != 0)
++ break;
++ while (xdr_dec_list_stop (dec)) {
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfba,
++ &lg->cfba_len)) != 0)
++ break;
++ if (ccbp->service_list != NULL) {
++ if ((err =
++ ccbp->service_list (misc,
++ lglcb_item,
++ lg->cfba)) !=
++ 0) {
++ goto exit;
++ }
++ }
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (ccbp->service_list == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->service_list (misc, lglcb_stop, NULL);
++ goto exit;
++ } else if (gulm_info_stats_rpl == x_code) {
++ if (ccbp->status != NULL) {
++ if ((err =
++ ccbp->status (misc, lglcb_start, NULL, NULL)) != 0)
++ goto exit;
++ }
++ do {
++ if ((err = xdr_dec_list_start (dec)) != 0)
++ break;
++ while (xdr_dec_list_stop (dec) != 0) {
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfba,
++ &lg->cfba_len)) != 0)
++ break;
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->cfbb,
++ &lg->cfbb_len)) != 0)
++ break;
++ if (ccbp->status != NULL) {
++ if ((err =
++ ccbp->status (misc, lglcb_item,
++ lg->cfba,
++ lg->cfbb)) != 0) {
++ goto exit;
++ }
++ }
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (ccbp->status == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->status (misc, lglcb_stop, NULL, NULL);
++ goto exit;
++ } else if (gulm_err_reply == x_code) {
++ if ((err = xdr_dec_uint32 (dec, &x_code)) != 0)
++ goto exit;
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ goto exit;
++ if (ccbp->error == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = ccbp->error (misc, x_error);
++ goto exit;
++ } else {
++ /* unknown code. what to do? */
++ err = -EPROTO;
++ goto exit;
++ }
++
++ exit:
++ lg->in_core_hm = FALSE;
++ return err;
++}
++
++/**
++ * lg_core_login -
++ * @lgp:
++ * @important:
++ *
++ * On any error, things are closed and released to the state of things
++ * before you called login.
++ *
++ * Returns: int
++ */
++int
++lg_core_login (gulm_interface_p lgp, int important)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct sockaddr_in6 adr;
++ int err;
++ xdr_socket cfd;
++ xdr_enc_t *enc;
++ xdr_dec_t *dec;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ adr.sin6_family = AF_INET6;
++ adr.sin6_addr = in6addr_loopback;
++ adr.sin6_port = htons (lg->core_port);
++
++ if ((err = xdr_open (&cfd)) < 0) {
++ return err;
++ }
++
++ if ((err = xdr_connect (&adr, cfd)) < 0) {
++ xdr_close (&cfd);
++ return err;
++ }
++
++ enc = xdr_enc_init (cfd, 128);
++ if (enc == NULL) {
++ xdr_close (&cfd);
++ return -ENOMEM;
++ }
++
++ dec = xdr_dec_init (cfd, 128);
++	if (dec == NULL) {
++ xdr_enc_release (enc);
++ xdr_close (&cfd);
++ return -ENOMEM;
++ }
++
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_reslgn_req)) < 0)
++ break;
++ if ((err = xdr_enc_uint32 (enc, GIO_WIREPROT_VERS)) < 0)
++ break;
++ if ((err = xdr_enc_string (enc, lg->clusterID)) < 0)
++ break;
++ if ((err = xdr_enc_string (enc, lg->service_name)) < 0)
++ break;
++ if ((err =
++ xdr_enc_uint32 (enc,
++ important ? gulm_svc_opt_important : 0)) !=
++ 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) < 0)
++ break;
++ } while (0);
++ if (err != 0) {
++ xdr_dec_release (dec);
++ xdr_enc_release (enc);
++ xdr_close (&cfd);
++ return err;
++ }
++
++ down (&lg->core_sender);
++ lg->core_fd = cfd;
++ lg->core_enc = enc;
++ lg->core_dec = dec;
++ up (&lg->core_sender);
++
++ return 0;
++}
++
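++/* Typical caller flow, as a sketch (my_core_cbs and the running flag
++ * are hypothetical stand-ins, not defined in this patch):
++ *
++ *	gulm_interface_p gi;
++ *	if (lg_initialize (&gi, "mycluster", "myservice") == 0 &&
++ *	    lg_core_login (gi, 0) == 0)
++ *		while (running)
++ *			lg_core_handle_messages (gi, &my_core_cbs, NULL);
++ */
++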
++/**
++ * lg_core_logout -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_logout (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_logout_req)) != 0)
++ break;
++ if ((err = xdr_enc_string (enc, lg->service_name)) != 0)
++ break;
++ if ((err = xdr_enc_uint8 (enc, gio_Mbr_ama_Resource)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_nodeinfo -
++ * @lgp:
++ * @nodename:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_nodeinfo (gulm_interface_p lgp, char *nodename)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ if (nodename == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_mbr_req)) != 0)
++ break;
++ if ((err = xdr_enc_string (enc, nodename)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_nodelist -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_nodelist (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_mbr_lstreq)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_servicelist -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_servicelist (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_res_req)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_corestate -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_corestate (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_state_req)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_shutdown -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_shutdown (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_shutdown)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_forceexpire -
++ * @lgp:
++ * @node_name:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_forceexpire (gulm_interface_p lgp, char *nodename)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ if (nodename == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_mbr_force)) != 0)
++ break;
++ if ((err = xdr_enc_string (enc, nodename)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_forcepending -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_forcepending (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_core_forcepend)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
++
++/**
++ * lg_core_status -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_core_status (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->core_fd < 0 || lg->core_enc == NULL || lg->core_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->core_enc;
++
++ down (&lg->core_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_info_stats_req)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->core_sender);
++ return err;
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/lg_lock.c linux-patched/fs/gfs_locking/lock_gulm/lg_lock.c
+--- linux-orig/fs/gfs_locking/lock_gulm/lg_lock.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/lg_lock.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,680 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* All of the lock related functions are here. */
++#include "lg_priv.h"
++
++/**
++ * lg_lock_selector - get the socket used to talk to the lock proxy
++ * @lgp: the gulm interface handle
++ *
++ *
++ * Returns: xdr_socket
++ */
++xdr_socket
++lg_lock_selector (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL || lg->first_magic != LGMAGIC
++ || lg->last_magic != LGMAGIC)
++#ifdef __KERNEL__
++ return NULL;
++#else
++ return -EINVAL;
++#endif
++
++ return lg->lock_fd;
++}
++
++/**
++ * lg_lock_handle_messages - receive and dispatch one lockspace message
++ * @lgp: the gulm interface handle
++ * @cbp: callbacks to dispatch into
++ *
++ * Returns: int
++ */
++int
++lg_lock_handle_messages (gulm_interface_p lgp, lg_lockspace_callbacks_t * cbp,
++ void *misc)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_dec_t *dec;
++ int err = 0;
++ uint32_t x_code, x_error, x_flags;
++ uint16_t x_keylen, x_lvblen = 0;
++ uint8_t x_state;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++	if (lg->lock_enc == NULL || lg->lock_dec == NULL)
++		return -EBADR;
++
++ down (&lg->lock_recver);
++	if (lg->in_lock_hm) {
++		up (&lg->lock_recver);
++		return -EDEADLK;
++	}
++ lg->in_lock_hm = TRUE;
++ up (&lg->lock_recver);
++
++ dec = lg->lock_dec;
++
++ err = xdr_dec_uint32 (dec, &x_code);
++ if (err != 0)
++ goto exit;
++
++ if (gulm_lock_login_rpl == x_code) {
++ do {
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ } while (0);
++ if (err != 0)
++ goto exit;
++ if (cbp->login_reply == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = cbp->login_reply (misc, x_error, x_state);
++ goto exit;
++ } else if (gulm_lock_logout_rpl == x_code) {
++ if (cbp->logout_reply != NULL) {
++ err = cbp->logout_reply (misc);
++ }
++
++ xdr_close (&lg->lock_fd);
++ xdr_enc_release (lg->lock_enc);
++ lg->lock_enc = NULL;
++ xdr_dec_release (lg->lock_dec);
++ lg->lock_dec = NULL;
++
++ goto exit;
++ } else if (gulm_lock_state_rpl == x_code) {
++ do {
++ if ((err =
++ xdr_dec_raw_ag (dec, (void **) &lg->lfba,
++ &lg->lfba_len, &x_keylen)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_flags)) != 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ break;
++ if (x_flags & gio_lck_fg_hasLVB) {
++ if ((err =
++ xdr_dec_raw_ag (dec, (void **) &lg->lfbb,
++ &lg->lfbb_len,
++ &x_lvblen)) != 0)
++ break;
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (x_keylen <= 4) {
++ err = -EPROTO; /* or something */
++ goto exit;
++ }
++ if (cbp->lock_state == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = cbp->lock_state (misc, &lg->lfba[4], x_keylen - 4,
++ x_state, x_flags, x_error,
++ lg->lfbb, x_lvblen);
++ goto exit;
++ } else if (gulm_lock_action_rpl == x_code) {
++ do {
++ if ((err =
++ xdr_dec_raw_ag (dec, (void **) &lg->lfba,
++ &lg->lfba_len, &x_keylen)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ break;
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (x_keylen <= 4) {
++ err = -EPROTO; /* or something */
++ goto exit;
++ }
++ if (cbp->lock_action == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err =
++ cbp->lock_action (misc, &lg->lfba[4], x_keylen - 4, x_state,
++ x_error);
++ goto exit;
++ } else if (gulm_lock_cb_state == x_code) {
++ do {
++ if ((err =
++ xdr_dec_raw_ag (dec, (void **) &lg->lfba,
++ &lg->lfba_len, &x_keylen)) != 0)
++ break;
++ if ((err = xdr_dec_uint8 (dec, &x_state)) != 0)
++ break;
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (cbp->drop_lock_req == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err =
++ cbp->drop_lock_req (misc, &lg->lfba[4], x_keylen - 4,
++ x_state);
++ goto exit;
++ } else if (gulm_lock_cb_dropall == x_code) {
++ if (cbp->drop_all == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = cbp->drop_all (misc);
++ goto exit;
++ } else if (gulm_info_stats_rpl == x_code) {
++ if (cbp->status != NULL) {
++ if ((err =
++ cbp->status (misc, lglcb_start, NULL, NULL)) != 0)
++ goto exit;
++ }
++ do {
++ if ((err = xdr_dec_list_start (dec)) != 0)
++ break;
++ while (xdr_dec_list_stop (dec) != 0) {
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->lfba,
++ &lg->lfba_len)) != 0)
++ break;
++ if ((err =
++ xdr_dec_string_ag (dec, &lg->lfbb,
++ &lg->lfbb_len)) != 0)
++ break;
++ if (cbp->status != NULL) {
++ if ((err =
++ cbp->status (misc, lglcb_item,
++ lg->lfba,
++ lg->lfbb)) != 0) {
++ break;
++ }
++ }
++ }
++ } while (0);
++ if (err != 0) {
++ goto exit;
++ }
++ if (cbp->status == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = cbp->status (misc, lglcb_stop, NULL, NULL);
++ goto exit;
++ } else if (gulm_err_reply == x_code) {
++ do {
++ if ((err = xdr_dec_uint32 (dec, &x_code)) != 0)
++ break;
++ if ((err = xdr_dec_uint32 (dec, &x_error)) != 0)
++ break;
++ } while (0);
++ if (err != 0)
++ goto exit;
++ if (cbp->error == NULL) {
++ err = 0;
++ goto exit;
++ }
++ err = cbp->error (misc, x_error);
++ goto exit;
++ } else {
++ err = -EPROTO;
++ goto exit;
++ }
++
++ exit:
++ lg->in_lock_hm = FALSE;
++ return err;
++}
++
++/**
++ * lg_lock_login - connect to the lock proxy and select a lockspace
++ * @lgp: the gulm interface handle
++ * @lockspace: four byte lockspace identifier
++ *
++ *
++ * Returns: int
++ */
++int
++lg_lock_login (gulm_interface_p lgp, uint8_t lockspace[4])
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct sockaddr_in6 adr;
++ int err;
++ xdr_socket cfd;
++ xdr_enc_t *enc;
++ xdr_dec_t *dec;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ adr.sin6_family = AF_INET6;
++ adr.sin6_addr = in6addr_loopback;
++ adr.sin6_port = htons (lg->lock_port);
++
++ if ((err = xdr_open (&cfd)) < 0) {
++ return err;
++ }
++
++ if ((err = xdr_connect (&adr, cfd)) < 0) {
++ xdr_close (&cfd);
++ return err;
++ }
++
++ enc = xdr_enc_init (cfd, 512);
++ if (enc == NULL) {
++ xdr_close (&cfd);
++ return -ENOMEM;
++ }
++
++ dec = xdr_dec_init (cfd, 512);
++	if (dec == NULL) {
++ xdr_enc_release (enc);
++ xdr_close (&cfd);
++ return -ENOMEM;
++ }
++
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_login_req)) < 0)
++ break;
++ if ((err = xdr_enc_uint32 (enc, GIO_WIREPROT_VERS)) < 0)
++ break;
++ if ((err = xdr_enc_string (enc, lg->service_name)) < 0)
++ break;
++ if ((err = xdr_enc_uint8 (enc, gio_lck_st_Client)) < 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) < 0)
++ break;
++
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_sel_lckspc)) < 0)
++ break;
++ if ((err = xdr_enc_raw (enc, lockspace, 4)) < 0)
++ break;
++ /* don't flush here.
++ * dumb programmer stunt. This way, the lockspace selection won't
++ * happen until the next thing the user of this lib sends. Which
++ * means it will be after we have received the login reply.
++ *
++ * Is there really a good reason not to flush here?
++ */
++ } while (0);
++ if (err != 0) {
++ xdr_dec_release (dec);
++ xdr_enc_release (enc);
++ xdr_close (&cfd);
++ return err;
++ }
++
++ down (&lg->lock_sender);
++ lg->lock_fd = cfd;
++ lg->lock_enc = enc;
++ lg->lock_dec = dec;
++
++ memcpy (lg->lockspace, lockspace, 4);
++ up (&lg->lock_sender);
++
++ return 0;
++}
++
++/**
++ * lg_lock_logout -
++ * @lgp: the gulm interface handle
++ *
++ *
++ * Returns: int
++ */
++int
++lg_lock_logout (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->lock_enc;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_logout_req)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
++
++/**
++ * lg_lock_state_req -
++ * @lgp:
++ * @key:
++ * @keylen:
++ * @state:
++ * @flags:
++ * @LVB:
++ * @LVBlen:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_lock_state_req (gulm_interface_p lgp, uint8_t * key, uint16_t keylen,
++ uint8_t state, uint32_t flags, uint8_t * LVB,
++ uint16_t LVBlen)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct iovec iov[2];
++ xdr_enc_t *enc;
++ uint32_t iflgs = 0;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ if (state != lg_lock_state_Unlock &&
++ state != lg_lock_state_Exclusive &&
++ state != lg_lock_state_Deferred && state != lg_lock_state_Shared)
++ return -EINVAL;
++
++ /* make sure only the accepted flags get passed through. */
++ if ((flags & lg_lock_flag_DoCB) == lg_lock_flag_DoCB)
++ iflgs |= lg_lock_flag_DoCB;
++ if ((flags & lg_lock_flag_Try) == lg_lock_flag_Try)
++ iflgs |= lg_lock_flag_Try;
++ if ((flags & lg_lock_flag_Any) == lg_lock_flag_Any)
++ iflgs |= lg_lock_flag_Any;
++ if ((flags & lg_lock_flag_IgnoreExp) == lg_lock_flag_IgnoreExp)
++ iflgs |= lg_lock_flag_IgnoreExp;
++ if ((flags & lg_lock_flag_Piority) == lg_lock_flag_Piority)
++ iflgs |= lg_lock_flag_Piority;
++
++ enc = lg->lock_enc;
++
++ if (LVB != NULL && LVBlen > 0)
++ iflgs |= gio_lck_fg_hasLVB;
++
++ iov[0].iov_base = lg->lockspace;
++ iov[0].iov_len = 4;
++ iov[1].iov_base = key;
++ iov[1].iov_len = keylen;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_state_req)) != 0)
++ break;
++ if ((err = xdr_enc_raw_iov (enc, 2, iov)) != 0)
++ break;
++ if ((err = xdr_enc_uint8 (enc, state)) != 0)
++ break;
++ if ((err = xdr_enc_uint32 (enc, iflgs)) != 0)
++ break;
++ if (iflgs & gio_lck_fg_hasLVB)
++ if ((err = xdr_enc_raw (enc, LVB, LVBlen)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
++
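++/* An example request, as a sketch ("mykey" stands in for a real lock
++ * key; gi is an already logged-in gulm_interface_p):
++ *
++ *	lg_lock_state_req (gi, (uint8_t *) "mykey", 5,
++ *			   lg_lock_state_Exclusive, lg_lock_flag_Try,
++ *			   NULL, 0);
++ *
++ * The grant or failure arrives asynchronously through the lock_state
++ * callback handed to lg_lock_handle_messages().
++ */
++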
++/**
++ * lg_lock_cancel_req -
++ * @lgp:
++ * @key:
++ * @keylen:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_lock_cancel_req (gulm_interface_p lgp, uint8_t * key, uint16_t keylen)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct iovec iov[2];
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->lock_enc;
++
++ iov[0].iov_base = lg->lockspace;
++ iov[0].iov_len = 4;
++ iov[1].iov_base = key;
++ iov[1].iov_len = keylen;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_action_req)) != 0)
++ break;
++ if ((err = xdr_enc_raw_iov (enc, 2, iov)) != 0)
++ break;
++ if ((err = xdr_enc_uint8 (enc, gio_lck_st_Cancel)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
++
++/**
++ * lg_lock_action_req -
++ * @lgp:
++ * @key:
++ * @keylen:
++ * @action:
++ * @LVB:
++ * @LVBlen:
++ *
++ * XXX
++ * I wonder if I should actually break this into three seperate calls for
++ * the lvb stuff. Does it really matter?
++ *
++ * Returns: int
++ */
++int
++lg_lock_action_req (gulm_interface_p lgp, uint8_t * key, uint16_t keylen,
++ uint8_t action, uint8_t * LVB, uint16_t LVBlen)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct iovec iov[2];
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ if (action != lg_lock_act_HoldLVB &&
++ action != lg_lock_act_UnHoldLVB && action != lg_lock_act_SyncLVB)
++ return -EINVAL;
++
++ enc = lg->lock_enc;
++
++ iov[0].iov_base = lg->lockspace;
++ iov[0].iov_len = 4;
++ iov[1].iov_base = key;
++ iov[1].iov_len = keylen;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_action_req)) != 0)
++ break;
++ if ((err = xdr_enc_raw_iov (enc, 2, iov)) != 0)
++ break;
++ if ((err = xdr_enc_uint8 (enc, action)) != 0)
++ break;
++ if (action == gio_lck_st_SyncLVB)
++ if ((err = xdr_enc_raw (enc, LVB, LVBlen)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
++
++/**
++ * lg_lock_drop_exp -
++ * @lgp: the gulm interface handle
++ * @holder:
++ * @keymask:
++ * @kmlen:
++ *
++ * holder is the node name of the expired holder that you want to clear.
++ * Only locks matching the keymask will be looked at (most of the time
++ * you will just set the mask to a run of 0xff bytes to match all).
++ * The keymask lets you subdivide your lockspace into smaller separate
++ * parts. (For example, there is one gfs lockspace, but each filesystem
++ * gets its own subpart of that larger space.)
++ *
++ * If holder is NULL, all expired holders in your lockspace will get
++ * dropped.
++ *
++ * Returns: int
++ */
++int
++lg_lock_drop_exp (gulm_interface_p lgp, uint8_t * holder, uint8_t * key,
++ uint16_t keylen)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ struct iovec iov[2];
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->lock_enc;
++
++ iov[0].iov_base = lg->lockspace;
++ iov[0].iov_len = 4;
++ iov[1].iov_base = key;
++ iov[1].iov_len = (key != NULL) ? keylen : 0;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_lock_drop_exp)) != 0)
++ break;
++ if ((err = xdr_enc_string (enc, holder)) != 0)
++ break;
++ if ((err = xdr_enc_raw_iov (enc, 2, iov)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
++
++/**
++ * lg_lock_status -
++ * @lgp:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_lock_status (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ xdr_enc_t *enc;
++ int err;
++
++ /* make sure it is a gulm_interface_p. */
++ if (lg == NULL)
++ return -EINVAL;
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ if (lg->lock_fd < 0 || lg->lock_enc == NULL || lg->lock_dec == NULL)
++ return -EINVAL;
++
++ enc = lg->lock_enc;
++
++ down (&lg->lock_sender);
++ do {
++ if ((err = xdr_enc_uint32 (enc, gulm_info_stats_req)) != 0)
++ break;
++ if ((err = xdr_enc_flush (enc)) != 0)
++ break;
++ } while (0);
++ up (&lg->lock_sender);
++ return err;
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/lg_main.c linux-patched/fs/gfs_locking/lock_gulm/lg_main.c
+--- linux-orig/fs/gfs_locking/lock_gulm/lg_main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/lg_main.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,216 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/* This is where all of the library specific functions exist.
++ * Not many, but keeps things clean.
++ */
++
++#include "lg_priv.h"
++#include "gulm.h"
++extern gulm_cm_t gulm_cm;
++
++/**
++ * lg_initialize - allocate and set up an interface structure
++ * @ret: receives the new gulm_interface_p
++ * @cluster_name:
++ * @service_name:
++ *
++ * If an error is returned, the value at @ret is left untouched.
++ *
++ * Returns: int
++ */
++int
++lg_initialize (gulm_interface_p * ret, char *cluster_name, char *service_name)
++{
++ gulm_interface_t *lg;
++ int err, len;
++
++ lg = kmalloc (sizeof (gulm_interface_t), GFP_KERNEL);
++ if (lg == NULL)
++ return -ENOMEM;
++
++ memset (lg, 0, sizeof (gulm_interface_t));
++ lg->first_magic = LGMAGIC;
++ lg->last_magic = LGMAGIC;
++
++ if (cluster_name == NULL)
++ cluster_name = "cluster";
++ len = strlen (cluster_name) + 1;
++ lg->clusterID = kmalloc (len, GFP_KERNEL);
++ if (lg->clusterID == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++ memcpy (lg->clusterID, cluster_name, len);
++
++ len = strlen (service_name) + 1;
++ lg->service_name = kmalloc (len, GFP_KERNEL);
++ if (lg->service_name == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++ memcpy (lg->service_name, service_name, len);
++
++ /* set up flutter bufs. */
++ lg->cfba_len = 64;
++ lg->cfba = kmalloc (lg->cfba_len, GFP_KERNEL);
++ if (lg->cfba == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++
++ lg->cfbb_len = 64;
++ lg->cfbb = kmalloc (lg->cfbb_len, GFP_KERNEL);
++ if (lg->cfbb == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++
++ lg->lfba_len = 128;
++ lg->lfba = kmalloc (lg->lfba_len, GFP_KERNEL);
++ if (lg->lfba == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++
++ lg->lfbb_len = 128;
++ lg->lfbb = kmalloc (lg->lfbb_len, GFP_KERNEL);
++ if (lg->lfbb == NULL) {
++ err = -ENOMEM;
++ goto fail_nomem;
++ }
++
++ /* setup mutexes */
++ init_MUTEX (&lg->core_sender);
++ init_MUTEX (&lg->core_recver);
++ init_MUTEX (&lg->lock_sender);
++ init_MUTEX (&lg->lock_recver);
++
++ lg->core_port = 40040;
++ lg->lock_port = 40042;
++
++ *ret = lg;
++ return 0;
++ fail_nomem:
++ if (lg->clusterID != NULL)
++ kfree (lg->clusterID);
++ if (lg->service_name != NULL)
++ kfree (lg->service_name);
++ if (lg->cfba != NULL)
++ kfree (lg->cfba);
++ if (lg->cfbb != NULL)
++ kfree (lg->cfbb);
++ if (lg->lfba != NULL)
++ kfree (lg->lfba);
++ if (lg->lfbb != NULL)
++ kfree (lg->lfbb);
++ kfree (lg);
++ return err;
++}
++
++/**
++ * lg_release -
++ * @lg:
++ *
++ */
++void
++lg_release (gulm_interface_p lgp)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ if (lgp == NULL)
++ return;
++ /* make sure it is a gulm_interface_p. */
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return;
++
++ if (lg->service_name != NULL)
++ kfree (lg->service_name);
++ if (lg->clusterID != NULL)
++ kfree (lg->clusterID);
++
++ /* wonder if I should send a logout packet? */
++ if (lg->core_enc != NULL)
++ xdr_enc_release (lg->core_enc);
++ if (lg->core_dec != NULL)
++ xdr_dec_release (lg->core_dec);
++ xdr_close (&lg->core_fd);
++
++ if (lg->lock_enc != NULL)
++ xdr_enc_release (lg->lock_enc);
++ if (lg->lock_dec != NULL)
++ xdr_dec_release (lg->lock_dec);
++ xdr_close (&lg->lock_fd);
++
++ if (lg->cfba != NULL)
++ kfree (lg->cfba);
++ if (lg->cfbb != NULL)
++ kfree (lg->cfbb);
++ if (lg->lfba != NULL)
++ kfree (lg->lfba);
++ if (lg->lfbb != NULL)
++ kfree (lg->lfbb);
++
++ kfree (lg);
++}
++
++/**
++ * lg_set_core_port -
++ * @lgp:
++ * @new:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_set_core_port (gulm_interface_p lgp, uint16_t new)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ if (lgp == NULL)
++ return -EINVAL;
++ /* make sure it is a gulm_interface_p. */
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ lg->core_port = new;
++ return 0;
++}
++
++/**
++ * lg_set_lock_port -
++ * @lgp:
++ * @new:
++ *
++ *
++ * Returns: int
++ */
++int
++lg_set_lock_port (gulm_interface_p lgp, uint16_t new)
++{
++ gulm_interface_t *lg = (gulm_interface_t *) lgp;
++ if (lgp == NULL)
++ return -EINVAL;
++ /* make sure it is a gulm_interface_p. */
++ if (lg->first_magic != LGMAGIC || lg->last_magic != LGMAGIC)
++ return -EINVAL;
++
++ lg->lock_port = new;
++
++ return 0;
++}
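++
++/* Note: the port values are only read when the matching login call
++ * connects, so any overrides must come before the login (sketch):
++ *
++ *	lg_set_core_port (gi, 40040);
++ *	lg_core_login (gi, 0);
++ */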
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/lg_priv.h linux-patched/fs/gfs_locking/lock_gulm/lg_priv.h
+--- linux-orig/fs/gfs_locking/lock_gulm/lg_priv.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/lg_priv.h 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,86 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __lg_priv_h__
++#define __lg_priv_h__
++/* private details that we don't want to give the users of this lib access
++ * to go here.
++ */
++
++#ifdef __linux__
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++#endif /*__linux__*/
++
++#include "xdr.h"
++#include "gio_wiretypes.h"
++#include "libgulm.h"
++
++#define LGMAGIC (0x474d4354)
++
++struct gulm_interface_s {
++	/* since we've masked this to a void* for the users, it is a nice
++	 * safety net to put a little magic in here so we know things stay
++	 * good. */
++ uint32_t first_magic;
++
++ /* WHAT IS YOUR NAME?!? */
++ char *service_name;
++
++ char *clusterID;
++
++ uint16_t core_port;
++ xdr_socket core_fd;
++ xdr_enc_t *core_enc;
++ xdr_dec_t *core_dec;
++ struct semaphore core_sender;
++ struct semaphore core_recver;
++ int in_core_hm;
++
++ uint16_t lock_port;
++ xdr_socket lock_fd;
++ xdr_enc_t *lock_enc;
++ xdr_dec_t *lock_dec;
++ struct semaphore lock_sender;
++ struct semaphore lock_recver;
++ int in_lock_hm;
++ uint8_t lockspace[4];
++
++	/* in the message receiver funcs, we read data into these buffers
++	 * and pass them to the callback function. This way we avoid doing
++	 * mallocs and frees on every callback.
++	 */
++ uint16_t cfba_len;
++ uint8_t *cfba;
++ uint16_t cfbb_len;
++ uint8_t *cfbb;
++ uint16_t lfba_len;
++ uint8_t *lfba;
++ uint16_t lfbb_len;
++ uint8_t *lfbb;
++
++ uint32_t last_magic;
++};
++typedef struct gulm_interface_s gulm_interface_t;
++
++#ifndef TRUE
++#define TRUE (1)
++#endif
++
++#ifndef FALSE
++#define FALSE (0)
++#endif
++
++#endif /*__lg_priv_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/libgulm.h linux-patched/fs/gfs_locking/lock_gulm/libgulm.h
+--- linux-orig/fs/gfs_locking/lock_gulm/libgulm.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/libgulm.h 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,200 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __libgulm_h__
++#define __libgulm_h__
++
++/* A bit messy, but we need this to be rather seamless in both kernel
++ * and userspace, and this seems the easiest way to do it.
++ */
++
++#ifdef __linux__
++#include <linux/in6.h>
++typedef struct socket *lg_socket;
++#endif /*__linux__*/
++
++typedef void *gulm_interface_p;
++
++/* mallocs the interface structure.
++ */
++int lg_initialize (gulm_interface_p *, char *cluster_name, char *service_name);
++/* frees struct.
++ */
++void lg_release (gulm_interface_p);
++
++/* Determines where we are within an itemlist callback */
++typedef enum { lglcb_start, lglcb_item, lglcb_stop } lglcb_t;
++
++/****** Core specifics ******/
++
++/* leaving a callback pointer as NULL, will cause that message type to
++ * be ignored. */
++typedef struct lg_core_callbacks_s {
++ int (*login_reply) (void *misc, uint64_t gen, uint32_t error,
++ uint32_t rank, uint8_t corestate);
++ int (*logout_reply) (void *misc);
++ int (*nodelist) (void *misc, lglcb_t type, char *name,
++ struct in6_addr * ip, uint8_t state);
++ int (*statechange) (void *misc, uint8_t corestate,
++ struct in6_addr * masterip, char *mastername);
++ int (*nodechange) (void *misc, char *nodename,
++ struct in6_addr * nodeip, uint8_t nodestate);
++ int (*service_list) (void *misc, lglcb_t type, char *service);
++ int (*status) (void *misc, lglcb_t type, char *key, char *value);
++ int (*error) (void *misc, uint32_t err);
++} lg_core_callbacks_t;
++
++/* this will trigger a callback from gulm_core_callbacks_t
++ * handles one message! Either stick this inside of a thread,
++ * or in a poll()/select() loop using the function below.
++ * This will block until there is a message sent from core.
++ */
++int lg_core_handle_messages (gulm_interface_p, lg_core_callbacks_t *,
++ void *misc);
++
++/* this returns the filedescriptor that the library is using to
++ * communicate with the core. This is only for using in a poll()
++ * or select() call to avoid having the gulm_core_handle_messages()
++ * call block.
++ */
++lg_socket lg_core_selector (gulm_interface_p);
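++
++/* The poll()-style loop described above, sketched for userspace where
++ * lg_socket is a plain file descriptor (gi and cbs are assumed to be
++ * set up already):
++ *
++ *	struct pollfd pfd = { .fd = lg_core_selector (gi), .events = POLLIN };
++ *	while (poll (&pfd, 1, -1) > 0)
++ *		lg_core_handle_messages (gi, &cbs, NULL);
++ */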
++
++/* Queue requests. */
++int lg_core_login (gulm_interface_p, int important);
++int lg_core_logout (gulm_interface_p);
++int lg_core_nodeinfo (gulm_interface_p, char *nodename);
++int lg_core_nodelist (gulm_interface_p);
++int lg_core_servicelist (gulm_interface_p);
++int lg_core_corestate (gulm_interface_p);
++
++/* for completeness mostly. */
++int lg_core_shutdown (gulm_interface_p);
++int lg_core_forceexpire (gulm_interface_p, char *node_name);
++int lg_core_forcepending (gulm_interface_p);
++
++int lg_core_status (gulm_interface_p);
++
++/* Node states
++ * First three are actual states, as well as changes. Last is only a node
++ * change message.
++ * */
++#define lg_core_Logged_in (0x05)
++#define lg_core_Logged_out (0x06)
++#define lg_core_Expired (0x07)
++#define lg_core_Fenced (0x08)
++/* Core states */
++#define lg_core_Slave (0x01)
++#define lg_core_Master (0x02)
++#define lg_core_Pending (0x03)
++#define lg_core_Arbitrating (0x04)
++#define lg_core_Client (0x06)
++
++/****** lock space specifics *****/
++/* note that this library masks out the lock table seperation.
++ */
++
++typedef struct lg_lockspace_callbacks_s {
++ int (*login_reply) (void *misc, uint32_t error, uint8_t which);
++ int (*logout_reply) (void *misc);
++ int (*lock_state) (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t state, uint32_t flags, uint32_t error,
++ uint8_t * LVB, uint16_t LVBlen);
++ int (*lock_action) (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t action, uint32_t error);
++ int (*cancel_reply) (void *misc, uint8_t * key, uint16_t keylen,
++ uint32_t error);
++ int (*drop_lock_req) (void *misc, uint8_t * key, uint16_t keylen,
++ uint8_t state);
++ int (*drop_all) (void *misc);
++ int (*status) (void *misc, lglcb_t type, char *key, char *value);
++ int (*error) (void *misc, uint32_t err);
++} lg_lockspace_callbacks_t;
++
++/* Like the core handle messages function, but for the lockspace.
++ * Handles one message, blocks.
++ */
++
++int lg_lock_handle_messages (gulm_interface_p, lg_lockspace_callbacks_t *,
++ void *misc);
++
++/* this returns the filedescriptor that the library is using to
++ * communicate with the ltpx. This is only for using in a poll()
++ * or select() call to avoid having the gulm_lock_handle_messages()
++ * call block.
++ */
++lg_socket lg_lock_selector (gulm_interface_p);
++
++/* Lockspace request calls */
++int lg_lock_login (gulm_interface_p, uint8_t lockspace[4]);
++int lg_lock_logout (gulm_interface_p);
++int lg_lock_state_req (gulm_interface_p, uint8_t * key, uint16_t keylen,
++ uint8_t state, uint32_t flags, uint8_t * LVB,
++ uint16_t LVBlen);
++int lg_lock_cancel_req (gulm_interface_p, uint8_t * key, uint16_t keylen);
++int lg_lock_action_req (gulm_interface_p, uint8_t * key,
++ uint16_t keylen, uint8_t action,
++ uint8_t * LVB, uint16_t LVBlen);
++int lg_lock_drop_exp (gulm_interface_p, uint8_t * holder,
++ uint8_t * keymask, uint16_t kmlen);
++int lg_lock_status (gulm_interface_p);
++
++/* state requests */
++#define lg_lock_state_Unlock (0x00)
++#define lg_lock_state_Exclusive (0x01)
++#define lg_lock_state_Deferred (0x02)
++#define lg_lock_state_Shared (0x03)
++
++/* actions */
++#define lg_lock_act_HoldLVB (0x0b)
++#define lg_lock_act_UnHoldLVB (0x0c)
++#define lg_lock_act_SyncLVB (0x0d)
++
++/* flags */
++#define lg_lock_flag_DoCB (0x00000001)
++#define lg_lock_flag_Try (0x00000002)
++#define lg_lock_flag_Any (0x00000004)
++#define lg_lock_flag_IgnoreExp (0x00000008)
++#define lg_lock_flag_Cachable (0x00000020)
++#define lg_lock_flag_Piority (0x00000040)
++
++/* These are the possible values that can be in the error fields. */
++#define lg_err_Ok (0)
++#define lg_err_BadLogin (1001)
++#define lg_err_BadCluster (1003)
++#define lg_err_BadConfig (1004)
++#define lg_err_BadGeneration (1005)
++#define lg_err_BadWireProto (1019)
++
++#define lg_err_NotAllowed (1006)
++#define lg_err_Unknown_Cs (1007)
++#define lg_err_BadStateChg (1008)
++#define lg_err_MemoryIssues (1009)
++
++#define lg_err_TryFailed (1011)
++#define lg_err_AlreadyPend (1013)
++#define lg_err_Canceled (1015)
++
++#define lg_err_NoSuchFS (1016)
++#define lg_err_NoSuchJID (1017)
++#define lg_err_NoSuchName (1018)
++
++#endif /*__libgulm_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/linux_gulm_main.c linux-patched/fs/gfs_locking/lock_gulm/linux_gulm_main.c
+--- linux-orig/fs/gfs_locking/lock_gulm/linux_gulm_main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/linux_gulm_main.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,109 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#define EXPORT_SYMTAB
++#define WANT_DEBUG_NAMES
++#define WANT_GMALLOC_NAMES
++#define EXTERN
++#include "gulm.h"
++
++#include <linux/init.h>
++
++#include "util.h"
++#include "gulm_procinfo.h"
++
++MODULE_DESCRIPTION ("Grand Unified Locking Module " GULM_RELEASE_NAME);
++MODULE_AUTHOR ("Red Hat, Inc.");
++MODULE_LICENSE ("GPL");
++
++extern gulm_cm_t gulm_cm;
++
++/**
++ * init_gulm - Initialize the gulm module
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++int __init
++init_gulm (void)
++{
++ int error;
++
++ memset (&gulm_cm, 0, sizeof (gulm_cm_t));
++ gulm_cm.loaded = FALSE;
++ gulm_cm.hookup = NULL;
++
++ /* register with the lm layers. */
++ error = lm_register_proto (&gulm_ops);
++ if (error)
++ goto fail;
++
++ error = init_proc_dir ();
++ if (error != 0) {
++ goto fail_lm;
++ }
++
++ init_gulm_fs ();
++
++ printk ("Gulm %s (built %s %s) installed\n",
++ GULM_RELEASE_NAME, __DATE__, __TIME__);
++
++ return 0;
++
++ fail_lm:
++ lm_unregister_proto (&gulm_ops);
++
++ fail:
++ return error;
++}
++
++/**
++ * exit_gulm - cleanup the gulm module
++ *
++ */
++
++void __exit
++exit_gulm (void)
++{
++ remove_proc_dir ();
++ lm_unregister_proto (&gulm_ops);
++}
++
++module_init (init_gulm);
++module_exit (exit_gulm);
++
++/* the libgulm.h interface. */
++EXPORT_SYMBOL (lg_initialize);
++EXPORT_SYMBOL (lg_release);
++
++EXPORT_SYMBOL (lg_core_handle_messages);
++EXPORT_SYMBOL (lg_core_selector);
++EXPORT_SYMBOL (lg_core_login);
++EXPORT_SYMBOL (lg_core_logout);
++EXPORT_SYMBOL (lg_core_nodeinfo);
++EXPORT_SYMBOL (lg_core_nodelist);
++EXPORT_SYMBOL (lg_core_servicelist);
++EXPORT_SYMBOL (lg_core_corestate);
++EXPORT_SYMBOL (lg_core_shutdown);
++EXPORT_SYMBOL (lg_core_forceexpire);
++EXPORT_SYMBOL (lg_core_forcepending);
++EXPORT_SYMBOL (lg_core_status);
++
++EXPORT_SYMBOL (lg_lock_handle_messages);
++EXPORT_SYMBOL (lg_lock_selector);
++EXPORT_SYMBOL (lg_lock_login);
++EXPORT_SYMBOL (lg_lock_logout);
++EXPORT_SYMBOL (lg_lock_state_req);
++EXPORT_SYMBOL (lg_lock_cancel_req);
++EXPORT_SYMBOL (lg_lock_action_req);
++EXPORT_SYMBOL (lg_lock_drop_exp);
++EXPORT_SYMBOL (lg_lock_status);
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/load_info.c linux-patched/fs/gfs_locking/lock_gulm/load_info.c
+--- linux-orig/fs/gfs_locking/lock_gulm/load_info.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/load_info.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,105 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gulm.h"
++
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++
++#include <linux/utsname.h> /* for extern system_utsname */
++
++#include "util.h"
++#include "utils_verb_flags.h"
++
++gulm_cm_t gulm_cm;
++
++/**
++ * init_ltpx -
++ */
++int
++init_ltpx (void)
++{
++ int j;
++ lock_table_t *lt = &gulm_cm.ltpx;
++
++	INIT_LIST_HEAD (&lt->to_be_sent);
++	spin_lock_init (&lt->queue_sender);
++	init_waitqueue_head (&lt->send_wchan);
++	lt->magic_one = 0xAAAAAAAA;
++	init_MUTEX (&lt->sender);
++	init_completion (&lt->startup);
++	atomic_set (&lt->locks_pending, 0);
++ lt->hashbuckets = 8191;
++ lt->hshlk = kmalloc (sizeof (spinlock_t) * lt->hashbuckets, GFP_KERNEL);
++ if (lt->hshlk == NULL)
++ return -ENOMEM;
++ lt->lkhsh =
++ kmalloc (sizeof (struct list_head) * lt->hashbuckets, GFP_KERNEL);
++ if (lt->lkhsh == NULL) {
++ kfree (lt->hshlk);
++ return -ENOMEM;
++ }
++ for (j = 0; j < lt->hashbuckets; j++) {
++		spin_lock_init (&lt->hshlk[j]);
++		INIT_LIST_HEAD (&lt->lkhsh[j]);
++ }
++ return 0;
++}
++
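++/* Lock lookups are expected to pick a bucket roughly like this (a
++ * sketch; the real lookup code lives elsewhere in the module):
++ *
++ *	bkt = hash_lock_key (key, keylen) % lt->hashbuckets;
++ *	spin_lock (&lt->hshlk[bkt]);
++ *	... walk lt->lkhsh[bkt] ...
++ *	spin_unlock (&lt->hshlk[bkt]);
++ */
++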
++/**
++ * load_info -
++ * @hostdata: < optionally override the name of this node.
++ *
++ * Returns: int
++ */
++int
++load_info (char *hostdata)
++{
++ int err = 0;
++
++ if (gulm_cm.loaded)
++ goto exit;
++
++ gulm_cm.verbosity = 0;
++ if (hostdata != NULL && strlen (hostdata) > 0) {
++ strncpy (gulm_cm.myName, hostdata, 64);
++ } else {
++ strncpy (gulm_cm.myName, system_utsname.nodename, 64);
++ }
++ gulm_cm.myName[63] = '\0';
++
++	/* breaking away from ccs; just hardcoding defaults here.
++	 * No one really used these anyway, and if people want them badly,
++	 * we'll find another way to set them (modprobe options, for example).
++	 */
++ gulm_cm.handler_threads = 2;
++ set_verbosity ("Default", &gulm_cm.verbosity);
++
++ init_ltpx ();
++
++ gulm_cm.loaded = TRUE;
++ exit:
++ return err;
++}
++/* vim: set ai cin noet sw=8 ts=8 : */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/load_info.h linux-patched/fs/gfs_locking/lock_gulm/load_info.h
+--- linux-orig/fs/gfs_locking/lock_gulm/load_info.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/load_info.h 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,17 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __load_info_h__
++#define __load_info_h__
++int load_info (char *);
++#endif /*__load_info_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/util.c linux-patched/fs/gfs_locking/lock_gulm/util.c
+--- linux-orig/fs/gfs_locking/lock_gulm/util.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/util.c 2004-06-16 12:03:21.958894765 -0500
+@@ -0,0 +1,113 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/kernel.h>
++#include <linux/types.h>
++#include <linux/string.h>
++#include "utils_crc.h"
++
++/**
++ * atoi - parse a decimal number from the start of a string
++ *
++ * @c: string of ASCII digits
++ *
++ */
++
++int
++atoi (char *c)
++{
++ int x = 0;
++
++ while ('0' <= *c && *c <= '9') {
++ x = x * 10 + (*c - '0');
++ c++;
++ }
++
++ return (x);
++}
++
++/**
++ * inet_aton - parse a dotted-quad IPv4 string into a uint32
++ *
++ * @ascii: < the dotted-quad string to parse
++ * @ip: > the parsed address, in host byte order
++ *
++ * Returns: 0 on success, -1 on a malformed address
++ */
++
++int
++inet_aton (char *ascii, uint32_t * ip)
++{
++ uint32_t value;
++ int x;
++
++ *ip = 0;
++
++ for (x = 0; x < 4; x++) {
++ value = atoi (ascii);
++ if (value > 255)
++ return (-1);
++
++ *ip = (*ip << 8) | value;
++
++ if (x != 3) {
++ for (; *ascii != '.' && *ascii != '\0'; ascii++) {
++ if (*ascii < '0' || *ascii > '9') {
++ /* not a number. stop */
++ return -1;
++ }
++ }
++ if (*ascii == '\0')
++ return (-1);
++
++ ascii++;
++ }
++ }
++
++ return (0);
++}
++
++/**
++ * inet_ntoa - format a uint32 IPv4 address as a dotted-quad string
++ *
++ * @ip: < the address, in host byte order
++ * @buf: > buffer for the string; needs at least 16 bytes
++ *
++ */
++void
++inet_ntoa (uint32_t ip, char *buf)
++{
++ int i;
++ char *p;
++
++ p = buf;
++
++ for (i = 3; i >= 0; i--) {
++ p += sprintf (p, "%d", (ip >> (8 * i)) & 0xFF);
++ if (i > 0)
++ *(p++) = '.';
++ }
++
++}
++
++/* public functions */
++#define hash_init_val 0x6d696b65
++
++uint32_t __inline__
++hash_lock_key (uint8_t * in, uint8_t len)
++{ /* the other hash function was too variable */
++ return crc32 (in, len, hash_init_val);
++}
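++
++/* A minimal usage sketch for the helpers above; the address and key are
++ * made up, and inet_ntoa() needs a buffer of at least 16 bytes:
++ *
++ * uint32_t ip, h;
++ * char buf[16];
++ *
++ * if (inet_aton ("10.0.0.1", &ip) == 0)
++ * inet_ntoa (ip, buf); // buf now holds "10.0.0.1"
++ * h = hash_lock_key ("mykey", 5);
++ */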
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/util.h linux-patched/fs/gfs_locking/lock_gulm/util.h
+--- linux-orig/fs/gfs_locking/lock_gulm/util.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/util.h 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,29 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __UTIL_DOT_H__
++#define __UTIL_DOT_H__
++
++int atoi (char *c);
++int inet_aton (char *ascii, uint32_t * ip);
++void inet_ntoa (uint32_t ip, char *buf);
++void dump_buffer (void *buf, int len);
++
++uint32_t __inline__ hash_lock_key (uint8_t * in, uint8_t len);
++uint8_t __inline__ fourtoone (uint32_t);
++
++__inline__ int testbit (uint16_t bit, uint8_t * set);
++__inline__ void setbit (uint16_t bit, uint8_t * set);
++__inline__ void clearbit (uint16_t bit, uint8_t * set);
++
++#endif /* __UTIL_DOT_H__ */
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_crc.c linux-patched/fs/gfs_locking/lock_gulm/utils_crc.c
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_crc.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_crc.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,92 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/types.h>
++
++static const uint32_t crc_32_tab[] = {
++ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
++ 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
++ 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
++ 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
++ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
++ 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
++ 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
++ 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
++ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
++ 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
++ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
++ 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
++ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
++ 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
++ 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
++ 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
++ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
++ 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
++ 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
++ 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
++ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
++ 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
++ 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
++ 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
++ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
++ 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
++ 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
++ 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
++ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
++ 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
++ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
++ 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
++ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
++ 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
++ 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
++ 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
++ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
++ 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
++ 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
++ 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
++ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
++ 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
++ 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
++};
++
++/**
++ * crc32 - hash an array of data
++ * @data: the data to be hashed
++ * @len: the length of data to be hashed
++ *
++ * completely copied from GFS/src/fs.c
++ *
++ * Take some data and convert it to a 32-bit hash.
++ *
++ * The hash function is a 32-bit CRC of the data. The algorithm uses
++ * the crc_32_tab table above.
++ *
++ * This may not be the fastest hash function, but it does a fair bit better
++ * at providing uniform results than the others I've looked at. That's
++ * really important for efficient directories.
++ *
++ * Returns: the hash
++ */
++
++uint32_t
++crc32 (const char *data, int len, uint32_t init)
++{
++ uint32_t hash = init;
++
++ for (; len--; data++)
++ hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
++
++ hash = ~hash;
++
++ return hash;
++}
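++
++/* A minimal sketch of how lock_gulm drives this, matching hash_lock_key()
++ * in util.c, which seeds the CRC with a fixed init value:
++ *
++ * uint32_t h = crc32 (keybuf, keylen, 0x6d696b65);
++ */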
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_crc.h linux-patched/fs/gfs_locking/lock_gulm/utils_crc.h
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_crc.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_crc.h 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,17 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __utils_crc_h__
++#define __utils_crc_h__
++uint32_t crc32 (const char *data, int len, uint32_t init);
++#endif /*__utils_crc_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_tostr.c linux-patched/fs/gfs_locking/lock_gulm/utils_tostr.c
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_tostr.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_tostr.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,207 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include "gio_wiretypes.h"
++
++char *
++gio_Err_to_str (int x)
++{
++ char *t = "Unknown GULM Err";
++ switch (x) {
++ case gio_Err_Ok:
++ t = "Ok";
++ break;
++
++ case gio_Err_BadLogin:
++ t = "Bad Login";
++ break;
++ case gio_Err_BadCluster:
++ t = "Bad Cluster ID";
++ break;
++ case gio_Err_BadConfig:
++ t = "Incompatible configurations";
++ break;
++ case gio_Err_BadGeneration:
++ t = "Bad Generation ID";
++ break;
++ case gio_Err_BadWireProto:
++ t = "Bad Wire Protocol Version";
++ break;
++
++ case gio_Err_NotAllowed:
++ t = "Not Allowed";
++ break;
++ case gio_Err_Unknown_Cs:
++ t = "Uknown Client";
++ break;
++ case gio_Err_BadStateChg:
++ t = "Bad State Change";
++ break;
++ case gio_Err_MemoryIssues:
++ t = "Memory Problems";
++ break;
++
++ case gio_Err_PushQu:
++ t = "Push Queue";
++ break;
++ case gio_Err_TryFailed:
++ t = "Try Failed";
++ break;
++ case gio_Err_AlreadyPend:
++ t = "Request Already Pending";
++ break;
++ case gio_Err_Canceled:
++ t = "Request Canceled";
++ break;
++
++ case gio_Err_NoSuchFS:
++ t = "No Such Filesystem";
++ break;
++ case gio_Err_NoSuchJID:
++ t = "No Such JID";
++ break;
++ case gio_Err_NoSuchName:
++ t = "No Such Node";
++ break;
++ }
++ return t;
++}
++
++char *
++gio_mbrupdate_to_str (int x)
++{
++ char *t = "Unknown Membership Update";
++ switch (x) {
++ case gio_Mbr_Logged_in:
++ t = "Logged in";
++ break;
++ case gio_Mbr_Logged_out:
++ t = "Logged out";
++ break;
++ case gio_Mbr_Expired:
++ t = "Expired";
++ break;
++ case gio_Mbr_Killed:
++ t = "Fenced";
++ break;
++ case gio_Mbr_OM_lgin:
++ t = "Was Logged in";
++ break;
++ }
++ return t;
++}
++
++char *
++gio_I_am_to_str (int x)
++{
++ switch (x) {
++ case gio_Mbr_ama_Slave:
++ return "Slave";
++ break;
++ case gio_Mbr_ama_Pending:
++ return "Pending";
++ break;
++ case gio_Mbr_ama_Arbitrating:
++ return "Arbitrating";
++ break;
++ case gio_Mbr_ama_Master:
++ return "Master";
++ break;
++ case gio_Mbr_ama_Resource:
++ return "Service";
++ break;
++ case gio_Mbr_ama_Client:
++ return "Client";
++ break;
++ default:
++ return "Unknown I_am state";
++ break;
++ }
++}
++
++char *
++gio_license_states (int x)
++{
++ switch (x) {
++ case 0:
++ return "valid";
++ break;
++ case 1:
++ return "expired";
++ break;
++ case 2:
++ return "invalid";
++ break;
++ default:
++ return "unknown";
++ break;
++ }
++}
++
++char *
++gio_opcodes (int x)
++{
++ switch (x) {
++#define CP(x) case (x): return #x ; break
++ CP (gulm_err_reply);
++
++ CP (gulm_core_login_req);
++ CP (gulm_core_login_rpl);
++ CP (gulm_core_logout_req);
++ CP (gulm_core_logout_rpl);
++ CP (gulm_core_reslgn_req);
++ CP (gulm_core_beat_req);
++ CP (gulm_core_beat_rpl);
++ CP (gulm_core_mbr_req);
++ CP (gulm_core_mbr_updt);
++ CP (gulm_core_mbr_lstreq);
++ CP (gulm_core_mbr_lstrpl);
++ CP (gulm_core_mbr_force);
++ CP (gulm_core_res_req);
++ CP (gulm_core_res_list);
++ CP (gulm_core_state_req);
++ CP (gulm_core_state_chgs);
++ CP (gulm_core_shutdown);
++ CP (gulm_core_forcepend);
++
++ CP (gulm_info_stats_req);
++ CP (gulm_info_stats_rpl);
++ CP (gulm_info_set_verbosity);
++ CP (gulm_socket_close);
++ CP (gulm_info_slave_list_req);
++ CP (gulm_info_slave_list_rpl);
++
++ CP (gulm_lock_login_req);
++ CP (gulm_lock_login_rpl);
++ CP (gulm_lock_logout_req);
++ CP (gulm_lock_logout_rpl);
++ CP (gulm_lock_state_req);
++ CP (gulm_lock_state_rpl);
++ CP (gulm_lock_state_updt);
++ CP (gulm_lock_action_req);
++ CP (gulm_lock_action_rpl);
++ CP (gulm_lock_action_updt);
++ CP (gulm_lock_update_rpl);
++ CP (gulm_lock_cb_state);
++ CP (gulm_lock_cb_dropall);
++ CP (gulm_lock_drop_exp);
++ CP (gulm_lock_dump_req);
++ CP (gulm_lock_dump_rpl);
++ CP (gulm_lock_rerunqueues);
++
++#undef CP
++ default:
++ return "Unknown Op Code";
++ break;
++ }
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_tostr.h linux-patched/fs/gfs_locking/lock_gulm/utils_tostr.h
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_tostr.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_tostr.h 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,22 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __utils_tostr_h__
++#define __utils_tostr_h__
++char *gio_Err_to_str (int x);
++char *gio_mbrupdate_to_str (int x);
++char *gio_mbrama_to_str (int x);
++char *gio_I_am_to_str (int x);
++char *gio_license_states (int x);
++char *gio_opcodes (int x);
++#endif /*__utils_tostr_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_verb_flags.c linux-patched/fs/gfs_locking/lock_gulm/utils_verb_flags.c
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_verb_flags.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_verb_flags.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,271 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifdef __linux__
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++#endif /*__linux__*/
++
++#include "gulm_log_msg_bits.h"
++
++static __inline__ int
++strncasecmp (const char *s1, const char *s2, size_t l)
++{
++ char c1 = '\0', c2 = '\0';
++
++ while (*s1 && *s2 && l-- > 0) {
++ c1 = *s1++;
++ c2 = *s2++;
++
++ if (c1 >= 'A' && c1 <= 'Z')
++ c1 += 'a' - 'A';
++
++ if (c2 >= 'A' && c2 <= 'Z')
++ c2 += 'a' - 'A';
++
++ if (c1 != c2)
++ break;
++ }
++ return (c1 - c2);
++}
++
++static int bit_array[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
++
++#define BITCOUNT(x) (bit_array[(x) & 0x000F] + \
++ bit_array[((x) >> 4) & 0x000F] + \
++ bit_array[((x) >> 8) & 0x000F] + \
++ bit_array[((x) >> 12) & 0x000F] + \
++ bit_array[((x) >> 16) & 0x000F] + \
++ bit_array[((x) >> 20) & 0x000F] + \
++ bit_array[((x) >> 24) & 0x000F] + \
++ bit_array[((x) >> 28) & 0x000F])
++
++struct {
++ char *name;
++ uint32_t val;
++} verbose_flags[] = {
++ {
++ "Network", lgm_Network,}, {
++ "Network2", lgm_Network2,}, {
++ "Network3", lgm_Network3,}, {
++ "Fencing", lgm_Stomith,}, {
++ "Heartbeat", lgm_Heartbeat,}, {
++ "Locking", lgm_locking,}, {
++ "Forking", lgm_Forking,}, {
++ "JIDMap", lgm_JIDMap,}, {
++ "JIDUpdates", lgm_JIDUpdates,}, {
++ "Subscribers", lgm_Subscribers,}, {
++ "LockUpdates", lgm_LockUpdates,}, {
++ "LoginLoops", lgm_LoginLoops,}, {
++ "ServerState", lgm_ServerState,}, {
++ "Default", lgm_Network | lgm_Stomith | lgm_Forking,},
++/* Since I really don't want people really doing *all* flags with all,
++ * there is AlmostAll, which users really get, and ReallyAll, which is all
++ * bits on.
++ * This is mostly due to Network3, which dumps messages on nearly
++ * every packet. (should actually be every packet.)
++ * Also drop the slave updates, since that is on every packet as well.
++ */
++ {
++ "All",
++ (lgm_ReallyAll &
++ ~(lgm_Network3 | lgm_JIDUpdates |
++ lgm_LockUpdates)),}, {
++ "AlmostAll",
++ lgm_ReallyAll & ~(lgm_Network3 | lgm_JIDUpdates |
++ lgm_LockUpdates),}, {
++ "ReallyAll", lgm_ReallyAll,}
++};
++
++static int
++add_string (char *name, size_t * cur, char *str, size_t slen)
++{
++ size_t nl;
++
++ nl = strlen (name);
++ if (*cur + nl > slen) {
++ memcpy (str + *cur, "...", 3);
++ *cur += 3;
++ str[*cur] = '\0';
++ return -1;
++ }
++ memcpy (str + *cur, name, nl);
++ *cur += nl;
++ str[*cur] = ',';
++ *cur += 1;
++
++ return 0;
++}
++
++/**
++ * get_verbosity_string - render verbosity bits as a list of flag names
++ * @str: > buffer to fill with a comma-separated list of flag names
++ * @slen: < size of that buffer
++ * @verb: < the verbosity bits to describe
++ *
++ * Returns: 0 on success, -1 if the names did not all fit in @str
++ */
++int
++get_verbosity_string (char *str, size_t slen, uint32_t verb)
++{
++ int i, vlen = sizeof (verbose_flags) / sizeof (verbose_flags[0]);
++ size_t cur = 0;
++ int combo_match = -1, error = 0;
++
++ memset (str, 0, slen);
++ slen -= 4; /* leave room for dots and null */
++
++ if (verb == 0) {
++ error = add_string ("Quiet", &cur, str, slen);
++ goto end;
++ }
++
++ /* Combo verb flag phase */
++ for (i = 0; i < vlen; i++) {
++ if (BITCOUNT (verbose_flags[i].val) > 1) {
++ /* check to see if this flag matches exclusively */
++ if ((verbose_flags[i].val ^ verb) == 0) {
++ error =
++ add_string (verbose_flags[i].name, &cur,
++ str, slen);
++ goto end;
++ }
++
++ if ((verbose_flags[i].val & verb) ==
++ verbose_flags[i].val) {
++ if (combo_match < 0) {
++ combo_match = i;
++ } else {
++ /* Compare this combo with the one in combo_match */
++ if (BITCOUNT (verbose_flags[i].val) >
++ BITCOUNT (verbose_flags
++ [combo_match].val)) {
++ combo_match = i;
++ }
++ }
++
++ }
++ }
++ }
++ /* Add the best combo to the string */
++ if (combo_match > -1) {
++ if (add_string
++ (verbose_flags[combo_match].name, &cur, str, slen) == -1) {
++ error = -1;
++ goto end;
++ }
++ }
++
++ /* Single verb flag phase */
++ for (i = 0; i < vlen; i++) {
++ if (BITCOUNT (verbose_flags[i].val) == 1) {
++ if (combo_match > -1) {
++ if ((verbose_flags[combo_match].
++ val & verbose_flags[i].val) ==
++ verbose_flags[i].val) {
++ continue;
++ }
++ }
++
++ if ((verbose_flags[i].val & verb) ==
++ verbose_flags[i].val) {
++ if (add_string
++ (verbose_flags[i].name, &cur, str,
++ slen) == -1) {
++ error = -1;
++ goto end;
++ }
++ }
++ }
++ }
++ end:
++ /* Clear trailing ',' */
++ if (cur > 0 && str[cur - 1] == ',') {
++ str[cur - 1] = '\0';
++ }
++ return error;
++}
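++
++/* A short sketch of what the matching above produces, assuming a 64-byte
++ * output buffer:
++ *
++ * get_verbosity_string (buf, 64, 0); // "Quiet"
++ * get_verbosity_string (buf, 64,
++ * lgm_Network | lgm_Stomith | lgm_Forking); // "Default"
++ * get_verbosity_string (buf, 64,
++ * lgm_Network | lgm_locking); // "Network,Locking"
++ */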
++
++/**
++ * set_verbosity - toggle verbosity bits according to the `rules' in @str
++ * @str: < a list of flag names separated by ',', ' ', or '|'
++ * @verb: <> the verbosity bits to modify
++ *
++ * Each flag name may be prefixed with '+' or '-';
++ * no prefix is the same as a '+' prefix.
++ * '+' sets bits.
++ * '-' unsets bits.
++ * The special name 'clear' unsets all bits.
++ */
++void
++set_verbosity (char *str, uint32_t * verb)
++{
++ char *token, *next;
++ int i, wl, tl, len = sizeof (verbose_flags) / sizeof (verbose_flags[0]);
++
++ if (str == NULL)
++ return;
++
++ wl = strlen (str);
++ if (wl == 0)
++ return;
++ for (token = str, tl = 0; tl < wl &&
++ token[tl] != ',' &&
++ token[tl] != ' ' && token[tl] != '|' && token[tl] != '\0'; tl++) ;
++ next = token + tl + 1;
++
++ for (;;) {
++ if (token[0] == '-') {
++ token++;
++ for (i = 0; i < len; i++) {
++ if (strncasecmp
++ (token, verbose_flags[i].name, tl) == 0) {
++ (*verb) &= ~(verbose_flags[i].val);
++ }
++ }
++ } else if (token[0] == '+') {
++ token++;
++ for (i = 0; i < len; i++) {
++ if (strncasecmp
++ (token, verbose_flags[i].name, tl) == 0) {
++ (*verb) |= verbose_flags[i].val;
++ }
++ }
++ } else {
++ if (strncasecmp (token, "clear", tl) == 0) {
++ (*verb) = 0;
++ } else {
++ for (i = 0; i < len; i++) {
++ if (strncasecmp
++ (token, verbose_flags[i].name,
++ tl) == 0) {
++ (*verb) |= verbose_flags[i].val;
++ }
++ }
++ }
++ }
++
++ if (next >= str + wl)
++ return;
++ for (token = next, tl = 0;
++ tl < wl &&
++ token[tl] != ',' &&
++ token[tl] != ' ' &&
++ token[tl] != '|' && token[tl] != '\0'; tl++) ;
++ next = token + tl + 1;
++
++ }
++}
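++
++/* A small sketch of the rule syntax accepted above, starting from zero:
++ *
++ * uint32_t v = 0;
++ * set_verbosity ("Default", &v); // Network | Fencing | Forking
++ * set_verbosity ("+Locking,-Fencing", &v); // add Locking, drop Fencing
++ * set_verbosity ("clear", &v); // back to Quiet (0)
++ */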
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/utils_verb_flags.h linux-patched/fs/gfs_locking/lock_gulm/utils_verb_flags.h
+--- linux-orig/fs/gfs_locking/lock_gulm/utils_verb_flags.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/utils_verb_flags.h 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,18 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __utils_verb_flags_h__
++#define __utils_verb_flags_h__
++int get_verbosity_string (char *str, size_t slen, uint32_t verb);
++void set_verbosity (char *str, uint32_t * verb);
++#endif /*__utils_verb_flags_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/xdr.h linux-patched/fs/gfs_locking/lock_gulm/xdr.h
+--- linux-orig/fs/gfs_locking/lock_gulm/xdr.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/xdr.h 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,98 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#ifndef __gulm_xdr_h__
++#define __gulm_xdr_h__
++typedef struct xdr_enc_s xdr_enc_t;
++typedef struct xdr_dec_s xdr_dec_t;
++
++/* Sockets in kernel space are done a bit differently than sockets in
++ * userspace, but we need to have them appear to be the same.
++ */
++#ifdef __KERNEL__
++
++#ifdef __linux__
++#include <linux/net.h>
++#include <linux/in.h>
++#include <linux/in6.h>
++#include <linux/socket.h>
++#include <net/sock.h>
++
++typedef struct socket *xdr_socket;
++#endif /*__linux__*/
++#else /*__KERNEL__*/
++#include <sys/types.h>
++#include <sys/uio.h>
++#include <sys/socket.h>
++#include <netinet/in.h>
++#include <netinet/tcp.h>
++#include <unistd.h>
++#include <errno.h>
++typedef int xdr_socket;
++#endif /*__KERNEL__*/
++
++/* start things up */
++int xdr_open (xdr_socket * sk);
++int xdr_connect (struct sockaddr_in6 *adr, xdr_socket sk);
++void xdr_close (xdr_socket * sk);
++
++/* deep, basic io */
++#ifdef __KERNEL__
++#ifdef __linux__
++size_t xdr_send (struct socket *sock, void *buf, size_t size);
++size_t xdr_recv (struct socket *sock, void *buf, size_t size);
++#endif /*__linux__*/
++#else /*__KERNEL__*/
++ssize_t xdr_recv (int fd, void *buf, size_t len);
++ssize_t xdr_send (int fd, void *buf, size_t len);
++#endif /*__KERNEL__*/
++
++xdr_enc_t *xdr_enc_init (xdr_socket sk, int buffer_size);
++xdr_dec_t *xdr_dec_init (xdr_socket sk, int buffer_size);
++int xdr_enc_flush (xdr_enc_t * xdr);
++int xdr_enc_release (xdr_enc_t * xdr); /* calls xdr_enc_flush() */
++void xdr_enc_force_release (xdr_enc_t * xdr); /* doesn't call xdr_enc_flush() */
++void xdr_dec_release (xdr_dec_t * xdr);
++/* xdr_enc_force_release() is for when you get an error sending and you
++ * want to free that stuff up right away. If you use the regular release
++ * for enc, it will fail if it cannot send data over the file descriptor.
++ */
++
++/* encoders add to a stream */
++int __inline__ xdr_enc_uint64 (xdr_enc_t * xdr, uint64_t i);
++int __inline__ xdr_enc_uint32 (xdr_enc_t * xdr, uint32_t i);
++int __inline__ xdr_enc_uint16 (xdr_enc_t * xdr, uint16_t i);
++int __inline__ xdr_enc_uint8 (xdr_enc_t * xdr, uint8_t i);
++int __inline__ xdr_enc_ipv6 (xdr_enc_t * enc, struct in6_addr *ip);
++int xdr_enc_raw (xdr_enc_t * xdr, void *pointer, uint16_t len);
++int xdr_enc_raw_iov (xdr_enc_t * xdr, int count, struct iovec *iov);
++int xdr_enc_string (xdr_enc_t * xdr, uint8_t * s);
++int xdr_enc_list_start (xdr_enc_t * xdr);
++int xdr_enc_list_stop (xdr_enc_t * xdr);
++
++/* decoders remove from stream */
++int xdr_dec_uint64 (xdr_dec_t * xdr, uint64_t * i);
++int xdr_dec_uint32 (xdr_dec_t * xdr, uint32_t * i);
++int xdr_dec_uint16 (xdr_dec_t * xdr, uint16_t * i);
++int xdr_dec_uint8 (xdr_dec_t * xdr, uint8_t * i);
++int xdr_dec_ipv6 (xdr_dec_t * xdr, struct in6_addr *ip);
++int xdr_dec_raw (xdr_dec_t * xdr, void *p, uint16_t * l); /* no malloc */
++int xdr_dec_raw_m (xdr_dec_t * xdr, void **p, uint16_t * l); /* mallocs p */
++int xdr_dec_raw_ag (xdr_dec_t * xdr, void **p, uint16_t * bl, uint16_t * rl);
++int xdr_dec_string (xdr_dec_t * xdr, uint8_t ** strp); /* mallocs s */
++int xdr_dec_string_nm (xdr_dec_t * xdr, uint8_t * strp, size_t l); /* no malloc */
++int xdr_dec_string_ag (xdr_dec_t * xdr, uint8_t ** s, uint16_t * bl);
++int xdr_dec_list_start (xdr_dec_t * xdr);
++int xdr_dec_list_stop (xdr_dec_t * xdr);
++
++#endif /*__gulm_xdr_h__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/xdr_base.c linux-patched/fs/gfs_locking/lock_gulm/xdr_base.c
+--- linux-orig/fs/gfs_locking/lock_gulm/xdr_base.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/xdr_base.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,904 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * This is a bit of an abstraction layer to get this working in both kernel
++ * and userspace.
++ */
++#define TRUE (1)
++#define FALSE (0)
++#define MIN(a,b) (((a)<(b))?(a):(b))
++
++#ifdef __linux__
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#define __KERNEL_SYSCALLS__
++#include <linux/unistd.h>
++#endif /*__linux__*/
++
++#include "xdr.h"
++
++/**
++ * xdr_realloc - a realloc for kernel space.
++ * @a: < pointer to realloc
++ * @nl: < desired new size
++ * @ol: < current old size
++ *
++ * Not as good as the real realloc, since it always moves memory, but good
++ * enough for the little use it gets here.
++ *
++ * XXX this is broken.
++ *
++ * Returns: void*
++ */
++static void *
++xdr_realloc (void *a, size_t nl, size_t ol)
++{
++ if (nl == ol) {
++ return a;
++ } else if (nl == 0) {
++ kfree (a);
++ return NULL;
++ } else if (a == NULL && nl > 0) {
++ return kmalloc (nl, GFP_KERNEL);
++ } else {
++ void *tmp;
++ tmp = kmalloc (nl, GFP_KERNEL);
++ if (tmp == NULL)
++ return NULL;
++ memcpy (tmp, a, MIN (nl, ol));
++ kfree (a);
++ return tmp;
++ }
++}
++
++typedef enum { xdr_enc, xdr_dec } xdr_type;
++
++/* encoders have this sorta non-blocking, growing buffering stunt.
++ * makes them a bit different from the decoders now.
++ */
++struct xdr_enc_s {
++ size_t default_buf_size;
++ xdr_socket fd;
++ xdr_type type;
++ size_t length;
++ size_t curloc;
++ uint8_t *stream;
++};
++
++/* decoders only pull a single item off of the socket at a time.
++ * so this is all they need.
++ */
++struct xdr_dec_s {
++ size_t length; /* total byte length of the stream */
++ size_t curloc; /* current byte offset from start */
++ uint8_t *stream; /* start of the encoded stream. */
++ xdr_socket fd;
++ xdr_type type;
++};
++
++/* the types of data we support. */
++
++#define XDR_NULL 0x00 /* NOT A VALID TAG!!! used in dec code. */
++#define XDR_LIST_START 0x01
++#define XDR_LIST_STOP 0x02
++/* list is a variable-length device. It is a start tag, some number of
++ * xdr_enc_* items, then a stop tag. Its main purpose is to provide a
++ * method of encasing data.
++ */
++#define XDR_STRING 0x04
++/* string tag is followed by a uint16 which is the byte length */
++#define XDR_RAW 0x05
++/* raw tag is followed by a uint16 which is the byte length
++ * if 65535 bytes isn't enough, split your data and put multiples of these
++ * back to back. (idea of xdr is to avoid this twit.)
++ * */
++
++/* note: if the sizes of these should ever vary, I'm screwed. Should
++ * consider changing this all to bit shifts and array accesses to be more
++ * concrete. later.
++ */
++#define XDR_UINT64 0x06
++#define XDR_UINT32 0x07
++#define XDR_UINT16 0x08
++#define XDR_UINT8 0x09
++/* should add signed ints */
++
++#define XDR_IPv6 0x0a /* 16 bytes, IPv6 address */
++
++/* any other base types?
++ */
++
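++/* As a worked example of the stream layout, encoding the uint32
++ * 0x01020304 followed by the two-byte string "ok" yields:
++ *
++ * 07 01 02 03 04 04 00 02 6f 6b
++ * tag be32 value tag be16 len "ok"
++ */
++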
++#define XDR_DEFAULT_BUFFER_SIZE 4096
++/*****************************************************************************/
++
++/**
++ * xdr_enc_init -
++ * @fd:
++ * @buffer_size:
++ *
++ *
++ * Returns: xdr_enc_t*
++ */
++xdr_enc_t *
++xdr_enc_init (xdr_socket fd, int buffer_size)
++{
++ xdr_enc_t *xdr;
++
++ if (buffer_size <= 0)
++ buffer_size = XDR_DEFAULT_BUFFER_SIZE;
++
++ xdr = kmalloc (sizeof (xdr_enc_t), GFP_KERNEL);
++ if (xdr == NULL)
++ return NULL;
++ xdr->stream = kmalloc (buffer_size, GFP_KERNEL);
++ if (xdr->stream == NULL) {
++ kfree (xdr);
++ return NULL;
++ }
++ xdr->fd = fd;
++ xdr->type = xdr_enc;
++ xdr->default_buf_size = buffer_size;
++ xdr->length = buffer_size;
++ xdr->curloc = 0;
++
++ return xdr;
++}
++
++/**
++ * xdr_dec_init -
++ * @fd:
++ * @buffer_size:
++ *
++ *
++ * Returns: xdr_dec_t*
++ */
++xdr_dec_t *
++xdr_dec_init (xdr_socket fd, int buffer_size)
++{
++ xdr_dec_t *xdr;
++
++ if (buffer_size <= 0)
++ buffer_size = XDR_DEFAULT_BUFFER_SIZE;
++
++ xdr = kmalloc (sizeof (xdr_dec_t), GFP_KERNEL);
++ if (xdr == NULL)
++ return NULL;
++ xdr->length = buffer_size;
++ xdr->curloc = 0;
++ xdr->stream = kmalloc (buffer_size, GFP_KERNEL);
++ xdr->fd = fd;
++ xdr->type = xdr_dec;
++ if (xdr->stream == NULL) {
++ kfree (xdr);
++ return NULL;
++ }
++ *(xdr->stream) = XDR_NULL; /* so the first dec_call will call get_next */
++ return xdr;
++}
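++
++/* A minimal round-trip sketch, assuming an already-connected xdr_socket
++ * `sk'; encoders buffer until xdr_enc_flush() pushes the bytes out:
++ *
++ * xdr_enc_t *enc = xdr_enc_init (sk, 0); // 0 => 4096-byte default
++ * xdr_enc_uint32 (enc, 42);
++ * xdr_enc_string (enc, "hello");
++ * xdr_enc_flush (enc);
++ *
++ * xdr_dec_t *dec = xdr_dec_init (sk, 0);
++ * uint32_t i;
++ * uint8_t *s;
++ * xdr_dec_uint32 (dec, &i); // i == 42
++ * xdr_dec_string (dec, &s); // mallocs s = "hello"
++ */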
++
++/*****************************************************************************/
++/**
++ * xdr_enc_flush -
++ * @xdr:
++ *
++ * Returns: int
++ */
++int
++xdr_enc_flush (xdr_enc_t * xdr)
++{
++ int err;
++ if (xdr == NULL)
++ return -EINVAL;
++ if (xdr->type != xdr_enc)
++ return -EINVAL;
++ if (xdr->curloc == 0)
++ return 0;
++
++ err = xdr_send (xdr->fd, xdr->stream, xdr->curloc);
++ if (err < 0)
++ return err;
++ if (err == 0)
++ return -EPROTO; /* why? */
++ xdr->curloc = 0;
++
++ return 0;
++}
++
++/**
++ * xdr_release -
++ * @xdr:
++ *
++ * Free the memory, losing whatever may be there.
++ */
++void
++xdr_dec_release (xdr_dec_t * xdr)
++{
++ if (xdr == NULL)
++ return;
++ kfree (xdr->stream);
++ kfree (xdr);
++}
++
++/**
++ * xdr_enc_force_release -
++ * @xdr:
++ *
++ * Free the memory, losing whatever may be there.
++ */
++void
++xdr_enc_force_release (xdr_enc_t * xdr)
++{
++ if (xdr == NULL)
++ return;
++ if (xdr->stream != NULL)
++ kfree (xdr->stream);
++ kfree (xdr);
++}
++
++/**
++ * xdr_enc_release -
++ * @xdr:
++ *
++ * Free things up, trying to send any possible leftover data first.
++ *
++ * Returns: int
++ */
++int
++xdr_enc_release (xdr_enc_t * xdr)
++{
++ int e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if ((e = xdr_enc_flush (xdr)) != 0)
++ return e;
++ xdr_enc_force_release (xdr);
++ return 0;
++}
++
++/*****************************************************************************/
++/**
++ * grow_stream -
++ * @xdr:
++ * @len:
++ *
++ * Each single encoded item needs to fit within the buffer, so we make
++ * sure the buffer is big enough.
++ *
++ * If the buffer is big enough but just doesn't have room left, we first
++ * send the data already in the buffer, emptying it.
++ *
++ * Returns: int
++ */
++static int
++grow_stream (xdr_enc_t * enc, size_t len)
++{
++ int err;
++ uint8_t *c;
++
++ /* buffer must be big enough for one type entry. */
++ if (len > enc->length) {
++ c = xdr_realloc (enc->stream, len, enc->length);
++ if (c == NULL)
++ return -ENOMEM;
++ enc->stream = c;
++ enc->length = len;
++ }
++
++ /* if there isn't room on the end of this chunk,
++ * try sending what we've got.
++ */
++ if (enc->curloc + len > enc->length) {
++ err = xdr_enc_flush (enc);
++ if (err != 0) {
++ /* error, better pass this up. */
++ return err;
++ }
++ }
++
++ return 0;
++}
++
++/**
++ * append_bytes -
++ * @xdr:
++ * @xdr_type:
++ * @bytes:
++ * @len:
++ *
++ *
++ * Returns: int
++ */
++static int
++append_bytes (xdr_enc_t * xdr, uint8_t xdr_type, void *bytes, size_t len)
++{
++ int e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if (xdr->type != xdr_enc)
++ return -EINVAL;
++
++ /* len + 1; need the one byte for the type code. */
++ if ((e = grow_stream (xdr, len + 1)) != 0)
++ return e;
++ *(xdr->stream + xdr->curloc) = xdr_type;
++ xdr->curloc += 1;
++ memcpy ((xdr->stream + xdr->curloc), bytes, len);
++ xdr->curloc += len;
++
++ return 0;
++}
++
++int __inline__
++xdr_enc_uint64 (xdr_enc_t * xdr, uint64_t i)
++{
++ uint64_t b = cpu_to_be64 (i);
++ return append_bytes (xdr, XDR_UINT64, &b, sizeof (uint64_t));
++}
++
++int __inline__
++xdr_enc_uint32 (xdr_enc_t * xdr, uint32_t i)
++{
++ uint32_t b = cpu_to_be32 (i);
++ return append_bytes (xdr, XDR_UINT32, &b, sizeof (uint32_t));
++}
++
++int __inline__
++xdr_enc_uint16 (xdr_enc_t * xdr, uint16_t i)
++{
++ uint16_t b = cpu_to_be16 (i);
++ return append_bytes (xdr, XDR_UINT16, &b, sizeof (uint16_t));
++}
++
++int __inline__
++xdr_enc_uint8 (xdr_enc_t * xdr, uint8_t i)
++{
++ return append_bytes (xdr, XDR_UINT8, &i, sizeof (uint8_t));
++}
++
++int __inline__
++xdr_enc_ipv6 (xdr_enc_t * xdr, struct in6_addr *ip)
++{ /* bytes should already be in the right order. */
++ return append_bytes (xdr, XDR_IPv6, ip->s6_addr, 16);
++}
++
++int
++xdr_enc_raw (xdr_enc_t * xdr, void *p, uint16_t len)
++{
++ int e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if ((e = grow_stream (xdr, len + 3)) != 0)
++ return e;
++ *(xdr->stream + xdr->curloc) = XDR_RAW;
++ xdr->curloc += 1;
++ *((uint16_t *) (xdr->stream + xdr->curloc)) = cpu_to_be16 (len);
++ xdr->curloc += 2;
++ memcpy ((xdr->stream + xdr->curloc), p, len);
++ xdr->curloc += len;
++ return 0;
++}
++
++int
++xdr_enc_raw_iov (xdr_enc_t * xdr, int count, struct iovec *iov)
++{
++ size_t total = 0;
++ int i, err;
++ if (xdr == NULL || count < 1 || iov == NULL)
++ return -EINVAL;
++ for (i = 0; i < count; i++)
++ total += iov[i].iov_len;
++ /* make sure it fits in a uint16_t */
++ if (total > 0xffff)
++ return -EFBIG;
++ /* grow to fit */
++ if ((err = grow_stream (xdr, total + 3)) != 0)
++ return err;
++ /* copy in header and size */
++ *(xdr->stream + xdr->curloc) = XDR_RAW;
++ xdr->curloc += 1;
++ *((uint16_t *) (xdr->stream + xdr->curloc)) = cpu_to_be16 (total);
++ xdr->curloc += 2;
++ /* copy in all iovbufs */
++ for (i = 0; i < count; i++) {
++ if (iov[i].iov_base == NULL)
++ continue;
++ memcpy ((xdr->stream + xdr->curloc), iov[i].iov_base,
++ iov[i].iov_len);
++ xdr->curloc += iov[i].iov_len;
++ }
++ return 0;
++}
++
++int
++xdr_enc_string (xdr_enc_t * xdr, uint8_t * s)
++{
++ int len, e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if (s == NULL)
++ len = 0;
++ else
++ len = strlen (s);
++ if ((e = grow_stream (xdr, len + 3)) != 0)
++ return e;
++ *(xdr->stream + xdr->curloc) = XDR_STRING;
++ xdr->curloc += 1;
++ *((uint16_t *) (xdr->stream + xdr->curloc)) = cpu_to_be16 (len);
++ xdr->curloc += 2;
++ if (len > 0) {
++ memcpy ((xdr->stream + xdr->curloc), s, len);
++ xdr->curloc += len;
++ }
++ return 0;
++}
++
++int
++xdr_enc_list_start (xdr_enc_t * xdr)
++{
++ int e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if ((e = grow_stream (xdr, 1)) != 0)
++ return e;
++ *(xdr->stream + xdr->curloc) = XDR_LIST_START;
++ xdr->curloc += 1;
++ return 0;
++}
++
++int
++xdr_enc_list_stop (xdr_enc_t * xdr)
++{
++ int e;
++ if (xdr == NULL)
++ return -EINVAL;
++ if ((e = grow_stream (xdr, 1)) != 0)
++ return e;
++ *(xdr->stream + xdr->curloc) = XDR_LIST_STOP;
++ xdr->curloc += 1;
++ return 0;
++}
++
++/*****************************************************************************/
++
++/**
++ * get_next -
++ * @xdr:
++ *
++ * get whatever may be next, and put it into the buffer.
++ *
++ * Returns: int
++ */
++static int
++get_next (xdr_dec_t * xdr)
++{
++ int err;
++ uint16_t len;
++ if ((err = xdr_recv (xdr->fd, xdr->stream, 1)) < 0)
++ return err;
++ if (err == 0)
++ return -EPROTO;
++ xdr->curloc = 1;
++ if (*(xdr->stream) == XDR_UINT64) {
++ len = sizeof (uint64_t);
++ } else if (*(xdr->stream) == XDR_UINT32) {
++ len = sizeof (uint32_t);
++ } else if (*(xdr->stream) == XDR_UINT16) {
++ len = sizeof (uint16_t);
++ } else if (*(xdr->stream) == XDR_UINT8) {
++ len = sizeof (uint8_t);
++ } else if (*(xdr->stream) == XDR_IPv6) {
++ len = 16;
++ } else if (*(xdr->stream) == XDR_STRING) {
++ if ((err = xdr_recv (xdr->fd, (xdr->stream + 1), 2)) < 0)
++ return err;
++ if (err == 0)
++ return -EPROTO;
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++ } else if (*(xdr->stream) == XDR_RAW) {
++ if ((err = xdr_recv (xdr->fd, (xdr->stream + 1), 2)) < 0)
++ return err;
++ if (err == 0)
++ return -EPROTO;
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++ } else if (*(xdr->stream) == XDR_LIST_START) {
++ xdr->curloc = 0;
++ return 0;
++ } else if (*(xdr->stream) == XDR_LIST_STOP) {
++ xdr->curloc = 0;
++ return 0;
++ } else {
++ return -1;
++ }
++
++ /* grow buffer if need be. */
++ if (xdr->curloc + len > xdr->length) {
++ uint8_t *c;
++ c = xdr_realloc (xdr->stream, xdr->curloc + len, xdr->length);
++ if (c == NULL)
++ return -ENOMEM;
++ xdr->stream = c;
++ xdr->length = xdr->curloc + len;
++ }
++
++ if (len > 0) {
++ if ((err =
++ xdr_recv (xdr->fd, (xdr->stream + xdr->curloc), len)) < 0)
++ return err;
++ if (err == 0)
++ return -EPROTO;
++ }
++ xdr->curloc = 0;
++ return 0;
++}
++
++int
++xdr_dec_uint64 (xdr_dec_t * xdr, uint64_t * i)
++{
++ int err;
++ if (xdr == NULL || i == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_UINT64)
++ return -ENOMSG;
++ *i = be64_to_cpu (*((uint64_t *) (xdr->stream + 1)));
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_uint32 (xdr_dec_t * xdr, uint32_t * i)
++{
++ int err;
++ if (xdr == NULL || i == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_UINT32)
++ return -ENOMSG;
++ *i = be32_to_cpu (*((uint32_t *) (xdr->stream + 1)));
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_uint16 (xdr_dec_t * xdr, uint16_t * i)
++{
++ int err;
++ if (xdr == NULL || i == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_UINT16)
++ return -ENOMSG;
++ *i = be16_to_cpu (*((uint16_t *) (xdr->stream + 1)));
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_uint8 (xdr_dec_t * xdr, uint8_t * i)
++{
++ int err;
++ if (xdr == NULL || i == NULL)
++ return -EINVAL;
++
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_UINT8)
++ return -ENOMSG;
++ *i = *((uint8_t *) (xdr->stream + 1));
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_ipv6 (xdr_dec_t * xdr, struct in6_addr *ip)
++{
++ int err;
++ if (xdr == NULL || ip == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_IPv6)
++ return -ENOMSG;
++ memcpy (ip, xdr->stream + 1, 16);
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++/* mallocing version */
++int
++xdr_dec_raw_m (xdr_dec_t * xdr, void **p, uint16_t * l)
++{
++ int len;
++ void *str;
++ int err;
++
++ if (xdr == NULL || p == NULL || l == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_RAW)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ str = kmalloc (len, GFP_KERNEL);
++ if (str == NULL)
++ return -ENOMEM;
++ memcpy (str, (xdr->stream + xdr->curloc), len);
++ xdr->curloc += len;
++
++ *p = str;
++ *l = len;
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++/* non-mallocing version */
++int
++xdr_dec_raw (xdr_dec_t * xdr, void *p, uint16_t * l)
++{
++ int len;
++ int err;
++
++ if (xdr == NULL || p == NULL || l == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_RAW)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ if (len > *l)
++ return -1;
++
++ memcpy (p, (xdr->stream + xdr->curloc), len);
++ xdr->curloc += len;
++
++ *l = len;
++
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++/**
++ * xdr_dec_raw_ag - auto-growing version
++ * @xdr:
++ * @p: <> pointer to buffer
++ * @bl: <> size of the buffer
++ * @rl: > size of data read from stream
++ *
++ * This form of xdr_dec_raw will increase the size of a pre-malloced buffer
++ * to fit the data it is reading. It is kind of a merger of the
++ * non-mallocing and mallocing versions.
++ *
++ * Returns: int
++ */
++int
++xdr_dec_raw_ag (xdr_dec_t * xdr, void **p, uint16_t * bl, uint16_t * rl)
++{
++ int len;
++ int err;
++
++ if (xdr == NULL || p == NULL || bl == NULL || rl == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_RAW)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ if (len > *bl) { /* grow p */
++ void *temp;
++ temp = xdr_realloc (*p, len, *bl);
++ if (temp == NULL)
++ return -ENOMEM;
++ *bl = len;
++ *p = temp;
++ }
++
++ memcpy (*p, (xdr->stream + xdr->curloc), len);
++ xdr->curloc += len;
++
++ *rl = len;
++
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
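++
++/* A short sketch of the auto-growing pattern, assuming the buffer is
++ * reused across many decodes and kfree'd when the caller is done:
++ *
++ * void *buf = NULL;
++ * uint16_t buflen = 0, readlen;
++ * while (more_messages) // hypothetical loop condition
++ * xdr_dec_raw_ag (xdr, &buf, &buflen, &readlen);
++ */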
++
++/* mallocing version */
++int
++xdr_dec_string (xdr_dec_t * xdr, uint8_t ** strp)
++{
++ int len;
++ char *str;
++ int err;
++ if (xdr == NULL || strp == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_STRING)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ if (len > 0) {
++ str = kmalloc (len + 1, GFP_KERNEL);
++ if (str == NULL)
++ return -ENOMEM;
++ str[len] = '\0';
++ memcpy (str, (xdr->stream + xdr->curloc), len);
++ xdr->curloc += len;
++
++ *strp = str;
++ } else {
++ *strp = NULL;
++ }
++
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++/* non-mallocing version */
++int
++xdr_dec_string_nm (xdr_dec_t * xdr, uint8_t * string, size_t l)
++{
++ int len;
++ int err;
++ if (xdr == NULL || string == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_STRING)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ if (len > 0) {
++ memcpy (string, (xdr->stream + xdr->curloc), MIN (len, l));
++ if (l > len) {
++ string[len] = '\0';
++ }
++ string[l - 1] = '\0';
++ } else {
++ string[0] = '\0';
++ }
++
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_string_ag (xdr_dec_t * xdr, uint8_t ** s, uint16_t * bl)
++{
++ int len;
++ int err;
++ if (xdr == NULL || s == NULL || bl == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_STRING)
++ return -ENOMSG;
++ xdr->curloc = 1;
++
++ len = be16_to_cpu (*((uint16_t *) (xdr->stream + xdr->curloc)));
++ xdr->curloc += 2;
++
++ if (len == 0) { /* empty string */
++ **s = '\0';
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++ }
++
++ if (len >= *bl) { /* grow s */
++ void *temp;
++ temp = xdr_realloc (*s, len + 1, *bl);
++ if (temp == NULL)
++ return -ENOMEM;
++ *bl = len + 1;
++ *s = temp;
++ }
++
++ memcpy (*s, (xdr->stream + xdr->curloc), len);
++ (*s)[len] = '\0';
++
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_list_start (xdr_dec_t * xdr)
++{
++ int err;
++ if (xdr == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_LIST_START)
++ return -ENOMSG;
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
++
++int
++xdr_dec_list_stop (xdr_dec_t * xdr)
++{
++ int err;
++ if (xdr == NULL)
++ return -EINVAL;
++ if (*(xdr->stream) == XDR_NULL) {
++ if ((err = get_next (xdr)) != 0)
++ return err;
++ }
++ if (*(xdr->stream) != XDR_LIST_STOP)
++ return -ENOMSG;
++ /* read the item out, mark that */
++ *(xdr->stream) = XDR_NULL;
++ return 0;
++}
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/xdr_io.c linux-patched/fs/gfs_locking/lock_gulm/xdr_io.c
+--- linux-orig/fs/gfs_locking/lock_gulm/xdr_io.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/xdr_io.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,169 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * does the lowest level of reads and writes.
++ * In kernel and/or userspace.
++ */
++
++#include "xdr.h"
++
++#ifdef __KERNEL__
++#ifdef __linux__
++#include <linux/net.h>
++#include <linux/in.h>
++#include <linux/socket.h>
++#include <net/sock.h>
++#include "asm/uaccess.h"
++
++/**
++ * do_tfer - transfers data over a socket
++ * @sock: < socket
++ * @iov: <> iovec of buffers
++ * @n: < how many iovecs
++ * @size: < total data size to send/recv
++ * @dir: < nonzero to send, zero to recv
++ *
++ * Returns: <0: Error
++ * >=0: Bytes transferred
++ */
++static int
++do_tfer (struct socket *sock, struct iovec *iov, int n, int size, int dir)
++{
++ unsigned long flags;
++ sigset_t oldset;
++ struct msghdr m;
++ mm_segment_t fs;
++ int rv, moved = 0;
++
++ fs = get_fs ();
++ set_fs (get_ds ());
++
++ /* XXX do I still want the signal stuff? */
++ spin_lock_irqsave (&current->sighand->siglock, flags);
++ oldset = current->blocked;
++ siginitsetinv (&current->blocked,
++ sigmask (SIGKILL) | sigmask (SIGTERM));
++ recalc_sigpending ();
++ spin_unlock_irqrestore (&current->sighand->siglock, flags);
++
++ memset (&m, 0, sizeof (struct msghdr));
++ for (;;) {
++ m.msg_iov = iov;
++ m.msg_iovlen = n;
++ m.msg_flags = MSG_NOSIGNAL;
++
++ if (dir)
++ rv = sock_sendmsg (sock, &m, size - moved);
++ else
++ rv = sock_recvmsg (sock, &m, size - moved, 0);
++
++ if (rv <= 0)
++ goto out_err;
++ moved += rv;
++
++ if (moved >= size)
++ break;
++
++ /* adjust iov's for next transfer */
++ while (iov->iov_len == 0) {
++ iov++;
++ n--;
++ }
++
++ }
++ rv = moved;
++ out_err:
++ spin_lock_irqsave (&current->sighand->siglock, flags);
++ current->blocked = oldset;
++ recalc_sigpending ();
++ spin_unlock_irqrestore (&current->sighand->siglock, flags);
++
++ set_fs (fs);
++
++ return rv;
++}
++
++size_t
++xdr_send (struct socket * sock, void *buf, size_t size)
++{
++ struct iovec iov;
++ int res;
++
++ iov.iov_base = buf;
++ iov.iov_len = size;
++
++ res = do_tfer (sock, &iov, 1, size, 1);
++
++ return res;
++}
++
++size_t
++xdr_recv (struct socket * sock, void *buf, size_t size)
++{
++ struct iovec iov;
++ int res;
++
++ iov.iov_base = buf;
++ iov.iov_len = size;
++
++ res = do_tfer (sock, &iov, 1, size, 0);
++
++ return res;
++}
++
++#endif /*__linux__*/
++#else /*__KERNEL__*/
++
++#include <errno.h>
++#include <sys/types.h>
++#include <sys/socket.h>
++
++ssize_t
++xdr_recv (int fd, void *buf, size_t len)
++{
++ ssize_t cnt = 0;
++ size_t ttl = 0;
++ while (len > 0) {
++ cnt = recv (fd, buf, len, 0);
++ if (cnt == 0)
++ return 0;
++ if (cnt < 0)
++ return -errno;
++ len -= cnt;
++ buf += cnt;
++ ttl += cnt;
++ }
++ return ttl;
++}
++
++ssize_t
++xdr_send (int fd, void *buf, size_t len)
++{
++ ssize_t cnt = 0;
++ size_t ttl = 0;
++ while (len > 0) {
++ cnt = send (fd, buf, len, 0);
++ if (cnt == 0)
++ return 0;
++ if (cnt < 0)
++ return -errno;
++ len -= cnt;
++ buf += cnt;
++ ttl += cnt;
++ }
++ return ttl;
++}
++
++#endif /*__KERNEL__*/
+diff -urN linux-orig/fs/gfs_locking/lock_gulm/xdr_socket.c linux-patched/fs/gfs_locking/lock_gulm/xdr_socket.c
+--- linux-orig/fs/gfs_locking/lock_gulm/xdr_socket.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_gulm/xdr_socket.c 2004-06-16 12:03:21.959894533 -0500
+@@ -0,0 +1,82 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++ * This file opens and closes a socket.
++ * In kernel and/or userspace.
++ */
++
++#include "xdr.h"
++
++#ifdef __KERNEL__
++#ifdef __linux__
++
++int
++xdr_open (xdr_socket * xsk)
++{
++ return sock_create (AF_INET6, SOCK_STREAM, 0, xsk);
++}
++
++int
++xdr_connect (struct sockaddr_in6 *adr, xdr_socket xsk)
++{
++ return xsk->ops->connect (xsk,
++ (struct sockaddr *) adr,
++ sizeof (struct sockaddr_in6), 0);
++}
++
++void
++xdr_close (xdr_socket * xsk)
++{
++ if (*xsk == NULL)
++ return;
++ sock_release (*xsk);
++ *xsk = NULL;
++}
++
++#endif /*__linux__*/
++#else /*__KERNEL__*/
++
++int
++xdr_open (xdr_socket * xsk)
++{
++ int sk;
++ sk = socket (AF_INET6, SOCK_STREAM, 0);
++ if (sk < 0)
++ return -errno;
++ *xsk = sk;
++ return 0;
++}
++
++int
++xdr_connect (struct sockaddr_in6 *adr, xdr_socket xsk)
++{
++ int err;
++ err =
++ connect (xsk, (struct sockaddr *) adr,
++ sizeof (struct sockaddr_in6));
++ if (err < 0)
++ return -errno;
++ return 0;
++}
++
++void
++xdr_close (xdr_socket * xsk)
++{
++ if (*xsk < 0)
++ return;
++ close (*xsk);
++ *xsk = -1;
++}
++
++#endif /*__KERNEL__*/
+diff -urN linux-orig/fs/gfs_locking/lock_harness/main.c linux-patched/fs/gfs_locking/lock_harness/main.c
+--- linux-orig/fs/gfs_locking/lock_harness/main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_harness/main.c 2004-06-16 12:03:10.006671787 -0500
+@@ -0,0 +1,226 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/wait.h>
++#include <linux/sched.h>
++#include <linux/kmod.h>
++#include <linux/lm_interface.h>
++
++#define RELEASE_NAME "<CVS>"
++
++struct lmh_wrapper {
++ struct list_head lw_list;
++ struct lm_lockops *lw_ops;
++};
++
++static struct semaphore lmh_lock;
++static struct list_head lmh_list;
++
++/**
++ * lm_register_proto - Register a low-level locking protocol
++ * @proto: the protocol definition
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++lm_register_proto(struct lm_lockops *proto)
++{
++ struct list_head *tmp, *head;
++ struct lmh_wrapper *lw;
++
++ down(&lmh_lock);
++
++ for (head = &lmh_list, tmp = head->next; tmp != head; tmp = tmp->next) {
++ lw = list_entry(tmp, struct lmh_wrapper, lw_list);
++
++ if (strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name) == 0) {
++ up(&lmh_lock);
++ printk("lock_harness: protocol %s already exists\n",
++ proto->lm_proto_name);
++ return -EEXIST;
++ }
++ }
++
++ lw = kmalloc(sizeof (struct lmh_wrapper), GFP_KERNEL);
++ if (!lw) {
++ up(&lmh_lock);
++ return -ENOMEM;
++ }
++ memset(lw, 0, sizeof (struct lmh_wrapper));
++
++ lw->lw_ops = proto;
++ list_add(&lw->lw_list, &lmh_list);
++
++ up(&lmh_lock);
++
++ return 0;
++}
++
++/**
++ * lm_unregister_proto - Unregister a low-level locking protocol
++ * @proto: the protocol definition
++ *
++ */
++
++void
++lm_unregister_proto(struct lm_lockops *proto)
++{
++ struct list_head *tmp, *head;
++ struct lmh_wrapper *lw = NULL;
++
++ down(&lmh_lock);
++
++ for (head = &lmh_list, tmp = head->next; tmp != head; tmp = tmp->next) {
++ lw = list_entry(tmp, struct lmh_wrapper, lw_list);
++
++ if (strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name) == 0) {
++ list_del(&lw->lw_list);
++ up(&lmh_lock);
++ kfree(lw);
++ return;
++ }
++ }
++
++ up(&lmh_lock);
++
++ printk("lock_harness: can't unregister lock protocol %s\n",
++ proto->lm_proto_name);
++}
++
++/**
++ * lm_mount - Mount a lock protocol
++ * @proto_name: the name of the protocol
++ * @table_name: the name of the lock space
++ * @host_data: data specific to this host
++ * @cb: the callback to the code using the lock module
++ * @fsdata: data to pass back with the callback
++ * @min_lvb_size: the minimum LVB size that the caller can deal with
++ * @lockstruct: a structure returned describing the mount
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int
++lm_mount(char *proto_name, char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t *fsdata,
++ unsigned int min_lvb_size, struct lm_lockstruct *lockstruct)
++{
++ struct list_head *tmp;
++ struct lmh_wrapper *lw = NULL;
++ int try = 0;
++ int error;
++
++ retry:
++ down(&lmh_lock);
++
++ for (tmp = lmh_list.next; tmp != &lmh_list; tmp = tmp->next) {
++ lw = list_entry(tmp, struct lmh_wrapper, lw_list);
++
++ if (strcmp(lw->lw_ops->lm_proto_name, proto_name) == 0)
++ break;
++ else
++ lw = NULL;
++ }
++
++ if (!lw) {
++ if (!try && capable(CAP_SYS_MODULE)) {
++ try = 1;
++ up(&lmh_lock);
++ request_module(proto_name);
++ goto retry;
++ }
++ printk("lock_harness: can't find protocol %s\n", proto_name);
++ error = -ENOENT;
++ goto out;
++ }
++
++ if (!try_module_get(lw->lw_ops->lm_owner)) {
++ try = 0;
++ up(&lmh_lock);
++ /* the module is presumably being unloaded; wait a
++    second and retry the lookup */
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ schedule_timeout(HZ);
++ goto retry;
++ }
++
++ error = lw->lw_ops->lm_mount(table_name, host_data,
++ cb, fsdata, min_lvb_size, lockstruct);
++ if (error)
++ module_put(lw->lw_ops->lm_owner);
++
++ out:
++ up(&lmh_lock);
++
++ return error;
++}
++
++/**
++ * lm_unmount - unmount a lock module
++ * @lockstruct: the lockstruct passed into mount
++ *
++ */
++
++void
++lm_unmount(struct lm_lockstruct *lockstruct)
++{
++ down(&lmh_lock);
++ lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
++ if (lockstruct->ls_ops->lm_owner)
++ module_put(lockstruct->ls_ops->lm_owner);
++ up(&lmh_lock);
++}
++
++/**
++ * init_lmh - Initialize the lock module harness
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int __init
++init_lmh(void)
++{
++ init_MUTEX(&lmh_lock);
++ INIT_LIST_HEAD(&lmh_list);
++
++ printk("Lock_Harness %s (built %s %s) installed\n",
++ RELEASE_NAME, __DATE__, __TIME__);
++
++ return 0;
++}
++
++/**
++ * exit_lmh - cleanup the Lock Module Harness
++ *
++ */
++
++void __exit
++exit_lmh(void)
++{
++}
++
++module_init(init_lmh);
++module_exit(exit_lmh);
++
++MODULE_DESCRIPTION("GFS Lock Module Harness " RELEASE_NAME);
++MODULE_AUTHOR("Red Hat, Inc.");
++MODULE_LICENSE("GPL");
++
++EXPORT_SYMBOL_GPL(lm_register_proto);
++EXPORT_SYMBOL_GPL(lm_unregister_proto);
++EXPORT_SYMBOL_GPL(lm_mount);
++EXPORT_SYMBOL_GPL(lm_unmount);
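+
+(Not part of the patch: a sketch of how a filesystem might drive the harness
+above, based on the lm_interface.h definitions that follow. The callback body,
+the table name "mycluster:myfs", and the LVB size of 32 are hypothetical.)
+
+	#include <linux/lm_interface.h>
+
+	static void example_cb(lm_fsdata_t *fsdata, unsigned int type, void *data)
+	{
+		/* react here to LM_CB_NEED_*, LM_CB_NEED_RECOVERY,
+		   LM_CB_DROPLOCKS, and LM_CB_ASYNC requests from the
+		   lock module */
+	}
+
+	static int example_mount_locking(struct lm_lockstruct *ls)
+	{
+		int error;
+
+		/* loads and binds the named protocol module if necessary */
+		error = lm_mount("lock_nolock", "mycluster:myfs", "",
+				 example_cb, NULL, 32, ls);
+		if (error)
+			return error;
+
+		/* lock operations now go through ls->ls_ops against
+		   ls->ls_lockspace; when finished: lm_unmount(ls) */
+		return 0;
+	}
+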
+diff -urN linux-orig/include/linux/lm_interface.h linux-patched/include/linux/lm_interface.h
+--- linux-orig/include/linux/lm_interface.h 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/include/linux/lm_interface.h 2004-06-16 12:03:10.005672019 -0500
+@@ -0,0 +1,193 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++/*
++
++ Sooner or later, I need to put all the documentation back into this file.
++ In the meantime, here are some notes.
++
++ - The lock module is now responsible for STOMITHing an expired
++ client before calling the callback with type LM_CB_NEED_RECOVERY.
++
++ - If the mount() operation returns first == TRUE, GFS will check all the
++ journals. GFS itself can't/shouldn't stomith the machines, so the lock module
++ needs to make sure that there are no zombie machines on any of the
++ journals. (I.e., this should probably happen on the first mount of the lock
++ space, while all mounts by other machines are blocked.) GFS will call
++ others_may_mount() when the filesystem is in a consistent state.
++
++ - GFS can issue multiple simultaneous get_lock()s for the same lockname.
++ The lock module needs to deal with this, either by 1) building a hash table
++ to look up the structures and keeping a reference count so there is only
++ one lm_lock_t for a given lockname, or 2) just dealing with multiple
++ lm_lock_t structures for a given lockname. (A sketch of approach 1 follows
++ this header.)
++
++*/
++
++#ifndef __LM_INTERFACE_DOT_H__
++#define __LM_INTERFACE_DOT_H__
++
++typedef void lm_lockspace_t;
++typedef void lm_lock_t;
++typedef void lm_fsdata_t;
++typedef void (*lm_callback_t) (lm_fsdata_t *fsdata, unsigned int type,
++ void *data);
++
++/* Flags for the struct lm_lockstruct->ls_flags field */
++
++#define LM_LSFLAG_LOCAL (0x00000001)
++#define LM_LSFLAG_ASYNC (0x00000002)
++
++/* Lock types */
++
++#define LM_TYPE_RESERVED (0x00)
++#define LM_TYPE_NONDISK (0x01)
++#define LM_TYPE_INODE (0x02)
++#define LM_TYPE_RGRP (0x03)
++#define LM_TYPE_META (0x04)
++#define LM_TYPE_IOPEN (0x05)
++#define LM_TYPE_FLOCK (0x06)
++#define LM_TYPE_PLOCK (0x07)
++#define LM_TYPE_QUOTA (0x08)
++
++/* States passed to lock() */
++
++#define LM_ST_UNLOCKED (0)
++#define LM_ST_EXCLUSIVE (1)
++#define LM_ST_DEFERRED (2)
++#define LM_ST_SHARED (3)
++
++/* Flags passed to lock() */
++
++#define LM_FLAG_TRY (0x00000001)
++#define LM_FLAG_TRY_1CB (0x00000002)
++#define LM_FLAG_NOEXP (0x00000004)
++#define LM_FLAG_ANY (0x00000008)
++#define LM_FLAG_PRIORITY (0x00000010)
++
++/* Flags returned by lock() */
++
++#define LM_OUT_ST_MASK (0x00000003)
++#define LM_OUT_CACHEABLE (0x00000004)
++#define LM_OUT_CANCELED (0x00000008)
++#define LM_OUT_NEED_E (0x00000010)
++#define LM_OUT_NEED_D (0x00000020)
++#define LM_OUT_NEED_S (0x00000040)
++#define LM_OUT_ASYNC (0x00000080)
++#define LM_OUT_LVB_INVALID (0x00000100)
++
++/* Callback types */
++
++#define LM_CB_NEED_E (257)
++#define LM_CB_NEED_D (258)
++#define LM_CB_NEED_S (259)
++#define LM_CB_NEED_RECOVERY (260)
++#define LM_CB_DROPLOCKS (261)
++#define LM_CB_ASYNC (262)
++
++/* Reset_exp messages */
++
++#define LM_RD_GAVEUP (308)
++#define LM_RD_SUCCESS (309)
++
++struct lm_lockname {
++ uint64_t ln_number;
++ unsigned int ln_type;
++};
++
++#define lm_name_equal(name1, name2) \
++(((name1)->ln_number == (name2)->ln_number) && \
++ ((name1)->ln_type == (name2)->ln_type))
++
++struct lm_async_cb {
++ struct lm_lockname lc_name;
++ int lc_ret;
++};
++
++struct lm_lockstruct;
++
++struct lm_lockops {
++ char lm_proto_name[256];
++
++ /* Mount/Unmount */
++
++ int (*lm_mount) (char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t *fsdata,
++ unsigned int min_lvb_size,
++ struct lm_lockstruct *lockstruct);
++ void (*lm_others_may_mount) (lm_lockspace_t *lockspace);
++ void (*lm_unmount) (lm_lockspace_t *lockspace);
++
++ /* Lock oriented operations */
++
++ int (*lm_get_lock) (lm_lockspace_t *lockspace,
++ struct lm_lockname *name, lm_lock_t **lockp);
++ void (*lm_put_lock) (lm_lock_t *lock);
++
++ unsigned int (*lm_lock) (lm_lock_t *lock, unsigned int cur_state,
++ unsigned int req_state, unsigned int flags);
++ unsigned int (*lm_unlock) (lm_lock_t *lock, unsigned int cur_state);
++
++ void (*lm_cancel) (lm_lock_t *lock);
++
++ int (*lm_hold_lvb) (lm_lock_t *lock, char **lvbp);
++ void (*lm_unhold_lvb) (lm_lock_t *lock, char *lvb);
++ void (*lm_sync_lvb) (lm_lock_t *lock, char *lvb);
++
++ /* Posix Lock oriented operations */
++
++ int (*lm_plock_get) (lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t *start, uint64_t *end, int *exclusive,
++ unsigned long *rowner);
++
++ int (*lm_plock) (lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ int wait, int exclusive, uint64_t start,
++ uint64_t end);
++
++ int (*lm_punlock) (lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t start, uint64_t end);
++
++ /* Client oriented operations */
++
++ void (*lm_recovery_done) (lm_lockspace_t *lockspace, unsigned int jid,
++ unsigned int message);
++
++ struct module *lm_owner;
++};
++
++struct lm_lockstruct {
++ unsigned int ls_jid;
++ unsigned int ls_first;
++ unsigned int ls_lvb_size;
++ lm_lockspace_t *ls_lockspace;
++ struct lm_lockops *ls_ops;
++ int ls_flags;
++};
++
++/* Bottom interface */
++
++int lm_register_proto(struct lm_lockops *proto);
++void lm_unregister_proto(struct lm_lockops *proto);
++
++/* Top interface */
++
++int lm_mount(char *proto_name,
++ char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t *fsdata,
++ unsigned int min_lvb_size, struct lm_lockstruct *lockstruct);
++void lm_unmount(struct lm_lockstruct *lockstruct);
++
++#endif /* __LM_INTERFACE_DOT_H__ */
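+
+(Not part of the patch: a sketch of approach 1 from the notes above -- one
+lm_lock_t per lockname via a hash table plus reference count. All ex_* names
+are hypothetical, the table is a fixed 256 buckets, and the lookup/insert race
+flagged in the comment is left unhandled for brevity.)
+
+	struct ex_lock {
+		struct list_head el_list;
+		struct lm_lockname el_name;
+		unsigned int el_count;		/* get_lock() references */
+		/* protocol-specific state would follow */
+	};
+
+	static struct list_head ex_hash[256];	/* INIT_LIST_HEAD() each at init */
+	static spinlock_t ex_hash_lock = SPIN_LOCK_UNLOCKED;
+
+	static unsigned int ex_hash_fn(struct lm_lockname *name)
+	{
+		return ((unsigned int)name->ln_number ^ name->ln_type) & 0xFF;
+	}
+
+	static int ex_get_lock(lm_lockspace_t *lockspace,
+			       struct lm_lockname *name, lm_lock_t **lockp)
+	{
+		struct list_head *tmp, *head = &ex_hash[ex_hash_fn(name)];
+		struct ex_lock *el;
+
+		spin_lock(&ex_hash_lock);
+		for (tmp = head->next; tmp != head; tmp = tmp->next) {
+			el = list_entry(tmp, struct ex_lock, el_list);
+			if (lm_name_equal(&el->el_name, name)) {
+				el->el_count++;
+				spin_unlock(&ex_hash_lock);
+				*lockp = (lm_lock_t *)el;
+				return 0;
+			}
+		}
+		spin_unlock(&ex_hash_lock);
+
+		el = kmalloc(sizeof(struct ex_lock), GFP_KERNEL);
+		if (!el)
+			return -ENOMEM;
+		memset(el, 0, sizeof(struct ex_lock));
+		el->el_name = *name;
+		el->el_count = 1;
+
+		spin_lock(&ex_hash_lock);
+		list_add(&el->el_list, head);	/* a real module would re-check
+						   for a racing insert here */
+		spin_unlock(&ex_hash_lock);
+
+		*lockp = (lm_lock_t *)el;
+		return 0;
+	}
+
+	static void ex_put_lock(lm_lock_t *lock)
+	{
+		struct ex_lock *el = (struct ex_lock *)lock;
+
+		spin_lock(&ex_hash_lock);
+		if (!--el->el_count) {
+			list_del(&el->el_list);
+			spin_unlock(&ex_hash_lock);
+			kfree(el);
+			return;
+		}
+		spin_unlock(&ex_hash_lock);
+	}
+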
+diff -urN linux-orig/fs/gfs_locking/lock_nolock/main.c linux-patched/fs/gfs_locking/lock_nolock/main.c
+--- linux-orig/fs/gfs_locking/lock_nolock/main.c 1969-12-31 18:00:00.000000000 -0600
++++ linux-patched/fs/gfs_locking/lock_nolock/main.c 2004-06-16 12:03:13.918762838 -0500
+@@ -0,0 +1,350 @@
++/******************************************************************************
++*******************************************************************************
++**
++** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
++** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
++**
++** This copyrighted material is made available to anyone wishing to use,
++** modify, copy, or redistribute it subject to the terms and conditions
++** of the GNU General Public License v.2.
++**
++*******************************************************************************
++******************************************************************************/
++
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/lm_interface.h>
++
++#define RELEASE_NAME "<CVS>"
++
++struct nolock_lockspace {
++ unsigned int nl_lvb_size;
++};
++
++struct lm_lockops nolock_ops;
++
++/**
++ * nolock_mount - mount a nolock lockspace
++ * @table_name: the name of the space to mount
++ * @host_data: host specific data
++ * @cb: the callback to the code using the lock module
++ * @fsdata: data to pass back with the callback
++ * @min_lvb_size: the minimum LVB size that the caller can deal with
++ * @lockstruct: the structure to fill in describing the mount
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++nolock_mount(char *table_name, char *host_data,
++ lm_callback_t cb, lm_fsdata_t *fsdata,
++ unsigned int min_lvb_size, struct lm_lockstruct *lockstruct)
++{
++ char *c;
++ unsigned int jid;
++ struct nolock_lockspace *nl;
++
++ /* If there is a "jid=" in the hostdata, use that jid.
++ Otherwise, use zero. */
++
++ c = strstr(host_data, "jid=");
++ if (!c)
++ jid = 0;
++ else {
++ c += 4;
++ sscanf(c, "%u", &jid);
++ }
++
++ nl = kmalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
++ if (!nl)
++ return -ENOMEM;
++
++ memset(nl, 0, sizeof(struct nolock_lockspace));
++ nl->nl_lvb_size = min_lvb_size;
++
++ lockstruct->ls_jid = jid;
++ lockstruct->ls_first = 1;
++ lockstruct->ls_lvb_size = min_lvb_size;
++ lockstruct->ls_lockspace = (lm_lockspace_t *)nl;
++ lockstruct->ls_ops = &nolock_ops;
++ lockstruct->ls_flags = LM_LSFLAG_LOCAL | LM_LSFLAG_ASYNC;
++
++ return 0;
++}
++
++/**
++ * nolock_others_may_mount - allow other nodes to mount the lock space
++ * @lockspace: the lockspace
++ *
++ */
++
++static void
++nolock_others_may_mount(lm_lockspace_t *lockspace)
++{
++}
++
++/**
++ * nolock_unmount - unmount a lock space
++ * @lockspace: the lockspace to unmount
++ *
++ */
++
++static void
++nolock_unmount(lm_lockspace_t *lockspace)
++{
++ struct nolock_lockspace *nl = (struct nolock_lockspace *)lockspace;
++ kfree(nl);
++}
++
++/**
++ * nolock_get_lock - get an lm_lock_t given a description of the lock
++ * @lockspace: the lockspace the lock lives in
++ * @name: the name of the lock
++ * @lockp: return the lm_lock_t here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++nolock_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
++ lm_lock_t **lockp)
++{
++ /* no per-lock state is needed, so hand back the lockspace itself */
++ *lockp = (lm_lock_t *)lockspace;
++ return 0;
++}
++
++/**
++ * nolock_put_lock - get rid of a lock structure
++ * @lock: the lock to throw away
++ *
++ */
++
++static void
++nolock_put_lock(lm_lock_t *lock)
++{
++}
++
++/**
++ * nolock_lock - acquire a lock
++ * @lock: the lock to manipulate
++ * @cur_state: the current state
++ * @req_state: the requested state
++ * @flags: modifier flags
++ *
++ * Returns: A bitmap of LM_OUT_*
++ */
++
++static unsigned int
++nolock_lock(lm_lock_t *lock, unsigned int cur_state, unsigned int req_state,
++ unsigned int flags)
++{
++ return req_state | LM_OUT_CACHEABLE;
++}
++
++/**
++ * nolock_unlock - unlock a lock
++ * @lock: the lock to manipulate
++ * @cur_state: the current state
++ *
++ * Returns: 0
++ */
++
++static unsigned int
++nolock_unlock(lm_lock_t *lock, unsigned int cur_state)
++{
++ return 0;
++}
++
++/**
++ * nolock_cancel - cancel a request on a lock
++ * @lock: the lock to cancel request for
++ *
++ */
++
++static void
++nolock_cancel(lm_lock_t *lock)
++{
++}
++
++/**
++ * nolock_hold_lvb - hold on to a lock value block
++ * @lock: the lock the LVB is associated with
++ * @lvbp: return the LVB buffer here
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++static int
++nolock_hold_lvb(lm_lock_t *lock, char **lvbp)
++{
++ /* the "lock" is really the lockspace; see nolock_get_lock() */
++ struct nolock_lockspace *nl = (struct nolock_lockspace *)lock;
++ int error = 0;
++
++ *lvbp = kmalloc(nl->nl_lvb_size, GFP_KERNEL);
++ if (*lvbp)
++ memset(*lvbp, 0, nl->nl_lvb_size);
++ else
++ error = -ENOMEM;
++
++ return error;
++}
++
++/**
++ * nolock_unhold_lvb - release a LVB
++ * @lock: the lock the LVB is associated with
++ * @lvb: the lock value block
++ *
++ */
++
++static void
++nolock_unhold_lvb(lm_lock_t *lock, char *lvb)
++{
++ kfree(lvb);
++}
++
++/**
++ * nolock_sync_lvb - sync out the value of an LVB
++ * @lock: the lock the LVB is associated with
++ * @lvb: the lock value block
++ *
++ */
++
++static void
++nolock_sync_lvb(lm_lock_t *lock, char *lvb)
++{
++}
++
++/**
++ * nolock_plock_get - get the details of a posix lock (not supported here)
++ * @lockspace: the lockspace
++ * @name: the name of the lock
++ * @owner: the owner of the posix lock
++ * @start: the start of the byte range
++ * @end: the end of the byte range
++ * @exclusive: whether the lock is exclusive
++ * @rowner: the owner of a conflicting lock
++ *
++ */
++
++static int
++nolock_plock_get(lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t *start, uint64_t *end, int *exclusive,
++ unsigned long *rowner)
++{
++ return -ENOSYS;
++}
++
++/**
++ * nolock_plock - acquire a posix lock (not supported here)
++ * @lockspace: the lockspace
++ * @name: the name of the lock
++ * @owner: the owner of the posix lock
++ * @wait: whether to wait for the lock
++ * @exclusive: whether the lock is exclusive
++ * @start: the start of the byte range
++ * @end: the end of the byte range
++ *
++ */
++
++static int
++nolock_plock(lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ int wait, int exclusive, uint64_t start,
++ uint64_t end)
++{
++ return -ENOSYS;
++}
++
++/**
++ * nolock_punlock - release a posix lock (not supported here)
++ * @lockspace: the lockspace
++ * @name: the name of the lock
++ * @owner: the owner of the posix lock
++ * @start: the start of the byte range
++ * @end: the end of the byte range
++ *
++ */
++
++static int
++nolock_punlock(lm_lockspace_t *lockspace,
++ struct lm_lockname *name, unsigned long owner,
++ uint64_t start, uint64_t end)
++{
++ return -ENOSYS;
++}
++
++/**
++ * nolock_recovery_done - reset the expired locks for a given jid
++ * @lockspace: the lockspace
++ * @jid: the jid
++ * @message: LM_RD_GAVEUP or LM_RD_SUCCESS
++ *
++ */
++
++static void
++nolock_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
++ unsigned int message)
++{
++}
++
++struct lm_lockops nolock_ops = {
++ .lm_proto_name = "lock_nolock",
++ .lm_mount = nolock_mount,
++ .lm_others_may_mount = nolock_others_may_mount,
++ .lm_unmount = nolock_unmount,
++ .lm_get_lock = nolock_get_lock,
++ .lm_put_lock = nolock_put_lock,
++ .lm_lock = nolock_lock,
++ .lm_unlock = nolock_unlock,
++ .lm_cancel = nolock_cancel,
++ .lm_hold_lvb = nolock_hold_lvb,
++ .lm_unhold_lvb = nolock_unhold_lvb,
++ .lm_sync_lvb = nolock_sync_lvb,
++ .lm_plock_get = nolock_plock_get,
++ .lm_plock = nolock_plock,
++ .lm_punlock = nolock_punlock,
++ .lm_recovery_done = nolock_recovery_done,
++ .lm_owner = THIS_MODULE,
++};
++
++/**
++ * init_nolock - Initialize the nolock module
++ *
++ * Returns: 0 on success, -EXXX on failure
++ */
++
++int __init
++init_nolock(void)
++{
++ int error;
++
++ error = lm_register_proto(&nolock_ops);
++ if (error) {
++ printk("lock_nolock: can't register protocol: %d\n", error);
++ return error;
++ }
++
++ printk("Lock_Nolock %s (built %s %s) installed\n",
++ RELEASE_NAME, __DATE__, __TIME__);
++
++ return 0;
++}
++
++/**
++ * exit_nolock - cleanup the nolock module
++ *
++ */
++
++void __exit
++exit_nolock(void)
++{
++ lm_unregister_proto(&nolock_ops);
++}
++
++module_init(init_nolock);
++module_exit(exit_nolock);
++
++MODULE_DESCRIPTION("GFS Nolock Locking Module " RELEASE_NAME);
++MODULE_AUTHOR("Red Hat, Inc.");
++MODULE_LICENSE("GPL");
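+
+(Not part of the patch: a sketch of how a caller might decode the bitmap that
+lm_lock() returns, per the LM_OUT_* definitions in lm_interface.h. The name
+example_try_shared is hypothetical; with lock_nolock the call always returns
+immediately with req_state | LM_OUT_CACHEABLE.)
+
+	static int example_try_shared(struct lm_lockstruct *ls, lm_lock_t *lock)
+	{
+		unsigned int ret, new_state;
+
+		ret = ls->ls_ops->lm_lock(lock, LM_ST_UNLOCKED,
+					  LM_ST_SHARED, LM_FLAG_TRY);
+
+		if (ret & LM_OUT_ASYNC)
+			return 0;	/* result arrives via LM_CB_ASYNC */
+
+		new_state = ret & LM_OUT_ST_MASK;	/* resulting hold state */
+		if (new_state != LM_ST_SHARED)
+			return -EAGAIN;	/* LM_FLAG_TRY request didn't succeed */
+
+		if (ret & LM_OUT_LVB_INVALID) {
+			/* the lock value block contents can't be trusted */
+		}
+		return 0;
+	}
+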