1 # Add DLM to the build system
2 diff -urN -p linux-2.6.7/cluster/Kconfig linux/cluster/Kconfig
3 --- linux-2.6.7/cluster/Kconfig 2004-06-17 15:00:36.000000000 +0800
4 +++ linux/cluster/Kconfig 2004-06-17 15:00:57.000000000 +0800
5 @@ -10,4 +10,22 @@ config CLUSTER
6 needed by all the other components. It provides membership services
7 for those other subsystems.
10 + tristate "Distributed Lock Manager"
13 + A fully distributed lock manager, providing cluster-wide locking services
14 + and protected lock namespaces for kernel and userland applications.
16 +config CLUSTER_DLM_PROCLOCKS
17 + boolean "/proc/locks support for DLM"
18 + depends on CLUSTER_DLM
21 + If this option is enabled a file will appear in /proc/cluster/dlm_locks.
22 + Write into this "file" the name of a lockspace known to the DLM and then
23 + read out a list of all the resources and locks in that lockspace that are
24 + known to the local node. Note that because the DLM is distributed this may not
25 + be the full lock picture.
28 diff -urN -p linux-2.6.7/cluster/Makefile linux/cluster/Makefile
29 --- linux-2.6.7/cluster/Makefile 2004-06-17 15:00:36.000000000 +0800
30 +++ linux/cluster/Makefile 2004-06-17 15:00:57.000000000 +0800
34 obj-$(CONFIG_CLUSTER) += cman/
35 +obj-$(CONFIG_CLUSTER_DLM) += dlm/
36 diff -urN -p linux-2.6.7/cluster/dlm/Makefile linux/cluster/dlm/Makefile
37 --- linux-2.6.7/cluster/dlm/Makefile 1970-01-01 07:30:00.000000000 +0730
38 +++ linux/cluster/dlm/Makefile 2004-06-17 15:00:57.000000000 +0800
62 +obj-$(CONFIG_CLUSTER_DLM) += dlm.o
63 diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
64 --- linux-orig/cluster/dlm/ast.c 1970-01-01 07:30:00.000000000 +0730
65 +++ linux-patched/cluster/dlm/ast.c 2004-07-13 18:57:22.000000000 +0800
67 +/******************************************************************************
68 +*******************************************************************************
70 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
71 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
73 +** This copyrighted material is made available to anyone wishing to use,
74 +** modify, copy, or redistribute it subject to the terms and conditions
75 +** of the GNU General Public License v.2.
77 +*******************************************************************************
78 +******************************************************************************/
81 + * This delivers ASTs and checks for dead remote requests and deadlocks.
84 +#include <linux/timer.h>
86 +#include "dlm_internal.h"
88 +#include "lockqueue.h"
92 +#include "lowcomms.h"
93 +#include "midcomms.h"
99 +/* Wake up flags for astd */
100 +#define GDLMD_WAKE_ASTS 1
101 +#define GDLMD_WAKE_TIMER 2
103 +static struct list_head _deadlockqueue;
104 +static struct semaphore _deadlockqueue_lock;
105 +static struct list_head _lockqueue;
106 +static struct semaphore _lockqueue_lock;
107 +static struct timer_list _lockqueue_timer;
108 +static struct list_head _ast_queue;
109 +static struct semaphore _ast_queue_lock;
110 +static wait_queue_head_t _astd_waitchan;
111 +static atomic_t _astd_running;
112 +static long _astd_pid;
113 +static unsigned long _astd_wakeflags;
114 +static struct completion _astd_done;
116 +void add_to_lockqueue(struct dlm_lkb *lkb)
118 + /* Time stamp the entry so we know if it's been waiting too long */
119 + lkb->lkb_lockqueue_time = jiffies;
121 + down(&_lockqueue_lock);
122 + list_add(&lkb->lkb_lockqueue, &_lockqueue);
123 + up(&_lockqueue_lock);
126 +void remove_from_lockqueue(struct dlm_lkb *lkb)
128 + down(&_lockqueue_lock);
129 + list_del(&lkb->lkb_lockqueue);
130 + up(&_lockqueue_lock);
133 +void add_to_deadlockqueue(struct dlm_lkb *lkb)
135 + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
137 + lkb->lkb_duetime = jiffies;
138 + down(&_deadlockqueue_lock);
139 + list_add(&lkb->lkb_deadlockq, &_deadlockqueue);
140 + up(&_deadlockqueue_lock);
143 +void remove_from_deadlockqueue(struct dlm_lkb *lkb)
145 + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
148 + down(&_deadlockqueue_lock);
149 + list_del(&lkb->lkb_deadlockq);
150 + up(&_deadlockqueue_lock);
152 + /* Invalidate the due time */
153 + memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime));
157 + * deliver an AST to a user
160 +static void deliver_ast(struct dlm_lkb *lkb, uint16_t ast_type)
162 + void (*cast) (long param) = lkb->lkb_astaddr;
163 + void (*bast) (long param, int mode) = lkb->lkb_bastaddr;
165 + if (ast_type == AST_BAST) {
168 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
170 + bast(lkb->lkb_astparam, (int) lkb->lkb_bastmode);
174 + cast(lkb->lkb_astparam);
179 + * Queue an AST for delivery, this will only deal with
180 + * kernel ASTs, usermode API will piggyback on top of this.
182 + * This can be called in either the user or DLM context.
183 + * ASTs are queued EVEN IF we are already running in dlm_astd
184 + * context as we don't know what other locks are held (eg we could
185 + * be being called from a lock operation that was called from
187 + * If the AST is to be queued remotely then a message is sent to
188 + * the target system via midcomms.
191 +void queue_ast(struct dlm_lkb *lkb, uint16_t flags, uint8_t rqmode)
193 + struct dlm_request req;
195 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
197 + * Send a message to have an ast queued remotely. Note: we do
198 + * not send remote completion asts, they are handled as part of
199 + * remote lock granting.
201 + if (flags & AST_BAST) {
202 + req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST;
203 + req.rr_header.rh_length = sizeof(req);
204 + req.rr_header.rh_flags = 0;
205 + req.rr_header.rh_lkid = lkb->lkb_id;
206 + req.rr_header.rh_lockspace =
207 + lkb->lkb_resource->res_ls->ls_global_id;
208 + req.rr_status = lkb->lkb_retstatus;
209 + req.rr_remlkid = lkb->lkb_remid;
210 + req.rr_rqmode = rqmode;
212 + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
213 + lkb->lkb_resource->res_ls->ls_allocation);
214 + } else if (lkb->lkb_retstatus == -EDEADLOCK) {
216 + * We only queue remote Completion ASTs here for error
217 + * completions that happen out of band.
218 + * DEADLOCK is one such.
220 + req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST;
221 + req.rr_header.rh_length = sizeof(req);
222 + req.rr_header.rh_flags = 0;
223 + req.rr_header.rh_lkid = lkb->lkb_id;
224 + req.rr_header.rh_lockspace =
225 + lkb->lkb_resource->res_ls->ls_global_id;
226 + req.rr_status = lkb->lkb_retstatus;
227 + req.rr_remlkid = lkb->lkb_remid;
228 + req.rr_rqmode = rqmode;
230 + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
231 + lkb->lkb_resource->res_ls->ls_allocation);
235 + * Prepare info that will be returned in ast/bast.
238 + if (flags & AST_BAST) {
239 + lkb->lkb_bastmode = rqmode;
241 + lkb->lkb_lksb->sb_status = lkb->lkb_retstatus;
243 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
244 + lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED;
246 + lkb->lkb_lksb->sb_flags = 0;
249 + down(&_ast_queue_lock);
250 + if (lkb->lkb_astflags & AST_DEL)
251 + log_print("queue_ast on deleted lkb %x ast %x pid %u",
252 + lkb->lkb_id, lkb->lkb_astflags, current->pid);
253 + if (!(lkb->lkb_astflags & (AST_COMP | AST_BAST)))
254 + list_add_tail(&lkb->lkb_astqueue, &_ast_queue);
255 + lkb->lkb_astflags |= flags;
256 + up(&_ast_queue_lock);
258 + /* It is the responsibility of the caller to call wake_astd()
259 + * after it has finished other locking operations that request
260 + * the ASTs to be delivered after */
265 + * Process any LKBs on the AST queue.
268 +static void process_asts(void)
270 + struct dlm_lkb *lkb;
274 + down(&_ast_queue_lock);
275 + if (list_empty(&_ast_queue)) {
276 + up(&_ast_queue_lock);
280 + lkb = list_entry(_ast_queue.next, struct dlm_lkb, lkb_astqueue);
281 + list_del(&lkb->lkb_astqueue);
282 + flags = lkb->lkb_astflags;
283 + lkb->lkb_astflags = 0;
284 + up(&_ast_queue_lock);
286 + if (flags & AST_COMP)
287 + deliver_ast(lkb, AST_COMP);
289 + if (flags & AST_BAST)
290 + deliver_ast(lkb, AST_BAST);
292 + if (flags & AST_DEL) {
293 + struct dlm_rsb *rsb = lkb->lkb_resource;
294 + struct dlm_ls *ls = rsb->res_ls;
296 + DLM_ASSERT(lkb->lkb_astflags == 0,
297 + printk("%x %x\n", lkb->lkb_id, lkb->lkb_astflags););
299 + down_read(&ls->ls_in_recovery);
300 + release_lkb(ls, lkb);
302 + up_read(&ls->ls_in_recovery);
309 +void lockqueue_lkb_mark(struct dlm_ls *ls)
311 + struct dlm_lkb *lkb, *safe;
314 + log_all(ls, "mark waiting requests");
316 + down(&_lockqueue_lock);
318 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
320 + if (lkb->lkb_resource->res_ls != ls)
324 + * These lkb's are new and the master is being looked up. Mark
325 + * the lkb request to be resent. Even if the destination node
326 + * for the request is still living and has our request, it will
327 + * purge all resdir requests in purge_requestqueue. If there's
328 + * a reply to the LOOKUP request in our requestqueue (the reply
329 + * arrived after ls_stop), it is invalid and will be discarded
330 + * in purge_requestqueue, too.
333 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
334 + DLM_ASSERT(lkb->lkb_nodeid == -1,
336 + print_rsb(lkb->lkb_resource););
338 + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
344 + * These lkb's have an outstanding request to a bygone node.
345 + * The request will be redirected to the new master node in
346 + * resend_cluster_requests(). Don't mark the request for
347 + * resending if there's a reply for it saved in the
351 + if (in_nodes_gone(ls, lkb->lkb_nodeid) &&
352 + !reply_in_requestqueue(ls, lkb->lkb_id)) {
354 + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
357 + * Don't rebuild this lkb on a new rsb in
358 + * rebuild_rsbs_send().
361 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONDGRANT) {
362 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_WAITING,
364 + print_rsb(lkb->lkb_resource););
365 + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
369 + * This flag indicates to the new master that his lkb
370 + * is in the midst of a convert request and should be
371 + * placed on the granted queue rather than the convert
372 + * queue. We will resend this convert request to the
376 + else if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONVERT) {
377 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,
379 + print_rsb(lkb->lkb_resource););
380 + lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT;
386 + up(&_lockqueue_lock);
388 + log_all(ls, "marked %d requests", count);
391 +int resend_cluster_requests(struct dlm_ls *ls)
393 + struct dlm_lkb *lkb, *safe;
394 + int error = 0, state, count = 0;
396 + log_all(ls, "resend marked requests");
398 + down(&_lockqueue_lock);
400 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
402 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
403 + log_debug(ls, "resend_cluster_requests: aborted");
408 + if (lkb->lkb_resource->res_ls != ls)
411 + log_debug(ls, "resend_cluster_requests id=%x nodeid=%d "
412 + "lqstate=%u flags=%x", lkb->lkb_id, lkb->lkb_nodeid,
413 + lkb->lkb_lockqueue_state, lkb->lkb_flags);
416 + * Resend/process the lockqueue lkb's (in-progress requests)
417 + * that were flagged at the start of recovery in
418 + * lockqueue_lkb_mark().
421 + if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) {
422 + lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND;
423 + lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD;
424 + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
426 + if (lkb->lkb_nodeid == -1) {
428 + * Send lookup to new resdir node.
430 + lkb->lkb_lockqueue_time = jiffies;
431 + send_cluster_request(lkb,
432 + lkb->lkb_lockqueue_state);
435 + else if (lkb->lkb_nodeid != 0) {
437 + * There's a new RSB master (that's not us.)
439 + lkb->lkb_lockqueue_time = jiffies;
440 + send_cluster_request(lkb,
441 + lkb->lkb_lockqueue_state);
446 + * We are the new RSB master for this lkb
449 + state = lkb->lkb_lockqueue_state;
450 + lkb->lkb_lockqueue_state = 0;
451 + /* list_del equals remove_from_lockqueue() */
452 + list_del(&lkb->lkb_lockqueue);
453 + process_remastered_lkb(ls, lkb, state);
459 + up(&_lockqueue_lock);
461 + log_all(ls, "resent %d requests", count);
466 + * Process any LKBs on the Lock queue, this
467 + * just looks at the entries to see if they have been
468 + * on the queue too long and fails the requests if so.
471 +static void process_lockqueue(void)
473 + struct dlm_lkb *lkb, *safe;
477 + down(&_lockqueue_lock);
479 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
480 + ls = lkb->lkb_resource->res_ls;
482 + if (test_bit(LSFL_NOTIMERS, &ls->ls_flags))
485 + /* Don't time out locks that are in transition */
486 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
489 + if (check_timeout(lkb->lkb_lockqueue_time,
490 + dlm_config.lock_timeout)) {
492 + list_del(&lkb->lkb_lockqueue);
493 + up(&_lockqueue_lock);
494 + cancel_lockop(lkb, -ETIMEDOUT);
495 + down(&_lockqueue_lock);
498 + up(&_lockqueue_lock);
503 + if (atomic_read(&_astd_running))
504 + mod_timer(&_lockqueue_timer,
505 + jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
508 +/* Look for deadlocks */
509 +static void process_deadlockqueue(void)
511 + struct dlm_lkb *lkb, *safe;
513 + down(&_deadlockqueue_lock);
515 + list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) {
516 + struct dlm_lkb *kill_lkb;
518 + /* Only look at "due" locks */
519 + if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime))
522 + /* Don't look at locks that are in transition */
523 + if (!test_bit(LSFL_LS_RUN,
524 + &lkb->lkb_resource->res_ls->ls_flags))
527 + up(&_deadlockqueue_lock);
529 + /* Lock has hit due time, check for conversion deadlock */
530 + kill_lkb = conversion_deadlock_check(lkb);
532 + cancel_conversion(kill_lkb, -EDEADLOCK);
534 + down(&_deadlockqueue_lock);
536 + up(&_deadlockqueue_lock);
539 +static __inline__ int no_asts(void)
543 + down(&_ast_queue_lock);
544 + ret = list_empty(&_ast_queue);
545 + up(&_ast_queue_lock);
549 +static void lockqueue_timer_fn(unsigned long arg)
551 + set_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags);
552 + wake_up(&_astd_waitchan);
556 + * DLM daemon which delivers asts.
559 +static int dlm_astd(void *data)
561 + daemonize("dlm_astd");
563 + INIT_LIST_HEAD(&_lockqueue);
564 + init_MUTEX(&_lockqueue_lock);
565 + INIT_LIST_HEAD(&_deadlockqueue);
566 + init_MUTEX(&_deadlockqueue_lock);
567 + INIT_LIST_HEAD(&_ast_queue);
568 + init_MUTEX(&_ast_queue_lock);
569 + init_waitqueue_head(&_astd_waitchan);
570 + complete(&_astd_done);
573 + * Set a timer to check the lockqueue for dead locks (and deadlocks).
576 + init_timer(&_lockqueue_timer);
577 + _lockqueue_timer.function = lockqueue_timer_fn;
578 + _lockqueue_timer.data = 0;
579 + mod_timer(&_lockqueue_timer,
580 + jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
582 + while (atomic_read(&_astd_running)) {
583 + wchan_cond_sleep_intr(_astd_waitchan, no_asts());
585 + if (test_and_clear_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags))
588 + if (test_and_clear_bit(GDLMD_WAKE_TIMER, &_astd_wakeflags)) {
589 + process_lockqueue();
590 + if (dlm_config.deadlocktime)
591 + process_deadlockqueue();
595 + if (timer_pending(&_lockqueue_timer))
596 + del_timer(&_lockqueue_timer);
598 + complete(&_astd_done);
603 +void wake_astd(void)
605 + set_bit(GDLMD_WAKE_ASTS, &_astd_wakeflags);
606 + wake_up(&_astd_waitchan);
611 + init_completion(&_astd_done);
612 + atomic_set(&_astd_running, 1);
613 + _astd_pid = kernel_thread(dlm_astd, NULL, 0);
614 + wait_for_completion(&_astd_done);
620 + atomic_set(&_astd_running, 0);
622 + wait_for_completion(&_astd_done);
624 diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h
625 --- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730
626 +++ linux-patched/cluster/dlm/ast.h 2004-07-13 18:57:22.000000000 +0800
628 +/******************************************************************************
629 +*******************************************************************************
631 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
632 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
634 +** This copyrighted material is made available to anyone wishing to use,
635 +** modify, copy, or redistribute it subject to the terms and conditions
636 +** of the GNU General Public License v.2.
638 +*******************************************************************************
639 +******************************************************************************/
641 +#ifndef __AST_DOT_H__
642 +#define __AST_DOT_H__
644 +void lockqueue_lkb_mark(struct dlm_ls *ls);
645 +int resend_cluster_requests(struct dlm_ls *ls);
646 +void add_to_lockqueue(struct dlm_lkb *lkb);
647 +void remove_from_lockqueue(struct dlm_lkb *lkb);
648 +void add_to_deadlockqueue(struct dlm_lkb *lkb);
649 +void remove_from_deadlockqueue(struct dlm_lkb *lkb);
650 +void queue_ast(struct dlm_lkb *lkb, uint16_t astflags, uint8_t rqmode);
651 +void wake_astd(void);
652 +int astd_start(void);
653 +void astd_stop(void);
655 +#endif /* __AST_DOT_H__ */
656 diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c
657 --- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730
658 +++ linux-patched/cluster/dlm/config.c 2004-07-13 18:57:22.000000000 +0800
660 +/******************************************************************************
661 +*******************************************************************************
663 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
664 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
666 +** This copyrighted material is made available to anyone wishing to use,
667 +** modify, copy, or redistribute it subject to the terms and conditions
668 +** of the GNU General Public License v.2.
670 +*******************************************************************************
671 +******************************************************************************/
673 +#include <linux/module.h>
674 +#include <linux/proc_fs.h>
676 +#include "dlm_internal.h"
677 +#include "lowcomms.h"
680 +/* Config file defaults */
681 +#define DEFAULT_TCP_PORT 21064
682 +#define DEFAULT_LOCK_TIMEOUT 30
683 +#define DEFAULT_BUFFER_SIZE 4096
684 +#define DEFAULT_RSBTBL_SIZE 256
685 +#define DEFAULT_LKBTBL_SIZE 1024
686 +#define DEFAULT_DIRTBL_SIZE 512
687 +#define DEFAULT_MAX_CONNECTIONS 128
688 +#define DEFAULT_DEADLOCKTIME 10
690 +struct config_info dlm_config = {
691 + .tcp_port = DEFAULT_TCP_PORT,
692 + .lock_timeout = DEFAULT_LOCK_TIMEOUT,
693 + .buffer_size = DEFAULT_BUFFER_SIZE,
694 + .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
695 + .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
696 + .dirtbl_size = DEFAULT_DIRTBL_SIZE,
697 + .max_connections = DEFAULT_MAX_CONNECTIONS,
698 + .deadlocktime = DEFAULT_DEADLOCKTIME,
702 +static struct config_proc_info {
707 + .name = "tcp_port",
708 + .value = &dlm_config.tcp_port,
711 + .name = "lock_timeout",
712 + .value = &dlm_config.lock_timeout,
715 + .name = "buffer_size",
716 + .value = &dlm_config.buffer_size,
719 + .name = "rsbtbl_size",
720 + .value = &dlm_config.rsbtbl_size,
723 + .name = "lkbtbl_size",
724 + .value = &dlm_config.lkbtbl_size,
727 + .name = "dirtbl_size",
728 + .value = &dlm_config.dirtbl_size,
731 + .name = "max_connections",
732 + .value = &dlm_config.max_connections,
735 + .name = "deadlocktime",
736 + .value = &dlm_config.deadlocktime,
739 +static struct proc_dir_entry *dlm_dir;
741 +static int dlm_config_read_proc(char *page, char **start, off_t off, int count,
742 + int *eof, void *data)
744 + struct config_proc_info *cinfo = data;
745 + return snprintf(page, count, "%d\n", *cinfo->value);
748 +static int dlm_config_write_proc(struct file *file, const char *buffer,
749 + unsigned long count, void *data)
751 + struct config_proc_info *cinfo = data;
755 + value = simple_strtoul(buffer, &end, 10);
757 + *cinfo->value = value;
761 +int dlm_config_init(void)
764 + struct proc_dir_entry *pde;
766 + dlm_dir = proc_mkdir("cluster/config/dlm", 0);
770 + dlm_dir->owner = THIS_MODULE;
772 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
773 + pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir);
775 + pde->data = &config_proc[i];
776 + pde->write_proc = dlm_config_write_proc;
777 + pde->read_proc = dlm_config_read_proc;
783 +void dlm_config_exit(void)
787 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++)
788 + remove_proc_entry(config_proc[i].name, dlm_dir);
789 + remove_proc_entry("cluster/config/dlm", NULL);
791 diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h
792 --- linux-orig/cluster/dlm/config.h 1970-01-01 07:30:00.000000000 +0730
793 +++ linux-patched/cluster/dlm/config.h 2004-07-13 18:57:22.000000000 +0800
795 +/******************************************************************************
796 +*******************************************************************************
798 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
799 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
801 +** This copyrighted material is made available to anyone wishing to use,
802 +** modify, copy, or redistribute it subject to the terms and conditions
803 +** of the GNU General Public License v.2.
805 +*******************************************************************************
806 +******************************************************************************/
808 +#ifndef __CONFIG_DOT_H__
809 +#define __CONFIG_DOT_H__
811 +struct config_info {
818 + int max_connections;
822 +extern struct config_info dlm_config;
823 +extern int dlm_config_init(void);
824 +extern void dlm_config_exit(void);
826 +#endif /* __CONFIG_DOT_H__ */
827 diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c
828 --- linux-orig/cluster/dlm/device.c 1970-01-01 07:30:00.000000000 +0730
829 +++ linux-patched/cluster/dlm/device.c 2004-07-13 18:57:22.000000000 +0800
831 +/******************************************************************************
832 +*******************************************************************************
834 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
835 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
837 +** This copyrighted material is made available to anyone wishing to use,
838 +** modify, copy, or redistribute it subject to the terms and conditions
839 +** of the GNU General Public License v.2.
841 +*******************************************************************************
842 +******************************************************************************/
847 + * This is the userland interface to the DLM.
849 + * The locking is done via a misc char device (find the
850 + * registered minor number in /proc/misc).
852 + * User code should not use this interface directly but
853 + * call the library routines in libdlm.a instead.
857 +#include <linux/miscdevice.h>
858 +#include <linux/init.h>
859 +#include <linux/wait.h>
860 +#include <linux/module.h>
861 +#include <linux/file.h>
862 +#include <linux/fs.h>
863 +#include <linux/poll.h>
864 +#include <linux/signal.h>
865 +#include <linux/spinlock.h>
866 +#include <asm/ioctls.h>
868 +#include "dlm_internal.h"
871 +extern struct dlm_lkb *dlm_get_lkb(struct dlm_ls *, int);
872 +static struct file_operations _dlm_fops;
873 +static const char *name_prefix="dlm";
874 +static struct list_head user_ls_list;
876 +/* Flags in li_flags */
877 +#define LI_FLAG_COMPLETE 1
878 +#define LI_FLAG_FIRSTLOCK 2
882 + struct dlm_lksb li_lksb;
883 + wait_queue_head_t li_waitq;
884 + unsigned long li_flags;
885 + void __user *li_astparam;
886 + void __user *li_astaddr;
887 + void __user *li_bastaddr;
888 + struct file_info *li_file;
889 + struct dlm_lksb __user *li_user_lksb;
890 + struct semaphore li_firstlock;
891 + struct dlm_queryinfo *li_queryinfo;
892 + struct dlm_queryinfo __user *li_user_queryinfo;
895 +/* A queued AST no less */
897 + struct dlm_lock_result result;
898 + struct dlm_queryinfo *queryinfo;
899 + struct dlm_queryinfo __user *user_queryinfo;
900 + struct list_head list;
903 +/* One of these per userland lockspace */
905 + void *ls_lockspace;
906 + atomic_t ls_refcnt;
907 + long ls_flags; /* bit 1 means LS has been deleted */
909 + /* Passed into misc_register() */
910 + struct miscdevice ls_miscinfo;
911 + struct list_head ls_list;
914 +/* misc_device info for the control device */
915 +static struct miscdevice ctl_device;
918 + * Stuff we hang off the file struct.
919 + * The first two are to cope with unlocking all the
920 + * locks held by a process when it dies.
923 + struct list_head fi_lkb_list; /* List of active lkbs */
924 + spinlock_t fi_lkb_lock;
925 + struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
926 + spinlock_t fi_ast_lock;
927 + wait_queue_head_t fi_wait;
928 + struct user_ls *fi_ls;
929 + atomic_t fi_refcnt; /* Number of users */
930 + unsigned long fi_flags; /* Bit 1 means the device is open */
934 +/* get and put ops for file_info.
935 + Actually I don't really like "get" and "put", but everyone
936 + else seems to use them and I can't think of anything
937 + nicer at the moment */
938 +static void get_file_info(struct file_info *f)
940 + atomic_inc(&f->fi_refcnt);
943 +static void put_file_info(struct file_info *f)
945 + if (atomic_dec_and_test(&f->fi_refcnt))
949 +/* Find a lockspace struct given the device minor number */
950 +static struct user_ls *find_lockspace(int minor)
952 + struct user_ls *lsinfo;
954 + list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
956 + if (lsinfo->ls_miscinfo.minor == minor)
962 +static void add_lockspace_to_list(struct user_ls *lsinfo)
964 + list_add(&lsinfo->ls_list, &user_ls_list);
967 +/* Register a lockspace with the DLM and create a misc
968 + device for userland to access it */
969 +static int register_lockspace(char *name, struct user_ls **ls)
971 + struct user_ls *newls;
975 + namelen = strlen(name)+strlen(name_prefix)+2;
977 + newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
980 + memset(newls, 0, sizeof(struct user_ls));
982 + newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
983 + if (!newls->ls_miscinfo.name) {
987 + snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name);
989 + status = dlm_new_lockspace((char *)newls->ls_miscinfo.name+strlen(name_prefix)+1,
990 + strlen(newls->ls_miscinfo.name) - strlen(name_prefix) - 1,
991 + &newls->ls_lockspace, 0);
994 + kfree(newls->ls_miscinfo.name);
999 + newls->ls_miscinfo.fops = &_dlm_fops;
1000 + newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
1002 + status = misc_register(&newls->ls_miscinfo);
1004 + log_print("failed to register misc device for %s", name);
1005 + dlm_release_lockspace(newls->ls_lockspace, 0);
1006 + kfree(newls->ls_miscinfo.name);
1012 + add_lockspace_to_list(newls);
1017 +static int unregister_lockspace(struct user_ls *lsinfo, int force)
1021 + status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
1025 + status = misc_deregister(&lsinfo->ls_miscinfo);
1029 + list_del(&lsinfo->ls_list);
1030 + kfree(lsinfo->ls_miscinfo.name);
1036 +/* Add it to userland's AST queue */
1037 +static void add_to_astqueue(struct lock_info *li, void *astaddr)
1039 + struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
1043 + ast->result.astparam = li->li_astparam;
1044 + ast->result.astaddr = astaddr;
1045 + ast->result.user_lksb = li->li_user_lksb;
1046 + ast->result.cmd = li->li_cmd;
1047 + memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
1049 + /* These two will both be NULL for anything other than queries */
1050 + ast->queryinfo = li->li_queryinfo;
1051 + ast->user_queryinfo = li->li_user_queryinfo;
1053 + spin_lock(&li->li_file->fi_ast_lock);
1054 + list_add_tail(&ast->list, &li->li_file->fi_ast_list);
1055 + spin_unlock(&li->li_file->fi_ast_lock);
1056 + wake_up_interruptible(&li->li_file->fi_wait);
1059 +static void bast_routine(void *param, int mode)
1061 + struct lock_info *li = param;
1064 + add_to_astqueue(li, li->li_bastaddr);
1069 + * This is the kernel's AST routine.
1070 + * All lock, unlock & query operations complete here.
1071 + * The only synchronous ops are those done during device close.
1073 +static void ast_routine(void *param)
1075 + struct lock_info *li = param;
1077 + /* Param may be NULL if a persistent lock is unlocked by someone else */
1081 + /* If it's an async request then post data to the user's AST queue. */
1082 + if (li->li_astaddr) {
1084 + /* Only queue AST if the device is still open */
1085 + if (test_bit(1, &li->li_file->fi_flags))
1086 + add_to_astqueue(li, li->li_astaddr);
1088 + /* If it's a new lock operation that failed, then
1089 + * remove it from the owner queue and free the
1090 + * lock_info. The DLM will not free the LKB until this
1091 + * AST has completed.
1093 + if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1094 + li->li_lksb.sb_status != 0) {
1095 + struct dlm_lkb *lkb;
1097 + /* Wait till dlm_lock() has finished */
1098 + down(&li->li_firstlock);
1099 + lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1101 + spin_lock(&li->li_file->fi_lkb_lock);
1102 + list_del(&lkb->lkb_ownerqueue);
1103 + spin_unlock(&li->li_file->fi_lkb_lock);
1105 + up(&li->li_firstlock);
1106 + put_file_info(li->li_file);
1110 + /* Free unlocks & queries */
1111 + if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
1112 + li->li_cmd == DLM_USER_QUERY) {
1113 + put_file_info(li->li_file);
1118 + /* Synchronous request, just wake up the caller */
1119 + set_bit(LI_FLAG_COMPLETE, &li->li_flags);
1120 + wake_up_interruptible(&li->li_waitq);
1125 + * Wait for the lock op to complete and return the status.
1127 +static int wait_for_ast(struct lock_info *li)
1129 + /* Wait for the AST routine to complete */
1130 + set_task_state(current, TASK_INTERRUPTIBLE);
1131 + while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
1134 + set_task_state(current, TASK_RUNNING);
1136 + return li->li_lksb.sb_status;
1140 +/* Open on control device */
1141 +static int dlm_ctl_open(struct inode *inode, struct file *file)
1146 +/* Close on control device */
1147 +static int dlm_ctl_close(struct inode *inode, struct file *file)
1152 +/* Open on lockspace device */
1153 +static int dlm_open(struct inode *inode, struct file *file)
1155 + struct file_info *f;
1156 + struct user_ls *lsinfo;
1158 + lsinfo = find_lockspace(iminor(inode));
1162 + f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
1166 + atomic_inc(&lsinfo->ls_refcnt);
1167 + INIT_LIST_HEAD(&f->fi_lkb_list);
1168 + INIT_LIST_HEAD(&f->fi_ast_list);
1169 + spin_lock_init(&f->fi_ast_lock);
1170 + spin_lock_init(&f->fi_lkb_lock);
1171 + init_waitqueue_head(&f->fi_wait);
1172 + f->fi_ls = lsinfo;
1173 + atomic_set(&f->fi_refcnt, 1);
1174 + set_bit(1, &f->fi_flags);
1176 + file->private_data = f;
1181 +/* Check the user's version matches ours */
1182 +static int check_version(struct dlm_lock_params *params)
1184 + if (params->version[0] != DLM_DEVICE_VERSION_MAJOR ||
1185 + (params->version[0] == DLM_DEVICE_VERSION_MAJOR &&
1186 + params->version[1] > DLM_DEVICE_VERSION_MINOR)) {
1188 + log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)",
1189 + params->version[0],
1190 + params->version[1],
1191 + params->version[2],
1192 + DLM_DEVICE_VERSION_MAJOR,
1193 + DLM_DEVICE_VERSION_MINOR,
1194 + DLM_DEVICE_VERSION_PATCH);
1200 +/* Close on lockspace device */
1201 +static int dlm_close(struct inode *inode, struct file *file)
1203 + struct file_info *f = file->private_data;
1204 + struct lock_info li;
1207 + struct dlm_lkb *lkb, *safe;
1208 + struct user_ls *lsinfo;
1209 + DECLARE_WAITQUEUE(wq, current);
1211 + lsinfo = find_lockspace(iminor(inode));
1215 + /* Mark this closed so that ASTs will not be delivered any more */
1216 + clear_bit(1, &f->fi_flags);
1218 + /* Block signals while we are doing this */
1219 + sigfillset(&allsigs);
1220 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1222 + /* We use our own lock_info struct here, so that any
1223 + * outstanding "real" ASTs will be delivered with the
1224 + * corresponding "real" params, thus freeing the lock_info
1225 + * that belongs to the lock. This catches the corner case where
1226 + * a lock is BUSY when we try to unlock it here
1228 + memset(&li, 0, sizeof(li));
1229 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1230 + init_waitqueue_head(&li.li_waitq);
1231 + add_wait_queue(&li.li_waitq, &wq);
1234 + * Free any outstanding locks, they are on the
1235 + * list in LIFO order so there should be no problems
1236 + * about unlocking parents before children.
1237 + * Although we don't remove the lkbs from the list here
1238 + * (what would be the point?), foreach_safe is needed
1239 + * because the lkbs are freed during dlm_unlock operations
1241 + list_for_each_entry_safe(lkb, safe, &f->fi_lkb_list, lkb_ownerqueue) {
1245 + struct lock_info *old_li;
1247 + /* Make a copy of this pointer. If all goes well we will
1248 + * free it later. if not it will be left to the AST routine
1251 + old_li = (struct lock_info *)lkb->lkb_astparam;
1253 + /* Don't unlock persistent locks */
1254 + if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) {
1255 + list_del(&lkb->lkb_ownerqueue);
1257 + /* But tidy our references in it */
1259 + lkb->lkb_astparam = (long)NULL;
1264 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1266 + /* If it's not granted then cancel the request.
1267 + * If the lock was WAITING then it will be dropped,
1268 + * if it was converting then it will be reverted to GRANTED,
1269 + * then we will unlock it.
1271 + lock_status = lkb->lkb_status;
1273 + if (lock_status != GDLM_LKSTS_GRANTED)
1274 + flags = DLM_LKF_CANCEL;
1276 + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
1278 + /* Must wait for it to complete as the next lock could be its
1281 + wait_for_ast(&li);
1283 + /* If it was waiting for a conversion, it will
1284 + now be granted so we can unlock it properly */
1285 + if (lock_status == GDLM_LKSTS_CONVERT) {
1287 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1288 + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, 0, &li.li_lksb, &li);
1291 + wait_for_ast(&li);
1293 + /* Unlock succeeded, free the lock_info struct. */
1294 + if (status == 0) {
1300 + remove_wait_queue(&li.li_waitq, &wq);
1302 + /* If this is the last reference, and the lockspace has been deleted
1303 + then free the struct */
1304 + if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) {
1308 + /* Restore signals */
1309 + sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1310 + recalc_sigpending();
1316 + * ioctls to create/remove lockspaces, and check how many
1317 + * outstanding ASTs there are against a particular LS.
1319 +static int dlm_ioctl(struct inode *inode, struct file *file,
1320 + uint command, ulong u)
1322 + struct file_info *fi = file->private_data;
1323 + int status = -EINVAL;
1325 + struct list_head *tmp_list;
1327 + switch (command) {
1329 + /* Are there any ASTs for us to read?
1330 + * Warning, this returns the number of messages (ASTs)
1331 + * in the queue, NOT the number of bytes to read
1335 + spin_lock(&fi->fi_ast_lock);
1336 + list_for_each(tmp_list, &fi->fi_ast_list)
1338 + spin_unlock(&fi->fi_ast_lock);
1339 + status = put_user(count, (int *)u);
1350 + * ioctls to create/remove lockspaces.
1352 +static int dlm_ctl_ioctl(struct inode *inode, struct file *file,
1353 + uint command, ulong u)
1355 + int status = -EINVAL;
1356 + char ls_name[MAX_LS_NAME_LEN];
1357 + struct user_ls *lsinfo;
1360 + switch (command) {
1361 + case DLM_CREATE_LOCKSPACE:
1362 + if (!capable(CAP_SYS_ADMIN))
1365 + if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0)
1367 + status = register_lockspace(ls_name, &lsinfo);
1369 + /* If it succeeded then return the minor number */
1371 + status = lsinfo->ls_miscinfo.minor;
1374 + case DLM_FORCE_RELEASE_LOCKSPACE:
1377 + case DLM_RELEASE_LOCKSPACE:
1378 + if (!capable(CAP_SYS_ADMIN))
1381 + lsinfo = find_lockspace(u);
1384 + status = unregister_lockspace(lsinfo, force);
1394 +/* Deal with the messy stuff of copying a web of structs
1395 + from kernel space to userspace */
1396 +static int copy_query_result(struct ast_info *ast)
1398 + int status = -EFAULT;
1399 + struct dlm_queryinfo qi;
1401 + /* Get the pointers to userspace structs */
1402 + if (copy_from_user(&qi, ast->user_queryinfo,
1403 + sizeof(struct dlm_queryinfo)))
1406 + /* TODO: does this deref a user pointer? */
1407 + if (put_user(ast->queryinfo->gqi_lockcount,
1408 + &ast->user_queryinfo->gqi_lockcount))
1411 + if (qi.gqi_resinfo) {
1412 + if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo,
1413 + sizeof(struct dlm_resinfo)))
1417 + if (qi.gqi_lockinfo) {
1418 + if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo,
1419 + sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount))
1425 + if (ast->queryinfo->gqi_lockinfo)
1426 + kfree(ast->queryinfo->gqi_lockinfo);
1428 + if (ast->queryinfo->gqi_resinfo)
1429 + kfree(ast->queryinfo->gqi_resinfo);
1431 + kfree(ast->queryinfo);
1437 +/* Read call, might block if no ASTs are waiting.
1438 + * It will only ever return one message at a time, regardless
1439 + * of how many are pending.
1441 +static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
1443 + struct file_info *fi = file->private_data;
1444 + struct ast_info *ast;
1446 + DECLARE_WAITQUEUE(wait, current);
1448 + if (count < sizeof(struct dlm_lock_result))
1451 + spin_lock(&fi->fi_ast_lock);
1452 + if (list_empty(&fi->fi_ast_list)) {
1454 + /* No waiting ASTs.
1455 + * Return EOF if the lockspace has been deleted.
1457 + if (test_bit(1, &fi->fi_ls->ls_flags))
1460 + if (file->f_flags & O_NONBLOCK) {
1461 + spin_unlock(&fi->fi_ast_lock);
1465 + add_wait_queue(&fi->fi_wait, &wait);
1468 + set_current_state(TASK_INTERRUPTIBLE);
1469 + if (list_empty(&fi->fi_ast_list) &&
1470 + !signal_pending(current)) {
1472 + spin_unlock(&fi->fi_ast_lock);
1474 + spin_lock(&fi->fi_ast_lock);
1478 + current->state = TASK_RUNNING;
1479 + remove_wait_queue(&fi->fi_wait, &wait);
1481 + if (signal_pending(current)) {
1482 + spin_unlock(&fi->fi_ast_lock);
1483 + return -ERESTARTSYS;
1487 + ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
1488 + list_del(&ast->list);
1489 + spin_unlock(&fi->fi_ast_lock);
1491 + ret = sizeof(struct dlm_lock_result);
1492 + if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
1495 + /* If it was a query then copy the result block back here */
1496 + if (ast->queryinfo) {
1497 + int status = copy_query_result(ast);
1506 +static unsigned int dlm_poll(struct file *file, poll_table *wait)
1508 + struct file_info *fi = file->private_data;
1510 + poll_wait(file, &fi->fi_wait, wait);
1512 + spin_lock(&fi->fi_ast_lock);
1513 + if (!list_empty(&fi->fi_ast_list)) {
1514 + spin_unlock(&fi->fi_ast_lock);
1515 + return POLLIN | POLLRDNORM;
1518 + spin_unlock(&fi->fi_ast_lock);
1522 +static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams)
1524 + struct lock_info *li;
1527 + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1531 + get_file_info(fi);
1532 + li->li_user_lksb = kparams->lksb;
1533 + li->li_astparam = kparams->astparam;
1534 + li->li_bastaddr = kparams->bastaddr;
1535 + li->li_astaddr = kparams->astaddr;
1538 + li->li_cmd = kparams->cmd;
1539 + clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1541 + if (copy_from_user(&li->li_lksb, kparams->lksb,
1542 + sizeof(struct dlm_lksb))) {
1546 + li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr;
1548 + /* Allocate query structs */
1550 + li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL);
1551 + if (!li->li_queryinfo)
1554 + /* Mainly to get gqi_lock buffer size */
1555 + if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr,
1556 + sizeof(struct dlm_queryinfo))) {
1561 + /* Overwrite userspace pointers we just copied with kernel space ones */
1562 + if (li->li_queryinfo->gqi_resinfo) {
1563 + li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL);
1564 + if (!li->li_queryinfo->gqi_resinfo)
1567 + if (li->li_queryinfo->gqi_lockinfo) {
1568 + li->li_queryinfo->gqi_lockinfo =
1569 + kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize,
1571 + if (!li->li_queryinfo->gqi_lockinfo)
1575 + li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo;
1577 + return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb,
1578 + kparams->flags, /* query */
1583 + kfree(li->li_queryinfo);
1590 +static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams,
1591 + const char *buffer)
1593 + struct lock_info *li;
1595 + char name[DLM_RESNAME_MAXLEN];
1598 + * Validate things that we need to have correct.
1600 + if (kparams->namelen > DLM_RESNAME_MAXLEN)
1603 + if (!kparams->astaddr)
1606 + if (!kparams->lksb)
1609 + /* Get the lock name */
1610 + if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name),
1611 + kparams->namelen)) {
1615 + /* For conversions, the lock will already have a lock_info
1616 + block squirreled away in astparam */
1617 + if (kparams->flags & DLM_LKF_CONVERT) {
1618 + struct dlm_lkb *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1622 + li = (struct lock_info *)lkb->lkb_astparam;
1624 + /* Only override these if they are provided */
1625 + if (li->li_user_lksb)
1626 + li->li_user_lksb = kparams->lksb;
1627 + if (li->li_astparam)
1628 + li->li_astparam = kparams->astparam;
1629 + if (li->li_bastaddr)
1630 + li->li_bastaddr = kparams->bastaddr;
1631 + if (li->li_bastaddr)
1632 + li->li_astaddr = kparams->astaddr;
1636 + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1640 + li->li_user_lksb = kparams->lksb;
1641 + li->li_astparam = kparams->astparam;
1642 + li->li_bastaddr = kparams->bastaddr;
1643 + li->li_astaddr = kparams->astaddr;
1646 + li->li_cmd = kparams->cmd;
1647 + li->li_queryinfo = NULL;
1649 + /* semaphore to allow us to complete our work before
1650 + the AST routine runs. In fact we only need (and use) this
1651 + when the initial lock fails */
1652 + init_MUTEX_LOCKED(&li->li_firstlock);
1653 + set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1655 + get_file_info(fi);
1658 + /* Copy the user's LKSB into kernel space,
1659 + needed for conversions & value block operations */
1660 + if (kparams->lksb && copy_from_user(&li->li_lksb, kparams->lksb,
1661 + sizeof(struct dlm_lksb)))
1665 + status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb,
1666 + kparams->flags, name, kparams->namelen,
1670 + li->li_bastaddr ? bast_routine : NULL,
1671 + kparams->range.ra_end ? &kparams->range : NULL);
1673 + /* If it succeeded (this far) with a new lock then keep track of
1674 + it on the file's lkb list */
1675 + if (!status && !(kparams->flags & DLM_LKF_CONVERT)) {
1676 + struct dlm_lkb *lkb;
1677 + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1680 + spin_lock(&fi->fi_lkb_lock);
1681 + list_add(&lkb->lkb_ownerqueue,
1682 + &fi->fi_lkb_list);
1683 + spin_unlock(&fi->fi_lkb_lock);
1686 + log_print("failed to get lkb for new lock");
1688 + up(&li->li_firstlock);
1694 +static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams)
1696 + struct lock_info *li;
1697 + struct dlm_lkb *lkb;
1700 + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1705 + li = (struct lock_info *)lkb->lkb_astparam;
1707 + li->li_user_lksb = kparams->lksb;
1708 + li->li_astparam = kparams->astparam;
1709 + li->li_cmd = kparams->cmd;
1711 + /* Have to do it here cos the lkb may not exist after
1713 + spin_lock(&fi->fi_lkb_lock);
1714 + list_del(&lkb->lkb_ownerqueue);
1715 + spin_unlock(&fi->fi_lkb_lock);
1717 + /* Use existing lksb & astparams */
1718 + status = dlm_unlock(fi->fi_ls->ls_lockspace,
1720 + kparams->flags, NULL, NULL);
1725 +/* Write call, submit a locking request */
1726 +static ssize_t dlm_write(struct file *file, const char __user *buffer,
1727 + size_t count, loff_t *ppos)
1729 + struct file_info *fi = file->private_data;
1730 + struct dlm_lock_params kparams;
1735 + if (count < sizeof(kparams))
1738 + /* Has the lockspace been deleted */
1739 + if (test_bit(1, &fi->fi_ls->ls_flags))
1742 + /* Get the command info */
1743 + if (copy_from_user(&kparams, buffer, sizeof(kparams)))
1746 + if (check_version(&kparams))
1749 + /* Block signals while we are doing this */
1750 + sigfillset(&allsigs);
1751 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1753 + switch (kparams.cmd)
1755 + case DLM_USER_LOCK:
1756 + status = do_user_lock(fi, &kparams, buffer);
1759 + case DLM_USER_UNLOCK:
1760 + status = do_user_unlock(fi, &kparams);
1763 + case DLM_USER_QUERY:
1764 + status = do_user_query(fi, &kparams);
1771 + /* Restore signals */
1772 + sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1773 + recalc_sigpending();
1781 +void dlm_device_free_devices()
1783 + struct user_ls *tmp;
1784 + struct user_ls *lsinfo;
1786 + list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) {
1787 + misc_deregister(&lsinfo->ls_miscinfo);
1789 + /* Tidy up, but don't delete the lsinfo struct until
1790 + all the users have closed their devices */
1791 + list_del(&lsinfo->ls_list);
1792 + kfree(lsinfo->ls_miscinfo.name);
1793 + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
1797 +static struct file_operations _dlm_fops = {
1799 + .release = dlm_close,
1800 + .ioctl = dlm_ioctl,
1802 + .write = dlm_write,
1804 + .owner = THIS_MODULE,
1807 +static struct file_operations _dlm_ctl_fops = {
1808 + .open = dlm_ctl_open,
1809 + .release = dlm_ctl_close,
1810 + .ioctl = dlm_ctl_ioctl,
1811 + .owner = THIS_MODULE,
1815 + * Create control device
1817 +int dlm_device_init(void)
1821 + INIT_LIST_HEAD(&user_ls_list);
1823 + ctl_device.name = "dlm-control";
1824 + ctl_device.fops = &_dlm_ctl_fops;
1825 + ctl_device.minor = MISC_DYNAMIC_MINOR;
1827 + r = misc_register(&ctl_device);
1829 + log_print("misc_register failed for DLM control device");
1836 +void dlm_device_exit(void)
1838 + misc_deregister(&ctl_device);
1842 + * Overrides for Emacs so that we follow Linus's tabbing style.
1843 + * Emacs will notice this stuff at the end of the file and automatically
1844 + * adjust the settings for this buffer only. This must remain at the end
1846 + * ---------------------------------------------------------------------------
1847 + * Local variables:
1848 + * c-file-style: "linux"
1851 diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h
1852 --- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730
1853 +++ linux-patched/cluster/dlm/device.h 2004-07-13 18:57:22.000000000 +0800
1855 +/******************************************************************************
1856 +*******************************************************************************
1858 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
1859 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
1861 +** This copyrighted material is made available to anyone wishing to use,
1862 +** modify, copy, or redistribute it subject to the terms and conditions
1863 +** of the GNU General Public License v.2.
1865 +*******************************************************************************
1866 +******************************************************************************/
1868 +#ifndef __DEVICE_DOT_H__
1869 +#define __DEVICE_DOT_H__
1871 +extern void dlm_device_free_devices(void);
1873 +#endif /* __DEVICE_DOT_H__ */
1874 diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c
1875 --- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730
1876 +++ linux-patched/cluster/dlm/dir.c 2004-07-13 18:57:22.000000000 +0800
1878 +/******************************************************************************
1879 +*******************************************************************************
1881 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
1882 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
1884 +** This copyrighted material is made available to anyone wishing to use,
1885 +** modify, copy, or redistribute it subject to the terms and conditions
1886 +** of the GNU General Public License v.2.
1888 +*******************************************************************************
1889 +******************************************************************************/
1891 +#include "dlm_internal.h"
1893 +#include "lockspace.h"
1894 +#include "lowcomms.h"
1895 +#include "reccomms.h"
1897 +#include "config.h"
1898 +#include "memory.h"
1899 +#include "recover.h"
1903 + uint32_t rm_nodeid;
1904 + uint16_t rm_length;
1910 + * We use the upper 16 bits of the hash value to select the directory node.
1911 + * Low bits are used for distribution of rsb's among hash buckets on each node.
1913 + * From the hash value, we are interested in arriving at a final value between
1914 + * zero and the number of nodes minus one (num_nodes - 1).
1916 + * To accomplish this scaling, we take the nearest power of two larger than
1917 + * num_nodes and subtract one to create a bit mask. The mask is applied to the
1918 + * hash, reducing the range to nearer the final range.
1920 + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
1921 + * num_nodes to the previously masked hash value.
1923 + * This value in the desired range is used as an offset into the sorted list of
1924 + * nodeid's to give the particular nodeid of the directory node.
1927 +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length)
1929 + struct list_head *tmp;
1930 + struct dlm_csb *csb = NULL;
1931 + uint32_t hash, node, n = 0, nodeid;
1933 + if (ls->ls_num_nodes == 1) {
1934 + nodeid = our_nodeid();
1938 + hash = dlm_hash(name, length);
1939 + node = (hash >> 16) & ls->ls_nodes_mask;
1940 + node %= ls->ls_num_nodes;
1942 + list_for_each(tmp, &ls->ls_nodes) {
1945 + csb = list_entry(tmp, struct dlm_csb, list);
1949 + DLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u mask=%x\n",
1950 + ls->ls_num_nodes, n, node, ls->ls_nodes_mask););
1951 + nodeid = csb->node->nodeid;
1957 +uint32_t get_directory_nodeid(struct dlm_rsb *rsb)
1959 + return name_to_directory_nodeid(rsb->res_ls, rsb->res_name,
1963 +static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
1967 + val = dlm_hash(name, len);
1968 + val &= (ls->ls_dirtbl_size - 1);
1973 +static void add_resdata_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
1977 + bucket = dir_hash(ls, de->name, de->length);
1978 + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
1981 +static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
1982 + int namelen, uint32_t bucket)
1984 + struct dlm_direntry *de;
1986 + list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
1987 + if (de->length == namelen && !memcmp(name, de->name, namelen))
1995 +void remove_resdata(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen)
1997 + struct dlm_direntry *de;
2000 + bucket = dir_hash(ls, name, namelen);
2002 + write_lock(&ls->ls_dirtbl[bucket].lock);
2004 + de = search_bucket(ls, name, namelen, bucket);
2007 + log_debug(ls, "remove from %u none", nodeid);
2011 + if (de->master_nodeid != nodeid) {
2012 + log_debug(ls, "remove from %u ID %u",
2013 + nodeid, de->master_nodeid);
2017 + list_del(&de->list);
2020 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2023 +void dlm_dir_clear(struct dlm_ls *ls)
2025 + struct list_head *head;
2026 + struct dlm_direntry *de;
2029 + for (i = 0; i < ls->ls_dirtbl_size; i++) {
2030 + head = &ls->ls_dirtbl[i].list;
2031 + while (!list_empty(head)) {
2032 + de = list_entry(head->next, struct dlm_direntry, list);
2033 + list_del(&de->list);
2039 +static void resmov_in(struct resmov *rm, char *buf)
2041 + struct resmov tmp;
2043 + memcpy(&tmp, buf, sizeof(struct resmov));
2045 + rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid);
2046 + rm->rm_length = be16_to_cpu(tmp.rm_length);
2049 +int dlm_dir_rebuild_local(struct dlm_ls *ls)
2051 + struct dlm_csb *csb;
2052 + struct dlm_direntry *de;
2053 + struct dlm_rcom *rc;
2054 + struct resmov mov, last_mov;
2055 + char *b, *last_name;
2056 + int error = -ENOMEM, count = 0;
2058 + log_all(ls, "rebuild resource directory");
2060 + dlm_dir_clear(ls);
2062 + rc = allocate_rcom_buffer(ls);
2066 + last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
2070 + list_for_each_entry(csb, &ls->ls_nodes, list) {
2071 + last_mov.rm_length = 0;
2073 + error = dlm_recovery_stopped(ls);
2077 + memcpy(rc->rc_buf, last_name, last_mov.rm_length);
2078 + rc->rc_datalen = last_mov.rm_length;
2080 + error = rcom_send_message(ls, csb->node->nodeid,
2081 + RECCOMM_RECOVERNAMES, rc, 1);
2088 + * pick each res out of buffer
2094 + resmov_in(&mov, b);
2095 + b += sizeof(struct resmov);
2097 + /* Length of 0 with a non-zero nodeid marks the
2098 + * end of the list */
2099 + if (!mov.rm_length && mov.rm_nodeid)
2102 + /* This is just the end of the block */
2103 + if (!mov.rm_length)
2107 + de = allocate_resdata(ls, mov.rm_length);
2111 + de->master_nodeid = mov.rm_nodeid;
2112 + de->length = mov.rm_length;
2114 + memcpy(de->name, b, mov.rm_length);
2115 + b += mov.rm_length;
2117 + add_resdata_to_hash(ls, de);
2121 + memset(last_name, 0, DLM_RESNAME_MAXLEN);
2122 + memcpy(last_name, de->name, de->length);
2129 + set_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
2132 + log_all(ls, "rebuilt %d resources", count);
2138 + free_rcom_buffer(rc);
2145 + * The reply end of dlm_dir_rebuild_local/RECOVERNAMES. Collect and send as
2146 + * many resource names as can fit in the buffer.
2149 +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen,
2150 + char *outbuf, int outlen, uint32_t nodeid)
2152 + struct list_head *list;
2153 + struct dlm_rsb *start_rsb = NULL, *rsb;
2154 + int offset = 0, start_namelen, error;
2156 + struct resmov tmp;
2157 + uint32_t dir_nodeid;
2160 + * Find the rsb where we left off (or start again)
2163 + start_namelen = inlen;
2164 + start_name = inbuf;
2166 + if (start_namelen > 1) {
2167 + error = find_or_create_rsb(ls, NULL, start_name,
2168 + start_namelen, 0, &start_rsb);
2169 + DLM_ASSERT(!error && start_rsb, printk("error %d\n", error););
2170 + release_rsb(start_rsb);
2174 + * Send rsb names for rsb's we're master of and whose directory node
2175 + * matches the requesting node.
2178 + down_read(&ls->ls_rec_rsblist);
2180 + list = start_rsb->res_rootlist.next;
2182 + list = ls->ls_rootres.next;
2184 + for (offset = 0; list != &ls->ls_rootres; list = list->next) {
2185 + rsb = list_entry(list, struct dlm_rsb, res_rootlist);
2186 + if (rsb->res_nodeid)
2189 + dir_nodeid = get_directory_nodeid(rsb);
2190 + if (dir_nodeid != nodeid)
2193 + if (offset + sizeof(struct resmov)*2 + rsb->res_length > outlen) {
2194 + /* Write end-of-block record */
2195 + memset(&tmp, 0, sizeof(struct resmov));
2196 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2197 + offset += sizeof(struct resmov);
2201 + memset(&tmp, 0, sizeof(struct resmov));
2202 + tmp.rm_nodeid = cpu_to_be32(our_nodeid());
2203 + tmp.rm_length = cpu_to_be16(rsb->res_length);
2205 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2206 + offset += sizeof(struct resmov);
2208 + memcpy(outbuf + offset, rsb->res_name, rsb->res_length);
2209 + offset += rsb->res_length;
2213 + * If we've reached the end of the list (and there's room) write a
2214 + * terminating record.
2217 + if ((list == &ls->ls_rootres) &&
2218 + (offset + sizeof(struct resmov) <= outlen)) {
2220 + memset(&tmp, 0, sizeof(struct resmov));
2221 + /* This only needs to be non-zero */
2222 + tmp.rm_nodeid = cpu_to_be32(1);
2223 + /* and this must be zero */
2224 + tmp.rm_length = 0;
2225 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2226 + offset += sizeof(struct resmov);
2230 + up_read(&ls->ls_rec_rsblist);
2234 +static int get_resdata(struct dlm_ls *ls, uint32_t nodeid, char *name,
2235 + int namelen, uint32_t *r_nodeid, int recovery)
2237 + struct dlm_direntry *de, *tmp;
2240 + bucket = dir_hash(ls, name, namelen);
2242 + write_lock(&ls->ls_dirtbl[bucket].lock);
2243 + de = search_bucket(ls, name, namelen, bucket);
2245 + *r_nodeid = de->master_nodeid;
2246 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2250 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2252 + de = allocate_resdata(ls, namelen);
2256 + de->master_nodeid = nodeid;
2257 + de->length = namelen;
2258 + memcpy(de->name, name, namelen);
2260 + write_lock(&ls->ls_dirtbl[bucket].lock);
2261 + tmp = search_bucket(ls, name, namelen, bucket);
2266 + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
2268 + *r_nodeid = de->master_nodeid;
2269 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2275 +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen,
2276 + uint32_t *r_nodeid)
2278 + return get_resdata(ls, nodeid, name, namelen, r_nodeid, 0);
2281 +int dlm_dir_lookup_recovery(struct dlm_ls *ls, uint32_t nodeid, char *name,
2282 + int namelen, uint32_t *r_nodeid)
2284 + return get_resdata(ls, nodeid, name, namelen, r_nodeid, 1);
2288 + * The node with lowest id queries all nodes to determine when all are done.
2289 + * All other nodes query the low nodeid for this.
2292 +int dlm_dir_rebuild_wait(struct dlm_ls *ls)
2296 + if (ls->ls_low_nodeid == our_nodeid()) {
2297 + error = dlm_wait_status_all(ls, RESDIR_VALID);
2299 + set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
2301 + error = dlm_wait_status_low(ls, RESDIR_ALL_VALID);
2305 diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h
2306 --- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730
2307 +++ linux-patched/cluster/dlm/dir.h 2004-07-13 18:57:22.000000000 +0800
2309 +/******************************************************************************
2310 +*******************************************************************************
2312 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2313 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2315 +** This copyrighted material is made available to anyone wishing to use,
2316 +** modify, copy, or redistribute it subject to the terms and conditions
2317 +** of the GNU General Public License v.2.
2319 +*******************************************************************************
2320 +******************************************************************************/
2322 +#ifndef __DIR_DOT_H__
2323 +#define __DIR_DOT_H__
2325 +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen,
2326 + uint32_t *r_nodeid);
2327 +int dlm_dir_lookup_recovery(struct dlm_ls *ls, uint32_t nodeid, char *name,
2328 + int namelen, uint32_t *r_nodeid);
2329 +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length);
2330 +uint32_t get_directory_nodeid(struct dlm_rsb *rsb);
2331 +void remove_resdata(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen);
2332 +int dlm_dir_rebuild_local(struct dlm_ls *ls);
2333 +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen,
2334 + char *outbuf, int outlen, uint32_t nodeid);
2335 +int dlm_dir_rebuild_wait(struct dlm_ls * ls);
2336 +void dlm_dir_clear(struct dlm_ls *ls);
2337 +void dlm_dir_dump(struct dlm_ls *ls);
2339 +#endif /* __DIR_DOT_H__ */
2340 diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h
2341 --- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730
2342 +++ linux-patched/cluster/dlm/dlm_internal.h 2004-07-13 18:57:22.000000000 +0800
2344 +/******************************************************************************
2345 +*******************************************************************************
2347 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2348 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2350 +** This copyrighted material is made available to anyone wishing to use,
2351 +** modify, copy, or redistribute it subject to the terms and conditions
2352 +** of the GNU General Public License v.2.
2354 +*******************************************************************************
2355 +******************************************************************************/
2357 +#ifndef __DLM_INTERNAL_DOT_H__
2358 +#define __DLM_INTERNAL_DOT_H__
2361 + * This is the main header file to be included in each DLM source file.
2364 +#define DLM_RELEASE_NAME "<CVS>"
2366 +#include <linux/slab.h>
2367 +#include <linux/sched.h>
2368 +#include <asm/semaphore.h>
2369 +#include <linux/types.h>
2370 +#include <linux/spinlock.h>
2371 +#include <linux/vmalloc.h>
2372 +#include <asm/uaccess.h>
2373 +#include <linux/list.h>
2374 +#include <linux/errno.h>
2375 +#include <linux/random.h>
2377 +#include <cluster/dlm.h>
2378 +#include <cluster/dlm_device.h>
2379 +#include <cluster/service.h>
2389 +#if (BITS_PER_LONG == 64)
2390 +#define PRIu64 "lu"
2391 +#define PRId64 "ld"
2392 +#define PRIo64 "lo"
2393 +#define PRIx64 "lx"
2394 +#define PRIX64 "lX"
2395 +#define SCNu64 "lu"
2396 +#define SCNd64 "ld"
2397 +#define SCNo64 "lo"
2398 +#define SCNx64 "lx"
2399 +#define SCNX64 "lX"
2401 +#define PRIu64 "Lu"
2402 +#define PRId64 "Ld"
2403 +#define PRIo64 "Lo"
2404 +#define PRIx64 "Lx"
2405 +#define PRIX64 "LX"
2406 +#define SCNu64 "Lu"
2407 +#define SCNd64 "Ld"
2408 +#define SCNo64 "Lo"
2409 +#define SCNx64 "Lx"
2410 +#define SCNX64 "LX"
2413 +#define wchan_cond_sleep_intr(chan, sleep_cond) \
2416 + DECLARE_WAITQUEUE(__wait_chan, current); \
2417 + current->state = TASK_INTERRUPTIBLE; \
2418 + add_wait_queue(&chan, &__wait_chan); \
2419 + if ((sleep_cond)) \
2421 + remove_wait_queue(&chan, &__wait_chan); \
2422 + current->state = TASK_RUNNING; \
2426 +static inline int check_timeout(unsigned long stamp, unsigned int seconds)
2428 + return time_after(jiffies, stamp + seconds * HZ);
2432 +#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args)
2434 +#define log_all(ls, fmt, args...) \
2436 + printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
2437 + dlm_debug_log(ls, fmt, ##args); \
2440 +#define log_error log_all
2444 +#if defined(DLM_DEBUG)
2445 +#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args)
2447 +#define log_debug(ls, fmt, args...)
2450 +#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL)
2452 +#define log_debug log_all
2456 +#define DLM_ASSERT(x, do) \
2460 + dlm_locks_dump(); \
2461 + dlm_debug_dump(); \
2462 + printk("\nDLM: Assertion failed on line %d of file %s\n" \
2463 + "DLM: assertion: \"%s\"\n" \
2464 + "DLM: time = %lu\n", \
2465 + __LINE__, __FILE__, #x, jiffies); \
2469 + panic("DLM: Record message above and reboot.\n"); \
2479 +struct dlm_lkbtable;
2480 +struct dlm_rsbtable;
2481 +struct dlm_dirtable;
2482 +struct dlm_direntry;
2483 +struct dlm_recover;
2485 +struct dlm_request;
2488 +struct dlm_query_request;
2489 +struct dlm_query_reply;
2492 +struct dlm_direntry {
2493 + struct list_head list;
2494 + uint32_t master_nodeid;
2499 +struct dlm_dirtable {
2500 + struct list_head list;
2504 +struct dlm_rsbtable {
2505 + struct list_head list;
2509 +struct dlm_lkbtable {
2510 + struct list_head list;
2516 + * Cluster node (per node in cluster)
2520 + struct list_head list;
2522 + int refcount; /* num csb's referencing */
2526 + * Cluster System Block (per node in a ls)
2530 + struct list_head list; /* per-lockspace node list */
2531 + struct dlm_node * node; /* global node structure */
2532 + int gone_event; /* event id when node removed */
2534 + /* recovery stats for debugging */
2536 + uint32_t names_send_count;
2537 + uint32_t names_send_msgid;
2538 + uint32_t names_recv_count;
2539 + uint32_t names_recv_msgid;
2540 + uint32_t locks_send_count;
2541 + uint32_t locks_send_msgid;
2542 + uint32_t locks_recv_count;
2543 + uint32_t locks_recv_msgid;
2547 + * Used to save and manage recovery state for a lockspace.
2550 +struct dlm_recover {
2551 + struct list_head list;
2552 + uint32_t * nodeids;
2558 + * Elements in the range array
2561 +#define GR_RANGE_START (0)
2562 +#define GR_RANGE_END (1)
2563 +#define RQ_RANGE_START (2)
2564 +#define RQ_RANGE_END (3)
2567 + * Lockspace structure
2570 +#define LSFL_WORK (0)
2571 +#define LSFL_LS_RUN (1)
2572 +#define LSFL_LS_STOP (2)
2573 +#define LSFL_LS_START (3)
2574 +#define LSFL_LS_FINISH (4)
2575 +#define LSFL_RECCOMM_WAIT (5)
2576 +#define LSFL_RECCOMM_READY (6)
2577 +#define LSFL_NOTIMERS (7)
2578 +#define LSFL_FINISH_RECOVERY (8)
2579 +#define LSFL_RESDIR_VALID (9)
2580 +#define LSFL_ALL_RESDIR_VALID (10)
2581 +#define LSFL_NODES_VALID (11)
2582 +#define LSFL_ALL_NODES_VALID (12)
2583 +#define LSFL_REQUEST_WARN (13)
2584 +#define LSFL_NOCONVGRANT (14)
2586 +#define LSST_NONE (0)
2587 +#define LSST_INIT (1)
2588 +#define LSST_INIT_DONE (2)
2589 +#define LSST_CLEAR (3)
2590 +#define LSST_WAIT_START (4)
2591 +#define LSST_RECONFIG_DONE (5)
2594 + struct list_head ls_list; /* list of lockspaces */
2595 + uint32_t ls_local_id; /* local unique lockspace ID */
2596 + uint32_t ls_global_id; /* global unique lockspace ID */
2597 + int ls_allocation; /* Memory allocation policy */
2598 + unsigned long ls_flags; /* LSFL_ */
2600 + struct dlm_rsbtable * ls_rsbtbl;
2601 + uint32_t ls_rsbtbl_size;
2603 + struct dlm_lkbtable * ls_lkbtbl;
2604 + uint32_t ls_lkbtbl_size;
2606 + struct dlm_dirtable * ls_dirtbl;
2607 + uint32_t ls_dirtbl_size;
2609 + struct list_head ls_nodes; /* current nodes in RC */
2610 + struct list_head ls_nodes_gone; /* dead node list, recovery */
2611 + uint32_t ls_num_nodes; /* number of nodes in RC */
2612 + uint32_t ls_nodes_mask;
2613 + uint32_t ls_low_nodeid;
2615 + struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a
2616 + parent lock racing with a
2619 + struct list_head ls_deadlockq; /* List of locks in conversion
2620 + ordered by duetime. for
2621 + deadlock detection */
2623 + /* recovery related */
2625 + struct list_head ls_recover; /* dlm_recover structs */
2626 + spinlock_t ls_recover_lock;
2628 + int ls_last_start;
2629 + int ls_last_finish;
2630 + int ls_state; /* recovery states */
2632 + struct rw_semaphore ls_in_recovery; /* block local requests */
2633 + struct list_head ls_requestqueue;/* queue remote requests */
2635 + struct dlm_rcom * ls_rcom; /* recovery comms */
2636 + uint32_t ls_rcom_msgid;
2637 + struct semaphore ls_rcom_lock;
2639 + struct list_head ls_recover_list;
2640 + spinlock_t ls_recover_list_lock;
2641 + int ls_recover_list_count;
2642 + wait_queue_head_t ls_wait_general;
2644 + struct list_head ls_rootres; /* List of root resources */
2646 + struct rw_semaphore ls_rec_rsblist; /* To prevent incoming recovery
2647 + operations happening while
2650 + struct rw_semaphore ls_gap_rsblist; /* To protect rootres list
2651 + in grant_after_purge() which
2652 + runs outside recovery */
2654 + struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees
2665 +#define RESFL_NEW_MASTER (0)
2666 +#define RESFL_RECOVER_LIST (1)
2667 +#define RESFL_MASTER (2)
2670 + struct list_head res_hashchain;
2671 + uint32_t res_bucket;
2673 + struct dlm_ls * res_ls; /* The owning lockspace */
2675 + struct list_head res_rootlist; /* List of root rsb's */
2677 + struct list_head res_subreslist; /* List of all sub-resources
2678 + for this root rsb */
2680 + uint8_t res_depth; /* Depth in resource tree */
2681 + unsigned long res_flags; /* Flags, RESFL_ */
2683 + struct list_head res_grantqueue;
2684 + struct list_head res_convertqueue;
2685 + struct list_head res_waitqueue;
2687 + uint32_t res_nodeid; /* nodeid of master node */
2689 + struct dlm_rsb * res_root; /* root rsb if a subresource */
2690 + struct dlm_rsb * res_parent; /* parent rsb (if any) */
2692 + atomic_t res_ref; /* Number of lkb's */
2693 + uint16_t res_remasterid; /* ID used during remaster */
2695 + struct list_head res_recover_list; /* General list for use
2696 + during recovery */
2697 + int res_recover_msgid;
2698 + int res_newlkid_expect;
2700 + struct rw_semaphore res_lock;
2702 + char * res_lvbptr; /* Lock value block */
2704 + uint8_t res_length;
2705 + char res_name[1]; /* <res_length> bytes */
2709 + * Lock block. To avoid confusion, where flags mirror the
2710 + * public flags, they should have the same value.
2713 +#define GDLM_LKSTS_NEW (0)
2714 +#define GDLM_LKSTS_WAITING (1)
2715 +#define GDLM_LKSTS_GRANTED (2)
2716 +#define GDLM_LKSTS_CONVERT (3)
2718 +#define GDLM_LKFLG_VALBLK (0x00000008)
2719 +#define GDLM_LKFLG_PERSISTENT (0x00000080) /* Don't unlock when process exits */
2720 +#define GDLM_LKFLG_NODLCKWT (0x00000100) /* Don't do deadlock detection */
2721 +#define GDLM_LKFLG_EXPEDITE (0x00000400) /* Move to head of convert queue */
2723 +/* Internal flags */
2724 +#define GDLM_LKFLG_RANGE (0x00001000) /* Range field is present
2725 + (remote protocol only) */
2726 +#define GDLM_LKFLG_MSTCPY (0x00002000)
2727 +#define GDLM_LKFLG_DELETED (0x00004000) /* LKB is being deleted */
2728 +#define GDLM_LKFLG_LQCONVERT (0x00008000)
2729 +#define GDLM_LKFLG_LQRESEND (0x00010000) /* LKB on lockqueue must be resent */
2730 +#define GDLM_LKFLG_DEMOTED (0x00020000)
2731 +#define GDLM_LKFLG_RESENT (0x00040000)
2732 +#define GDLM_LKFLG_NOREBUILD (0x00080000)
2734 +#define AST_COMP (1)
2735 +#define AST_BAST (2)
2736 +#define AST_DEL (4)
2739 + uint32_t lkb_flags;
2740 + uint16_t lkb_status; /* grant, wait, convert */
2741 + int8_t lkb_rqmode; /* requested lock mode */
2742 + int8_t lkb_grmode; /* granted lock mode */
2743 + uint32_t lkb_retstatus; /* status to return in lksb */
2744 + uint32_t lkb_id; /* our lock ID */
2745 + struct dlm_lksb * lkb_lksb; /* status block of caller */
2746 + struct list_head lkb_idtbl_list; /* lockidtbl */
2747 + struct list_head lkb_statequeue; /* rsb's g/c/w queue */
2748 + struct dlm_rsb * lkb_resource;
2749 + struct list_head lkb_ownerqueue; /* list of locks owned by a
2751 + struct dlm_lkb * lkb_parent; /* parent lock if any */
2752 + atomic_t lkb_childcnt; /* number of children */
2754 + struct list_head lkb_lockqueue; /* queue of locks waiting
2755 + for remote reply */
2756 + int lkb_lockqueue_state; /* reason on lockqueue */
2757 + int lkb_lockqueue_flags; /* as passed into
2759 + unsigned long lkb_lockqueue_time; /* time lkb went on the
2761 + unsigned long lkb_duetime; /* for deadlock detection */
2763 + uint32_t lkb_remid; /* id on remote partner */
2764 + uint32_t lkb_nodeid; /* id of remote partner */
2766 + void * lkb_astaddr;
2767 + void * lkb_bastaddr;
2768 + long lkb_astparam;
2769 + struct list_head lkb_astqueue; /* locks with asts to deliver */
2770 + uint16_t lkb_astflags; /* COMP, BAST, DEL */
2771 + uint8_t lkb_bastmode; /* requested mode */
2772 + uint8_t lkb_highbast; /* highest mode bast sent for */
2774 + struct dlm_request * lkb_request;
2776 + struct list_head lkb_deadlockq; /* ls_deadlockq list */
2778 + char * lkb_lvbptr; /* points to lksb lvb on local
2779 + lock, allocated lvb on
2781 + uint64_t * lkb_range; /* Points to an array of 64 bit
2782 + numbers that represent the
2783 + requested and granted ranges
2784 + of the lock. NULL implies
2785 + 0-ffffffffffffffff */
2789 + * Header part of the mid-level comms system. All packets start with
2790 + * this header so we can identify them. The comms packet can
2791 + * contain many of these structs but they are split into individual
2792 + * work units before being passed to the lockqueue routines.
2793 + * below this are the structs that this is a header for
2796 +struct dlm_header {
2797 + uint8_t rh_cmd; /* What we are */
2798 + uint8_t rh_flags; /* maybe just a pad */
2799 + uint16_t rh_length; /* Length of struct (so we can
2800 + send many in 1 message) */
2801 + uint32_t rh_lkid; /* Lock ID tag: ie the local
2802 + (requesting) lock ID */
2803 + uint32_t rh_lockspace; /* Lockspace ID */
2807 + * This is the struct used in a remote lock/unlock/convert request
2808 + * The mid-level comms API should turn this into native byte order.
2809 + * Most "normal" lock operations will use these two structs for
2810 + * communications. Recovery operations use their own structs
2811 + * but still with the gd_req_header on the front.
2814 +struct dlm_request {
2815 + struct dlm_header rr_header;
2816 + uint32_t rr_remlkid; /* Remote lock ID */
2817 + uint32_t rr_remparid; /* Parent's remote lock ID */
2818 + uint32_t rr_flags; /* Flags from lock/convert req*/
2819 + uint64_t rr_range_start; /* Yes, these are in the right
2821 + uint64_t rr_range_end;
2822 + uint32_t rr_status; /* Status to return if this is
2824 + uint8_t rr_rqmode; /* Requested lock mode */
2825 + uint8_t rr_asts; /* Whether the LKB has ASTs */
2826 + char rr_lvb[DLM_LVB_LEN];
2827 + char rr_name[1]; /* As long as needs be. Only
2828 + used for directory lookups.
2829 + The length of this can be
2830 + worked out from the packet
2835 + * This is the struct returned by a remote lock/unlock/convert request
2836 + * The mid-level comms API should turn this into native byte order.
2840 + struct dlm_header rl_header;
2841 + uint32_t rl_lockstate; /* Whether request was
2842 + queued/granted/waiting */
2843 + uint32_t rl_nodeid; /* nodeid of lock master */
2844 + uint32_t rl_status; /* Status to return to caller */
2845 + uint32_t rl_lkid; /* Remote lkid */
2846 + char rl_lvb[DLM_LVB_LEN];
2850 + * Recovery comms message
2854 + struct dlm_header rc_header; /* 32 byte aligned */
2855 + uint32_t rc_msgid;
2856 + uint16_t rc_datalen;
2857 + uint8_t rc_expanded;
2858 + uint8_t rc_subcmd; /* secondary command */
2859 + char rc_buf[1]; /* first byte of data goes here
2860 + and extends beyond here for
2861 + another datalen - 1 bytes.
2862 + rh_length is set to sizeof
2863 + dlm_rcom + datalen - 1 */
2867 +/* A remote query: GDLM_REMCMD_QUERY */
2869 +struct dlm_query_request {
2870 + struct dlm_header rq_header;
2871 + uint32_t rq_mstlkid; /* LockID on master node */
2872 + uint32_t rq_query; /* query from the user */
2873 + uint32_t rq_maxlocks; /* max number of locks we can
2877 +/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */
2878 +/* There may be subsequent blocks of
2879 + lock info in GDLM_REMCMD_QUERYCONT messages which just have
2880 + a normal header. The last of these will have rh_flags set to
2881 + GDLM_REMFLAG_ENDQUERY
2884 +struct dlm_query_reply {
2885 + struct dlm_header rq_header;
2886 + uint32_t rq_numlocks; /* Number of locks in reply */
2887 + uint32_t rq_startlock; /* Which lock this block starts
2888 + at (for multi-block replies) */
2889 + uint32_t rq_status;
2891 + /* Resource information */
2892 + uint32_t rq_grantcount; /* No. of nodes on grantqueue */
2893 + uint32_t rq_convcount; /* No. of nodes on convertq */
2894 + uint32_t rq_waitcount; /* No. of nodes on waitqueue */
2895 + char rq_valblk[DLM_LVB_LEN]; /* Master's LVB
2901 + * Lockqueue wait lock states
2904 +#define GDLM_LQSTATE_WAIT_RSB 1
2905 +#define GDLM_LQSTATE_WAIT_CONVERT 2
2906 +#define GDLM_LQSTATE_WAIT_CONDGRANT 3
2907 +#define GDLM_LQSTATE_WAIT_UNLOCK 4
2909 +/* Commands sent across the comms link */
2910 +#define GDLM_REMCMD_LOOKUP 1
2911 +#define GDLM_REMCMD_LOCKREQUEST 2
2912 +#define GDLM_REMCMD_UNLOCKREQUEST 3
2913 +#define GDLM_REMCMD_CONVREQUEST 4
2914 +#define GDLM_REMCMD_LOCKREPLY 5
2915 +#define GDLM_REMCMD_LOCKGRANT 6
2916 +#define GDLM_REMCMD_SENDBAST 7
2917 +#define GDLM_REMCMD_SENDCAST 8
2918 +#define GDLM_REMCMD_REM_RESDATA 9
2919 +#define GDLM_REMCMD_RECOVERMESSAGE 20
2920 +#define GDLM_REMCMD_RECOVERREPLY 21
2921 +#define GDLM_REMCMD_QUERY 30
2922 +#define GDLM_REMCMD_QUERYREPLY 31
2924 +/* Set in rh_flags when this is the last block of
2925 + query information. Note this could also be the first
2927 +#define GDLM_REMFLAG_ENDQUERY 1
2933 +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...);
2934 +void dlm_debug_dump(void);
2935 +void dlm_locks_dump(void);
2937 +#endif /* __DLM_INTERNAL_DOT_H__ */
2938 diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c
2939 --- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730
2940 +++ linux-patched/cluster/dlm/lkb.c 2004-07-13 18:57:22.000000000 +0800
2942 +/******************************************************************************
2943 +*******************************************************************************
2945 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2946 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2948 +** This copyrighted material is made available to anyone wishing to use,
2949 +** modify, copy, or redistribute it subject to the terms and conditions
2950 +** of the GNU General Public License v.2.
2952 +*******************************************************************************
2953 +******************************************************************************/
2958 + * Allocate and free locks on the lock ID table.
2960 + * This is slightly naff but I don't really like the
2961 + * VMS lockidtbl stuff as it uses a realloced array
2962 + * to hold the locks in. I think this is slightly better
2965 + * Any better suggestions gratefully received. Patrick
2969 +#include "dlm_internal.h"
2970 +#include "lockqueue.h"
2972 +#include "config.h"
2974 +#include "memory.h"
2975 +#include "lockspace.h"
2979 + * Internal find lock by ID. Must be called with the lockidtbl spinlock held.
2982 +static struct dlm_lkb *__find_lock_by_id(struct dlm_ls *ls, uint32_t lkid)
2984 + uint16_t bucket = lkid & 0xFFFF;
2985 + struct dlm_lkb *lkb;
2987 + if (bucket >= ls->ls_lkbtbl_size)
2990 + list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list){
2991 + if (lkb->lkb_id == lkid)
2999 + * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a
3000 + * random number between 0 and lockidtbl_size-1. This random number specifies
3001 + * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially
3002 + * assigned per-bucket id.
3004 + * Because the 16 bit id's per bucket can roll over, a new lkid must be checked
3005 + * against the lkid of all lkb's in the bucket to avoid duplication.
3009 +struct dlm_lkb *create_lkb(struct dlm_ls *ls)
3011 + struct dlm_lkb *lkb;
3015 + lkb = allocate_lkb(ls);
3020 + get_random_bytes(&bucket, sizeof(bucket));
3021 + bucket &= (ls->ls_lkbtbl_size - 1);
3023 + write_lock(&ls->ls_lkbtbl[bucket].lock);
3025 + lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
3027 + if (__find_lock_by_id(ls, lkid)) {
3028 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3032 + lkb->lkb_id = lkid;
3033 + list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
3034 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3040 + * Free LKB and remove it from the lockidtbl.
3041 + * NB - this always frees the lkb whereas release_rsb doesn't free an
3042 + * rsb unless its reference count is zero.
3045 +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
3047 + uint16_t bucket = lkb->lkb_id & 0xFFFF;
3049 + if (lkb->lkb_status) {
3050 + log_error(ls, "release lkb with status %u", lkb->lkb_status);
3055 + if (lkb->lkb_parent)
3056 + atomic_dec(&lkb->lkb_parent->lkb_childcnt);
3058 + write_lock(&ls->ls_lkbtbl[bucket].lock);
3059 + list_del(&lkb->lkb_idtbl_list);
3060 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3062 + /* if this is not a master copy then lvbptr points into the user's
3063 + * lksb, so don't free it */
3064 + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3065 + free_lvb(lkb->lkb_lvbptr);
3067 + if (lkb->lkb_range)
3068 + free_range(lkb->lkb_range);
3073 +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid)
3075 + struct dlm_lkb *lkb;
3076 + uint16_t bucket = lkid & 0xFFFF;
3078 + read_lock(&ls->ls_lkbtbl[bucket].lock);
3079 + lkb = __find_lock_by_id(ls, lkid);
3080 + read_unlock(&ls->ls_lkbtbl[bucket].lock);
3085 +struct dlm_lkb *dlm_get_lkb(void *ls, uint32_t lkid)
3087 + struct dlm_ls *lspace = find_lockspace_by_local_id(ls);
3088 + return find_lock_by_id(lspace, lkid);
3092 + * Initialise the range parts of an LKB.
3095 +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end)
3097 + int ret = -ENOMEM;
3100 + * if this wasn't already a range lock, make it one
3102 + if (!lkb->lkb_range) {
3103 + lkb->lkb_range = allocate_range(lspace);
3104 + if (!lkb->lkb_range)
3108 + * This is needed for conversions that contain ranges where the
3109 + * original lock didn't but it's harmless for new locks too.
3111 + lkb->lkb_range[GR_RANGE_START] = 0LL;
3112 + lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
3115 + lkb->lkb_range[RQ_RANGE_START] = start;
3116 + lkb->lkb_range[RQ_RANGE_END] = end;
3123 diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h
3124 --- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730
3125 +++ linux-patched/cluster/dlm/lkb.h 2004-07-13 18:57:22.000000000 +0800
3127 +/******************************************************************************
3128 +*******************************************************************************
3130 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3131 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3133 +** This copyrighted material is made available to anyone wishing to use,
3134 +** modify, copy, or redistribute it subject to the terms and conditions
3135 +** of the GNU General Public License v.2.
3137 +*******************************************************************************
3138 +******************************************************************************/
3140 +#ifndef __LKB_DOT_H__
3141 +#define __LKB_DOT_H__
3143 +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid);
3144 +struct dlm_lkb *create_lkb(struct dlm_ls *ls);
3145 +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb);
3146 +struct dlm_lkb *dlm_get_lkb(void *ls, uint32_t lkid);
3147 +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end);
3149 +#endif /* __LKB_DOT_H__ */
3150 diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c
3151 --- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730
3152 +++ linux-patched/cluster/dlm/locking.c 2004-07-13 18:57:22.000000000 +0800
3154 +/******************************************************************************
3155 +*******************************************************************************
3157 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3158 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3160 +** This copyrighted material is made available to anyone wishing to use,
3161 +** modify, copy, or redistribute it subject to the terms and conditions
3162 +** of the GNU General Public License v.2.
3164 +*******************************************************************************
3165 +******************************************************************************/
3170 + * This is where the main work of the DLM goes on
3174 +#include "dlm_internal.h"
3175 +#include "lockqueue.h"
3176 +#include "locking.h"
3177 +#include "lockspace.h"
3182 +#include "memory.h"
3186 +extern struct list_head lslist;
3188 +#define MAX(a, b) (((a) > (b)) ? (a) : (b))
2191 + * Lock compatibility matrix - thanks Steve
3192 + * UN = Unlocked state. Not really a state, used as a flag
3193 + * PD = Padding. Used to make the matrix a nice power of two in size
3194 + * Other states are the same as the VMS DLM.
3195 + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
3198 +#define modes_compat(gr, rq) \
3199 + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
3201 +const int __dlm_compat_matrix[8][8] = {
3202 + /* UN NL CR CW PR PW EX PD */
3203 + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
3204 + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
3205 + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
3206 + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
3207 + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
3208 + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
3209 + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
3210 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3214 + * Compatibility matrix for conversions with QUECVT set.
3215 + * Granted mode is the row; requested mode is the column.
3216 + * Usage: matrix[grmode+1][rqmode+1]
3219 +const int __quecvt_compat_matrix[8][8] = {
3220 + /* UN NL CR CW PR PW EX PD */
3221 + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
3222 + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
3223 + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
3224 + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
3225 + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
3226 + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
3227 + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
3228 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3232 + * This defines the direction of transfer of LVB data.
3233 + * Granted mode is the row; requested mode is the column.
3234 + * Usage: matrix[grmode+1][rqmode+1]
3235 + * 1 = LVB is returned to the caller
3236 + * 0 = LVB is written to the resource
3237 + * -1 = nothing happens to the LVB
3240 +const int __lvb_operations[8][8] = {
3241 + /* UN NL CR CW PR PW EX PD*/
3242 + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
3243 + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
3244 + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
3245 + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
3246 + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
3247 + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
3248 + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
3249 + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
3252 +static void grant_lock(struct dlm_lkb * lkb, int send_remote);
3253 +static void send_blocking_asts(struct dlm_rsb * rsb, struct dlm_lkb * lkb);
3254 +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb);
3255 +static int convert_lock(struct dlm_ls * ls, int mode, struct dlm_lksb *lksb,
3256 + int flags, void *ast, void *astarg, void *bast,
3257 + struct dlm_range *range);
3258 +static int dlm_lock_stage1(struct dlm_ls * lspace, struct dlm_lkb * lkb, int flags,
3259 + char *name, int namelen);
3262 +static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
3264 + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, lkb_statequeue);
3266 + if (lkb->lkb_id == first->lkb_id)
3273 + * Return 1 if the locks' ranges overlap
3274 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
3277 +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
3279 + if (!lkb1->lkb_range || !lkb2->lkb_range)
3282 + if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
3283 + lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
3290 + * Resolve conversion deadlock by changing to NL the granted mode of deadlocked
3291 + * locks on the convert queue. One of the deadlocked locks is allowed to
3292 + * retain its original granted state (we choose the lkb provided although it
3293 + * shouldn't matter which.) We do not change the granted mode on locks without
3294 + * the CONVDEADLK flag. If any of these exist (there shouldn't if the app uses
3295 + * the flag consistently) the false return value is used.
3298 +static int conversion_deadlock_resolve(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
3300 + struct dlm_lkb *this;
3303 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3307 + if (!ranges_overlap(lkb, this))
3310 + if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) {
3312 + if (!(this->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK)){
3316 + this->lkb_grmode = DLM_LOCK_NL;
3317 + this->lkb_flags |= GDLM_LKFLG_DEMOTED;
3324 + * "A conversion deadlock arises with a pair of lock requests in the converting
3325 + * queue for one resource. The granted mode of each lock blocks the requested
3326 + * mode of the other lock."
3329 +static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
3331 + struct dlm_lkb *this;
3333 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3337 + if (!ranges_overlap(lkb, this))
3340 + if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
3347 + * Check if the given lkb conflicts with another lkb on the queue.
3350 +static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
3352 + struct dlm_lkb *this;
3354 + list_for_each_entry(this, head, lkb_statequeue) {
3357 + if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
3364 + * Deadlock can arise when using the QUECVT flag if the requested mode of the
3365 + * first converting lock is incompatible with the granted mode of another
3366 + * converting lock further down the queue. To prevent this deadlock, a
3367 + * requested QUECVT lock is granted immediately if adding it to the end of
3368 + * the queue would prevent a lock ahead of it from being granted.
3371 +static int queuecvt_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
3373 + struct dlm_lkb *this;
3375 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3379 + if (ranges_overlap(lkb, this) && !modes_compat(lkb, this))
3386 + * Return 1 if the lock can be granted, 0 otherwise.
3387 + * Also detect and resolve conversion deadlocks.
3390 +static int can_be_granted(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
3392 + if (test_bit(LSFL_NOCONVGRANT, &rsb->res_ls->ls_flags) &&
3393 + lkb->lkb_grmode == DLM_LOCK_IV &&
3394 + !list_empty(&rsb->res_convertqueue))
3397 + if (lkb->lkb_rqmode == DLM_LOCK_NL)
3400 + if (lkb->lkb_rqmode == lkb->lkb_grmode)
3403 + if (queue_conflict(&rsb->res_grantqueue, lkb))
3406 + if (!queue_conflict(&rsb->res_convertqueue, lkb)) {
3407 + if (!(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
3410 + if (list_empty(&rsb->res_convertqueue) ||
3411 + first_in_list(lkb, &rsb->res_convertqueue) ||
3412 + queuecvt_deadlock_detect(rsb, lkb))
3418 + /* there *is* a conflict between this lkb and a converting lock so
3419 + we return false unless conversion deadlock resolution is permitted
3420 + (only conversion requests will have the CONVDEADLK flag set) */
3422 + if (!(lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK))
3425 + if (!conversion_deadlock_detect(rsb, lkb))
3428 + if (conversion_deadlock_resolve(rsb, lkb))
3434 +int dlm_lock(void *lockspace,
3436 + struct dlm_lksb *lksb,
3439 + unsigned int namelen,
3441 + void (*ast) (void *astarg),
3443 + void (*bast) (void *astarg, int mode),
3444 + struct dlm_range *range)
3446 + struct dlm_ls *lspace;
3447 + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL;
3448 + int ret = -EINVAL;
3450 + lspace = find_lockspace_by_local_id(lockspace);
3454 + if (mode < 0 || mode > DLM_LOCK_EX)
3457 + if (namelen > DLM_RESNAME_MAXLEN)
3460 + if (flags & DLM_LKF_CANCEL)
3463 + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
3466 + if (flags & DLM_LKF_EXPEDITE && !(flags & DLM_LKF_CONVERT))
3469 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
3472 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
3475 + if (!ast || !lksb)
3478 + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK))
3481 + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
3485 + * Take conversion path.
3488 + if (flags & DLM_LKF_CONVERT) {
3489 + ret = convert_lock(lspace, mode, lksb, flags, ast, astarg,
3495 + * Take new lock path.
3499 + down_read(&lspace->ls_unlock_sem);
3501 + parent_lkb = find_lock_by_id(lspace, parent);
3503 + if (!parent_lkb ||
3504 + parent_lkb->lkb_flags & GDLM_LKFLG_DELETED ||
3505 + parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY ||
3506 + parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) {
3507 + up_read(&lspace->ls_unlock_sem);
3511 + atomic_inc(&parent_lkb->lkb_childcnt);
3512 + up_read(&lspace->ls_unlock_sem);
3515 + down_read(&lspace->ls_in_recovery);
3519 + lkb = create_lkb(lspace);
3522 + lkb->lkb_astaddr = ast;
3523 + lkb->lkb_astparam = (long) astarg;
3524 + lkb->lkb_bastaddr = bast;
3525 + lkb->lkb_rqmode = mode;
3526 + lkb->lkb_grmode = DLM_LOCK_IV;
3527 + lkb->lkb_nodeid = -1;
3528 + lkb->lkb_lksb = lksb;
3529 + lkb->lkb_parent = parent_lkb;
3530 + lkb->lkb_lockqueue_flags = flags;
3531 + lkb->lkb_lvbptr = lksb->sb_lvbptr;
3533 + /* Copy the range if appropriate */
3535 + if (range->ra_start > range->ra_end) {
3540 + if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end))
3544 + /* Convert relevant flags to internal numbers */
3545 + if (flags & DLM_LKF_VALBLK)
3546 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3547 + if (flags & DLM_LKF_PERSISTENT)
3548 + lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT;
3549 + if (flags & DLM_LKF_NODLCKWT)
3550 + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
3552 + lksb->sb_lkid = lkb->lkb_id;
3554 + ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen);
3558 + up_read(&lspace->ls_in_recovery);
3565 + release_lkb(lspace, lkb);
3570 + atomic_dec(&parent_lkb->lkb_childcnt);
3573 + up_read(&lspace->ls_in_recovery);
3579 +int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb, int flags, char *name,
3582 + struct dlm_rsb *rsb, *parent_rsb = NULL;
3583 + struct dlm_lkb *parent_lkb = lkb->lkb_parent;
3588 + parent_rsb = parent_lkb->lkb_resource;
3590 + error = find_or_create_rsb(ls, parent_rsb, name, namelen, 1, &rsb);
3593 + lkb->lkb_resource = rsb;
3595 + log_debug(ls, "rq %u %x \"%s\"", lkb->lkb_rqmode, lkb->lkb_id,
3598 + * Next stage, do we need to find the master or can
3599 + * we get on with the real locking work ?
3602 + if (rsb->res_nodeid == -1) {
3603 + if (get_directory_nodeid(rsb) != our_nodeid()) {
3604 + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
3608 + error = dlm_dir_lookup(ls, our_nodeid(), rsb->res_name,
3609 + rsb->res_length, &nodeid);
3613 + if (nodeid == our_nodeid()) {
3614 + set_bit(RESFL_MASTER, &rsb->res_flags);
3617 + clear_bit(RESFL_MASTER, &rsb->res_flags);
3618 + rsb->res_nodeid = nodeid;
3621 + lkb->lkb_nodeid = rsb->res_nodeid;
3623 + error = dlm_lock_stage2(ls, lkb, rsb, flags);
3633 + * Locking routine called after we have an RSB, either a copy of a remote one
3634 + * or a local one, or perhaps a shiny new one all of our very own
3637 +int dlm_lock_stage2(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_rsb *rsb, int flags)
3641 + DLM_ASSERT(rsb->res_nodeid != -1, print_lkb(lkb); print_rsb(rsb););
3643 + if (rsb->res_nodeid) {
3644 + res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3645 + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT);
3647 + dlm_lock_stage3(lkb);
3654 + * Called on an RSB's master node to do stage2 locking for a remote lock
3655 + * request. Returns a proper lkb with rsb ready for lock processing.
3656 + * This is analogous to sections of dlm_lock() and dlm_lock_stage1().
3659 +struct dlm_lkb *remote_stage2(int remote_nodeid, struct dlm_ls *ls,
3660 + struct dlm_request *freq)
3662 + struct dlm_rsb *rsb = NULL, *parent_rsb = NULL;
3663 + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL;
3664 + int error, namelen;
3666 + if (freq->rr_remparid) {
3667 + parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
3671 + atomic_inc(&parent_lkb->lkb_childcnt);
3672 + parent_rsb = parent_lkb->lkb_resource;
3676 + * A new MSTCPY lkb. Initialize lkb fields including the real lkid and
3677 + * node actually holding the (non-MSTCPY) lkb. AST address are just
3678 + * flags in the master copy.
3681 + lkb = create_lkb(ls);
3684 + lkb->lkb_grmode = DLM_LOCK_IV;
3685 + lkb->lkb_rqmode = freq->rr_rqmode;
3686 + lkb->lkb_parent = parent_lkb;
3687 + lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & AST_COMP);
3688 + lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & AST_BAST);
3689 + lkb->lkb_nodeid = remote_nodeid;
3690 + lkb->lkb_remid = freq->rr_header.rh_lkid;
3691 + lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
3692 + lkb->lkb_lockqueue_flags = freq->rr_flags;
3694 + if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
3695 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3696 + allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
3697 + if (!lkb->lkb_lvbptr)
3701 + if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
3702 + error = lkb_set_range(ls, lkb, freq->rr_range_start,
3703 + freq->rr_range_end);
3709 + * Get the RSB which this lock is for. Create a new RSB if this is a
3710 + * new lock on a new resource. We must be the master of any new rsb.
3713 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
3715 + error = find_or_create_rsb(ls, parent_rsb, freq->rr_name, namelen, 0,
3720 + if (!rsb || rsb->res_nodeid == -1) {
3721 + log_debug(ls, "inval rsb to %u", remote_nodeid);
3722 + lkb->lkb_retstatus = -EINVAL;
3726 + lkb->lkb_resource = rsb;
3728 + log_debug(ls, "rq %u from %u %x \"%s\"", lkb->lkb_rqmode, remote_nodeid,
3729 + lkb->lkb_id, rsb->res_name);
3731 + DLM_ASSERT(rsb->res_nodeid == 0,
3733 + print_request(freq);
3734 + printk("nodeid %u\n", remote_nodeid););
3740 + /* release_lkb handles parent */
3741 + release_lkb(ls, lkb);
3742 + parent_lkb = NULL;
3746 + atomic_dec(&parent_lkb->lkb_childcnt);
3752 + * The final bit of lock request processing on the master node. Here the lock
3753 + * is granted and the completion ast is queued, or the lock is put on the
3754 + * waitqueue and blocking asts are sent.
3757 +void dlm_lock_stage3(struct dlm_lkb *lkb)
3759 + struct dlm_rsb *rsb = lkb->lkb_resource;
3762 + * This is a locally mastered lock on a resource that already exists,
3763 + * see if it can be granted or if it must wait. When this function is
3764 + * called for a remote lock request (process_cluster_request,
3765 + * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
3766 + * requesting node at the end of process_cluster_request, not at the
3767 + * end of grant_lock.
3770 + down_write(&rsb->res_lock);
3772 + if (can_be_granted(rsb, lkb)) {
3773 + grant_lock(lkb, 0);
3778 + * This request is not a conversion, so the lkb didn't exist other than
3779 + * for this request and should be freed after EAGAIN is returned in the
3783 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
3784 + lkb->lkb_retstatus = -EAGAIN;
3785 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
3786 + send_blocking_asts_all(rsb, lkb);
3787 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
3792 + * The requested lkb must wait. Because the rsb of the requested lkb
3793 + * is mastered here, send blocking asts for the lkb's blocking the
3797 + lkb->lkb_retstatus = 0;
3798 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
3800 + send_blocking_asts(rsb, lkb);
3803 + up_write(&rsb->res_lock);
3806 +int dlm_unlock(void *lockspace,
3809 + struct dlm_lksb *lksb,
3812 + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace);
3813 + struct dlm_lkb *lkb;
3814 + struct dlm_rsb *rsb;
3815 + int ret = -EINVAL;
3820 + lkb = find_lock_by_id(ls, lkid);
3824 + /* Can't dequeue a master copy (a remote node's mastered lock) */
3825 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3828 + /* Already waiting for a remote lock operation */
3829 + if (lkb->lkb_lockqueue_state) {
3834 + /* Can only cancel WAITING or CONVERTing locks.
3835 + * This is just a quick check - it is also checked in unlock_stage2()
3836 + * (which may be on the master) under the semaphore.
3838 + if ((flags & DLM_LKF_CANCEL) &&
3839 + (lkb->lkb_status == GDLM_LKSTS_GRANTED))
3842 + /* "Normal" unlocks must operate on a granted lock */
3843 + if (!(flags & DLM_LKF_CANCEL) &&
3844 + (lkb->lkb_status != GDLM_LKSTS_GRANTED))
3847 + down_write(&ls->ls_unlock_sem);
3848 + /* Can't dequeue a lock with sublocks */
3849 + if (atomic_read(&lkb->lkb_childcnt)) {
3850 + up_write(&ls->ls_unlock_sem);
3854 + /* Mark it as deleted so we can't use it as a parent in dlm_lock() */
3855 + if (!(flags & DLM_LKF_CANCEL))
3856 + lkb->lkb_flags |= GDLM_LKFLG_DELETED;
3857 + up_write(&ls->ls_unlock_sem);
3859 + down_read(&ls->ls_in_recovery);
3860 + rsb = find_rsb_to_unlock(ls, lkb);
3862 + log_debug(ls, "un %x ref %u flg %x nodeid %d/%d \"%s\"", lkb->lkb_id,
3863 + atomic_read(&rsb->res_ref), rsb->res_flags,
3864 + lkb->lkb_nodeid, rsb->res_nodeid, rsb->res_name);
3866 + /* Save any new params */
3868 + lkb->lkb_lksb = lksb;
3870 + lkb->lkb_astparam = (long) astarg;
3871 + lkb->lkb_lockqueue_flags = flags;
3873 + if (lkb->lkb_nodeid)
3874 + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
3876 + ret = dlm_unlock_stage2(lkb, rsb, flags);
3877 + up_read(&ls->ls_in_recovery);
3885 +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags)
3887 + int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;
3890 + down_write(&rsb->res_lock);
3892 + /* Can only cancel WAITING or CONVERTing locks */
3893 + if ((flags & DLM_LKF_CANCEL) &&
3894 + (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
3895 + lkb->lkb_retstatus = -EINVAL;
3896 + queue_ast(lkb, AST_COMP, 0);
3900 + old_status = lkb_dequeue(lkb);
3903 + * If it was granted, grant any converting or waiting locks.
3906 + if (old_status == GDLM_LKSTS_GRANTED)
3907 + grant_pending_locks(rsb);
3910 + * Cancelling a conversion
3913 + if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
3914 + /* VMS semantics say we should send blocking ASTs again here */
3915 + send_blocking_asts(rsb, lkb);
3917 + /* Remove from deadlock detection */
3918 + if (lkb->lkb_duetime)
3919 + remove_from_deadlockqueue(lkb);
3921 + /* Stick it back on the granted queue */
3922 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
3923 + lkb->lkb_rqmode = lkb->lkb_grmode;
3925 + /* Was it blocking any other locks? */
3926 + if (first_in_list(lkb, &rsb->res_convertqueue))
3927 + grant_pending_locks(rsb);
3929 + lkb->lkb_retstatus = -DLM_ECANCEL;
3930 + queue_ast(lkb, AST_COMP, 0);
3935 + * The lvb can be saved or cleared on unlock.
3938 + if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
3939 + if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
3940 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
3941 + if (flags & DLM_LKF_IVVALBLK)
3942 + memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
3945 + lkb->lkb_retstatus = flags & DLM_LKF_CANCEL ? -DLM_ECANCEL:-DLM_EUNLOCK;
3948 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
3951 + * Only free the LKB if we are the master copy. Otherwise the AST
3952 + * delivery routine will free it after delivery.
3956 + up_write(&rsb->res_lock);
3957 + release_lkb(rsb->res_ls, lkb);
3963 + up_write(&rsb->res_lock);
3973 +static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb,
3974 + int flags, void *ast, void *astarg, void *bast,
3975 + struct dlm_range *range)
3977 + struct dlm_lkb *lkb;
3978 + struct dlm_rsb *rsb;
3979 + int ret = -EINVAL;
3981 + lkb = find_lock_by_id(ls, lksb->sb_lkid);
3986 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) {
3991 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
3995 + if ((flags & DLM_LKF_QUECVT) &&
3996 + !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) {
4000 + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) {
4004 + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr) {
4008 + /* Set up the ranges as appropriate */
4010 + if (range->ra_start > range->ra_end)
4013 + if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) {
4019 + rsb = lkb->lkb_resource;
4020 + down_read(&ls->ls_in_recovery);
4022 + log_debug(ls, "cv %u %x \"%s\"", mode, lkb->lkb_id, rsb->res_name);
4024 + lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK;
4025 + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
4027 + if (flags & DLM_LKF_NODLCKWT)
4028 + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
4030 + lkb->lkb_astaddr = ast;
4032 + lkb->lkb_astparam = (long) astarg;
4034 + lkb->lkb_bastaddr = bast;
4035 + lkb->lkb_rqmode = mode;
4036 + lkb->lkb_lockqueue_flags = flags;
4037 + lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0;
4038 + lkb->lkb_lvbptr = lksb->sb_lvbptr;
4040 + if (rsb->res_nodeid) {
4041 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4042 + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT);
4044 + ret = dlm_convert_stage2(lkb, FALSE);
4047 + up_read(&ls->ls_in_recovery);
4056 + * For local conversion requests on locally mastered locks this is called
4057 + * directly from dlm_lock/convert_lock. This function is also called for
4058 + * remote conversion requests of MSTCPY locks (from process_cluster_request).
4061 +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast)
4063 + struct dlm_rsb *rsb = lkb->lkb_resource;
4066 + down_write(&rsb->res_lock);
4068 + if (can_be_granted(rsb, lkb)) {
4069 + grant_lock(lkb, 0);
4070 + grant_pending_locks(rsb);
4075 + * Remove lkb from granted queue.
4081 + * The user won't wait so stick it back on the grant queue
4084 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4085 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4086 + ret = lkb->lkb_retstatus = -EAGAIN;
4088 + queue_ast(lkb, AST_COMP, 0);
4089 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4090 + send_blocking_asts_all(rsb, lkb);
4095 + * The lkb's status tells which queue it's on. Put back on convert
4096 + * queue. (QUECVT requests added at end of the queue, all others in
4100 + lkb->lkb_retstatus = 0;
4101 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4104 + * If the request can't be granted
4107 + send_blocking_asts(rsb, lkb);
4109 + if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
4110 + add_to_deadlockqueue(lkb);
4113 + up_write(&rsb->res_lock);
4118 + * Remove lkb from any queue it's on, add it to the granted queue, and queue a
4119 + * completion ast. rsb res_lock must be held in write when this is called.
4122 +static void grant_lock(struct dlm_lkb *lkb, int send_remote)
4124 + struct dlm_rsb *rsb = lkb->lkb_resource;
4126 + if (lkb->lkb_duetime)
4127 + remove_from_deadlockqueue(lkb);
4129 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
4131 + DLM_ASSERT(lkb->lkb_lvbptr,);
4133 + if (!rsb->res_lvbptr)
4134 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
4136 + b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
4138 + memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
4140 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
4143 + if (lkb->lkb_range) {
4144 + lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
4145 + lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
4148 + lkb->lkb_grmode = lkb->lkb_rqmode;
4149 + lkb->lkb_rqmode = DLM_LOCK_IV;
4150 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4152 + lkb->lkb_highbast = 0;
4153 + lkb->lkb_retstatus = 0;
4154 + queue_ast(lkb, AST_COMP, 0);
4157 + * A remote conversion request has been granted, either immediately
4158 + * upon being requested or after waiting a bit. In the former case,
4159 + * reply_and_grant() is called. In the latter case send_remote is 1 and
4160 + * remote_grant() is called.
4162 + * The "send_remote" flag is set only for locks which are granted "out
4163 + * of band" - ie by another lock being converted or unlocked.
4165 + * The second case occurs when this lkb is granted right away as part
4166 + * of processing the initial request. In that case, we send a single
4167 + * message in reply_and_grant which combines the request reply with the
4171 + if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
4173 + remote_grant(lkb);
4174 + else if (lkb->lkb_request)
4175 + reply_and_grant(lkb);
4180 +static void send_bast_queue(struct list_head *head, struct dlm_lkb *lkb)
4182 + struct dlm_lkb *gr;
4184 + list_for_each_entry(gr, head, lkb_statequeue) {
4185 + if (gr->lkb_bastaddr &&
4186 + gr->lkb_highbast < lkb->lkb_rqmode &&
4187 + ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
4188 + queue_ast(gr, AST_BAST, lkb->lkb_rqmode);
4189 + gr->lkb_highbast = lkb->lkb_rqmode;
4195 + * Notify granted locks if they are blocking a newly forced-to-wait lock.
4198 +static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
4200 + send_bast_queue(&rsb->res_grantqueue, lkb);
4201 + /* check if the following improves performance */
4202 + /* send_bast_queue(&rsb->res_convertqueue, lkb); */
4205 +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
4207 + send_bast_queue(&rsb->res_grantqueue, lkb);
4208 + send_bast_queue(&rsb->res_convertqueue, lkb);
4212 + * Called when a lock has been dequeued. Look for any locks to grant that are
4213 + * waiting for conversion or waiting to be granted.
4214 + * The rsb res_lock must be held in write when this function is called.
4217 +int grant_pending_locks(struct dlm_rsb *rsb)
4219 + struct dlm_lkb *lkb;
4220 + struct list_head *list;
4221 + struct list_head *temp;
4222 + int8_t high = DLM_LOCK_IV;
4224 + list_for_each_safe(list, temp, &rsb->res_convertqueue) {
4225 + lkb = list_entry(list, struct dlm_lkb, lkb_statequeue);
4227 + if (can_be_granted(rsb, lkb))
4228 + grant_lock(lkb, 1);
4230 + high = MAX(lkb->lkb_rqmode, high);
4233 + list_for_each_safe(list, temp, &rsb->res_waitqueue) {
4234 + lkb = list_entry(list, struct dlm_lkb, lkb_statequeue);
4236 + if (can_be_granted(rsb, lkb))
4237 + grant_lock(lkb, 1);
4239 + high = MAX(lkb->lkb_rqmode, high);
4243 + * If there are locks left on the wait/convert queue then send blocking
4244 + * ASTs to granted locks that are blocking
4246 + * FIXME: This might generate some spurious blocking ASTs for range
4250 + if (high > DLM_LOCK_IV) {
4251 + list_for_each_safe(list, temp, &rsb->res_grantqueue) {
4252 + lkb = list_entry(list, struct dlm_lkb, lkb_statequeue);
4254 + if (lkb->lkb_bastaddr &&
4255 + (lkb->lkb_highbast < high) &&
4256 + !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
4258 + queue_ast(lkb, AST_BAST, high);
4259 + lkb->lkb_highbast = high;
4268 + * Called to cancel a locking operation that failed due to some internal
4271 + * Waiting locks will be removed, converting locks will be reverted to their
4272 + * granted status, unlocks will be left where they are.
4274 + * A completion AST will be delivered to the caller.
4277 +int cancel_lockop(struct dlm_lkb *lkb, int status)
4279 + int state = lkb->lkb_lockqueue_state;
4280 + uint16_t astflags = AST_COMP;
4282 + lkb->lkb_lockqueue_state = 0;
4285 + case GDLM_LQSTATE_WAIT_RSB:
4286 + astflags |= AST_DEL;
4289 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4290 + res_lkb_dequeue(lkb);
4291 + astflags |= AST_DEL;
4294 + case GDLM_LQSTATE_WAIT_CONVERT:
4295 + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
4297 + /* Remove from deadlock detection */
4298 + if (lkb->lkb_duetime) {
4299 + remove_from_deadlockqueue(lkb);
4303 + case GDLM_LQSTATE_WAIT_UNLOCK:
4304 + /* We can leave this. I think.... */
4308 + lkb->lkb_retstatus = status;
4309 + queue_ast(lkb, astflags, 0);
4315 + * Check for conversion deadlock. If a deadlock was found
4316 + * return lkb to kill, else return NULL
4319 +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb)
4321 + struct dlm_rsb *rsb = lkb->lkb_resource;
4322 + struct list_head *entry;
4324 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,);
4326 + /* Work our way up to the head of the queue looking for locks that
4327 + * conflict with us */
4329 + down_read(&rsb->res_lock);
4331 + entry = lkb->lkb_statequeue.prev;
4332 + while (entry != &rsb->res_convertqueue) {
4333 + struct dlm_lkb *lkb2 = list_entry(entry, struct dlm_lkb, lkb_statequeue);
4335 + if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) {
4336 + up_read(&rsb->res_lock);
4339 + entry = entry->prev;
4341 + up_read(&rsb->res_lock);
4347 + * Conversion operation was cancelled by us (not the user).
4348 + * ret contains the return code to pass onto the user
4351 +void cancel_conversion(struct dlm_lkb *lkb, int ret)
4353 + struct dlm_rsb *rsb = lkb->lkb_resource;
4355 + /* Stick it back on the granted queue */
4356 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4357 + lkb->lkb_rqmode = lkb->lkb_grmode;
4359 + remove_from_deadlockqueue(lkb);
4361 + lkb->lkb_retstatus = ret;
4362 + queue_ast(lkb, AST_COMP, 0);
4367 + * As new master of the rsb for this lkb, we need to handle these requests
4368 + * removed from the lockqueue and originating from local processes:
4369 + * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT,
4370 + * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT.
4373 +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state)
4375 + struct dlm_rsb *rsb;
4378 + case GDLM_LQSTATE_WAIT_RSB:
4379 + dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb,
4380 + lkb->lkb_lockqueue_flags,
4381 + lkb->lkb_resource->res_name,
4382 + lkb->lkb_resource->res_length);
4385 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4386 + res_lkb_dequeue(lkb);
4387 + dlm_lock_stage3(lkb);
4390 + case GDLM_LQSTATE_WAIT_UNLOCK:
4391 + rsb = find_rsb_to_unlock(ls, lkb);
4392 + dlm_unlock_stage2(lkb, rsb, lkb->lkb_lockqueue_flags);
4395 + case GDLM_LQSTATE_WAIT_CONVERT:
4396 + dlm_convert_stage2(lkb, TRUE);
4404 +static void dump_queue(struct list_head *head)
4406 + struct dlm_lkb *lkb;
4408 + list_for_each_entry(lkb, head, lkb_statequeue) {
4409 + printk("%08x gr %d rq %d flg %x sts %u node %u remid %x "
4418 + lkb->lkb_lockqueue_state,
4419 + lkb->lkb_lockqueue_flags);
4423 +static void dump_rsb(struct dlm_rsb *rsb)
4425 + printk("name \"%s\" flags %lx nodeid %u ref %u\n",
4426 + rsb->res_name, rsb->res_flags, rsb->res_nodeid,
4427 + atomic_read(&rsb->res_ref));
4429 + if (!list_empty(&rsb->res_grantqueue)) {
4430 + printk("grant queue\n");
4431 + dump_queue(&rsb->res_grantqueue);
4434 + if (!list_empty(&rsb->res_convertqueue)) {
4435 + printk("convert queue\n");
4436 + dump_queue(&rsb->res_convertqueue);
4439 + if (!list_empty(&rsb->res_waitqueue)) {
4440 + printk("wait queue\n");
4441 + dump_queue(&rsb->res_waitqueue);
4445 +void dlm_locks_dump(void)
4447 + struct dlm_ls *ls;
4448 + struct dlm_rsb *rsb;
4449 + struct list_head *head;
4452 + list_for_each_entry(ls, &lslist, ls_list) {
4453 + for (i = 0; i < ls->ls_rsbtbl_size; i++) {
4454 + head = &ls->ls_rsbtbl[i].list;
4455 + list_for_each_entry(rsb, head, res_hashchain)
4461 diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h
4462 --- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730
4463 +++ linux-patched/cluster/dlm/locking.h 2004-07-13 18:57:22.000000000 +0800
4465 +/******************************************************************************
4466 +*******************************************************************************
4468 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4469 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4471 +** This copyrighted material is made available to anyone wishing to use,
4472 +** modify, copy, or redistribute it subject to the terms and conditions
4473 +** of the GNU General Public License v.2.
4475 +*******************************************************************************
4476 +******************************************************************************/
4478 +#ifndef __LOCKING_DOT_H__
4479 +#define __LOCKING_DOT_H__
4481 +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state);
4482 +void dlm_lock_stage3(struct dlm_lkb *lkb);
4483 +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast);
4484 +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags);
4485 +int dlm_lock_stage2(struct dlm_ls *lspace, struct dlm_lkb *lkb, struct dlm_rsb *rsb, int flags);
4486 +struct dlm_rsb *create_rsb(struct dlm_ls *lspace, struct dlm_lkb *lkb, char *name, int namelen);
4487 +int free_rsb_if_unused(struct dlm_rsb *rsb);
4488 +struct dlm_lkb *remote_stage2(int remote_csid, struct dlm_ls *lspace,
4489 + struct dlm_request *freq);
4490 +int cancel_lockop(struct dlm_lkb *lkb, int status);
4491 +int dlm_remove_lock(struct dlm_lkb *lkb, uint32_t flags);
4492 +int grant_pending_locks(struct dlm_rsb *rsb);
4493 +void cancel_conversion(struct dlm_lkb *lkb, int ret);
4494 +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb);
4496 +#endif /* __LOCKING_DOT_H__ */
4497 diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c
4498 --- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730
4499 +++ linux-patched/cluster/dlm/lockqueue.c 2004-07-13 18:57:22.000000000 +0800
4501 +/******************************************************************************
4502 +*******************************************************************************
4504 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4505 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4507 +** This copyrighted material is made available to anyone wishing to use,
4508 +** modify, copy, or redistribute it subject to the terms and conditions
4509 +** of the GNU General Public License v.2.
4511 +*******************************************************************************
4512 +******************************************************************************/
4517 + * This controls the lock queue, which is where locks
4518 + * come when they need to wait for a remote operation
4521 + * This could also be thought of as the "high-level" comms
4526 +#include "dlm_internal.h"
4527 +#include "lockqueue.h"
4529 +#include "locking.h"
4531 +#include "lowcomms.h"
4532 +#include "midcomms.h"
4533 +#include "reccomms.h"
4535 +#include "lockspace.h"
4537 +#include "memory.h"
4539 +#include "queries.h"
4542 +static void add_reply_lvb(struct dlm_lkb * lkb, struct dlm_reply *reply);
4543 +static void add_request_lvb(struct dlm_lkb * lkb, struct dlm_request *req);
4546 + * format of an entry on the request queue
4549 + struct list_head rqe_list;
4550 + uint32_t rqe_nodeid;
4551 + char rqe_request[1];
4555 + * Add a new request (if appropriate) to the request queue and send the remote
4556 + * request out. - runs in the context of the locking caller
4558 + * Recovery of a remote_stage request if the remote end fails while the lkb
4559 + * is still on the lockqueue:
4561 + * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in
4562 + * lockqueue_lkb_mark() at the start of recovery.
4564 + * o Some lkb's will be rebuilt on new master rsb's during recovery.
4565 + * (depends on the type of request, see below).
4567 + * o At the end of recovery, resend_cluster_requests() looks at these
4568 + * LQRESEND lkb's and either:
4570 + * i) resends the request to the new master for the rsb where the
4571 + * request is processed as usual. The lkb remains on the lockqueue until
4572 + * the new master replies and we run process_lockqueue_reply().
4574 + * ii) if we've become the rsb master, remove the lkb from the lockqueue
4575 + * and processes the request locally via process_remastered_lkb().
4577 + * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue
4578 + * and the request should be resent if dest node is failed.
4580 + * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's
4581 + * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag
4582 + * makes send_lkb_queue() skip it). Resend this request to the new master.
4584 + * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will
4585 + * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue).
4586 + * Resend this request to the new master.
4588 + * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue.
4589 + * It will be rebuilt on the new master rsb's granted queue. Resend this
4590 + * request to the new master.
4593 +int remote_stage(struct dlm_lkb *lkb, int state)
4597 + lkb->lkb_lockqueue_state = state;
4598 + add_to_lockqueue(lkb);
4600 + error = send_cluster_request(lkb, state);
4602 + log_print("remote_stage error sending request %d", error);
4604 + /* Leave on lockqueue, it will be resent to correct node during
4608 + lkb->lkb_lockqueue_state = 0;
4609 + remove_from_lockqueue(lkb);
4617 + * Requests received while the lockspace is in recovery get added to the
4618 + * request queue and processed when recovery is complete.
4621 +void add_to_requestqueue(struct dlm_ls *ls, int nodeid, char *request, int length)
4623 + struct rq_entry *entry;
4625 + if (in_nodes_gone(ls, nodeid))
4628 + entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
4630 + // TODO something better
4631 + printk("dlm: add_to_requestqueue: out of memory\n");
4635 + log_debug(ls, "add_to_requestqueue %d", nodeid);
4636 + entry->rqe_nodeid = nodeid;
4637 + memcpy(entry->rqe_request, request, length);
4638 + list_add_tail(&entry->rqe_list, &ls->ls_requestqueue);
4641 +int process_requestqueue(struct dlm_ls *ls)
4643 + int error = 0, count = 0;
4644 + struct rq_entry *entry, *safe;
4645 + struct dlm_header *req;
4647 + log_all(ls, "process held requests");
4649 + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4650 + req = (struct dlm_header *) entry->rqe_request;
4651 + log_debug(ls, "process_requestqueue %u", entry->rqe_nodeid);
4653 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
4654 + log_debug(ls, "process_requestqueue aborted");
4659 + error = process_cluster_request(entry->rqe_nodeid, req, TRUE);
4660 + if (error == -EINTR) {
4661 + log_debug(ls, "process_requestqueue interrupted");
4665 + list_del(&entry->rqe_list);
4671 + log_all(ls, "processed %d requests", count);
4675 +void wait_requestqueue(struct dlm_ls *ls)
4677 + while (!list_empty(&ls->ls_requestqueue) &&
4678 + test_bit(LSFL_LS_RUN, &ls->ls_flags))
4683 + * Resdir requests (lookup or remove) and replies from before recovery are
4684 + * invalid since the resdir was rebuilt. Clear them. Requests from nodes now
4685 + * gone are also invalid.
4688 +void purge_requestqueue(struct dlm_ls *ls)
4691 + struct rq_entry *entry, *safe;
4692 + struct dlm_header *req;
4693 + struct dlm_request *freq;
4694 + struct dlm_lkb *lkb;
4696 + log_all(ls, "purge requests");
4698 + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4699 + req = (struct dlm_header *) entry->rqe_request;
4700 + freq = (struct dlm_request *) req;
4702 + if (req->rh_cmd == GDLM_REMCMD_REM_RESDATA ||
4703 + req->rh_cmd == GDLM_REMCMD_LOOKUP ||
4704 + in_nodes_gone(ls, entry->rqe_nodeid)) {
4706 + list_del(&entry->rqe_list);
4710 + } else if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY) {
4713 + * Replies to resdir lookups are invalid and must be
4714 + * purged. The lookup requests are marked in
4715 + * lockqueue_lkb_mark and will be resent in
4716 + * resend_cluster_requests. The only way to check if
4717 + * this is a lookup reply is to look at the
4718 + * lockqueue_state of the lkb.
4721 + lkb = find_lock_by_id(ls, freq->rr_header.rh_lkid);
4723 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
4724 + list_del(&entry->rqe_list);
4731 + log_all(ls, "purged %d requests", count);
4735 + * Check if there's a reply for the given lkid in the requestqueue.
4738 +int reply_in_requestqueue(struct dlm_ls *ls, int lkid)
4741 + struct rq_entry *entry, *safe;
4742 + struct dlm_header *req;
4743 + struct dlm_request *freq;
4745 + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
4746 + req = (struct dlm_header *) entry->rqe_request;
4747 + freq = (struct dlm_request *) req;
4749 + if (req->rh_cmd == GDLM_REMCMD_LOCKREPLY &&
4750 + freq->rr_header.rh_lkid == lkid) {
4759 +void allocate_and_copy_lvb(struct dlm_ls *ls, char **lvbptr, char *src)
4762 + *lvbptr = allocate_lvb(ls);
4764 + memcpy(*lvbptr, src, DLM_LVB_LEN);
4768 + * Process a lockqueue LKB after it has had its remote processing complete and
4769 + * been pulled from the lockqueue. Runs in the context of the DLM recvd thread
4770 + * on the machine that requested the lock.
4773 +static void process_lockqueue_reply(struct dlm_lkb *lkb,
4774 + struct dlm_reply *reply,
4777 + struct dlm_rsb *rsb = lkb->lkb_resource;
4778 + struct dlm_ls *ls = rsb->res_ls;
4779 + int oldstate, state = lkb->lkb_lockqueue_state;
4781 + lkb->lkb_lockqueue_state = 0;
4783 + remove_from_lockqueue(lkb);
4786 + case GDLM_LQSTATE_WAIT_RSB:
4788 + DLM_ASSERT(reply->rl_status == 0,
4791 + print_reply(reply););
4793 + DLM_ASSERT(rsb->res_nodeid == -1 ||
4794 + rsb->res_nodeid == 0,
4797 + print_reply(reply););
4799 + if (reply->rl_nodeid == our_nodeid()) {
4800 + if (rsb->res_nodeid == -1) {
4801 + set_bit(RESFL_MASTER, &rsb->res_flags);
4802 + rsb->res_nodeid = 0;
4804 + log_all(ls, "ignore master reply %x %u",
4805 + lkb->lkb_id, nodeid);
4808 + DLM_ASSERT(rsb->res_nodeid == -1,
4811 + print_reply(reply););
4813 + clear_bit(RESFL_MASTER, &rsb->res_flags);
4814 + rsb->res_nodeid = reply->rl_nodeid;
4817 + log_debug(ls, "lookup reply %x %u", lkb->lkb_id,
4820 + lkb->lkb_nodeid = rsb->res_nodeid;
4821 + dlm_lock_stage2(ls, lkb, rsb, lkb->lkb_lockqueue_flags);
4824 + case GDLM_LQSTATE_WAIT_CONVERT:
4825 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4828 + * After a remote lock/conversion/grant request we put the lock
4829 + * on the right queue and send an AST if appropriate. Any lock
4830 + * shuffling (eg newly granted locks because this one was
4831 + * converted downwards) will be dealt with in seperate messages
4832 + * (which may be in the same network message)
4836 + /* the destination wasn't the master */
4837 + if (reply->rl_status == -EINVAL) {
4838 + int master_nodeid;
4840 + log_debug(ls, "resend lookup");
4842 + rsb->res_nodeid = -1;
4843 + lkb->lkb_nodeid = -1;
4844 + if (get_directory_nodeid(rsb) != our_nodeid())
4845 + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
4847 + dlm_dir_lookup(ls, our_nodeid(), rsb->res_name,
4848 + rsb->res_length, &master_nodeid);
4850 + if (master_nodeid == our_nodeid()) {
4851 + set_bit(RESFL_MASTER, &rsb->res_flags);
4852 + master_nodeid = 0;
4855 + clear_bit(RESFL_MASTER,&rsb->res_flags);
4856 + rsb->res_nodeid = master_nodeid;
4857 + lkb->lkb_nodeid = master_nodeid;
4858 + dlm_lock_stage2(ls, lkb, rsb,
4859 + lkb->lkb_lockqueue_flags);
4864 + if (!lkb->lkb_remid)
4865 + lkb->lkb_remid = reply->rl_lkid;
4868 + * The remote request failed (we assume because of NOQUEUE).
4869 + * If this is a new request (non-conv) the lkb was created just
4870 + * for it so the lkb should be freed. If this was a
4871 + * conversion, the lkb already existed so we should put it back
4872 + * on the grant queue.
4875 + if (reply->rl_status != 0) {
4876 + DLM_ASSERT(reply->rl_status == -EAGAIN,);
4878 + if (state == GDLM_LQSTATE_WAIT_CONDGRANT) {
4879 + res_lkb_dequeue(lkb);
4880 + lkb->lkb_retstatus = reply->rl_status;
4881 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
4883 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4884 + lkb->lkb_retstatus = reply->rl_status;
4885 + queue_ast(lkb, AST_COMP, 0);
4891 + * The remote request was successful in granting the request or
4892 + * queuing it to be granted later. Add the lkb to the
4893 + * appropriate rsb queue.
4896 + switch (reply->rl_lockstate) {
4897 + case GDLM_LKSTS_GRANTED:
4899 + /* Compact version of grant_lock(). */
4901 + down_write(&rsb->res_lock);
4902 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
4903 + memcpy(lkb->lkb_lvbptr, reply->rl_lvb,
4906 + lkb->lkb_grmode = lkb->lkb_rqmode;
4907 + lkb->lkb_rqmode = DLM_LOCK_IV;
4908 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4910 + if (lkb->lkb_range) {
4911 + lkb->lkb_range[GR_RANGE_START] =
4912 + lkb->lkb_range[RQ_RANGE_START];
4913 + lkb->lkb_range[GR_RANGE_END] =
4914 + lkb->lkb_range[RQ_RANGE_END];
4916 + up_write(&rsb->res_lock);
4918 + lkb->lkb_retstatus = 0;
4919 + queue_ast(lkb, AST_COMP, 0);
4922 + case GDLM_LKSTS_WAITING:
4924 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4925 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4927 + log_error(ls, "wait reply for granted %x %u",
4928 + lkb->lkb_id, lkb->lkb_nodeid);
4931 + case GDLM_LKSTS_CONVERT:
4933 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
4934 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4936 + log_error(ls, "convert reply for granted %x %u",
4937 + lkb->lkb_id, lkb->lkb_nodeid);
4941 + log_error(ls, "process_lockqueue_reply state %d",
4942 + reply->rl_lockstate);
4947 + case GDLM_LQSTATE_WAIT_UNLOCK:
4950 + * Unlocks should never fail. Update local lock info. This
4951 + * always sends completion AST with status in lksb
4954 + DLM_ASSERT(reply->rl_status == 0,);
4955 + oldstate = res_lkb_dequeue(lkb);
4957 + /* Differentiate between unlocks and conversion cancellations */
4958 + if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL &&
4959 + oldstate == GDLM_LKSTS_CONVERT) {
4960 + res_lkb_enqueue(lkb->lkb_resource, lkb,
4961 + GDLM_LKSTS_GRANTED);
4962 + lkb->lkb_retstatus = -DLM_ECANCEL;
4963 + queue_ast(lkb, AST_COMP, 0);
4965 + lkb->lkb_retstatus = -DLM_EUNLOCK;
4966 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
4971 + log_error(ls, "process_lockqueue_reply id %x state %d",
4972 + lkb->lkb_id, state);
4977 + * Tell a remote node to grant a lock. This happens when we are the master
4978 + * copy for a lock that is actually held on a remote node. The remote end is
4979 + * also responsible for sending the completion AST.
4982 +void remote_grant(struct dlm_lkb *lkb)
4984 + struct writequeue_entry *e;
4985 + struct dlm_request *req;
4987 + // TODO Error handling
4988 + e = lowcomms_get_buffer(lkb->lkb_nodeid,
4989 + sizeof(struct dlm_request),
4990 + lkb->lkb_resource->res_ls->ls_allocation,
4995 + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT;
4996 + req->rr_header.rh_length = sizeof(struct dlm_request);
4997 + req->rr_header.rh_flags = 0;
4998 + req->rr_header.rh_lkid = lkb->lkb_id;
4999 + req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id;
5000 + req->rr_remlkid = lkb->lkb_remid;
5001 + req->rr_flags = 0;
5003 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) {
5004 + /* This is a confusing non-standard use of rr_flags which is
5005 + * usually used to pass lockqueue_flags. */
5006 + req->rr_flags |= GDLM_LKFLG_DEMOTED;
5009 + add_request_lvb(lkb, req);
5010 + midcomms_send_buffer(&req->rr_header, e);
5013 +void reply_and_grant(struct dlm_lkb *lkb)
5015 + struct dlm_request *req = lkb->lkb_request;
5016 + struct dlm_reply *reply;
5017 + struct writequeue_entry *e;
5019 + // TODO Error handling
5020 + e = lowcomms_get_buffer(lkb->lkb_nodeid,
5021 + sizeof(struct dlm_reply),
5022 + lkb->lkb_resource->res_ls->ls_allocation,
5023 + (char **) &reply);
5027 + reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5028 + reply->rl_header.rh_flags = 0;
5029 + reply->rl_header.rh_length = sizeof(struct dlm_reply);
5030 + reply->rl_header.rh_lkid = req->rr_header.rh_lkid;
5031 + reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace;
5033 + reply->rl_status = lkb->lkb_retstatus;
5034 + reply->rl_lockstate = lkb->lkb_status;
5035 + reply->rl_lkid = lkb->lkb_id;
5037 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),);
5039 + lkb->lkb_request = NULL;
5041 + add_reply_lvb(lkb, reply);
5042 + midcomms_send_buffer(&reply->rl_header, e);
5046 + * Request removal of a dead entry in the resource directory
5049 +void remote_remove_resdata(struct dlm_ls *ls, int nodeid, char *name,
5052 + struct writequeue_entry *e;
5053 + struct dlm_request *req;
5055 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5056 + struct dlm_rcom *rc = allocate_rcom_buffer(ls);
5058 + memcpy(rc->rc_buf, name, namelen);
5059 + rc->rc_datalen = namelen;
5061 + rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0);
5063 + free_rcom_buffer(rc);
5066 + // TODO Error handling
5067 + e = lowcomms_get_buffer(nodeid,
5068 + sizeof(struct dlm_request) + namelen - 1,
5069 + ls->ls_allocation, (char **) &req);
5073 + memset(req, 0, sizeof(struct dlm_request) + namelen - 1);
5074 + req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA;
5075 + req->rr_header.rh_length =
5076 + sizeof(struct dlm_request) + namelen - 1;
5077 + req->rr_header.rh_flags = 0;
5078 + req->rr_header.rh_lkid = 0;
5079 + req->rr_header.rh_lockspace = ls->ls_global_id;
5080 + req->rr_remlkid = 0;
5081 + memcpy(req->rr_name, name, namelen);
5083 + midcomms_send_buffer(&req->rr_header, e);
5087 + * Send remote cluster request to directory or master node before the request
5088 + * is put on the lock queue. Runs in the context of the locking caller.
5091 +int send_cluster_request(struct dlm_lkb *lkb, int state)
5093 + uint32_t target_nodeid;
5094 + struct dlm_rsb *rsb = lkb->lkb_resource;
5095 + struct dlm_ls *ls = rsb->res_ls;
5096 + struct dlm_request *req;
5097 + struct writequeue_entry *e;
5099 + if (state == GDLM_LQSTATE_WAIT_RSB)
5100 + target_nodeid = get_directory_nodeid(rsb);
5102 + target_nodeid = lkb->lkb_nodeid;
5104 + /* during recovery it's valid for target_nodeid to equal our own;
5105 + resend_cluster_requests does this to get requests back on track */
5107 + DLM_ASSERT(target_nodeid && target_nodeid != -1,
5110 + printk("target_nodeid %u\n", target_nodeid););
5112 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5113 + /* this may happen when called by resend_cluster_request */
5114 + log_error(ls, "send_cluster_request to %u state %d recovery",
5115 + target_nodeid, state);
5118 + e = lowcomms_get_buffer(target_nodeid,
5119 + sizeof(struct dlm_request) +
5120 + rsb->res_length - 1, ls->ls_allocation,
5124 + memset(req, 0, sizeof(struct dlm_request) + rsb->res_length - 1);
5126 + /* Common stuff, some are just defaults */
5128 + if (lkb->lkb_bastaddr)
5129 + req->rr_asts = AST_BAST;
5130 + if (lkb->lkb_astaddr)
5131 + req->rr_asts |= AST_COMP;
5132 + if (lkb->lkb_parent)
5133 + req->rr_remparid = lkb->lkb_parent->lkb_remid;
5135 + req->rr_flags = lkb->lkb_lockqueue_flags;
5136 + req->rr_rqmode = lkb->lkb_rqmode;
5137 + req->rr_remlkid = lkb->lkb_remid;
5138 + req->rr_header.rh_length =
5139 + sizeof(struct dlm_request) + rsb->res_length - 1;
5140 + req->rr_header.rh_flags = 0;
5141 + req->rr_header.rh_lkid = lkb->lkb_id;
5142 + req->rr_header.rh_lockspace = ls->ls_global_id;
5146 + case GDLM_LQSTATE_WAIT_RSB:
5148 + DLM_ASSERT(!lkb->lkb_parent,
5152 + DLM_ASSERT(rsb->res_nodeid == -1,
5156 + log_debug(ls, "send lu %x to %u", lkb->lkb_id, target_nodeid);
5158 + req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP;
5159 + memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5162 + case GDLM_LQSTATE_WAIT_CONVERT:
5164 + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid,
5168 + log_debug(ls, "send cv %x to %u", lkb->lkb_id, target_nodeid);
5170 + req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST;
5171 + if (lkb->lkb_range) {
5172 + req->rr_flags |= GDLM_LKFLG_RANGE;
5173 + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5174 + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5178 + case GDLM_LQSTATE_WAIT_CONDGRANT:
5180 + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid,
5184 + log_debug(ls, "send rq %x to %u", lkb->lkb_id, target_nodeid);
5186 + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST;
5187 + memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5188 + if (lkb->lkb_range) {
5189 + req->rr_flags |= GDLM_LKFLG_RANGE;
5190 + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5191 + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5195 + case GDLM_LQSTATE_WAIT_UNLOCK:
5197 + log_debug(ls, "send un %x to %u", lkb->lkb_id, target_nodeid);
5199 + if (rsb->res_nodeid != -1)
5200 + log_all(ls, "un %x to %u rsb nodeid %u", lkb->lkb_id,
5201 + target_nodeid, rsb->res_nodeid);
5203 + req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST;
5207 + DLM_ASSERT(0, printk("Unknown cluster request\n"););
5210 + add_request_lvb(lkb, req);
5211 + midcomms_send_buffer(&req->rr_header, e);
5217 + * We got a request from another cluster node, process it and return an info
5218 + * structure with the lock state/LVB etc as required. Executes in the DLM's
5222 +int process_cluster_request(int nodeid, struct dlm_header *req, int recovery)
5224 + struct dlm_ls *lspace;
5225 + struct dlm_lkb *lkb = NULL;
5226 + struct dlm_rsb *rsb;
5227 + int send_reply = 0, status = 0, namelen;
5228 + struct dlm_request *freq = (struct dlm_request *) req;
5229 + struct dlm_reply *rp = (struct dlm_reply *) req;
5230 + struct dlm_reply reply;
5232 + lspace = find_lockspace_by_global_id(req->rh_lockspace);
5235 + log_print("process_cluster_request invalid lockspace %x "
5236 + "from %d req %u", req->rh_lockspace, nodeid,
5242 + /* wait for recoverd to drain requestqueue */
5244 + wait_requestqueue(lspace);
5247 + * If we're in recovery then queue the request for later. Otherwise,
5248 + * we still need to get the "in_recovery" lock to make sure the
5249 + * recovery itself doesn't start until we are done.
5252 + if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) {
5253 + if (test_bit(LSFL_REQUEST_WARN, &lspace->ls_flags))
5254 + log_error(lspace, "process_cluster_request warning %u",
5256 + add_to_requestqueue(lspace, nodeid, (char *) req,
5258 + log_debug(lspace, "process_cluster_request queue %d from %u",
5259 + req->rh_cmd, nodeid);
5263 + if (!down_read_trylock(&lspace->ls_in_recovery)) {
5270 + * Process the request.
5273 + switch (req->rh_cmd) {
5275 + case GDLM_REMCMD_LOOKUP:
5277 + uint32_t dir_nodeid, r_nodeid;
5280 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5282 + dir_nodeid = name_to_directory_nodeid(lspace,
5285 + if (dir_nodeid != our_nodeid())
5286 + log_debug(lspace, "ignoring directory lookup");
5288 + status = dlm_dir_lookup(lspace, nodeid, freq->rr_name,
5289 + namelen, &r_nodeid);
5293 + reply.rl_status = status;
5294 + reply.rl_lockstate = 0;
5295 + reply.rl_nodeid = r_nodeid;
5300 + case GDLM_REMCMD_REM_RESDATA:
5302 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5303 + remove_resdata(lspace, nodeid, freq->rr_name, namelen);
5306 + case GDLM_REMCMD_LOCKREQUEST:
5308 + lkb = remote_stage2(nodeid, lspace, freq);
5310 + lkb->lkb_request = freq;
5311 + if (lkb->lkb_retstatus != -EINVAL)
5312 + dlm_lock_stage3(lkb);
5315 + * If the request was granted in lock_stage3, then a
5316 + * reply message was already sent in combination with
5317 + * the grant message and lkb_request is NULL.
5320 + if (lkb->lkb_request) {
5321 + lkb->lkb_request = NULL;
5323 + reply.rl_status = lkb->lkb_retstatus;
5324 + reply.rl_lockstate = lkb->lkb_status;
5325 + reply.rl_lkid = lkb->lkb_id;
5328 + * If the request could not be granted and the
5329 + * user won't wait, then free up the LKB
5332 + if (lkb->lkb_retstatus == -EAGAIN) {
5333 + rsb = lkb->lkb_resource;
5334 + release_lkb(lspace, lkb);
5338 + else if (lkb->lkb_retstatus == -EINVAL) {
5339 + release_lkb(lspace, lkb);
5344 + reply.rl_status = -ENOMEM;
5349 + case GDLM_REMCMD_CONVREQUEST:
5351 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5354 + print_request(freq);
5355 + printk("nodeid %u\n", nodeid););
5357 + rsb = lkb->lkb_resource;
5361 + print_request(freq);
5362 + printk("nodeid %u\n", nodeid););
5364 + DLM_ASSERT(!rsb->res_nodeid,
5367 + print_request(freq);
5368 + printk("nodeid %u\n", nodeid););
5370 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,
5373 + print_request(freq);
5374 + printk("nodeid %u\n", nodeid););
5376 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_GRANTED,
5379 + print_request(freq);
5380 + printk("nodeid %u\n", nodeid););
5382 + lkb->lkb_rqmode = freq->rr_rqmode;
5383 + lkb->lkb_lockqueue_flags = freq->rr_flags;
5384 + lkb->lkb_request = freq;
5385 + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
5387 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK ||
5388 + freq->rr_flags & DLM_LKF_VALBLK) {
5389 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
5390 + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr,
5394 + if (freq->rr_flags & GDLM_LKFLG_RANGE) {
5395 + if (lkb_set_range(lspace, lkb, freq->rr_range_start,
5396 + freq->rr_range_end)) {
5397 + reply.rl_status = -ENOMEM;
5403 + log_debug(lspace, "cv %u from %u %x \"%s\"", lkb->lkb_rqmode,
5404 + nodeid, lkb->lkb_id, rsb->res_name);
5406 + dlm_convert_stage2(lkb, FALSE);
5409 + * If the conv request was granted in stage2, then a reply
5410 + * message was already sent in combination with the grant
5414 + if (lkb->lkb_request) {
5415 + lkb->lkb_request = NULL;
5417 + reply.rl_status = lkb->lkb_retstatus;
5418 + reply.rl_lockstate = lkb->lkb_status;
5419 + reply.rl_lkid = lkb->lkb_id;
5423 + case GDLM_REMCMD_LOCKREPLY:
5425 + lkb = find_lock_by_id(lspace, req->rh_lkid);
5429 + printk("nodeid %u\n", nodeid););
5431 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY),
5434 + printk("nodeid %u\n", nodeid););
5436 + process_lockqueue_reply(lkb, rp, nodeid);
5439 + case GDLM_REMCMD_LOCKGRANT:
5442 + * Remote lock has been granted asynchronously. Do a compact
5443 + * version of what grant_lock() does.
5446 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5449 + print_request(freq);
5450 + printk("nodeid %u\n", nodeid););
5452 + rsb = lkb->lkb_resource;
5456 + print_request(freq);
5457 + printk("nodeid %u\n", nodeid););
5459 + DLM_ASSERT(rsb->res_nodeid,
5462 + print_request(freq);
5463 + printk("nodeid %u\n", nodeid););
5465 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY),
5468 + print_request(freq);
5469 + printk("nodeid %u\n", nodeid););
5471 + if (lkb->lkb_lockqueue_state) {
5472 + log_error(rsb->res_ls, "granting lock on lockqueue");
5476 + down_write(&rsb->res_lock);
5478 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5479 + memcpy(lkb->lkb_lvbptr, freq->rr_lvb, DLM_LVB_LEN);
5481 + lkb->lkb_grmode = lkb->lkb_rqmode;
5482 + lkb->lkb_rqmode = DLM_LOCK_IV;
5484 + if (lkb->lkb_range) {
5485 + lkb->lkb_range[GR_RANGE_START] =
5486 + lkb->lkb_range[RQ_RANGE_START];
5487 + lkb->lkb_range[GR_RANGE_END] =
5488 + lkb->lkb_range[RQ_RANGE_END];
5491 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5492 + up_write(&rsb->res_lock);
5494 + if (freq->rr_flags & GDLM_LKFLG_DEMOTED)
5495 + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
5497 + lkb->lkb_retstatus = 0;
5498 + queue_ast(lkb, AST_COMP, 0);
5501 + case GDLM_REMCMD_SENDBAST:
5503 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5506 + print_request(freq);
5507 + printk("nodeid %u\n", nodeid););
5509 + if (lkb->lkb_status == GDLM_LKSTS_GRANTED)
5510 + queue_ast(lkb, AST_BAST, freq->rr_rqmode);
5513 + case GDLM_REMCMD_SENDCAST:
5515 + /* This is only used for some error completion ASTs */
5517 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5520 + print_request(freq);
5521 + printk("nodeid %u\n", nodeid););
5523 + /* Return the lock to granted status */
5524 + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
5525 + lkb->lkb_retstatus = freq->rr_status;
5526 + queue_ast(lkb, AST_COMP, 0);
5529 + case GDLM_REMCMD_UNLOCKREQUEST:
5531 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5534 + print_request(freq);
5535 + printk("nodeid %u\n", nodeid););
5537 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,
5539 + print_request(freq);
5540 + printk("nodeid %u\n", nodeid););
5542 + rsb = find_rsb_to_unlock(lspace, lkb);
5544 + log_debug(lspace, "un from %u %x \"%s\"", nodeid, lkb->lkb_id,
5547 + reply.rl_status = dlm_unlock_stage2(lkb, rsb, freq->rr_flags);
5551 + case GDLM_REMCMD_QUERY:
5552 + remote_query(nodeid, lspace, req);
5555 + case GDLM_REMCMD_QUERYREPLY:
5556 + remote_query_reply(nodeid, lspace, req);
5560 + log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd);
5563 + up_read(&lspace->ls_in_recovery);
5567 + reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5568 + reply.rl_header.rh_flags = 0;
5569 + reply.rl_header.rh_length = sizeof(reply);
5570 + reply.rl_header.rh_lkid = freq->rr_header.rh_lkid;
5571 + reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace;
5573 + status = midcomms_send_message(nodeid, &reply.rl_header,
5582 +static void add_reply_lvb(struct dlm_lkb *lkb, struct dlm_reply *reply)
5584 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5585 + memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5588 +static void add_request_lvb(struct dlm_lkb *lkb, struct dlm_request *req)
5590 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5591 + memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
5593 diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h
5594 --- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730
5595 +++ linux-patched/cluster/dlm/lockqueue.h 2004-07-13 18:57:22.000000000 +0800
5597 +/******************************************************************************
5598 +*******************************************************************************
5600 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5601 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5603 +** This copyrighted material is made available to anyone wishing to use,
5604 +** modify, copy, or redistribute it subject to the terms and conditions
5605 +** of the GNU General Public License v.2.
5607 +*******************************************************************************
5608 +******************************************************************************/
5610 +#ifndef __LOCKQUEUE_DOT_H__
5611 +#define __LOCKQUEUE_DOT_H__
5613 +void remote_grant(struct dlm_lkb * lkb);
5614 +void reply_and_grant(struct dlm_lkb * lkb);
5615 +int remote_stage(struct dlm_lkb * lkb, int state);
5616 +int process_cluster_request(int csid, struct dlm_header *req, int recovery);
5617 +int send_cluster_request(struct dlm_lkb * lkb, int state);
5618 +void purge_requestqueue(struct dlm_ls * ls);
5619 +int process_requestqueue(struct dlm_ls * ls);
5620 +int reply_in_requestqueue(struct dlm_ls * ls, int lkid);
5621 +void remote_remove_resdata(struct dlm_ls * ls, int nodeid, char *name, int namelen);
5622 +void allocate_and_copy_lvb(struct dlm_ls * ls, char **lvbptr, char *src);
5624 +#endif /* __LOCKQUEUE_DOT_H__ */
5625 diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c
5626 --- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730
5627 +++ linux-patched/cluster/dlm/lockspace.c 2004-07-13 18:57:22.000000000 +0800
5629 +/******************************************************************************
5630 +*******************************************************************************
5632 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5633 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
5635 +** This copyrighted material is made available to anyone wishing to use,
5636 +** modify, copy, or redistribute it subject to the terms and conditions
5637 +** of the GNU General Public License v.2.
5639 +*******************************************************************************
5640 +******************************************************************************/
5642 +#include <linux/module.h>
5644 +#include "dlm_internal.h"
5645 +#include "recoverd.h"
5650 +#include "lowcomms.h"
5651 +#include "config.h"
5652 +#include "memory.h"
5653 +#include "lockspace.h"
5654 +#include "device.h"
5656 +#define GDST_NONE (0)
5657 +#define GDST_RUNNING (1)
5659 +static int dlmstate;
5660 +static int dlmcount;
5661 +static struct semaphore dlmstate_lock;
5662 +struct list_head lslist;
5663 +spinlock_t lslist_lock;
5664 +struct kcl_service_ops ls_ops;
5666 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags);
5669 +void dlm_lockspace_init(void)
5671 + dlmstate = GDST_NONE;
5673 + init_MUTEX(&dlmstate_lock);
5674 + INIT_LIST_HEAD(&lslist);
5675 + spin_lock_init(&lslist_lock);
5678 +struct dlm_ls *find_lockspace_by_global_id(uint32_t id)
5680 + struct dlm_ls *ls;
5682 + spin_lock(&lslist_lock);
5684 + list_for_each_entry(ls, &lslist, ls_list) {
5685 + if (ls->ls_global_id == id)
5690 + spin_unlock(&lslist_lock);
5694 +/* TODO: make this more efficient */
5695 +struct dlm_ls *find_lockspace_by_local_id(void *id)
5697 + struct dlm_ls *ls;
5699 + spin_lock(&lslist_lock);
5701 + list_for_each_entry(ls, &lslist, ls_list) {
5702 + if (ls->ls_local_id == (uint32_t)(long)id)
5707 + spin_unlock(&lslist_lock);
5711 +struct dlm_ls *find_lockspace_by_name(char *name, int namelen)
5713 + struct dlm_ls *ls;
5715 + spin_lock(&lslist_lock);
5717 + list_for_each_entry(ls, &lslist, ls_list) {
5718 + if (ls->ls_namelen == namelen &&
5719 + memcmp(ls->ls_name, name, namelen) == 0)
5724 + spin_unlock(&lslist_lock);
5729 + * Called from dlm_init. These are the general threads which are not
5730 + * lockspace-specific and work for all dlm lockspaces.
5733 +static int threads_start(void)
5737 + /* Thread which interacts with cman for all ls's */
5738 + error = dlm_recoverd_start();
5740 + log_print("cannot start recovery thread %d", error);
5744 + /* Thread which processes lock requests for all ls's */
5745 + error = astd_start();
5747 + log_print("cannot start ast thread %d", error);
5748 + goto recoverd_fail;
5751 + /* Thread for sending/receiving messages for all ls's */
5752 + error = lowcomms_start();
5754 + log_print("cannot start lowcomms %d", error);
5764 + dlm_recoverd_stop();
5770 +static void threads_stop(void)
5774 + dlm_recoverd_stop();
5777 +static int init_internal(void)
5781 + if (dlmstate == GDST_RUNNING)
5784 + error = threads_start();
5788 + dlmstate = GDST_RUNNING;
5798 + * Called after dlm module is loaded and before any lockspaces are created.
5799 + * Starts and initializes global threads and structures. These global entities
5800 + * are shared by and independent of all lockspaces.
5802 + * There should be a dlm-specific user command which a person can run which
5803 + * calls this function. If a user hasn't run that command and something
5804 + * creates a new lockspace, this is called first.
5806 + * This also starts the default lockspace.
5813 + down(&dlmstate_lock);
5814 + error = init_internal();
5815 + up(&dlmstate_lock);
5820 +int dlm_release(void)
5824 + down(&dlmstate_lock);
5826 + if (dlmstate == GDST_NONE)
5835 + spin_lock(&lslist_lock);
5836 + if (!list_empty(&lslist)) {
5837 + spin_unlock(&lslist_lock);
5838 + log_print("cannot stop threads, lockspaces still exist");
5841 + spin_unlock(&lslist_lock);
5844 + dlmstate = GDST_NONE;
5847 + up(&dlmstate_lock);
5852 +struct dlm_ls *allocate_ls(int namelen)
5854 + struct dlm_ls *ls;
5856 + /* FIXME: use appropriate malloc type */
5858 + ls = kmalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
5860 + memset(ls, 0, sizeof(struct dlm_ls) + namelen);
5865 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags)
5867 + struct dlm_ls *ls;
5868 + int i, size, error = -ENOMEM;
5869 + uint32_t local_id = 0;
5871 + if (!try_module_get(THIS_MODULE))
5874 + if (namelen > MAX_SERVICE_NAME_LEN)
5877 + if ((ls = find_lockspace_by_name(name, namelen))) {
5878 + *lockspace = (void *)(long)ls->ls_local_id;
5883 + * Initialize ls fields
5886 + ls = allocate_ls(namelen);
5890 + memcpy(ls->ls_name, name, namelen);
5891 + ls->ls_namelen = namelen;
5893 + ls->ls_allocation = GFP_KERNEL;
5896 + size = dlm_config.rsbtbl_size;
5897 + ls->ls_rsbtbl_size = size;
5899 + ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
5900 + if (!ls->ls_rsbtbl)
5902 + for (i = 0; i < size; i++) {
5903 + INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
5904 + rwlock_init(&ls->ls_rsbtbl[i].lock);
5907 + size = dlm_config.lkbtbl_size;
5908 + ls->ls_lkbtbl_size = size;
5910 + ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
5911 + if (!ls->ls_lkbtbl)
5913 + for (i = 0; i < size; i++) {
5914 + INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
5915 + rwlock_init(&ls->ls_lkbtbl[i].lock);
5916 + ls->ls_lkbtbl[i].counter = 1;
5919 + size = dlm_config.dirtbl_size;
5920 + ls->ls_dirtbl_size = size;
5922 + ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
5923 + if (!ls->ls_dirtbl)
5925 + for (i = 0; i < size; i++) {
5926 + INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
5927 + rwlock_init(&ls->ls_dirtbl[i].lock);
5930 + INIT_LIST_HEAD(&ls->ls_nodes);
5931 + INIT_LIST_HEAD(&ls->ls_nodes_gone);
5932 + ls->ls_num_nodes = 0;
5933 + INIT_LIST_HEAD(&ls->ls_recover);
5934 + spin_lock_init(&ls->ls_recover_lock);
5935 + INIT_LIST_HEAD(&ls->ls_recover_list);
5936 + ls->ls_recover_list_count = 0;
5937 + spin_lock_init(&ls->ls_recover_list_lock);
5938 + init_waitqueue_head(&ls->ls_wait_general);
5939 + INIT_LIST_HEAD(&ls->ls_rootres);
5940 + INIT_LIST_HEAD(&ls->ls_requestqueue);
5941 + INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list);
5942 + ls->ls_last_stop = 0;
5943 + ls->ls_last_start = 0;
5944 + ls->ls_last_finish = 0;
5945 + ls->ls_rcom_msgid = 0;
5946 + init_MUTEX(&ls->ls_rcom_lock);
5947 + init_rwsem(&ls->ls_in_recovery);
5948 + init_rwsem(&ls->ls_unlock_sem);
5949 + init_rwsem(&ls->ls_rec_rsblist);
5950 + init_rwsem(&ls->ls_gap_rsblist);
5951 + down_write(&ls->ls_in_recovery);
5953 + if (flags & DLM_LSF_NOTIMERS)
5954 + set_bit(LSFL_NOTIMERS, &ls->ls_flags);
5955 + if (flags & DLM_LSF_NOCONVGRANT)
5956 + set_bit(LSFL_NOCONVGRANT, &ls->ls_flags);
5959 + * Connect this lockspace with the cluster manager
5962 + error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM,
5963 + &ls_ops, TRUE, (void *) ls, &local_id);
5967 + ls->ls_state = LSST_INIT;
5968 + ls->ls_local_id = local_id;
5970 + spin_lock(&lslist_lock);
5971 + list_add(&ls->ls_list, &lslist);
5972 + spin_unlock(&lslist_lock);
5974 + error = kcl_join_service(local_id);
5976 + log_error(ls, "service manager join error %d", error);
5980 + /* The ls isn't actually running until it receives a start() from CMAN.
5981 + Neither does it have a global ls id until started. */
5983 + /* Return the local ID as the lockspace handle. I've left this
5984 + cast to a void* as it allows us to replace it with pretty much
5985 + anything at a future date without breaking clients. But returning
5986 + the address of the lockspace is a bad idea as it could get
5987 + forcibly removed, leaving client with a dangling pointer */
5988 + *lockspace = (void *)(long)local_id;
5993 + kcl_unregister_service(ls->ls_local_id);
5995 + kfree(ls->ls_dirtbl);
5997 + kfree(ls->ls_lkbtbl);
5999 + kfree(ls->ls_rsbtbl);
6007 + * Called by a system like GFS which wants independent lock spaces.
6010 +int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags)
6012 + int error = -ENOSYS;
6014 + down(&dlmstate_lock);
6015 + error = init_internal();
6019 + error = new_lockspace(name, namelen, lockspace, flags);
6021 + up(&dlmstate_lock);
6025 +/* Return 1 if the lockspace still has active remote locks,
6026 + * 2 if the lockspace still has active local locks.
6028 +static int lockspace_busy(struct dlm_ls *ls)
6030 + int i, lkb_found = 0;
6031 + struct dlm_lkb *lkb;
6033 + /* NOTE: We check the lockidtbl here rather than the resource table.
6034 + This is because there may be LKBs queued as ASTs that have been
6035 + unlinked from their RSBs and are pending deletion once the AST has
6038 + for (i = 0; i < ls->ls_lkbtbl_size; i++) {
6039 + read_lock(&ls->ls_lkbtbl[i].lock);
6040 + if (!list_empty(&ls->ls_lkbtbl[i].list)) {
6042 + list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
6044 + if (!lkb->lkb_nodeid) {
6045 + read_unlock(&ls->ls_lkbtbl[i].lock);
6050 + read_unlock(&ls->ls_lkbtbl[i].lock);
6055 +static int release_lockspace(struct dlm_ls *ls, int force)
6057 + struct dlm_lkb *lkb;
6058 + struct dlm_rsb *rsb;
6059 + struct dlm_recover *rv;
6060 + struct dlm_csb *csb;
6061 + struct list_head *head;
6063 + int busy = lockspace_busy(ls);
6065 + /* Don't destroy a busy lockspace */
6070 + kcl_leave_service(ls->ls_local_id);
6071 + kcl_unregister_service(ls->ls_local_id);
6074 + spin_lock(&lslist_lock);
6075 + list_del(&ls->ls_list);
6076 + spin_unlock(&lslist_lock);
6079 + * Free resdata structs.
6082 + dlm_dir_clear(ls);
6083 + kfree(ls->ls_dirtbl);
6086 + * Free all lkb's on lkbtbl[] lists.
6089 + for (i = 0; i < ls->ls_lkbtbl_size; i++) {
6090 + head = &ls->ls_lkbtbl[i].list;
6091 + while (!list_empty(head)) {
6092 + lkb = list_entry(head->next, struct dlm_lkb,
6094 + list_del(&lkb->lkb_idtbl_list);
6096 + if (lkb->lkb_lockqueue_state)
6097 + remove_from_lockqueue(lkb);
6099 + if (lkb->lkb_astflags & (AST_COMP | AST_BAST))
6100 + list_del(&lkb->lkb_astqueue);
6102 + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
6103 + free_lvb(lkb->lkb_lvbptr);
6109 + kfree(ls->ls_lkbtbl);
6112 + * Free all rsb's on rsbtbl[] lists
6115 + for (i = 0; i < ls->ls_rsbtbl_size; i++) {
6116 + head = &ls->ls_rsbtbl[i].list;
6117 + while (!list_empty(head)) {
6118 + rsb = list_entry(head->next, struct dlm_rsb,
6120 + list_del(&rsb->res_hashchain);
6122 + if (rsb->res_lvbptr)
6123 + free_lvb(rsb->res_lvbptr);
6129 + kfree(ls->ls_rsbtbl);
6132 + * Free structures on any other lists
6135 + head = &ls->ls_recover;
6136 + while (!list_empty(head)) {
6137 + rv = list_entry(head->next, struct dlm_recover, list);
6138 + list_del(&rv->list);
6142 + head = &ls->ls_nodes;
6143 + while (!list_empty(head)) {
6144 + csb = list_entry(head->next, struct dlm_csb, list);
6145 + list_del(&csb->list);
6149 + head = &ls->ls_nodes_gone;
6150 + while (!list_empty(head)) {
6151 + csb = list_entry(head->next, struct dlm_csb, list);
6152 + list_del(&csb->list);
6160 + module_put(THIS_MODULE);
6166 + * Called when a system has released all its locks and is not going to use the
6167 + * lockspace any longer. We blindly free everything we're managing for this
6168 + * lockspace. Remaining nodes will go through the recovery process as if we'd
6169 + * died. The lockspace must continue to function as usual, participating in
6170 + * recoveries, until kcl_leave_service returns.
6172 + * Force has 4 possible values:
6173 + * 0 - don't destroy lockspace if it has any LKBs
6174 + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
6175 + * 2 - destroy lockspace regardless of LKBs
6176 + * 3 - destroy lockspace as part of a forced shutdown
6179 +int dlm_release_lockspace(void *lockspace, int force)
6181 + struct dlm_ls *ls;
6183 + ls = find_lockspace_by_local_id(lockspace);
6187 + return release_lockspace(ls, force);
6191 +/* Called when the cluster is being shut down dirtily */
6192 +void dlm_emergency_shutdown()
6194 + struct dlm_ls *ls;
6195 + struct dlm_ls *tmp;
6197 + /* Shut lowcomms down to prevent any socket activity */
6198 + lowcomms_stop_accept();
6200 + /* Delete the devices that belong to the userland
6201 + lockspaces to be deleted. */
6202 + dlm_device_free_devices();
6204 + /* Now try to clean the lockspaces */
6205 + spin_lock(&lslist_lock);
6207 + list_for_each_entry_safe(ls, tmp, &lslist, ls_list) {
6208 + spin_unlock(&lslist_lock);
6209 + release_lockspace(ls, 3);
6210 + spin_lock(&lslist_lock);
6213 + spin_unlock(&lslist_lock);
6216 +struct dlm_recover *allocate_dlm_recover(void)
6218 + struct dlm_recover *rv;
6220 + rv = kmalloc(sizeof(struct dlm_recover), GFP_KERNEL);
6222 + memset(rv, 0, sizeof(struct dlm_recover));
6227 + * Called by CMAN on a specific ls. "stop" means set flag which while set
6228 + * causes all new requests to ls to be queued and not submitted until flag is
6229 + * cleared. stop on a ls also needs to cancel any prior starts on the ls.
6230 + * The recoverd thread carries out any work called for by this event.
6233 +static int dlm_ls_stop(void *servicedata)
6235 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6238 + spin_lock(&ls->ls_recover_lock);
6239 + ls->ls_last_stop = ls->ls_last_start;
6240 + set_bit(LSFL_LS_STOP, &ls->ls_flags);
6241 + new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags);
6242 + spin_unlock(&ls->ls_recover_lock);
6245 + * This in_recovery lock does two things:
6247 + * 1) Keeps this function from returning until all threads are out
6248 + * of locking routines and locking is truly stopped.
6249 + * 2) Keeps any new requests from being processed until it's unlocked
6250 + * when recovery is complete.
6254 + down_write(&ls->ls_in_recovery);
6256 + clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
6257 + clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
6258 + clear_bit(LSFL_NODES_VALID, &ls->ls_flags);
6259 + clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
6261 + dlm_recoverd_kick(ls);
6267 + * Called by CMAN on a specific ls. "start" means enable the lockspace to do
6268 + * request processing which first requires that the recovery procedure be
6269 + * stepped through with all nodes sharing the lockspace (nodeids). The first
6270 + * start on the ls after it's created is a special case and requires some extra
6271 + * work like figuring out our own local nodeid. We can't do all this in the
6272 + * calling CMAN context, so we must pass this work off to the recoverd thread
6273 + * which was created in dlm_init(). The recoverd thread carries out any work
6274 + * called for by this event.
6277 +static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count,
6278 + int event_id, int type)
6280 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6281 + struct dlm_recover *rv;
6282 + int error = -ENOMEM;
6284 + rv = allocate_dlm_recover();
6288 + rv->nodeids = nodeids;
6289 + rv->node_count = count;
6290 + rv->event_id = event_id;
6292 + spin_lock(&ls->ls_recover_lock);
6293 + ls->ls_last_start = event_id;
6294 + list_add_tail(&rv->list, &ls->ls_recover);
6295 + set_bit(LSFL_LS_START, &ls->ls_flags);
6296 + spin_unlock(&ls->ls_recover_lock);
6298 + dlm_recoverd_kick(ls);
6306 + * Called by CMAN on a specific ls. "finish" means that all nodes which
6307 + * received a "start" have completed the start and called kcl_start_done.
6308 + * The recoverd thread carries out any work called for by this event.
6311 +static void dlm_ls_finish(void *servicedata, int event_id)
6313 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6315 + spin_lock(&ls->ls_recover_lock);
6316 + ls->ls_last_finish = event_id;
6317 + set_bit(LSFL_LS_FINISH, &ls->ls_flags);
6318 + spin_unlock(&ls->ls_recover_lock);
6320 + dlm_recoverd_kick(ls);
6323 +struct kcl_service_ops ls_ops = {
6324 + .stop = dlm_ls_stop,
6325 + .start = dlm_ls_start,
6326 + .finish = dlm_ls_finish
6328 diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h
6329 --- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730
6330 +++ linux-patched/cluster/dlm/lockspace.h 2004-07-13 18:57:22.000000000 +0800
6332 +/******************************************************************************
6333 +*******************************************************************************
6335 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6336 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6338 +** This copyrighted material is made available to anyone wishing to use,
6339 +** modify, copy, or redistribute it subject to the terms and conditions
6340 +** of the GNU General Public License v.2.
6342 +*******************************************************************************
6343 +******************************************************************************/
6345 +#ifndef __LOCKSPACE_DOT_H__
6346 +#define __LOCKSPACE_DOT_H__
6348 +void dlm_lockspace_init(void);
6349 +int dlm_init(void);
6350 +int dlm_release(void);
6351 +int dlm_new_lockspace(char *name, int namelen, void **ls, int flags);
6352 +int dlm_release_lockspace(void *ls, int force);
6353 +struct dlm_ls *find_lockspace_by_global_id(uint32_t id);
6354 +struct dlm_ls *find_lockspace_by_local_id(void *id);
6355 +struct dlm_ls *find_lockspace_by_name(char *name, int namelen);
6356 +void dlm_emergency_shutdown(void);
6358 +#endif /* __LOCKSPACE_DOT_H__ */
6359 diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c
6360 --- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730
6361 +++ linux-patched/cluster/dlm/lowcomms.c 2004-07-13 18:57:22.000000000 +0800
6363 +/******************************************************************************
6364 +*******************************************************************************
6366 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6367 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6369 +** This copyrighted material is made available to anyone wishing to use,
6370 +** modify, copy, or redistribute it subject to the terms and conditions
6371 +** of the GNU General Public License v.2.
6373 +*******************************************************************************
6374 +******************************************************************************/
6379 + * This is the "low-level" comms layer.
6381 + * It is responsible for sending/receiving messages
6382 + * from other nodes in the cluster.
6384 + * Cluster nodes are referred to by their nodeids. nodeids are
6385 + * simply 32 bit numbers to the locking module - if they need to
6386 + * be expanded for the cluster infrastructure then that is its
6387 + * responsibility. It is this layer's
6388 + * responsibility to resolve these into IP address or
6389 + * whatever it needs for inter-node communication.
6391 + * The comms level is two kernel threads that deal mainly with
6392 + * the receiving of messages from other nodes and passing them
6393 + * up to the mid-level comms layer (which understands the
6394 + * message format) for execution by the locking core, and
6395 + * a send thread which does all the setting up of connections
6396 + * to remote nodes and the sending of data. Threads are not allowed
6397 + * to send their own data because it may cause them to wait in times
6398 + * of high load. Also, this way, the sending thread can collect together
6399 + * messages bound for one node and send them in one block.
6401 + * I don't see any problem with the recv thread executing the locking
6402 + * code on behalf of remote processes as the locking code is
6403 + * short, efficient and never waits.
6408 +#include <asm/ioctls.h>
6409 +#include <net/sock.h>
6410 +#include <net/tcp.h>
6411 +#include <linux/pagemap.h>
6412 +#include <cluster/cnxman.h>
6414 +#include "dlm_internal.h"
6415 +#include "lowcomms.h"
6416 +#include "midcomms.h"
6417 +#include "config.h"
6425 +#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
6426 +#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
6427 +#define CBUF_EMPTY(cb) ((cb)->len == 0)
6428 +#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
6429 +#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \
6430 + (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
6431 +#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
6433 +struct connection {
6434 + struct socket *sock; /* NULL if not connected */
6435 + uint32_t nodeid; /* So we know who we are in the list */
6436 + struct rw_semaphore sock_sem; /* Stop connect races */
6437 + struct list_head read_list; /* On this list when ready for reading */
6438 + struct list_head write_list; /* On this list when ready for writing */
6439 + struct list_head state_list; /* On this list when ready to connect */
6440 + unsigned long flags; /* bit 1,2 = We are on the read/write lists */
6441 +#define CF_READ_PENDING 1
6442 +#define CF_WRITE_PENDING 2
6443 +#define CF_CONNECT_PENDING 3
6444 +#define CF_IS_OTHERSOCK 4
6445 + struct list_head writequeue; /* List of outgoing writequeue_entries */
6446 + struct list_head listenlist; /* List of allocated listening sockets */
6447 + spinlock_t writequeue_lock;
6448 + int (*rx_action) (struct connection *); /* What to do when active */
6449 + struct page *rx_page;
6452 +#define MAX_CONNECT_RETRIES 3
6453 + struct connection *othersock;
6455 +#define sock2con(x) ((struct connection *)(x)->sk_user_data)
6456 +#define nodeid2con(x) (&connections[(x)])
6458 +/* An entry waiting to be sent */
6459 +struct writequeue_entry {
6460 + struct list_head list;
6461 + struct page *page;
6466 + struct connection *con;
6469 +/* "Template" structure for IPv4 and IPv6 used to fill
6470 + * in the missing bits when converting between cman (which knows
6471 + * nothing about sockaddr structs) and real life where we actually
6472 + * have to connect to these addresses. Also one of these structs
6473 + * will hold the cached "us" address.
6475 + * It's an in6 sockaddr just so there's enough space for anything
6476 + * we're likely to see here.
6478 +static struct sockaddr_in6 local_addr;
6480 +/* Manage daemons */
6481 +static struct semaphore thread_lock;
6482 +static struct completion thread_completion;
6483 +static atomic_t send_run;
6484 +static atomic_t recv_run;
6486 +/* An array of connections, indexed by NODEID */
6487 +static struct connection *connections;
6488 +static int conn_array_size;
6489 +static atomic_t writequeue_length;
6490 +static atomic_t accepting;
6492 +static wait_queue_t lowcomms_send_waitq_head;
6493 +static wait_queue_head_t lowcomms_send_waitq;
6495 +static wait_queue_t lowcomms_recv_waitq_head;
6496 +static wait_queue_head_t lowcomms_recv_waitq;
6498 +/* List of sockets that have reads pending */
6499 +static struct list_head read_sockets;
6500 +static spinlock_t read_sockets_lock;
6502 +/* List of sockets which have writes pending */
6503 +static struct list_head write_sockets;
6504 +static spinlock_t write_sockets_lock;
6506 +/* List of sockets which have connects pending */
6507 +static struct list_head state_sockets;
6508 +static spinlock_t state_sockets_lock;
6510 +/* List of allocated listen sockets */
6511 +static struct list_head listen_sockets;
6513 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr);
6514 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len);
6517 +/* Data available on socket or listen socket received a connect */
6518 +static void lowcomms_data_ready(struct sock *sk, int count_unused)
6520 + struct connection *con = sock2con(sk);
6522 + if (test_and_set_bit(CF_READ_PENDING, &con->flags))
6525 + spin_lock_bh(&read_sockets_lock);
6526 + list_add_tail(&con->read_list, &read_sockets);
6527 + spin_unlock_bh(&read_sockets_lock);
6529 + wake_up_interruptible(&lowcomms_recv_waitq);
6532 +static void lowcomms_write_space(struct sock *sk)
6534 + struct connection *con = sock2con(sk);
6536 + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
6539 + spin_lock_bh(&write_sockets_lock);
6540 + list_add_tail(&con->write_list, &write_sockets);
6541 + spin_unlock_bh(&write_sockets_lock);
6543 + wake_up_interruptible(&lowcomms_send_waitq);
6546 +static inline void lowcomms_connect_sock(struct connection *con)
6548 + if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
6550 + if (!atomic_read(&accepting))
6553 + spin_lock_bh(&state_sockets_lock);
6554 + list_add_tail(&con->state_list, &state_sockets);
6555 + spin_unlock_bh(&state_sockets_lock);
6557 + wake_up_interruptible(&lowcomms_send_waitq);
6560 +static void lowcomms_state_change(struct sock *sk)
6562 +/* struct connection *con = sock2con(sk); */
6564 + switch (sk->sk_state) {
6565 + case TCP_ESTABLISHED:
6566 + lowcomms_write_space(sk);
6569 + case TCP_FIN_WAIT1:
6570 + case TCP_FIN_WAIT2:
6571 + case TCP_TIME_WAIT:
6573 + case TCP_CLOSE_WAIT:
6574 + case TCP_LAST_ACK:
6576 + /* FIXME: I think this causes more trouble than it solves.
6577 + lowcomms will reconnect anyway when there is something to
6578 + send. This just attempts reconnection if a node goes down!
6580 + /* lowcomms_connect_sock(con); */
6584 + printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
6589 +/* Make a socket active */
6590 +static int add_sock(struct socket *sock, struct connection *con)
6594 + /* Install a data_ready callback */
6595 + con->sock->sk->sk_data_ready = lowcomms_data_ready;
6596 + con->sock->sk->sk_write_space = lowcomms_write_space;
6597 + con->sock->sk->sk_state_change = lowcomms_state_change;
6602 +/* Add the port number to an IP6 or 4 sockaddr and return the address
6604 +static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port,
6607 + saddr->sin6_family = local_addr.sin6_family;
6608 + if (local_addr.sin6_family == AF_INET) {
6609 + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
6610 + in4_addr->sin_port = cpu_to_be16(port);
6611 + *addr_len = sizeof(struct sockaddr_in);
6614 + saddr->sin6_port = cpu_to_be16(port);
6615 + *addr_len = sizeof(struct sockaddr_in6);
6619 +/* Close a remote connection and tidy up */
6620 +static void close_connection(struct connection *con)
6622 + if (test_bit(CF_IS_OTHERSOCK, &con->flags))
6625 + down_write(&con->sock_sem);
6628 + sock_release(con->sock);
6630 + if (con->othersock) {
6631 + down_write(&con->othersock->sock_sem);
6632 + sock_release(con->othersock->sock);
6633 + con->othersock->sock = NULL;
6634 + up_write(&con->othersock->sock_sem);
6635 + kfree(con->othersock);
6636 + con->othersock = NULL;
6639 + if (con->rx_page) {
6640 + __free_page(con->rx_page);
6641 + con->rx_page = NULL;
6643 + up_write(&con->sock_sem);
6646 +/* Data received from remote end */
6647 +static int receive_from_sock(struct connection *con)
6650 + struct msghdr msg;
6651 + struct iovec iov[2];
6655 + int call_again_soon = 0;
6657 + down_read(&con->sock_sem);
6659 + if (con->sock == NULL)
6661 + if (con->rx_page == NULL) {
6663 + * This doesn't need to be atomic, but I think it should
6664 + * improve performance if it is.
6666 + con->rx_page = alloc_page(GFP_ATOMIC);
6667 + if (con->rx_page == NULL)
6669 + CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
6672 + * To avoid doing too many short reads, we will reschedule for another
6673 + * time if there are fewer than 32 bytes left in the buffer.
6675 + if (!CBUF_MAY_ADD(&con->cb, 32))
6678 + msg.msg_control = NULL;
6679 + msg.msg_controllen = 0;
6680 + msg.msg_iovlen = 1;
6681 + msg.msg_iov = iov;
6682 + msg.msg_name = NULL;
6683 + msg.msg_namelen = 0;
6684 + msg.msg_flags = 0;
6687 + * iov[0] is the bit of the circular buffer between the current end
6688 + * point (cb.base + cb.len) and the end of the buffer.
6690 + iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
6691 + iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
6692 + iov[1].iov_len = 0;
6695 + * iov[1] is the bit of the circular buffer between the start of the
6696 + * buffer and the start of the currently used section (cb.base)
6698 + if (CBUF_DATA(&con->cb) >= con->cb.base) {
6699 + iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
6700 + iov[1].iov_len = con->cb.base;
6701 + iov[1].iov_base = page_address(con->rx_page);
6702 + msg.msg_iovlen = 2;
6704 + len = iov[0].iov_len + iov[1].iov_len;
6708 + r = ret = sock_recvmsg(con->sock, &msg, len,
6709 + MSG_DONTWAIT | MSG_NOSIGNAL);
6715 + call_again_soon = 1;
6716 + CBUF_ADD(&con->cb, ret);
6717 + ret = midcomms_process_incoming_buffer(con->nodeid,
6718 + page_address(con->rx_page),
6719 + con->cb.base, con->cb.len,
6721 + if (ret == -EBADMSG) {
6722 + printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
6723 + "iov_len=%u, iov_base[0]=%p, read=%d\n",
6724 + page_address(con->rx_page), con->cb.base, con->cb.len,
6725 + len, iov[0].iov_base, r);
6729 + CBUF_EAT(&con->cb, ret);
6731 + if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
6732 + __free_page(con->rx_page);
6733 + con->rx_page = NULL;
6736 + if (call_again_soon)
6738 + up_read(&con->sock_sem);
6743 + lowcomms_data_ready(con->sock->sk, 0);
6744 + up_read(&con->sock_sem);
6749 + up_read(&con->sock_sem);
6750 + if (ret != -EAGAIN && !test_bit(CF_IS_OTHERSOCK, &con->flags)) {
6751 + close_connection(con);
6752 + lowcomms_connect_sock(con);
6759 +/* Listening socket is busy, accept a connection */
6760 +static int accept_from_sock(struct connection *con)
6763 + struct sockaddr_in6 peeraddr;
6764 + struct socket *newsock;
6767 + struct connection *newcon;
6769 + memset(&peeraddr, 0, sizeof(peeraddr));
6770 + newsock = sock_alloc();
6774 + down_read(&con->sock_sem);
6776 + result = -ENOTCONN;
6777 + if (con->sock == NULL)
6780 + newsock->type = con->sock->type;
6781 + newsock->ops = con->sock->ops;
6783 + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
6787 + /* Get the connected socket's peer */
6788 + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
6790 + result = -ECONNABORTED;
6794 + /* Get the new node's NODEID */
6795 + nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len);
6796 + if (nodeid == 0) {
6797 + printk("dlm: connect from non cluster node\n");
6798 + sock_release(newsock);
6799 + up_read(&con->sock_sem);
6803 + log_print("got connection from %d", nodeid);
6805 + /* Check to see if we already have a connection to this node. This
6806 + * could happen if the two nodes initiate a connection at roughly
6807 + * the same time and the connections cross on the wire.
6809 + * In this case we store the incoming one in "othersock"
6811 + newcon = nodeid2con(nodeid);
6812 + down_write(&newcon->sock_sem);
6813 + if (newcon->sock) {
6814 + struct connection *othercon;
6816 + othercon = kmalloc(sizeof(struct connection), GFP_KERNEL);
6818 + printk("dlm: failed to allocate incoming socket\n");
6819 + sock_release(newsock);
6820 + up_write(&newcon->sock_sem);
6821 + up_read(&con->sock_sem);
6824 + memset(othercon, 0, sizeof(*othercon));
6825 + newcon->othersock = othercon;
6826 + othercon->nodeid = nodeid;
6827 + othercon->sock = newsock;
6828 + othercon->rx_action = receive_from_sock;
6829 + add_sock(newsock, othercon);
6830 + init_rwsem(&othercon->sock_sem);
6831 + set_bit(CF_IS_OTHERSOCK, &othercon->flags);
6832 + newsock->sk->sk_user_data = othercon;
6834 + up_write(&newcon->sock_sem);
6835 + lowcomms_data_ready(newsock->sk, 0);
6836 + up_read(&con->sock_sem);
6840 + newsock->sk->sk_user_data = newcon;
6841 + newcon->rx_action = receive_from_sock;
6842 + add_sock(newsock, newcon);
6843 + up_write(&newcon->sock_sem);
6846 + * Add it to the active queue in case we got data
6847 + * between processing the accept and adding the socket
6848 + * to the read_sockets list
6850 + lowcomms_data_ready(newsock->sk, 0);
6852 + up_read(&con->sock_sem);
6858 + up_read(&con->sock_sem);
6859 + sock_release(newsock);
6861 + printk("dlm: error accepting connection from node: %d\n", result);
6865 +/* Connect a new socket to its peer */
6866 +static int connect_to_sock(struct connection *con)
6868 + int result = -EHOSTUNREACH;
6869 + struct sockaddr_in6 saddr;
6871 + struct socket *sock;
6873 + if (con->nodeid == 0) {
6874 + log_print("attempt to connect sock 0 foiled");
6878 + down_write(&con->sock_sem);
6879 + if (con->retries++ > MAX_CONNECT_RETRIES)
6882 + // FIXME not sure this should happen, let alone like this.
6884 + sock_release(con->sock);
6888 + /* Create a socket to communicate with */
6889 + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6893 + if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0)
6896 + sock->sk->sk_user_data = con;
6897 + con->rx_action = receive_from_sock;
6899 + make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
6901 + add_sock(sock, con);
6903 + sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len,
6905 + if (result == -EINPROGRESS)
6911 + up_write(&con->sock_sem);
6913 + * Returning an error here means we've given up trying to connect to
6914 + * a remote node, otherwise we return 0 and reschedule the connection
6921 + sock_release(con->sock);
6925 + * Some errors are fatal and this list might need adjusting. For other
6926 + * errors we try again until the max number of retries is reached.
6928 + if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
6929 + result != -ENETDOWN && result != -EINVAL
6930 + && result != -EPROTONOSUPPORT) {
6931 + lowcomms_connect_sock(con);
6937 +static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len)
6939 + struct socket *sock = NULL;
6943 + struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr;
6945 + /* Create a socket to communicate with */
6946 + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
6948 + printk("dlm: Can't create listening comms socket\n");
6954 + result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
6957 + printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
6959 + sock->sk->sk_user_data = con;
6960 + con->rx_action = accept_from_sock;
6963 + /* Bind to our port */
6964 + make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
6965 + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
6967 + printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
6968 + sock_release(sock);
6976 + result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
6979 + printk("dlm: Set keepalive failed: %d\n", result);
6982 + result = sock->ops->listen(sock, 5);
6984 + printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
6985 + sock_release(sock);
6995 +/* Listen on all interfaces */
6996 +static int listen_for_all(void)
7000 + struct socket *sock = NULL;
7001 + struct list_head *addr_list;
7002 + struct connection *con = nodeid2con(0);
7003 + struct cluster_node_addr *node_addr;
7004 + char local_addr[sizeof(struct sockaddr_in6)];
7006 + /* This will also fill in local_addr */
7007 + nodeid = lowcomms_our_nodeid();
7009 + addr_list = kcl_get_node_addresses(nodeid);
7011 + printk("dlm: cannot initialise comms layer\n");
7012 + result = -ENOTCONN;
7016 + list_for_each_entry(node_addr, addr_list, list) {
7019 + con = kmalloc(sizeof(struct connection), GFP_KERNEL);
7021 + printk("dlm: failed to allocate listen socket\n");
7024 + memset(con, 0, sizeof(*con));
7025 + init_rwsem(&con->sock_sem);
7026 + spin_lock_init(&con->writequeue_lock);
7027 + INIT_LIST_HEAD(&con->writequeue);
7028 + set_bit(CF_IS_OTHERSOCK, &con->flags);
7031 + memcpy(local_addr, node_addr->addr, node_addr->addr_len);
7032 + sock = create_listen_sock(con, local_addr,
7033 + node_addr->addr_len);
7035 + add_sock(sock, con);
7041 + /* Keep a list of dynamically allocated listening sockets
7042 + so we can free them at shutdown */
7043 + if (test_bit(CF_IS_OTHERSOCK, &con->flags)) {
7044 + list_add_tail(&con->listenlist, &listen_sockets);
7055 +static struct writequeue_entry *new_writequeue_entry(struct connection *con,
7058 + struct writequeue_entry *entry;
7060 + entry = kmalloc(sizeof(struct writequeue_entry), allocation);
7064 + entry->page = alloc_page(allocation);
7065 + if (!entry->page) {
7070 + entry->offset = 0;
7079 +struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
7080 + int allocation, char **ppc)
7082 + struct connection *con = nodeid2con(nodeid);
7083 + struct writequeue_entry *e;
7087 + if (!atomic_read(&accepting))
7090 + spin_lock(&con->writequeue_lock);
7091 + e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
7092 + if (((struct list_head *) e == &con->writequeue) ||
7093 + (PAGE_CACHE_SIZE - e->end < len)) {
7098 + users = e->users++;
7100 + spin_unlock(&con->writequeue_lock);
7106 + *ppc = page_address(e->page) + offset;
7110 + e = new_writequeue_entry(con, allocation);
7112 + spin_lock(&con->writequeue_lock);
7115 + users = e->users++;
7116 + list_add_tail(&e->list, &con->writequeue);
7117 + spin_unlock(&con->writequeue_lock);
7118 + atomic_inc(&writequeue_length);
7124 +void lowcomms_commit_buffer(struct writequeue_entry *e)
7126 + struct connection *con = e->con;
7129 + if (!atomic_read(&accepting))
7132 + spin_lock(&con->writequeue_lock);
7133 + users = --e->users;
7136 + e->len = e->end - e->offset;
7138 + spin_unlock(&con->writequeue_lock);
7140 + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
7141 + spin_lock_bh(&write_sockets_lock);
7142 + list_add_tail(&con->write_list, &write_sockets);
7143 + spin_unlock_bh(&write_sockets_lock);
7145 + wake_up_interruptible(&lowcomms_send_waitq);
7150 + spin_unlock(&con->writequeue_lock);
7154 +static void free_entry(struct writequeue_entry *e)
7156 + __free_page(e->page);
7158 + atomic_dec(&writequeue_length);
7161 +/* Send a message */
7162 +static int send_to_sock(struct connection *con)
7165 + ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
7166 + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
7167 + struct writequeue_entry *e;
7170 + down_read(&con->sock_sem);
7171 + if (con->sock == NULL)
7174 + sendpage = con->sock->ops->sendpage;
7176 + spin_lock(&con->writequeue_lock);
7178 + e = list_entry(con->writequeue.next, struct writequeue_entry,
7180 + if ((struct list_head *) e == &con->writequeue)
7184 + offset = e->offset;
7185 + BUG_ON(len == 0 && e->users == 0);
7186 + spin_unlock(&con->writequeue_lock);
7190 + ret = sendpage(con->sock, e->page, offset, len,
7192 + if (ret == -EAGAIN || ret == 0)
7198 + spin_lock(&con->writequeue_lock);
7202 + if (e->len == 0 && e->users == 0) {
7203 + list_del(&e->list);
7208 + spin_unlock(&con->writequeue_lock);
7210 + up_read(&con->sock_sem);
7214 + up_read(&con->sock_sem);
7215 + close_connection(con);
7216 + lowcomms_connect_sock(con);
7220 + up_read(&con->sock_sem);
7221 + lowcomms_connect_sock(con);
7225 +/* Called from recoverd when it knows that a node has
7226 + left the cluster */
7227 +int lowcomms_close(int nodeid)
7229 + struct connection *con;
7234 + con = nodeid2con(nodeid);
7236 + close_connection(con);
7244 +/* API send message call, may queue the request */
7245 +/* N.B. This is the old interface - use the new one for new calls */
7246 +int lowcomms_send_message(int nodeid, char *buf, int len, int allocation)
7248 + struct writequeue_entry *e;
7251 + DLM_ASSERT(nodeid < dlm_config.max_connections,
7252 + printk("nodeid=%u\n", nodeid););
7254 + e = lowcomms_get_buffer(nodeid, len, allocation, &b);
7256 + memcpy(b, buf, len);
7257 + lowcomms_commit_buffer(e);
7263 +/* Look for activity on active sockets */
7264 +static void process_sockets(void)
7266 + struct list_head *list;
7267 + struct list_head *temp;
7269 + spin_lock_bh(&read_sockets_lock);
7270 + list_for_each_safe(list, temp, &read_sockets) {
7271 + struct connection *con =
7272 + list_entry(list, struct connection, read_list);
7273 + list_del(&con->read_list);
7274 + clear_bit(CF_READ_PENDING, &con->flags);
7276 + spin_unlock_bh(&read_sockets_lock);
7278 + con->rx_action(con);
7280 + /* Don't starve out everyone else */
7282 + spin_lock_bh(&read_sockets_lock);
7284 + spin_unlock_bh(&read_sockets_lock);
7287 +/* Try to send any messages that are pending
7289 +static void process_output_queue(void)
7291 + struct list_head *list;
7292 + struct list_head *temp;
7295 + spin_lock_bh(&write_sockets_lock);
7296 + list_for_each_safe(list, temp, &write_sockets) {
7297 + struct connection *con =
7298 + list_entry(list, struct connection, write_list);
7299 + list_del(&con->write_list);
7300 + clear_bit(CF_WRITE_PENDING, &con->flags);
7302 + spin_unlock_bh(&write_sockets_lock);
7304 + ret = send_to_sock(con);
7307 + spin_lock_bh(&write_sockets_lock);
7309 + spin_unlock_bh(&write_sockets_lock);
7312 +static void process_state_queue(void)
7314 + struct list_head *list;
7315 + struct list_head *temp;
7318 + spin_lock_bh(&state_sockets_lock);
7319 + list_for_each_safe(list, temp, &state_sockets) {
7320 + struct connection *con =
7321 + list_entry(list, struct connection, state_list);
7322 + list_del(&con->state_list);
7323 + clear_bit(CF_CONNECT_PENDING, &con->flags);
7324 + spin_unlock_bh(&state_sockets_lock);
7326 + ret = connect_to_sock(con);
7329 + spin_lock_bh(&state_sockets_lock);
7331 + spin_unlock_bh(&state_sockets_lock);
7334 +/* Discard all entries on the write queues */
7335 +static void clean_writequeues(void)
7337 + struct list_head *list;
7338 + struct list_head *temp;
7341 + for (nodeid = 1; nodeid < dlm_config.max_connections; nodeid++) {
7342 + struct connection *con = nodeid2con(nodeid);
7344 + spin_lock(&con->writequeue_lock);
7345 + list_for_each_safe(list, temp, &con->writequeue) {
7346 + struct writequeue_entry *e =
7347 + list_entry(list, struct writequeue_entry, list);
7348 + list_del(&e->list);
7351 + spin_unlock(&con->writequeue_lock);
7355 +static int read_list_empty(void)
7359 + spin_lock_bh(&read_sockets_lock);
7360 + status = list_empty(&read_sockets);
7361 + spin_unlock_bh(&read_sockets_lock);
7366 +/* DLM Transport comms receive daemon */
7367 +static int dlm_recvd(void *data)
7369 + daemonize("dlm_recvd");
7370 + atomic_set(&recv_run, 1);
7372 + init_waitqueue_head(&lowcomms_recv_waitq);
7373 + init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
7374 + add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
7376 + complete(&thread_completion);
7378 + while (atomic_read(&recv_run)) {
7380 + set_task_state(current, TASK_INTERRUPTIBLE);
7382 + if (read_list_empty())
7385 + set_task_state(current, TASK_RUNNING);
7387 + process_sockets();
7390 + down(&thread_lock);
7393 + complete(&thread_completion);
7398 +static int write_and_state_lists_empty(void)
7402 + spin_lock_bh(&write_sockets_lock);
7403 + status = list_empty(&write_sockets);
7404 + spin_unlock_bh(&write_sockets_lock);
7406 + spin_lock_bh(&state_sockets_lock);
7407 + if (list_empty(&state_sockets) == 0)
7409 + spin_unlock_bh(&state_sockets_lock);
7414 +/* DLM Transport send daemon */
7415 +static int dlm_sendd(void *data)
7417 + daemonize("dlm_sendd");
7418 + atomic_set(&send_run, 1);
7420 + init_waitqueue_head(&lowcomms_send_waitq);
7421 + init_waitqueue_entry(&lowcomms_send_waitq_head, current);
7422 + add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
7424 + complete(&thread_completion);
7426 + while (atomic_read(&send_run)) {
7428 + set_task_state(current, TASK_INTERRUPTIBLE);
7430 + if (write_and_state_lists_empty())
7433 + set_task_state(current, TASK_RUNNING);
7435 + process_state_queue();
7436 + process_output_queue();
7439 + down(&thread_lock);
7442 + complete(&thread_completion);
7447 +static void daemons_stop(void)
7449 + if (atomic_read(&recv_run)) {
7450 + down(&thread_lock);
7451 + atomic_set(&recv_run, 0);
7452 + wake_up_interruptible(&lowcomms_recv_waitq);
7454 + wait_for_completion(&thread_completion);
7457 + if (atomic_read(&send_run)) {
7458 + down(&thread_lock);
7459 + atomic_set(&send_run, 0);
7460 + wake_up_interruptible(&lowcomms_send_waitq);
7462 + wait_for_completion(&thread_completion);
7466 +static int daemons_start(void)
7470 + error = kernel_thread(dlm_recvd, NULL, 0);
7472 + log_print("can't start recvd thread: %d", error);
7475 + wait_for_completion(&thread_completion);
7477 + error = kernel_thread(dlm_sendd, NULL, 0);
7479 + log_print("can't start sendd thread: %d", error);
7483 + wait_for_completion(&thread_completion);
7491 + * Return the largest buffer size we can cope with.
7493 +int lowcomms_max_buffer_size(void)
7495 + return PAGE_CACHE_SIZE;
7498 +void lowcomms_stop(void)
7501 + struct connection *temp;
7502 + struct connection *lcon;
7504 + atomic_set(&accepting, 0);
7506 + /* Set all the activity flags to prevent any
7509 + for (i = 0; i < conn_array_size; i++) {
7510 + connections[i].flags = 0x7;
7513 + clean_writequeues();
7515 + for (i = 0; i < conn_array_size; i++) {
7516 + close_connection(nodeid2con(i));
7519 + kfree(connections);
7520 + connections = NULL;
7522 + /* Free up any dynamically allocated listening sockets */
7523 + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
7524 + sock_release(lcon->sock);
7528 + kcl_releaseref_cluster();
7531 +/* This is quite likely to sleep... */
7532 +int lowcomms_start(void)
7537 + INIT_LIST_HEAD(&read_sockets);
7538 + INIT_LIST_HEAD(&write_sockets);
7539 + INIT_LIST_HEAD(&state_sockets);
7540 + INIT_LIST_HEAD(&listen_sockets);
7542 + spin_lock_init(&read_sockets_lock);
7543 + spin_lock_init(&write_sockets_lock);
7544 + spin_lock_init(&state_sockets_lock);
7546 + init_completion(&thread_completion);
7547 + init_MUTEX(&thread_lock);
7548 + atomic_set(&send_run, 0);
7549 + atomic_set(&recv_run, 0);
7551 + error = -ENOTCONN;
7552 + if (kcl_addref_cluster())
7556 + * Temporarily initialise the waitq head so that lowcomms_send_message
7557 + * doesn't crash if it gets called before the thread is fully
7560 + init_waitqueue_head(&lowcomms_send_waitq);
7564 + connections = kmalloc(sizeof(struct connection) *
7565 + dlm_config.max_connections, GFP_KERNEL);
7569 + memset(connections, 0,
7570 + sizeof(struct connection) * dlm_config.max_connections);
7571 + for (i = 0; i < dlm_config.max_connections; i++) {
7572 + connections[i].nodeid = i;
7573 + init_rwsem(&connections[i].sock_sem);
7574 + INIT_LIST_HEAD(&connections[i].writequeue);
7575 + spin_lock_init(&connections[i].writequeue_lock);
7577 + conn_array_size = dlm_config.max_connections;
7579 + /* Start listening */
7580 + error = listen_for_all();
7582 + goto fail_free_conn;
7584 + error = daemons_start();
7586 + goto fail_free_conn;
7588 + atomic_set(&accepting, 1);
7593 + kfree(connections);
7599 +/* Don't accept any more outgoing work */
7600 +void lowcomms_stop_accept()
7602 + atomic_set(&accepting, 0);
7605 +/* Cluster Manager interface functions for looking up
7606 + nodeids and IP addresses by each other
7609 +/* Return the IP address of a node given its NODEID */
7610 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr)
7612 + struct list_head *addrs;
7613 + struct cluster_node_addr *node_addr;
7614 + struct cluster_node_addr *current_addr = NULL;
7615 + struct sockaddr_in6 *saddr;
7619 + addrs = kcl_get_node_addresses(nodeid);
7623 + interface = kcl_get_current_interface();
7625 + /* Look for address number <interface> */
7626 + i=0; /* i/f numbers start at 1 */
7627 + list_for_each_entry(node_addr, addrs, list) {
7628 + if (interface == ++i) {
7629 + current_addr = node_addr;
7634 + /* If that failed then just use the first one */
7635 + if (!current_addr)
7636 + current_addr = (struct cluster_node_addr *)addrs->next;
7638 + saddr = (struct sockaddr_in6 *)current_addr->addr;
7640 + /* Extract the IP address */
7641 + if (saddr->sin6_family == AF_INET) {
7642 + struct sockaddr_in *in4 = (struct sockaddr_in *)saddr;
7643 + struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr;
7644 + ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
7647 + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr;
7648 + memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr));
7654 +/* Return the NODEID for a node given its sockaddr */
7655 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len)
7657 + struct kcl_cluster_node node;
7658 + struct sockaddr_in6 ipv6_addr;
7659 + struct sockaddr_in ipv4_addr;
7661 + if (addr->sa_family == AF_INET) {
7662 + struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
7663 + memcpy(&ipv4_addr, &local_addr, addr_len);
7664 + memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr));
7666 + addr = (struct sockaddr *)&ipv4_addr;
7669 + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
7670 + memcpy(&ipv6_addr, &local_addr, addr_len);
7671 + memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr));
7673 + addr = (struct sockaddr *)&ipv6_addr;
7676 + if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0)
7677 + return node.node_id;
7682 +int lowcomms_our_nodeid(void)
7684 + struct kcl_cluster_node node;
7685 + struct list_head *addrs;
7686 + struct cluster_node_addr *first_addr;
7687 + static int our_nodeid = 0;
7690 + return our_nodeid;
7692 + if (kcl_get_node_by_nodeid(0, &node) == -1)
7695 + our_nodeid = node.node_id;
7697 + /* Fill in the "template" structure */
7698 + addrs = kcl_get_node_addresses(our_nodeid);
7702 + first_addr = (struct cluster_node_addr *) addrs->next;
7703 + memcpy(&local_addr, &first_addr->addr, first_addr->addr_len);
7705 + return node.node_id;
7708 + * Overrides for Emacs so that we follow Linus's tabbing style.
7709 + * Emacs will notice this stuff at the end of the file and automatically
7710 + * adjust the settings for this buffer only. This must remain at the end
7712 + * ---------------------------------------------------------------------------
7713 + * Local variables:
7714 + * c-file-style: "linux"
7717 diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h
7718 --- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730
7719 +++ linux-patched/cluster/dlm/lowcomms.h 2004-07-13 18:57:22.000000000 +0800
7721 +/******************************************************************************
7722 +*******************************************************************************
7724 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7725 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7727 +** This copyrighted material is made available to anyone wishing to use,
7728 +** modify, copy, or redistribute it subject to the terms and conditions
7729 +** of the GNU General Public License v.2.
7731 +*******************************************************************************
7732 +******************************************************************************/
7734 +#ifndef __LOWCOMMS_DOT_H__
7735 +#define __LOWCOMMS_DOT_H__
7737 +/* The old interface */
7738 +int lowcomms_send_message(int csid, char *buf, int len, int allocation);
7740 +/* The new interface */
7741 +struct writequeue_entry;
7742 +extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
7743 + int allocation, char **ppc);
7744 +extern void lowcomms_commit_buffer(struct writequeue_entry *e);
7746 +int lowcomms_start(void);
7747 +void lowcomms_stop(void);
7748 +void lowcomms_stop_accept(void);
7749 +int lowcomms_close(int nodeid);
7750 +int lowcomms_max_buffer_size(void);
7752 +int lowcomms_our_nodeid(void);
7754 +#endif /* __LOWCOMMS_DOT_H__ */
7755 diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c
7756 --- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730
7757 +++ linux-patched/cluster/dlm/main.c 2004-07-13 18:57:22.000000000 +0800
7759 +/******************************************************************************
7760 +*******************************************************************************
7762 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7763 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7765 +** This copyrighted material is made available to anyone wishing to use,
7766 +** modify, copy, or redistribute it subject to the terms and conditions
7767 +** of the GNU General Public License v.2.
7769 +*******************************************************************************
7770 +******************************************************************************/
7772 +#define EXPORT_SYMTAB
7774 +#include <linux/init.h>
7775 +#include <linux/proc_fs.h>
7776 +#include <linux/ctype.h>
7777 +#include <linux/module.h>
7778 +#include <net/sock.h>
7780 +#include <cluster/cnxman.h>
7782 +#include "dlm_internal.h"
7783 +#include "lockspace.h"
7784 +#include "recoverd.h"
7788 +#include "locking.h"
7789 +#include "config.h"
7790 +#include "memory.h"
7791 +#include "recover.h"
7792 +#include "lowcomms.h"
7794 +int dlm_device_init(void);
7795 +void dlm_device_exit(void);
7796 +void dlm_proc_init(void);
7797 +void dlm_proc_exit(void);
7800 +/* Cluster manager callbacks, we want to know if a node dies
7801 + N.B. this is independent of lockspace-specific event callbacks from SM */
7803 +static void cman_callback(kcl_callback_reason reason, long arg)
7805 + if (reason == DIED) {
7806 + lowcomms_close((int) arg);
7809 + /* This is unconditional, so do what we can to tidy up */
7810 + if (reason == LEAVING) {
7811 + dlm_emergency_shutdown();
7815 +int __init init_dlm(void)
7818 + dlm_lockspace_init();
7819 + dlm_recoverd_init();
7821 + dlm_device_init();
7822 + dlm_memory_init();
7823 + dlm_config_init();
7825 + kcl_add_callback(cman_callback);
7827 + printk("DLM %s (built %s %s) installed\n",
7828 + DLM_RELEASE_NAME, __DATE__, __TIME__);
7833 +void __exit exit_dlm(void)
7835 + kcl_remove_callback(cman_callback);
7837 + dlm_device_exit();
7838 + dlm_memory_exit();
7839 + dlm_config_exit();
7843 +MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME);
7844 +MODULE_AUTHOR("Red Hat, Inc.");
7845 +MODULE_LICENSE("GPL");
7847 +module_init(init_dlm);
7848 +module_exit(exit_dlm);
7850 +EXPORT_SYMBOL(dlm_init);
7851 +EXPORT_SYMBOL(dlm_release);
7852 +EXPORT_SYMBOL(dlm_new_lockspace);
7853 +EXPORT_SYMBOL(dlm_release_lockspace);
7854 +EXPORT_SYMBOL(dlm_lock);
7855 +EXPORT_SYMBOL(dlm_unlock);
7856 +EXPORT_SYMBOL(dlm_debug_dump);
7857 diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c
7858 --- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730
7859 +++ linux-patched/cluster/dlm/memory.c 2004-07-13 18:57:22.000000000 +0800
7861 +/******************************************************************************
7862 +*******************************************************************************
7864 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
7865 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
7867 +** This copyrighted material is made available to anyone wishing to use,
7868 +** modify, copy, or redistribute it subject to the terms and conditions
7869 +** of the GNU General Public License v.2.
7871 +*******************************************************************************
7872 +******************************************************************************/
7876 + * memory allocation routines
7880 +#include "dlm_internal.h"
7881 +#include "memory.h"
7882 +#include "config.h"
7884 +/* as the man says...Shouldn't this be in a header file somewhere? */
7885 +#define BYTES_PER_WORD sizeof(void *)
7887 +static kmem_cache_t *rsb_cache_small;
7888 +static kmem_cache_t *rsb_cache_large;
7889 +static kmem_cache_t *lkb_cache;
7890 +static kmem_cache_t *lvb_cache;
7891 +static kmem_cache_t *resdir_cache_large;
7892 +static kmem_cache_t *resdir_cache_small;
7894 +/* The thresholds above which we allocate large RSBs/resdatas rather than small
7895 + * ones. This must make the resultant structure end on a word boundary */
7896 +#define LARGE_RSB_NAME 28
7897 +#define LARGE_RES_NAME 28
7899 +int dlm_memory_init()
7901 + int ret = -ENOMEM;
7905 + kmem_cache_create("dlm_rsb(small)",
7906 + (sizeof(struct dlm_rsb) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7907 + __alignof__(struct dlm_rsb), 0, NULL, NULL);
7908 + if (!rsb_cache_small)
7912 + kmem_cache_create("dlm_rsb(large)",
7913 + sizeof(struct dlm_rsb) + DLM_RESNAME_MAXLEN,
7914 + __alignof__(struct dlm_rsb), 0, NULL, NULL);
7915 + if (!rsb_cache_large)
7916 + goto out_free_rsbs;
7918 + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
7919 + __alignof__(struct dlm_lkb), 0, NULL, NULL);
7921 + goto out_free_rsbl;
7923 + resdir_cache_large =
7924 + kmem_cache_create("dlm_resdir(l)",
7925 + sizeof(struct dlm_direntry) + DLM_RESNAME_MAXLEN,
7926 + __alignof__(struct dlm_direntry), 0, NULL, NULL);
7927 + if (!resdir_cache_large)
7928 + goto out_free_lkb;
7930 + resdir_cache_small =
7931 + kmem_cache_create("dlm_resdir(s)",
7932 + (sizeof(struct dlm_direntry) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
7933 + __alignof__(struct dlm_direntry), 0, NULL, NULL);
7934 + if (!resdir_cache_small)
7935 + goto out_free_resl;
7937 + /* LVB cache also holds ranges, so should be 64bit aligned */
7938 + lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN,
7939 + __alignof__(uint64_t), 0, NULL, NULL);
7941 + goto out_free_ress;
7947 + kmem_cache_destroy(resdir_cache_small);
7950 + kmem_cache_destroy(resdir_cache_large);
7953 + kmem_cache_destroy(lkb_cache);
7956 + kmem_cache_destroy(rsb_cache_large);
7959 + kmem_cache_destroy(rsb_cache_small);
7965 +void dlm_memory_exit()
7967 + kmem_cache_destroy(rsb_cache_large);
7968 + kmem_cache_destroy(rsb_cache_small);
7969 + kmem_cache_destroy(lkb_cache);
7970 + kmem_cache_destroy(resdir_cache_small);
7971 + kmem_cache_destroy(resdir_cache_large);
7972 + kmem_cache_destroy(lvb_cache);
7975 +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
7977 + struct dlm_rsb *r;
7979 + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
7981 + if (namelen >= LARGE_RSB_NAME)
7982 + r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation);
7984 + r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation);
7987 + memset(r, 0, sizeof(struct dlm_rsb) + namelen);
7992 +void free_rsb(struct dlm_rsb *r)
7994 + int length = r->res_length;
7997 + memset(r, 0x55, sizeof(struct dlm_rsb) + r->res_length);
8000 + if (length >= LARGE_RSB_NAME)
8001 + kmem_cache_free(rsb_cache_large, r);
8003 + kmem_cache_free(rsb_cache_small, r);
8006 +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
8008 + struct dlm_lkb *l;
8010 + l = kmem_cache_alloc(lkb_cache, ls->ls_allocation);
8012 + memset(l, 0, sizeof(struct dlm_lkb));
8017 +void free_lkb(struct dlm_lkb *l)
8020 + memset(l, 0xAA, sizeof(struct dlm_lkb));
8022 + kmem_cache_free(lkb_cache, l);
8025 +struct dlm_direntry *allocate_resdata(struct dlm_ls *ls, int namelen)
8027 + struct dlm_direntry *rd;
8029 + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
8031 + if (namelen >= LARGE_RES_NAME)
8032 + rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation);
8034 + rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation);
8037 + memset(rd, 0, sizeof(struct dlm_direntry));
8042 +void free_resdata(struct dlm_direntry *de)
8044 + if (de->length >= LARGE_RES_NAME)
8045 + kmem_cache_free(resdir_cache_large, de);
8047 + kmem_cache_free(resdir_cache_small, de);
8050 +char *allocate_lvb(struct dlm_ls *ls)
8054 + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
8056 + memset(l, 0, DLM_LVB_LEN);
8061 +void free_lvb(char *l)
8063 + kmem_cache_free(lvb_cache, l);
8066 +/* Ranges are allocated from the LVB cache as they are the same size (4x64
8068 +uint64_t *allocate_range(struct dlm_ls * ls)
8072 + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
8074 + memset(l, 0, DLM_LVB_LEN);
8079 +void free_range(uint64_t *l)
8081 + kmem_cache_free(lvb_cache, l);
8084 +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls)
8086 + struct dlm_rcom *rc;
8088 + rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation);
8090 + memset(rc, 0, dlm_config.buffer_size);
8095 +void free_rcom_buffer(struct dlm_rcom *rc)
8099 diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h
8100 --- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730
8101 +++ linux-patched/cluster/dlm/memory.h 2004-07-13 18:57:22.000000000 +0800
8103 +/******************************************************************************
8104 +*******************************************************************************
8106 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8107 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8109 +** This copyrighted material is made available to anyone wishing to use,
8110 +** modify, copy, or redistribute it subject to the terms and conditions
8111 +** of the GNU General Public License v.2.
8113 +*******************************************************************************
8114 +******************************************************************************/
8116 +#ifndef __MEMORY_DOT_H__
8117 +#define __MEMORY_DOT_H__
8119 +int dlm_memory_init(void);
8120 +void dlm_memory_exit(void);
8121 +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
8122 +void free_rsb(struct dlm_rsb *r);
8123 +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
8124 +void free_lkb(struct dlm_lkb *l);
8125 +struct dlm_direntry *allocate_resdata(struct dlm_ls *ls, int namelen);
8126 +void free_resdata(struct dlm_direntry *de);
8127 +char *allocate_lvb(struct dlm_ls *ls);
8128 +void free_lvb(char *l);
8129 +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls);
8130 +void free_rcom_buffer(struct dlm_rcom *rc);
8131 +uint64_t *allocate_range(struct dlm_ls *ls);
8132 +void free_range(uint64_t *l);
8134 +#endif /* __MEMORY_DOT_H__ */
8135 diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c
8136 --- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730
8137 +++ linux-patched/cluster/dlm/midcomms.c 2004-07-13 18:57:22.000000000 +0800
8139 +/******************************************************************************
8140 +*******************************************************************************
8142 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8143 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8145 +** This copyrighted material is made available to anyone wishing to use,
8146 +** modify, copy, or redistribute it subject to the terms and conditions
8147 +** of the GNU General Public License v.2.
8149 +*******************************************************************************
8150 +******************************************************************************/
8155 + * This is the appallingly named "mid-level" comms layer.
8157 + * Its purpose is to take packets from the "real" comms layer,
8158 + * split them up into messages and pass them to the interested
8159 + * part of the locking mechanism.
8161 + * It also takes messages from the locking layer, formats them
8162 + * into packets and sends them to the comms layer.
8164 + * It knows the format of the mid-level messages used and nodeids
8165 + * but it does not know how to resolve a nodeid into an IP address
8166 + * or any of the comms channel details
8170 +#include "dlm_internal.h"
8171 +#include "lowcomms.h"
8172 +#include "midcomms.h"
8173 +#include "lockqueue.h"
8175 +#include "reccomms.h"
8176 +#include "config.h"
8178 +/* Byteorder routines */
8180 +static void host_to_network(void *msg)
8182 + struct dlm_header *head = msg;
8183 + struct dlm_request *req = msg;
8184 + struct dlm_reply *rep = msg;
8185 + struct dlm_query_request *qreq = msg;
8186 + struct dlm_query_reply *qrep= msg;
8187 + struct dlm_rcom *rc = msg;
8189 + /* Force into network byte order */
8192 + * Do the common header first
8195 + head->rh_length = cpu_to_le16(head->rh_length);
8196 + head->rh_lockspace = cpu_to_le32(head->rh_lockspace);
8197 + /* Leave the lkid alone as it is transparent at the remote end */
8200 + * Do the fields in the remlockrequest or remlockreply structs
8203 + switch (req->rr_header.rh_cmd) {
8205 + case GDLM_REMCMD_LOCKREQUEST:
8206 + case GDLM_REMCMD_CONVREQUEST:
8207 + req->rr_range_start = cpu_to_le64(req->rr_range_start);
8208 + req->rr_range_end = cpu_to_le64(req->rr_range_end);
8209 + /* Deliberate fall through */
8210 + case GDLM_REMCMD_UNLOCKREQUEST:
8211 + case GDLM_REMCMD_LOOKUP:
8212 + case GDLM_REMCMD_LOCKGRANT:
8213 + case GDLM_REMCMD_SENDBAST:
8214 + case GDLM_REMCMD_SENDCAST:
8215 + case GDLM_REMCMD_REM_RESDATA:
8216 + req->rr_flags = cpu_to_le32(req->rr_flags);
8217 + req->rr_status = cpu_to_le32(req->rr_status);
8220 + case GDLM_REMCMD_LOCKREPLY:
8221 + rep->rl_lockstate = cpu_to_le32(rep->rl_lockstate);
8222 + rep->rl_nodeid = cpu_to_le32(rep->rl_nodeid);
8223 + rep->rl_status = cpu_to_le32(rep->rl_status);
8226 + case GDLM_REMCMD_RECOVERMESSAGE:
8227 + case GDLM_REMCMD_RECOVERREPLY:
8228 + rc->rc_msgid = cpu_to_le32(rc->rc_msgid);
8229 + rc->rc_datalen = cpu_to_le16(rc->rc_datalen);
8232 + case GDLM_REMCMD_QUERY:
8233 + qreq->rq_mstlkid = cpu_to_le32(qreq->rq_mstlkid);
8234 + qreq->rq_query = cpu_to_le32(qreq->rq_query);
8235 + qreq->rq_maxlocks = cpu_to_le32(qreq->rq_maxlocks);
8238 + case GDLM_REMCMD_QUERYREPLY:
8239 + qrep->rq_numlocks = cpu_to_le32(qrep->rq_numlocks);
8240 + qrep->rq_status = cpu_to_le32(qrep->rq_status);
8241 + qrep->rq_grantcount = cpu_to_le32(qrep->rq_grantcount);
8242 + qrep->rq_waitcount = cpu_to_le32(qrep->rq_waitcount);
8243 + qrep->rq_convcount = cpu_to_le32(qrep->rq_convcount);
8247 + printk("dlm: warning, unknown REMCMD type %u\n",
8248 + req->rr_header.rh_cmd);
8252 +static void network_to_host(void *msg)
8254 + struct dlm_header *head = msg;
8255 + struct dlm_request *req = msg;
8256 + struct dlm_reply *rep = msg;
8257 + struct dlm_query_request *qreq = msg;
8258 + struct dlm_query_reply *qrep = msg;
8259 + struct dlm_rcom *rc = msg;
8261 + /* Force into host byte order */
8264 + * Do the common header first
8267 + head->rh_length = le16_to_cpu(head->rh_length);
8268 + head->rh_lockspace = le32_to_cpu(head->rh_lockspace);
8269 + /* Leave the lkid alone as it is transparent at the remote end */
8272 + * Do the fields in the remlockrequest or remlockreply structs
8275 + switch (req->rr_header.rh_cmd) {
8277 + case GDLM_REMCMD_LOCKREQUEST:
8278 + case GDLM_REMCMD_CONVREQUEST:
8279 + req->rr_range_start = le64_to_cpu(req->rr_range_start);
8280 + req->rr_range_end = le64_to_cpu(req->rr_range_end);
8281 + case GDLM_REMCMD_LOOKUP:
8282 + case GDLM_REMCMD_UNLOCKREQUEST:
8283 + case GDLM_REMCMD_LOCKGRANT:
8284 + case GDLM_REMCMD_SENDBAST:
8285 + case GDLM_REMCMD_SENDCAST:
8286 + case GDLM_REMCMD_REM_RESDATA:
8287 + /* Actually, not much to do here as the remote lock IDs are
8288 + * transparent too */
8289 + req->rr_flags = le32_to_cpu(req->rr_flags);
8290 + req->rr_status = le32_to_cpu(req->rr_status);
8293 + case GDLM_REMCMD_LOCKREPLY:
8294 + rep->rl_lockstate = le32_to_cpu(rep->rl_lockstate);
8295 + rep->rl_nodeid = le32_to_cpu(rep->rl_nodeid);
8296 + rep->rl_status = le32_to_cpu(rep->rl_status);
8299 + case GDLM_REMCMD_RECOVERMESSAGE:
8300 + case GDLM_REMCMD_RECOVERREPLY:
8301 + rc->rc_msgid = le32_to_cpu(rc->rc_msgid);
8302 + rc->rc_datalen = le16_to_cpu(rc->rc_datalen);
8306 + case GDLM_REMCMD_QUERY:
8307 + qreq->rq_mstlkid = le32_to_cpu(qreq->rq_mstlkid);
8308 + qreq->rq_query = le32_to_cpu(qreq->rq_query);
8309 + qreq->rq_maxlocks = le32_to_cpu(qreq->rq_maxlocks);
8312 + case GDLM_REMCMD_QUERYREPLY:
8313 + qrep->rq_numlocks = le32_to_cpu(qrep->rq_numlocks);
8314 + qrep->rq_status = le32_to_cpu(qrep->rq_status);
8315 + qrep->rq_grantcount = le32_to_cpu(qrep->rq_grantcount);
8316 + qrep->rq_waitcount = le32_to_cpu(qrep->rq_waitcount);
8317 + qrep->rq_convcount = le32_to_cpu(qrep->rq_convcount);
8321 + printk("dlm: warning, unknown REMCMD type %u\n",
8322 + req->rr_header.rh_cmd);
8326 +static void copy_from_cb(void *dst, const void *base, unsigned offset,
8327 + unsigned len, unsigned limit)
8329 + unsigned copy = len;
8331 + if ((copy + offset) > limit)
8332 + copy = limit - offset;
8333 + memcpy(dst, base + offset, copy);
8336 + memcpy(dst + copy, base, len);
8339 +static void khexdump(const unsigned char *c, int len)
8341 + while (len > 16) {
8343 + "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n",
8344 + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8],
8345 + c[9], c[10], c[11], c[12], c[13], c[14], c[15]);
8349 + printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2],
8354 + printk(KERN_INFO "%02x\n", c[0]);
8360 + * Called from the low-level comms layer to process a buffer of
8363 + * Only complete messages are processed here, any "spare" bytes from
8364 + * the end of a buffer are saved and tacked onto the front of the next
8365 + * message that comes in. I doubt this will happen very often but we
8366 + * need to be able to cope with it and I don't want the task to be waiting
8367 + * for packets to come in when there is useful work to be done.
8370 +int midcomms_process_incoming_buffer(int nodeid, const void *base,
8371 + unsigned offset, unsigned len,
8374 + unsigned char __tmp[sizeof(struct dlm_header) + 64];
8375 + struct dlm_header *msg = (struct dlm_header *) __tmp;
8381 + while (len > sizeof(struct dlm_header)) {
8382 + /* Get message header and check it over */
8383 + copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
8385 + msglen = le16_to_cpu(msg->rh_length);
8386 + id = msg->rh_lkid;
8387 + space = msg->rh_lockspace;
8389 + /* Check message size */
8391 + if (msglen < sizeof(struct dlm_header))
8394 + if (msglen > dlm_config.buffer_size) {
8395 + printk("dlm: message size too big %d\n", msglen);
8400 + /* Not enough in buffer yet? wait for some more */
8404 + /* Make sure our temp buffer is large enough */
8405 + if (msglen > sizeof(__tmp) &&
8406 + msg == (struct dlm_header *) __tmp) {
8407 + msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
8412 + copy_from_cb(msg, base, offset, msglen, limit);
8413 + BUG_ON(id != msg->rh_lkid);
8414 + BUG_ON(space != msg->rh_lockspace);
8417 + offset &= (limit - 1);
8419 + network_to_host(msg);
8421 + if ((msg->rh_cmd > 32) ||
8422 + (msg->rh_cmd == 0) ||
8423 + (msg->rh_length < sizeof(struct dlm_header)) ||
8424 + (msg->rh_length > dlm_config.buffer_size)) {
8426 + printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, "
8427 + "lkid=%u, lockspace=%u\n",
8428 + msg->rh_cmd, msg->rh_flags, msg->rh_length,
8429 + msg->rh_lkid, msg->rh_lockspace);
8431 + printk("dlm: midcomms: base=%p, offset=%u, len=%u, "
8432 + "ret=%u, limit=%08x newbuf=%d\n",
8433 + base, offset, len, ret, limit,
8434 + ((struct dlm_header *) __tmp == msg));
8436 + khexdump((const unsigned char *) msg, msg->rh_length);
8441 + switch (msg->rh_cmd) {
8442 + case GDLM_REMCMD_RECOVERMESSAGE:
8443 + case GDLM_REMCMD_RECOVERREPLY:
8444 + process_recovery_comm(nodeid, msg);
8447 + process_cluster_request(nodeid, msg, FALSE);
8451 + if (msg != (struct dlm_header *) __tmp)
8454 + return err ? err : ret;
8458 + * Send a lowcomms buffer
8461 +void midcomms_send_buffer(struct dlm_header *msg, struct writequeue_entry *e)
8463 + host_to_network(msg);
8464 + lowcomms_commit_buffer(e);
8468 + * Make the message into network byte order and send it
8471 +int midcomms_send_message(uint32_t nodeid, struct dlm_header *msg,
8474 + int len = msg->rh_length;
8476 + host_to_network(msg);
8479 + * Loopback. In fact, the locking code pretty much prevents this from
8480 + * being needed but it can happen when the directory node is also the
8484 + if (nodeid == our_nodeid())
8485 + return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0,
8488 + return lowcomms_send_message(nodeid, (char *) msg, len, allocation);
8490 diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h
8491 --- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730
8492 +++ linux-patched/cluster/dlm/midcomms.h 2004-07-13 18:57:22.000000000 +0800
8494 +/******************************************************************************
8495 +*******************************************************************************
8497 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8498 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8500 +** This copyrighted material is made available to anyone wishing to use,
8501 +** modify, copy, or redistribute it subject to the terms and conditions
8502 +** of the GNU General Public License v.2.
8504 +*******************************************************************************
8505 +******************************************************************************/
8507 +#ifndef __MIDCOMMS_DOT_H__
8508 +#define __MIDCOMMS_DOT_H__
8510 +int midcomms_send_message(uint32_t csid, struct dlm_header *msg,
8512 +int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset,
8513 + unsigned len, unsigned limit);
8514 +void midcomms_send_buffer(struct dlm_header *msg,
8515 + struct writequeue_entry *e);
8517 +#endif /* __MIDCOMMS_DOT_H__ */
8518 diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c
8519 --- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730
8520 +++ linux-patched/cluster/dlm/nodes.c 2004-07-13 18:57:22.000000000 +0800
8522 +/******************************************************************************
8523 +*******************************************************************************
8525 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8526 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8528 +** This copyrighted material is made available to anyone wishing to use,
8529 +** modify, copy, or redistribute it subject to the terms and conditions
8530 +** of the GNU General Public License v.2.
8532 +*******************************************************************************
8533 +******************************************************************************/
8535 +#include <net/sock.h>
8536 +#include <cluster/cnxman.h>
8538 +#include "dlm_internal.h"
8539 +#include "lowcomms.h"
8541 +#include "recover.h"
8542 +#include "reccomms.h"
8545 +static struct list_head cluster_nodes;
8546 +static spinlock_t node_lock;
8547 +static uint32_t local_nodeid;
8548 +static struct semaphore local_init_lock;
8551 +void dlm_nodes_init(void)
8553 + INIT_LIST_HEAD(&cluster_nodes);
8554 + spin_lock_init(&node_lock);
8556 + init_MUTEX(&local_init_lock);
8559 +static struct dlm_node *search_node(uint32_t nodeid)
8561 + struct dlm_node *node;
8563 + list_for_each_entry(node, &cluster_nodes, list) {
8564 + if (node->nodeid == nodeid)
8572 +static void put_node(struct dlm_node *node)
8574 + spin_lock(&node_lock);
8576 + if (node->refcount == 0) {
8577 + list_del(&node->list);
8578 + spin_unlock(&node_lock);
8582 + spin_unlock(&node_lock);
8585 +static int get_node(uint32_t nodeid, struct dlm_node **ndp)
8587 + struct dlm_node *node, *node2;
8588 + int error = -ENOMEM;
8590 + spin_lock(&node_lock);
8591 + node = search_node(nodeid);
8594 + spin_unlock(&node_lock);
8599 + node = (struct dlm_node *) kmalloc(sizeof(struct dlm_node), GFP_KERNEL);
8603 + memset(node, 0, sizeof(struct dlm_node));
8604 + node->nodeid = nodeid;
8606 + spin_lock(&node_lock);
8607 + node2 = search_node(nodeid);
8609 + node2->refcount++;
8610 + spin_unlock(&node_lock);
8616 + node->refcount = 1;
8617 + list_add_tail(&node->list, &cluster_nodes);
8618 + spin_unlock(&node_lock);
8628 +int init_new_csb(uint32_t nodeid, struct dlm_csb **ret_csb)
8630 + struct dlm_csb *csb;
8631 + struct dlm_node *node;
8632 + int error = -ENOMEM;
8634 + csb = (struct dlm_csb *) kmalloc(sizeof(struct dlm_csb), GFP_KERNEL);
8638 + memset(csb, 0, sizeof(struct dlm_csb));
8640 + error = get_node(nodeid, &node);
8646 + down(&local_init_lock);
8648 + if (!local_nodeid) {
8649 + if (nodeid == our_nodeid()) {
8650 + local_nodeid = node->nodeid;
8653 + up(&local_init_lock);
8664 +void release_csb(struct dlm_csb *csb)
8666 + put_node(csb->node);
8670 +uint32_t our_nodeid(void)
8672 + return lowcomms_our_nodeid();
8675 +int nodes_reconfig_wait(struct dlm_ls *ls)
8679 + if (ls->ls_low_nodeid == our_nodeid()) {
8680 + error = dlm_wait_status_all(ls, NODES_VALID);
8682 + set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
8684 + /* Experimental: this delay should allow any final messages
8685 + * from the previous node to be received before beginning
8688 + if (ls->ls_num_nodes == 1) {
8689 + current->state = TASK_UNINTERRUPTIBLE;
8690 + schedule_timeout((2) * HZ);
8694 + error = dlm_wait_status_low(ls, NODES_ALL_VALID);
8699 +static void add_ordered_node(struct dlm_ls *ls, struct dlm_csb *new)
8701 + struct dlm_csb *csb = NULL;
8702 + struct list_head *tmp;
8703 + struct list_head *newlist = &new->list;
8704 + struct list_head *head = &ls->ls_nodes;
8706 + list_for_each(tmp, head) {
8707 + csb = list_entry(tmp, struct dlm_csb, list);
8709 + if (new->node->nodeid < csb->node->nodeid)
8714 + list_add_tail(newlist, head);
8716 + /* FIXME: can use list macro here */
8717 + newlist->prev = tmp->prev;
8718 + newlist->next = tmp;
8719 + tmp->prev->next = newlist;
8720 + tmp->prev = newlist;
8724 +int ls_nodes_reconfig(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
8726 + struct dlm_csb *csb, *safe;
8727 + int error, i, found, pos = 0, neg = 0;
8728 + uint32_t low = (uint32_t) (-1);
8731 + * Remove (and save) departed nodes from lockspace's nodes list
8734 + list_for_each_entry_safe(csb, safe, &ls->ls_nodes, list) {
8736 + for (i = 0; i < rv->node_count; i++) {
8737 + if (csb->node->nodeid == rv->nodeids[i]) {
8745 + csb->gone_event = rv->event_id;
8746 + list_del(&csb->list);
8747 + list_add_tail(&csb->list, &ls->ls_nodes_gone);
8748 + ls->ls_num_nodes--;
8749 + log_all(ls, "remove node %u", csb->node->nodeid);
8754 + * Add new nodes to lockspace's nodes list
8757 + for (i = 0; i < rv->node_count; i++) {
8759 + list_for_each_entry(csb, &ls->ls_nodes, list) {
8760 + if (csb->node->nodeid == rv->nodeids[i]) {
8769 + error = init_new_csb(rv->nodeids[i], &csb);
8770 + DLM_ASSERT(!error,);
8772 + add_ordered_node(ls, csb);
8773 + ls->ls_num_nodes++;
8774 + log_all(ls, "add node %u", csb->node->nodeid);
8778 + list_for_each_entry(csb, &ls->ls_nodes, list) {
8779 + if (csb->node->nodeid < low)
8780 + low = csb->node->nodeid;
8783 + rcom_log_clear(ls);
8784 + ls->ls_low_nodeid = low;
8785 + ls->ls_nodes_mask = dlm_next_power2(ls->ls_num_nodes) - 1;
8786 + set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8789 + error = nodes_reconfig_wait(ls);
8791 + log_all(ls, "total nodes %d", ls->ls_num_nodes);
8796 +int ls_nodes_init(struct dlm_ls *ls, struct dlm_recover *rv)
8798 + struct dlm_csb *csb;
8800 + uint32_t low = (uint32_t) (-1);
8802 + log_all(ls, "add nodes");
8804 + for (i = 0; i < rv->node_count; i++) {
8805 + error = init_new_csb(rv->nodeids[i], &csb);
8809 + add_ordered_node(ls, csb);
8810 + ls->ls_num_nodes++;
8812 + if (csb->node->nodeid < low)
8813 + low = csb->node->nodeid;
8816 + ls->ls_low_nodeid = low;
8817 + ls->ls_nodes_mask = dlm_next_power2(ls->ls_num_nodes) - 1;
8818 + set_bit(LSFL_NODES_VALID, &ls->ls_flags);
8820 + error = nodes_reconfig_wait(ls);
8822 + log_all(ls, "total nodes %d", ls->ls_num_nodes);
8827 + while (!list_empty(&ls->ls_nodes)) {
8828 + csb = list_entry(ls->ls_nodes.next, struct dlm_csb, list);
8829 + list_del(&csb->list);
8832 + ls->ls_num_nodes = 0;
8837 +int in_nodes_gone(struct dlm_ls *ls, uint32_t nodeid)
8839 + struct dlm_csb *csb;
8841 + list_for_each_entry(csb, &ls->ls_nodes_gone, list) {
8842 + if (csb->node->nodeid == nodeid)
8847 diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h
8848 --- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730
8849 +++ linux-patched/cluster/dlm/nodes.h 2004-07-13 18:57:22.000000000 +0800
8851 +/******************************************************************************
8852 +*******************************************************************************
8854 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8855 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8857 +** This copyrighted material is made available to anyone wishing to use,
8858 +** modify, copy, or redistribute it subject to the terms and conditions
8859 +** of the GNU General Public License v.2.
8861 +*******************************************************************************
8862 +******************************************************************************/
8864 +#ifndef __NODES_DOT_H__
8865 +#define __NODES_DOT_H__
8867 +void dlm_nodes_init(void);
8868 +int init_new_csb(uint32_t nodeid, struct dlm_csb ** ret_csb);
8869 +void release_csb(struct dlm_csb * csb);
8870 +uint32_t our_nodeid(void);
8871 +int ls_nodes_reconfig(struct dlm_ls * ls, struct dlm_recover * gr, int *neg);
8872 +int ls_nodes_init(struct dlm_ls * ls, struct dlm_recover * gr);
8873 +int in_nodes_gone(struct dlm_ls * ls, uint32_t nodeid);
8875 +#endif /* __NODES_DOT_H__ */
8876 diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c
8877 --- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730
8878 +++ linux-patched/cluster/dlm/proc.c 2004-07-13 18:57:22.000000000 +0800
8880 +/******************************************************************************
8881 +*******************************************************************************
8883 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8884 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8886 +** This copyrighted material is made available to anyone wishing to use,
8887 +** modify, copy, or redistribute it subject to the terms and conditions
8888 +** of the GNU General Public License v.2.
8890 +*******************************************************************************
8891 +******************************************************************************/
8893 +#include <linux/init.h>
8894 +#include <linux/proc_fs.h>
8895 +#include <linux/ctype.h>
8896 +#include <linux/seq_file.h>
8897 +#include <linux/module.h>
8899 +#include "dlm_internal.h"
8900 +#include "lockspace.h"
8902 +#if defined(DLM_DEBUG)
8903 +#define DLM_DEBUG_SIZE (1024)
8904 +#define MAX_DEBUG_MSG_LEN (64)
8906 +#define DLM_DEBUG_SIZE (0)
8907 +#define MAX_DEBUG_MSG_LEN (0)
8910 +static char * debug_buf;
8911 +static unsigned int debug_size;
8912 +static unsigned int debug_point;
8913 +static int debug_wrap;
8914 +static spinlock_t debug_lock;
8915 +static struct proc_dir_entry * debug_proc_entry = NULL;
8916 +static struct proc_dir_entry * rcom_proc_entry = NULL;
8917 +static char proc_ls_name[255] = "";
8919 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
8920 +static struct proc_dir_entry * locks_proc_entry = NULL;
8921 +static struct seq_operations locks_info_op;
8924 +static int locks_open(struct inode *inode, struct file *file)
8926 + return seq_open(file, &locks_info_op);
8929 +/* Write simply sets the lockspace to use */
8930 +static ssize_t locks_write(struct file *file, const char *buf,
8931 + size_t count, loff_t * ppos)
8933 + if (count < sizeof(proc_ls_name)) {
8934 + copy_from_user(proc_ls_name, buf, count);
8935 + proc_ls_name[count] = '\0';
8937 + /* Remove any trailing LF so that lazy users
8938 + can just echo "lsname" > /proc/cluster/dlm_locks */
8939 + if (proc_ls_name[count - 1] == '\n')
8940 + proc_ls_name[count - 1] = '\0';
8947 +static struct file_operations locks_fops = {
8949 + write:locks_write,
8952 + release:seq_release,
8955 +struct ls_dumpinfo {
8957 + struct list_head *next;
8958 + struct dlm_ls *ls;
8959 + struct dlm_rsb *rsb;
8962 +static int print_resource(struct dlm_rsb * res, struct seq_file *s);
8964 +static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di)
8969 + /* Find the next non-empty hash bucket */
8970 + for (i = di->entry; i < di->ls->ls_rsbtbl_size; i++) {
8971 + read_lock(&di->ls->ls_rsbtbl[i].lock);
8972 + if (!list_empty(&di->ls->ls_rsbtbl[i].list)) {
8973 + di->next = di->ls->ls_rsbtbl[i].list.next;
8974 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
8977 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
8981 + if (di->entry >= di->ls->ls_rsbtbl_size)
8982 + return NULL; /* End of hash list */
8983 + } else { /* Find the next entry in the list */
8985 + read_lock(&di->ls->ls_rsbtbl[i].lock);
8986 + di->next = di->next->next;
8987 + if (di->next->next == di->ls->ls_rsbtbl[i].list.next) {
8988 + /* End of list - move to next bucket */
8991 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
8992 + return next_rsb(di); /* do the top half of this conditional */
8994 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
8996 + di->rsb = list_entry(di->next, struct dlm_rsb, res_hashchain);
9001 +static void *s_start(struct seq_file *m, loff_t * pos)
9003 + struct ls_dumpinfo *di;
9004 + struct dlm_ls *ls;
9007 + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9011 + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
9016 + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
9022 + for (i = 0; i < *pos; i++)
9023 + if (next_rsb(di) == NULL)
9026 + return next_rsb(di);
9029 +static void *s_next(struct seq_file *m, void *p, loff_t * pos)
9031 + struct ls_dumpinfo *di = p;
9035 + return next_rsb(di);
9038 +static int s_show(struct seq_file *m, void *p)
9040 + struct ls_dumpinfo *di = p;
9041 + return print_resource(di->rsb, m);
9044 +static void s_stop(struct seq_file *m, void *p)
9049 +static struct seq_operations locks_info_op = {
9056 +static char *print_lockmode(int mode)
9078 +static void print_lock(struct seq_file *s, struct dlm_lkb * lkb, struct dlm_rsb * res)
9081 + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
9083 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9084 + || lkb->lkb_status == GDLM_LKSTS_WAITING)
9085 + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
9087 + if (lkb->lkb_range) {
9088 + /* This warns on Alpha. Tough. Only I see it */
9089 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9090 + || lkb->lkb_status == GDLM_LKSTS_GRANTED)
9091 + seq_printf(s, " %" PRIx64 "-%" PRIx64,
9092 + lkb->lkb_range[GR_RANGE_START],
9093 + lkb->lkb_range[GR_RANGE_END]);
9094 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9095 + || lkb->lkb_status == GDLM_LKSTS_WAITING)
9096 + seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
9097 + lkb->lkb_range[RQ_RANGE_START],
9098 + lkb->lkb_range[RQ_RANGE_END]);
9101 + if (lkb->lkb_nodeid) {
9102 + if (lkb->lkb_nodeid != res->res_nodeid)
9103 + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
9106 + seq_printf(s, " Master: %08x", lkb->lkb_remid);
9109 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
9110 + seq_printf(s, " LQ: %d", lkb->lkb_lockqueue_state);
9112 + seq_printf(s, "\n");
9115 +static int print_resource(struct dlm_rsb *res, struct seq_file *s)
9118 + struct list_head *locklist;
9120 + seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res,
9121 + res->res_parent, res->res_length);
9122 + for (i = 0; i < res->res_length; i++) {
9123 + if (isprint(res->res_name[i]))
9124 + seq_printf(s, "%c", res->res_name[i]);
9126 + seq_printf(s, "%c", '.');
9128 + if (res->res_nodeid)
9129 + seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
9132 + seq_printf(s, "\" \nMaster Copy\n");
9134 + /* Print the LVB: */
9135 + if (res->res_lvbptr) {
9136 + seq_printf(s, "LVB: ");
9137 + for (i = 0; i < DLM_LVB_LEN; i++) {
9138 + if (i == DLM_LVB_LEN / 2)
9139 + seq_printf(s, "\n ");
9140 + seq_printf(s, "%02x ",
9141 + (unsigned char) res->res_lvbptr[i]);
9143 + seq_printf(s, "\n");
9146 + /* Print the locks attached to this resource */
9147 + seq_printf(s, "Granted Queue\n");
9148 + list_for_each(locklist, &res->res_grantqueue) {
9149 + struct dlm_lkb *this_lkb =
9150 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9151 + print_lock(s, this_lkb, res);
9154 + seq_printf(s, "Conversion Queue\n");
9155 + list_for_each(locklist, &res->res_convertqueue) {
9156 + struct dlm_lkb *this_lkb =
9157 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9158 + print_lock(s, this_lkb, res);
9161 + seq_printf(s, "Waiting Queue\n");
9162 + list_for_each(locklist, &res->res_waitqueue) {
9163 + struct dlm_lkb *this_lkb =
9164 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9165 + print_lock(s, this_lkb, res);
9169 +#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */
9171 +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...)
9174 + int i, n, size, len;
9175 + char buf[MAX_DEBUG_MSG_LEN+1];
9177 + spin_lock(&debug_lock);
9182 + size = MAX_DEBUG_MSG_LEN;
9183 + memset(buf, 0, size+1);
9185 + n = snprintf(buf, size, "%s ", ls->ls_name);
9188 + va_start(va, fmt);
9189 + vsnprintf(buf+n, size, fmt, va);
9192 + len = strlen(buf);
9193 + if (len > MAX_DEBUG_MSG_LEN-1)
9194 + len = MAX_DEBUG_MSG_LEN-1;
9196 + buf[len+1] = '\0';
9198 + for (i = 0; i < strlen(buf); i++) {
9199 + debug_buf[debug_point++] = buf[i];
9201 + if (debug_point == debug_size) {
9207 + spin_unlock(&debug_lock);
9210 +void dlm_debug_dump(void)
9214 + spin_lock(&debug_lock);
9216 + for (i = debug_point; i < debug_size; i++)
9217 + printk("%c", debug_buf[i]);
9219 + for (i = 0; i < debug_point; i++)
9220 + printk("%c", debug_buf[i]);
9221 + spin_unlock(&debug_lock);
9224 +void dlm_debug_setup(int size)
9228 + if (size > PAGE_SIZE)
9231 + b = kmalloc(size, GFP_KERNEL);
9233 + spin_lock(&debug_lock);
9238 + debug_size = size;
9242 + memset(debug_buf, 0, debug_size);
9244 + spin_unlock(&debug_lock);
9247 +static void dlm_debug_init(void)
9253 + spin_lock_init(&debug_lock);
9255 + dlm_debug_setup(DLM_DEBUG_SIZE);
9258 +#ifdef CONFIG_PROC_FS
9259 +int dlm_debug_info(char *b, char **start, off_t offset, int length)
9263 + spin_lock(&debug_lock);
9266 + for (i = debug_point; i < debug_size; i++)
9267 + n += sprintf(b + n, "%c", debug_buf[i]);
9269 + for (i = 0; i < debug_point; i++)
9270 + n += sprintf(b + n, "%c", debug_buf[i]);
9272 + spin_unlock(&debug_lock);
9277 +int dlm_rcom_info(char *b, char **start, off_t offset, int length)
9279 + struct dlm_ls *ls;
9280 + struct dlm_csb *csb;
9283 + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9287 + n += sprintf(b + n, "nodeid names_send_count names_send_msgid "
9288 + "names_recv_count names_recv_msgid "
9289 + "locks_send_count locks_send_msgid "
9290 + "locks_recv_count locks_recv_msgid\n");
9292 + list_for_each_entry(csb, &ls->ls_nodes, list) {
9293 + n += sprintf(b + n, "%u %u %u %u %u %u %u %u %u\n",
9294 + csb->node->nodeid,
9295 + csb->names_send_count,
9296 + csb->names_send_msgid,
9297 + csb->names_recv_count,
9298 + csb->names_recv_msgid,
9299 + csb->locks_send_count,
9300 + csb->locks_send_msgid,
9301 + csb->locks_recv_count,
9302 + csb->locks_recv_msgid);
9308 +void dlm_proc_init(void)
9310 +#ifdef CONFIG_PROC_FS
9311 + debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO,
9313 + if (!debug_proc_entry)
9316 + debug_proc_entry->get_info = &dlm_debug_info;
9318 + rcom_proc_entry = create_proc_entry("cluster/dlm_rcom", S_IRUGO, NULL);
9319 + if (!rcom_proc_entry)
9322 + rcom_proc_entry->get_info = &dlm_rcom_info;
9326 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9327 + locks_proc_entry = create_proc_read_entry("cluster/dlm_locks",
9329 + NULL, NULL, NULL);
9330 + if (!locks_proc_entry)
9332 + locks_proc_entry->proc_fops = &locks_fops;
9336 +void dlm_proc_exit(void)
9338 +#ifdef CONFIG_PROC_FS
9339 + if (debug_proc_entry) {
9340 + remove_proc_entry("cluster/dlm_debug", NULL);
9341 + dlm_debug_setup(0);
9344 + if (rcom_proc_entry)
9345 + remove_proc_entry("cluster/dlm_rcom", NULL);
9348 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9349 + if (locks_proc_entry)
9350 + remove_proc_entry("cluster/dlm_locks", NULL);
9353 diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c
9354 --- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730
9355 +++ linux-patched/cluster/dlm/queries.c 2004-07-13 18:57:22.000000000 +0800
9357 +/******************************************************************************
9358 +*******************************************************************************
9360 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9361 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9363 +** This copyrighted material is made available to anyone wishing to use,
9364 +** modify, copy, or redistribute it subject to the terms and conditions
9365 +** of the GNU General Public License v.2.
9367 +*******************************************************************************
9368 +******************************************************************************/
9373 + * This file provides the kernel query interface to the DLM.
9377 +#define EXPORT_SYMTAB
9378 +#include <linux/module.h>
9380 +#include "dlm_internal.h"
9381 +#include "lockspace.h"
9382 +#include "lockqueue.h"
9383 +#include "locking.h"
9388 +#include "memory.h"
9389 +#include "lowcomms.h"
9390 +#include "midcomms.h"
9393 +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo);
9394 +static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo);
9397 + * API entry point.
9399 +int dlm_query(void *lockspace,
9400 + struct dlm_lksb *lksb,
9402 + struct dlm_queryinfo *qinfo,
9403 + void (ast_routine(void *)),
9406 + int status = -EINVAL;
9407 + struct dlm_lkb *target_lkb;
9408 + struct dlm_lkb *query_lkb = NULL; /* Our temporary LKB */
9409 + struct dlm_ls *ls = (struct dlm_ls *) find_lockspace_by_local_id(lockspace);
9421 + if (!qinfo->gqi_lockinfo)
9422 + qinfo->gqi_locksize = 0;
9424 + /* Find the lkid */
9425 + target_lkb = find_lock_by_id(ls, lksb->sb_lkid);
9429 + /* If the user wants a list of locks that are blocking or
9430 + not blocking this lock, then it must be waiting
9433 + if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING ||
9434 + (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) &&
9435 + target_lkb->lkb_status == GDLM_LKSTS_GRANTED)
9438 + /* We now allocate an LKB for our own use (so we can hang
9439 + * things like the AST routine and the lksb from it) */
9440 + lksb->sb_status = -EBUSY;
9441 + query_lkb = create_lkb(ls);
9446 + query_lkb->lkb_astaddr = ast_routine;
9447 + query_lkb->lkb_astparam = (long)astarg;
9448 + query_lkb->lkb_resource = target_lkb->lkb_resource;
9449 + query_lkb->lkb_lksb = lksb;
9451 + /* Don't free the resource while we are querying it. This ref
9452 + * will be dropped when the LKB is freed */
9453 + hold_rsb(query_lkb->lkb_resource);
9455 + /* Fill in the stuff that's always local */
9456 + if (qinfo->gqi_resinfo) {
9457 + if (target_lkb->lkb_resource->res_nodeid)
9458 + qinfo->gqi_resinfo->rsi_masternode =
9459 + target_lkb->lkb_resource->res_nodeid;
9461 + qinfo->gqi_resinfo->rsi_masternode = our_nodeid();
9462 + qinfo->gqi_resinfo->rsi_length =
9463 + target_lkb->lkb_resource->res_length;
9464 + memcpy(qinfo->gqi_resinfo->rsi_name,
9465 + target_lkb->lkb_resource->res_name,
9466 + qinfo->gqi_resinfo->rsi_length);
9469 + /* If the master is local (or the user doesn't want the overhead of a
9470 + * remote call) - fill in the details here */
9471 + if (target_lkb->lkb_resource->res_nodeid == 0 ||
9472 + (query & DLM_QUERY_LOCAL)) {
9475 + /* Resource info */
9476 + if (qinfo->gqi_resinfo) {
9477 + query_resource(target_lkb->lkb_resource,
9478 + qinfo->gqi_resinfo);
9482 + if (qinfo->gqi_lockinfo) {
9483 + status = query_locks(query, target_lkb, qinfo);
9486 + query_lkb->lkb_retstatus = status;
9487 + queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
9490 + /* An AST will be delivered so we must return success here */
9495 + /* Remote master */
9496 + if (target_lkb->lkb_resource->res_nodeid != 0)
9498 + struct dlm_query_request *remquery;
9499 + struct writequeue_entry *e;
9501 + /* Clear this cos the receiving end adds to it with
9502 + each incoming packet */
9503 + qinfo->gqi_lockcount = 0;
9505 + /* Squirrel a pointer to the query info struct
9506 + somewhere illegal */
9507 + query_lkb->lkb_request = (struct dlm_request *) qinfo;
9509 + e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid,
9510 + sizeof(struct dlm_query_request),
9511 + ls->ls_allocation,
9512 + (char **) &remquery);
9514 + status = -ENOBUFS;
9518 + /* Build remote packet */
9519 + memset(remquery, 0, sizeof(struct dlm_query_request));
9521 + remquery->rq_maxlocks = qinfo->gqi_locksize;
9522 + remquery->rq_query = query;
9523 + remquery->rq_mstlkid = target_lkb->lkb_remid;
9524 + if (qinfo->gqi_lockinfo)
9525 + remquery->rq_maxlocks = qinfo->gqi_locksize;
9527 + remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY;
9528 + remquery->rq_header.rh_flags = 0;
9529 + remquery->rq_header.rh_length = sizeof(struct dlm_query_request);
9530 + remquery->rq_header.rh_lkid = query_lkb->lkb_id;
9531 + remquery->rq_header.rh_lockspace = ls->ls_global_id;
9533 + midcomms_send_buffer(&remquery->rq_header, e);
9542 +static inline int valid_range(struct dlm_range *r)
9544 + if (r->ra_start != 0ULL ||
9545 + r->ra_end != 0xFFFFFFFFFFFFFFFFULL)
9551 +static void put_int(int x, char *buf, int *offp)
9553 + x = cpu_to_le32(x);
9554 + memcpy(buf + *offp, &x, sizeof(int));
9555 + *offp += sizeof(int);
9558 +static void put_int64(uint64_t x, char *buf, int *offp)
9560 + x = cpu_to_le64(x);
9561 + memcpy(buf + *offp, &x, sizeof(uint64_t));
9562 + *offp += sizeof(uint64_t);
9565 +static int get_int(char *buf, int *offp)
9568 + memcpy(&value, buf + *offp, sizeof(int));
9569 + *offp += sizeof(int);
9570 + return le32_to_cpu(value);
9573 +static uint64_t get_int64(char *buf, int *offp)
9577 + memcpy(&value, buf + *offp, sizeof(uint64_t));
9578 + *offp += sizeof(uint64_t);
9579 + return le64_to_cpu(value);
9582 +#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4)
9584 +/* Called from recvd to get lock info for a remote node */
9585 +int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg)
9587 + struct dlm_query_request *query = (struct dlm_query_request *) msg;
9588 + struct dlm_query_reply *reply;
9589 + struct dlm_resinfo resinfo;
9590 + struct dlm_queryinfo qinfo;
9591 + struct writequeue_entry *e;
9593 + struct dlm_lkb *lkb;
9598 + int start_lock = 0;
9600 + lkb = find_lock_by_id(ls, query->rq_mstlkid);
9606 + qinfo.gqi_resinfo = &resinfo;
9607 + qinfo.gqi_locksize = query->rq_maxlocks;
9609 + /* Get the resource bits */
9610 + query_resource(lkb->lkb_resource, &resinfo);
9612 + /* Now get the locks if wanted */
9613 + if (query->rq_maxlocks) {
9614 + qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks,
9616 + if (!qinfo.gqi_lockinfo) {
9621 + status = query_locks(query->rq_query, lkb, &qinfo);
9622 + if (status && status != -E2BIG) {
9623 + kfree(qinfo.gqi_lockinfo);
9628 + qinfo.gqi_lockinfo = NULL;
9629 + qinfo.gqi_lockcount = 0;
9632 + /* Send as many blocks as needed for all the locks */
9635 + int msg_len = sizeof(struct dlm_query_reply);
9636 + int last_msg_len = msg_len; /* keeps compiler quiet */
9639 + /* First work out how many locks we can fit into a block */
9640 + for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) {
9642 + last_msg_len = msg_len;
9644 + msg_len += LOCK_LEN;
9645 + if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) ||
9646 + valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) {
9648 + msg_len += sizeof(uint64_t) * 4;
9652 + /* There must be a neater way of doing this... */
9653 + if (msg_len > PAGE_SIZE) {
9655 + msg_len = last_msg_len;
9661 + e = lowcomms_get_buffer(nodeid,
9663 + ls->ls_allocation,
9664 + (char **) &reply);
9666 + kfree(qinfo.gqi_lockinfo);
9667 + status = -ENOBUFS;
9671 + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
9672 + reply->rq_header.rh_length = msg_len;
9673 + reply->rq_header.rh_lkid = msg->rh_lkid;
9674 + reply->rq_header.rh_lockspace = msg->rh_lockspace;
9676 + reply->rq_status = status;
9677 + reply->rq_startlock = cur_lock;
9678 + reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount;
9679 + reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount;
9680 + reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount;
9681 + memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN);
9683 + buf = (char *)reply;
9684 + bufidx = sizeof(struct dlm_query_reply);
9686 + for (; cur_lock < last_lock; cur_lock++) {
9688 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state;
9689 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode;
9690 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode;
9691 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx);
9692 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx);
9693 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx);
9694 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx);
9696 + if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) ||
9697 + valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) {
9699 + buf[bufidx++] = 1;
9700 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx);
9701 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx);
9702 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx);
9703 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx);
9706 + buf[bufidx++] = 0;
9710 + if (cur_lock == qinfo.gqi_lockcount) {
9711 + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
9715 + reply->rq_header.rh_flags = 0;
9718 + reply->rq_numlocks = cur_lock - start_lock;
9719 + start_lock = cur_lock;
9721 + midcomms_send_buffer(&reply->rq_header, e);
9722 + } while (!finished);
9724 + kfree(qinfo.gqi_lockinfo);
9729 + e = lowcomms_get_buffer(nodeid,
9730 + sizeof(struct dlm_query_reply),
9731 + ls->ls_allocation,
9732 + (char **) &reply);
9734 + status = -ENOBUFS;
9737 + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
9738 + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY; /* Don't support multiple blocks yet */
9739 + reply->rq_header.rh_length = sizeof(struct dlm_query_reply);
9740 + reply->rq_header.rh_lkid = msg->rh_lkid;
9741 + reply->rq_header.rh_lockspace = msg->rh_lockspace;
9742 + reply->rq_status = status;
9743 + reply->rq_numlocks = 0;
9744 + reply->rq_startlock = 0;
9745 + reply->rq_grantcount = 0;
9746 + reply->rq_convcount = 0;
9747 + reply->rq_waitcount = 0;
9749 + midcomms_send_buffer(&reply->rq_header, e);
9754 +/* Reply to a remote query */
9755 +int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg)
9757 + struct dlm_lkb *query_lkb;
9758 + struct dlm_queryinfo *qinfo;
9759 + struct dlm_query_reply *reply;
9764 + query_lkb = find_lock_by_id(ls, msg->rh_lkid);
9768 + qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request;
9769 + reply = (struct dlm_query_reply *) msg;
9771 + /* Copy the easy bits first */
9772 + qinfo->gqi_lockcount += reply->rq_numlocks;
9773 + if (qinfo->gqi_resinfo) {
9774 + qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount;
9775 + qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount;
9776 + qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount;
9777 + memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk,
9781 + /* Now unpack the locks */
9782 + bufidx = sizeof(struct dlm_query_reply);
9783 + buf = (char *) msg;
9785 + DLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize,
9786 + printk("start = %d, num + %d. Max= %d\n",
9787 + reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize););
9789 + for (i = reply->rq_startlock;
9790 + i < reply->rq_startlock + reply->rq_numlocks; i++) {
9791 + qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++];
9792 + qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++];
9793 + qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++];
9794 + qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx);
9795 + qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx);
9796 + qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx);
9797 + qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx);
9798 + if (buf[bufidx++]) {
9799 + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx);
9800 + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx);
9801 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx);
9802 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx);
9805 + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL;
9806 + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9807 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL;
9808 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
9812 + /* If this was the last block then now tell the user */
9813 + if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) {
9814 + query_lkb->lkb_retstatus = reply->rq_status;
9815 + queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
9822 +/* Aggregate resource information */
9823 +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo)
9825 + struct list_head *tmp;
9828 + if (rsb->res_lvbptr)
9829 + memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN);
9831 + resinfo->rsi_grantcount = 0;
9832 + list_for_each(tmp, &rsb->res_grantqueue) {
9833 + resinfo->rsi_grantcount++;
9836 + resinfo->rsi_waitcount = 0;
9837 + list_for_each(tmp, &rsb->res_waitqueue) {
9838 + resinfo->rsi_waitcount++;
9841 + resinfo->rsi_convcount = 0;
9842 + list_for_each(tmp, &rsb->res_convertqueue) {
9843 + resinfo->rsi_convcount++;
9849 +static int add_lock(struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo)
9853 + /* Don't fill it in if the buffer is full */
9854 + if (qinfo->gqi_lockcount == qinfo->gqi_locksize)
9857 + /* gqi_lockcount contains the number of locks we have returned */
9858 + entry = qinfo->gqi_lockcount++;
9860 + /* Fun with master copies */
9861 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
9862 + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid;
9863 + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id;
9866 + qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id;
9867 + qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid;
9870 + /* Also make sure we always have a valid nodeid in there, the
9871 + calling end may not know which node "0" is */
9872 + if (lkb->lkb_nodeid)
9873 + qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid;
9875 + qinfo->gqi_lockinfo[entry].lki_node = our_nodeid();
9877 + if (lkb->lkb_parent)
9878 + qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id;
9880 + qinfo->gqi_lockinfo[entry].lki_parent = 0;
9882 + qinfo->gqi_lockinfo[entry].lki_state = lkb->lkb_status;
9883 + qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode;
9884 + qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode;
9886 + if (lkb->lkb_range) {
9887 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start =
9888 + lkb->lkb_range[GR_RANGE_START];
9889 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_end =
9890 + lkb->lkb_range[GR_RANGE_END];
9891 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start =
9892 + lkb->lkb_range[RQ_RANGE_START];
9893 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end =
9894 + lkb->lkb_range[RQ_RANGE_END];
9896 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL;
9897 + qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xffffffffffffffffULL;
9898 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL;
9899 + qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xffffffffffffffffULL;
9904 +static int query_lkb_queue(struct list_head *queue, int query,
9905 + struct dlm_queryinfo *qinfo)
9907 + struct list_head *tmp;
9909 + int mode = query & DLM_QUERY_MODE_MASK;
9911 + list_for_each(tmp, queue) {
9912 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
9915 + if (query & DLM_QUERY_RQMODE)
9916 + lkmode = lkb->lkb_rqmode;
9918 + lkmode = lkb->lkb_grmode;
9920 + /* Add the LKB info to the list if it matches the criteria in
9921 + * the query bitmap */
9922 + switch (query & DLM_QUERY_MASK) {
9923 + case DLM_QUERY_LOCKS_ALL:
9924 + status = add_lock(lkb, qinfo);
9927 + case DLM_QUERY_LOCKS_HIGHER:
9928 + if (lkmode > mode)
9929 + status = add_lock(lkb, qinfo);
9932 + case DLM_QUERY_LOCKS_EQUAL:
9933 + if (lkmode == mode)
9934 + status = add_lock(lkb, qinfo);
9937 + case DLM_QUERY_LOCKS_LOWER:
9938 + if (lkmode < mode)
9939 + status = add_lock(lkb, qinfo);
9947 + * Return 1 if the locks' ranges overlap
9948 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
9950 +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
9952 + if (!lkb1->lkb_range || !lkb2->lkb_range)
9955 + if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] ||
9956 + lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END])
9961 +extern const int __dlm_compat_matrix[8][8];
9964 +static int get_blocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo)
9966 + struct list_head *tmp;
9969 + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9970 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
9972 + if (ranges_overlap(lkb, qlkb) &&
9973 + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])
9974 + status = add_lock(lkb, qinfo);
9980 +static int get_nonblocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo)
9982 + struct list_head *tmp;
9985 + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
9986 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
9988 + if (!(ranges_overlap(lkb, qlkb) &&
9989 + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]))
9990 + status = add_lock(lkb, qinfo);
9996 +/* Gather a list of appropriate locks */
9997 +static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo)
10002 + /* Mask in the actual granted/requested mode of the lock if LOCK_THIS
10003 + * was requested as the mode
10005 + if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) {
10006 + query &= ~DLM_QUERY_MODE_MASK;
10007 + if (query & DLM_QUERY_RQMODE)
10008 + query |= lkb->lkb_rqmode;
10010 + query |= lkb->lkb_grmode;
10013 + qinfo->gqi_lockcount = 0;
10015 + /* BLOCKING/NOTBLOCK only look at the granted queue */
10016 + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING)
10017 + return get_blocking_locks(lkb, qinfo);
10019 + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK)
10020 + return get_nonblocking_locks(lkb, qinfo);
10022 + /* Do the lock queues that were requested */
10023 + if (query & DLM_QUERY_QUEUE_GRANT) {
10024 + status = query_lkb_queue(&lkb->lkb_resource->res_grantqueue,
10028 + if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) {
10029 + status = query_lkb_queue(&lkb->lkb_resource->res_convertqueue,
10033 + if (!status && (query & DLM_QUERY_QUEUE_WAIT)) {
10034 + status = query_lkb_queue(&lkb->lkb_resource->res_waitqueue,
10042 +EXPORT_SYMBOL(dlm_query);
10044 + * Overrides for Emacs so that we follow Linus's tabbing style.
10045 + * Emacs will notice this stuff at the end of the file and automatically
10046 + * adjust the settings for this buffer only. This must remain at the end
10048 + * ---------------------------------------------------------------------------
10049 + * Local variables:
10050 + * c-file-style: "linux"
10053 diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h
10054 --- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730
10055 +++ linux-patched/cluster/dlm/queries.h 2004-07-13 18:57:22.000000000 +0800
10057 +/******************************************************************************
10058 +*******************************************************************************
10060 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10061 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10063 +** This copyrighted material is made available to anyone wishing to use,
10064 +** modify, copy, or redistribute it subject to the terms and conditions
10065 +** of the GNU General Public License v.2.
10067 +*******************************************************************************
10068 +******************************************************************************/
10070 +#ifndef __QUERIES_DOT_H__
10071 +#define __QUERIES_DOT_H__
10073 +extern int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg);
10074 +extern int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg);
10076 +#endif /* __QUERIES_DOT_H__ */
10077 diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c
10078 --- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730
10079 +++ linux-patched/cluster/dlm/rebuild.c 2004-07-13 18:57:22.000000000 +0800
10081 +/******************************************************************************
10082 +*******************************************************************************
10084 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10085 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10087 +** This copyrighted material is made available to anyone wishing to use,
10088 +** modify, copy, or redistribute it subject to the terms and conditions
10089 +** of the GNU General Public License v.2.
10091 +*******************************************************************************
10092 +******************************************************************************/
10095 + * Rebuild RSB's on new masters. Functions for transferring locks and
10096 + * subresources to new RSB masters during recovery.
10099 +#include "dlm_internal.h"
10100 +#include "reccomms.h"
10103 +#include "nodes.h"
10104 +#include "config.h"
10105 +#include "memory.h"
10106 +#include "recover.h"
10109 +/* Types of entity serialised in remastering messages */
10110 +#define REMASTER_ROOTRSB 1
10111 +#define REMASTER_RSB 2
10112 +#define REMASTER_LKB 3
10114 +struct rcom_fill {
10115 + char * outbuf; /* Beginning of data */
10116 + int offset; /* Current offset into outbuf */
10117 + int maxlen; /* Max value of offset */
10120 + struct dlm_rsb * rsb;
10121 + struct dlm_rsb * subrsb;
10122 + struct dlm_lkb * lkb;
10123 + struct list_head * lkbqueue;
10126 +typedef struct rcom_fill rcom_fill_t;
10129 +struct rebuild_node {
10130 + struct list_head list;
10132 + struct dlm_rsb * rootrsb;
10134 +typedef struct rebuild_node rebuild_node_t;
10138 + * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new
10139 + * master. The rsb will be "done" with recovery when the new master has
10140 + * replied with all the new remote lockid's for this rsb's lkb's.
10143 +void expect_new_lkids(struct dlm_rsb *rsb)
10145 + rsb->res_newlkid_expect = 0;
10146 + recover_list_add(rsb);
10150 + * This function is called on root rsb or subrsb when another lkb is being sent
10151 + * to the new master for which we expect to receive a corresponding remote lkid
10154 +void need_new_lkid(struct dlm_rsb *rsb)
10156 + struct dlm_rsb *root = rsb;
10158 + if (rsb->res_parent)
10159 + root = rsb->res_root;
10161 + if (!root->res_newlkid_expect)
10162 + recover_list_add(root);
10164 + DLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),);
10166 + root->res_newlkid_expect++;
10170 + * This function is called for each lkb for which a new remote lkid is
10171 + * received. Decrement the expected number of remote lkids expected for the
10175 +void have_new_lkid(struct dlm_lkb *lkb)
10177 + struct dlm_rsb *root = lkb->lkb_resource;
10179 + if (root->res_parent)
10180 + root = root->res_root;
10182 + down_write(&root->res_lock);
10184 + DLM_ASSERT(root->res_newlkid_expect,
10185 + printk("newlkid_expect=%d\n", root->res_newlkid_expect););
10187 + root->res_newlkid_expect--;
10189 + if (!root->res_newlkid_expect) {
10190 + clear_bit(RESFL_NEW_MASTER, &root->res_flags);
10191 + recover_list_del(root);
10193 + up_write(&root->res_lock);
10197 + * Return the rebuild struct for a node - will create an entry on the rootrsb
10198 + * list if necessary.
10200 + * Currently no locking is needed here as it all happens in the dlm_recvd
10204 +static rebuild_node_t *find_rebuild_root(struct dlm_ls *ls, int nodeid)
10206 + rebuild_node_t *node = NULL;
10208 + list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) {
10209 + if (node->nodeid == nodeid)
10213 + /* Not found, add one */
10214 + node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL);
10218 + node->nodeid = nodeid;
10219 + node->rootrsb = NULL;
10220 + list_add(&node->list, &ls->ls_rebuild_rootrsb_list);
10226 + * Tidy up after a rebuild run. Called when all recovery has finished
10229 +void rebuild_freemem(struct dlm_ls *ls)
10231 + rebuild_node_t *node = NULL, *s;
10233 + list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) {
10234 + list_del(&node->list);
10239 +static void put_int(int x, char *buf, int *offp)
10241 + x = cpu_to_le32(x);
10242 + memcpy(buf + *offp, &x, sizeof(int));
10243 + *offp += sizeof(int);
10246 +static void put_int64(uint64_t x, char *buf, int *offp)
10248 + x = cpu_to_le64(x);
10249 + memcpy(buf + *offp, &x, sizeof(uint64_t));
10250 + *offp += sizeof(uint64_t);
10253 +static void put_bytes(char *x, int len, char *buf, int *offp)
10255 + put_int(len, buf, offp);
10256 + memcpy(buf + *offp, x, len);
10260 +static void put_char(char x, char *buf, int *offp)
10266 +static int get_int(char *buf, int *offp)
10269 + memcpy(&value, buf + *offp, sizeof(int));
10270 + *offp += sizeof(int);
10271 + return le32_to_cpu(value);
10274 +static uint64_t get_int64(char *buf, int *offp)
10278 + memcpy(&value, buf + *offp, sizeof(uint64_t));
10279 + *offp += sizeof(uint64_t);
10280 + return le64_to_cpu(value);
10283 +static char get_char(char *buf, int *offp)
10285 + char x = buf[*offp];
10291 +static void get_bytes(char *bytes, int *len, char *buf, int *offp)
10293 + *len = get_int(buf, offp);
10294 + memcpy(bytes, buf + *offp, *len);
10298 +static int lkb_length(struct dlm_lkb *lkb)
10302 + len += sizeof(int); /* lkb_id */
10303 + len += sizeof(int); /* lkb_resource->res_remasterid */
10304 + len += sizeof(int); /* lkb_flags */
10305 + len += sizeof(int); /* lkb_status */
10306 + len += sizeof(char); /* lkb_rqmode */
10307 + len += sizeof(char); /* lkb_grmode */
10308 + len += sizeof(int); /* lkb_childcnt */
10309 + len += sizeof(int); /* lkb_parent->lkb_id */
10310 + len += sizeof(int); /* lkb_bastaddr */
10312 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10313 + len += sizeof(int); /* number of lvb bytes */
10314 + len += DLM_LVB_LEN;
10317 + if (lkb->lkb_range) {
10318 + len += sizeof(uint64_t);
10319 + len += sizeof(uint64_t);
10320 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT) {
10321 + len += sizeof(uint64_t);
10322 + len += sizeof(uint64_t);
10330 + * It's up to the caller to be sure there's enough space in the buffer.
10333 +static void serialise_lkb(struct dlm_lkb *lkb, char *buf, int *offp)
10337 + /* Need to tell the remote end if we have a range */
10338 + flags = lkb->lkb_flags;
10339 + if (lkb->lkb_range)
10340 + flags |= GDLM_LKFLG_RANGE;
10343 + * See lkb_length()
10344 + * Total: 30 (no lvb) or 66 (with lvb) bytes
10347 + put_int(lkb->lkb_id, buf, offp);
10348 + put_int(lkb->lkb_resource->res_remasterid, buf, offp);
10349 + put_int(flags, buf, offp);
10350 + put_int(lkb->lkb_status, buf, offp);
10351 + put_char(lkb->lkb_rqmode, buf, offp);
10352 + put_char(lkb->lkb_grmode, buf, offp);
10353 + put_int(atomic_read(&lkb->lkb_childcnt), buf, offp);
10355 + if (lkb->lkb_parent)
10356 + put_int(lkb->lkb_parent->lkb_id, buf, offp);
10358 + put_int(0, buf, offp);
10360 + if (lkb->lkb_bastaddr)
10361 + put_int(1, buf, offp);
10363 + put_int(0, buf, offp);
10365 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
10366 + DLM_ASSERT(lkb->lkb_lvbptr,);
10367 + put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp);
10370 + /* Only send the range we actually need */
10371 + if (lkb->lkb_range) {
10372 + switch (lkb->lkb_status) {
10373 + case GDLM_LKSTS_CONVERT:
10374 + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10375 + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10376 + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10377 + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10379 + case GDLM_LKSTS_WAITING:
10380 + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
10381 + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
10383 + case GDLM_LKSTS_GRANTED:
10384 + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
10385 + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
10393 +static int rsb_length(struct dlm_rsb *rsb)
10397 + len += sizeof(int); /* number of res_name bytes */
10398 + len += rsb->res_length; /* res_name */
10399 + len += sizeof(int); /* res_remasterid */
10400 + len += sizeof(int); /* res_parent->res_remasterid */
10405 +static inline struct dlm_rsb *next_subrsb(struct dlm_rsb *subrsb)
10407 + struct list_head *tmp;
10408 + struct dlm_rsb *r;
10410 + tmp = subrsb->res_subreslist.next;
10411 + r = list_entry(tmp, struct dlm_rsb, res_subreslist);
10416 +static inline int last_in_list(struct dlm_rsb *r, struct list_head *head)
10418 + struct dlm_rsb *last;
10419 + last = list_entry(head->prev, struct dlm_rsb, res_subreslist);
10426 + * Used to decide if an rsb should be rebuilt on a new master. An rsb only
10427 + * needs to be rebuilt if we have lkb's queued on it. NOREBUILD lkb's on the
10428 + * wait queue are not rebuilt.
10431 +static int lkbs_to_remaster(struct dlm_rsb *r)
10433 + struct dlm_lkb *lkb;
10434 + struct dlm_rsb *sub;
10436 + if (!list_empty(&r->res_grantqueue) ||
10437 + !list_empty(&r->res_convertqueue))
10440 + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
10441 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10446 + list_for_each_entry(sub, &r->res_subreslist, res_subreslist) {
10447 + if (!list_empty(&sub->res_grantqueue) ||
10448 + !list_empty(&sub->res_convertqueue))
10451 + list_for_each_entry(lkb, &sub->res_waitqueue, lkb_statequeue) {
10452 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10461 +static void serialise_rsb(struct dlm_rsb *rsb, char *buf, int *offp)
10464 + * See rsb_length()
10465 + * Total: 36 bytes (4 + 24 + 4 + 4)
10468 + put_bytes(rsb->res_name, rsb->res_length, buf, offp);
10469 + put_int(rsb->res_remasterid, buf, offp);
10471 + if (rsb->res_parent)
10472 + put_int(rsb->res_parent->res_remasterid, buf, offp);
10474 + put_int(0, buf, offp);
10476 + DLM_ASSERT(!rsb->res_lvbptr,);
10480 + * Flatten an LKB into a buffer for sending to the new RSB master. As a
10481 + * side-effect the nodeid of the lock is set to the nodeid of the new RSB
10485 +static int pack_one_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb,
10486 + rcom_fill_t *fill)
10488 + if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen)
10491 + lkb->lkb_nodeid = r->res_nodeid;
10493 + put_char(REMASTER_LKB, fill->outbuf, &fill->offset);
10494 + serialise_lkb(lkb, fill->outbuf, &fill->offset);
10497 + need_new_lkid(r);
10505 + * Pack all LKB's from a given queue, except for those with the NOREBUILD flag.
10508 +static int pack_lkb_queue(struct dlm_rsb *r, struct list_head *queue,
10509 + rcom_fill_t *fill)
10511 + struct dlm_lkb *lkb;
10514 + list_for_each_entry(lkb, queue, lkb_statequeue) {
10515 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
10518 + error = pack_one_lkb(r, lkb, fill);
10527 + fill->lkbqueue = queue;
10532 +static int pack_lkb_queues(struct dlm_rsb *r, rcom_fill_t *fill)
10536 + error = pack_lkb_queue(r, &r->res_grantqueue, fill);
10540 + error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10544 + error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10551 + * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb
10552 + * queue and full lkb queues.
10555 +static int pack_lkb_remaining(struct dlm_rsb *r, rcom_fill_t *fill)
10557 + struct list_head *tmp, *start, *end;
10558 + struct dlm_lkb *lkb;
10562 + * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue.
10565 + error = pack_one_lkb(r, fill->lkb, fill);
10569 + start = fill->lkb->lkb_statequeue.next;
10570 + end = fill->lkbqueue;
10572 + for (tmp = start; tmp != end; tmp = tmp->next) {
10573 + lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
10575 + error = pack_one_lkb(r, lkb, fill);
10583 + * Pack all lkb's on r's queues following fill->lkbqueue.
10586 + if (fill->lkbqueue == &r->res_waitqueue)
10588 + if (fill->lkbqueue == &r->res_convertqueue)
10591 + DLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,);
10593 + error = pack_lkb_queue(r, &r->res_convertqueue, fill);
10597 + error = pack_lkb_queue(r, &r->res_waitqueue, fill);
10603 +static int pack_one_subrsb(struct dlm_rsb *rsb, struct dlm_rsb *subrsb,
10604 + rcom_fill_t *fill)
10608 + down_write(&subrsb->res_lock);
10610 + if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen)
10613 + subrsb->res_nodeid = rsb->res_nodeid;
10614 + subrsb->res_remasterid = ++fill->remasterid;
10616 + put_char(REMASTER_RSB, fill->outbuf, &fill->offset);
10617 + serialise_rsb(subrsb, fill->outbuf, &fill->offset);
10619 + error = pack_lkb_queues(subrsb, fill);
10623 + up_write(&subrsb->res_lock);
10628 + up_write(&subrsb->res_lock);
10629 + fill->subrsb = subrsb;
10634 +static int pack_subrsbs(struct dlm_rsb *rsb, struct dlm_rsb *in_subrsb,
10635 + rcom_fill_t *fill)
10637 + struct dlm_rsb *subrsb;
10641 + * When an initial subrsb is given, we know it needs to be packed.
10642 + * When no initial subrsb is given, begin with the first (if any exist).
10645 + if (!in_subrsb) {
10646 + if (list_empty(&rsb->res_subreslist))
10649 + subrsb = list_entry(rsb->res_subreslist.next, struct dlm_rsb,
10652 + subrsb = in_subrsb;
10655 + error = pack_one_subrsb(rsb, subrsb, fill);
10659 + if (last_in_list(subrsb, &rsb->res_subreslist))
10662 + subrsb = next_subrsb(subrsb);
10670 + * Finish packing whatever is left in an rsb tree. If space runs out while
10671 + * finishing, save subrsb/lkb and this will be called again for the same rsb.
10673 + * !subrsb && lkb, we left off part way through root rsb's lkbs.
10674 + * subrsb && !lkb, we left off just before starting a new subrsb.
10675 + * subrsb && lkb, we left off part way through a subrsb's lkbs.
10676 + * !subrsb && !lkb, we shouldn't be in this function, but starting
10677 + * a new rsb in pack_rsb_tree().
10680 +static int pack_rsb_tree_remaining(struct dlm_ls *ls, struct dlm_rsb *rsb,
10681 + rcom_fill_t *fill)
10683 + struct dlm_rsb *subrsb = NULL;
10686 + if (!fill->subrsb && fill->lkb) {
10687 + error = pack_lkb_remaining(rsb, fill);
10691 + error = pack_subrsbs(rsb, NULL, fill);
10696 + else if (fill->subrsb && !fill->lkb) {
10697 + error = pack_subrsbs(rsb, fill->subrsb, fill);
10702 + else if (fill->subrsb && fill->lkb) {
10703 + error = pack_lkb_remaining(fill->subrsb, fill);
10707 + if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist))
10710 + subrsb = next_subrsb(fill->subrsb);
10712 + error = pack_subrsbs(rsb, subrsb, fill);
10717 + fill->subrsb = NULL;
10718 + fill->lkb = NULL;
10725 + * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a
10726 + * buffer. When the buffer runs out of space, save the place to restart (the
10727 + * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit).
10730 +static int pack_rsb_tree(struct dlm_ls *ls, struct dlm_rsb *rsb,
10731 + rcom_fill_t *fill)
10733 + int error = -ENOSPC;
10735 + fill->remasterid = 0;
10738 + * Pack the root rsb itself. A 1 byte type precedes the serialised
10739 + * rsb. Then pack the lkb's for the root rsb.
10742 + down_write(&rsb->res_lock);
10744 + if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen)
10747 + rsb->res_remasterid = ++fill->remasterid;
10748 + put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset);
10749 + serialise_rsb(rsb, fill->outbuf, &fill->offset);
10751 + error = pack_lkb_queues(rsb, fill);
10755 + up_write(&rsb->res_lock);
10758 + * Pack subrsb/lkb's under the root rsb.
10761 + error = pack_subrsbs(rsb, NULL, fill);
10766 + up_write(&rsb->res_lock);
10771 + * Given an RSB, return the next RSB that should be sent to a new master.
10774 +static struct dlm_rsb *next_remastered_rsb(struct dlm_ls *ls,
10775 + struct dlm_rsb *rsb)
10777 + struct list_head *tmp, *start, *end;
10778 + struct dlm_rsb *r;
10781 + start = ls->ls_rootres.next;
10783 + start = rsb->res_rootlist.next;
10785 + end = &ls->ls_rootres;
10787 + for (tmp = start; tmp != end; tmp = tmp->next) {
10788 + r = list_entry(tmp, struct dlm_rsb, res_rootlist);
10790 + if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) {
10791 + if (r->res_nodeid && lkbs_to_remaster(r)) {
10792 + expect_new_lkids(r);
10795 + clear_bit(RESFL_NEW_MASTER, &r->res_flags);
10803 + * Given an rcom buffer, fill it with RSB's that need to be sent to a single
10804 + * new master node. In the case where all the data to send to one node
10805 + * requires multiple messages, this function needs to resume filling each
10806 + * successive buffer from the point where it left off when the previous buffer
10810 +static void fill_rcom_buffer(struct dlm_ls *ls, rcom_fill_t *fill,
10811 + uint32_t *nodeid)
10813 + struct dlm_rsb *rsb, *prev_rsb = fill->rsb;
10816 + fill->offset = 0;
10821 + * The first time this function is called.
10824 + rsb = next_remastered_rsb(ls, NULL);
10828 + } else if (fill->subrsb || fill->lkb) {
10831 + * Continue packing an rsb tree that was partially packed last
10832 + * time (fill->subrsb/lkb indicates where packing of last block
10837 + *nodeid = rsb->res_nodeid;
10839 + error = pack_rsb_tree_remaining(ls, rsb, fill);
10840 + if (error == -ENOSPC)
10843 + rsb = next_remastered_rsb(ls, prev_rsb);
10847 + if (rsb->res_nodeid != prev_rsb->res_nodeid)
10854 + * Pack rsb trees into the buffer until we run out of space, run out of
10855 + * new rsb's or hit a new nodeid.
10858 + *nodeid = rsb->res_nodeid;
10861 + error = pack_rsb_tree(ls, rsb, fill);
10862 + if (error == -ENOSPC)
10867 + rsb = next_remastered_rsb(ls, prev_rsb);
10871 + if (rsb->res_nodeid != prev_rsb->res_nodeid)
10885 + * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters.
10888 +int rebuild_rsbs_send(struct dlm_ls *ls)
10890 + struct dlm_rcom *rc;
10891 + rcom_fill_t fill;
10895 + DLM_ASSERT(recover_list_empty(ls),);
10897 + log_all(ls, "rebuild locks");
10900 + rc = allocate_rcom_buffer(ls);
10905 + memset(&fill, 0, sizeof(rcom_fill_t));
10906 + fill.outbuf = rc->rc_buf;
10907 + fill.maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
10910 + fill_rcom_buffer(ls, &fill, &nodeid);
10911 + if (!fill.offset)
10914 + rc->rc_datalen = fill.offset;
10915 + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0);
10920 + error = dlm_recovery_stopped(ls);
10924 + while (fill.more);
10926 + error = dlm_wait_function(ls, &recover_list_empty);
10928 + log_all(ls, "rebuilt %d locks", fill.count);
10931 + rebuild_freemem(ls);
10932 + free_rcom_buffer(rc);
10938 +static struct dlm_rsb *find_by_remasterid(struct dlm_ls *ls, int remasterid,
10939 + struct dlm_rsb *rootrsb)
10941 + struct dlm_rsb *rsb;
10943 + DLM_ASSERT(rootrsb,);
10945 + if (rootrsb->res_remasterid == remasterid) {
10950 + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
10951 + if (rsb->res_remasterid == remasterid)
10961 + * Search a queue for the given remote lock id (remlkid).
10964 +static struct dlm_lkb *search_remlkid(struct list_head *statequeue, int nodeid,
10967 + struct dlm_lkb *lkb;
10969 + list_for_each_entry(lkb, statequeue, lkb_statequeue) {
10970 + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) {
10979 + * Given a remote lock ID (and a parent resource), return the local LKB for it
10980 + * Hopefully we don't need to do this too often on deep lock trees. This is
10981 + * VERY suboptimal for anything but the smallest lock trees. It searches the
10982 + * lock tree for an LKB with the remote id "remid" and the node "nodeid" and
10983 + * returns the LKB address. OPTIMISATION: we should keep a list of these while
10984 + * we are building up the remastered LKBs
10987 +static struct dlm_lkb *find_by_remlkid(struct dlm_rsb *rootrsb, int nodeid,
10990 + struct dlm_lkb *lkb;
10991 + struct dlm_rsb *rsb;
10993 + lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid);
10997 + lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid);
11001 + lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid);
11005 + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
11006 + lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid);
11010 + lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid);
11014 + lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid);
11025 + * Unpack an LKB from a remaster operation
11028 +static int deserialise_lkb(struct dlm_ls *ls, int rem_nodeid,
11029 + struct dlm_rsb *rootrsb, char *buf, int *ptr,
11030 + char *outbuf, int *outoffp)
11032 + struct dlm_lkb *lkb;
11033 + struct dlm_rsb *rsb;
11034 + int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp;
11036 + remote_lkid = get_int(buf, ptr);
11038 + rsb_rmid = get_int(buf, ptr);
11039 + rsb = find_by_remasterid(ls, rsb_rmid, rootrsb);
11040 + DLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid););
11043 + * We could have received this lkb already from a previous recovery
11044 + * that was interrupted. If so, just return the lkid to the remote
11047 + lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid);
11051 + lkb = create_lkb(rsb->res_ls);
11055 + lkb->lkb_remid = remote_lkid;
11056 + lkb->lkb_flags = get_int(buf, ptr);
11057 + status = get_int(buf, ptr);
11058 + lkb->lkb_rqmode = get_char(buf, ptr);
11059 + lkb->lkb_grmode = get_char(buf, ptr);
11060 + atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr));
11062 + parentid = get_int(buf, ptr);
11063 + lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr);
11065 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
11066 + lkb->lkb_lvbptr = allocate_lvb(ls);
11067 + if (!lkb->lkb_lvbptr)
11069 + get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr);
11072 + if (lkb->lkb_flags & GDLM_LKFLG_RANGE) {
11073 + uint64_t start, end;
11075 + /* Don't need to keep the range flag, for comms use only */
11076 + lkb->lkb_flags &= ~GDLM_LKFLG_RANGE;
11077 + start = get_int64(buf, ptr);
11078 + end = get_int64(buf, ptr);
11080 + lkb->lkb_range = allocate_range(rsb->res_ls);
11081 + if (!lkb->lkb_range)
11084 + switch (status) {
11085 + case GDLM_LKSTS_CONVERT:
11086 + lkb->lkb_range[RQ_RANGE_START] = start;
11087 + lkb->lkb_range[RQ_RANGE_END] = end;
11088 + start = get_int64(buf, ptr);
11089 + end = get_int64(buf, ptr);
11090 + lkb->lkb_range[GR_RANGE_START] = start;
11091 + lkb->lkb_range[GR_RANGE_END] = end;
11093 + case GDLM_LKSTS_WAITING:
11094 + lkb->lkb_range[RQ_RANGE_START] = start;
11095 + lkb->lkb_range[RQ_RANGE_END] = end;
11098 + case GDLM_LKSTS_GRANTED:
11099 + lkb->lkb_range[GR_RANGE_START] = start;
11100 + lkb->lkb_range[GR_RANGE_END] = end;
11107 + /* Resolve local lock LKB address from parent ID */
11109 + lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid,
11112 + atomic_inc(&rsb->res_ref);
11113 + lkb->lkb_resource = rsb;
11115 + lkb->lkb_flags |= GDLM_LKFLG_MSTCPY;
11116 + lkb->lkb_nodeid = rem_nodeid;
11119 + * Put the lkb on an RSB queue. An lkb that's in the midst of a
11120 + * conversion request (on the requesting node's lockqueue and has
11121 + * LQCONVERT set) should be put on the granted queue. The convert
11122 + * request will be resent by the requesting node.
11125 + if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) {
11126 + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
11127 + DLM_ASSERT(status == GDLM_LKSTS_CONVERT,
11128 + printk("status=%d\n", status););
11129 + lkb->lkb_rqmode = DLM_LOCK_IV;
11130 + status = GDLM_LKSTS_GRANTED;
11133 + lkb_enqueue(rsb, lkb, status);
11136 + * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL).
11139 + if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK)
11140 + && lkb->lkb_grmode > DLM_LOCK_NL) {
11141 + if (!rsb->res_lvbptr)
11142 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
11143 + if (!rsb->res_lvbptr)
11145 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
11149 + * Clear flags that may have been sent over that are only relevant in
11150 + * the context of the sender.
11153 + lkb->lkb_flags &= ~(GDLM_LKFLG_DELETED | GDLM_LKFLG_LQRESEND |
11154 + GDLM_LKFLG_NOREBUILD | GDLM_LKFLG_DEMOTED);
11157 + /* Return the new LKID to the caller's buffer */
11158 + put_int(lkb->lkb_id, outbuf, outoffp);
11159 + put_int(lkb->lkb_remid, outbuf, outoffp);
11166 +static struct dlm_rsb *deserialise_rsb(struct dlm_ls *ls, int nodeid,
11167 + struct dlm_rsb *rootrsb, char *buf,
11172 + int parent_remasterid;
11173 + char name[DLM_RESNAME_MAXLEN];
11175 + struct dlm_rsb *parent = NULL;
11176 + struct dlm_rsb *rsb;
11178 + get_bytes(name, &length, buf, ptr);
11179 + remasterid = get_int(buf, ptr);
11180 + parent_remasterid = get_int(buf, ptr);
11182 + if (parent_remasterid)
11183 + parent = find_by_remasterid(ls, parent_remasterid, rootrsb);
11186 + * The rsb reference from this find_or_create_rsb() will keep the rsb
11187 + * around while we add new lkb's to it from deserialise_lkb. Each of
11188 + * the lkb's will add an rsb reference. The reference added here is
11189 + * removed by release_rsb() after all lkb's are added.
11192 + error = find_or_create_rsb(ls, parent, name, length, 1, &rsb);
11193 + DLM_ASSERT(!error,);
11195 + /* There is a case where the above needs to create the RSB. */
11196 + if (rsb->res_nodeid == -1)
11197 + rsb->res_nodeid = our_nodeid();
11199 + rsb->res_remasterid = remasterid;
11205 + * Processing at the receiving end of a NEWLOCKS message from a node in
11206 + * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote
11207 + * node whose locks we are now mastering. For a reply we need to send back the
11208 + * new lockids of the remastered locks so that remote ops can find them.
11211 +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len)
11213 + struct dlm_rcom *rc;
11214 + struct dlm_rsb *rsb = NULL;
11215 + rebuild_node_t *rnode;
11217 + int outptr, ptr = 0, error = -ENOMEM;
11219 + rnode = find_rebuild_root(ls, nodeid);
11224 + * Allocate a buffer for the reply message which is a list of remote
11225 + * lock IDs and their (new) local lock ids. It will always be big
11226 + * enough to fit <n> ID pairs if it already fit <n> LKBs.
11229 + rc = allocate_rcom_buffer(ls);
11232 + outbuf = rc->rc_buf;
11236 + * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're
11237 + * created. Each deserialise_rsb adds an rsb reference that must be
11238 + * removed with release_rsb once all new lkb's for an rsb have been
11242 + while (ptr < len) {
11245 + type = get_char(buf, &ptr);
11248 + case REMASTER_ROOTRSB:
11250 + release_rsb(rsb);
11251 + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11253 + rnode->rootrsb = rsb;
11256 + case REMASTER_RSB:
11258 + release_rsb(rsb);
11259 + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
11263 + case REMASTER_LKB:
11264 + deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr,
11265 + outbuf, &outptr);
11269 + DLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d "
11270 + "len=%d\n", type, nodeid, ptr,
11276 + release_rsb(rsb);
11279 + * Reply with the new lock IDs.
11282 + rc->rc_datalen = outptr;
11283 + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0);
11285 + free_rcom_buffer(rc);
11292 + * Processing for a NEWLOCKIDS message. Called when we get the reply from the
11293 + * new master telling us what the new remote lock IDs are for the remastered
11297 +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len)
11304 + while (offset < len) {
11307 + struct dlm_lkb *lkb;
11309 + if (offset + 8 > len) {
11310 + log_error(ls, "rebuild_rsbs_lkids_recv: bad data "
11311 + "length nodeid=%d offset=%d len=%d",
11312 + nodeid, offset, len);
11316 + remote_id = get_int(buf, &offset);
11317 + local_id = get_int(buf, &offset);
11319 + lkb = find_lock_by_id(ls, local_id);
11321 + lkb->lkb_remid = remote_id;
11322 + have_new_lkid(lkb);
11324 + log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid "
11325 + "nodeid=%d id=%x remid=%x offset=%d len=%d",
11326 + nodeid, local_id, remote_id, offset, len);
11330 + if (recover_list_empty(ls))
11331 + wake_up(&ls->ls_wait_general);
11335 diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h
11336 --- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730
11337 +++ linux-patched/cluster/dlm/rebuild.h 2004-07-13 18:57:22.000000000 +0800
11339 +/******************************************************************************
11340 +*******************************************************************************
11342 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11343 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11345 +** This copyrighted material is made available to anyone wishing to use,
11346 +** modify, copy, or redistribute it subject to the terms and conditions
11347 +** of the GNU General Public License v.2.
11349 +*******************************************************************************
11350 +******************************************************************************/
11352 +#ifndef __REBUILD_DOT_H__
11353 +#define __REBUILD_DOT_H__
11355 +int rebuild_rsbs_send(struct dlm_ls *ls);
11356 +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len);
11357 +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len);
11358 +void rebuild_freemem(struct dlm_ls *ls);
11360 +#endif /* __REBUILD_DOT_H__ */
11361 diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c
11362 --- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730
11363 +++ linux-patched/cluster/dlm/reccomms.c 2004-07-13 18:57:22.000000000 +0800
11365 +/******************************************************************************
11366 +*******************************************************************************
11368 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11369 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11371 +** This copyrighted material is made available to anyone wishing to use,
11372 +** modify, copy, or redistribute it subject to the terms and conditions
11373 +** of the GNU General Public License v.2.
11375 +*******************************************************************************
11376 +******************************************************************************/
11378 +#include "dlm_internal.h"
11379 +#include "lowcomms.h"
11380 +#include "midcomms.h"
11381 +#include "reccomms.h"
11382 +#include "nodes.h"
11383 +#include "lockspace.h"
11384 +#include "recover.h"
11386 +#include "config.h"
11387 +#include "rebuild.h"
11388 +#include "memory.h"
11390 +/* Running on the basis that only a single recovery communication will be done
11391 + * at a time per lockspace */
11393 +static void rcom_process_message(struct dlm_ls * ls, uint32_t nodeid, struct dlm_rcom * rc);
11396 + * Track per-node progress/stats during recovery to help debugging.
11399 +void rcom_log(struct dlm_ls *ls, int nodeid, struct dlm_rcom *rc, int send)
11401 + struct dlm_csb *csb;
11404 + list_for_each_entry(csb, &ls->ls_nodes, list) {
11405 + if (csb->node->nodeid == nodeid) {
11414 + if (rc->rc_subcmd == RECCOMM_RECOVERNAMES) {
11416 + csb->names_send_count++;
11417 + csb->names_send_msgid = rc->rc_msgid;
11419 + csb->names_recv_count++;
11420 + csb->names_recv_msgid = rc->rc_msgid;
11422 + } else if (rc->rc_subcmd == RECCOMM_NEWLOCKS) {
11424 + csb->locks_send_count++;
11425 + csb->locks_send_msgid = rc->rc_msgid;
11427 + csb->locks_recv_count++;
11428 + csb->locks_recv_msgid = rc->rc_msgid;
11433 +void rcom_log_clear(struct dlm_ls *ls)
11435 + struct dlm_csb *csb;
11437 + list_for_each_entry(csb, &ls->ls_nodes, list) {
11438 + csb->names_send_count = 0;
11439 + csb->names_send_msgid = 0;
11440 + csb->names_recv_count = 0;
11441 + csb->names_recv_msgid = 0;
11442 + csb->locks_send_count = 0;
11443 + csb->locks_send_msgid = 0;
11444 + csb->locks_recv_count = 0;
11445 + csb->locks_recv_msgid = 0;
11449 +static int rcom_response(struct dlm_ls *ls)
11451 + return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11455 + * rcom_send_message - send or request recovery data
11456 + * @ls: the lockspace
11457 + * @nodeid: node to which the message is sent
11458 + * @type: type of recovery message
11459 + * @rc: the rc buffer to send
11460 + * @need_reply: wait for reply if this is set
11462 + * Using this interface
11463 + * i) Allocate an rc buffer:
11464 + * rc = allocate_rcom_buffer(ls);
11465 + * ii) Copy data to send beginning at rc->rc_buf:
11466 + * memcpy(rc->rc_buf, mybuf, mylen);
11467 + * iii) Set rc->rc_datalen to the number of bytes copied in (ii):
11468 + * rc->rc_datalen = mylen
11469 + * iv) Submit the rc to this function:
11470 + * rcom_send_message(rc);
11472 + * The max value of "mylen" is dlm_config.buffer_size - sizeof(struct
11473 + * dlm_rcom). If more data must be passed in one send, use
11474 + * rcom_expand_buffer() which incrementally increases the size of the rc buffer
11475 + * by dlm_config.buffer_size bytes.
11477 + * Any data returned for the message (when need_reply is set) will be saved in
11478 + * rc->rc_buf when this function returns and rc->rc_datalen will be set to the
11479 + * number of bytes copied into rc->rc_buf.
11481 + * Returns: 0 on success, -EXXX on failure
11484 +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type,
11485 + struct dlm_rcom *rc, int need_reply)
11489 + if (!rc->rc_datalen)
11490 + rc->rc_datalen = 1;
11493 + * Fill in the header.
11496 + rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE;
11497 + rc->rc_header.rh_lockspace = ls->ls_global_id;
11498 + rc->rc_header.rh_length = sizeof(struct dlm_rcom) + rc->rc_datalen - 1;
11499 + rc->rc_subcmd = type;
11500 + rc->rc_msgid = ++ls->ls_rcom_msgid;
11502 + rcom_log(ls, nodeid, rc, 1);
11505 + * When a reply is received, the reply data goes back into this buffer.
11506 + * Synchronous rcom requests (need_reply=1) are serialised because of
11507 + * the single ls_rcom.
11510 + if (need_reply) {
11511 + down(&ls->ls_rcom_lock);
11512 + ls->ls_rcom = rc;
11516 + * After sending the message we'll wait at the end of this function to
11517 + * get a reply. The READY flag will be set when the reply has been
11518 + * received and requested data has been copied into
11519 + * ls->ls_rcom->rc_buf;
11522 + DLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),);
11525 + * The WAIT bit indicates that we're waiting for and willing to accept a
11526 + * reply. Any replies are ignored unless this bit is set.
11529 + set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11532 + * Process the message locally.
11535 + if (nodeid == our_nodeid()) {
11536 + rcom_process_message(ls, nodeid, rc);
11541 + * Send the message.
11544 + log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid);
11546 + error = midcomms_send_message(nodeid, (struct dlm_header *) rc,
11548 + DLM_ASSERT(error >= 0, printk("error = %d\n", error););
11552 + * Wait for a reply. Once a reply is processed from midcomms, the
11553 + * READY bit will be set and we'll be awoken (dlm_wait_function will
11557 + if (need_reply) {
11558 + error = dlm_wait_function(ls, &rcom_response);
11560 + log_debug(ls, "rcom wait error %d", error);
11564 + clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
11565 + clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11568 + up(&ls->ls_rcom_lock);
11574 + * Runs in same context as midcomms.
11577 +static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc)
11579 + struct dlm_rcom rc_stack;
11580 + struct dlm_rcom *reply = NULL;
11581 + int status, datalen, maxlen;
11582 + uint32_t r_nodeid, be_nodeid;
11587 + rcom_log(ls, nodeid, rc, 0);
11589 + if (dlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) {
11590 + log_error(ls, "ignoring recovery message %x from %u",
11591 + rc->rc_subcmd, nodeid);
11595 + switch (rc->rc_subcmd) {
11597 + case RECCOMM_STATUS:
11599 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
11600 + reply = &rc_stack;
11602 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11603 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11604 + reply->rc_subcmd = rc->rc_subcmd;
11605 + reply->rc_msgid = rc->rc_msgid;
11606 + reply->rc_buf[0] = 0;
11608 + if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags))
11609 + reply->rc_buf[0] |= RESDIR_VALID;
11611 + if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags))
11612 + reply->rc_buf[0] |= RESDIR_ALL_VALID;
11614 + if (test_bit(LSFL_NODES_VALID, &ls->ls_flags))
11615 + reply->rc_buf[0] |= NODES_VALID;
11617 + if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags))
11618 + reply->rc_buf[0] |= NODES_ALL_VALID;
11620 + reply->rc_datalen = 1;
11621 + reply->rc_header.rh_length =
11622 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
11624 + log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid);
11627 + case RECCOMM_RECOVERNAMES:
11629 + reply = allocate_rcom_buffer(ls);
11630 + DLM_ASSERT(reply,);
11631 + maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
11633 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11634 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11635 + reply->rc_subcmd = rc->rc_subcmd;
11636 + reply->rc_msgid = rc->rc_msgid;
11639 + * The other node wants a bunch of resource names. The name of
11640 + * the resource to begin with is in rc->rc_buf.
11643 + datalen = dlm_dir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen,
11644 + reply->rc_buf, maxlen, nodeid);
11646 + reply->rc_datalen = datalen;
11647 + reply->rc_header.rh_length =
11648 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
11650 + log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid,
11651 + reply->rc_msgid);
11654 + case RECCOMM_GETMASTER:
11656 + reply = allocate_rcom_buffer(ls);
11657 + DLM_ASSERT(reply,);
11659 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11660 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11661 + reply->rc_subcmd = rc->rc_subcmd;
11662 + reply->rc_msgid = rc->rc_msgid;
11665 + * The other node wants to know the master of a named resource.
11668 + status = dlm_dir_lookup_recovery(ls, nodeid, rc->rc_buf,
11669 + rc->rc_datalen, &r_nodeid);
11670 + if (status != 0) {
11671 + free_rcom_buffer(reply);
11675 + be_nodeid = cpu_to_be32(r_nodeid);
11676 + memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t));
11677 + reply->rc_datalen = sizeof(uint32_t);
11678 + reply->rc_header.rh_length =
11679 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
11682 + case RECCOMM_BULKLOOKUP:
11684 + reply = allocate_rcom_buffer(ls);
11685 + DLM_ASSERT(reply,);
11687 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11688 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11689 + reply->rc_subcmd = rc->rc_subcmd;
11690 + reply->rc_msgid = rc->rc_msgid;
11693 + * This is a bulk version of the above and just returns a
11694 + * buffer full of node ids to match the resources
11697 + datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf,
11698 + rc->rc_datalen, reply->rc_buf);
11699 + if (datalen < 0) {
11700 + free_rcom_buffer(reply);
11705 + reply->rc_datalen = datalen;
11706 + reply->rc_header.rh_length =
11707 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
11711 + * These RECCOMM messages don't need replies.
11714 + case RECCOMM_NEWLOCKS:
11715 + rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11718 + case RECCOMM_NEWLOCKIDS:
11719 + rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11722 + case RECCOMM_REMRESDATA:
11723 + remove_resdata(ls, nodeid, rc->rc_buf, rc->rc_datalen);
11727 + DLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd););
11731 + if (nodeid == our_nodeid()) {
11732 + DLM_ASSERT(rc == ls->ls_rcom,);
11733 + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11734 + rc->rc_datalen = reply->rc_datalen;
11736 + midcomms_send_message(nodeid,
11737 + (struct dlm_header *) reply,
11741 + if (reply != &rc_stack)
11742 + free_rcom_buffer(reply);
11746 +static void process_reply_sync(struct dlm_ls *ls, uint32_t nodeid,
11747 + struct dlm_rcom *reply)
11749 + struct dlm_rcom *rc = ls->ls_rcom;
11751 + if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) {
11752 + log_error(ls, "unexpected rcom reply nodeid=%u", nodeid);
11756 + if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) {
11757 + log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u",
11758 + reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid);
11762 + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
11763 + rc->rc_datalen = reply->rc_datalen;
11766 + * Tell the thread waiting in rcom_send_message() that it can go ahead.
11769 + set_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
11770 + wake_up(&ls->ls_wait_general);
11773 +static void process_reply_async(struct dlm_ls *ls, uint32_t nodeid,
11774 + struct dlm_rcom *reply)
11776 + restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen,
11777 + reply->rc_msgid);
11781 + * Runs in same context as midcomms.
11784 +static void rcom_process_reply(struct dlm_ls *ls, uint32_t nodeid,
11785 + struct dlm_rcom *reply)
11787 + if (dlm_recovery_stopped(ls)) {
11788 + log_error(ls, "ignoring recovery reply %x from %u",
11789 + reply->rc_subcmd, nodeid);
11793 + switch (reply->rc_subcmd) {
11794 + case RECCOMM_GETMASTER:
11795 + process_reply_async(ls, nodeid, reply);
11797 + case RECCOMM_STATUS:
11798 + case RECCOMM_NEWLOCKS:
11799 + case RECCOMM_NEWLOCKIDS:
11800 + case RECCOMM_RECOVERNAMES:
11801 + process_reply_sync(ls, nodeid, reply);
11804 + log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u",
11805 + reply->rc_subcmd, nodeid);
11810 +static int send_ls_not_ready(uint32_t nodeid, struct dlm_header *header)
11812 + struct writequeue_entry *wq;
11813 + struct dlm_rcom *rc = (struct dlm_rcom *) header;
11814 + struct dlm_rcom *reply;
11816 + wq = lowcomms_get_buffer(nodeid, sizeof(struct dlm_rcom), GFP_KERNEL,
11817 + (char **)&reply);
11821 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
11822 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
11823 + reply->rc_subcmd = rc->rc_subcmd;
11824 + reply->rc_msgid = rc->rc_msgid;
11825 + reply->rc_buf[0] = 0;
11827 + reply->rc_datalen = 1;
11828 + reply->rc_header.rh_length = sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
11830 + midcomms_send_buffer((struct dlm_header *)reply, wq);
11836 + * Runs in same context as midcomms. Both recovery requests and recovery
11837 + * replies come through this function.
11840 +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header)
11842 + struct dlm_ls *ls = find_lockspace_by_global_id(header->rh_lockspace);
11843 + struct dlm_rcom *rc = (struct dlm_rcom *) header;
11845 + /* If the lockspace doesn't exist then still send a status message
11846 + back, it's possible that it just doesn't have its global_id
11849 + send_ls_not_ready(nodeid, header);
11853 + switch (header->rh_cmd) {
11854 + case GDLM_REMCMD_RECOVERMESSAGE:
11855 + down_read(&ls->ls_rec_rsblist);
11856 + rcom_process_message(ls, nodeid, rc);
11857 + up_read(&ls->ls_rec_rsblist);
11860 + case GDLM_REMCMD_RECOVERREPLY:
11861 + rcom_process_reply(ls, nodeid, rc);
11865 + DLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd););
11869 diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h
11870 --- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730
11871 +++ linux-patched/cluster/dlm/reccomms.h 2004-07-13 18:57:22.000000000 +0800
11873 +/******************************************************************************
11874 +*******************************************************************************
11876 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11877 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11879 +** This copyrighted material is made available to anyone wishing to use,
11880 +** modify, copy, or redistribute it subject to the terms and conditions
11881 +** of the GNU General Public License v.2.
11883 +*******************************************************************************
11884 +******************************************************************************/
11886 +#ifndef __RECCOMMS_DOT_H__
11887 +#define __RECCOMMS_DOT_H__
11891 +#define RESDIR_VALID (1)
11892 +#define RESDIR_ALL_VALID (2)
11893 +#define NODES_VALID (4)
11894 +#define NODES_ALL_VALID (8)
11896 +#define RECCOMM_STATUS (1)
11897 +#define RECCOMM_RECOVERNAMES (2)
11898 +#define RECCOMM_GETMASTER (3)
11899 +#define RECCOMM_BULKLOOKUP (4)
11900 +#define RECCOMM_NEWLOCKS (5)
11901 +#define RECCOMM_NEWLOCKIDS (6)
11902 +#define RECCOMM_REMRESDATA (7)
11904 +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type,
11905 + struct dlm_rcom *rc, int need_reply);
11906 +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header);
11907 +void rcom_log_clear(struct dlm_ls *ls);
11910 diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c
11911 --- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730
11912 +++ linux-patched/cluster/dlm/recover.c 2004-07-13 18:57:22.000000000 +0800
11914 +/******************************************************************************
11915 +*******************************************************************************
11917 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
11918 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
11920 +** This copyrighted material is made available to anyone wishing to use,
11921 +** modify, copy, or redistribute it subject to the terms and conditions
11922 +** of the GNU General Public License v.2.
11924 +*******************************************************************************
11925 +******************************************************************************/
11927 +#include "dlm_internal.h"
11928 +#include "reccomms.h"
11930 +#include "locking.h"
11932 +#include "lockspace.h"
11934 +#include "nodes.h"
11935 +#include "config.h"
11937 +#include "memory.h"
11940 + * Called in recovery routines to check whether the recovery process has been
11941 + * interrupted/stopped by another transition. A recovery in-process will abort
11942 + * if the lockspace is "stopped" so that a new recovery process can start from
11943 + * the beginning when the lockspace is "started" again.
11946 +int dlm_recovery_stopped(struct dlm_ls *ls)
11948 + return test_bit(LSFL_LS_STOP, &ls->ls_flags);
11951 +static void dlm_wait_timer_fn(unsigned long data)
11953 + struct dlm_ls *ls = (struct dlm_ls *) data;
11955 + wake_up(&ls->ls_wait_general);
11959 + * Wait until given function returns non-zero or lockspace is stopped (LS_STOP
11960 + * set due to failure of a node in ls_nodes). When another function thinks it
11961 + * could have completed the waited-on task, they should wake up ls_wait_general
11962 + * to get an immediate response rather than waiting for the timer to detect the
11963 + * result. A timer wakes us up periodically while waiting to see if we should
11964 + * abort due to a node failure.
11967 +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls * ls))
11969 + struct timer_list timer;
11972 + init_timer(&timer);
11973 + timer.function = dlm_wait_timer_fn;
11974 + timer.data = (long) ls;
11977 + mod_timer(&timer, jiffies + (5 * HZ));
11979 + wchan_cond_sleep_intr(ls->ls_wait_general,
11981 + !test_bit(LSFL_LS_STOP, &ls->ls_flags));
11983 + if (timer_pending(&timer))
11984 + del_timer(&timer);
11989 + if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) {
11998 +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status)
12000 + struct dlm_rcom rc_stack, *rc;
12001 + struct dlm_csb *csb;
12005 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12007 + rc->rc_datalen = 0;
12009 + list_for_each_entry(csb, &ls->ls_nodes, list) {
12011 + error = dlm_recovery_stopped(ls);
12015 + error = rcom_send_message(ls, csb->node->nodeid,
12016 + RECCOMM_STATUS, rc, 1);
12020 + status = rc->rc_buf[0];
12021 + if (status & wait_status)
12024 + set_current_state(TASK_INTERRUPTIBLE);
12025 + schedule_timeout(HZ >> 1);
12034 +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status)
12036 + struct dlm_rcom rc_stack, *rc;
12037 + uint32_t nodeid = ls->ls_low_nodeid;
12041 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12043 + rc->rc_datalen = 0;
12046 + error = dlm_recovery_stopped(ls);
12050 + error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
12054 + status = rc->rc_buf[0];
12055 + if (status & wait_status)
12058 + set_current_state(TASK_INTERRUPTIBLE);
12059 + schedule_timeout(HZ >> 1);
12067 +static int purge_queue(struct dlm_ls *ls, struct list_head *queue)
12069 + struct dlm_lkb *lkb, *safe;
12070 + struct dlm_rsb *rsb;
12073 + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
12074 + if (!lkb->lkb_nodeid)
12077 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,);
12079 + if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
12080 + list_del(&lkb->lkb_statequeue);
12082 + rsb = lkb->lkb_resource;
12083 + lkb->lkb_status = 0;
12085 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
12086 + && &lkb->lkb_duetime)
12087 + remove_from_deadlockqueue(lkb);
12089 + release_lkb(ls, lkb);
12090 + release_rsb(rsb);
12099 + * Go through local restbl and for each rsb we're master of, clear out any
12100 + * lkb's held by departed nodes.
12103 +int restbl_lkb_purge(struct dlm_ls *ls)
12105 + struct list_head *tmp2, *safe2;
12107 + struct dlm_rsb *rootrsb, *safe, *rsb;
12109 + log_all(ls, "purge locks of departed nodes");
12111 + list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) {
12113 + if (rootrsb->res_nodeid)
12116 + hold_rsb(rootrsb);
12117 + down_write(&rootrsb->res_lock);
12119 + /* This traverses the subreslist in reverse order so we purge
12120 + * the children before their parents. */
12122 + for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev;
12123 + tmp2 != &rootrsb->res_subreslist;
12124 + tmp2 = safe2, safe2 = safe2->prev) {
12125 + rsb = list_entry(tmp2, struct dlm_rsb, res_subreslist);
12128 + purge_queue(ls, &rsb->res_grantqueue);
12129 + purge_queue(ls, &rsb->res_convertqueue);
12130 + purge_queue(ls, &rsb->res_waitqueue);
12131 + release_rsb(rsb);
12133 + count += purge_queue(ls, &rootrsb->res_grantqueue);
12134 + count += purge_queue(ls, &rootrsb->res_convertqueue);
12135 + count += purge_queue(ls, &rootrsb->res_waitqueue);
12137 + up_write(&rootrsb->res_lock);
12138 + release_rsb(rootrsb);
12141 + log_all(ls, "purged %d locks", count);
12147 + * Grant any locks that have become grantable after a purge
12150 +int restbl_grant_after_purge(struct dlm_ls *ls)
12152 + struct dlm_rsb *root, *rsb, *safe;
12155 + down_write(&ls->ls_gap_rsblist);
12157 + list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) {
12158 + /* only the rsb master grants locks */
12159 + if (root->res_nodeid)
12162 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
12163 + log_debug(ls, "restbl_grant_after_purge aborted");
12165 + up_write(&ls->ls_gap_rsblist);
12169 + down_write(&root->res_lock);
12170 + grant_pending_locks(root);
12171 + up_write(&root->res_lock);
12173 + list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){
12174 + down_write(&rsb->res_lock);
12175 + grant_pending_locks(rsb);
12176 + up_write(&rsb->res_lock);
12179 + up_write(&ls->ls_gap_rsblist);
12186 + * Set the lock master for all LKBs in a lock queue
12189 +static void set_lock_master(struct list_head *queue, int nodeid)
12191 + struct dlm_lkb *lkb;
12193 + list_for_each_entry(lkb, queue, lkb_statequeue) {
12194 + /* Don't muck around with pre-existing sublocks */
12195 + if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY))
12196 + lkb->lkb_nodeid = nodeid;
12200 +static void set_master_lkbs(struct dlm_rsb *rsb)
12202 + set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid);
12203 + set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid);
12204 + set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid);
12208 + * This rsb struct is now the master so it is responsible for keeping the
12209 + * latest rsb. Find if any current lkb's have an up to date copy of the lvb to
12210 + * be used as the rsb copy. An equivalent step occurs as new lkb's arrive for
12211 + * this rsb in deserialise_lkb.
12214 +static void set_rsb_lvb(struct dlm_rsb *rsb)
12216 + struct dlm_lkb *lkb;
12218 + list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) {
12220 + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12221 + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12222 + (lkb->lkb_grmode > DLM_LOCK_NL))
12224 + if (!rsb->res_lvbptr)
12225 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12227 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12232 + list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) {
12234 + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12235 + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12236 + (lkb->lkb_grmode > DLM_LOCK_NL))
12238 + if (!rsb->res_lvbptr)
12239 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12241 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12248 + * Propagate the new master nodeid to locks, subrsbs, sublocks.
12249 + * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider.
12252 +static void set_new_master(struct dlm_rsb *rsb)
12254 + struct dlm_rsb *subrsb;
12256 + down_write(&rsb->res_lock);
12258 + if (rsb->res_nodeid == our_nodeid()) {
12259 + rsb->res_nodeid = 0;
12260 + set_rsb_lvb(rsb);
12263 + set_master_lkbs(rsb);
12265 + list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) {
12266 + subrsb->res_nodeid = rsb->res_nodeid;
12267 + set_master_lkbs(subrsb);
12270 + up_write(&rsb->res_lock);
12272 + set_bit(RESFL_NEW_MASTER, &rsb->res_flags);
12276 + * The recover_list contains all the rsb's for which we've requested the new
12277 + * master nodeid. As replies are returned from the resource directories the
12278 + * rsb's are removed from the list. When the list is empty we're done.
12280 + * The recover_list is later similarly used for all rsb's for which we've sent
12281 + * new lkb's and need to receive new corresponding lkid's.
12284 +int recover_list_empty(struct dlm_ls *ls)
12288 + spin_lock(&ls->ls_recover_list_lock);
12289 + empty = list_empty(&ls->ls_recover_list);
12290 + spin_unlock(&ls->ls_recover_list_lock);
12295 +int recover_list_count(struct dlm_ls *ls)
12299 + spin_lock(&ls->ls_recover_list_lock);
12300 + count = ls->ls_recover_list_count;
12301 + spin_unlock(&ls->ls_recover_list_lock);
12306 +void recover_list_add(struct dlm_rsb *rsb)
12308 + struct dlm_ls *ls = rsb->res_ls;
12310 + spin_lock(&ls->ls_recover_list_lock);
12311 + if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) {
12312 + list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list);
12313 + ls->ls_recover_list_count++;
12316 + spin_unlock(&ls->ls_recover_list_lock);
12319 +void recover_list_del(struct dlm_rsb *rsb)
12321 + struct dlm_ls *ls = rsb->res_ls;
12323 + spin_lock(&ls->ls_recover_list_lock);
12324 + clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags);
12325 + list_del(&rsb->res_recover_list);
12326 + ls->ls_recover_list_count--;
12327 + spin_unlock(&ls->ls_recover_list_lock);
12329 + release_rsb(rsb);
12332 +static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, int msgid)
12334 + struct dlm_rsb *rsb = NULL;
12336 + spin_lock(&ls->ls_recover_list_lock);
12338 + list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) {
12339 + if (rsb->res_recover_msgid == msgid)
12345 + spin_unlock(&ls->ls_recover_list_lock);
12350 +static void recover_list_clear(struct dlm_ls *ls)
12352 + struct dlm_rsb *rsb;
12355 + spin_lock(&ls->ls_recover_list_lock);
12357 + while (!list_empty(&ls->ls_recover_list)) {
12358 + rsb = list_entry(ls->ls_recover_list.next, struct dlm_rsb,
12359 + res_recover_list);
12360 + list_del(&rsb->res_recover_list);
12361 + ls->ls_recover_list_count--;
12363 + spin_unlock(&ls->ls_recover_list_lock);
12368 +static int rsb_master_lookup(struct dlm_rsb *rsb, struct dlm_rcom *rc)
12370 + struct dlm_ls *ls = rsb->res_ls;
12371 + uint32_t dir_nodeid, r_nodeid;
12374 + dir_nodeid = get_directory_nodeid(rsb);
12376 + if (dir_nodeid == our_nodeid()) {
12377 + error = dlm_dir_lookup_recovery(ls, dir_nodeid, rsb->res_name,
12378 + rsb->res_length, &r_nodeid);
12382 + rsb->res_nodeid = r_nodeid;
12383 + set_new_master(rsb);
12385 + /* As we are the only thread doing recovery this
12386 + should be safe. if not then we need to use a different
12387 + ID somehow. We must set it in the RSB before rcom_send_msg
12388 + completes cos we may get a reply quite quickly.
12390 + rsb->res_recover_msgid = ls->ls_rcom_msgid + 1;
12392 + recover_list_add(rsb);
12394 + memcpy(rc->rc_buf, rsb->res_name, rsb->res_length);
12395 + rc->rc_datalen = rsb->res_length;
12397 + error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER,
12408 + * Go through local root resources and for each rsb which has a master which
12409 + * has departed, get the new master nodeid from the resdir. The resdir will
12410 + * assign mastery to the first node to look up the new master. That means
12411 + * we'll discover in this lookup if we're the new master of any rsb's.
12413 + * We fire off all the resdir requests individually and asynchronously to the
12414 + * correct resdir node. The replies are processed in rsb_master_recv().
12417 +int restbl_rsb_update(struct dlm_ls *ls)
12419 + struct dlm_rsb *rsb, *safe;
12420 + struct dlm_rcom *rc;
12421 + int error = -ENOMEM;
12424 + log_all(ls, "update remastered resources");
12426 + rc = allocate_rcom_buffer(ls);
12430 + list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) {
12431 + if (!rsb->res_nodeid)
12434 + error = dlm_recovery_stopped(ls);
12438 + if (in_nodes_gone(ls, rsb->res_nodeid)) {
12439 + error = rsb_master_lookup(rsb, rc);
12446 + error = dlm_wait_function(ls, &recover_list_empty);
12448 + log_all(ls, "updated %d resources", count);
12451 + free_rcom_buffer(rc);
12457 +int restbl_rsb_update_recv(struct dlm_ls *ls, uint32_t nodeid, char *buf,
12458 + int length, int msgid)
12460 + struct dlm_rsb *rsb;
12461 + uint32_t be_nodeid;
12463 + rsb = recover_list_find(ls, msgid);
12465 + log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid);
12469 + memcpy(&be_nodeid, buf, sizeof(uint32_t));
12470 + rsb->res_nodeid = be32_to_cpu(be_nodeid);
12471 + set_new_master(rsb);
12472 + recover_list_del(rsb);
12474 + if (recover_list_empty(ls))
12475 + wake_up(&ls->ls_wait_general);
12482 + * This function not used any longer.
12485 +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen,
12488 + char *inbufptr, *outbufptr;
12491 + * The other node wants nodeids matching the resource names in inbuf.
12492 + * The resource names are packed into inbuf as
12493 + * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is
12494 + * lenX bytes. Matching nodeids are packed into outbuf in order
12495 + * [nodeid1][nodeid2]...
12498 + inbufptr = inbuf;
12499 + outbufptr = outbuf;
12501 + while (inbufptr < inbuf + inlen) {
12502 + uint32_t r_nodeid, be_nodeid;
12505 + status = dlm_dir_lookup_recovery(ls, nodeid, inbufptr + 1,
12506 + *inbufptr, &r_nodeid);
12510 + inbufptr += *inbufptr + 1;
12512 + be_nodeid = cpu_to_be32(r_nodeid);
12513 + memcpy(outbufptr, &be_nodeid, sizeof(uint32_t));
12514 + outbufptr += sizeof(uint32_t);
12516 + /* add assertion that outbufptr - outbuf is not > than ... */
12519 + return (outbufptr - outbuf);
12524 diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h
12525 --- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730
12526 +++ linux-patched/cluster/dlm/recover.h 2004-07-13 18:57:22.000000000 +0800
12528 +/******************************************************************************
12529 +*******************************************************************************
12531 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12532 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12534 +** This copyrighted material is made available to anyone wishing to use,
12535 +** modify, copy, or redistribute it subject to the terms and conditions
12536 +** of the GNU General Public License v.2.
12538 +*******************************************************************************
12539 +******************************************************************************/
12541 +#ifndef __RECOVER_DOT_H__
12542 +#define __RECOVER_DOT_H__
12544 +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls * ls));
12545 +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status);
12546 +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status);
12547 +int dlm_recovery_stopped(struct dlm_ls *ls);
12548 +int recover_list_empty(struct dlm_ls *ls);
12549 +int recover_list_count(struct dlm_ls *ls);
12550 +void recover_list_add(struct dlm_rsb *rsb);
12551 +void recover_list_del(struct dlm_rsb *rsb);
12552 +int restbl_lkb_purge(struct dlm_ls *ls);
12553 +void restbl_grant_after_purge(struct dlm_ls *ls);
12554 +int restbl_rsb_update(struct dlm_ls *ls);
12555 +int restbl_rsb_update_recv(struct dlm_ls *ls, int nodeid, char *buf, int len,
12557 +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen,
12560 +#endif /* __RECOVER_DOT_H__ */
12561 diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c
12562 --- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730
12563 +++ linux-patched/cluster/dlm/recoverd.c 2004-07-13 18:57:22.000000000 +0800
12565 +/******************************************************************************
12566 +*******************************************************************************
12568 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12569 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12571 +** This copyrighted material is made available to anyone wishing to use,
12572 +** modify, copy, or redistribute it subject to the terms and conditions
12573 +** of the GNU General Public License v.2.
12575 +*******************************************************************************
12576 +******************************************************************************/
12578 +#include "dlm_internal.h"
12579 +#include "nodes.h"
12582 +#include "recover.h"
12583 +#include "lockspace.h"
12584 +#include "lowcomms.h"
12585 +#include "lockqueue.h"
12587 +#include "rebuild.h"
12590 + * next_move actions
12593 +#define DO_STOP (1)
12594 +#define DO_START (2)
12595 +#define DO_FINISH (3)
12596 +#define DO_FINISH_STOP (4)
12597 +#define DO_FINISH_START (5)
12600 + * recoverd_flags for thread
12603 +#define THREAD_STOP (0)
12606 + * local thread variables
12609 +static unsigned long recoverd_flags;
12610 +static struct completion recoverd_run;
12611 +static wait_queue_head_t recoverd_wait;
12612 +static struct task_struct *recoverd_task;
12615 + * Queue of lockspaces (dlm_recover structs) which need to be
12616 + * started/recovered
12619 +static struct list_head recoverd_start_queue;
12620 +static atomic_t recoverd_start_count;
12622 +extern struct list_head lslist;
12623 +extern spinlock_t lslist_lock;
12625 +void dlm_recoverd_init(void)
12627 + INIT_LIST_HEAD(&recoverd_start_queue);
12628 + atomic_set(&recoverd_start_count, 0);
12630 + init_completion(&recoverd_run);
12631 + init_waitqueue_head(&recoverd_wait);
12632 + memset(&recoverd_flags, 0, sizeof(unsigned long));
12635 +static int enable_locking(struct dlm_ls *ls, int event_id)
12639 + spin_lock(&ls->ls_recover_lock);
12640 + if (ls->ls_last_stop < event_id) {
12641 + set_bit(LSFL_LS_RUN, &ls->ls_flags);
12642 + up_write(&ls->ls_in_recovery);
12645 + log_debug(ls, "enable_locking: abort %d", event_id);
12647 + spin_unlock(&ls->ls_recover_lock);
12651 +static int ls_first_start(struct dlm_ls *ls, struct dlm_recover *rv)
12655 + log_all(ls, "recover event %u (first)", rv->event_id);
12657 + kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id);
12659 + error = ls_nodes_init(ls, rv);
12661 + log_error(ls, "nodes_init failed %d", error);
12665 + error = dlm_dir_rebuild_local(ls);
12667 + log_error(ls, "dlm_dir_rebuild_local failed %d", error);
12671 + error = dlm_dir_rebuild_wait(ls);
12673 + log_error(ls, "dlm_dir_rebuild_wait failed %d", error);
12677 + log_all(ls, "recover event %u done", rv->event_id);
12678 + kcl_start_done(ls->ls_local_id, rv->event_id);
12685 + * We are given here a new group of nodes which are in the lockspace. We first
12686 + * figure out the differences in ls membership from when we were last running.
12687 + * If nodes from before are gone, then there will be some lock recovery to do.
12688 + * If there are only nodes which have joined, then there's no lock recovery.
12690 + * note: cman requires an rc to finish starting on an revent (where nodes die)
12691 + * before it allows an sevent (where nodes join) to be processed. This means
12692 + * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA
12696 +static int ls_reconfig(struct dlm_ls *ls, struct dlm_recover *rv)
12698 + int error, neg = 0;
12700 + log_all(ls, "recover event %u", rv->event_id);
12703 + * Add or remove nodes from the lockspace's ls_nodes list.
12706 + error = ls_nodes_reconfig(ls, rv, &neg);
12708 + log_error(ls, "nodes_reconfig failed %d", error);
12713 + * Rebuild our own share of the resdir by collecting from all other
12714 + * nodes rsb name/master pairs for which the name hashes to us.
12717 + error = dlm_dir_rebuild_local(ls);
12719 + log_error(ls, "dlm_dir_rebuild_local failed %d", error);
12724 + * Purge resdir-related requests that are being held in requestqueue.
12725 + * All resdir requests from before recovery started are invalid now due
12726 + * to the resdir rebuild and will be resent by the requesting nodes.
12729 + purge_requestqueue(ls);
12730 + set_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12733 + * Wait for all nodes to complete resdir rebuild.
12736 + error = dlm_dir_rebuild_wait(ls);
12738 + log_error(ls, "dlm_dir_rebuild_wait failed %d", error);
12743 + * Mark our own lkb's waiting in the lockqueue for remote replies from
12744 + * nodes that are now departed. These will be resent to the new
12745 + * masters in resend_cluster_requests. Also mark resdir lookup
12746 + * requests for resending.
12749 + lockqueue_lkb_mark(ls);
12751 + error = dlm_recovery_stopped(ls);
12757 + * Clear lkb's for departed nodes. This can't fail since it
12758 + * doesn't involve communicating with other nodes.
12761 + down_write(&ls->ls_rec_rsblist);
12762 + restbl_lkb_purge(ls);
12763 + up_write(&ls->ls_rec_rsblist);
12765 + down_read(&ls->ls_rec_rsblist);
12768 + * Get new master id's for rsb's of departed nodes. This fails
12769 + * if we can't communicate with other nodes.
12772 + error = restbl_rsb_update(ls);
12774 + log_error(ls, "restbl_rsb_update failed %d", error);
12779 + * Send our lkb info to new masters. This fails if we can't
12780 + * communicate with a node.
12783 + error = rebuild_rsbs_send(ls);
12785 + log_error(ls, "rebuild_rsbs_send failed %d", error);
12788 + up_read(&ls->ls_rec_rsblist);
12791 + clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
12793 + log_all(ls, "recover event %u done", rv->event_id);
12794 + kcl_start_done(ls->ls_local_id, rv->event_id);
12798 + up_read(&ls->ls_rec_rsblist);
12800 + log_all(ls, "recover event %d error %d", rv->event_id, error);
12804 +static void clear_finished_nodes(struct dlm_ls *ls, int finish_event)
12806 + struct dlm_csb *csb, *safe;
12808 + list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, list) {
12809 + if (csb->gone_event <= finish_event) {
12810 + list_del(&csb->list);
12811 + release_csb(csb);
12817 + * Between calls to this routine for a ls, there can be multiple stop/start
12818 + * events from cman where every start but the latest is cancelled by stops.
12819 + * There can only be a single finish from cman because every finish requires us
12820 + * to call start_done. A single finish event could be followed by multiple
12821 + * stop/start events. This routine takes any combination of events from cman
12822 + * and boils them down to one course of action.
12825 +static int next_move(struct dlm_ls *ls, struct dlm_recover **rv_out,
12828 + LIST_HEAD(events);
12829 + unsigned int cmd = 0, stop, start, finish;
12830 + unsigned int last_stop, last_start, last_finish;
12831 + struct dlm_recover *rv = NULL, *start_rv = NULL;
12834 + * Grab the current state of cman/sm events.
12837 + spin_lock(&ls->ls_recover_lock);
12839 + stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0;
12840 + start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0;
12841 + finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0;
12843 + last_stop = ls->ls_last_stop;
12844 + last_start = ls->ls_last_start;
12845 + last_finish = ls->ls_last_finish;
12847 + while (!list_empty(&ls->ls_recover)) {
12848 + rv = list_entry(ls->ls_recover.next, struct dlm_recover, list);
12849 + list_del(&rv->list);
12850 + list_add_tail(&rv->list, &events);
12852 + spin_unlock(&ls->ls_recover_lock);
12854 + log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish,
12855 + last_stop, last_start, last_finish);
12858 + * Toss start events which have since been cancelled.
12861 + while (!list_empty(&events)) {
12862 + DLM_ASSERT(start,);
12863 + rv = list_entry(events.next, struct dlm_recover, list);
12864 + list_del(&rv->list);
12866 + if (rv->event_id <= last_stop) {
12867 + log_debug(ls, "move skip event %u", rv->event_id);
12868 + kfree(rv->nodeids);
12872 + log_debug(ls, "move use event %u", rv->event_id);
12873 + DLM_ASSERT(!start_rv,);
12879 + * Eight possible combinations of events.
12883 + if (!stop && !start && !finish) {
12884 + DLM_ASSERT(!start_rv,);
12890 + if (!stop && !start && finish) {
12891 + DLM_ASSERT(!start_rv,);
12892 + DLM_ASSERT(last_start > last_stop,);
12893 + DLM_ASSERT(last_finish == last_start,);
12895 + *finish_out = last_finish;
12900 + if (!stop && start && !finish) {
12901 + DLM_ASSERT(start_rv,);
12902 + DLM_ASSERT(last_start > last_stop,);
12904 + *rv_out = start_rv;
12909 + if (!stop && start && finish) {
12910 + DLM_ASSERT(0, printk("finish and start with no stop\n"););
12914 + if (stop && !start && !finish) {
12915 + DLM_ASSERT(!start_rv,);
12916 + DLM_ASSERT(last_start == last_stop,);
12922 + if (stop && !start && finish) {
12923 + DLM_ASSERT(!start_rv,);
12924 + DLM_ASSERT(last_finish == last_start,);
12925 + DLM_ASSERT(last_stop == last_start,);
12926 + cmd = DO_FINISH_STOP;
12927 + *finish_out = last_finish;
12932 + if (stop && start && !finish) {
12934 + DLM_ASSERT(last_start > last_stop,);
12936 + *rv_out = start_rv;
12938 + DLM_ASSERT(last_stop == last_start,);
12945 + if (stop && start && finish) {
12947 + DLM_ASSERT(last_start > last_stop,);
12948 + DLM_ASSERT(last_start > last_finish,);
12949 + cmd = DO_FINISH_START;
12950 + *finish_out = last_finish;
12951 + *rv_out = start_rv;
12953 + DLM_ASSERT(last_start == last_stop,);
12954 + DLM_ASSERT(last_start > last_finish,);
12955 + cmd = DO_FINISH_STOP;
12956 + *finish_out = last_finish;
12966 + * This function decides what to do given every combination of current
12967 + * lockspace state and next lockspace state.
12970 +static void do_ls_recovery(struct dlm_ls *ls)
12972 + struct dlm_recover *rv = NULL;
12973 + int error, cur_state, next_state = 0, do_now, finish_event = 0;
12975 + do_now = next_move(ls, &rv, &finish_event);
12979 + cur_state = ls->ls_state;
12982 + DLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags),
12983 + log_error(ls, "curstate=%d donow=%d", cur_state, do_now););
12986 + * LSST_CLEAR - we're not in any recovery state. We can get a stop or
12987 + * a stop and start which equates with a START.
12990 + if (cur_state == LSST_CLEAR) {
12991 + switch (do_now) {
12993 + next_state = LSST_WAIT_START;
12997 + error = ls_reconfig(ls, rv);
12999 + next_state = LSST_WAIT_START;
13001 + next_state = LSST_RECONFIG_DONE;
13004 + case DO_FINISH: /* invalid */
13005 + case DO_FINISH_STOP: /* invalid */
13006 + case DO_FINISH_START: /* invalid */
13014 + * LSST_WAIT_START - we're not running because of getting a stop or
13015 + * failing a start. We wait in this state for another stop/start or
13016 + * just the next start to begin another reconfig attempt.
13019 + if (cur_state == LSST_WAIT_START) {
13020 + switch (do_now) {
13025 + error = ls_reconfig(ls, rv);
13027 + next_state = LSST_WAIT_START;
13029 + next_state = LSST_RECONFIG_DONE;
13032 + case DO_FINISH: /* invalid */
13033 + case DO_FINISH_STOP: /* invalid */
13034 + case DO_FINISH_START: /* invalid */
13042 + * LSST_RECONFIG_DONE - we entered this state after successfully
13043 + * completing ls_reconfig and calling kcl_start_done. We expect to get
13044 + * a finish if everything goes ok. A finish could be followed by stop
13045 + * or stop/start before we get here to check it. Or a finish may never
13046 + * happen, only stop or stop/start.
13049 + if (cur_state == LSST_RECONFIG_DONE) {
13050 + switch (do_now) {
13052 + clear_finished_nodes(ls, finish_event);
13053 + next_state = LSST_CLEAR;
13055 + error = enable_locking(ls, finish_event);
13059 + error = process_requestqueue(ls);
13063 + error = resend_cluster_requests(ls);
13067 + restbl_grant_after_purge(ls);
13069 + log_all(ls, "recover event %u finished", finish_event);
13073 + next_state = LSST_WAIT_START;
13076 + case DO_FINISH_STOP:
13077 + clear_finished_nodes(ls, finish_event);
13078 + next_state = LSST_WAIT_START;
13081 + case DO_FINISH_START:
13082 + clear_finished_nodes(ls, finish_event);
13083 + /* fall into DO_START */
13086 + error = ls_reconfig(ls, rv);
13088 + next_state = LSST_WAIT_START;
13090 + next_state = LSST_RECONFIG_DONE;
13100 + * LSST_INIT - state after ls is created and before it has been
13101 + * started. A start operation will cause the ls to be started for the
13102 + * first time. A failed start will cause it to just wait in INIT for
13103 + * another stop/start.
13106 + if (cur_state == LSST_INIT) {
13107 + switch (do_now) {
13109 + error = ls_first_start(ls, rv);
13111 + next_state = LSST_INIT_DONE;
13117 + case DO_FINISH: /* invalid */
13118 + case DO_FINISH_STOP: /* invalid */
13119 + case DO_FINISH_START: /* invalid */
13127 + * LSST_INIT_DONE - after the first start operation is completed
13128 + * successfully and kcl_start_done() called. If there are no errors, a
13129 + * finish will arrive next and we'll move to LSST_CLEAR.
13132 + if (cur_state == LSST_INIT_DONE) {
13133 + switch (do_now) {
13135 + case DO_FINISH_STOP:
13136 + next_state = LSST_WAIT_START;
13140 + case DO_FINISH_START:
13141 + error = ls_reconfig(ls, rv);
13143 + next_state = LSST_WAIT_START;
13145 + next_state = LSST_RECONFIG_DONE;
13149 + next_state = LSST_CLEAR;
13150 + enable_locking(ls, finish_event);
13151 + log_all(ls, "recover event %u finished", finish_event);
13162 + ls->ls_state = next_state;
13165 + kfree(rv->nodeids);
13170 +static __inline__ struct dlm_ls *get_work(int clear)
13172 + struct dlm_ls *ls;
13174 + spin_lock(&lslist_lock);
13176 + list_for_each_entry(ls, &lslist, ls_list) {
13178 + if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
13182 + if (test_bit(LSFL_WORK, &ls->ls_flags))
13189 + spin_unlock(&lslist_lock);
13195 + * Thread which does recovery for all lockspaces.
13198 +static int dlm_recoverd(void *arg)
13200 + struct dlm_ls *ls;
13202 + daemonize("dlm_recoverd");
13203 + recoverd_task = current;
13204 + complete(&recoverd_run);
13206 + while (!test_bit(THREAD_STOP, &recoverd_flags)) {
13207 + wchan_cond_sleep_intr(recoverd_wait, !get_work(0));
13208 + if ((ls = get_work(1)))
13209 + do_ls_recovery(ls);
13212 + complete(&recoverd_run);
13217 + * Mark a specific lockspace as needing work and wake up the thread to do it.
13220 +void dlm_recoverd_kick(struct dlm_ls *ls)
13222 + set_bit(LSFL_WORK, &ls->ls_flags);
13223 + wake_up(&recoverd_wait);
13227 + * Start the recoverd thread when dlm is started (before any lockspaces).
13230 +int dlm_recoverd_start(void)
13234 + clear_bit(THREAD_STOP, &recoverd_flags);
13235 + error = kernel_thread(dlm_recoverd, NULL, 0);
13240 + wait_for_completion(&recoverd_run);
13247 + * Stop the recoverd thread when dlm is shut down (all lockspaces are gone).
13250 +int dlm_recoverd_stop(void)
13252 + set_bit(THREAD_STOP, &recoverd_flags);
13253 + wake_up(&recoverd_wait);
13254 + wait_for_completion(&recoverd_run);
13258 diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h
13259 --- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730
13260 +++ linux-patched/cluster/dlm/recoverd.h 2004-07-13 18:57:22.000000000 +0800
13262 +/******************************************************************************
13263 +*******************************************************************************
13265 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13266 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13268 +** This copyrighted material is made available to anyone wishing to use,
13269 +** modify, copy, or redistribute it subject to the terms and conditions
13270 +** of the GNU General Public License v.2.
13272 +*******************************************************************************
13273 +******************************************************************************/
13275 +#ifndef __RECOVERD_DOT_H__
13276 +#define __RECOVERD_DOT_H__
13278 +void dlm_recoverd_init(void);
13279 +void dlm_recoverd_kick(struct dlm_ls *ls);
13280 +int dlm_recoverd_start(void);
13281 +int dlm_recoverd_stop(void);
13283 +#endif /* __RECOVERD_DOT_H__ */
13284 diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c
13285 --- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730
13286 +++ linux-patched/cluster/dlm/rsb.c 2004-07-13 18:57:22.000000000 +0800
13288 +/******************************************************************************
13289 +*******************************************************************************
13291 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13292 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13294 +** This copyrighted material is made available to anyone wishing to use,
13295 +** modify, copy, or redistribute it subject to the terms and conditions
13296 +** of the GNU General Public License v.2.
13298 +*******************************************************************************
13299 +******************************************************************************/
13301 +#include "dlm_internal.h"
13302 +#include "locking.h"
13303 +#include "memory.h"
13304 +#include "lockqueue.h"
13305 +#include "nodes.h"
13309 +static struct dlm_rsb *search_hashchain(struct list_head *head,
13310 + struct dlm_rsb *parent,
13311 + char *name, int namelen)
13313 + struct dlm_rsb *r;
13315 + list_for_each_entry(r, head, res_hashchain) {
13316 + if ((parent == r->res_parent) && (namelen == r->res_length) &&
13317 + (memcmp(name, r->res_name, namelen) == 0)) {
13318 + atomic_inc(&r->res_ref);
13327 + * A way to arbitrarily hold onto an rsb which we already have a reference to
13328 + * to make sure it doesn't go away. Opposite of release_rsb().
13331 +void hold_rsb(struct dlm_rsb *r)
13333 + atomic_inc(&r->res_ref);
13337 + * release_rsb() - Decrement reference count on rsb struct. Free the rsb
13338 + * struct when there are zero references. Every lkb for the rsb adds a
13339 + * reference. When ref is zero there can be no more lkb's for the rsb, on the
13340 + * queue's or anywhere else.
13343 +void release_rsb(struct dlm_rsb *r)
13345 + struct dlm_ls *ls = r->res_ls;
13346 + int removed = FALSE;
13348 + write_lock(&ls->ls_rsbtbl[r->res_bucket].lock);
13349 + if (atomic_dec_and_test(&r->res_ref)) {
13350 + DLM_ASSERT(list_empty(&r->res_grantqueue), print_rsb(r););
13351 + DLM_ASSERT(list_empty(&r->res_waitqueue), print_rsb(r););
13352 + DLM_ASSERT(list_empty(&r->res_convertqueue), print_rsb(r););
13354 + list_del(&r->res_hashchain);
13356 + write_unlock(&ls->ls_rsbtbl[r->res_bucket].lock);
13361 + down_read(&ls->ls_gap_rsblist);
13362 + if (r->res_parent)
13363 + list_del(&r->res_subreslist);
13365 + list_del(&r->res_rootlist);
13366 + up_read(&ls->ls_gap_rsblist);
13368 + if (r->res_parent)
13370 + if (r->res_nodeid && r->res_nodeid != -1)
13372 + if (r->res_nodeid == -1 && !test_bit(RESFL_MASTER, &r->res_flags))
13375 + if (get_directory_nodeid(r) != our_nodeid())
13376 + remote_remove_resdata(r->res_ls, get_directory_nodeid(r),
13377 + r->res_name, r->res_length);
13379 + remove_resdata(r->res_ls, our_nodeid(), r->res_name,
13382 + if (r->res_lvbptr)
13383 + free_lvb(r->res_lvbptr);
13388 +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb)
13390 + struct dlm_rsb *r = lkb->lkb_resource;
13392 + write_lock(&ls->ls_rsbtbl[r->res_bucket].lock);
13393 + if (!r->res_parent && atomic_read(&r->res_ref) == 1)
13394 + r->res_nodeid = -1;
13395 + write_unlock(&ls->ls_rsbtbl[r->res_bucket].lock);
13401 + * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist.
13402 + * If the rsb exists, its ref count is incremented by this function. If it
13403 + * doesn't exist, it's created with a ref count of one.
13406 +int find_or_create_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name,
13407 + int namelen, int create, struct dlm_rsb **rp)
13410 + struct dlm_rsb *r, *tmp;
13411 + int error = -ENOMEM;
13413 + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
13415 + bucket = dlm_hash(name, namelen);
13416 + bucket &= (ls->ls_rsbtbl_size - 1);
13418 + read_lock(&ls->ls_rsbtbl[bucket].lock);
13419 + r = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, namelen);
13420 + read_unlock(&ls->ls_rsbtbl[bucket].lock);
13429 + r = allocate_rsb(ls, namelen);
13433 + INIT_LIST_HEAD(&r->res_subreslist);
13434 + INIT_LIST_HEAD(&r->res_grantqueue);
13435 + INIT_LIST_HEAD(&r->res_convertqueue);
13436 + INIT_LIST_HEAD(&r->res_waitqueue);
13438 + memcpy(r->res_name, name, namelen);
13439 + r->res_length = namelen;
13441 + init_rwsem(&r->res_lock);
13442 + atomic_set(&r->res_ref, 1);
13443 + r->res_bucket = bucket;
13446 + r->res_parent = parent;
13447 + r->res_depth = parent->res_depth + 1;
13448 + r->res_root = parent->res_root;
13449 + r->res_nodeid = parent->res_nodeid;
13451 + r->res_parent = NULL;
13452 + r->res_depth = 1;
13454 + r->res_nodeid = -1;
13457 + write_lock(&ls->ls_rsbtbl[bucket].lock);
13458 + tmp = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, namelen);
13460 + write_unlock(&ls->ls_rsbtbl[bucket].lock);
13464 + list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
13465 + write_unlock(&ls->ls_rsbtbl[bucket].lock);
13467 + down_read(&ls->ls_gap_rsblist);
13469 + list_add_tail(&r->res_subreslist,
13470 + &r->res_root->res_subreslist);
13472 + list_add(&r->res_rootlist, &ls->ls_rootres);
13473 + up_read(&ls->ls_gap_rsblist);
13487 + * Add an LKB to a resource's grant/convert/wait queue, in order
13490 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode)
13492 + struct dlm_lkb *lkb = NULL;
13494 + list_for_each_entry(lkb, head, lkb_statequeue) {
13495 + if (lkb->lkb_rqmode < mode)
13500 + /* No entries in the queue, we are alone */
13501 + list_add_tail(new, head);
13503 + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
13508 + * The rsb res_lock must be held in write when this function is called.
13511 +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
13513 + DLM_ASSERT(!lkb->lkb_status,
13517 + lkb->lkb_status = type;
13520 + case GDLM_LKSTS_WAITING:
13521 + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
13524 + case GDLM_LKSTS_GRANTED:
13525 + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
13526 + lkb->lkb_grmode);
13529 + case GDLM_LKSTS_CONVERT:
13530 + if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE)
13531 + list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
13534 + if (lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT)
13535 + list_add_tail(&lkb->lkb_statequeue,
13536 + &r->res_convertqueue);
13538 + lkb_add_ordered(&lkb->lkb_statequeue,
13539 + &r->res_convertqueue, lkb->lkb_rqmode);
13547 +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
13549 + down_write(&r->res_lock);
13550 + lkb_enqueue(r, lkb, type);
13551 + up_write(&r->res_lock);
13555 + * The rsb res_lock must be held in write when this function is called.
13558 +int lkb_dequeue(struct dlm_lkb *lkb)
13560 + int status = lkb->lkb_status;
13565 + lkb->lkb_status = 0;
13566 + list_del(&lkb->lkb_statequeue);
13572 +int res_lkb_dequeue(struct dlm_lkb *lkb)
13576 + down_write(&lkb->lkb_resource->res_lock);
13577 + status = lkb_dequeue(lkb);
13578 + up_write(&lkb->lkb_resource->res_lock);
13584 + * The rsb res_lock must be held in write when this function is called.
13587 +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
13591 + status = lkb_dequeue(lkb);
13592 + lkb_enqueue(r, lkb, type);
13597 +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
13601 + down_write(&r->res_lock);
13602 + status = lkb_swqueue(r, lkb, type);
13603 + up_write(&r->res_lock);
13607 diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h
13608 --- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730
13609 +++ linux-patched/cluster/dlm/rsb.h 2004-07-13 18:57:22.000000000 +0800
13611 +/******************************************************************************
13612 +*******************************************************************************
13614 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13615 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13617 +** This copyrighted material is made available to anyone wishing to use,
13618 +** modify, copy, or redistribute it subject to the terms and conditions
13619 +** of the GNU General Public License v.2.
13621 +*******************************************************************************
13622 +******************************************************************************/
13624 +#ifndef __RSB_DOT_H__
13625 +#define __RSB_DOT_H__
13627 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode);
13628 +void release_rsb(struct dlm_rsb *r);
13629 +void hold_rsb(struct dlm_rsb *r);
13630 +int find_or_create_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name,
13631 + int namelen, int create, struct dlm_rsb **rp);
13632 +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb);
13633 +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
13634 +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
13635 +int lkb_dequeue(struct dlm_lkb *lkb);
13636 +int res_lkb_dequeue(struct dlm_lkb *lkb);
13637 +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
13638 +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
13640 +#endif /* __RSB_DOT_H__ */
13641 diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c
13642 --- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730
13643 +++ linux-patched/cluster/dlm/util.c 2004-07-13 18:57:22.000000000 +0800
13645 +/******************************************************************************
13646 +*******************************************************************************
13648 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13649 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13651 +** This copyrighted material is made available to anyone wishing to use,
13652 +** modify, copy, or redistribute it subject to the terms and conditions
13653 +** of the GNU General Public License v.2.
13655 +*******************************************************************************
13656 +******************************************************************************/
13658 +#include "dlm_internal.h"
13660 +static const uint32_t crc_32_tab[] = {
13661 + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
13662 + 0xe963a535, 0x9e6495a3,
13663 + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
13664 + 0xe7b82d07, 0x90bf1d91,
13665 + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb,
13666 + 0xf4d4b551, 0x83d385c7,
13667 + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
13668 + 0xfa0f3d63, 0x8d080df5,
13669 + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447,
13670 + 0xd20d85fd, 0xa50ab56b,
13671 + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75,
13672 + 0xdcd60dcf, 0xabd13d59,
13673 + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
13674 + 0xcfba9599, 0xb8bda50f,
13675 + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11,
13676 + 0xc1611dab, 0xb6662d3d,
13677 + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
13678 + 0x9fbfe4a5, 0xe8b8d433,
13679 + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
13680 + 0x91646c97, 0xe6635c01,
13681 + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b,
13682 + 0x8208f4c1, 0xf50fc457,
13683 + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49,
13684 + 0x8cd37cf3, 0xfbd44c65,
13685 + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
13686 + 0xa4d1c46d, 0xd3d6f4fb,
13687 + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
13688 + 0xaa0a4c5f, 0xdd0d7cc9,
13689 + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3,
13690 + 0xb966d409, 0xce61e49f,
13691 + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
13692 + 0xb7bd5c3b, 0xc0ba6cad,
13693 + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af,
13694 + 0x04db2615, 0x73dc1683,
13695 + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d,
13696 + 0x0a00ae27, 0x7d079eb1,
13697 + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
13698 + 0x196c3671, 0x6e6b06e7,
13699 + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
13700 + 0x17b7be43, 0x60b08ed5,
13701 + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767,
13702 + 0x3fb506dd, 0x48b2364b,
13703 + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
13704 + 0x316e8eef, 0x4669be79,
13705 + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
13706 + 0x220216b9, 0x5505262f,
13707 + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
13708 + 0x2cd99e8b, 0x5bdeae1d,
13709 + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
13710 + 0x72076785, 0x05005713,
13711 + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d,
13712 + 0x7cdcefb7, 0x0bdbdf21,
13713 + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b,
13714 + 0x6fb077e1, 0x18b74777,
13715 + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
13716 + 0x616bffd3, 0x166ccf45,
13717 + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
13718 + 0x4969474d, 0x3e6e77db,
13719 + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
13720 + 0x47b2cf7f, 0x30b5ffe9,
13721 + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
13722 + 0x54de5729, 0x23d967bf,
13723 + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1,
13724 + 0x5a05df1b, 0x2d02ef8d
13728 + * dlm_hash - hash an array of data
13729 + * @data: the data to be hashed
13730 + * @len: the length of data to be hashed
13732 + * Copied from GFS.
13734 + * Take some data and convert it to a 32-bit hash.
13736 + * The hash function is a 32-bit CRC of the data. The algorithm uses
13737 + * the crc_32_tab table above.
13739 + * This may not be the fastest hash function, but it does a fair bit better
13740 + * at providing uniform results than the others I've looked at. That's
13741 + * really important for efficient directories.
13743 + * Returns: the hash
13746 +uint32_t dlm_hash(const char *data, int len)
13748 + uint32_t hash = 0xFFFFFFFF;
13750 + for (; len--; data++)
13751 + hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
13758 +uint32_t dlm_next_power2(uint32_t val)
13762 + for (x = 1; x < val; x <<= 1) ;
13767 +void print_lkb(struct dlm_lkb *lkb)
13769 + printk("dlm: lkb\n"
13786 + lkb->lkb_lockqueue_state,
13787 + lkb->lkb_lockqueue_flags);
13790 +void print_rsb(struct dlm_rsb *r)
13792 + printk("dlm: rsb\n"
13798 + atomic_read(&r->res_ref));
13801 +void print_request(struct dlm_request *req)
13803 + printk("dlm: request\n"
13810 + req->rr_header.rh_cmd,
13811 + req->rr_header.rh_lkid,
13818 +void print_reply(struct dlm_reply *rp)
13820 + printk("dlm: reply\n"
13827 + rp->rl_header.rh_cmd,
13828 + rp->rl_header.rh_lkid,
13829 + rp->rl_lockstate,
13835 diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h
13836 --- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730
13837 +++ linux-patched/cluster/dlm/util.h 2004-07-13 18:57:22.000000000 +0800
13839 +/******************************************************************************
13840 +*******************************************************************************
13842 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13843 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13845 +** This copyrighted material is made available to anyone wishing to use,
13846 +** modify, copy, or redistribute it subject to the terms and conditions
13847 +** of the GNU General Public License v.2.
13849 +*******************************************************************************
13850 +******************************************************************************/
13852 +#ifndef __UTIL_DOT_H__
13853 +#define __UTIL_DOT_H__
13855 +uint32_t dlm_hash(const char *data, int len);
13856 +uint32_t dlm_next_power2(uint32_t val);
13858 +void print_lkb(struct dlm_lkb *lkb);
13859 +void print_rsb(struct dlm_rsb *r);
13860 +void print_request(struct dlm_request *req);
13861 +void print_reply(struct dlm_reply *rp);
13864 diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h
13865 --- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730
13866 +++ linux-patched/include/cluster/dlm.h 2004-07-13 18:57:22.000000000 +0800
13868 +/******************************************************************************
13869 +*******************************************************************************
13871 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13872 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13874 +** This copyrighted material is made available to anyone wishing to use,
13875 +** modify, copy, or redistribute it subject to the terms and conditions
13876 +** of the GNU General Public License v.2.
13878 +*******************************************************************************
13879 +******************************************************************************/
13881 +#ifndef __DLM_DOT_H__
13882 +#define __DLM_DOT_H__
13885 + * Interface to DLM - routines and structures to use DLM lockspaces.
13892 +#define DLM_LOCK_IV (-1) /* invalid */
13893 +#define DLM_LOCK_NL (0) /* null */
13894 +#define DLM_LOCK_CR (1) /* concurrent read */
13895 +#define DLM_LOCK_CW (2) /* concurrent write */
13896 +#define DLM_LOCK_PR (3) /* protected read */
13897 +#define DLM_LOCK_PW (4) /* protected write */
13898 +#define DLM_LOCK_EX (5) /* exclusive */
13901 + * Maximum size in bytes of a dlm_lock name
13904 +#define DLM_RESNAME_MAXLEN (64)
13907 + * Size in bytes of Lock Value Block
13910 +#define DLM_LVB_LEN (32)
13913 + * Flags to dlm_new_lockspace
13915 + * DLM_LSF_NOTIMERS
13917 + * Do not subject locks in this lockspace to time-outs.
13919 + * DLM_LSF_NOCONVGRANT
13921 + * Do not grant new locks unless the conversion queue is empty.
13925 +#define DLM_LSF_NOTIMERS (1)
13926 +#define DLM_LSF_NOCONVGRANT (2)
13929 + * Flags to dlm_lock
13931 + * DLM_LKF_NOQUEUE
13933 + * Do not queue the lock request on the wait queue if it cannot be granted
13934 + * immediately. If the lock cannot be granted because of this flag, DLM will
13935 + * either return -EAGAIN from the dlm_lock call or will return 0 from
13936 + * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
13938 + * DLM_LKF_CONVERT
13940 + * Indicates a lock conversion request. For conversions the name and namelen
13941 + * are ignored and the lock ID in the LKSB is used to identify the lock.
13945 + * Requests DLM to return the current contents of the lock value block in the
13946 + * lock status block. When this flag is set in a lock conversion from PW or EX
13947 + * modes, DLM assigns the value specified in the lock status block to the lock
13948 + * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
13949 + * containing application-specific information.
13953 + * Force a conversion lock request to the back of the convert queue. All other
13954 + * conversion requests ahead of it must be granted before it can be granted.
13955 + * This enforces a FIFO ordering on the convert queue. When this flag is set,
13956 + * indefinite postponement is averted. This flag is allowed only when
13957 + * converting a lock to a more restrictive mode.
13961 + * Used to cancel a pending conversion (with dlm_unlock). Lock is returned to
13962 + * previously granted mode.
13964 + * DLM_LKF_IVVALBLK
13966 + * Invalidate/clear the lock value block.
13968 + * DLM_LKF_CONVDEADLK
13970 + * The granted mode of a lock being converted (from a non-NL mode) can be
13971 + * changed to NL in the process of acquiring the requested mode to avoid
13972 + * conversion deadlock.
13974 + * DLM_LKF_PERSISTENT
13976 + * Only relevant to locks originating in userspace. Signals to the ioctl.c code
13977 + * that this lock should not be unlocked when the process exits.
13979 + * DLM_LKF_NODLCKWT
13981 + * This lock is not to be checked for conversion deadlocks.
13983 + * DLM_LKF_NODLCKBLK
13985 + * not yet implemented
13987 + * DLM_LKF_EXPEDITE
13989 + * If this lock conversion cannot be granted immediately it is to go to the
13990 + * head of the conversion queue regardless of its requested lock mode.
13992 + * DLM_LKF_NOQUEUEBAST
13994 + * Send blocking ASTs before returning -EAGAIN to the caller. It is only
13995 + * used along with the NOQUEUE flag. Blocking ASTs are not sent for failed
13996 + * NOQUEUE requests otherwise.
14000 +#define DLM_LKF_NOQUEUE (0x00000001)
14001 +#define DLM_LKF_CANCEL (0x00000002)
14002 +#define DLM_LKF_CONVERT (0x00000004)
14003 +#define DLM_LKF_VALBLK (0x00000008)
14004 +#define DLM_LKF_QUECVT (0x00000010)
14005 +#define DLM_LKF_IVVALBLK (0x00000020)
14006 +#define DLM_LKF_CONVDEADLK (0x00000040)
14007 +#define DLM_LKF_PERSISTENT (0x00000080)
14008 +#define DLM_LKF_NODLCKWT (0x00000100)
14009 +#define DLM_LKF_NODLCKBLK (0x00000200)
14010 +#define DLM_LKF_EXPEDITE (0x00000400)
14011 +#define DLM_LKF_NOQUEUEBAST (0x00000800)
14014 + * Some return codes that are not in errno.h
14017 +#define DLM_ECANCEL (0x10001)
14018 +#define DLM_EUNLOCK (0x10002)
14020 +typedef void dlm_lockspace_t;
14023 + * Lock range structure
14026 +struct dlm_range {
14027 + uint64_t ra_start;
14032 + * Lock status block
14034 + * Use this structure to specify the contents of the lock value block. For a
14035 + * conversion request, this structure is used to specify the lock ID of the
14036 + * lock. DLM writes the status of the lock request and the lock ID assigned
14037 + * to the request in the lock status block.
14039 + * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
14040 + * It is available when dlm_lock returns.
14042 + * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
14043 + * shown for the DLM_LKF_VALBLK flag.
14045 + * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
14046 + * it was first demoted to NL to avoid conversion deadlock.
14048 + * sb_status: the returned status of the lock request set prior to AST
14049 + * execution. Possible return values:
14051 + * 0 if lock request was successful
14052 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14053 + * -ENOMEM if there is no memory to process request
14054 + * -EINVAL if there are invalid parameters
14055 + * -DLM_EUNLOCK if unlock request was successful
14059 +#define DLM_SBF_DEMOTED (0x01)
14063 + uint32_t sb_lkid;
14065 + char * sb_lvbptr;
14069 + * These defines are the bits that make up the
14073 +/* Bits 0, 1, 2, the lock mode or DLM_LOCK_THIS, see DLM_LOCK_NL etc in
14074 + * dlm.h Ignored for DLM_QUERY_LOCKS_ALL */
14075 +#define DLM_LOCK_THIS 0x0007
14076 +#define DLM_QUERY_MODE_MASK 0x0007
14078 +/* Bits 3, 4, 5 bitmap of queue(s) to query */
14079 +#define DLM_QUERY_QUEUE_WAIT 0x0008
14080 +#define DLM_QUERY_QUEUE_CONVERT 0x0010
14081 +#define DLM_QUERY_QUEUE_GRANT 0x0020
14082 +#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */
14083 +#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */
14085 +/* Bit 6, Return only the information that can be established without a network
14086 + * round-trip. The caller must be aware of the implications of this. Useful for
14087 + * just getting the master node id or resource name. */
14088 +#define DLM_QUERY_LOCAL 0x0040
14090 +/* Bits 8 up, query type */
14091 +#define DLM_QUERY_LOCKS_HIGHER 0x0100
14092 +#define DLM_QUERY_LOCKS_LOWER 0x0200
14093 +#define DLM_QUERY_LOCKS_EQUAL 0x0300
14094 +#define DLM_QUERY_LOCKS_BLOCKING 0x0400
14095 +#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500
14096 +#define DLM_QUERY_LOCKS_ALL 0x0600
14097 +#define DLM_QUERY_MASK 0x0F00
14099 +/* GRMODE is the default for mode comparisons,
14100 + RQMODE might also be handy */
14101 +#define DLM_QUERY_GRMODE 0x0000
14102 +#define DLM_QUERY_RQMODE 0x1000
14104 +/* Structures passed into and out of the query */
14106 +struct dlm_lockinfo {
14107 + int lki_lkid; /* Lock ID on originating node */
14108 + int lki_mstlkid; /* Lock ID on master node */
14110 + int lki_node; /* Originating node (not master) */
14111 + uint8_t lki_state; /* Queue the lock is on */
14112 + uint8_t lki_grmode; /* Granted mode */
14113 + uint8_t lki_rqmode; /* Requested mode */
14114 + struct dlm_range lki_grrange; /* Granted range, if applicable */
14115 + struct dlm_range lki_rqrange; /* Requested range, if applicable */
14118 +struct dlm_resinfo {
14120 + int rsi_grantcount; /* No. of nodes on grant queue */
14121 + int rsi_convcount; /* No. of nodes on convert queue */
14122 + int rsi_waitcount; /* No. of nodes on wait queue */
14123 + int rsi_masternode; /* Master for this resource */
14124 + char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */
14125 + char rsi_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable
14129 +struct dlm_queryinfo {
14130 + struct dlm_resinfo *gqi_resinfo;
14131 + struct dlm_lockinfo *gqi_lockinfo; /* This points to an array
14133 + int gqi_locksize; /* input */
14134 + int gqi_lockcount; /* output */
14141 + * Starts and initializes DLM threads and structures. Creation of the first
14142 + * lockspace will call this if it has not been called already.
14144 + * Returns: 0 if successful, -EXXX on error
14147 +int dlm_init(void);
14152 + * Stops DLM threads.
14154 + * Returns: 0 if successful, -EXXX on error
14157 +int dlm_release(void);
14160 + * dlm_new_lockspace
14162 + * Starts a lockspace with the given name. If the named lockspace exists in
14163 + * the cluster, the calling node joins it.
14166 +int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
14170 + * dlm_release_lockspace
14172 + * Stop a lockspace.
14175 +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
14180 + * Make an asynchronous request to acquire or convert a lock on a named
14183 + * lockspace: context for the request
14184 + * mode: the requested mode of the lock (DLM_LOCK_)
14185 + * lksb: lock status block for input and async return values
14186 + * flags: input flags (DLM_LKF_)
14187 + * name: name of the resource to lock, can be binary
14188 + * namelen: the length in bytes of the resource name (MAX_RESNAME_LEN)
14189 + * parent: the lock ID of a parent lock or 0 if none
14190 + * lockast: function DLM executes when it completes processing the request
14191 + * astarg: argument passed to lockast and bast functions
14192 + * bast: function DLM executes when this lock later blocks another request
14195 + * 0 if request is successfully queued for processing
14196 + * -EINVAL if any input parameters are invalid
14197 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14198 + * -ENOMEM if there is no memory to process request
14199 + * -ENOTCONN if there is a communication error
14201 + * If the call to dlm_lock returns an error then the operation has failed and
14202 + * the AST routine will not be called. If dlm_lock returns 0 it is still
14203 + * possible that the lock operation will fail. The AST routine will be called
14204 + * when the locking is complete and the status is returned in the lksb.
14206 + * If the AST routines or parameter are passed to a conversion operation then
14207 + * they will overwrite those values that were passed to a previous dlm_lock
14210 + * AST routines should not block (at least not for long), but may make
14211 + * any locking calls they please.
14214 +int dlm_lock(dlm_lockspace_t *lockspace,
14216 + struct dlm_lksb *lksb,
14219 + unsigned int namelen,
14221 + void (*lockast) (void *astarg),
14223 + void (*bast) (void *astarg, int mode),
14224 + struct dlm_range *range);
14229 + * Asynchronously release a lock on a resource. The AST routine is called
14230 + * when the resource is successfully unlocked.
14232 + * lockspace: context for the request
14233 + * lkid: the lock ID as returned in the lksb
14234 + * flags: input flags (DLM_LKF_)
14235 + * lksb: if NULL the lksb parameter passed to last lock request is used
14236 + * astarg: if NULL, astarg in last lock request is used
14239 + * 0 if request is successfully queued for processing
14240 + * -EINVAL if any input parameters are invalid
14241 + * -ENOTEMPTY if the lock still has sublocks
14242 + * -EBUSY if the lock is waiting for a remote lock operation
14243 + * -ENOTCONN if there is a communication error
14246 +extern int dlm_unlock(dlm_lockspace_t *lockspace,
14249 + struct dlm_lksb *lksb,
14252 +/* Query interface
14254 + * Query the other holders of a resource, given a known lock ID
14256 + * lockspace: context for the request
14257 + * lksb: LKSB, sb_lkid contains the lock ID of a valid lock
14258 + * on the resource. sb_status will contain the status
14259 + * of the request on completion.
14260 + * query: query bitmap see DLM_QUERY_* above
14261 + * qinfo: pointer to dlm_queryinfo structure
14262 + * ast_routine: AST routine to call on completion
14263 + * astarg:        argument to AST routine. It is "traditional"
14264 + * to put the qinfo pointer into lksb->sb_lvbptr
14265 + * and pass the lksb in here.
14267 +extern int dlm_query(dlm_lockspace_t *lockspace,
14268 + struct dlm_lksb *lksb,
14270 + struct dlm_queryinfo *qinfo,
14271 + void (ast_routine(void *)),
14275 +void dlm_debug_dump(void);
14277 +#endif /* __KERNEL__ */
14279 +#endif /* __DLM_DOT_H__ */
14280 diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h
14281 --- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730
14282 +++ linux-patched/include/cluster/dlm_device.h 2004-07-13 18:57:22.000000000 +0800
14284 +/******************************************************************************
14285 +*******************************************************************************
14287 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14288 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14290 +** This copyrighted material is made available to anyone wishing to use,
14291 +** modify, copy, or redistribute it subject to the terms and conditions
14292 +** of the GNU General Public License v.2.
14294 +*******************************************************************************
14295 +******************************************************************************/
14297 +/* This is the device interface for dlm, most users will use a library
14301 +/* Version of the device interface */
14302 +#define DLM_DEVICE_VERSION_MAJOR 2
14303 +#define DLM_DEVICE_VERSION_MINOR 0
14304 +#define DLM_DEVICE_VERSION_PATCH 0
14306 +/* struct passed to the lock write */
14307 +struct dlm_lock_params {
14308 + uint32_t version[3];
14314 + struct dlm_range range;
14319 + struct dlm_lksb *lksb;
14324 +/* struct read from the "device" fd,
14325 + consists mainly of userspace pointers for the library to use */
14326 +struct dlm_lock_result {
14329 + void (*astaddr)(void *astparam);
14330 + struct dlm_lksb *user_lksb;
14331 + struct dlm_lksb lksb; /* But this has real data in it */
14332 + uint8_t bast_mode; /* Not yet used */
14335 +/* commands passed to the device */
14336 +#define DLM_USER_LOCK 1
14337 +#define DLM_USER_UNLOCK 2
14338 +#define DLM_USER_QUERY 3
14340 +/* Arbitrary length restriction */
14341 +#define MAX_LS_NAME_LEN 64
14343 +/* ioctls on the device */
14344 +#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *)
14345 +#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *)
14346 +#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)