1 # Add DLM to the build system
2 diff -urN -p linux-2.6.8.1/cluster/Kconfig linux/cluster/Kconfig
3 --- linux-2.6.8.1/cluster/Kconfig 2004-08-24 13:23:09.000000000 +0800
4 +++ linux/cluster/Kconfig 2004-08-24 13:23:32.000000000 +0800
5 @@ -10,4 +10,22 @@ config CLUSTER
6 needed by all the other components. It provides membership services
7 for those other subsystems.
10 + tristate "Distributed Lock Manager"
13 + A fully distributed lock manager, providing cluster-wide locking services
14 + and protected lock namespaces for kernel and userland applications.
16 +config CLUSTER_DLM_PROCLOCKS
17 + boolean "/proc/locks support for DLM"
18 + depends on CLUSTER_DLM
21 + If this option is enabled a file will appear in /proc/cluster/dlm_locks.
22 + Write into this "file" the name of a lockspace known to the DLM and then
23 + read out a list of all the resources and locks in that lockspace that are
24 + known to the local node. Note that because the DLM is distributed this may
25 + not be the full lock picture.
28 diff -urN -p linux-2.6.8.1/cluster/Makefile linux/cluster/Makefile
29 --- linux-2.6.8.1/cluster/Makefile 2004-08-24 13:23:09.000000000 +0800
30 +++ linux/cluster/Makefile 2004-08-24 13:23:32.000000000 +0800
34 obj-$(CONFIG_CLUSTER) += cman/
35 +obj-$(CONFIG_CLUSTER_DLM) += dlm/
36 diff -urN -p linux-2.6.8.1/cluster/dlm/Makefile linux/cluster/dlm/Makefile
37 --- linux-2.6.8.1/cluster/dlm/Makefile 1970-01-01 07:30:00.000000000 +0730
38 +++ linux/cluster/dlm/Makefile 2004-08-24 13:23:32.000000000 +0800
62 +obj-$(CONFIG_CLUSTER_DLM) += dlm.o
63 diff -urN linux-orig/cluster/dlm/ast.c linux-patched/cluster/dlm/ast.c
64 --- linux-orig/cluster/dlm/ast.c 1970-01-01 07:30:00.000000000 +0730
65 +++ linux-patched/cluster/dlm/ast.c 2004-11-03 11:31:56.000000000 +0800
67 +/******************************************************************************
68 +*******************************************************************************
70 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
71 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
73 +** This copyrighted material is made available to anyone wishing to use,
74 +** modify, copy, or redistribute it subject to the terms and conditions
75 +** of the GNU General Public License v.2.
77 +*******************************************************************************
78 +******************************************************************************/
81 + * This delivers ASTs and checks for dead remote requests and deadlocks.
84 +#include <linux/timer.h>
86 +#include "dlm_internal.h"
88 +#include "lockqueue.h"
92 +#include "lowcomms.h"
93 +#include "midcomms.h"
99 +/* Wake up flags for astd */
101 +#define WAKE_TIMER 2
103 +static struct list_head ast_queue;
104 +static struct semaphore ast_queue_lock;
105 +static wait_queue_head_t astd_waitchan;
106 +struct task_struct * astd_task;
107 +static unsigned long astd_wakeflags;
109 +static struct list_head _deadlockqueue;
110 +static struct semaphore _deadlockqueue_lock;
111 +static struct list_head _lockqueue;
112 +static struct semaphore _lockqueue_lock;
113 +static struct timer_list _lockqueue_timer;
115 +void add_to_lockqueue(struct dlm_lkb *lkb)
117 + /* Time stamp the entry so we know if it's been waiting too long */
118 + lkb->lkb_lockqueue_time = jiffies;
120 + down(&_lockqueue_lock);
121 + list_add(&lkb->lkb_lockqueue, &_lockqueue);
122 + up(&_lockqueue_lock);
125 +void remove_from_lockqueue(struct dlm_lkb *lkb)
127 + down(&_lockqueue_lock);
128 + list_del(&lkb->lkb_lockqueue);
129 + up(&_lockqueue_lock);
131 +#ifdef CONFIG_DLM_STATS
132 + dlm_stats.lockqueue_time[lkb->lkb_lockqueue_state] += (jiffies - lkb->lkb_lockqueue_time);
133 + dlm_stats.lockqueue_locks[lkb->lkb_lockqueue_state]++;
135 + lkb->lkb_lockqueue_state = 0;
138 +void add_to_deadlockqueue(struct dlm_lkb *lkb)
140 + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
142 + lkb->lkb_duetime = jiffies;
143 + down(&_deadlockqueue_lock);
144 + list_add(&lkb->lkb_deadlockq, &_deadlockqueue);
145 + up(&_deadlockqueue_lock);
148 +void remove_from_deadlockqueue(struct dlm_lkb *lkb)
150 + if (test_bit(LSFL_NOTIMERS, &lkb->lkb_resource->res_ls->ls_flags))
153 + down(&_deadlockqueue_lock);
154 + list_del(&lkb->lkb_deadlockq);
155 + up(&_deadlockqueue_lock);
157 + /* Invalidate the due time */
158 + memset(&lkb->lkb_duetime, 0, sizeof(lkb->lkb_duetime));
162 + * Queue an AST for delivery, this will only deal with
163 + * kernel ASTs, usermode API will piggyback on top of this.
165 + * This can be called in either the user or DLM context.
166 + * ASTs are queued EVEN IF we are already running in dlm_astd
167 + * context as we don't know what other locks are held (eg we could
168 + * be being called from a lock operation that was called from
170 + * If the AST is to be queued remotely then a message is sent to
171 + * the target system via midcomms.
174 +void queue_ast(struct dlm_lkb *lkb, uint16_t flags, uint8_t rqmode)
176 + struct dlm_request req;
178 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
180 + * Send a message to have an ast queued remotely. Note: we do
181 + * not send remote completion asts, they are handled as part of
182 + * remote lock granting.
184 + if (flags & AST_BAST) {
185 + req.rr_header.rh_cmd = GDLM_REMCMD_SENDBAST;
186 + req.rr_header.rh_length = sizeof(req);
187 + req.rr_header.rh_flags = 0;
188 + req.rr_header.rh_lkid = lkb->lkb_id;
189 + req.rr_header.rh_lockspace =
190 + lkb->lkb_resource->res_ls->ls_global_id;
191 + req.rr_status = lkb->lkb_retstatus;
192 + req.rr_remlkid = lkb->lkb_remid;
193 + req.rr_rqmode = rqmode;
195 + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
196 + lkb->lkb_resource->res_ls->ls_allocation);
197 + } else if (lkb->lkb_retstatus == -EDEADLOCK) {
199 + * We only queue remote Completion ASTs here for error
200 + * completions that happen out of band.
201 + * DEADLOCK is one such.
203 + req.rr_header.rh_cmd = GDLM_REMCMD_SENDCAST;
204 + req.rr_header.rh_length = sizeof(req);
205 + req.rr_header.rh_flags = 0;
206 + req.rr_header.rh_lkid = lkb->lkb_id;
207 + req.rr_header.rh_lockspace =
208 + lkb->lkb_resource->res_ls->ls_global_id;
209 + req.rr_status = lkb->lkb_retstatus;
210 + req.rr_remlkid = lkb->lkb_remid;
211 + req.rr_rqmode = rqmode;
213 + midcomms_send_message(lkb->lkb_nodeid, &req.rr_header,
214 + lkb->lkb_resource->res_ls->ls_allocation);
218 + * Prepare info that will be returned in ast/bast.
221 + if (flags & AST_BAST) {
222 + lkb->lkb_bastmode = rqmode;
224 + lkb->lkb_lksb->sb_status = lkb->lkb_retstatus;
225 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
226 + lkb->lkb_lksb->sb_flags = DLM_SBF_DEMOTED;
228 + lkb->lkb_lksb->sb_flags = 0;
231 + down(&ast_queue_lock);
232 + if (!(lkb->lkb_astflags & (AST_COMP | AST_BAST)))
233 + list_add_tail(&lkb->lkb_astqueue, &ast_queue);
234 + lkb->lkb_astflags |= flags;
235 + up(&ast_queue_lock);
237 + /* It is the responsibility of the caller to call wake_astd()
238 + * after it has finished other locking operations that request
239 + * the ASTs to be delivered after */
244 + * Process any LKBs on the AST queue.
247 +static void process_asts(void)
250 + struct dlm_rsb *rsb;
251 + struct dlm_lkb *lkb;
252 + void (*cast) (long param);
253 + void (*bast) (long param, int mode);
258 + down(&ast_queue_lock);
259 + if (list_empty(&ast_queue)) {
260 + up(&ast_queue_lock);
264 + lkb = list_entry(ast_queue.next, struct dlm_lkb, lkb_astqueue);
265 + list_del(&lkb->lkb_astqueue);
266 + flags = lkb->lkb_astflags;
267 + lkb->lkb_astflags = 0;
268 + up(&ast_queue_lock);
270 + cast = lkb->lkb_astaddr;
271 + bast = lkb->lkb_bastaddr;
272 + astparam = lkb->lkb_astparam;
273 + rsb = lkb->lkb_resource;
276 + if (flags & AST_COMP) {
277 + if (flags & AST_DEL) {
278 + DLM_ASSERT(lkb->lkb_astflags == 0,);
280 + /* FIXME: we don't want to block asts for other
281 + lockspaces while one is being recovered */
283 + down_read(&ls->ls_in_recovery);
284 + release_lkb(ls, lkb);
286 + up_read(&ls->ls_in_recovery);
290 +#ifdef CONFIG_DLM_STATS
297 + if (flags & AST_BAST && !(flags & AST_DEL)) {
298 + int bmode = lkb->lkb_bastmode;
300 + /* gr or rq mode of the lock may have changed since the
301 + ast was queued making the delivery unnecessary */
303 + if (!bast || dlm_modes_compat(lkb->lkb_grmode, bmode))
306 + if (lkb->lkb_rqmode == DLM_LOCK_IV ||
307 + !dlm_modes_compat(lkb->lkb_rqmode, bmode)) {
308 + bast(astparam, bmode);
309 +#ifdef CONFIG_DLM_STATS
319 +void lockqueue_lkb_mark(struct dlm_ls *ls)
321 + struct dlm_lkb *lkb, *safe;
324 + log_all(ls, "mark waiting requests");
326 + down(&_lockqueue_lock);
328 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
330 + if (lkb->lkb_resource->res_ls != ls)
333 + log_debug(ls, "mark %x lq %d nodeid %d", lkb->lkb_id,
334 + lkb->lkb_lockqueue_state, lkb->lkb_nodeid);
337 + * These lkb's are new and the master is being looked up. Mark
338 + * the lkb request to be resent. Even if the destination node
339 + * for the request is still living and has our request, it will
340 + * purge all resdir requests in purge_requestqueue. If there's
341 + * a reply to the LOOKUP request in our requestqueue (the reply
342 + * arrived after ls_stop), it is invalid and will be discarded
343 + * in purge_requestqueue, too.
346 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
347 + DLM_ASSERT(lkb->lkb_nodeid == -1,
349 + print_rsb(lkb->lkb_resource););
351 + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
357 + * We're waiting for an unlock reply and the master node from
358 + * whom we're expecting the reply has failed. If there's a
359 + * reply in the requestqueue do nothing and process it later in
360 + * process_requestqueue. If there's no reply, don't rebuild
361 + * the lkb on a new master, but just assume we've gotten an
362 + * unlock completion reply from the prev master (this also
363 + * means not resending the unlock request). If the unlock is
364 + * for the last lkb on the rsb, the rsb has nodeid of -1 and
365 + * the rsb won't be rebuilt on the new master either.
367 + * If we're waiting for an unlock reply and the master node is
368 + * still alive, we should either have a reply in the
369 + * requestqueue from the master already, or we should get one
370 + * from the master once recovery is complete. There is no
371 + * rebuilding of the rsb/lkb in this case and no resending of
375 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_UNLOCK) {
376 + if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
377 + if (reply_in_requestqueue(ls, lkb->lkb_id)) {
378 + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
379 + log_debug(ls, "mark %x unlock have rep",
382 + /* assume we got reply fr old master */
383 + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
384 + lkb->lkb_flags |= GDLM_LKFLG_UNLOCKDONE;
385 + log_debug(ls, "mark %x unlock no rep",
394 + * These lkb's have an outstanding request to a bygone node.
395 + * The request will be redirected to the new master node in
396 + * resend_cluster_requests(). Don't mark the request for
397 + * resending if there's a reply for it saved in the
401 + if (in_nodes_gone(ls, lkb->lkb_nodeid) &&
402 + !reply_in_requestqueue(ls, lkb->lkb_id)) {
404 + lkb->lkb_flags |= GDLM_LKFLG_LQRESEND;
407 + * Don't rebuild this lkb on a new rsb in
408 + * rebuild_rsbs_send().
411 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONDGRANT) {
412 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_WAITING,
414 + print_rsb(lkb->lkb_resource););
415 + lkb->lkb_flags |= GDLM_LKFLG_NOREBUILD;
419 + * This flag indicates to the new master that his lkb
420 + * is in the midst of a convert request and should be
421 + * placed on the granted queue rather than the convert
422 + * queue. We will resend this convert request to the
426 + else if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_CONVERT) {
427 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,
429 + print_rsb(lkb->lkb_resource););
430 + lkb->lkb_flags |= GDLM_LKFLG_LQCONVERT;
436 + up(&_lockqueue_lock);
438 + log_all(ls, "marked %d requests", count);
441 +int resend_cluster_requests(struct dlm_ls *ls)
443 + struct dlm_lkb *lkb, *safe;
445 + int error = 0, state, count = 0;
447 + log_all(ls, "resend marked requests");
449 + down(&_lockqueue_lock);
451 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
453 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
454 + log_debug(ls, "resend_cluster_requests: aborted");
459 + r = lkb->lkb_resource;
461 + if (r->res_ls != ls)
464 + log_debug(ls, "resend %x lq %d flg %x node %d/%d \"%s\"",
465 + lkb->lkb_id, lkb->lkb_lockqueue_state, lkb->lkb_flags,
466 + lkb->lkb_nodeid, r->res_nodeid, r->res_name);
468 + if (lkb->lkb_flags & GDLM_LKFLG_UNLOCKDONE) {
469 + log_debug(ls, "unlock done %x", lkb->lkb_id);
470 + list_del(&lkb->lkb_lockqueue);
471 + res_lkb_dequeue(lkb);
472 + lkb->lkb_retstatus = -DLM_EUNLOCK;
473 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
479 + * Resend/process the lockqueue lkb's (in-progress requests)
480 + * that were flagged at the start of recovery in
481 + * lockqueue_lkb_mark().
484 + if (lkb->lkb_flags & GDLM_LKFLG_LQRESEND) {
485 + lkb->lkb_flags &= ~GDLM_LKFLG_LQRESEND;
486 + lkb->lkb_flags &= ~GDLM_LKFLG_NOREBUILD;
487 + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
489 + if (lkb->lkb_nodeid == -1) {
491 + * Send lookup to new resdir node.
493 + lkb->lkb_lockqueue_time = jiffies;
494 + send_cluster_request(lkb,
495 + lkb->lkb_lockqueue_state);
498 + else if (lkb->lkb_nodeid != 0) {
500 + * There's a new RSB master (that's not us.)
502 + lkb->lkb_lockqueue_time = jiffies;
503 + send_cluster_request(lkb,
504 + lkb->lkb_lockqueue_state);
509 + * We are the new RSB master for this lkb
512 + state = lkb->lkb_lockqueue_state;
513 + lkb->lkb_lockqueue_state = 0;
514 + /* list_del equals remove_from_lockqueue() */
515 + list_del(&lkb->lkb_lockqueue);
516 + process_remastered_lkb(ls, lkb, state);
522 + up(&_lockqueue_lock);
524 + log_all(ls, "resent %d requests", count);
529 + * Process any LKBs on the Lock queue, this
530 + * just looks at the entries to see if they have been
531 + * on the queue too long and fails the requests if so.
534 +static void process_lockqueue(void)
536 + struct dlm_lkb *lkb, *safe;
540 + down(&_lockqueue_lock);
542 + list_for_each_entry_safe(lkb, safe, &_lockqueue, lkb_lockqueue) {
543 + ls = lkb->lkb_resource->res_ls;
545 + if (test_bit(LSFL_NOTIMERS, &ls->ls_flags))
548 + /* Don't time out locks that are in transition */
549 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
552 + if (check_timeout(lkb->lkb_lockqueue_time,
553 + dlm_config.lock_timeout)) {
555 + list_del(&lkb->lkb_lockqueue);
556 + up(&_lockqueue_lock);
557 + cancel_lockop(lkb, -ETIMEDOUT);
558 + down(&_lockqueue_lock);
561 + up(&_lockqueue_lock);
566 + mod_timer(&_lockqueue_timer,
567 + jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
570 +/* Look for deadlocks */
571 +static void process_deadlockqueue(void)
573 + struct dlm_lkb *lkb, *safe;
575 + down(&_deadlockqueue_lock);
577 + list_for_each_entry_safe(lkb, safe, &_deadlockqueue, lkb_deadlockq) {
578 + struct dlm_lkb *kill_lkb;
580 + /* Only look at "due" locks */
581 + if (!check_timeout(lkb->lkb_duetime, dlm_config.deadlocktime))
584 + /* Don't look at locks that are in transition */
585 + if (!test_bit(LSFL_LS_RUN,
586 + &lkb->lkb_resource->res_ls->ls_flags))
589 + up(&_deadlockqueue_lock);
591 + /* Lock has hit due time, check for conversion deadlock */
592 + kill_lkb = conversion_deadlock_check(lkb);
594 + cancel_conversion(kill_lkb, -EDEADLOCK);
596 + down(&_deadlockqueue_lock);
598 + up(&_deadlockqueue_lock);
601 +static __inline__ int no_asts(void)
605 + down(&ast_queue_lock);
606 + ret = list_empty(&ast_queue);
607 + up(&ast_queue_lock);
611 +static void lockqueue_timer_fn(unsigned long arg)
613 + set_bit(WAKE_TIMER, &astd_wakeflags);
614 + wake_up(&astd_waitchan);
618 + * DLM daemon which delivers asts.
621 +static int dlm_astd(void *data)
624 + * Set a timer to check the lockqueue for dead locks (and deadlocks).
626 + INIT_LIST_HEAD(&_lockqueue);
627 + init_MUTEX(&_lockqueue_lock);
628 + INIT_LIST_HEAD(&_deadlockqueue);
629 + init_MUTEX(&_deadlockqueue_lock);
630 + init_timer(&_lockqueue_timer);
631 + _lockqueue_timer.function = lockqueue_timer_fn;
632 + _lockqueue_timer.data = 0;
633 + mod_timer(&_lockqueue_timer,
634 + jiffies + ((dlm_config.lock_timeout >> 1) * HZ));
636 + while (!kthread_should_stop()) {
637 + wchan_cond_sleep_intr(astd_waitchan, !test_bit(WAKE_ASTS, &astd_wakeflags));
639 + if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
642 + if (test_and_clear_bit(WAKE_TIMER, &astd_wakeflags)) {
643 + process_lockqueue();
644 + if (dlm_config.deadlocktime)
645 + process_deadlockqueue();
649 + if (timer_pending(&_lockqueue_timer))
650 + del_timer(&_lockqueue_timer);
655 +void wake_astd(void)
658 + set_bit(WAKE_ASTS, &astd_wakeflags);
659 + wake_up(&astd_waitchan);
663 +int astd_start(void)
665 + struct task_struct *p;
668 + INIT_LIST_HEAD(&ast_queue);
669 + init_MUTEX(&ast_queue_lock);
670 + init_waitqueue_head(&astd_waitchan);
672 + p = kthread_run(dlm_astd, NULL, 0, "dlm_astd");
674 + error = PTR_ERR(p);
680 +void astd_stop(void)
682 + kthread_stop(astd_task);
683 + wake_up(&astd_waitchan);
685 diff -urN linux-orig/cluster/dlm/ast.h linux-patched/cluster/dlm/ast.h
686 --- linux-orig/cluster/dlm/ast.h 1970-01-01 07:30:00.000000000 +0730
687 +++ linux-patched/cluster/dlm/ast.h 2004-11-03 11:31:56.000000000 +0800
689 +/******************************************************************************
690 +*******************************************************************************
692 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
693 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
695 +** This copyrighted material is made available to anyone wishing to use,
696 +** modify, copy, or redistribute it subject to the terms and conditions
697 +** of the GNU General Public License v.2.
699 +*******************************************************************************
700 +******************************************************************************/
702 +#ifndef __AST_DOT_H__
703 +#define __AST_DOT_H__
705 +void lockqueue_lkb_mark(struct dlm_ls *ls);
706 +int resend_cluster_requests(struct dlm_ls *ls);
707 +void add_to_lockqueue(struct dlm_lkb *lkb);
708 +void remove_from_lockqueue(struct dlm_lkb *lkb);
709 +void add_to_deadlockqueue(struct dlm_lkb *lkb);
710 +void remove_from_deadlockqueue(struct dlm_lkb *lkb);
711 +void queue_ast(struct dlm_lkb *lkb, uint16_t astflags, uint8_t rqmode);
712 +void wake_astd(void);
713 +int astd_start(void);
714 +void astd_stop(void);
716 +#endif /* __AST_DOT_H__ */
717 diff -urN linux-orig/cluster/dlm/config.c linux-patched/cluster/dlm/config.c
718 --- linux-orig/cluster/dlm/config.c 1970-01-01 07:30:00.000000000 +0730
719 +++ linux-patched/cluster/dlm/config.c 2004-11-03 11:31:56.000000000 +0800
721 +/******************************************************************************
722 +*******************************************************************************
724 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
725 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
727 +** This copyrighted material is made available to anyone wishing to use,
728 +** modify, copy, or redistribute it subject to the terms and conditions
729 +** of the GNU General Public License v.2.
731 +*******************************************************************************
732 +******************************************************************************/
734 +#include <linux/module.h>
735 +#include <linux/proc_fs.h>
737 +#include "dlm_internal.h"
738 +#include "lowcomms.h"
741 +/* Config file defaults */
742 +#define DEFAULT_TCP_PORT 21064
743 +#define DEFAULT_LOCK_TIMEOUT 30
744 +#define DEFAULT_BUFFER_SIZE 4096
745 +#define DEFAULT_RSBTBL_SIZE 256
746 +#define DEFAULT_LKBTBL_SIZE 1024
747 +#define DEFAULT_DIRTBL_SIZE 512
748 +#define DEFAULT_CONN_INCREMENT 32
749 +#define DEFAULT_DEADLOCKTIME 10
750 +#define DEFAULT_RECOVER_TIMER 5
752 +struct config_info dlm_config = {
753 + .tcp_port = DEFAULT_TCP_PORT,
754 + .lock_timeout = DEFAULT_LOCK_TIMEOUT,
755 + .buffer_size = DEFAULT_BUFFER_SIZE,
756 + .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
757 + .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
758 + .dirtbl_size = DEFAULT_DIRTBL_SIZE,
759 + .conn_increment = DEFAULT_CONN_INCREMENT,
760 + .deadlocktime = DEFAULT_DEADLOCKTIME,
761 + .recover_timer = DEFAULT_RECOVER_TIMER
765 +static struct config_proc_info {
770 + .name = "tcp_port",
771 + .value = &dlm_config.tcp_port,
774 + .name = "lock_timeout",
775 + .value = &dlm_config.lock_timeout,
778 + .name = "buffer_size",
779 + .value = &dlm_config.buffer_size,
782 + .name = "rsbtbl_size",
783 + .value = &dlm_config.rsbtbl_size,
786 + .name = "lkbtbl_size",
787 + .value = &dlm_config.lkbtbl_size,
790 + .name = "dirtbl_size",
791 + .value = &dlm_config.dirtbl_size,
794 + .name = "conn_increment",
795 + .value = &dlm_config.conn_increment,
798 + .name = "deadlocktime",
799 + .value = &dlm_config.deadlocktime,
802 + .name = "recover_timer",
803 + .value = &dlm_config.recover_timer,
806 +static struct proc_dir_entry *dlm_dir;
808 +static int dlm_config_read_proc(char *page, char **start, off_t off, int count,
809 + int *eof, void *data)
811 + struct config_proc_info *cinfo = data;
812 + return snprintf(page, count, "%d\n", *cinfo->value);
815 +static int dlm_config_write_proc(struct file *file, const char *buffer,
816 + unsigned long count, void *data)
818 + struct config_proc_info *cinfo = data;
822 + value = simple_strtoul(buffer, &end, 10);
824 + *cinfo->value = value;
828 +int dlm_config_init(void)
831 + struct proc_dir_entry *pde;
833 + dlm_dir = proc_mkdir("cluster/config/dlm", 0);
837 + dlm_dir->owner = THIS_MODULE;
839 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++) {
840 + pde = create_proc_entry(config_proc[i].name, 0660, dlm_dir);
842 + pde->data = &config_proc[i];
843 + pde->write_proc = dlm_config_write_proc;
844 + pde->read_proc = dlm_config_read_proc;
850 +void dlm_config_exit(void)
854 + for (i=0; i<sizeof(config_proc)/sizeof(struct config_proc_info); i++)
855 + remove_proc_entry(config_proc[i].name, dlm_dir);
856 + remove_proc_entry("cluster/config/dlm", NULL);
858 diff -urN linux-orig/cluster/dlm/config.h linux-patched/cluster/dlm/config.h
859 --- linux-orig/cluster/dlm/config.h 1970-01-01 07:30:00.000000000 +0730
860 +++ linux-patched/cluster/dlm/config.h 2004-11-03 11:31:56.000000000 +0800
862 +/******************************************************************************
863 +*******************************************************************************
865 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
866 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
868 +** This copyrighted material is made available to anyone wishing to use,
869 +** modify, copy, or redistribute it subject to the terms and conditions
870 +** of the GNU General Public License v.2.
872 +*******************************************************************************
873 +******************************************************************************/
875 +#ifndef __CONFIG_DOT_H__
876 +#define __CONFIG_DOT_H__
878 +struct config_info {
885 + int conn_increment;
890 +extern struct config_info dlm_config;
891 +extern int dlm_config_init(void);
892 +extern void dlm_config_exit(void);
894 +#endif /* __CONFIG_DOT_H__ */
895 diff -urN linux-orig/cluster/dlm/device.c linux-patched/cluster/dlm/device.c
896 --- linux-orig/cluster/dlm/device.c 1970-01-01 07:30:00.000000000 +0730
897 +++ linux-patched/cluster/dlm/device.c 2004-11-03 11:31:56.000000000 +0800
899 +/******************************************************************************
900 +*******************************************************************************
902 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
903 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
905 +** This copyrighted material is made available to anyone wishing to use,
906 +** modify, copy, or redistribute it subject to the terms and conditions
907 +** of the GNU General Public License v.2.
909 +*******************************************************************************
910 +******************************************************************************/
915 + * This is the userland interface to the DLM.
917 + * The locking is done via a misc char device (find the
918 + * registered minor number in /proc/misc).
920 + * User code should not use this interface directly but
921 + * call the library routines in libdlm.a instead.
925 +#include <linux/miscdevice.h>
926 +#include <linux/init.h>
927 +#include <linux/wait.h>
928 +#include <linux/module.h>
929 +#include <linux/file.h>
930 +#include <linux/fs.h>
931 +#include <linux/poll.h>
932 +#include <linux/signal.h>
933 +#include <linux/spinlock.h>
934 +#include <asm/ioctls.h>
936 +#include "dlm_internal.h"
939 +extern struct dlm_lkb *dlm_get_lkb(struct dlm_ls *, int);
940 +static struct file_operations _dlm_fops;
941 +static const char *name_prefix="dlm";
942 +static struct list_head user_ls_list;
943 +static struct semaphore user_ls_lock;
945 +/* Flags in li_flags */
946 +#define LI_FLAG_COMPLETE 1
947 +#define LI_FLAG_FIRSTLOCK 2
949 +#define LOCKINFO_MAGIC 0x53595324
954 + struct dlm_lksb li_lksb;
955 + wait_queue_head_t li_waitq;
956 + unsigned long li_flags;
957 + void __user *li_castparam;
958 + void __user *li_castaddr;
959 + void __user *li_bastparam;
960 + void __user *li_bastaddr;
961 + void __user *li_pend_bastparam;
962 + void __user *li_pend_bastaddr;
963 + void __user *li_user_lvbptr;
964 + struct list_head li_ownerqueue;
965 + struct file_info *li_file;
966 + struct dlm_lksb __user *li_user_lksb;
967 + struct semaphore li_firstlock;
968 + struct dlm_queryinfo *li_queryinfo;
969 + struct dlm_queryinfo __user *li_user_queryinfo;
972 +/* A queued AST no less */
974 + struct dlm_lock_result result;
975 + struct dlm_queryinfo *queryinfo;
976 + struct dlm_queryinfo __user *user_queryinfo;
977 + struct list_head list;
978 + void __user *user_lvbptr;
979 + uint32_t ast_reason; /* AST_COMP or AST_BAST from dlm_internal.h */
982 +/* One of these per userland lockspace */
984 + void *ls_lockspace;
985 + atomic_t ls_refcnt;
986 + long ls_flags; /* bit 1 means LS has been deleted */
988 + /* Passed into misc_register() */
989 + struct miscdevice ls_miscinfo;
990 + struct list_head ls_list;
993 +/* misc_device info for the control device */
994 +static struct miscdevice ctl_device;
997 + * Stuff we hang off the file struct.
998 + * The first two are to cope with unlocking all the
999 + * locks held by a process when it dies.
1002 + struct list_head fi_lkb_list; /* List of active lkbs */
1003 + spinlock_t fi_lkb_lock;
1004 + struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
1005 + spinlock_t fi_ast_lock;
1006 + wait_queue_head_t fi_wait;
1007 + struct user_ls *fi_ls;
1008 + atomic_t fi_refcnt; /* Number of users */
1009 + unsigned long fi_flags; /* Bit 1 means the device is open */
1013 +/* get and put ops for file_info.
1014 + Actually I don't really like "get" and "put", but everyone
1015 + else seems to use them and I can't think of anything
1016 + nicer at the moment */
1017 +static void get_file_info(struct file_info *f)
1019 + atomic_inc(&f->fi_refcnt);
1022 +static void put_file_info(struct file_info *f)
1024 + if (atomic_dec_and_test(&f->fi_refcnt))
1028 +static void release_lockinfo(struct lock_info *li)
1030 + put_file_info(li->li_file);
1031 + if (li->li_lksb.sb_lvbptr && li->li_cmd != DLM_USER_QUERY)
1032 + kfree(li->li_lksb.sb_lvbptr);
1036 +static struct user_ls *__find_lockspace(int minor)
1038 + struct user_ls *lsinfo;
1040 + list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
1042 + if (lsinfo->ls_miscinfo.minor == minor)
1048 +/* Find a lockspace struct given the device minor number */
1049 +static struct user_ls *find_lockspace(int minor)
1051 + struct user_ls *lsinfo;
1053 + down(&user_ls_lock);
1054 + lsinfo = __find_lockspace(minor);
1055 + up(&user_ls_lock);
1060 +static void add_lockspace_to_list(struct user_ls *lsinfo)
1062 + down(&user_ls_lock);
1063 + list_add(&lsinfo->ls_list, &user_ls_list);
1064 + up(&user_ls_lock);
1067 +/* Register a lockspace with the DLM and create a misc
1068 + device for userland to access it */
1069 +static int register_lockspace(char *name, struct user_ls **ls)
1071 + struct user_ls *newls;
1075 + namelen = strlen(name)+strlen(name_prefix)+2;
1077 + newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
1080 + memset(newls, 0, sizeof(struct user_ls));
1082 + newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
1083 + if (!newls->ls_miscinfo.name) {
1087 + status = dlm_new_lockspace(name, strlen(name),
1088 + &newls->ls_lockspace, 0);
1090 + if (status != 0) {
1091 + kfree(newls->ls_miscinfo.name);
1096 + snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", name_prefix, name);
1098 + newls->ls_miscinfo.fops = &_dlm_fops;
1099 + newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
1101 + status = misc_register(&newls->ls_miscinfo);
1103 + log_print("failed to register misc device for %s", name);
1104 + dlm_release_lockspace(newls->ls_lockspace, 0);
1105 + kfree(newls->ls_miscinfo.name);
1111 + add_lockspace_to_list(newls);
1116 +/* Called with the user_ls_lock semaphore held */
1117 +static int unregister_lockspace(struct user_ls *lsinfo, int force)
1121 + status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
1125 + status = misc_deregister(&lsinfo->ls_miscinfo);
1129 + list_del(&lsinfo->ls_list);
1130 + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
1131 + lsinfo->ls_lockspace = NULL;
1132 + if (atomic_dec_and_test(&lsinfo->ls_refcnt)) {
1133 + kfree(lsinfo->ls_miscinfo.name);
1140 +/* Add it to userland's AST queue */
1141 +static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam, uint32_t reason)
1143 + struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
1147 + ast->result.astparam = astparam;
1148 + ast->result.astaddr = astaddr;
1149 + ast->result.user_lksb = li->li_user_lksb;
1150 + ast->result.cmd = li->li_cmd;
1151 + ast->user_lvbptr = li->li_user_lvbptr;
1152 + ast->ast_reason = reason;
1153 + memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
1155 + /* These two will both be NULL for anything other than queries */
1156 + ast->queryinfo = li->li_queryinfo;
1157 + ast->user_queryinfo = li->li_user_queryinfo;
1159 + spin_lock(&li->li_file->fi_ast_lock);
1160 + list_add_tail(&ast->list, &li->li_file->fi_ast_list);
1161 + spin_unlock(&li->li_file->fi_ast_lock);
1162 + wake_up_interruptible(&li->li_file->fi_wait);
1165 +static void bast_routine(void *param, int mode)
1167 + struct lock_info *li = param;
1169 + if (li && li->li_bastaddr) {
1170 + add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, AST_BAST);
1175 + * This is the kernel's AST routine.
1176 + * All lock, unlock & query operations complete here.
1177 + * The only synchronous ops are those done during device close.
1179 +static void ast_routine(void *param)
1181 + struct lock_info *li = param;
1183 + /* Param may be NULL if a persistent lock is unlocked by someone else */
1187 + /* If this is a successful conversion then activate the blocking ast
1188 + * args from the conversion request */
1189 + if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1190 + li->li_lksb.sb_status == 0) {
1192 + li->li_bastparam = li->li_pend_bastparam;
1193 + li->li_bastaddr = li->li_pend_bastaddr;
1194 + li->li_pend_bastaddr = NULL;
1197 + /* If it's an async request then post data to the user's AST queue. */
1198 + if (li->li_castaddr) {
1200 + /* Only queue AST if the device is still open */
1201 + if (test_bit(1, &li->li_file->fi_flags))
1202 + add_to_astqueue(li, li->li_castaddr, li->li_castparam, AST_COMP);
1204 + /* If it's a new lock operation that failed, then
1205 + * remove it from the owner queue and free the
1206 + * lock_info. The DLM will not free the LKB until this
1207 + * AST has completed.
1209 + if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
1210 + li->li_lksb.sb_status != 0) {
1211 + struct dlm_lkb *lkb;
1213 + /* Wait till dlm_lock() has finished */
1214 + down(&li->li_firstlock);
1215 + up(&li->li_firstlock);
1217 + /* If the LKB has been freed then we need to tidy up too */
1218 + lkb = dlm_get_lkb(li->li_file->fi_ls->ls_lockspace, li->li_lksb.sb_lkid);
1220 + spin_lock(&li->li_file->fi_lkb_lock);
1221 + list_del(&li->li_ownerqueue);
1222 + spin_unlock(&li->li_file->fi_lkb_lock);
1224 + release_lockinfo(li);
1228 + /* Free unlocks & queries */
1229 + if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
1230 + li->li_cmd == DLM_USER_QUERY) {
1231 + release_lockinfo(li);
1235 + /* Synchronous request, just wake up the caller */
1236 + set_bit(LI_FLAG_COMPLETE, &li->li_flags);
1237 + wake_up_interruptible(&li->li_waitq);
1242 + * Wait for the lock op to complete and return the status.
1244 +static int wait_for_ast(struct lock_info *li)
1246 + /* Wait for the AST routine to complete */
1247 + set_task_state(current, TASK_INTERRUPTIBLE);
1248 + while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
1251 + set_task_state(current, TASK_RUNNING);
1253 + return li->li_lksb.sb_status;
1257 +/* Open on control device */
1258 +static int dlm_ctl_open(struct inode *inode, struct file *file)
1263 +/* Close on control device */
1264 +static int dlm_ctl_close(struct inode *inode, struct file *file)
1269 +/* Open on lockspace device */
1270 +static int dlm_open(struct inode *inode, struct file *file)
1272 + struct file_info *f;
1273 + struct user_ls *lsinfo;
1275 + lsinfo = find_lockspace(iminor(inode));
1279 + f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
1283 + atomic_inc(&lsinfo->ls_refcnt);
1284 + INIT_LIST_HEAD(&f->fi_lkb_list);
1285 + INIT_LIST_HEAD(&f->fi_ast_list);
1286 + spin_lock_init(&f->fi_ast_lock);
1287 + spin_lock_init(&f->fi_lkb_lock);
1288 + init_waitqueue_head(&f->fi_wait);
1289 + f->fi_ls = lsinfo;
1290 + atomic_set(&f->fi_refcnt, 1);
1291 + set_bit(1, &f->fi_flags);
1293 + file->private_data = f;
1298 +/* Check the user's version matches ours */
1299 +static int check_version(struct dlm_lock_params *params)
1301 + if (params->version[0] != DLM_DEVICE_VERSION_MAJOR ||
1302 + (params->version[0] == DLM_DEVICE_VERSION_MAJOR &&
1303 + params->version[1] > DLM_DEVICE_VERSION_MINOR)) {
1305 + log_print("version mismatch user (%d.%d.%d) kernel (%d.%d.%d)",
1306 + params->version[0],
1307 + params->version[1],
1308 + params->version[2],
1309 + DLM_DEVICE_VERSION_MAJOR,
1310 + DLM_DEVICE_VERSION_MINOR,
1311 + DLM_DEVICE_VERSION_PATCH);
1317 +/* Close on lockspace device */
1318 +static int dlm_close(struct inode *inode, struct file *file)
1320 + struct file_info *f = file->private_data;
1321 + struct lock_info li;
1322 + struct lock_info *old_li, *safe;
1325 + struct user_ls *lsinfo;
1326 + DECLARE_WAITQUEUE(wq, current);
1328 + lsinfo = find_lockspace(iminor(inode));
1332 + /* Mark this closed so that ASTs will not be delivered any more */
1333 + clear_bit(1, &f->fi_flags);
1335 + /* Block signals while we are doing this */
1336 + sigfillset(&allsigs);
1337 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
1339 + /* We use our own lock_info struct here, so that any
1340 + * outstanding "real" ASTs will be delivered with the
1341 + * corresponding "real" params, thus freeing the lock_info
1342 + * that belongs to the lock. This catches the corner case where
1343 + * a lock is BUSY when we try to unlock it here
1345 + memset(&li, 0, sizeof(li));
1346 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1347 + init_waitqueue_head(&li.li_waitq);
1348 + add_wait_queue(&li.li_waitq, &wq);
1351 + * Free any outstanding locks, they are on the
1352 + * list in LIFO order so there should be no problems
1353 + * about unlocking parents before children.
1354 + * Although we don't remove the lkbs from the list here
1355 + * (what would be the point?), foreach_safe is needed
1356 + * because the lkbs are freed during dlm_unlock operations
1358 + list_for_each_entry_safe(old_li, safe, &f->fi_lkb_list, li_ownerqueue) {
1362 + struct dlm_lkb *lkb;
1364 + lkb = dlm_get_lkb(f->fi_ls->ls_lockspace, old_li->li_lksb.sb_lkid);
1366 + /* Don't unlock persistent locks */
1367 + if (lkb->lkb_flags & GDLM_LKFLG_PERSISTENT) {
1368 + list_del(&old_li->li_ownerqueue);
1370 + /* Update master copy */
1371 + if (lkb->lkb_resource->res_nodeid) {
1372 + li.li_lksb.sb_lkid = lkb->lkb_id;
1373 + status = dlm_lock(f->fi_ls->ls_lockspace,
1374 + lkb->lkb_grmode, &li.li_lksb,
1375 + DLM_LKF_CONVERT|DLM_LKF_ORPHAN,
1376 + NULL, 0, 0, ast_routine, &li,
1379 + wait_for_ast(&li);
1381 + lkb->lkb_flags |= GDLM_LKFLG_ORPHAN;
1383 + /* But tidy our references in it */
1385 + lkb->lkb_astparam = (long)NULL;
1391 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1393 + /* If it's not granted then cancel the request.
1394 + * If the lock was WAITING then it will be dropped,
1395 + * if it was converting then it will be reverted to GRANTED,
1396 + * then we will unlock it.
1398 + lock_status = lkb->lkb_status;
1400 + if (lock_status != GDLM_LKSTS_GRANTED)
1401 + flags = DLM_LKF_CANCEL;
1403 + if (lkb->lkb_grmode >= DLM_LOCK_PW)
1404 + flags |= DLM_LKF_IVVALBLK;
1406 + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
1408 + /* Must wait for it to complete as the next lock could be its
1411 + wait_for_ast(&li);
1413 + /* If it was waiting for a conversion, it will
1414 + now be granted so we can unlock it properly */
1415 + if (lock_status == GDLM_LKSTS_CONVERT) {
1416 + flags &= ~DLM_LKF_CANCEL;
1417 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
1418 + status = dlm_unlock(f->fi_ls->ls_lockspace, lkb->lkb_id, flags, &li.li_lksb, &li);
1421 + wait_for_ast(&li);
1423 + /* Unlock succeeded, free the lock_info struct. */
1424 + if (status == 0) {
1430 + remove_wait_queue(&li.li_waitq, &wq);
1432 + /* If this is the last reference, and the lockspace has been deleted
1433 + then free the struct */
1434 + if (atomic_dec_and_test(&lsinfo->ls_refcnt) && !lsinfo->ls_lockspace) {
1435 + kfree(lsinfo->ls_miscinfo.name);
1439 + /* Restore signals */
1440 + sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1441 + recalc_sigpending();
1447 + * ioctls to create/remove lockspaces, and check how many
1448 + * outstanding ASTs there are against a particular LS.
1450 +static int dlm_ioctl(struct inode *inode, struct file *file,
1451 + uint command, ulong u)
1453 + struct file_info *fi = file->private_data;
1454 + int status = -EINVAL;
1456 + struct list_head *tmp_list;
1458 + switch (command) {
1460 + /* Are there any ASTs for us to read?
1461 + * Warning, this returns the number of messages (ASTs)
1462 + * in the queue, NOT the number of bytes to read
1466 + spin_lock(&fi->fi_ast_lock);
1467 + list_for_each(tmp_list, &fi->fi_ast_list)
1469 + spin_unlock(&fi->fi_ast_lock);
1470 + status = put_user(count, (int *)u);
1481 + * ioctls to create/remove lockspaces.
1483 +static int dlm_ctl_ioctl(struct inode *inode, struct file *file,
1484 + uint command, ulong u)
1486 + int status = -EINVAL;
1487 + char ls_name[MAX_LS_NAME_LEN];
1488 + struct user_ls *lsinfo;
1491 + switch (command) {
1492 + case DLM_CREATE_LOCKSPACE:
1493 + if (!capable(CAP_SYS_ADMIN))
1496 + if (strncpy_from_user(ls_name, (char*)u, MAX_LS_NAME_LEN) < 0)
1498 + status = register_lockspace(ls_name, &lsinfo);
1500 + /* If it succeeded then return the minor number */
1502 + status = lsinfo->ls_miscinfo.minor;
1505 + case DLM_FORCE_RELEASE_LOCKSPACE:
1508 + case DLM_RELEASE_LOCKSPACE:
1509 + if (!capable(CAP_SYS_ADMIN))
1512 + down(&user_ls_lock);
1513 + lsinfo = __find_lockspace(u);
1515 + up(&user_ls_lock);
1519 + status = unregister_lockspace(lsinfo, force);
1520 + up(&user_ls_lock);
1530 +/* Deal with the messy stuff of copying a web of structs
1531 + from kernel space to userspace */
1532 +static int copy_query_result(struct ast_info *ast)
1534 + int status = -EFAULT;
1535 + struct dlm_queryinfo qi;
1537 + /* Get the pointers to userspace structs */
1538 + if (copy_from_user(&qi, ast->user_queryinfo,
1539 + sizeof(struct dlm_queryinfo)))
1542 + if (put_user(ast->queryinfo->gqi_lockcount,
1543 + &ast->user_queryinfo->gqi_lockcount))
1546 + if (qi.gqi_resinfo) {
1547 + if (copy_to_user(qi.gqi_resinfo, ast->queryinfo->gqi_resinfo,
1548 + sizeof(struct dlm_resinfo)))
1552 + if (qi.gqi_lockinfo) {
1553 + if (copy_to_user(qi.gqi_lockinfo, ast->queryinfo->gqi_lockinfo,
1554 + sizeof(struct dlm_lockinfo) * ast->queryinfo->gqi_lockcount))
1560 + if (ast->queryinfo->gqi_lockinfo)
1561 + kfree(ast->queryinfo->gqi_lockinfo);
1563 + if (ast->queryinfo->gqi_resinfo)
1564 + kfree(ast->queryinfo->gqi_resinfo);
1566 + kfree(ast->queryinfo);
1572 +/* Read call, might block if no ASTs are waiting.
1573 + * It will only ever return one message at a time, regardless
1574 + * of how many are pending.
1576 +static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
1578 + struct file_info *fi = file->private_data;
1579 + struct ast_info *ast;
1581 + DECLARE_WAITQUEUE(wait, current);
1583 + if (count < sizeof(struct dlm_lock_result))
1586 + spin_lock(&fi->fi_ast_lock);
1587 + if (list_empty(&fi->fi_ast_list)) {
1589 + /* No waiting ASTs.
1590 + * Return EOF if the lockspace has been deleted.
1592 + if (test_bit(1, &fi->fi_ls->ls_flags))
1595 + if (file->f_flags & O_NONBLOCK) {
1596 + spin_unlock(&fi->fi_ast_lock);
1600 + add_wait_queue(&fi->fi_wait, &wait);
1603 + set_current_state(TASK_INTERRUPTIBLE);
1604 + if (list_empty(&fi->fi_ast_list) &&
1605 + !signal_pending(current)) {
1607 + spin_unlock(&fi->fi_ast_lock);
1609 + spin_lock(&fi->fi_ast_lock);
1613 + current->state = TASK_RUNNING;
1614 + remove_wait_queue(&fi->fi_wait, &wait);
1616 + if (signal_pending(current)) {
1617 + spin_unlock(&fi->fi_ast_lock);
1618 + return -ERESTARTSYS;
1622 + ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
1623 + list_del(&ast->list);
1624 + spin_unlock(&fi->fi_ast_lock);
1626 + ret = sizeof(struct dlm_lock_result);
1627 + if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
1630 + if (ast->ast_reason == AST_COMP &&
1631 + ast->result.cmd == DLM_USER_LOCK && ast->user_lvbptr) {
1632 + if (copy_to_user(ast->user_lvbptr, ast->result.lksb.sb_lvbptr, DLM_LVB_LEN))
1636 + /* If it was a query then copy the result block back here */
1637 + if (ast->queryinfo) {
1638 + int status = copy_query_result(ast);
1647 +static unsigned int dlm_poll(struct file *file, poll_table *wait)
1649 + struct file_info *fi = file->private_data;
1651 + poll_wait(file, &fi->fi_wait, wait);
1653 + spin_lock(&fi->fi_ast_lock);
1654 + if (!list_empty(&fi->fi_ast_list)) {
1655 + spin_unlock(&fi->fi_ast_lock);
1656 + return POLLIN | POLLRDNORM;
1659 + spin_unlock(&fi->fi_ast_lock);
1663 +static int do_user_query(struct file_info *fi, struct dlm_lock_params *kparams)
1665 + struct lock_info *li;
1668 + if (!kparams->castaddr)
1671 + if (!kparams->lksb)
1674 + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1678 + get_file_info(fi);
1679 + li->li_user_lksb = kparams->lksb;
1680 + li->li_bastparam = kparams->bastparam;
1681 + li->li_bastaddr = kparams->bastaddr;
1682 + li->li_castparam = kparams->castparam;
1683 + li->li_castaddr = kparams->castaddr;
1686 + li->li_cmd = kparams->cmd;
1687 + clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1689 + if (copy_from_user(&li->li_lksb, kparams->lksb,
1690 + sizeof(struct dlm_lksb))) {
1694 + li->li_user_queryinfo = (struct dlm_queryinfo *)li->li_lksb.sb_lvbptr;
1696 + /* Allocate query structs */
1698 + li->li_queryinfo = kmalloc(sizeof(struct dlm_queryinfo), GFP_KERNEL);
1699 + if (!li->li_queryinfo)
1702 + /* Mainly to get gqi_lock buffer size */
1703 + if (copy_from_user(li->li_queryinfo, li->li_lksb.sb_lvbptr,
1704 + sizeof(struct dlm_queryinfo))) {
1709 + /* Overwrite userspace pointers we just copied with kernel space ones */
1710 + if (li->li_queryinfo->gqi_resinfo) {
1711 + li->li_queryinfo->gqi_resinfo = kmalloc(sizeof(struct dlm_resinfo), GFP_KERNEL);
1712 + if (!li->li_queryinfo->gqi_resinfo)
1715 + if (li->li_queryinfo->gqi_lockinfo) {
1716 + li->li_queryinfo->gqi_lockinfo =
1717 + kmalloc(sizeof(struct dlm_lockinfo) * li->li_queryinfo->gqi_locksize,
1719 + if (!li->li_queryinfo->gqi_lockinfo)
1723 + li->li_lksb.sb_lvbptr = (char *)li->li_queryinfo;
1725 + return dlm_query(fi->fi_ls->ls_lockspace, &li->li_lksb,
1726 + kparams->flags, /* query */
1731 + kfree(li->li_queryinfo);
1738 +static struct lock_info *allocate_lockinfo(struct file_info *fi, struct dlm_lock_params *kparams)
1740 + struct lock_info *li;
1742 + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
1744 + li->li_magic = LOCKINFO_MAGIC;
1746 + li->li_cmd = kparams->cmd;
1747 + li->li_queryinfo = NULL;
1749 + li->li_pend_bastparam = NULL;
1750 + li->li_pend_bastaddr = NULL;
1751 + li->li_lksb.sb_lvbptr = NULL;
1752 + li->li_bastaddr = kparams->bastaddr;
1753 + li->li_bastparam = kparams->bastparam;
1755 + get_file_info(fi);
1760 +static int do_user_lock(struct file_info *fi, struct dlm_lock_params *kparams,
1761 + const char *buffer)
1763 + struct lock_info *li;
1765 + char name[DLM_RESNAME_MAXLEN];
1769 + * Validate things that we need to have correct.
1771 + if (!kparams->castaddr)
1774 + if (!kparams->lksb)
1777 + if (!access_ok(VERIFY_WRITE, kparams->lksb, sizeof(struct dlm_lksb)))
1780 + /* Persistent child locks are not available yet */
1781 + if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent)
1784 + /* For conversions, the lock will already have a lock_info
1785 + block squirreled away in astparam
1786 + if (kparams->flags & DLM_LKF_CONVERT) {
1787 + struct dlm_lkb *lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1792 + li = (struct lock_info *)lkb->lkb_astparam;
1794 + /* li may be NULL if the lock was PERSISTENT and the process went
1795 + away, so we need to allocate a new one */
1797 + li = allocate_lockinfo(fi, kparams);
1799 + spin_lock(&fi->fi_lkb_lock);
1800 + list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1801 + spin_unlock(&fi->fi_lkb_lock);
1808 + if (li->li_magic != LOCKINFO_MAGIC)
1811 + /* For conversions don't overwrite the current blocking AST
1813 + a) if a blocking AST fires before the conversion is queued
1814 + it runs the current handler
1815 + b) if the conversion is cancelled, the original blocking AST
1816 + declaration is active
1817 + The pend_ info is made active when the conversion
1820 + li->li_pend_bastaddr = kparams->bastaddr;
1821 + li->li_pend_bastparam = kparams->bastparam;
1824 + li = allocate_lockinfo(fi, kparams);
1828 + /* Get the lock name */
1829 + if (copy_from_user(name, buffer + offsetof(struct dlm_lock_params, name),
1830 + kparams->namelen)) {
1834 + /* semaphore to allow us to complete our work before
1835 + the AST routine runs. In fact we only need (and use) this
1836 + when the initial lock fails */
1837 + init_MUTEX_LOCKED(&li->li_firstlock);
1838 + set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
1841 + li->li_user_lksb = kparams->lksb;
1842 + li->li_castaddr = kparams->castaddr;
1843 + li->li_castparam = kparams->castparam;
1845 + /* Copy the user's LKSB into kernel space,
1846 + needed for conversions & value block operations.
1847 + Save our kernel-space lvbptr first */
1848 + lvbptr = li->li_lksb.sb_lvbptr;
1849 + if (copy_from_user(&li->li_lksb, kparams->lksb, sizeof(struct dlm_lksb))) {
1853 + /* Store new userland LVBptr and restore kernel one */
1854 + li->li_user_lvbptr = li->li_lksb.sb_lvbptr;
1855 + li->li_lksb.sb_lvbptr = lvbptr;
1857 + /* Copy in the value block */
1858 + if (kparams->flags & DLM_LKF_VALBLK) {
1859 + if (!li->li_lksb.sb_lvbptr) {
1860 + li->li_lksb.sb_lvbptr = kmalloc(DLM_LVB_LEN, GFP_KERNEL);
1861 + if (!li->li_lksb.sb_lvbptr) {
1867 + if (copy_from_user(li->li_lksb.sb_lvbptr, kparams->lksb->sb_lvbptr,
1874 + li->li_user_lvbptr = NULL;
1878 + status = dlm_lock(fi->fi_ls->ls_lockspace, kparams->mode, &li->li_lksb,
1879 + kparams->flags, name, kparams->namelen,
1883 + (li->li_pend_bastaddr || li->li_bastaddr) ?
1884 + bast_routine : NULL,
1885 + kparams->range.ra_end ? &kparams->range : NULL);
1887 + /* If it succeeded (this far) with a new lock then keep track of
1888 + it on the file's lkb list */
1889 + if (!status && !(kparams->flags & DLM_LKF_CONVERT)) {
1891 + spin_lock(&fi->fi_lkb_lock);
1892 + list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1893 + spin_unlock(&fi->fi_lkb_lock);
1895 + up(&li->li_firstlock);
1897 + /* Copy the lkid back to userspace in case they want to cancel.
1898 + This address has already been tested so /should/ be OK, if not:
1899 + tough - we've taken the lock! */
1900 + copy_to_user(&kparams->lksb->sb_lkid,
1901 + &li->li_lksb.sb_lkid,
1902 + sizeof(li->li_lksb.sb_lkid));
1908 + if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) {
1910 + release_lockinfo(li);
1916 +static int do_user_unlock(struct file_info *fi, struct dlm_lock_params *kparams)
1918 + struct lock_info *li;
1919 + struct dlm_lkb *lkb;
1921 + int convert_cancel = 0;
1923 + lkb = dlm_get_lkb(fi->fi_ls->ls_lockspace, kparams->lkid);
1928 + /* Cancelling a conversion doesn't remove the lock...*/
1929 + if (kparams->flags & DLM_LKF_CANCEL &&
1930 + lkb->lkb_status == GDLM_LKSTS_CONVERT) {
1931 + convert_cancel = 1;
1934 + li = (struct lock_info *)lkb->lkb_astparam;
1936 + li = allocate_lockinfo(fi, kparams);
1937 + spin_lock(&fi->fi_lkb_lock);
1938 + list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1939 + spin_unlock(&fi->fi_lkb_lock);
1944 + if (li->li_magic != LOCKINFO_MAGIC)
1947 + li->li_user_lksb = kparams->lksb;
1948 + li->li_castparam = kparams->castparam;
1949 + li->li_cmd = kparams->cmd;
1951 + /* dlm_unlock() passes a 0 for castaddr which means don't overwrite
1952 + the existing li_castaddr as that's the completion routine for
1953 + unlocks. dlm_unlock_wait() specifies a new AST routine to be
1954 + executed when the unlock completes. */
1955 + if (kparams->castaddr)
1956 + li->li_castaddr = kparams->castaddr;
1958 + /* Have to do it here because the lkb may not exist after
1960 + if (!convert_cancel) {
1961 + spin_lock(&fi->fi_lkb_lock);
1962 + list_del(&li->li_ownerqueue);
1963 + spin_unlock(&fi->fi_lkb_lock);
1966 + /* Use existing lksb & astparams */
1967 + status = dlm_unlock(fi->fi_ls->ls_lockspace,
1969 + kparams->flags, &li->li_lksb, li);
1970 + if (status && !convert_cancel) {
1971 + /* It failed, put it back on the list */
1972 + spin_lock(&fi->fi_lkb_lock);
1973 + list_add(&li->li_ownerqueue, &fi->fi_lkb_list);
1974 + spin_unlock(&fi->fi_lkb_lock);
1980 +/* Write call, submit a locking request */
1981 +static ssize_t dlm_write(struct file *file, const char __user *buffer,
1982 + size_t count, loff_t *ppos)
1984 + struct file_info *fi = file->private_data;
1985 + struct dlm_lock_params kparams;
1990 + if (count < sizeof(kparams)-1) /* -1 because lock name is optional */
1993 + /* Has the lockspace been deleted */
1994 + if (test_bit(1, &fi->fi_ls->ls_flags))
1997 + /* Get the command info */
1998 + if (copy_from_user(&kparams, buffer, sizeof(kparams)))
2001 + if (check_version(&kparams))
2004 + /* Block signals while we are doing this */
2005 + sigfillset(&allsigs);
2006 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
2008 + switch (kparams.cmd)
2010 + case DLM_USER_LOCK:
2011 + status = do_user_lock(fi, &kparams, buffer);
2014 + case DLM_USER_UNLOCK:
2015 + status = do_user_unlock(fi, &kparams);
2018 + case DLM_USER_QUERY:
2019 + status = do_user_query(fi, &kparams);
2026 + /* Restore signals */
2027 + sigprocmask(SIG_SETMASK, &tmpsig, NULL);
2028 + recalc_sigpending();
2036 +/* Called when the cluster is shutdown uncleanly, all lockspaces
2037 + have been summarily removed */
2038 +void dlm_device_free_devices()
2040 + struct user_ls *tmp;
2041 + struct user_ls *lsinfo;
2043 + down(&user_ls_lock);
2044 + list_for_each_entry_safe(lsinfo, tmp, &user_ls_list, ls_list) {
2045 + misc_deregister(&lsinfo->ls_miscinfo);
2047 + /* Tidy up, but don't delete the lsinfo struct until
2048 + all the users have closed their devices */
2049 + list_del(&lsinfo->ls_list);
2050 + set_bit(1, &lsinfo->ls_flags); /* LS has been deleted */
2051 + lsinfo->ls_lockspace = NULL;
2053 + up(&user_ls_lock);
2056 +static struct file_operations _dlm_fops = {
2058 + .release = dlm_close,
2059 + .ioctl = dlm_ioctl,
2061 + .write = dlm_write,
2063 + .owner = THIS_MODULE,
2066 +static struct file_operations _dlm_ctl_fops = {
2067 + .open = dlm_ctl_open,
2068 + .release = dlm_ctl_close,
2069 + .ioctl = dlm_ctl_ioctl,
2070 + .owner = THIS_MODULE,
2074 + * Create control device
2076 +int dlm_device_init(void)
2080 + INIT_LIST_HEAD(&user_ls_list);
2081 + init_MUTEX(&user_ls_lock);
2083 + ctl_device.name = "dlm-control";
2084 + ctl_device.fops = &_dlm_ctl_fops;
2085 + ctl_device.minor = MISC_DYNAMIC_MINOR;
2087 + r = misc_register(&ctl_device);
2089 + log_print("misc_register failed for DLM control device");
2096 +void dlm_device_exit(void)
2098 + misc_deregister(&ctl_device);
2102 + * Overrides for Emacs so that we follow Linus's tabbing style.
2103 + * Emacs will notice this stuff at the end of the file and automatically
2104 + * adjust the settings for this buffer only. This must remain at the end
2106 + * ---------------------------------------------------------------------------
2107 + * Local variables:
2108 + * c-file-style: "linux"
2111 diff -urN linux-orig/cluster/dlm/device.h linux-patched/cluster/dlm/device.h
2112 --- linux-orig/cluster/dlm/device.h 1970-01-01 07:30:00.000000000 +0730
2113 +++ linux-patched/cluster/dlm/device.h 2004-11-03 11:31:56.000000000 +0800
2115 +/******************************************************************************
2116 +*******************************************************************************
2118 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2119 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2121 +** This copyrighted material is made available to anyone wishing to use,
2122 +** modify, copy, or redistribute it subject to the terms and conditions
2123 +** of the GNU General Public License v.2.
2125 +*******************************************************************************
2126 +******************************************************************************/
2128 +#ifndef __DEVICE_DOT_H__
2129 +#define __DEVICE_DOT_H__
2131 +extern void dlm_device_free_devices(void);
2133 +#endif /* __DEVICE_DOT_H__ */
2134 diff -urN linux-orig/cluster/dlm/dir.c linux-patched/cluster/dlm/dir.c
2135 --- linux-orig/cluster/dlm/dir.c 1970-01-01 07:30:00.000000000 +0730
2136 +++ linux-patched/cluster/dlm/dir.c 2004-11-03 11:31:56.000000000 +0800
2138 +/******************************************************************************
2139 +*******************************************************************************
2141 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2142 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2144 +** This copyrighted material is made available to anyone wishing to use,
2145 +** modify, copy, or redistribute it subject to the terms and conditions
2146 +** of the GNU General Public License v.2.
2148 +*******************************************************************************
2149 +******************************************************************************/
2151 +#include "dlm_internal.h"
2153 +#include "lockspace.h"
2154 +#include "lowcomms.h"
2155 +#include "reccomms.h"
2157 +#include "config.h"
2158 +#include "memory.h"
2159 +#include "recover.h"
2163 + uint32_t rm_nodeid;
2164 + uint16_t rm_length;
2168 +void print_name(char *b, int len)
2171 + for (i = 0; i < len; i++)
2172 + printk("%c", b[i]);
2176 +static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
2178 + spin_lock(&ls->ls_recover_list_lock);
2179 + list_add(&de->list, &ls->ls_recover_list);
2180 + spin_unlock(&ls->ls_recover_list_lock);
2183 +static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
2185 + int found = FALSE;
2186 + struct dlm_direntry *de;
2188 + spin_lock(&ls->ls_recover_list_lock);
2189 + list_for_each_entry(de, &ls->ls_recover_list, list) {
2190 + if (de->length == len) {
2191 + list_del(&de->list);
2192 + de->master_nodeid = 0;
2193 + memset(de->name, 0, len);
2198 + spin_unlock(&ls->ls_recover_list_lock);
2201 + de = allocate_direntry(ls, len);
2205 +void clear_free_de(struct dlm_ls *ls)
2207 + struct dlm_direntry *de;
2209 + spin_lock(&ls->ls_recover_list_lock);
2210 + while (!list_empty(&ls->ls_recover_list)) {
2211 + de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
2213 + list_del(&de->list);
2214 + free_direntry(de);
2216 + spin_unlock(&ls->ls_recover_list_lock);
2220 + * We use the upper 16 bits of the hash value to select the directory node.
2221 + * Low bits are used for distribution of rsb's among hash buckets on each node.
2223 + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
2224 + * num_nodes to the hash value. This value in the desired range is used as an
2225 + * offset into the sorted list of nodeid's to give the particular nodeid of the
2229 +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length)
2231 + struct list_head *tmp;
2232 + struct dlm_csb *csb = NULL;
2233 + uint32_t hash, node, n = 0, nodeid;
2235 + if (ls->ls_num_nodes == 1) {
2236 + nodeid = our_nodeid();
2240 + hash = dlm_hash(name, length);
2241 + node = (hash >> 16) % ls->ls_num_nodes;
2243 + if (ls->ls_node_array) {
2244 + nodeid = ls->ls_node_array[node];
2248 + list_for_each(tmp, &ls->ls_nodes) {
2251 + csb = list_entry(tmp, struct dlm_csb, list);
2255 + DLM_ASSERT(csb, printk("num_nodes=%u n=%u node=%u\n",
2256 + ls->ls_num_nodes, n, node););
2257 + nodeid = csb->node->nodeid;
2262 +uint32_t get_directory_nodeid(struct dlm_rsb *rsb)
2264 + return name_to_directory_nodeid(rsb->res_ls, rsb->res_name,
2268 +static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
2272 + val = dlm_hash(name, len);
2273 + val &= (ls->ls_dirtbl_size - 1);
2278 +static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
2282 + bucket = dir_hash(ls, de->name, de->length);
2283 + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
2286 +static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
2287 + int namelen, uint32_t bucket)
2289 + struct dlm_direntry *de;
2291 + list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
2292 + if (de->length == namelen && !memcmp(name, de->name, namelen))
2300 +void dlm_dir_remove(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen)
2302 + struct dlm_direntry *de;
2305 + bucket = dir_hash(ls, name, namelen);
2307 + write_lock(&ls->ls_dirtbl[bucket].lock);
2309 + de = search_bucket(ls, name, namelen, bucket);
2312 + log_all(ls, "remove fr %u none", nodeid);
2313 + print_name(name, namelen);
2317 + if (de->master_nodeid != nodeid) {
2318 + log_all(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
2319 + print_name(name, namelen);
2323 + list_del(&de->list);
2324 + free_direntry(de);
2326 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2329 +void dlm_dir_clear(struct dlm_ls *ls)
2331 + struct list_head *head;
2332 + struct dlm_direntry *de;
2335 + for (i = 0; i < ls->ls_dirtbl_size; i++) {
2336 + write_lock(&ls->ls_dirtbl[i].lock);
2337 + head = &ls->ls_dirtbl[i].list;
2338 + while (!list_empty(head)) {
2339 + de = list_entry(head->next, struct dlm_direntry, list);
2340 + list_del(&de->list);
2341 + put_free_de(ls, de);
2343 + write_unlock(&ls->ls_dirtbl[i].lock);
2347 +static void resmov_in(struct resmov *rm, char *buf)
2349 + struct resmov tmp;
2351 + memcpy(&tmp, buf, sizeof(struct resmov));
2353 + rm->rm_nodeid = be32_to_cpu(tmp.rm_nodeid);
2354 + rm->rm_length = be16_to_cpu(tmp.rm_length);
2357 +int dlm_dir_rebuild_local(struct dlm_ls *ls)
2359 + struct dlm_csb *csb;
2360 + struct dlm_direntry *de;
2361 + struct dlm_rcom *rc;
2362 + struct resmov mov, last_mov;
2363 + char *b, *last_name;
2364 + int error = -ENOMEM, count = 0;
2366 + log_all(ls, "rebuild resource directory");
2368 + dlm_dir_clear(ls);
2370 + rc = allocate_rcom_buffer(ls);
2374 + last_name = (char *) kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
2378 + list_for_each_entry(csb, &ls->ls_nodes, list) {
2379 + last_mov.rm_length = 0;
2381 + error = dlm_recovery_stopped(ls);
2385 + memcpy(rc->rc_buf, last_name, last_mov.rm_length);
2386 + rc->rc_datalen = last_mov.rm_length;
2388 + error = rcom_send_message(ls, csb->node->nodeid,
2389 + RECCOMM_RECOVERNAMES, rc, 1);
2396 + * pick each res out of buffer
2402 + resmov_in(&mov, b);
2403 + b += sizeof(struct resmov);
2405 + /* Length of 0 with a non-zero nodeid marks the
2406 + * end of the list */
2407 + if (!mov.rm_length && mov.rm_nodeid)
2410 + /* This is just the end of the block */
2411 + if (!mov.rm_length)
2414 + DLM_ASSERT(mov.rm_nodeid == csb->node->nodeid,);
2417 + de = get_free_de(ls, mov.rm_length);
2421 + de->master_nodeid = mov.rm_nodeid;
2422 + de->length = mov.rm_length;
2423 + memcpy(de->name, b, mov.rm_length);
2424 + b += mov.rm_length;
2426 + add_entry_to_hash(ls, de);
2430 + memset(last_name, 0, DLM_RESNAME_MAXLEN);
2431 + memcpy(last_name, de->name, de->length);
2438 + set_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
2441 + log_all(ls, "rebuilt %d resources", count);
2447 + free_rcom_buffer(rc);
2450 + clear_free_de(ls);
2455 + * The reply end of dlm_dir_rebuild_local/RECOVERNAMES. Collect and send as
2456 + * many resource names as can fit in the buffer.
2459 +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen,
2460 + char *outbuf, int outlen, uint32_t nodeid)
2462 + struct list_head *list;
2463 + struct dlm_rsb *start_rsb = NULL, *rsb;
2464 + int offset = 0, start_namelen, error;
2466 + struct resmov tmp;
2467 + uint32_t dir_nodeid;
2470 + * Find the rsb where we left off (or start again)
2473 + start_namelen = inlen;
2474 + start_name = inbuf;
2476 + if (start_namelen > 1) {
2477 + error = find_rsb(ls, NULL, start_name, start_namelen, 0,
2479 + DLM_ASSERT(!error && start_rsb, printk("error %d\n", error););
2480 + release_rsb(start_rsb);
2484 + * Send rsb names for rsb's we're master of and whose directory node
2485 + * matches the requesting node.
2488 + down_read(&ls->ls_root_lock);
2490 + list = start_rsb->res_rootlist.next;
2492 + list = ls->ls_rootres.next;
2494 + for (offset = 0; list != &ls->ls_rootres; list = list->next) {
2495 + rsb = list_entry(list, struct dlm_rsb, res_rootlist);
2496 + if (rsb->res_nodeid)
2499 + dir_nodeid = get_directory_nodeid(rsb);
2500 + if (dir_nodeid != nodeid)
2503 + if (offset + sizeof(struct resmov)*2 + rsb->res_length > outlen) {
2504 + /* Write end-of-block record */
2505 + memset(&tmp, 0, sizeof(struct resmov));
2506 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2507 + offset += sizeof(struct resmov);
2511 + memset(&tmp, 0, sizeof(struct resmov));
2512 + tmp.rm_nodeid = cpu_to_be32(our_nodeid());
2513 + tmp.rm_length = cpu_to_be16(rsb->res_length);
2515 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2516 + offset += sizeof(struct resmov);
2518 + memcpy(outbuf + offset, rsb->res_name, rsb->res_length);
2519 + offset += rsb->res_length;
2523 + * If we've reached the end of the list (and there's room) write a
2524 + * terminating record.
2527 + if ((list == &ls->ls_rootres) &&
2528 + (offset + sizeof(struct resmov) <= outlen)) {
2530 + memset(&tmp, 0, sizeof(struct resmov));
2531 + /* This only needs to be non-zero */
2532 + tmp.rm_nodeid = cpu_to_be32(1);
2533 + /* and this must be zero */
2534 + tmp.rm_length = 0;
2535 + memcpy(outbuf + offset, &tmp, sizeof(struct resmov));
2536 + offset += sizeof(struct resmov);
2540 + up_read(&ls->ls_root_lock);
2544 +static int get_entry(struct dlm_ls *ls, uint32_t nodeid, char *name,
2545 + int namelen, uint32_t *r_nodeid)
2547 + struct dlm_direntry *de, *tmp;
2550 + bucket = dir_hash(ls, name, namelen);
2552 + write_lock(&ls->ls_dirtbl[bucket].lock);
2553 + de = search_bucket(ls, name, namelen, bucket);
2555 + *r_nodeid = de->master_nodeid;
2556 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2557 + if (*r_nodeid == nodeid)
2562 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2564 + de = allocate_direntry(ls, namelen);
2568 + de->master_nodeid = nodeid;
2569 + de->length = namelen;
2570 + memcpy(de->name, name, namelen);
2572 + write_lock(&ls->ls_dirtbl[bucket].lock);
2573 + tmp = search_bucket(ls, name, namelen, bucket);
2575 + free_direntry(de);
2578 + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
2580 + *r_nodeid = de->master_nodeid;
2581 + write_unlock(&ls->ls_dirtbl[bucket].lock);
2585 +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen,
2586 + uint32_t *r_nodeid)
2588 + return get_entry(ls, nodeid, name, namelen, r_nodeid);
2592 + * The node with lowest id queries all nodes to determine when all are done.
2593 + * All other nodes query the low nodeid for this.
2596 +int dlm_dir_rebuild_wait(struct dlm_ls *ls)
2600 + if (ls->ls_low_nodeid == our_nodeid()) {
2601 + error = dlm_wait_status_all(ls, RESDIR_VALID);
2603 + set_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
2605 + error = dlm_wait_status_low(ls, RESDIR_ALL_VALID);
2609 diff -urN linux-orig/cluster/dlm/dir.h linux-patched/cluster/dlm/dir.h
2610 --- linux-orig/cluster/dlm/dir.h 1970-01-01 07:30:00.000000000 +0730
2611 +++ linux-patched/cluster/dlm/dir.h 2004-11-03 11:31:56.000000000 +0800
2613 +/******************************************************************************
2614 +*******************************************************************************
2616 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2617 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2619 +** This copyrighted material is made available to anyone wishing to use,
2620 +** modify, copy, or redistribute it subject to the terms and conditions
2621 +** of the GNU General Public License v.2.
2623 +*******************************************************************************
2624 +******************************************************************************/
2626 +#ifndef __DIR_DOT_H__
2627 +#define __DIR_DOT_H__
2629 +void print_name(char *b, int len);
2630 +uint32_t name_to_directory_nodeid(struct dlm_ls *ls, char *name, int length);
2631 +uint32_t get_directory_nodeid(struct dlm_rsb *rsb);
2633 +int dlm_dir_lookup(struct dlm_ls *ls, uint32_t nodeid, char *name, int namelen,
2634 + uint32_t *r_nodeid);
2635 +void dlm_dir_remove(struct dlm_ls *ls, uint32_t nodeid, char *name,
2637 +int dlm_dir_rebuild_local(struct dlm_ls *ls);
2638 +int dlm_dir_rebuild_send(struct dlm_ls *ls, char *inbuf, int inlen,
2639 + char *outbuf, int outlen, uint32_t nodeid);
2640 +int dlm_dir_rebuild_wait(struct dlm_ls * ls);
2641 +void dlm_dir_clear(struct dlm_ls *ls);
2642 +void dlm_dir_dump(struct dlm_ls *ls);
2643 +void clear_free_de(struct dlm_ls *ls);
2645 +#endif /* __DIR_DOT_H__ */
2646 diff -urN linux-orig/cluster/dlm/dlm_internal.h linux-patched/cluster/dlm/dlm_internal.h
2647 --- linux-orig/cluster/dlm/dlm_internal.h 1970-01-01 07:30:00.000000000 +0730
2648 +++ linux-patched/cluster/dlm/dlm_internal.h 2004-11-03 11:31:56.000000000 +0800
2650 +/******************************************************************************
2651 +*******************************************************************************
2653 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
2654 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
2656 +** This copyrighted material is made available to anyone wishing to use,
2657 +** modify, copy, or redistribute it subject to the terms and conditions
2658 +** of the GNU General Public License v.2.
2660 +*******************************************************************************
2661 +******************************************************************************/
2663 +#ifndef __DLM_INTERNAL_DOT_H__
2664 +#define __DLM_INTERNAL_DOT_H__
2667 + * This is the main header file to be included in each DLM source file.
2670 +#define DLM_RELEASE_NAME "<CVS>"
2672 +#include <linux/slab.h>
2673 +#include <linux/sched.h>
2674 +#include <asm/semaphore.h>
2675 +#include <linux/types.h>
2676 +#include <linux/spinlock.h>
2677 +#include <linux/vmalloc.h>
2678 +#include <asm/uaccess.h>
2679 +#include <linux/list.h>
2680 +#include <linux/errno.h>
2681 +#include <linux/random.h>
2682 +#include <linux/delay.h>
2683 +#include <linux/interrupt.h>
2684 +#include <linux/kthread.h>
2686 +#include <cluster/dlm.h>
2687 +#include <cluster/dlm_device.h>
2688 +#include <cluster/service.h>
2698 +#if (BITS_PER_LONG == 64)
2699 +#define PRIu64 "lu"
2700 +#define PRId64 "ld"
2701 +#define PRIo64 "lo"
2702 +#define PRIx64 "lx"
2703 +#define PRIX64 "lX"
2704 +#define SCNu64 "lu"
2705 +#define SCNd64 "ld"
2706 +#define SCNo64 "lo"
2707 +#define SCNx64 "lx"
2708 +#define SCNX64 "lX"
2710 +#define PRIu64 "Lu"
2711 +#define PRId64 "Ld"
2712 +#define PRIo64 "Lo"
2713 +#define PRIx64 "Lx"
2714 +#define PRIX64 "LX"
2715 +#define SCNu64 "Lu"
2716 +#define SCNd64 "Ld"
2717 +#define SCNo64 "Lo"
2718 +#define SCNx64 "Lx"
2719 +#define SCNX64 "LX"
2722 +#define wchan_cond_sleep_intr(chan, sleep_cond) \
2725 + DECLARE_WAITQUEUE(__wait_chan, current); \
2726 + current->state = TASK_INTERRUPTIBLE; \
2727 + add_wait_queue(&chan, &__wait_chan); \
2728 + if ((sleep_cond)) \
2730 + remove_wait_queue(&chan, &__wait_chan); \
2731 + current->state = TASK_RUNNING; \
2735 +static inline int check_timeout(unsigned long stamp, unsigned int seconds)
2737 + return time_after(jiffies, stamp + seconds * HZ);
2741 +#define log_print(fmt, args...) printk("dlm: "fmt"\n", ##args)
2743 +#define log_all(ls, fmt, args...) \
2745 + printk("dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
2746 + dlm_debug_log(ls, fmt, ##args); \
2749 +#define log_error log_all
2751 +#if defined(DLM_DEBUG2)
2752 +int nibbler_printf(const char *fmt, ...);
2753 +#define log_debug2(fmt, args...) nibbler_printf(fmt"\n", ##args)
2755 +#define log_debug2(fmt, args...)
2759 +#if defined(DLM_DEBUG)
2760 +#define log_debug(ls, fmt, args...) dlm_debug_log(ls, fmt, ##args)
2762 +#define log_debug(ls, fmt, args...)
2765 +#if defined(DLM_DEBUG) && defined(DLM_DEBUG_ALL)
2767 +#define log_debug log_all
2771 +#define DLM_ASSERT(x, do) \
2775 + dlm_locks_dump(); \
2776 + dlm_debug_dump(); \
2777 + printk("\nDLM: Assertion failed on line %d of file %s\n" \
2778 + "DLM: assertion: \"%s\"\n" \
2779 + "DLM: time = %lu\n", \
2780 + __LINE__, __FILE__, #x, jiffies); \
2784 + panic("DLM: Record message above and reboot.\n"); \
2794 +struct dlm_lkbtable;
2795 +struct dlm_rsbtable;
2796 +struct dlm_dirtable;
2797 +struct dlm_direntry;
2798 +struct dlm_recover;
2800 +struct dlm_request;
2803 +struct dlm_query_request;
2804 +struct dlm_query_reply;
2807 +struct dlm_direntry {
2808 + struct list_head list;
2809 + uint32_t master_nodeid;
2814 +struct dlm_dirtable {
2815 + struct list_head list;
2819 +struct dlm_rsbtable {
2820 + struct list_head list;
2824 +struct dlm_lkbtable {
2825 + struct list_head list;
2831 + * Cluster node (per node in cluster)
2835 + struct list_head list;
2837 + atomic_t refcount; /* num csb's referencing */
2841 + * Cluster System Block (per node in a ls)
2845 + struct list_head list; /* per-lockspace node list */
2846 + struct dlm_node * node; /* global node structure */
2847 + int gone_event; /* event id when node removed */
2851 + * Used to save and manage recovery state for a lockspace.
2854 +struct dlm_recover {
2855 + struct list_head list;
2856 + uint32_t * nodeids;
2862 + * Elements in the range array
2865 +#define GR_RANGE_START (0)
2866 +#define GR_RANGE_END (1)
2867 +#define RQ_RANGE_START (2)
2868 +#define RQ_RANGE_END (3)
2871 + * Lockspace structure
2874 +#define LSFL_WORK (0)
2875 +#define LSFL_LS_RUN (1)
2876 +#define LSFL_LS_STOP (2)
2877 +#define LSFL_LS_START (3)
2878 +#define LSFL_LS_FINISH (4)
2879 +#define LSFL_RECCOMM_WAIT (5)
2880 +#define LSFL_RECCOMM_READY (6)
2881 +#define LSFL_NOTIMERS (7)
2882 +#define LSFL_FINISH_RECOVERY (8)
2883 +#define LSFL_RESDIR_VALID (9)
2884 +#define LSFL_ALL_RESDIR_VALID (10)
2885 +#define LSFL_NODES_VALID (11)
2886 +#define LSFL_ALL_NODES_VALID (12)
2887 +#define LSFL_REQUEST_WARN (13)
2888 +#define LSFL_RECOVERD_EXIT (14)
2890 +#define LSST_NONE (0)
2891 +#define LSST_INIT (1)
2892 +#define LSST_INIT_DONE (2)
2893 +#define LSST_CLEAR (3)
2894 +#define LSST_WAIT_START (4)
2895 +#define LSST_RECONFIG_DONE (5)
2898 + struct list_head ls_list; /* list of lockspaces */
2899 + uint32_t ls_local_id; /* local unique lockspace ID */
2900 + uint32_t ls_global_id; /* global unique lockspace ID */
2901 + int ls_allocation; /* Memory allocation policy */
2902 + int ls_count; /* reference count */
2903 + unsigned long ls_flags; /* LSFL_ */
2905 + struct dlm_rsbtable * ls_rsbtbl;
2906 + uint32_t ls_rsbtbl_size;
2908 + struct dlm_lkbtable * ls_lkbtbl;
2909 + uint32_t ls_lkbtbl_size;
2911 + struct dlm_dirtable * ls_dirtbl;
2912 + uint32_t ls_dirtbl_size;
2914 + struct list_head ls_nodes; /* current nodes in ls */
2915 + struct list_head ls_nodes_gone; /* dead node list, recovery */
2916 + uint32_t ls_num_nodes; /* number of nodes in ls */
2917 + uint32_t ls_low_nodeid;
2918 + uint32_t * ls_node_array;
2920 + struct rw_semaphore ls_unlock_sem; /* To prevent unlock on a
2921 + parent lock racing with a
2924 + struct list_head ls_deadlockq; /* List of locks in conversion
2925 + ordered by duetime. for
2926 + deadlock detection */
2928 + /* recovery related */
2930 + struct task_struct * ls_recoverd_task;
2931 + struct semaphore ls_recoverd_lock;
2932 + struct list_head ls_recover; /* dlm_recover structs */
2933 + spinlock_t ls_recover_lock;
2935 + int ls_last_start;
2936 + int ls_last_finish;
2937 + int ls_state; /* recovery states */
2939 + struct rw_semaphore ls_in_recovery; /* block local requests */
2940 + struct list_head ls_requestqueue;/* queue remote requests */
2941 + struct semaphore ls_requestqueue_lock;
2943 + struct dlm_rcom * ls_rcom; /* recovery comms */
2944 + uint32_t ls_rcom_msgid;
2945 + struct semaphore ls_rcom_lock;
2947 + struct list_head ls_recover_list;
2948 + spinlock_t ls_recover_list_lock;
2949 + int ls_recover_list_count;
2950 + wait_queue_head_t ls_wait_general;
2952 + struct list_head ls_rootres; /* root resources */
2953 + struct rw_semaphore ls_root_lock; /* protect rootres list */
2955 + struct list_head ls_rebuild_rootrsb_list; /* Root of lock trees
2956 + we're deserialising */
2965 +#define RESFL_NEW_MASTER (0)
2966 +#define RESFL_RECOVER_LIST (1)
2967 +#define RESFL_MASTER (2)
2970 + struct list_head res_hashchain;
2971 + uint32_t res_bucket;
2973 + struct dlm_ls * res_ls; /* The owning lockspace */
2975 + struct list_head res_rootlist; /* List of root rsb's */
2977 + struct list_head res_subreslist; /* List of all sub-resources
2978 + for this root rsb */
2980 + uint8_t res_depth; /* Depth in resource tree */
2981 + unsigned long res_flags; /* Flags, RESFL_ */
2983 + struct list_head res_grantqueue;
2984 + struct list_head res_convertqueue;
2985 + struct list_head res_waitqueue;
2987 + uint32_t res_nodeid; /* nodeid of master node */
2989 + struct dlm_rsb * res_root; /* root rsb if a subresource */
2990 + struct dlm_rsb * res_parent; /* parent rsb (if any) */
2992 + atomic_t res_ref; /* Number of lkb's */
2993 + uint16_t res_remasterid; /* ID used during remaster */
2995 + struct list_head res_recover_list; /* General list for use
2996 + during recovery */
2997 + int res_recover_msgid;
2998 + int res_newlkid_expect;
3000 + struct rw_semaphore res_lock;
3002 + char * res_lvbptr; /* Lock value block */
3004 + uint8_t res_length;
3005 + char res_name[1]; /* <res_length> bytes */
3009 + * Lock block. To avoid confusion, where flags mirror the public flags, they
3010 + * should have the same value.
3012 + * In general, DLM_LKF flags from dlm.h apply only to lkb_lockqueue_flags
3013 + * and GDLM_LKFLG flags from dlm_internal.h apply only to lkb_flags.
3014 + * The rr_flags field in the request struct is a copy of lkb_lockqueue_flags.
3015 + * There is one dangerous exception: GDLM_LKFLG_RANGE is set in rr_flags
3016 + * when sending a remote range lock request. This value is then copied into
3017 + * the remote lkb_lockqueue_flags field. This means GDLM_LKFLG_RANGE must
3018 + * not have the same value as any external DLM_LKF flag.
3021 +#define GDLM_LKSTS_NEW (0)
3022 +#define GDLM_LKSTS_WAITING (1)
3023 +#define GDLM_LKSTS_GRANTED (2)
3024 +#define GDLM_LKSTS_CONVERT (3)
3026 +/* mirror external flags */
3027 +#define GDLM_LKFLG_VALBLK (0x00000008)
3028 +#define GDLM_LKFLG_PERSISTENT (0x00000080)
3029 +#define GDLM_LKFLG_NODLCKWT (0x00000100)
3030 +#define GDLM_LKFLG_EXPEDITE (0x00000400)
3031 +#define GDLM_LKFLG_ORPHAN (0x00004000)
3032 +/* external flags now go up to: (0x00004000) : DLM_LKF_ORPHAN */
3034 +/* internal-only flags */
3035 +#define GDLM_LKFLG_RANGE (0x00010000)
3036 +#define GDLM_LKFLG_MSTCPY (0x00020000)
3037 +#define GDLM_LKFLG_DELETED (0x00040000)
3038 +#define GDLM_LKFLG_LQCONVERT (0x00080000)
3039 +#define GDLM_LKFLG_LQRESEND (0x00100000)
3040 +#define GDLM_LKFLG_DEMOTED (0x00200000)
3041 +#define GDLM_LKFLG_RESENT (0x00400000)
3042 +#define GDLM_LKFLG_NOREBUILD (0x00800000)
3043 +#define GDLM_LKFLG_UNLOCKDONE (0x01000000)
3045 +#define AST_COMP (1)
3046 +#define AST_BAST (2)
3047 +#define AST_DEL (4)
3050 + uint32_t lkb_flags;
3051 + uint16_t lkb_status; /* grant, wait, convert */
3052 + int8_t lkb_rqmode; /* requested lock mode */
3053 + int8_t lkb_grmode; /* granted lock mode */
3054 + uint32_t lkb_retstatus; /* status to return in lksb */
3055 + uint32_t lkb_id; /* our lock ID */
3056 + struct dlm_lksb * lkb_lksb; /* status block of caller */
3057 + struct list_head lkb_idtbl_list; /* lockidtbl */
3058 + struct list_head lkb_statequeue; /* rsb's g/c/w queue */
3059 + struct dlm_rsb * lkb_resource;
3060 + struct dlm_lkb * lkb_parent; /* parent lock if any */
3061 + atomic_t lkb_childcnt; /* number of children */
3063 + struct list_head lkb_lockqueue; /* queue of locks waiting
3064 + for remote reply */
3065 + int lkb_lockqueue_state; /* reason on lockqueue */
3066 + uint32_t lkb_lockqueue_flags; /* as passed into
3068 + int lkb_ownpid; /* pid of lock owner */
3069 + unsigned long lkb_lockqueue_time; /* time lkb went on the
3071 + unsigned long lkb_duetime; /* for deadlock detection */
3073 + uint32_t lkb_remid; /* id on remote partner */
3074 + uint32_t lkb_nodeid; /* id of remote partner */
3075 + void * lkb_astaddr;
3076 + void * lkb_bastaddr;
3077 + long lkb_astparam;
3078 + struct list_head lkb_astqueue; /* locks with asts to deliver */
3079 + uint16_t lkb_astflags; /* COMP, BAST, DEL */
3080 + uint8_t lkb_bastmode; /* requested mode */
3081 + uint8_t lkb_highbast; /* highest mode bast sent for */
3083 + struct dlm_request * lkb_request;
3085 + struct list_head lkb_deadlockq; /* ls_deadlockq list */
3087 + char * lkb_lvbptr; /* points to lksb lvb on local
3088 + lock, allocated lvb on
3090 + uint64_t * lkb_range; /* Points to an array of 64 bit
3091 + numbers that represent the
3092 + requested and granted ranges
3093 + of the lock. NULL implies
3094 + 0-ffffffffffffffff */
3098 + * Header part of the mid-level comms system. All packets start with
3099 + * this header so we can identify them. The comms packet can
3100 + * contain many of these structs but the are split into individual
3101 + * work units before being passed to the lockqueue routines.
3102 + * below this are the structs that this is a header for
3105 +struct dlm_header {
3106 + uint8_t rh_cmd; /* What we are */
3107 + uint8_t rh_flags; /* maybe just a pad */
3108 + uint16_t rh_length; /* Length of struct (so we can
3109 + send many in 1 message) */
3110 + uint32_t rh_lkid; /* Lock ID tag: ie the local
3111 + (requesting) lock ID */
3112 + uint32_t rh_lockspace; /* Lockspace ID */
3113 +} __attribute__((packed));
3116 + * This is the struct used in a remote lock/unlock/convert request
3117 + * The mid-level comms API should turn this into native byte order.
3118 + * Most "normal" lock operations will use these two structs for
3119 + * communications. Recovery operations use their own structs
3120 + * but still with the gd_req_header on the front.
3123 +struct dlm_request {
3124 + struct dlm_header rr_header;
3125 + uint32_t rr_remlkid; /* Remote lock ID */
3126 + uint32_t rr_remparid; /* Parent's remote lock ID */
3127 + uint32_t rr_flags; /* Flags from lock/convert req*/
3128 + uint64_t rr_range_start; /* Yes, these are in the right
3130 + uint64_t rr_range_end;
3131 + uint32_t rr_status; /* Status to return if this is
3133 + uint32_t rr_pid; /* Owner PID of lock */
3134 + uint8_t rr_rqmode; /* Requested lock mode */
3135 + uint8_t rr_asts; /* Whether the LKB has ASTs */
3136 + char rr_lvb[DLM_LVB_LEN];
3137 + char rr_name[1]; /* As long as needs be. Only
3138 + used for directory lookups.
3139 + The length of this can be
3140 + worked out from the packet
3142 +} __attribute__((packed));
3145 + * This is the struct returned by a remote lock/unlock/convert request
3146 + * The mid-level comms API should turn this into native byte order.
3150 + struct dlm_header rl_header;
3151 + uint32_t rl_lockstate; /* Whether request was
3152 + queued/granted/waiting */
3153 + uint32_t rl_nodeid; /* nodeid of lock master */
3154 + uint32_t rl_status; /* Status to return to caller */
3155 + uint32_t rl_lkid; /* Remote lkid */
3156 + char rl_lvb[DLM_LVB_LEN];
3157 +} __attribute__((packed));
3160 + * Recovery comms message
3164 + struct dlm_header rc_header; /* 32 byte aligned */
3165 + uint32_t rc_msgid;
3166 + uint16_t rc_datalen;
3167 + uint8_t rc_expanded;
3168 + uint8_t rc_subcmd; /* secondary command */
3169 + char rc_buf[1]; /* first byte of data goes here
3170 + and extends beyond here for
3171 + another datalen - 1 bytes.
3172 + rh_length is set to sizeof
3173 + dlm_rcom + datalen - 1 */
3174 +} __attribute__((packed));
3177 +/* A remote query: GDLM_REMCMD_QUERY */
3179 +struct dlm_query_request {
3180 + struct dlm_header rq_header;
3181 + uint32_t rq_mstlkid; /* LockID on master node */
3182 + uint32_t rq_query; /* query from the user */
3183 + uint32_t rq_maxlocks; /* max number of locks we can
3185 +} __attribute__((packed));
3187 +/* First block of a reply query. cmd = GDLM_REMCMD_QUERY */
3188 +/* There may be subsequent blocks of
3189 + lock info in GDLM_REMCMD_QUERYCONT messages which just have
3190 + a normal header. The last of these will have rh_flags set to
3191 + GDLM_REMFLAG_ENDQUERY
3194 +struct dlm_query_reply {
3195 + struct dlm_header rq_header;
3196 + uint32_t rq_numlocks; /* Number of locks in reply */
3197 + uint32_t rq_startlock; /* Which lock this block starts
3198 + at (for multi-block replies) */
3199 + uint32_t rq_status;
3201 + /* Resource information */
3202 + uint32_t rq_grantcount; /* No. of nodes on grantqueue */
3203 + uint32_t rq_convcount; /* No. of nodes on convertq */
3204 + uint32_t rq_waitcount; /* No. of nodes on waitqueue */
3205 + char rq_valblk[DLM_LVB_LEN]; /* Master's LVB
3208 +} __attribute__((packed));
3211 + * Lockqueue wait lock states
3214 +#define GDLM_LQSTATE_WAIT_RSB 1
3215 +#define GDLM_LQSTATE_WAIT_CONVERT 2
3216 +#define GDLM_LQSTATE_WAIT_CONDGRANT 3
3217 +#define GDLM_LQSTATE_WAIT_UNLOCK 4
3219 +/* Commands sent across the comms link */
3220 +#define GDLM_REMCMD_LOOKUP 1
3221 +#define GDLM_REMCMD_LOCKREQUEST 2
3222 +#define GDLM_REMCMD_UNLOCKREQUEST 3
3223 +#define GDLM_REMCMD_CONVREQUEST 4
3224 +#define GDLM_REMCMD_LOCKREPLY 5
3225 +#define GDLM_REMCMD_LOCKGRANT 6
3226 +#define GDLM_REMCMD_SENDBAST 7
3227 +#define GDLM_REMCMD_SENDCAST 8
3228 +#define GDLM_REMCMD_REM_RESDATA 9
3229 +#define GDLM_REMCMD_RECOVERMESSAGE 20
3230 +#define GDLM_REMCMD_RECOVERREPLY 21
3231 +#define GDLM_REMCMD_QUERY 30
3232 +#define GDLM_REMCMD_QUERYREPLY 31
3234 +/* Set in rh_flags when this is the last block of
3235 + query information. Note this could also be the first
3237 +#define GDLM_REMFLAG_ENDQUERY 1
3239 +#ifdef CONFIG_DLM_STATS
3240 +struct dlm_statinfo
3242 + unsigned int cast;
3243 + unsigned int bast;
3244 + unsigned int lockops;
3245 + unsigned int unlockops;
3246 + unsigned int convertops;
3247 + unsigned long lockqueue_time[5];
3248 + unsigned long lockqueue_locks[5];
3250 +extern struct dlm_statinfo dlm_stats;
3257 +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...);
3258 +void dlm_debug_dump(void);
3259 +void dlm_locks_dump(void);
3261 +#endif /* __DLM_INTERNAL_DOT_H__ */
3262 diff -urN linux-orig/cluster/dlm/lkb.c linux-patched/cluster/dlm/lkb.c
3263 --- linux-orig/cluster/dlm/lkb.c 1970-01-01 07:30:00.000000000 +0730
3264 +++ linux-patched/cluster/dlm/lkb.c 2004-11-03 11:31:56.000000000 +0800
3266 +/******************************************************************************
3267 +*******************************************************************************
3269 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3270 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3272 +** This copyrighted material is made available to anyone wishing to use,
3273 +** modify, copy, or redistribute it subject to the terms and conditions
3274 +** of the GNU General Public License v.2.
3276 +*******************************************************************************
3277 +******************************************************************************/
3282 + * Allocate and free locks on the lock ID table.
3284 + * This is slightly naff but I don't really like the
3285 + * VMS lockidtbl stuff as it uses a realloced array
3286 + * to hold the locks in. I think this is slightly better
3289 + * Any better suggestions gratefully received. Patrick
3293 +#include "dlm_internal.h"
3294 +#include "lockqueue.h"
3296 +#include "config.h"
3298 +#include "memory.h"
3299 +#include "lockspace.h"
3303 + * Internal find lock by ID. Must be called with the lockidtbl spinlock held.
3306 +static struct dlm_lkb *__find_lock_by_id(struct dlm_ls *ls, uint32_t lkid)
3308 + uint16_t bucket = lkid & 0xFFFF;
3309 + struct dlm_lkb *lkb;
3311 + if (bucket >= ls->ls_lkbtbl_size)
3314 + list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list){
3315 + if (lkb->lkb_id == lkid)
3323 + * LKB lkid's are 32 bits and have two 16 bit parts. The bottom 16 bits are a
3324 + * random number between 0 and lockidtbl_size-1. This random number specifies
3325 + * the "bucket" for the lkb in lockidtbl. The upper 16 bits are a sequentially
3326 + * assigned per-bucket id.
3328 + * Because the 16 bit id's per bucket can roll over, a new lkid must be checked
3329 + * against the lkid of all lkb's in the bucket to avoid duplication.
3333 +struct dlm_lkb *create_lkb(struct dlm_ls *ls)
3335 + struct dlm_lkb *lkb;
3339 + lkb = allocate_lkb(ls);
3344 + get_random_bytes(&bucket, sizeof(bucket));
3345 + bucket &= (ls->ls_lkbtbl_size - 1);
3347 + write_lock(&ls->ls_lkbtbl[bucket].lock);
3349 + lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
3351 + if (__find_lock_by_id(ls, lkid)) {
3352 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3356 + lkb->lkb_id = lkid;
3357 + list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
3358 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3364 + * Free LKB and remove it from the lockidtbl.
3365 + * NB - this always frees the lkb whereas release_rsb doesn't free an
3366 + * rsb unless its reference count is zero.
3369 +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
3371 + uint16_t bucket = lkb->lkb_id & 0xFFFF;
3373 + if (lkb->lkb_status) {
3374 + log_error(ls, "release lkb with status %u", lkb->lkb_status);
3379 + if (lkb->lkb_parent)
3380 + atomic_dec(&lkb->lkb_parent->lkb_childcnt);
3382 + write_lock(&ls->ls_lkbtbl[bucket].lock);
3383 + list_del(&lkb->lkb_idtbl_list);
3384 + write_unlock(&ls->ls_lkbtbl[bucket].lock);
3386 + /* if this is not a master copy then lvbptr points into the user's
3387 + * lksb, so don't free it */
3388 + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
3389 + free_lvb(lkb->lkb_lvbptr);
3391 + if (lkb->lkb_range)
3392 + free_range(lkb->lkb_range);
3397 +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid)
3399 + struct dlm_lkb *lkb;
3400 + uint16_t bucket = lkid & 0xFFFF;
3402 + read_lock(&ls->ls_lkbtbl[bucket].lock);
3403 + lkb = __find_lock_by_id(ls, lkid);
3404 + read_unlock(&ls->ls_lkbtbl[bucket].lock);
3409 +struct dlm_lkb *dlm_get_lkb(void *lockspace, uint32_t lkid)
3411 + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace);
3412 + struct dlm_lkb *lkb = find_lock_by_id(ls, lkid);
3413 + put_lockspace(ls);
3418 + * Initialise the range parts of an LKB.
3421 +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end)
3423 + int ret = -ENOMEM;
3426 + * if this wasn't already a range lock, make it one
3428 + if (!lkb->lkb_range) {
3429 + lkb->lkb_range = allocate_range(lspace);
3430 + if (!lkb->lkb_range)
3434 + * This is needed for conversions that contain ranges where the
3435 + * original lock didn't but it's harmless for new locks too.
3437 + lkb->lkb_range[GR_RANGE_START] = 0LL;
3438 + lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
3441 + lkb->lkb_range[RQ_RANGE_START] = start;
3442 + lkb->lkb_range[RQ_RANGE_END] = end;
3449 diff -urN linux-orig/cluster/dlm/lkb.h linux-patched/cluster/dlm/lkb.h
3450 --- linux-orig/cluster/dlm/lkb.h 1970-01-01 07:30:00.000000000 +0730
3451 +++ linux-patched/cluster/dlm/lkb.h 2004-11-03 11:31:56.000000000 +0800
3453 +/******************************************************************************
3454 +*******************************************************************************
3456 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3457 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3459 +** This copyrighted material is made available to anyone wishing to use,
3460 +** modify, copy, or redistribute it subject to the terms and conditions
3461 +** of the GNU General Public License v.2.
3463 +*******************************************************************************
3464 +******************************************************************************/
3466 +#ifndef __LKB_DOT_H__
3467 +#define __LKB_DOT_H__
3469 +struct dlm_lkb *find_lock_by_id(struct dlm_ls *ls, uint32_t lkid);
3470 +struct dlm_lkb *create_lkb(struct dlm_ls *ls);
3471 +void release_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb);
3472 +struct dlm_lkb *dlm_get_lkb(void *ls, uint32_t lkid);
3473 +int lkb_set_range(struct dlm_ls *lspace, struct dlm_lkb *lkb, uint64_t start, uint64_t end);
3475 +#endif /* __LKB_DOT_H__ */
3476 diff -urN linux-orig/cluster/dlm/locking.c linux-patched/cluster/dlm/locking.c
3477 --- linux-orig/cluster/dlm/locking.c 1970-01-01 07:30:00.000000000 +0730
3478 +++ linux-patched/cluster/dlm/locking.c 2004-11-03 11:31:56.000000000 +0800
3480 +/******************************************************************************
3481 +*******************************************************************************
3483 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3484 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3486 +** This copyrighted material is made available to anyone wishing to use,
3487 +** modify, copy, or redistribute it subject to the terms and conditions
3488 +** of the GNU General Public License v.2.
3490 +*******************************************************************************
3491 +******************************************************************************/
3496 + * This is where the main work of the DLM goes on
3500 +#include "dlm_internal.h"
3501 +#include "lockqueue.h"
3502 +#include "locking.h"
3503 +#include "lockspace.h"
3508 +#include "memory.h"
3511 +#include "lowcomms.h"
3513 +extern struct list_head lslist;
3515 +#define MAX(a, b) (((a) > (b)) ? (a) : (b))
3518 + * Lock compatibilty matrix - thanks Steve
3519 + * UN = Unlocked state. Not really a state, used as a flag
3520 + * PD = Padding. Used to make the matrix a nice power of two in size
3521 + * Other states are the same as the VMS DLM.
3522 + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
3525 +#define modes_compat(gr, rq) \
3526 + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
3528 +const int __dlm_compat_matrix[8][8] = {
3529 + /* UN NL CR CW PR PW EX PD */
3530 + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
3531 + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
3532 + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
3533 + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
3534 + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
3535 + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
3536 + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
3537 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3541 + * Compatibility matrix for conversions with QUECVT set.
3542 + * Granted mode is the row; requested mode is the column.
3543 + * Usage: matrix[grmode+1][rqmode+1]
3546 +const int __quecvt_compat_matrix[8][8] = {
3547 + /* UN NL CR CW PR PW EX PD */
3548 + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
3549 + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
3550 + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
3551 + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
3552 + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
3553 + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
3554 + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
3555 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
3559 + * This defines the direction of transfer of LVB data.
3560 + * Granted mode is the row; requested mode is the column.
3561 + * Usage: matrix[grmode+1][rqmode+1]
3562 + * 1 = LVB is returned to the caller
3563 + * 0 = LVB is written to the resource
3564 + * -1 = nothing happens to the LVB
3567 +const int __lvb_operations[8][8] = {
3568 + /* UN NL CR CW PR PW EX PD*/
3569 + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
3570 + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
3571 + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
3572 + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
3573 + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
3574 + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
3575 + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
3576 + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
3579 +static void grant_lock(struct dlm_lkb *lkb, int send_remote);
3580 +static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb);
3581 +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb);
3582 +static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb,
3583 + uint32_t flags, void *ast, void *astarg, void *bast,
3584 + struct dlm_range *range);
3585 +static int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb,
3586 + uint32_t flags, char *name, int namelen);
3589 +inline int dlm_modes_compat(int mode1, int mode2)
3591 + return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
3594 +static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
3596 + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, lkb_statequeue);
3598 + if (lkb->lkb_id == first->lkb_id)
3605 + * Return 1 if the locks' ranges overlap
3606 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
3609 +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
3611 + if (!lkb1->lkb_range || !lkb2->lkb_range)
3614 + if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
3615 + lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
3622 + * "A conversion deadlock arises with a pair of lock requests in the converting
3623 + * queue for one resource. The granted mode of each lock blocks the requested
3624 + * mode of the other lock."
3627 +static struct dlm_lkb *conversion_deadlock_detect(struct dlm_rsb *rsb,
3628 + struct dlm_lkb *lkb)
3630 + struct dlm_lkb *this;
3632 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
3636 + if (!ranges_overlap(lkb, this))
3639 + if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
3647 + * Check if the given lkb conflicts with another lkb on the queue.
3650 +static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
3652 + struct dlm_lkb *this;
3654 + list_for_each_entry(this, head, lkb_statequeue) {
3657 + if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
3664 + * Return 1 if the lock can be granted, 0 otherwise.
3665 + * Also detect and resolve conversion deadlocks.
3667 + * lkb is the lock to be granted
3669 + * now is 1 if the function is being called in the context of the
3670 + * immediate request, it is 0 if called later, after the lock has been
3673 + * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
3676 +static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
3678 + int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
3681 + * 6-10: Version 5.4 introduced an option to address the phenomenon of
3682 + * a new request for a NL mode lock being blocked.
3684 + * 6-11: If the optional EXPEDITE flag is used with the new NL mode
3685 + * request, then it would be granted. In essence, the use of this flag
3686 + * tells the Lock Manager to expedite this request by not considering
3687 + * what may be in the CONVERTING or WAITING queues... As of this
3688 + * writing, the EXPEDITE flag can be used only with new requests for NL
3689 + * mode locks. This flag is not valid for conversion requests.
3691 + * A shortcut. Earlier checks return an error if EXPEDITE is used in a
3692 + * conversion or used with a non-NL requested mode. We also know an
3693 + * EXPEDITE request is always granted immediately, so now must always
3694 + * be 1. The full condition to grant an expedite request: (now &&
3695 + * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
3696 + * therefore be shortened to just checking the flag.
3699 + if (lkb->lkb_lockqueue_flags & DLM_LKF_EXPEDITE)
3703 + * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
3704 + * added to the remaining conditions.
3707 + if (queue_conflict(&r->res_grantqueue, lkb))
3711 + * 6-3: By default, a conversion request is immediately granted if the
3712 + * requested mode is compatible with the modes of all other granted
3716 + if (queue_conflict(&r->res_convertqueue, lkb))
3720 + * 6-5: But the default algorithm for deciding whether to grant or
3721 + * queue conversion requests does not by itself guarantee that such
3722 + * requests are serviced on a "first come first serve" basis. This, in
3723 + * turn, can lead to a phenomenon known as "indefinite postponement".
3725 + * 6-7: This issue is dealt with by using the optional QUECVT flag with
3726 + * the system service employed to request a lock conversion. This flag
3727 + * forces certain conversion requests to be queued, even if they are
3728 + * compatible with the granted modes of other locks on the same
3729 + * resource. Thus, the use of this flag results in conversion requests
3730 + * being ordered on a "first come first serve" basis.
3733 + if (now && conv && !(lkb->lkb_lockqueue_flags & DLM_LKF_QUECVT))
3737 + * When using range locks the NOORDER flag is set to avoid the standard
3738 + * vms rules on grant order.
3741 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOORDER)
3745 + * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
3746 + * granted until all other conversion requests ahead of it are granted
3747 + * and/or canceled.
3750 + if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
3754 + * 6-4: By default, a new request is immediately granted only if all
3755 + * three of the following conditions are satisfied when the request is
3757 + * - The queue of ungranted conversion requests for the resource is
3759 + * - The queue of ungranted new requests for the resource is empty.
3760 + * - The mode of the new request is compatible with the most
3761 + * restrictive mode of all granted locks on the resource.
3764 + if (now && !conv && list_empty(&r->res_convertqueue) &&
3765 + list_empty(&r->res_waitqueue))
3769 + * 6-4: Once a lock request is in the queue of ungranted new requests,
3770 + * it cannot be granted until the queue of ungranted conversion
3771 + * requests is empty, all ungranted new requests ahead of it are
3772 + * granted and/or canceled, and it is compatible with the granted mode
3773 + * of the most restrictive lock granted on the resource.
3776 + if (!now && !conv && list_empty(&r->res_convertqueue) &&
3777 + first_in_list(lkb, &r->res_waitqueue))
3782 + * The following, enabled by CONVDEADLK, departs from VMS.
3785 + if (now && conv && (lkb->lkb_lockqueue_flags & DLM_LKF_CONVDEADLK) &&
3786 + conversion_deadlock_detect(r, lkb)) {
3787 + lkb->lkb_grmode = DLM_LOCK_NL;
3788 + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
3794 +int dlm_lock(void *lockspace,
3796 + struct dlm_lksb *lksb,
3799 + unsigned int namelen,
3801 + void (*ast) (void *astarg),
3803 + void (*bast) (void *astarg, int mode),
3804 + struct dlm_range *range)
3806 + struct dlm_ls *lspace;
3807 + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL;
3808 + int ret = -EINVAL;
3810 + lspace = find_lockspace_by_local_id(lockspace);
3814 + if (mode < 0 || mode > DLM_LOCK_EX)
3817 + if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
3820 + if (flags & DLM_LKF_CANCEL)
3823 + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
3826 + if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
3829 + if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
3832 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
3835 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
3838 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
3841 + if (flags & DLM_LKF_EXPEDITE && (mode != DLM_LOCK_NL))
3844 + if (!ast || !lksb)
3847 + if ((flags & DLM_LKF_VALBLK) && !lksb->sb_lvbptr)
3851 + * Take conversion path.
3854 + if (flags & DLM_LKF_CONVERT) {
3855 + ret = convert_lock(lspace, mode, lksb, flags, ast, astarg,
3860 +#ifdef CONFIG_DLM_STATS
3861 + dlm_stats.lockops++;
3864 + * Take new lock path.
3868 + down_read(&lspace->ls_unlock_sem);
3870 + parent_lkb = find_lock_by_id(lspace, parent);
3872 + if (!parent_lkb ||
3873 + parent_lkb->lkb_flags & GDLM_LKFLG_DELETED ||
3874 + parent_lkb->lkb_flags & GDLM_LKFLG_MSTCPY ||
3875 + parent_lkb->lkb_status != GDLM_LKSTS_GRANTED) {
3876 + up_read(&lspace->ls_unlock_sem);
3880 + atomic_inc(&parent_lkb->lkb_childcnt);
3881 + up_read(&lspace->ls_unlock_sem);
3884 + down_read(&lspace->ls_in_recovery);
3888 + lkb = create_lkb(lspace);
3891 + lkb->lkb_astaddr = ast;
3892 + lkb->lkb_astparam = (long) astarg;
3893 + lkb->lkb_bastaddr = bast;
3894 + lkb->lkb_rqmode = mode;
3895 + lkb->lkb_grmode = DLM_LOCK_IV;
3896 + lkb->lkb_nodeid = -1;
3897 + lkb->lkb_lksb = lksb;
3898 + lkb->lkb_parent = parent_lkb;
3899 + lkb->lkb_lockqueue_flags = flags;
3900 + lkb->lkb_lvbptr = lksb->sb_lvbptr;
3902 + if (!in_interrupt() && current)
3903 + lkb->lkb_ownpid = (int) current->pid;
3905 + lkb->lkb_ownpid = 0;
3908 + if (range->ra_start > range->ra_end) {
3913 + if (lkb_set_range(lspace, lkb, range->ra_start, range->ra_end))
3917 + /* Convert relevant flags to internal numbers */
3918 + if (flags & DLM_LKF_VALBLK)
3919 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
3920 + if (flags & DLM_LKF_PERSISTENT)
3921 + lkb->lkb_flags |= GDLM_LKFLG_PERSISTENT;
3922 + if (flags & DLM_LKF_NODLCKWT)
3923 + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
3925 + lksb->sb_lkid = lkb->lkb_id;
3927 + ret = dlm_lock_stage1(lspace, lkb, flags, name, namelen);
3931 + up_read(&lspace->ls_in_recovery);
3935 + put_lockspace(lspace);
3939 + release_lkb(lspace, lkb);
3944 + atomic_dec(&parent_lkb->lkb_childcnt);
3947 + up_read(&lspace->ls_in_recovery);
3950 + put_lockspace(lspace);
3954 +int dlm_lock_stage1(struct dlm_ls *ls, struct dlm_lkb *lkb, uint32_t flags,
3955 + char *name, int namelen)
3957 + struct dlm_rsb *rsb, *parent_rsb = NULL;
3958 + struct dlm_lkb *parent_lkb = lkb->lkb_parent;
3960 + int error, dir_error = 0;
3963 + parent_rsb = parent_lkb->lkb_resource;
3965 + error = find_rsb(ls, parent_rsb, name, namelen, CREATE, &rsb);
3968 + lkb->lkb_resource = rsb;
3969 + down_write(&rsb->res_lock);
3971 + log_debug(ls, "(%d) rq %u %x \"%s\"", lkb->lkb_ownpid, lkb->lkb_rqmode,
3972 + lkb->lkb_id, rsb->res_name);
3974 + * Next stage, do we need to find the master or can
3975 + * we get on with the real locking work ?
3979 + if (rsb->res_nodeid == -1) {
3980 + if (get_directory_nodeid(rsb) != our_nodeid()) {
3981 + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
3982 + up_write(&rsb->res_lock);
3986 + error = dlm_dir_lookup(ls, our_nodeid(), rsb->res_name,
3987 + rsb->res_length, &nodeid);
3989 + DLM_ASSERT(error == -EEXIST,);
3991 + dir_error = error;
3995 + if (nodeid == our_nodeid()) {
3996 + set_bit(RESFL_MASTER, &rsb->res_flags);
3997 + rsb->res_nodeid = 0;
3999 + clear_bit(RESFL_MASTER, &rsb->res_flags);
4000 + rsb->res_nodeid = nodeid;
4004 + log_all(ls, "dir lookup retry %x %u", lkb->lkb_id,
4009 + lkb->lkb_nodeid = rsb->res_nodeid;
4010 + up_write(&rsb->res_lock);
4012 + error = dlm_lock_stage2(ls, lkb, rsb, flags);
4018 + * Locking routine called after we have an RSB, either a copy of a remote one
4019 + * or a local one, or perhaps a shiny new one all of our very own
4022 +int dlm_lock_stage2(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_rsb *rsb,
4027 + DLM_ASSERT(rsb->res_nodeid != -1, print_lkb(lkb); print_rsb(rsb););
4029 + if (rsb->res_nodeid) {
4030 + res_lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4031 + error = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONDGRANT);
4033 + dlm_lock_stage3(lkb);
4040 + * Called on an RSB's master node to do stage2 locking for a remote lock
4041 + * request. Returns a proper lkb with rsb ready for lock processing.
4042 + * This is analogous to sections of dlm_lock() and dlm_lock_stage1().
4045 +struct dlm_lkb *remote_stage2(int remote_nodeid, struct dlm_ls *ls,
4046 + struct dlm_request *freq)
4048 + struct dlm_rsb *rsb = NULL, *parent_rsb = NULL;
4049 + struct dlm_lkb *lkb = NULL, *parent_lkb = NULL;
4050 + int error, namelen;
4052 + if (freq->rr_remparid) {
4053 + parent_lkb = find_lock_by_id(ls, freq->rr_remparid);
4057 + atomic_inc(&parent_lkb->lkb_childcnt);
4058 + parent_rsb = parent_lkb->lkb_resource;
4062 + * A new MSTCPY lkb. Initialize lkb fields including the real lkid and
4063 + * node actually holding the (non-MSTCPY) lkb. AST address are just
4064 + * flags in the master copy.
4067 + lkb = create_lkb(ls);
4070 + lkb->lkb_grmode = DLM_LOCK_IV;
4071 + lkb->lkb_rqmode = freq->rr_rqmode;
4072 + lkb->lkb_parent = parent_lkb;
4073 + lkb->lkb_astaddr = (void *) (long) (freq->rr_asts & AST_COMP);
4074 + lkb->lkb_bastaddr = (void *) (long) (freq->rr_asts & AST_BAST);
4075 + lkb->lkb_nodeid = remote_nodeid;
4076 + lkb->lkb_remid = freq->rr_header.rh_lkid;
4077 + lkb->lkb_flags = GDLM_LKFLG_MSTCPY;
4078 + lkb->lkb_lockqueue_flags = freq->rr_flags;
4080 + if (lkb->lkb_lockqueue_flags & DLM_LKF_VALBLK) {
4081 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
4082 + allocate_and_copy_lvb(ls, &lkb->lkb_lvbptr, freq->rr_lvb);
4083 + if (!lkb->lkb_lvbptr)
4087 + if (lkb->lkb_lockqueue_flags & GDLM_LKFLG_RANGE) {
4088 + error = lkb_set_range(ls, lkb, freq->rr_range_start,
4089 + freq->rr_range_end);
4095 + * Get the RSB which this lock is for. Create a new RSB if this is a
4096 + * new lock on a new resource. We must be the master of any new rsb.
4099 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
4101 + error = find_rsb(ls, parent_rsb, freq->rr_name, namelen, MASTER, &rsb);
4106 + log_debug(ls, "send einval to %u", remote_nodeid);
4107 + /* print_name(freq->rr_name, namelen); */
4108 + lkb->lkb_retstatus = -EINVAL;
4112 + lkb->lkb_resource = rsb;
4114 + log_debug(ls, "(%d) rq %u from %u %x \"%s\"",
4115 + lkb->lkb_ownpid, lkb->lkb_rqmode, remote_nodeid,
4116 + lkb->lkb_id, rsb->res_name);
4122 + /* release_lkb handles parent */
4123 + release_lkb(ls, lkb);
4124 + parent_lkb = NULL;
4128 + atomic_dec(&parent_lkb->lkb_childcnt);
4134 + * The final bit of lock request processing on the master node. Here the lock
4135 + * is granted and the completion ast is queued, or the lock is put on the
4136 + * waitqueue and blocking asts are sent.
4139 +void dlm_lock_stage3(struct dlm_lkb *lkb)
4141 + struct dlm_rsb *rsb = lkb->lkb_resource;
4144 + * This is a locally mastered lock on a resource that already exists,
4145 + * see if it can be granted or if it must wait. When this function is
4146 + * called for a remote lock request (process_cluster_request,
4147 + * REMCMD_LOCKREQUEST), the result from grant_lock is returned to the
4148 + * requesting node at the end of process_cluster_request, not at the
4149 + * end of grant_lock.
4152 + down_write(&rsb->res_lock);
4154 + if (can_be_granted(rsb, lkb, TRUE)) {
4155 + grant_lock(lkb, 0);
4160 + * This request is not a conversion, so the lkb didn't exist other than
4161 + * for this request and should be freed after EAGAIN is returned in the
4165 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4166 + lkb->lkb_retstatus = -EAGAIN;
4167 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4168 + send_blocking_asts_all(rsb, lkb);
4169 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
4174 + * The requested lkb must wait. Because the rsb of the requested lkb
4175 + * is mastered here, send blocking asts for the lkb's blocking the
4179 + log_debug2("w %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4180 + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4181 + lkb->lkb_status, rsb->res_name);
4183 + lkb->lkb_retstatus = 0;
4184 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_WAITING);
4186 + send_blocking_asts(rsb, lkb);
4189 + up_write(&rsb->res_lock);
4192 +int dlm_unlock(void *lockspace,
4195 + struct dlm_lksb *lksb,
4198 + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace);
4199 + struct dlm_lkb *lkb;
4200 + struct dlm_rsb *rsb;
4201 + int ret = -EINVAL;
4204 + log_print("dlm_unlock: lkid %x lockspace not found", lkid);
4208 + lkb = find_lock_by_id(ls, lkid);
4210 + log_debug(ls, "unlock %x no id", lkid);
4214 + /* Can't dequeue a master copy (a remote node's mastered lock) */
4215 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
4216 + log_debug(ls, "(%d) unlock %x lkb_flags %x",
4217 + lkb->lkb_ownpid, lkid, lkb->lkb_flags);
4221 + /* Already waiting for a remote lock operation */
4222 + if (lkb->lkb_lockqueue_state) {
4223 + log_debug(ls, "(%d) unlock %x lq%d",
4224 + lkb->lkb_ownpid, lkid, lkb->lkb_lockqueue_state);
4229 +#ifdef CONFIG_DLM_STATS
4230 + dlm_stats.unlockops++;
4232 + /* Can only cancel WAITING or CONVERTing locks.
4233 + * This is just a quick check - it is also checked in unlock_stage2()
4234 + * (which may be on the master) under the semaphore.
4236 + if ((flags & DLM_LKF_CANCEL) &&
4237 + (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
4238 + log_debug(ls, "(%d) unlock %x %x %d",
4239 + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status);
4243 + /* "Normal" unlocks must operate on a granted lock */
4244 + if (!(flags & DLM_LKF_CANCEL) &&
4245 + (lkb->lkb_status != GDLM_LKSTS_GRANTED)) {
4246 + log_debug(ls, "(%d) unlock %x %x %d",
4247 + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status);
4251 + if (lkb->lkb_flags & GDLM_LKFLG_DELETED) {
4252 + log_debug(ls, "(%d) unlock deleted %x %x %d",
4253 + lkb->lkb_ownpid, lkid, flags, lkb->lkb_status);
4257 + down_write(&ls->ls_unlock_sem);
4258 + /* Can't dequeue a lock with sublocks */
4259 + if (atomic_read(&lkb->lkb_childcnt)) {
4260 + up_write(&ls->ls_unlock_sem);
4264 + /* Mark it as deleted so we can't use it as a parent in dlm_lock() */
4265 + if (!(flags & DLM_LKF_CANCEL))
4266 + lkb->lkb_flags |= GDLM_LKFLG_DELETED;
4267 + up_write(&ls->ls_unlock_sem);
4269 + down_read(&ls->ls_in_recovery);
4270 + rsb = find_rsb_to_unlock(ls, lkb);
4272 + log_debug(ls, "(%d) un %x %x %d %d \"%s\"",
4280 + /* Save any new params */
4282 + lkb->lkb_lksb = lksb;
4283 + lkb->lkb_astparam = (long) astarg;
4284 + lkb->lkb_lockqueue_flags = flags;
4286 + if (lkb->lkb_nodeid)
4287 + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_UNLOCK);
4289 + ret = dlm_unlock_stage2(lkb, rsb, flags);
4290 + up_read(&ls->ls_in_recovery);
4295 + put_lockspace(ls);
4299 +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags)
4301 + int remote = lkb->lkb_flags & GDLM_LKFLG_MSTCPY;
4304 + down_write(&rsb->res_lock);
4306 + /* Can only cancel WAITING or CONVERTing locks */
4307 + if ((flags & DLM_LKF_CANCEL) &&
4308 + (lkb->lkb_status == GDLM_LKSTS_GRANTED)) {
4309 + lkb->lkb_retstatus = -EINVAL;
4310 + queue_ast(lkb, AST_COMP, 0);
4314 + log_debug2("u %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4315 + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4316 + lkb->lkb_status, rsb->res_name);
4318 + old_status = lkb_dequeue(lkb);
4321 + * Cancelling a conversion
4324 + if ((old_status == GDLM_LKSTS_CONVERT) && (flags & DLM_LKF_CANCEL)) {
4325 + /* VMS semantics say we should send blocking ASTs again here */
4326 + send_blocking_asts(rsb, lkb);
4328 + /* Remove from deadlock detection */
4329 + if (lkb->lkb_duetime)
4330 + remove_from_deadlockqueue(lkb);
4332 + /* Stick it back on the granted queue */
4333 + lkb_enqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4334 + lkb->lkb_rqmode = lkb->lkb_grmode;
4336 + /* Was it blocking any other locks? */
4337 + if (first_in_list(lkb, &rsb->res_convertqueue))
4338 + grant_pending_locks(rsb);
4340 + lkb->lkb_retstatus = -DLM_ECANCEL;
4341 + queue_ast(lkb, AST_COMP, 0);
4346 + * If was granted grant any converting or waiting locks
4347 + * and save or clear lvb
4350 + if (old_status == GDLM_LKSTS_GRANTED) {
4351 + if (rsb->res_lvbptr && (lkb->lkb_grmode >= DLM_LOCK_PW)) {
4352 + if ((flags & DLM_LKF_VALBLK) && lkb->lkb_lvbptr)
4353 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr,
4355 + if (flags & DLM_LKF_IVVALBLK)
4356 + memset(rsb->res_lvbptr, 0, DLM_LVB_LEN);
4359 + grant_pending_locks(rsb);
4361 + DLM_ASSERT(0, print_lkb(lkb); print_rsb(rsb););
4363 + lkb->lkb_retstatus = flags & DLM_LKF_CANCEL ? -DLM_ECANCEL:-DLM_EUNLOCK;
4366 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
4368 + up_write(&rsb->res_lock);
4369 + release_lkb(rsb->res_ls, lkb);
4375 + up_write(&rsb->res_lock);
4385 +static int convert_lock(struct dlm_ls *ls, int mode, struct dlm_lksb *lksb,
4386 + uint32_t flags, void *ast, void *astarg, void *bast,
4387 + struct dlm_range *range)
4389 + struct dlm_lkb *lkb;
4390 + struct dlm_rsb *rsb;
4391 + int ret = -EINVAL;
4393 + lkb = find_lock_by_id(ls, lksb->sb_lkid);
4398 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED) {
4403 + if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
4407 + if ((flags & DLM_LKF_QUECVT) &&
4408 + !__quecvt_compat_matrix[lkb->lkb_grmode + 1][mode + 1]) {
4412 + if (!lksb->sb_lvbptr && (flags & DLM_LKF_VALBLK)) {
4416 +#ifdef CONFIG_DLM_STATS
4417 + dlm_stats.convertops++;
4419 + /* Set up the ranges as appropriate */
4421 + if (range->ra_start > range->ra_end)
4424 + if (lkb_set_range(ls, lkb, range->ra_start, range->ra_end)) {
4430 + rsb = lkb->lkb_resource;
4431 + down_read(&ls->ls_in_recovery);
4433 + log_debug(ls, "(%d) cv %u %x \"%s\"", lkb->lkb_ownpid, mode,
4434 + lkb->lkb_id, rsb->res_name);
4436 + lkb->lkb_flags &= ~GDLM_LKFLG_VALBLK;
4437 + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
4439 + if (flags & DLM_LKF_NODLCKWT)
4440 + lkb->lkb_flags |= GDLM_LKFLG_NODLCKWT;
4441 + lkb->lkb_astaddr = ast;
4442 + lkb->lkb_astparam = (long) astarg;
4443 + lkb->lkb_bastaddr = bast;
4444 + lkb->lkb_rqmode = mode;
4445 + lkb->lkb_lockqueue_flags = flags;
4446 + lkb->lkb_flags |= (flags & DLM_LKF_VALBLK) ? GDLM_LKFLG_VALBLK : 0;
4447 + lkb->lkb_lvbptr = lksb->sb_lvbptr;
4449 + if (rsb->res_nodeid) {
4450 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4451 + ret = remote_stage(lkb, GDLM_LQSTATE_WAIT_CONVERT);
4453 + ret = dlm_convert_stage2(lkb, FALSE);
4456 + up_read(&ls->ls_in_recovery);
4465 + * For local conversion requests on locally mastered locks this is called
4466 + * directly from dlm_lock/convert_lock. This function is also called for
4467 + * remote conversion requests of MSTCPY locks (from process_cluster_request).
4470 +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast)
4472 + struct dlm_rsb *rsb = lkb->lkb_resource;
4475 + down_write(&rsb->res_lock);
4477 + if (can_be_granted(rsb, lkb, TRUE)) {
4478 + grant_lock(lkb, 0);
4479 + grant_pending_locks(rsb);
4483 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUE) {
4484 + ret = lkb->lkb_retstatus = -EAGAIN;
4486 + queue_ast(lkb, AST_COMP, 0);
4487 + if (lkb->lkb_lockqueue_flags & DLM_LKF_NOQUEUEBAST)
4488 + send_blocking_asts_all(rsb, lkb);
4492 + log_debug2("c %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4493 + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4494 + lkb->lkb_status, rsb->res_name);
4496 + lkb->lkb_retstatus = 0;
4497 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
4500 + * The granted mode may have been reduced to NL by conversion deadlock
4501 + * avoidance in can_be_granted(). If so, try to grant other locks.
4504 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED)
4505 + grant_pending_locks(rsb);
4507 + send_blocking_asts(rsb, lkb);
4509 + if (!(lkb->lkb_flags & GDLM_LKFLG_NODLCKWT))
4510 + add_to_deadlockqueue(lkb);
4513 + up_write(&rsb->res_lock);
4518 + * Remove lkb from any queue it's on, add it to the granted queue, and queue a
4519 + * completion ast. rsb res_lock must be held in write when this is called.
4522 +static void grant_lock(struct dlm_lkb *lkb, int send_remote)
4524 + struct dlm_rsb *rsb = lkb->lkb_resource;
4526 + if (lkb->lkb_duetime)
4527 + remove_from_deadlockqueue(lkb);
4529 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
4531 + DLM_ASSERT(lkb->lkb_lvbptr,);
4533 + if (!rsb->res_lvbptr)
4534 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
4536 + b = __lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
4538 + memcpy(lkb->lkb_lvbptr, rsb->res_lvbptr, DLM_LVB_LEN);
4540 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
4543 + if (lkb->lkb_range) {
4544 + lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
4545 + lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
4548 + log_debug2("g %x %d %x %d,%d %d %s", lkb->lkb_id, lkb->lkb_nodeid,
4549 + lkb->lkb_remid, lkb->lkb_grmode, lkb->lkb_rqmode,
4550 + lkb->lkb_status, rsb->res_name);
4552 + if (lkb->lkb_grmode != lkb->lkb_rqmode) {
4553 + lkb->lkb_grmode = lkb->lkb_rqmode;
4554 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4556 + lkb->lkb_rqmode = DLM_LOCK_IV;
4557 + lkb->lkb_highbast = 0;
4558 + lkb->lkb_retstatus = 0;
4559 + queue_ast(lkb, AST_COMP, 0);
4562 + * A remote conversion request has been granted, either immediately
4563 + * upon being requested or after waiting a bit. In the former case,
4564 + * reply_and_grant() is called. In the latter case send_remote is 1 and
4565 + * remote_grant() is called.
4567 + * The "send_remote" flag is set only for locks which are granted "out
4568 + * of band" - ie by another lock being converted or unlocked.
4570 + * The second case occurs when this lkb is granted right away as part
4571 + * of processing the initial request. In that case, we send a single
4572 + * message in reply_and_grant which combines the request reply with the
4576 + if ((lkb->lkb_flags & GDLM_LKFLG_MSTCPY) && lkb->lkb_nodeid) {
4578 + remote_grant(lkb);
4579 + else if (lkb->lkb_request)
4580 + reply_and_grant(lkb);
4585 +static void send_bast_queue(struct list_head *head, struct dlm_lkb *lkb)
4587 + struct dlm_lkb *gr;
4589 + list_for_each_entry(gr, head, lkb_statequeue) {
4590 + if (gr->lkb_bastaddr &&
4591 + gr->lkb_highbast < lkb->lkb_rqmode &&
4592 + ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
4593 + queue_ast(gr, AST_BAST, lkb->lkb_rqmode);
4594 + gr->lkb_highbast = lkb->lkb_rqmode;
4600 + * Notify granted locks if they are blocking a newly forced-to-wait lock.
4603 +static void send_blocking_asts(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
4605 + send_bast_queue(&rsb->res_grantqueue, lkb);
4606 + /* check if the following improves performance */
4607 + /* send_bast_queue(&rsb->res_convertqueue, lkb); */
4610 +static void send_blocking_asts_all(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
4612 + send_bast_queue(&rsb->res_grantqueue, lkb);
4613 + send_bast_queue(&rsb->res_convertqueue, lkb);
4617 + * Called when a lock has been dequeued. Look for any locks to grant that are
4618 + * waiting for conversion or waiting to be granted.
4619 + * The rsb res_lock must be held in write when this function is called.
4622 +int grant_pending_locks(struct dlm_rsb *r)
4624 + struct dlm_lkb *lkb, *s;
4625 + int8_t high = DLM_LOCK_IV;
4627 + list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
4628 + if (can_be_granted(r, lkb, FALSE))
4629 + grant_lock(lkb, 1);
4631 + high = MAX(lkb->lkb_rqmode, high);
4634 + list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
4635 + if (lkb->lkb_lockqueue_state)
4638 + if (can_be_granted(r, lkb, FALSE))
4639 + grant_lock(lkb, 1);
4641 + high = MAX(lkb->lkb_rqmode, high);
4645 + * If there are locks left on the wait/convert queue then send blocking
4646 + * ASTs to granted locks that are blocking
4648 + * FIXME: This might generate some spurious blocking ASTs for range
4652 + if (high > DLM_LOCK_IV) {
4653 + list_for_each_entry_safe(lkb, s, &r->res_grantqueue,
4655 + if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
4656 + !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
4657 + queue_ast(lkb, AST_BAST, high);
4658 + lkb->lkb_highbast = high;
4667 + * Called to cancel a locking operation that failed due to some internal
4670 + * Waiting locks will be removed, converting locks will be reverted to their
4671 + * granted status, unlocks will be left where they are.
4673 + * A completion AST will be delivered to the caller.
4676 +int cancel_lockop(struct dlm_lkb *lkb, int status)
4678 + int state = lkb->lkb_lockqueue_state;
4679 + uint16_t astflags = AST_COMP;
4681 + lkb->lkb_lockqueue_state = 0;
4684 + case GDLM_LQSTATE_WAIT_RSB:
4685 + astflags |= AST_DEL;
4688 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4689 + res_lkb_dequeue(lkb);
4690 + astflags |= AST_DEL;
4693 + case GDLM_LQSTATE_WAIT_CONVERT:
4694 + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
4696 + /* Remove from deadlock detection */
4697 + if (lkb->lkb_duetime) {
4698 + remove_from_deadlockqueue(lkb);
4702 + case GDLM_LQSTATE_WAIT_UNLOCK:
4703 + /* We can leave this. I think.... */
4707 + lkb->lkb_retstatus = status;
4708 + queue_ast(lkb, astflags, 0);
4714 + * Check for conversion deadlock. If a deadlock was found
4715 + * return lkb to kill, else return NULL
4718 +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb)
4720 + struct dlm_rsb *rsb = lkb->lkb_resource;
4721 + struct list_head *entry;
4723 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_CONVERT,);
4725 + /* Work our way up to the head of the queue looking for locks that
4726 + * conflict with us */
4728 + down_read(&rsb->res_lock);
4730 + entry = lkb->lkb_statequeue.prev;
4731 + while (entry != &rsb->res_convertqueue) {
4732 + struct dlm_lkb *lkb2 = list_entry(entry, struct dlm_lkb, lkb_statequeue);
4734 + if (ranges_overlap(lkb, lkb2) && !modes_compat(lkb2, lkb)) {
4735 + up_read(&rsb->res_lock);
4738 + entry = entry->prev;
4740 + up_read(&rsb->res_lock);
4746 + * Conversion operation was cancelled by us (not the user).
4747 + * ret contains the return code to pass onto the user
4750 +void cancel_conversion(struct dlm_lkb *lkb, int ret)
4752 + struct dlm_rsb *rsb = lkb->lkb_resource;
4754 + /* Stick it back on the granted queue */
4755 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
4756 + lkb->lkb_rqmode = lkb->lkb_grmode;
4758 + remove_from_deadlockqueue(lkb);
4760 + lkb->lkb_retstatus = ret;
4761 + queue_ast(lkb, AST_COMP, 0);
4766 + * As new master of the rsb for this lkb, we need to handle these requests
4767 + * removed from the lockqueue and originating from local processes:
4768 + * GDLM_LQSTATE_WAIT_RSB, GDLM_LQSTATE_WAIT_CONDGRANT,
4769 + * GDLM_LQSTATE_WAIT_UNLOCK, GDLM_LQSTATE_WAIT_CONVERT.
4772 +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state)
4774 + struct dlm_rsb *rsb;
4777 + case GDLM_LQSTATE_WAIT_RSB:
4778 + dlm_lock_stage1(lkb->lkb_resource->res_ls, lkb,
4779 + lkb->lkb_lockqueue_flags,
4780 + lkb->lkb_resource->res_name,
4781 + lkb->lkb_resource->res_length);
4784 + case GDLM_LQSTATE_WAIT_CONDGRANT:
4785 + res_lkb_dequeue(lkb);
4786 + dlm_lock_stage3(lkb);
4789 + case GDLM_LQSTATE_WAIT_UNLOCK:
4790 + rsb = find_rsb_to_unlock(ls, lkb);
4791 + dlm_unlock_stage2(lkb, rsb, lkb->lkb_lockqueue_flags);
4794 + case GDLM_LQSTATE_WAIT_CONVERT:
4795 + dlm_convert_stage2(lkb, TRUE);
4803 +static void dump_queue(struct list_head *head, char *qname)
4805 + struct dlm_lkb *lkb;
4807 + list_for_each_entry(lkb, head, lkb_statequeue) {
4808 + printk("%s %08x gr %d rq %d flg %x sts %u node %u remid %x "
4818 + lkb->lkb_lockqueue_state,
4819 + lkb->lkb_lockqueue_flags);
4823 +static void dump_rsb(struct dlm_rsb *rsb)
4825 + printk("name \"%s\" flags %lx nodeid %d ref %u\n",
4826 + rsb->res_name, rsb->res_flags, rsb->res_nodeid,
4827 + atomic_read(&rsb->res_ref));
4829 + if (!list_empty(&rsb->res_grantqueue))
4830 + dump_queue(&rsb->res_grantqueue, "G");
4832 + if (!list_empty(&rsb->res_convertqueue))
4833 + dump_queue(&rsb->res_convertqueue, "C");
4835 + if (!list_empty(&rsb->res_waitqueue))
4836 + dump_queue(&rsb->res_waitqueue, "W");
4839 +void dlm_locks_dump(void)
4841 + struct dlm_ls *ls;
4842 + struct dlm_rsb *rsb;
4843 + struct list_head *head;
4846 + lowcomms_stop_accept();
4848 + list_for_each_entry(ls, &lslist, ls_list) {
4849 + down_write(&ls->ls_in_recovery);
4850 + for (i = 0; i < ls->ls_rsbtbl_size; i++) {
4851 + head = &ls->ls_rsbtbl[i].list;
4852 + list_for_each_entry(rsb, head, res_hashchain)
4858 diff -urN linux-orig/cluster/dlm/locking.h linux-patched/cluster/dlm/locking.h
4859 --- linux-orig/cluster/dlm/locking.h 1970-01-01 07:30:00.000000000 +0730
4860 +++ linux-patched/cluster/dlm/locking.h 2004-11-03 11:31:56.000000000 +0800
4862 +/******************************************************************************
4863 +*******************************************************************************
4865 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4866 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4868 +** This copyrighted material is made available to anyone wishing to use,
4869 +** modify, copy, or redistribute it subject to the terms and conditions
4870 +** of the GNU General Public License v.2.
4872 +*******************************************************************************
4873 +******************************************************************************/
4875 +#ifndef __LOCKING_DOT_H__
4876 +#define __LOCKING_DOT_H__
4878 +int dlm_modes_compat(int mode1, int mode2);
4879 +void process_remastered_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb, int state);
4880 +void dlm_lock_stage3(struct dlm_lkb *lkb);
4881 +int dlm_convert_stage2(struct dlm_lkb *lkb, int do_ast);
4882 +int dlm_unlock_stage2(struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags);
4883 +int dlm_lock_stage2(struct dlm_ls *lspace, struct dlm_lkb *lkb, struct dlm_rsb *rsb, uint32_t flags);
4884 +struct dlm_rsb *create_rsb(struct dlm_ls *lspace, struct dlm_lkb *lkb, char *name, int namelen);
4885 +int free_rsb_if_unused(struct dlm_rsb *rsb);
4886 +struct dlm_lkb *remote_stage2(int remote_csid, struct dlm_ls *lspace,
4887 + struct dlm_request *freq);
4888 +int cancel_lockop(struct dlm_lkb *lkb, int status);
4889 +int dlm_remove_lock(struct dlm_lkb *lkb, uint32_t flags);
4890 +int grant_pending_locks(struct dlm_rsb *rsb);
4891 +void cancel_conversion(struct dlm_lkb *lkb, int ret);
4892 +struct dlm_lkb *conversion_deadlock_check(struct dlm_lkb *lkb);
4894 +#endif /* __LOCKING_DOT_H__ */
4895 diff -urN linux-orig/cluster/dlm/lockqueue.c linux-patched/cluster/dlm/lockqueue.c
4896 --- linux-orig/cluster/dlm/lockqueue.c 1970-01-01 07:30:00.000000000 +0730
4897 +++ linux-patched/cluster/dlm/lockqueue.c 2004-11-03 11:31:56.000000000 +0800
4899 +/******************************************************************************
4900 +*******************************************************************************
4902 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
4903 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4905 +** This copyrighted material is made available to anyone wishing to use,
4906 +** modify, copy, or redistribute it subject to the terms and conditions
4907 +** of the GNU General Public License v.2.
4909 +*******************************************************************************
4910 +******************************************************************************/
4915 + * This controls the lock queue, which is where locks
4916 + * come when they need to wait for a remote operation
4919 + * This could also be thought of as the "high-level" comms
4924 +#include "dlm_internal.h"
4925 +#include "lockqueue.h"
4927 +#include "locking.h"
4929 +#include "lowcomms.h"
4930 +#include "midcomms.h"
4931 +#include "reccomms.h"
4933 +#include "lockspace.h"
4935 +#include "memory.h"
4937 +#include "queries.h"
4940 +static void add_reply_lvb(struct dlm_lkb * lkb, struct dlm_reply *reply);
4941 +static void add_request_lvb(struct dlm_lkb * lkb, struct dlm_request *req);
4944 + * format of an entry on the request queue
4947 + struct list_head rqe_list;
4948 + uint32_t rqe_nodeid;
4949 + char rqe_request[1];
4953 + * Add a new request (if appropriate) to the request queue and send the remote
4954 + * request out. - runs in the context of the locking caller
4956 + * Recovery of a remote_stage request if the remote end fails while the lkb
4957 + * is still on the lockqueue:
4959 + * o lkbs on the lockqueue are flagged with GDLM_LKFLG_LQRESEND in
4960 + * lockqueue_lkb_mark() at the start of recovery.
4962 + * o Some lkb's will be rebuilt on new master rsb's during recovery.
4963 + * (depends on the type of request, see below).
4965 + * o At the end of recovery, resend_cluster_requests() looks at these
4966 + * LQRESEND lkb's and either:
4968 + * i) resends the request to the new master for the rsb where the
4969 + * request is processed as usual. The lkb remains on the lockqueue until
4970 + * the new master replies and we run process_lockqueue_reply().
4972 + * ii) if we've become the rsb master, removes the lkb from the lockqueue
4973 + * and processes the request locally via process_remastered_lkb().
4975 + * GDLM_LQSTATE_WAIT_RSB (1) - these lockqueue lkb's are not on any rsb queue
4976 + * and the request should be resent if dest node is failed.
4978 + * GDLM_LQSTATE_WAIT_CONDGRANT (3) - this lockqueue lkb is on a local rsb's
4979 + * wait queue. Don't rebuild this lkb on a new master rsb (the NOREBUILD flag
4980 + * makes send_lkb_queue() skip it). Resend this request to the new master.
4982 + * GDLM_LQSTATE_WAIT_UNLOCK (4) - this lkb is on a local rsb's queue. It will
4983 + * be rebuilt on the rsb on the new master (restbl_lkb_send/send_lkb_queue).
4984 + * Resend this request to the new master.
4986 + * GDLM_LQSTATE_WAIT_CONVERT (2) - this lkb is on a local rsb convert queue.
4987 + * It will be rebuilt on the new master rsb's granted queue. Resend this
4988 + * request to the new master.
4991 +int remote_stage(struct dlm_lkb *lkb, int state)
4995 + lkb->lkb_lockqueue_state = state;
4996 + add_to_lockqueue(lkb);
4998 + error = send_cluster_request(lkb, state);
5000 + log_error(lkb->lkb_resource->res_ls, "remote_stage error %d %x",
5001 + error, lkb->lkb_id);
5002 + /* Leave on lockqueue, it will be resent to correct node during
5009 + * Requests received while the lockspace is in recovery get added to the
5010 + * request queue and processed when recovery is complete.
5013 +void add_to_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
5015 + struct rq_entry *entry;
5016 + int length = hd->rh_length;
5018 + if (test_bit(LSFL_REQUEST_WARN, &ls->ls_flags))
5019 + log_error(ls, "request during recovery from %u", nodeid);
5021 + if (in_nodes_gone(ls, nodeid))
5024 + entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
5026 + // TODO something better
5027 + printk("dlm: add_to_requestqueue: out of memory\n");
5031 + log_debug(ls, "add_to_requestq cmd %d fr %d", hd->rh_cmd, nodeid);
5032 + entry->rqe_nodeid = nodeid;
5033 + memcpy(entry->rqe_request, hd, length);
5035 + down(&ls->ls_requestqueue_lock);
5036 + list_add_tail(&entry->rqe_list, &ls->ls_requestqueue);
5037 + up(&ls->ls_requestqueue_lock);
5040 +int process_requestqueue(struct dlm_ls *ls)
5042 + int error = 0, count = 0;
5043 + struct rq_entry *entry;
5044 + struct dlm_header *hd;
5046 + log_all(ls, "process held requests");
5048 + down(&ls->ls_requestqueue_lock);
5051 + if (list_empty(&ls->ls_requestqueue)) {
5052 + up(&ls->ls_requestqueue_lock);
5057 + entry = list_entry(ls->ls_requestqueue.next, struct rq_entry,
5059 + up(&ls->ls_requestqueue_lock);
5060 + hd = (struct dlm_header *) entry->rqe_request;
5062 + log_debug(ls, "process_requestq cmd %d fr %u", hd->rh_cmd,
5063 + entry->rqe_nodeid);
5065 + error = process_cluster_request(entry->rqe_nodeid, hd, TRUE);
5066 + if (error == -EINTR) {
5067 + /* entry is left on requestqueue */
5068 + log_debug(ls, "process_requestqueue abort eintr");
5072 + down(&ls->ls_requestqueue_lock);
5073 + list_del(&entry->rqe_list);
5077 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5078 + log_debug(ls, "process_requestqueue abort ls_run");
5079 + up(&ls->ls_requestqueue_lock);
5085 + log_all(ls, "processed %d requests", count);
5089 +void wait_requestqueue(struct dlm_ls *ls)
5092 + down(&ls->ls_requestqueue_lock);
5093 + if (list_empty(&ls->ls_requestqueue))
5095 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
5097 + up(&ls->ls_requestqueue_lock);
5100 + up(&ls->ls_requestqueue_lock);
5104 + * Resdir requests (lookup or remove) and replies from before recovery are
5105 + * invalid since the resdir was rebuilt. Clear them. Requests from nodes now
5106 + * gone are also invalid.
5109 +void purge_requestqueue(struct dlm_ls *ls)
5112 + struct rq_entry *entry, *safe;
5113 + struct dlm_header *hd;
5114 + struct dlm_lkb *lkb;
5116 + log_all(ls, "purge requests");
5118 + down(&ls->ls_requestqueue_lock);
5120 + list_for_each_entry_safe(entry, safe, &ls->ls_requestqueue, rqe_list) {
5121 + hd = (struct dlm_header *) entry->rqe_request;
5123 + if (hd->rh_cmd == GDLM_REMCMD_REM_RESDATA ||
5124 + hd->rh_cmd == GDLM_REMCMD_LOOKUP ||
5125 + in_nodes_gone(ls, entry->rqe_nodeid)) {
5127 + list_del(&entry->rqe_list);
5131 + } else if (hd->rh_cmd == GDLM_REMCMD_LOCKREPLY) {
5134 + * Replies to resdir lookups are invalid and must be
5135 + * purged. The lookup requests are marked in
5136 + * lockqueue_lkb_mark and will be resent in
5137 + * resend_cluster_requests. The only way to check if
5138 + * this is a lookup reply is to look at the
5139 + * lockqueue_state of the lkb.
5142 + lkb = find_lock_by_id(ls, hd->rh_lkid);
5144 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_RSB) {
5145 + list_del(&entry->rqe_list);
5151 + up(&ls->ls_requestqueue_lock);
5153 + log_all(ls, "purged %d requests", count);
5157 + * Check if there's a reply for the given lkid in the requestqueue.
5160 +int reply_in_requestqueue(struct dlm_ls *ls, int lkid)
5163 + struct rq_entry *entry;
5164 + struct dlm_header *hd;
5166 + down(&ls->ls_requestqueue_lock);
5168 + list_for_each_entry(entry, &ls->ls_requestqueue, rqe_list) {
5169 + hd = (struct dlm_header *) entry->rqe_request;
5170 + if (hd->rh_cmd == GDLM_REMCMD_LOCKREPLY && hd->rh_lkid == lkid){
5171 + log_debug(ls, "reply_in_requestq cmd %d fr %d id %x",
5172 + hd->rh_cmd, entry->rqe_nodeid, lkid);
5177 + up(&ls->ls_requestqueue_lock);
5182 +void allocate_and_copy_lvb(struct dlm_ls *ls, char **lvbptr, char *src)
5185 + *lvbptr = allocate_lvb(ls);
5187 + memcpy(*lvbptr, src, DLM_LVB_LEN);
5191 + * Process a lockqueue LKB after it has had its remote processing complete and
5192 + * been pulled from the lockqueue. Runs in the context of the DLM recvd thread
5193 + * on the machine that requested the lock.
5196 +static void process_lockqueue_reply(struct dlm_lkb *lkb,
5197 + struct dlm_reply *reply,
5200 + struct dlm_rsb *rsb = lkb->lkb_resource;
5201 + struct dlm_ls *ls = rsb->res_ls;
5202 + int oldstate, state = lkb->lkb_lockqueue_state;
5205 + remove_from_lockqueue(lkb);
5208 + case GDLM_LQSTATE_WAIT_RSB:
5210 + if (reply->rl_status) {
5211 + DLM_ASSERT(reply->rl_status == -EEXIST,);
5212 + if (rsb->res_nodeid == -1) {
5214 + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
5218 + if (reply->rl_nodeid == our_nodeid()) {
5219 + set_bit(RESFL_MASTER, &rsb->res_flags);
5220 + rsb->res_nodeid = 0;
5222 + clear_bit(RESFL_MASTER, &rsb->res_flags);
5223 + rsb->res_nodeid = reply->rl_nodeid;
5227 + log_debug(ls, "(%d) lu rep %x fr %u %u", lkb->lkb_ownpid,
5228 + lkb->lkb_id, nodeid,
5231 + lkb->lkb_nodeid = rsb->res_nodeid;
5232 + dlm_lock_stage2(ls, lkb, rsb, lkb->lkb_lockqueue_flags);
5235 + case GDLM_LQSTATE_WAIT_CONVERT:
5236 + case GDLM_LQSTATE_WAIT_CONDGRANT:
5239 + * the destination wasn't the master
5240 + * this implies the request was a CONDGRANT
5243 + if (reply->rl_status == -EINVAL) {
5244 + int master_nodeid;
5246 + DLM_ASSERT(state == GDLM_LQSTATE_WAIT_CONDGRANT, );
5248 + log_debug(ls, "(%d) req reply einval %x fr %d r %d %s",
5249 + lkb->lkb_ownpid, lkb->lkb_id, nodeid,
5250 + rsb->res_nodeid, rsb->res_name);
5254 + if (rsb->res_nodeid == lkb->lkb_nodeid || rsb->res_nodeid == -1){
5256 + * We need to re-lookup the master and resend our
5260 + lkb->lkb_nodeid = -1;
5261 + rsb->res_nodeid = -1;
5263 + if (get_directory_nodeid(rsb) != our_nodeid())
5264 + remote_stage(lkb, GDLM_LQSTATE_WAIT_RSB);
5266 + int error = dlm_dir_lookup(ls, our_nodeid(),
5270 + if (error == -EEXIST) {
5271 + /* don't expect this will happen */
5272 + log_all(ls, "EEXIST %x", lkb->lkb_id);
5277 + if (master_nodeid == our_nodeid()) {
5278 + set_bit(RESFL_MASTER, &rsb->res_flags);
5279 + master_nodeid = 0;
5281 + clear_bit(RESFL_MASTER,&rsb->res_flags);
5283 + rsb->res_nodeid = master_nodeid;
5284 + lkb->lkb_nodeid = master_nodeid;
5286 + dlm_lock_stage2(ls, lkb, rsb,
5287 + lkb->lkb_lockqueue_flags);
5291 + * Another request on this rsb has since found
5292 + * the master, we'll use that one although it too
5293 + * may be invalid requiring us to retry again.
5296 + lkb->lkb_nodeid = rsb->res_nodeid;
5297 + dlm_lock_stage2(ls, lkb, rsb,
5298 + lkb->lkb_lockqueue_flags);
5306 + * After a remote lock/conversion/grant request we put the lock
5307 + * on the right queue and send an AST if appropriate. Any lock
5308 + * shuffling (eg newly granted locks because this one was
5309 + * converted downwards) will be dealt with in separate messages
5310 + * (which may be in the same network message)
5313 + if (!lkb->lkb_remid)
5314 + lkb->lkb_remid = reply->rl_lkid;
5317 + * The remote request failed (we assume because of NOQUEUE).
5318 + * If this is a new request (non-conv) the lkb was created just
5319 + * for it so the lkb should be freed. If this was a
5320 + * conversion, the lkb already existed so we should put it back
5321 + * on the grant queue.
5324 + if (reply->rl_status != 0) {
5325 + DLM_ASSERT(reply->rl_status == -EAGAIN,);
5327 + if (state == GDLM_LQSTATE_WAIT_CONDGRANT) {
5328 + res_lkb_dequeue(lkb);
5329 + lkb->lkb_retstatus = reply->rl_status;
5330 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
5332 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5333 + lkb->lkb_retstatus = reply->rl_status;
5334 + queue_ast(lkb, AST_COMP, 0);
5340 + * The remote request was successful in granting the request or
5341 + * queuing it to be granted later. Add the lkb to the
5342 + * appropriate rsb queue.
5345 + switch (reply->rl_lockstate) {
5346 + case GDLM_LKSTS_GRANTED:
5348 + /* Compact version of grant_lock(). */
5350 + down_write(&rsb->res_lock);
5351 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5352 + memcpy(lkb->lkb_lvbptr, reply->rl_lvb,
5355 + lkb->lkb_grmode = lkb->lkb_rqmode;
5356 + lkb->lkb_rqmode = DLM_LOCK_IV;
5357 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5359 + if (lkb->lkb_range) {
5360 + lkb->lkb_range[GR_RANGE_START] =
5361 + lkb->lkb_range[RQ_RANGE_START];
5362 + lkb->lkb_range[GR_RANGE_END] =
5363 + lkb->lkb_range[RQ_RANGE_END];
5365 + up_write(&rsb->res_lock);
5367 + lkb->lkb_retstatus = 0;
5368 + queue_ast(lkb, AST_COMP, 0);
5371 + case GDLM_LKSTS_WAITING:
5373 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
5374 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_WAITING);
5376 + log_error(ls, "wait reply for granted %x %u",
5377 + lkb->lkb_id, lkb->lkb_nodeid);
5380 + case GDLM_LKSTS_CONVERT:
5382 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
5383 + res_lkb_swqueue(rsb, lkb, GDLM_LKSTS_CONVERT);
5385 + log_error(ls, "convert reply for granted %x %u",
5386 + lkb->lkb_id, lkb->lkb_nodeid);
5390 + log_error(ls, "process_lockqueue_reply state %d",
5391 + reply->rl_lockstate);
5396 + case GDLM_LQSTATE_WAIT_UNLOCK:
5399 + * Unlocks should never fail. Update local lock info. This
5400 + * always sends completion AST with status in lksb
5403 + DLM_ASSERT(reply->rl_status == 0,);
5404 + oldstate = res_lkb_dequeue(lkb);
5406 + /* Differentiate between unlocks and conversion cancellations */
5407 + if (lkb->lkb_lockqueue_flags & DLM_LKF_CANCEL) {
5408 + if (oldstate == GDLM_LKSTS_CONVERT) {
5409 + res_lkb_enqueue(lkb->lkb_resource, lkb,
5410 + GDLM_LKSTS_GRANTED);
5411 + lkb->lkb_retstatus = -DLM_ECANCEL;
5412 + queue_ast(lkb, AST_COMP, 0);
5414 + log_error(ls, "cancel state %d", oldstate);
5416 + DLM_ASSERT(oldstate == GDLM_LKSTS_GRANTED,
5419 + lkb->lkb_retstatus = -DLM_EUNLOCK;
5420 + queue_ast(lkb, AST_COMP | AST_DEL, 0);
5425 + log_error(ls, "process_lockqueue_reply id %x state %d",
5426 + lkb->lkb_id, state);
5431 + * Tell a remote node to grant a lock. This happens when we are the master
5432 + * copy for a lock that is actually held on a remote node. The remote end is
5433 + * also responsible for sending the completion AST.
5436 +void remote_grant(struct dlm_lkb *lkb)
5438 + struct writequeue_entry *e;
5439 + struct dlm_request *req;
5441 + // TODO Error handling
5442 + e = lowcomms_get_buffer(lkb->lkb_nodeid,
5443 + sizeof(struct dlm_request),
5444 + lkb->lkb_resource->res_ls->ls_allocation,
5449 + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKGRANT;
5450 + req->rr_header.rh_length = sizeof(struct dlm_request);
5451 + req->rr_header.rh_flags = 0;
5452 + req->rr_header.rh_lkid = lkb->lkb_id;
5453 + req->rr_header.rh_lockspace = lkb->lkb_resource->res_ls->ls_global_id;
5454 + req->rr_remlkid = lkb->lkb_remid;
5455 + req->rr_flags = 0;
5457 + if (lkb->lkb_flags & GDLM_LKFLG_DEMOTED) {
5458 + /* This is a confusing non-standard use of rr_flags which is
5459 + * usually used to pass lockqueue_flags. */
5460 + req->rr_flags |= GDLM_LKFLG_DEMOTED;
5463 + add_request_lvb(lkb, req);
5464 + midcomms_send_buffer(&req->rr_header, e);
5467 +void reply_and_grant(struct dlm_lkb *lkb)
5469 + struct dlm_request *req = lkb->lkb_request;
5470 + struct dlm_reply *reply;
5471 + struct writequeue_entry *e;
5473 + // TODO Error handling
5474 + e = lowcomms_get_buffer(lkb->lkb_nodeid,
5475 + sizeof(struct dlm_reply),
5476 + lkb->lkb_resource->res_ls->ls_allocation,
5477 + (char **) &reply);
5481 + reply->rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
5482 + reply->rl_header.rh_flags = 0;
5483 + reply->rl_header.rh_length = sizeof(struct dlm_reply);
5484 + reply->rl_header.rh_lkid = req->rr_header.rh_lkid;
5485 + reply->rl_header.rh_lockspace = req->rr_header.rh_lockspace;
5487 + reply->rl_status = lkb->lkb_retstatus;
5488 + reply->rl_lockstate = lkb->lkb_status;
5489 + reply->rl_lkid = lkb->lkb_id;
5491 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_DEMOTED),);
5493 + lkb->lkb_request = NULL;
5495 + add_reply_lvb(lkb, reply);
5496 + midcomms_send_buffer(&reply->rl_header, e);
5500 + * Request removal of a dead entry in the resource directory
5503 +void remote_remove_direntry(struct dlm_ls *ls, int nodeid, char *name,
5506 + struct writequeue_entry *e;
5507 + struct dlm_request *req;
5509 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5510 + struct dlm_rcom *rc = allocate_rcom_buffer(ls);
5512 + memcpy(rc->rc_buf, name, namelen);
5513 + rc->rc_datalen = namelen;
5515 + rcom_send_message(ls, nodeid, RECCOMM_REMRESDATA, rc, 0);
5517 + free_rcom_buffer(rc);
5520 + // TODO Error handling
5521 + e = lowcomms_get_buffer(nodeid,
5522 + sizeof(struct dlm_request) + namelen - 1,
5523 + ls->ls_allocation, (char **) &req);
5527 + memset(req, 0, sizeof(struct dlm_request) + namelen - 1);
5528 + req->rr_header.rh_cmd = GDLM_REMCMD_REM_RESDATA;
5529 + req->rr_header.rh_length =
5530 + sizeof(struct dlm_request) + namelen - 1;
5531 + req->rr_header.rh_flags = 0;
5532 + req->rr_header.rh_lkid = 0;
5533 + req->rr_header.rh_lockspace = ls->ls_global_id;
5534 + req->rr_remlkid = 0;
5535 + memcpy(req->rr_name, name, namelen);
5537 + midcomms_send_buffer(&req->rr_header, e);
5541 + * Send remote cluster request to directory or master node before the request
5542 + * is put on the lock queue. Runs in the context of the locking caller.
5545 +int send_cluster_request(struct dlm_lkb *lkb, int state)
5547 + uint32_t target_nodeid;
5548 + struct dlm_rsb *rsb = lkb->lkb_resource;
5549 + struct dlm_ls *ls = rsb->res_ls;
5550 + struct dlm_request *req;
5551 + struct writequeue_entry *e;
5553 + if (state == GDLM_LQSTATE_WAIT_RSB)
5554 + target_nodeid = get_directory_nodeid(rsb);
5556 + target_nodeid = lkb->lkb_nodeid;
5558 + /* during recovery it's valid for target_nodeid to equal our own;
5559 + resend_cluster_requests does this to get requests back on track */
5561 + DLM_ASSERT(target_nodeid && target_nodeid != -1,
5564 + printk("target_nodeid %u\n", target_nodeid););
5566 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
5567 + /* this may happen when called by resend_cluster_request */
5568 + log_error(ls, "send_cluster_request to %u state %d recovery",
5569 + target_nodeid, state);
5572 + e = lowcomms_get_buffer(target_nodeid,
5573 + sizeof(struct dlm_request) +
5574 + rsb->res_length - 1, ls->ls_allocation,
5578 + memset(req, 0, sizeof(struct dlm_request) + rsb->res_length - 1);
5580 + /* Common stuff, some are just defaults */
5582 + if (lkb->lkb_bastaddr)
5583 + req->rr_asts = AST_BAST;
5584 + if (lkb->lkb_astaddr)
5585 + req->rr_asts |= AST_COMP;
5586 + if (lkb->lkb_parent)
5587 + req->rr_remparid = lkb->lkb_parent->lkb_remid;
5589 + req->rr_flags = lkb->lkb_lockqueue_flags;
5590 + req->rr_rqmode = lkb->lkb_rqmode;
5591 + req->rr_remlkid = lkb->lkb_remid;
5592 + req->rr_pid = lkb->lkb_ownpid;
5593 + req->rr_header.rh_length =
5594 + sizeof(struct dlm_request) + rsb->res_length - 1;
5595 + req->rr_header.rh_flags = 0;
5596 + req->rr_header.rh_lkid = lkb->lkb_id;
5597 + req->rr_header.rh_lockspace = ls->ls_global_id;
5601 + case GDLM_LQSTATE_WAIT_RSB:
5603 + DLM_ASSERT(!lkb->lkb_parent,
5607 + log_debug(ls, "(%d) send lu %x to %u",
5608 + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
5610 + req->rr_header.rh_cmd = GDLM_REMCMD_LOOKUP;
5611 + memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5614 + case GDLM_LQSTATE_WAIT_CONVERT:
5616 + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid,
5620 + log_debug(ls, "(%d) send cv %x to %u",
5621 + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
5623 + req->rr_header.rh_cmd = GDLM_REMCMD_CONVREQUEST;
5624 + if (lkb->lkb_range) {
5625 + req->rr_flags |= GDLM_LKFLG_RANGE;
5626 + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5627 + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5631 + case GDLM_LQSTATE_WAIT_CONDGRANT:
5633 + DLM_ASSERT(lkb->lkb_nodeid == rsb->res_nodeid,
5637 + log_debug(ls, "(%d) send rq %x to %u",
5638 + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
5640 + req->rr_header.rh_cmd = GDLM_REMCMD_LOCKREQUEST;
5641 + memcpy(req->rr_name, rsb->res_name, rsb->res_length);
5642 + if (lkb->lkb_range) {
5643 + req->rr_flags |= GDLM_LKFLG_RANGE;
5644 + req->rr_range_start = lkb->lkb_range[RQ_RANGE_START];
5645 + req->rr_range_end = lkb->lkb_range[RQ_RANGE_END];
5649 + case GDLM_LQSTATE_WAIT_UNLOCK:
5651 + log_debug(ls, "(%d) send un %x to %u",
5652 + lkb->lkb_ownpid, lkb->lkb_id, target_nodeid);
5654 + req->rr_header.rh_cmd = GDLM_REMCMD_UNLOCKREQUEST;
5658 + DLM_ASSERT(0, printk("Unknown cluster request\n"););
5661 + add_request_lvb(lkb, req);
5662 + midcomms_send_buffer(&req->rr_header, e);
5668 + * We got a request from another cluster node, process it and return an info
5669 + * structure with the lock state/LVB etc as required. Executes in the DLM's
5673 +int process_cluster_request(int nodeid, struct dlm_header *req, int recovery)
5675 + struct dlm_ls *lspace;
5676 + struct dlm_lkb *lkb = NULL;
5677 + struct dlm_rsb *rsb;
5678 + int send_reply = 0, status = 0, namelen;
5679 + struct dlm_request *freq = (struct dlm_request *) req;
5680 + struct dlm_reply *rp = (struct dlm_reply *) req;
5681 + struct dlm_reply reply;
5683 + lspace = find_lockspace_by_global_id(req->rh_lockspace);
5686 + log_print("process_cluster_request invalid lockspace %x "
5687 + "from %d req %u", req->rh_lockspace, nodeid,
5692 + /* wait for recoverd to drain requestqueue */
5694 + wait_requestqueue(lspace);
5697 + * If we're in recovery then queue the request for later. Otherwise,
5698 + * we still need to get the "in_recovery" lock to make sure the
5699 + * recovery itself doesn't start until we are done.
5702 + if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) {
5704 + add_to_requestqueue(lspace, nodeid, req);
5708 + if (!down_read_trylock(&lspace->ls_in_recovery)) {
5715 + * Process the request.
5718 + switch (req->rh_cmd) {
5720 + case GDLM_REMCMD_LOOKUP:
5722 + uint32_t dir_nodeid, r_nodeid;
5725 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5727 + dir_nodeid = name_to_directory_nodeid(lspace,
5730 + if (dir_nodeid != our_nodeid())
5731 + log_debug(lspace, "ignoring directory lookup");
5733 + status = dlm_dir_lookup(lspace, nodeid, freq->rr_name,
5734 + namelen, &r_nodeid);
5735 + reply.rl_status = status;
5736 + reply.rl_lockstate = 0;
5737 + reply.rl_nodeid = r_nodeid;
5742 + case GDLM_REMCMD_REM_RESDATA:
5744 + namelen = freq->rr_header.rh_length - sizeof(*freq) + 1;
5745 + dlm_dir_remove(lspace, nodeid, freq->rr_name, namelen);
5748 + case GDLM_REMCMD_LOCKREQUEST:
5750 + lkb = remote_stage2(nodeid, lspace, freq);
5752 + lkb->lkb_request = freq;
5753 + lkb->lkb_ownpid = freq->rr_pid;
5754 + if (lkb->lkb_retstatus != -EINVAL)
5755 + dlm_lock_stage3(lkb);
5758 + * If the request was granted in lock_stage3, then a
5759 + * reply message was already sent in combination with
5760 + * the grant message and lkb_request is NULL.
5763 + if (lkb->lkb_request) {
5764 + lkb->lkb_request = NULL;
5766 + reply.rl_status = lkb->lkb_retstatus;
5767 + reply.rl_lockstate = lkb->lkb_status;
5768 + reply.rl_lkid = lkb->lkb_id;
5771 + * If the request could not be granted and the
5772 + * user won't wait, then free up the LKB
5775 + if (lkb->lkb_retstatus == -EAGAIN) {
5776 + rsb = lkb->lkb_resource;
5777 + release_lkb(lspace, lkb);
5781 + else if (lkb->lkb_retstatus == -EINVAL) {
5782 + release_lkb(lspace, lkb);
5787 + reply.rl_status = -ENOMEM;
5792 + case GDLM_REMCMD_CONVREQUEST:
5794 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5798 + print_request(freq);
5799 + printk("nodeid %u\n", nodeid););
5801 + rsb = lkb->lkb_resource;
5805 + print_request(freq);
5806 + printk("nodeid %u\n", nodeid););
5808 + DLM_ASSERT(!rsb->res_nodeid,
5811 + print_request(freq);
5812 + printk("nodeid %u\n", nodeid););
5814 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,
5817 + print_request(freq);
5818 + printk("nodeid %u\n", nodeid););
5820 + DLM_ASSERT(lkb->lkb_status == GDLM_LKSTS_GRANTED,
5823 + print_request(freq);
5824 + printk("nodeid %u\n", nodeid););
5826 + /* Update orphan lock status */
5827 + if (freq->rr_flags & DLM_LKF_ORPHAN) {
5828 + lkb->lkb_flags |= GDLM_LKFLG_ORPHAN;
5831 + lkb->lkb_rqmode = freq->rr_rqmode;
5832 + lkb->lkb_lockqueue_flags = freq->rr_flags;
5833 + lkb->lkb_request = freq;
5834 + lkb->lkb_flags &= ~GDLM_LKFLG_DEMOTED;
5836 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK ||
5837 + freq->rr_flags & DLM_LKF_VALBLK) {
5838 + lkb->lkb_flags |= GDLM_LKFLG_VALBLK;
5839 + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr,
5843 + if (freq->rr_flags & GDLM_LKFLG_RANGE) {
5844 + if (lkb_set_range(lspace, lkb, freq->rr_range_start,
5845 + freq->rr_range_end)) {
5846 + reply.rl_status = -ENOMEM;
5852 + log_debug(lspace, "(%d) cv %u from %u %x \"%s\"",
5853 + lkb->lkb_ownpid, lkb->lkb_rqmode, nodeid,
5854 + lkb->lkb_id, rsb->res_name);
5856 + dlm_convert_stage2(lkb, FALSE);
5859 + * If the conv request was granted in stage2, then a reply
5860 + * message was already sent in combination with the grant
5864 + if (lkb->lkb_request) {
5865 + lkb->lkb_request = NULL;
5867 + reply.rl_status = lkb->lkb_retstatus;
5868 + reply.rl_lockstate = lkb->lkb_status;
5869 + reply.rl_lkid = lkb->lkb_id;
5873 + case GDLM_REMCMD_LOCKREPLY:
5875 + lkb = find_lock_by_id(lspace, req->rh_lkid);
5879 + printk("nodeid %u\n", nodeid););
5881 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY),
5884 + printk("nodeid %u\n", nodeid););
5886 + process_lockqueue_reply(lkb, rp, nodeid);
5889 + case GDLM_REMCMD_LOCKGRANT:
5892 + * Remote lock has been granted asynchronously. Do a compact
5893 + * version of what grant_lock() does.
5896 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5899 + print_request(freq);
5900 + printk("nodeid %u\n", nodeid););
5902 + rsb = lkb->lkb_resource;
5906 + print_request(freq);
5907 + printk("nodeid %u\n", nodeid););
5909 + DLM_ASSERT(rsb->res_nodeid,
5912 + print_request(freq);
5913 + printk("nodeid %u\n", nodeid););
5915 + DLM_ASSERT(!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY),
5918 + print_request(freq);
5919 + printk("nodeid %u\n", nodeid););
5921 + if (lkb->lkb_lockqueue_state) {
5922 + log_debug(rsb->res_ls, "grant lock on lockqueue %d",
5923 + lkb->lkb_lockqueue_state);
5925 + /* Don't grant locks that are waiting for an unlock */
5926 + if (lkb->lkb_lockqueue_state == GDLM_LQSTATE_WAIT_UNLOCK)
5930 + print_request(freq);
5931 + remove_from_lockqueue(lkb);
5932 + if (!lkb->lkb_remid)
5933 + lkb->lkb_remid = req->rh_lkid;
5936 + down_write(&rsb->res_lock);
5938 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
5939 + allocate_and_copy_lvb(lspace, &lkb->lkb_lvbptr, freq->rr_lvb);
5941 + lkb->lkb_grmode = lkb->lkb_rqmode;
5942 + lkb->lkb_rqmode = DLM_LOCK_IV;
5944 + if (lkb->lkb_range) {
5945 + lkb->lkb_range[GR_RANGE_START] =
5946 + lkb->lkb_range[RQ_RANGE_START];
5947 + lkb->lkb_range[GR_RANGE_END] =
5948 + lkb->lkb_range[RQ_RANGE_END];
5951 + lkb_swqueue(rsb, lkb, GDLM_LKSTS_GRANTED);
5952 + up_write(&rsb->res_lock);
5954 + if (freq->rr_flags & GDLM_LKFLG_DEMOTED)
5955 + lkb->lkb_flags |= GDLM_LKFLG_DEMOTED;
5957 + lkb->lkb_retstatus = 0;
5958 + queue_ast(lkb, AST_COMP, 0);
5961 + case GDLM_REMCMD_SENDBAST:
5963 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5966 + print_request(freq);
5967 + printk("nodeid %u\n", nodeid););
5969 + if (lkb->lkb_status == GDLM_LKSTS_GRANTED)
5970 + queue_ast(lkb, AST_BAST, freq->rr_rqmode);
5973 + case GDLM_REMCMD_SENDCAST:
5975 + /* This is only used for some error completion ASTs */
5977 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5980 + print_request(freq);
5981 + printk("nodeid %u\n", nodeid););
5983 + /* Return the lock to granted status */
5984 + res_lkb_swqueue(lkb->lkb_resource, lkb, GDLM_LKSTS_GRANTED);
5985 + lkb->lkb_retstatus = freq->rr_status;
5986 + queue_ast(lkb, AST_COMP, 0);
5989 + case GDLM_REMCMD_UNLOCKREQUEST:
5991 + lkb = find_lock_by_id(lspace, freq->rr_remlkid);
5994 + print_request(freq);
5995 + printk("nodeid %u\n", nodeid););
5997 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,
5999 + print_request(freq);
6000 + printk("nodeid %u\n", nodeid););
6002 + DLM_ASSERT(lkb->lkb_nodeid == nodeid,
6004 + print_request(freq);
6005 + printk("nodeid %u\n", nodeid););
6007 + rsb = find_rsb_to_unlock(lspace, lkb);
6009 + log_debug(lspace, "(%d) un from %u %x \"%s\"", lkb->lkb_ownpid,
6010 + nodeid, lkb->lkb_id, rsb->res_name);
6012 + reply.rl_status = dlm_unlock_stage2(lkb, rsb, freq->rr_flags);
6016 + case GDLM_REMCMD_QUERY:
6017 + remote_query(nodeid, lspace, req);
6020 + case GDLM_REMCMD_QUERYREPLY:
6021 + remote_query_reply(nodeid, lspace, req);
6025 + log_error(lspace, "process_cluster_request cmd %d",req->rh_cmd);
6028 + up_read(&lspace->ls_in_recovery);
6032 + reply.rl_header.rh_cmd = GDLM_REMCMD_LOCKREPLY;
6033 + reply.rl_header.rh_flags = 0;
6034 + reply.rl_header.rh_length = sizeof(reply);
6035 + reply.rl_header.rh_lkid = freq->rr_header.rh_lkid;
6036 + reply.rl_header.rh_lockspace = freq->rr_header.rh_lockspace;
6038 + status = midcomms_send_message(nodeid, &reply.rl_header,
6043 + put_lockspace(lspace);
6047 +static void add_reply_lvb(struct dlm_lkb *lkb, struct dlm_reply *reply)
6049 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
6050 + memcpy(reply->rl_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
6053 +static void add_request_lvb(struct dlm_lkb *lkb, struct dlm_request *req)
6055 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK)
6056 + memcpy(req->rr_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
6058 diff -urN linux-orig/cluster/dlm/lockqueue.h linux-patched/cluster/dlm/lockqueue.h
6059 --- linux-orig/cluster/dlm/lockqueue.h 1970-01-01 07:30:00.000000000 +0730
6060 +++ linux-patched/cluster/dlm/lockqueue.h 2004-11-03 11:31:56.000000000 +0800
6062 +/******************************************************************************
6063 +*******************************************************************************
6065 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6066 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6068 +** This copyrighted material is made available to anyone wishing to use,
6069 +** modify, copy, or redistribute it subject to the terms and conditions
6070 +** of the GNU General Public License v.2.
6072 +*******************************************************************************
6073 +******************************************************************************/
6075 +#ifndef __LOCKQUEUE_DOT_H__
6076 +#define __LOCKQUEUE_DOT_H__
6078 +void remote_grant(struct dlm_lkb * lkb);
6079 +void reply_and_grant(struct dlm_lkb * lkb);
6080 +int remote_stage(struct dlm_lkb * lkb, int state);
6081 +int process_cluster_request(int csid, struct dlm_header *req, int recovery);
6082 +int send_cluster_request(struct dlm_lkb * lkb, int state);
6083 +void purge_requestqueue(struct dlm_ls * ls);
6084 +int process_requestqueue(struct dlm_ls * ls);
6085 +int reply_in_requestqueue(struct dlm_ls * ls, int lkid);
6086 +void remote_remove_direntry(struct dlm_ls * ls, int nodeid, char *name,
6088 +void allocate_and_copy_lvb(struct dlm_ls * ls, char **lvbptr, char *src);
6090 +#endif /* __LOCKQUEUE_DOT_H__ */
6091 diff -urN linux-orig/cluster/dlm/lockspace.c linux-patched/cluster/dlm/lockspace.c
6092 --- linux-orig/cluster/dlm/lockspace.c 1970-01-01 07:30:00.000000000 +0730
6093 +++ linux-patched/cluster/dlm/lockspace.c 2004-11-03 11:31:56.000000000 +0800
6095 +/******************************************************************************
6096 +*******************************************************************************
6098 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6099 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6101 +** This copyrighted material is made available to anyone wishing to use,
6102 +** modify, copy, or redistribute it subject to the terms and conditions
6103 +** of the GNU General Public License v.2.
6105 +*******************************************************************************
6106 +******************************************************************************/
6108 +#include <linux/module.h>
6110 +#include "dlm_internal.h"
6111 +#include "recoverd.h"
6116 +#include "lowcomms.h"
6117 +#include "config.h"
6118 +#include "memory.h"
6119 +#include "lockspace.h"
6120 +#include "device.h"
6122 +#define GDST_NONE (0)
6123 +#define GDST_RUNNING (1)
6125 +static int dlmstate;
6126 +static int dlmcount;
6127 +static struct semaphore dlmstate_lock;
6128 +struct list_head lslist;
6129 +spinlock_t lslist_lock;
6130 +struct kcl_service_ops ls_ops;
6132 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags);
6135 +void dlm_lockspace_init(void)
6137 + dlmstate = GDST_NONE;
6139 + init_MUTEX(&dlmstate_lock);
6140 + INIT_LIST_HEAD(&lslist);
6141 + spin_lock_init(&lslist_lock);
6144 +struct dlm_ls *find_lockspace_by_name(char *name, int namelen)
6146 + struct dlm_ls *ls;
6148 + spin_lock(&lslist_lock);
6150 + list_for_each_entry(ls, &lslist, ls_list) {
6151 + if (ls->ls_namelen == namelen &&
6152 + memcmp(ls->ls_name, name, namelen) == 0)
6157 + spin_unlock(&lslist_lock);
6161 +struct dlm_ls *find_lockspace_by_global_id(uint32_t id)
6163 + struct dlm_ls *ls;
6165 + spin_lock(&lslist_lock);
6167 + list_for_each_entry(ls, &lslist, ls_list) {
6168 + if (ls->ls_global_id == id) {
6175 + spin_unlock(&lslist_lock);
6179 +struct dlm_ls *find_lockspace_by_local_id(void *id)
6181 + struct dlm_ls *ls;
6183 + spin_lock(&lslist_lock);
6185 + list_for_each_entry(ls, &lslist, ls_list) {
6186 + if (ls->ls_local_id == (uint32_t)(long)id) {
6193 + spin_unlock(&lslist_lock);
6197 +/* must be called with lslist_lock held */
6198 +void hold_lockspace(struct dlm_ls *ls)
6203 +void put_lockspace(struct dlm_ls *ls)
6205 + spin_lock(&lslist_lock);
6207 + spin_unlock(&lslist_lock);
6210 +static void remove_lockspace(struct dlm_ls *ls)
6213 + spin_lock(&lslist_lock);
6214 + if (ls->ls_count == 0) {
6215 + list_del(&ls->ls_list);
6216 + spin_unlock(&lslist_lock);
6219 + spin_unlock(&lslist_lock);
6220 + set_current_state(TASK_INTERRUPTIBLE);
6221 + schedule_timeout(HZ);
6226 + * Called from dlm_init. These are the general threads which are not
6227 + * lockspace-specific and work for all dlm lockspaces.
6230 +static int threads_start(void)
6234 + /* Thread which process lock requests for all ls's */
6235 + error = astd_start();
6237 + log_print("cannot start ast thread %d", error);
6241 + /* Thread for sending/receiving messages for all ls's */
6242 + error = lowcomms_start();
6244 + log_print("cannot start lowcomms %d", error);
6257 +static void threads_stop(void)
6263 +static int init_internal(void)
6267 + if (dlmstate == GDST_RUNNING)
6270 + error = threads_start();
6274 + dlmstate = GDST_RUNNING;
6283 + * Called after dlm module is loaded and before any lockspaces are created.
6284 + * Starts and initializes global threads and structures. These global entities
6285 + * are shared by and independent of all lockspaces.
6287 + * There should be a dlm-specific user command which a person can run which
6288 + * calls this function. If a user hasn't run that command and something
6289 + * creates a new lockspace, this is called first.
6291 + * This also starts the default lockspace.
6298 + down(&dlmstate_lock);
6299 + error = init_internal();
6300 + up(&dlmstate_lock);
6305 +int dlm_release(void)
6309 + down(&dlmstate_lock);
6311 + if (dlmstate == GDST_NONE)
6320 + spin_lock(&lslist_lock);
6321 + if (!list_empty(&lslist)) {
6322 + spin_unlock(&lslist_lock);
6323 + log_print("cannot stop threads, lockspaces still exist");
6326 + spin_unlock(&lslist_lock);
6329 + dlmstate = GDST_NONE;
6332 + up(&dlmstate_lock);
6337 +struct dlm_ls *allocate_ls(int namelen)
6339 + struct dlm_ls *ls;
6341 + ls = kmalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
6343 + memset(ls, 0, sizeof(struct dlm_ls) + namelen);
6348 +static int new_lockspace(char *name, int namelen, void **lockspace, int flags)
6350 + struct dlm_ls *ls;
6351 + int i, size, error = -ENOMEM;
6352 + uint32_t local_id = 0;
6354 + if (!try_module_get(THIS_MODULE))
6357 + if (namelen > MAX_SERVICE_NAME_LEN)
6360 + ls = find_lockspace_by_name(name, namelen);
6362 + *lockspace = (void *)(long) ls->ls_local_id;
6367 + * Initialize ls fields
6370 + ls = allocate_ls(namelen);
6374 + memcpy(ls->ls_name, name, namelen);
6375 + ls->ls_namelen = namelen;
6377 + ls->ls_allocation = GFP_KERNEL;
6381 + size = dlm_config.rsbtbl_size;
6382 + ls->ls_rsbtbl_size = size;
6384 + ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
6385 + if (!ls->ls_rsbtbl)
6387 + for (i = 0; i < size; i++) {
6388 + INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
6389 + rwlock_init(&ls->ls_rsbtbl[i].lock);
6392 + size = dlm_config.lkbtbl_size;
6393 + ls->ls_lkbtbl_size = size;
6395 + ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
6396 + if (!ls->ls_lkbtbl)
6398 + for (i = 0; i < size; i++) {
6399 + INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
6400 + rwlock_init(&ls->ls_lkbtbl[i].lock);
6401 + ls->ls_lkbtbl[i].counter = 1;
6404 + size = dlm_config.dirtbl_size;
6405 + ls->ls_dirtbl_size = size;
6407 + ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
6408 + if (!ls->ls_dirtbl)
6410 + for (i = 0; i < size; i++) {
6411 + INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
6412 + rwlock_init(&ls->ls_dirtbl[i].lock);
6415 + INIT_LIST_HEAD(&ls->ls_nodes);
6416 + INIT_LIST_HEAD(&ls->ls_nodes_gone);
6417 + ls->ls_num_nodes = 0;
6418 + ls->ls_node_array = NULL;
6419 + ls->ls_recoverd_task = NULL;
6420 + init_MUTEX(&ls->ls_recoverd_lock);
6421 + INIT_LIST_HEAD(&ls->ls_recover);
6422 + spin_lock_init(&ls->ls_recover_lock);
6423 + INIT_LIST_HEAD(&ls->ls_recover_list);
6424 + ls->ls_recover_list_count = 0;
6425 + spin_lock_init(&ls->ls_recover_list_lock);
6426 + init_waitqueue_head(&ls->ls_wait_general);
6427 + INIT_LIST_HEAD(&ls->ls_rootres);
6428 + INIT_LIST_HEAD(&ls->ls_requestqueue);
6429 + INIT_LIST_HEAD(&ls->ls_rebuild_rootrsb_list);
6430 + ls->ls_last_stop = 0;
6431 + ls->ls_last_start = 0;
6432 + ls->ls_last_finish = 0;
6433 + ls->ls_rcom_msgid = 0;
6434 + init_MUTEX(&ls->ls_requestqueue_lock);
6435 + init_MUTEX(&ls->ls_rcom_lock);
6436 + init_rwsem(&ls->ls_unlock_sem);
6437 + init_rwsem(&ls->ls_root_lock);
6438 + init_rwsem(&ls->ls_in_recovery);
6440 + down_write(&ls->ls_in_recovery);
6442 + if (flags & DLM_LSF_NOTIMERS)
6443 + set_bit(LSFL_NOTIMERS, &ls->ls_flags);
6447 + * Connect this lockspace with the cluster manager
6450 + error = kcl_register_service(name, namelen, SERVICE_LEVEL_GDLM,
6451 + &ls_ops, TRUE, (void *) ls, &local_id);
6453 + goto out_recoverd;
6455 + ls->ls_state = LSST_INIT;
6456 + ls->ls_local_id = local_id;
6458 + spin_lock(&lslist_lock);
6459 + list_add(&ls->ls_list, &lslist);
6460 + spin_unlock(&lslist_lock);
6462 + error = kcl_join_service(local_id);
6464 + log_error(ls, "service manager join error %d", error);
6468 + /* The ls isn't actually running until it receives a start() from CMAN.
6469 + Neither does it have a global ls id until started. */
6471 + /* Return the local ID as the lockspace handle. I've left this
6472 + cast to a void* as it allows us to replace it with pretty much
6473 + anything at a future date without breaking clients. But returning
6474 + the address of the lockspace is a bad idea as it could get
6475 + forcibly removed, leaving client with a dangling pointer */
6477 + *lockspace = (void *)(long) local_id;
6481 + kcl_unregister_service(ls->ls_local_id);
6483 + dlm_recoverd_stop(ls);
6484 + kfree(ls->ls_dirtbl);
6486 + kfree(ls->ls_lkbtbl);
6488 + kfree(ls->ls_rsbtbl);
6496 + * Called by a system like GFS which wants independent lock spaces.
6499 +int dlm_new_lockspace(char *name, int namelen, void **lockspace, int flags)
6501 + int error = -ENOSYS;
6503 + down(&dlmstate_lock);
6504 + error = init_internal();
6508 + error = new_lockspace(name, namelen, lockspace, flags);
6510 + up(&dlmstate_lock);
6514 +/* Return 1 if the lockspace still has active remote locks,
6515 + * 2 if the lockspace still has active local locks.
6517 +static int lockspace_busy(struct dlm_ls *ls)
6519 + int i, lkb_found = 0;
6520 + struct dlm_lkb *lkb;
6522 + /* NOTE: We check the lockidtbl here rather than the resource table.
6523 + This is because there may be LKBs queued as ASTs that have been
6524 + unlinked from their RSBs and are pending deletion once the AST has
6527 + for (i = 0; i < ls->ls_lkbtbl_size; i++) {
6528 + read_lock(&ls->ls_lkbtbl[i].lock);
6529 + if (!list_empty(&ls->ls_lkbtbl[i].list)) {
6531 + list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
6533 + if (!lkb->lkb_nodeid) {
6534 + read_unlock(&ls->ls_lkbtbl[i].lock);
6539 + read_unlock(&ls->ls_lkbtbl[i].lock);
6544 +static int release_lockspace(struct dlm_ls *ls, int force)
6546 + struct dlm_lkb *lkb;
6547 + struct dlm_rsb *rsb;
6548 + struct dlm_recover *rv;
6549 + struct list_head *head;
6551 + int busy = lockspace_busy(ls);
6553 + /* Don't destroy a busy lockspace */
6558 + kcl_leave_service(ls->ls_local_id);
6559 + kcl_unregister_service(ls->ls_local_id);
6562 + dlm_recoverd_stop(ls);
6564 + remove_lockspace(ls);
6567 + * Free direntry structs.
6570 + dlm_dir_clear(ls);
6571 + kfree(ls->ls_dirtbl);
6574 + * Free all lkb's on lkbtbl[] lists.
6577 + for (i = 0; i < ls->ls_lkbtbl_size; i++) {
6578 + head = &ls->ls_lkbtbl[i].list;
6579 + while (!list_empty(head)) {
6580 + lkb = list_entry(head->next, struct dlm_lkb,
6582 + list_del(&lkb->lkb_idtbl_list);
6584 + if (lkb->lkb_lockqueue_state)
6585 + remove_from_lockqueue(lkb);
6587 + if (lkb->lkb_astflags & (AST_COMP | AST_BAST))
6588 + list_del(&lkb->lkb_astqueue);
6590 + if (lkb->lkb_lvbptr && lkb->lkb_flags & GDLM_LKFLG_MSTCPY)
6591 + free_lvb(lkb->lkb_lvbptr);
6597 + kfree(ls->ls_lkbtbl);
6600 + * Free all rsb's on rsbtbl[] lists
6603 + for (i = 0; i < ls->ls_rsbtbl_size; i++) {
6604 + head = &ls->ls_rsbtbl[i].list;
6605 + while (!list_empty(head)) {
6606 + rsb = list_entry(head->next, struct dlm_rsb,
6608 + list_del(&rsb->res_hashchain);
6610 + if (rsb->res_lvbptr)
6611 + free_lvb(rsb->res_lvbptr);
6617 + kfree(ls->ls_rsbtbl);
6620 + * Free structures on any other lists
6623 + head = &ls->ls_recover;
6624 + while (!list_empty(head)) {
6625 + rv = list_entry(head->next, struct dlm_recover, list);
6626 + list_del(&rv->list);
6630 + clear_free_de(ls);
6632 + ls_nodes_clear(ls);
6633 + ls_nodes_gone_clear(ls);
6634 + if (ls->ls_node_array)
6635 + kfree(ls->ls_node_array);
6639 + module_put(THIS_MODULE);
6645 + * Called when a system has released all its locks and is not going to use the
6646 + * lockspace any longer. We blindly free everything we're managing for this
6647 + * lockspace. Remaining nodes will go through the recovery process as if we'd
6648 + * died. The lockspace must continue to function as usual, participating in
6649 + * recoveries, until kcl_leave_service returns.
6651 + * Force has 4 possible values:
6652 + * 0 - don't destroy lockspace if it has any LKBs
6653 + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
6654 + * 2 - destroy lockspace regardless of LKBs
6655 + * 3 - destroy lockspace as part of a forced shutdown
6658 +int dlm_release_lockspace(void *lockspace, int force)
6660 + struct dlm_ls *ls;
6662 + ls = find_lockspace_by_local_id(lockspace);
6665 + put_lockspace(ls);
6666 + return release_lockspace(ls, force);
6670 +/* Called when the cluster is being shut down dirtily */
6671 +void dlm_emergency_shutdown()
6673 + struct dlm_ls *ls;
6674 + struct dlm_ls *tmp;
6676 + /* Shut lowcomms down to prevent any socket activity */
6677 + lowcomms_stop_accept();
6679 + /* Delete the devices that belong to the userland
6680 + lockspaces to be deleted. */
6681 + dlm_device_free_devices();
6683 + /* Now try to clean the lockspaces */
6684 + spin_lock(&lslist_lock);
6686 + list_for_each_entry_safe(ls, tmp, &lslist, ls_list) {
6687 + spin_unlock(&lslist_lock);
6688 + release_lockspace(ls, 3);
6689 + spin_lock(&lslist_lock);
6692 + spin_unlock(&lslist_lock);
6695 +struct dlm_recover *allocate_dlm_recover(void)
6697 + struct dlm_recover *rv;
6699 + rv = kmalloc(sizeof(struct dlm_recover), GFP_KERNEL);
6701 + memset(rv, 0, sizeof(struct dlm_recover));
6706 + * Called by CMAN on a specific ls. "stop" means set flag which while set
6707 + * causes all new requests to ls to be queued and not submitted until flag is
6708 + * cleared. stop on a ls also needs to cancel any prior starts on the ls.
6709 + * The recoverd thread carries out any work called for by this event.
6712 +static int dlm_ls_stop(void *servicedata)
6714 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6717 + spin_lock(&ls->ls_recover_lock);
6718 + ls->ls_last_stop = ls->ls_last_start;
6719 + set_bit(LSFL_LS_STOP, &ls->ls_flags);
6720 + new = test_and_clear_bit(LSFL_LS_RUN, &ls->ls_flags);
6721 + spin_unlock(&ls->ls_recover_lock);
6724 + * This in_recovery lock does two things:
6726 + * 1) Keeps this function from returning until all threads are out
6727 + * of locking routines and locking is truly stopped.
6728 + * 2) Keeps any new requests from being processed until it's unlocked
6729 + * when recovery is complete.
6733 + down_write(&ls->ls_in_recovery);
6735 + clear_bit(LSFL_RESDIR_VALID, &ls->ls_flags);
6736 + clear_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags);
6737 + clear_bit(LSFL_NODES_VALID, &ls->ls_flags);
6738 + clear_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
6740 + dlm_recoverd_kick(ls);
6746 + * Called by CMAN on a specific ls. "start" means enable the lockspace to do
6747 + * request processing which first requires that the recovery procedure be
6748 + * stepped through with all nodes sharing the lockspace (nodeids). The first
6749 + * start on the ls after it's created is a special case and requires some extra
6750 + * work like figuring out our own local nodeid. We can't do all this in the
6751 + * calling CMAN context, so we must pass this work off to the recoverd thread
6752 + * which was created in dlm_init(). The recoverd thread carries out any work
6753 + * called for by this event.
6756 +static int dlm_ls_start(void *servicedata, uint32_t *nodeids, int count,
6757 + int event_id, int type)
6759 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6760 + struct dlm_recover *rv;
6761 + int error = -ENOMEM;
6763 + rv = allocate_dlm_recover();
6767 + rv->nodeids = nodeids;
6768 + rv->node_count = count;
6769 + rv->event_id = event_id;
6771 + spin_lock(&ls->ls_recover_lock);
6772 + if (ls->ls_last_start == event_id)
6773 + log_all(ls, "repeated start %d stop %d finish %d",
6774 + event_id, ls->ls_last_stop, ls->ls_last_finish);
6775 + ls->ls_last_start = event_id;
6776 + list_add_tail(&rv->list, &ls->ls_recover);
6777 + set_bit(LSFL_LS_START, &ls->ls_flags);
6778 + spin_unlock(&ls->ls_recover_lock);
6780 + dlm_recoverd_kick(ls);
6788 + * Called by CMAN on a specific ls. "finish" means that all nodes which
6789 + * received a "start" have completed the start and called kcl_start_done.
6790 + * The recoverd thread carries out any work called for by this event.
6793 +static void dlm_ls_finish(void *servicedata, int event_id)
6795 + struct dlm_ls *ls = (struct dlm_ls *) servicedata;
6797 + spin_lock(&ls->ls_recover_lock);
6798 + ls->ls_last_finish = event_id;
6799 + set_bit(LSFL_LS_FINISH, &ls->ls_flags);
6800 + spin_unlock(&ls->ls_recover_lock);
6802 + dlm_recoverd_kick(ls);
6805 +struct kcl_service_ops ls_ops = {
6806 + .stop = dlm_ls_stop,
6807 + .start = dlm_ls_start,
6808 + .finish = dlm_ls_finish
6810 diff -urN linux-orig/cluster/dlm/lockspace.h linux-patched/cluster/dlm/lockspace.h
6811 --- linux-orig/cluster/dlm/lockspace.h 1970-01-01 07:30:00.000000000 +0730
6812 +++ linux-patched/cluster/dlm/lockspace.h 2004-11-03 11:31:56.000000000 +0800
6814 +/******************************************************************************
6815 +*******************************************************************************
6817 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6818 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6820 +** This copyrighted material is made available to anyone wishing to use,
6821 +** modify, copy, or redistribute it subject to the terms and conditions
6822 +** of the GNU General Public License v.2.
6824 +*******************************************************************************
6825 +******************************************************************************/
6827 +#ifndef __LOCKSPACE_DOT_H__
6828 +#define __LOCKSPACE_DOT_H__
6830 +void dlm_lockspace_init(void);
6831 +int dlm_init(void);
6832 +int dlm_release(void);
6833 +int dlm_new_lockspace(char *name, int namelen, void **ls, int flags);
6834 +int dlm_release_lockspace(void *ls, int force);
6835 +void dlm_emergency_shutdown(void);
6836 +struct dlm_ls *find_lockspace_by_global_id(uint32_t id);
6837 +struct dlm_ls *find_lockspace_by_local_id(void *id);
6838 +struct dlm_ls *find_lockspace_by_name(char *name, int namelen);
6839 +void hold_lockspace(struct dlm_ls *ls);
6840 +void put_lockspace(struct dlm_ls *ls);
6842 +#endif /* __LOCKSPACE_DOT_H__ */
6843 diff -urN linux-orig/cluster/dlm/lowcomms.c linux-patched/cluster/dlm/lowcomms.c
6844 --- linux-orig/cluster/dlm/lowcomms.c 1970-01-01 07:30:00.000000000 +0730
6845 +++ linux-patched/cluster/dlm/lowcomms.c 2004-11-03 11:31:56.000000000 +0800
6847 +/******************************************************************************
6848 +*******************************************************************************
6850 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
6851 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6853 +** This copyrighted material is made available to anyone wishing to use,
6854 +** modify, copy, or redistribute it subject to the terms and conditions
6855 +** of the GNU General Public License v.2.
6857 +*******************************************************************************
6858 +******************************************************************************/
6863 + * This is the "low-level" comms layer.
6865 + * It is responsible for sending/receiving messages
6866 + * from other nodes in the cluster.
6868 + * Cluster nodes are referred to by their nodeids. nodeids are
6869 + * simply 32 bit numbers to the locking module - if they need to
6870 + * be expanded for the cluster infrastructure then that is its
6871 + * responsibility. It is this layer's
6872 + * responsibility to resolve these into IP address or
6873 + * whatever it needs for inter-node communication.
6875 + * The comms level is two kernel threads that deal mainly with
6876 + * the receiving of messages from other nodes and passing them
6877 + * up to the mid-level comms layer (which understands the
6878 + * message format) for execution by the locking core, and
6879 + * a send thread which does all the setting up of connections
6880 + * to remote nodes and the sending of data. Threads are not allowed
6881 + * to send their own data because it may cause them to wait in times
6882 + * of high load. Also, this way, the sending thread can collect together
6883 + * messages bound for one node and send them in one block.
6885 + * I don't see any problem with the recv thread executing the locking
6886 + * code on behalf of remote processes as the locking code is
6887 + * short, efficient and never waits.
6892 +#include <asm/ioctls.h>
6893 +#include <net/sock.h>
6894 +#include <net/tcp.h>
6895 +#include <linux/pagemap.h>
6896 +#include <cluster/cnxman.h>
6898 +#include "dlm_internal.h"
6899 +#include "lowcomms.h"
6900 +#include "midcomms.h"
6901 +#include "config.h"
6909 +#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; (cb)->mask = ((size)-1); } while(0)
6910 +#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
6911 +#define CBUF_EMPTY(cb) ((cb)->len == 0)
6912 +#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
6913 +#define CBUF_EAT(cb, n) do { (cb)->len -= (n); \
6914 + (cb)->base += (n); (cb)->base &= (cb)->mask; } while(0)
6915 +#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
6917 +struct connection {
6918 + struct socket *sock; /* NULL if not connected */
6919 + uint32_t nodeid; /* So we know who we are in the list */
6920 + struct rw_semaphore sock_sem; /* Stop connect races */
6921 + struct list_head read_list; /* On this list when ready for reading */
6922 + struct list_head write_list; /* On this list when ready for writing */
6923 + struct list_head state_list; /* On this list when ready to connect */
6924 + unsigned long flags; /* bit 1,2 = We are on the read/write lists */
6925 +#define CF_READ_PENDING 1
6926 +#define CF_WRITE_PENDING 2
6927 +#define CF_CONNECT_PENDING 3
6928 +#define CF_IS_OTHERCON 4
6929 + struct list_head writequeue; /* List of outgoing writequeue_entries */
6930 + struct list_head listenlist; /* List of allocated listening sockets */
6931 + spinlock_t writequeue_lock;
6932 + int (*rx_action) (struct connection *); /* What to do when active */
6933 + struct page *rx_page;
6936 + atomic_t waiting_requests;
6937 +#define MAX_CONNECT_RETRIES 3
6938 + struct connection *othercon;
6940 +#define sock2con(x) ((struct connection *)(x)->sk_user_data)
6942 +/* An entry waiting to be sent */
6943 +struct writequeue_entry {
6944 + struct list_head list;
6945 + struct page *page;
6950 + struct connection *con;
6953 +/* "Template" structure for IPv4 and IPv6 used to fill
6954 + * in the missing bits when converting between cman (which knows
6955 + * nothing about sockaddr structs) and real life where we actually
6956 + * have to connect to these addresses. Also one of these structs
6957 + * will hold the cached "us" address.
6959 + * It's an in6 sockaddr just so there's enough space for anything
6960 + * we're likely to see here.
6962 +static struct sockaddr_in6 local_addr;
6964 +/* Manage daemons */
6965 +static struct task_struct *recv_task;
6966 +static struct task_struct *send_task;
6968 +static wait_queue_t lowcomms_send_waitq_head;
6969 +static wait_queue_head_t lowcomms_send_waitq;
6970 +static wait_queue_t lowcomms_recv_waitq_head;
6971 +static wait_queue_head_t lowcomms_recv_waitq;
6973 +/* An array of pointers to connections, indexed by NODEID */
6974 +static struct connection **connections;
6975 +static struct rw_semaphore connections_lock;
6976 +static kmem_cache_t *con_cache;
6977 +static int conn_array_size;
6978 +static atomic_t accepting;
6980 +/* List of sockets that have reads pending */
6981 +static struct list_head read_sockets;
6982 +static spinlock_t read_sockets_lock;
6984 +/* List of sockets which have writes pending */
6985 +static struct list_head write_sockets;
6986 +static spinlock_t write_sockets_lock;
6988 +/* List of sockets which have connects pending */
6989 +static struct list_head state_sockets;
6990 +static spinlock_t state_sockets_lock;
6992 +/* List of allocated listen sockets */
6993 +static struct list_head listen_sockets;
6995 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr);
6996 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len);
6999 +static struct connection *nodeid2con(int nodeid, int allocation)
7001 + struct connection *con = NULL;
7003 + down_read(&connections_lock);
7004 + if (nodeid >= conn_array_size) {
7005 + int new_size = nodeid + dlm_config.conn_increment;
7006 + struct connection **new_conns;
7008 + new_conns = kmalloc(sizeof(struct connection *) *
7009 + new_size, allocation);
7013 + up_read(&connections_lock);
7014 + /* The worst that can happen here (I think), is that
7015 + we get two consecutive reallocations */
7016 + down_write(&connections_lock);
7018 + memset(new_conns, 0, sizeof(struct connection *) * new_size);
7019 + memcpy(new_conns, connections, sizeof(struct connection *) * conn_array_size);
7020 + conn_array_size = new_size;
7021 + kfree(connections);
7022 + connections = new_conns;
7024 + up_write(&connections_lock);
7025 + down_read(&connections_lock);
7028 + con = connections[nodeid];
7029 + if (con == NULL && allocation) {
7030 + con = kmem_cache_alloc(con_cache, allocation);
7034 + memset(con, 0, sizeof(*con));
7035 + con->nodeid = nodeid;
7036 + init_rwsem(&con->sock_sem);
7037 + INIT_LIST_HEAD(&con->writequeue);
7038 + spin_lock_init(&con->writequeue_lock);
7040 + connections[nodeid] = con;
7044 + up_read(&connections_lock);
7048 +/* Data available on socket or listen socket received a connect */
7049 +static void lowcomms_data_ready(struct sock *sk, int count_unused)
7051 + struct connection *con = sock2con(sk);
7053 + atomic_inc(&con->waiting_requests);
7054 + if (test_and_set_bit(CF_READ_PENDING, &con->flags))
7057 + spin_lock_bh(&read_sockets_lock);
7058 + list_add_tail(&con->read_list, &read_sockets);
7059 + spin_unlock_bh(&read_sockets_lock);
7061 + wake_up_interruptible(&lowcomms_recv_waitq);
7064 +static void lowcomms_write_space(struct sock *sk)
7066 + struct connection *con = sock2con(sk);
7068 + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags))
7071 + spin_lock_bh(&write_sockets_lock);
7072 + list_add_tail(&con->write_list, &write_sockets);
7073 + spin_unlock_bh(&write_sockets_lock);
7075 + wake_up_interruptible(&lowcomms_send_waitq);
7078 +static inline void lowcomms_connect_sock(struct connection *con)
7080 + if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
7082 + if (!atomic_read(&accepting))
7085 + spin_lock_bh(&state_sockets_lock);
7086 + list_add_tail(&con->state_list, &state_sockets);
7087 + spin_unlock_bh(&state_sockets_lock);
7089 + wake_up_interruptible(&lowcomms_send_waitq);
7092 +static void lowcomms_state_change(struct sock *sk)
7094 +/* struct connection *con = sock2con(sk); */
7096 + switch (sk->sk_state) {
7097 + case TCP_ESTABLISHED:
7098 + lowcomms_write_space(sk);
7101 + case TCP_FIN_WAIT1:
7102 + case TCP_FIN_WAIT2:
7103 + case TCP_TIME_WAIT:
7105 + case TCP_CLOSE_WAIT:
7106 + case TCP_LAST_ACK:
7108 + /* FIXME: I think this causes more trouble than it solves.
7109 + lowcomms will reconnect anyway when there is something to
7110 + send. This just attempts reconnection if a node goes down!
7112 + /* lowcomms_connect_sock(con); */
7116 + printk("dlm: lowcomms_state_change: state=%d\n", sk->sk_state);
7121 +/* Make a socket active */
7122 +static int add_sock(struct socket *sock, struct connection *con)
7126 + /* Install a data_ready callback */
7127 + con->sock->sk->sk_data_ready = lowcomms_data_ready;
7128 + con->sock->sk->sk_write_space = lowcomms_write_space;
7129 + con->sock->sk->sk_state_change = lowcomms_state_change;
7134 +/* Add the port number to an IP6 or 4 sockaddr and return the address
7136 +static void make_sockaddr(struct sockaddr_in6 *saddr, uint16_t port,
7139 + saddr->sin6_family = local_addr.sin6_family;
7140 + if (local_addr.sin6_family == AF_INET) {
7141 + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
7142 + in4_addr->sin_port = cpu_to_be16(port);
7143 + *addr_len = sizeof(struct sockaddr_in);
7146 + saddr->sin6_port = cpu_to_be16(port);
7147 + *addr_len = sizeof(struct sockaddr_in6);
7151 +/* Close a remote connection and tidy up */
7152 +static void close_connection(struct connection *con, int and_other)
7154 + down_write(&con->sock_sem);
7157 + sock_release(con->sock);
7159 + if (con->othercon && and_other) {
7160 + /* Argh! recursion in kernel code!
7161 + Actually, this isn't a list so it
7162 + will only re-enter once.
7164 + close_connection(con->othercon, TRUE);
7167 + if (con->rx_page) {
7168 + __free_page(con->rx_page);
7169 + con->rx_page = NULL;
7171 + up_write(&con->sock_sem);
7174 +/* Data received from remote end */
7175 +static int receive_from_sock(struct connection *con)
7178 + struct msghdr msg;
7179 + struct iovec iov[2];
7183 + int call_again_soon = 0;
7185 + down_read(&con->sock_sem);
7187 + if (con->sock == NULL)
7189 + if (con->rx_page == NULL) {
7191 + * This doesn't need to be atomic, but I think it should
7192 + * improve performance if it is.
7194 + con->rx_page = alloc_page(GFP_ATOMIC);
7195 + if (con->rx_page == NULL)
7197 + CBUF_INIT(&con->cb, PAGE_CACHE_SIZE);
7201 + * To avoid doing too many short reads, we will reschedule for
7202 + * another time if there are less than 20 bytes left in the buffer.
7204 + if (!CBUF_MAY_ADD(&con->cb, 20))
7207 + msg.msg_control = NULL;
7208 + msg.msg_controllen = 0;
7209 + msg.msg_iovlen = 1;
7210 + msg.msg_iov = iov;
7211 + msg.msg_name = NULL;
7212 + msg.msg_namelen = 0;
7213 + msg.msg_flags = 0;
7216 + * iov[0] is the bit of the circular buffer between the current end
7217 + * point (cb.base + cb.len) and the end of the buffer.
7219 + iov[0].iov_len = con->cb.base - CBUF_DATA(&con->cb);
7220 + iov[0].iov_base = page_address(con->rx_page) + CBUF_DATA(&con->cb);
7221 + iov[1].iov_len = 0;
7224 + * iov[1] is the bit of the circular buffer between the start of the
7225 + * buffer and the start of the currently used section (cb.base)
7227 + if (CBUF_DATA(&con->cb) >= con->cb.base) {
7228 + iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&con->cb);
7229 + iov[1].iov_len = con->cb.base;
7230 + iov[1].iov_base = page_address(con->rx_page);
7231 + msg.msg_iovlen = 2;
7233 + len = iov[0].iov_len + iov[1].iov_len;
7237 + r = ret = sock_recvmsg(con->sock, &msg, len,
7238 + MSG_DONTWAIT | MSG_NOSIGNAL);
7244 + call_again_soon = 1;
7245 + CBUF_ADD(&con->cb, ret);
7246 + ret = midcomms_process_incoming_buffer(con->nodeid,
7247 + page_address(con->rx_page),
7248 + con->cb.base, con->cb.len,
7250 + if (ret == -EBADMSG) {
7251 + printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
7252 + "iov_len=%u, iov_base[0]=%p, read=%d\n",
7253 + page_address(con->rx_page), con->cb.base, con->cb.len,
7254 + len, iov[0].iov_base, r);
7258 + CBUF_EAT(&con->cb, ret);
7260 + if (CBUF_EMPTY(&con->cb) && !call_again_soon) {
7261 + __free_page(con->rx_page);
7262 + con->rx_page = NULL;
7266 + if (call_again_soon)
7268 + up_read(&con->sock_sem);
7273 + lowcomms_data_ready(con->sock->sk, 0);
7274 + up_read(&con->sock_sem);
7279 + up_read(&con->sock_sem);
7280 + if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) {
7281 + close_connection(con, FALSE);
7282 + lowcomms_connect_sock(con);
7289 +/* Listening socket is busy, accept a connection */
7290 +static int accept_from_sock(struct connection *con)
7293 + struct sockaddr_in6 peeraddr;
7294 + struct socket *newsock;
7297 + struct connection *newcon;
7299 + memset(&peeraddr, 0, sizeof(peeraddr));
7300 + newsock = sock_alloc();
7304 + down_read(&con->sock_sem);
7306 + result = -ENOTCONN;
7307 + if (con->sock == NULL)
7310 + newsock->type = con->sock->type;
7311 + newsock->ops = con->sock->ops;
7313 + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
7317 + /* Get the connected socket's peer */
7318 + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
7320 + result = -ECONNABORTED;
7324 + /* Get the new node's NODEID */
7325 + nodeid = lowcomms_nodeid_from_ipaddr((struct sockaddr *)&peeraddr, len);
7326 + if (nodeid == 0) {
7327 + printk("dlm: connect from non cluster node\n");
7328 + sock_release(newsock);
7329 + up_read(&con->sock_sem);
7333 + log_print("got connection from %d", nodeid);
7335 + /* Check to see if we already have a connection to this node. This
7336 + * could happen if the two nodes initiate a connection at roughly
7337 + * the same time and the connections cross on the wire.
7339 + * In this case we store the incoming one in "othercon"
7341 + newcon = nodeid2con(nodeid, GFP_KERNEL);
7346 + down_write(&newcon->sock_sem);
7347 + if (newcon->sock) {
7348 + struct connection *othercon = newcon->othercon;
7351 + othercon = kmem_cache_alloc(con_cache, GFP_KERNEL);
7353 + printk("dlm: failed to allocate incoming socket\n");
7354 + up_write(&newcon->sock_sem);
7358 + memset(othercon, 0, sizeof(*othercon));
7359 + othercon->nodeid = nodeid;
7360 + othercon->rx_action = receive_from_sock;
7361 + init_rwsem(&othercon->sock_sem);
7362 + set_bit(CF_IS_OTHERCON, &othercon->flags);
7363 + newcon->othercon = othercon;
7365 + othercon->sock = newsock;
7366 + newsock->sk->sk_user_data = othercon;
7367 + add_sock(newsock, othercon);
7370 + newsock->sk->sk_user_data = newcon;
7371 + newcon->rx_action = receive_from_sock;
7372 + add_sock(newsock, newcon);
7376 + up_write(&newcon->sock_sem);
7379 + * Add it to the active queue in case we got data
7380 + * between processing the accept and adding the socket
7381 + * to the read_sockets list
7383 + lowcomms_data_ready(newsock->sk, 0);
7384 + up_read(&con->sock_sem);
7389 + up_read(&con->sock_sem);
7390 + sock_release(newsock);
7392 + if (result != -EAGAIN)
7393 + printk("dlm: error accepting connection from node: %d\n", result);
7397 +/* Connect a new socket to its peer */
7398 +static int connect_to_sock(struct connection *con)
7400 + int result = -EHOSTUNREACH;
7401 + struct sockaddr_in6 saddr;
7403 + struct socket *sock;
7405 + if (con->nodeid == 0) {
7406 + log_print("attempt to connect sock 0 foiled");
7410 + down_write(&con->sock_sem);
7411 + if (con->retries++ > MAX_CONNECT_RETRIES)
7414 + // FIXME not sure this should happen, let alone like this.
7416 + sock_release(con->sock);
7420 + /* Create a socket to communicate with */
7421 + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
7425 + memset(&saddr, 0, sizeof(saddr));
7426 + if (lowcomms_ipaddr_from_nodeid(con->nodeid, (struct sockaddr *)&saddr) < 0)
7429 + sock->sk->sk_user_data = con;
7430 + con->rx_action = receive_from_sock;
7432 + make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
7434 + add_sock(sock, con);
7436 + log_print("connecting to %d", con->nodeid);
7438 + sock->ops->connect(sock, (struct sockaddr *) &saddr, addr_len,
7440 + if (result == -EINPROGRESS)
7446 + up_write(&con->sock_sem);
7448 + * Returning an error here means we've given up trying to connect to
7449 + * a remote node, otherwise we return 0 and reschedule the connection
7456 + sock_release(con->sock);
7460 + * Some errors are fatal and this list might need adjusting. For other
7461 + * errors we try again until the max number of retries is reached.
7463 + if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
7464 + result != -ENETDOWN && result != -EINVAL
7465 + && result != -EPROTONOSUPPORT) {
7466 + lowcomms_connect_sock(con);
7472 +static struct socket *create_listen_sock(struct connection *con, char *addr, int addr_len)
7474 + struct socket *sock = NULL;
7478 + struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr;
7480 + /* Create a socket to communicate with */
7481 + result = sock_create_kern(local_addr.sin6_family, SOCK_STREAM, IPPROTO_TCP, &sock);
7483 + printk("dlm: Can't create listening comms socket\n");
7489 + result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one));
7492 + printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",result);
7494 + sock->sk->sk_user_data = con;
7495 + con->rx_action = accept_from_sock;
7498 + /* Bind to our port */
7499 + make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
7500 + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
7502 + printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
7503 + sock_release(sock);
7511 + result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&one, sizeof(one));
7514 + printk("dlm: Set keepalive failed: %d\n", result);
7517 + result = sock->ops->listen(sock, 5);
7519 + printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
7520 + sock_release(sock);
7530 +/* Listen on all interfaces */
7531 +static int listen_for_all(void)
7535 + struct socket *sock = NULL;
7536 + struct list_head *addr_list;
7537 + struct connection *con = nodeid2con(0, GFP_KERNEL);
7538 + struct connection *temp;
7539 + struct cluster_node_addr *node_addr;
7540 + char local_addr[sizeof(struct sockaddr_in6)];
7542 + /* This will also fill in local_addr */
7543 + nodeid = lowcomms_our_nodeid();
7545 + addr_list = kcl_get_node_addresses(nodeid);
7547 + printk("dlm: cannot initialise comms layer\n");
7548 + result = -ENOTCONN;
7552 + list_for_each_entry(node_addr, addr_list, list) {
7555 + con = kmem_cache_alloc(con_cache, GFP_KERNEL);
7557 + printk("dlm: failed to allocate listen socket\n");
7561 + memset(con, 0, sizeof(*con));
7562 + init_rwsem(&con->sock_sem);
7563 + spin_lock_init(&con->writequeue_lock);
7564 + INIT_LIST_HEAD(&con->writequeue);
7565 + set_bit(CF_IS_OTHERCON, &con->flags);
7568 + memcpy(local_addr, node_addr->addr, node_addr->addr_len);
7569 + sock = create_listen_sock(con, local_addr,
7570 + node_addr->addr_len);
7572 + add_sock(sock, con);
7574 + /* Keep a list of dynamically allocated listening sockets
7575 + so we can free them at shutdown */
7576 + if (test_bit(CF_IS_OTHERCON, &con->flags)) {
7577 + list_add_tail(&con->listenlist, &listen_sockets);
7581 + result = -EADDRINUSE;
7582 + kmem_cache_free(con_cache, con);
7593 + /* Free up any dynamically allocated listening sockets */
7594 + list_for_each_entry_safe(con, temp, &listen_sockets, listenlist) {
7595 + sock_release(con->sock);
7596 + kmem_cache_free(con_cache, con);
7603 +static struct writequeue_entry *new_writequeue_entry(struct connection *con,
7606 + struct writequeue_entry *entry;
7608 + entry = kmalloc(sizeof(struct writequeue_entry), allocation);
7612 + entry->page = alloc_page(allocation);
7613 + if (!entry->page) {
7618 + entry->offset = 0;
7627 +struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
7628 + int allocation, char **ppc)
7630 + struct connection *con = nodeid2con(nodeid, allocation);
7631 + struct writequeue_entry *e;
7638 + if (!atomic_read(&accepting))
7641 + spin_lock(&con->writequeue_lock);
7642 + e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
7643 + if (((struct list_head *) e == &con->writequeue) ||
7644 + (PAGE_CACHE_SIZE - e->end < len)) {
7649 + users = e->users++;
7651 + spin_unlock(&con->writequeue_lock);
7657 + *ppc = page_address(e->page) + offset;
7661 + e = new_writequeue_entry(con, allocation);
7663 + spin_lock(&con->writequeue_lock);
7666 + users = e->users++;
7667 + list_add_tail(&e->list, &con->writequeue);
7668 + spin_unlock(&con->writequeue_lock);
7674 +void lowcomms_commit_buffer(struct writequeue_entry *e)
7676 + struct connection *con = e->con;
7679 + if (!atomic_read(&accepting))
7682 + spin_lock(&con->writequeue_lock);
7683 + users = --e->users;
7686 + e->len = e->end - e->offset;
7688 + spin_unlock(&con->writequeue_lock);
7690 + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) {
7691 + spin_lock_bh(&write_sockets_lock);
7692 + list_add_tail(&con->write_list, &write_sockets);
7693 + spin_unlock_bh(&write_sockets_lock);
7695 + wake_up_interruptible(&lowcomms_send_waitq);
7700 + spin_unlock(&con->writequeue_lock);
7704 +static void free_entry(struct writequeue_entry *e)
7706 + __free_page(e->page);
7710 +/* Send a message */
7711 +static int send_to_sock(struct connection *con)
7714 + ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
7715 + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
7716 + struct writequeue_entry *e;
7719 + down_read(&con->sock_sem);
7720 + if (con->sock == NULL)
7723 + sendpage = con->sock->ops->sendpage;
7725 + spin_lock(&con->writequeue_lock);
7727 + e = list_entry(con->writequeue.next, struct writequeue_entry,
7729 + if ((struct list_head *) e == &con->writequeue)
7733 + offset = e->offset;
7734 + BUG_ON(len == 0 && e->users == 0);
7735 + spin_unlock(&con->writequeue_lock);
7739 + ret = sendpage(con->sock, e->page, offset, len,
7741 + if (ret == -EAGAIN || ret == 0)
7747 + spin_lock(&con->writequeue_lock);
7751 + if (e->len == 0 && e->users == 0) {
7752 + list_del(&e->list);
7757 + spin_unlock(&con->writequeue_lock);
7759 + up_read(&con->sock_sem);
7763 + up_read(&con->sock_sem);
7764 + close_connection(con, FALSE);
7765 + lowcomms_connect_sock(con);
7769 + up_read(&con->sock_sem);
7770 + lowcomms_connect_sock(con);
7774 +static void clean_one_writequeue(struct connection *con)
7776 + struct list_head *list;
7777 + struct list_head *temp;
7779 + spin_lock(&con->writequeue_lock);
7780 + list_for_each_safe(list, temp, &con->writequeue) {
7781 + struct writequeue_entry *e =
7782 + list_entry(list, struct writequeue_entry, list);
7783 + list_del(&e->list);
7786 + spin_unlock(&con->writequeue_lock);
7789 +/* Called from recovery when it knows that a node has
7790 + left the cluster */
7791 +int lowcomms_close(int nodeid)
7793 + struct connection *con;
7798 + log_print("closing connection to node %d", nodeid);
7799 + con = nodeid2con(nodeid, 0);
7801 + close_connection(con, TRUE);
7802 + clean_one_writequeue(con);
7803 + atomic_set(&con->waiting_requests, 0);
7811 +/* API send message call, may queue the request */
7812 +/* N.B. This is the old interface - use the new one for new calls */
7813 +int lowcomms_send_message(int nodeid, char *buf, int len, int allocation)
7815 + struct writequeue_entry *e;
7818 + e = lowcomms_get_buffer(nodeid, len, allocation, &b);
7820 + memcpy(b, buf, len);
7821 + lowcomms_commit_buffer(e);
7827 +/* Look for activity on active sockets */
7828 +static void process_sockets(void)
7830 + struct list_head *list;
7831 + struct list_head *temp;
7833 + spin_lock_bh(&read_sockets_lock);
7834 + list_for_each_safe(list, temp, &read_sockets) {
7835 + struct connection *con =
7836 + list_entry(list, struct connection, read_list);
7837 + list_del(&con->read_list);
7838 + clear_bit(CF_READ_PENDING, &con->flags);
7840 + spin_unlock_bh(&read_sockets_lock);
7842 + /* This can reach zero if we are processing requests
7843 + * as they come in.
7845 + if (atomic_read(&con->waiting_requests) == 0) {
7846 + spin_lock_bh(&read_sockets_lock);
7851 + con->rx_action(con);
7852 + } while (!atomic_dec_and_test(&con->waiting_requests) &&
7853 + !kthread_should_stop());
7855 + /* Don't starve out everyone else */
7857 + spin_lock_bh(&read_sockets_lock);
7859 + spin_unlock_bh(&read_sockets_lock);
7862 +/* Try to send any messages that are pending
7864 +static void process_output_queue(void)
7866 + struct list_head *list;
7867 + struct list_head *temp;
7870 + spin_lock_bh(&write_sockets_lock);
7871 + list_for_each_safe(list, temp, &write_sockets) {
7872 + struct connection *con =
7873 + list_entry(list, struct connection, write_list);
7874 + list_del(&con->write_list);
7875 + clear_bit(CF_WRITE_PENDING, &con->flags);
7877 + spin_unlock_bh(&write_sockets_lock);
7879 + ret = send_to_sock(con);
7882 + spin_lock_bh(&write_sockets_lock);
7884 + spin_unlock_bh(&write_sockets_lock);
7887 +static void process_state_queue(void)
7889 + struct list_head *list;
7890 + struct list_head *temp;
7893 + spin_lock_bh(&state_sockets_lock);
7894 + list_for_each_safe(list, temp, &state_sockets) {
7895 + struct connection *con =
7896 + list_entry(list, struct connection, state_list);
7897 + list_del(&con->state_list);
7898 + clear_bit(CF_CONNECT_PENDING, &con->flags);
7899 + spin_unlock_bh(&state_sockets_lock);
7901 + ret = connect_to_sock(con);
7904 + spin_lock_bh(&state_sockets_lock);
7906 + spin_unlock_bh(&state_sockets_lock);
7910 +/* Discard all entries on the write queues */
7911 +static void clean_writequeues(void)
7915 + for (nodeid = 1; nodeid < conn_array_size; nodeid++) {
7916 + struct connection *con = nodeid2con(nodeid, 0);
7919 + clean_one_writequeue(con);
7923 +static int read_list_empty(void)
7927 + spin_lock_bh(&read_sockets_lock);
7928 + status = list_empty(&read_sockets);
7929 + spin_unlock_bh(&read_sockets_lock);
7934 +/* DLM Transport comms receive daemon */
7935 +static int dlm_recvd(void *data)
7937 + init_waitqueue_head(&lowcomms_recv_waitq);
7938 + init_waitqueue_entry(&lowcomms_recv_waitq_head, current);
7939 + add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head);
7941 + while (!kthread_should_stop()) {
7942 + set_current_state(TASK_INTERRUPTIBLE);
7943 + if (read_list_empty())
7945 + set_current_state(TASK_RUNNING);
7947 + process_sockets();
7953 +static int write_and_state_lists_empty(void)
7957 + spin_lock_bh(&write_sockets_lock);
7958 + status = list_empty(&write_sockets);
7959 + spin_unlock_bh(&write_sockets_lock);
7961 + spin_lock_bh(&state_sockets_lock);
7962 + if (list_empty(&state_sockets) == 0)
7964 + spin_unlock_bh(&state_sockets_lock);
7969 +/* DLM Transport send daemon */
7970 +static int dlm_sendd(void *data)
7972 + init_waitqueue_head(&lowcomms_send_waitq);
7973 + init_waitqueue_entry(&lowcomms_send_waitq_head, current);
7974 + add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head);
7976 + while (!kthread_should_stop()) {
7977 + set_current_state(TASK_INTERRUPTIBLE);
7978 + if (write_and_state_lists_empty())
7980 + set_current_state(TASK_RUNNING);
7982 + process_state_queue();
7983 + process_output_queue();
7989 +static void daemons_stop(void)
7991 + kthread_stop(recv_task);
7992 + kthread_stop(send_task);
7995 +static int daemons_start(void)
7997 + struct task_struct *p;
8000 + p = kthread_run(dlm_recvd, NULL, 0, "dlm_recvd");
8001 + error = IS_ERR(p);
8003 + log_print("can't start dlm_recvd %d", error);
8008 + p = kthread_run(dlm_sendd, NULL, 0, "dlm_sendd");
8009 + error = IS_ERR(p);
8011 + log_print("can't start dlm_sendd %d", error);
8012 + kthread_stop(recv_task);
8021 + * Return the largest buffer size we can cope with.
8023 +int lowcomms_max_buffer_size(void)
8025 + return PAGE_CACHE_SIZE;
8028 +void lowcomms_stop(void)
8031 + struct connection *temp;
8032 + struct connection *lcon;
8034 + atomic_set(&accepting, 0);
8036 + /* Set all the activity flags to prevent any
8039 + for (i = 0; i < conn_array_size; i++) {
8040 + if (connections[i])
8041 + connections[i]->flags = 0x7;
8044 + clean_writequeues();
8046 + for (i = 0; i < conn_array_size; i++) {
8047 + if (connections[i]) {
8048 + close_connection(connections[i], TRUE);
8049 + if (connections[i]->othercon)
8050 + kmem_cache_free(con_cache, connections[i]->othercon);
8051 + kmem_cache_free(con_cache, connections[i]);
8055 + kfree(connections);
8056 + connections = NULL;
8058 + /* Free up any dynamically allocated listening sockets */
8059 + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
8060 + sock_release(lcon->sock);
8061 + kmem_cache_free(con_cache, lcon);
8064 + kmem_cache_destroy(con_cache);
8065 + kcl_releaseref_cluster();
8068 +/* This is quite likely to sleep... */
8069 +int lowcomms_start(void)
8072 + struct connection *temp;
8073 + struct connection *lcon;
8075 + INIT_LIST_HEAD(&read_sockets);
8076 + INIT_LIST_HEAD(&write_sockets);
8077 + INIT_LIST_HEAD(&state_sockets);
8078 + INIT_LIST_HEAD(&listen_sockets);
8080 + spin_lock_init(&read_sockets_lock);
8081 + spin_lock_init(&write_sockets_lock);
8082 + spin_lock_init(&state_sockets_lock);
8083 + init_rwsem(&connections_lock);
8085 + error = -ENOTCONN;
8086 + if (kcl_addref_cluster())
8090 + * Temporarily initialise the waitq head so that lowcomms_send_message
8091 + * doesn't crash if it gets called before the thread is fully
8094 + init_waitqueue_head(&lowcomms_send_waitq);
8097 + connections = kmalloc(sizeof(struct connection *) *
8098 + dlm_config.conn_increment, GFP_KERNEL);
8102 + memset(connections, 0,
8103 + sizeof(struct connection *) * dlm_config.conn_increment);
8105 + conn_array_size = dlm_config.conn_increment;
8107 + con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
8108 + __alignof__(struct connection), 0, NULL, NULL);
8110 + goto fail_free_conn;
8113 + /* Start listening */
8114 + error = listen_for_all();
8116 + goto fail_unlisten;
8118 + error = daemons_start();
8120 + goto fail_unlisten;
8122 + atomic_set(&accepting, 1);
8127 + close_connection(connections[0], 0);
8128 + kmem_cache_free(con_cache, connections[0]);
8129 + list_for_each_entry_safe(lcon, temp, &listen_sockets, listenlist) {
8130 + sock_release(lcon->sock);
8131 + kmem_cache_free(con_cache, lcon);
8134 + kmem_cache_destroy(con_cache);
8137 + kcl_releaseref_cluster();
8138 + kfree(connections);
8144 +/* Don't accept any more outgoing work */
8145 +void lowcomms_stop_accept()
8147 + atomic_set(&accepting, 0);
8150 +/* Cluster Manager interface functions for looking up
8151 + nodeids and IP addresses by each other
8154 +/* Return the IP address of a node given its NODEID */
8155 +static int lowcomms_ipaddr_from_nodeid(int nodeid, struct sockaddr *retaddr)
8157 + struct list_head *addrs;
8158 + struct cluster_node_addr *node_addr;
8159 + struct cluster_node_addr *current_addr = NULL;
8160 + struct sockaddr_in6 *saddr;
8164 + addrs = kcl_get_node_addresses(nodeid);
8168 + interface = kcl_get_current_interface();
8170 + /* Look for address number <interface> */
8171 + i=0; /* i/f numbers start at 1 */
8172 + list_for_each_entry(node_addr, addrs, list) {
8173 + if (interface == ++i) {
8174 + current_addr = node_addr;
8179 + /* If that failed then just use the first one */
8180 + if (!current_addr)
8181 + current_addr = (struct cluster_node_addr *)addrs->next;
8183 + saddr = (struct sockaddr_in6 *)current_addr->addr;
8185 + /* Extract the IP address */
8186 + if (local_addr.sin6_family == AF_INET) {
8187 + struct sockaddr_in *in4 = (struct sockaddr_in *)saddr;
8188 + struct sockaddr_in *ret4 = (struct sockaddr_in *)retaddr;
8189 + ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
8192 + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *)retaddr;
8193 + memcpy(&ret6->sin6_addr, &saddr->sin6_addr, sizeof(saddr->sin6_addr));
8199 +/* Return the NODEID for a node given its sockaddr */
8200 +static int lowcomms_nodeid_from_ipaddr(struct sockaddr *addr, int addr_len)
8202 + struct kcl_cluster_node node;
8203 + struct sockaddr_in6 ipv6_addr;
8204 + struct sockaddr_in ipv4_addr;
8206 + if (local_addr.sin6_family == AF_INET) {
8207 + struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
8208 + memcpy(&ipv4_addr, &local_addr, addr_len);
8209 + memcpy(&ipv4_addr.sin_addr, &in4->sin_addr, sizeof(ipv4_addr.sin_addr));
8211 + addr = (struct sockaddr *)&ipv4_addr;
8214 + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
8215 + memcpy(&ipv6_addr, &local_addr, addr_len);
8216 + memcpy(&ipv6_addr.sin6_addr, &in6->sin6_addr, sizeof(ipv6_addr.sin6_addr));
8218 + addr = (struct sockaddr *)&ipv6_addr;
8221 + if (kcl_get_node_by_addr((char *)addr, addr_len, &node) == 0)
8222 + return node.node_id;
8227 +int lowcomms_our_nodeid(void)
8229 + struct kcl_cluster_node node;
8230 + struct list_head *addrs;
8231 + struct cluster_node_addr *first_addr;
8232 + static int our_nodeid = 0;
8235 + return our_nodeid;
8237 + if (kcl_get_node_by_nodeid(0, &node) == -1)
8240 + our_nodeid = node.node_id;
8242 + /* Fill in the "template" structure */
8243 + addrs = kcl_get_node_addresses(our_nodeid);
8247 + first_addr = (struct cluster_node_addr *) addrs->next;
8248 + memcpy(&local_addr, &first_addr->addr, first_addr->addr_len);
8250 + return node.node_id;
8253 + * Overrides for Emacs so that we follow Linus's tabbing style.
8254 + * Emacs will notice this stuff at the end of the file and automatically
8255 + * adjust the settings for this buffer only. This must remain at the end
8257 + * ---------------------------------------------------------------------------
8258 + * Local variables:
8259 + * c-file-style: "linux"
8262 diff -urN linux-orig/cluster/dlm/lowcomms.h linux-patched/cluster/dlm/lowcomms.h
8263 --- linux-orig/cluster/dlm/lowcomms.h 1970-01-01 07:30:00.000000000 +0730
8264 +++ linux-patched/cluster/dlm/lowcomms.h 2004-11-03 11:31:56.000000000 +0800
8266 +/******************************************************************************
8267 +*******************************************************************************
8269 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8270 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8272 +** This copyrighted material is made available to anyone wishing to use,
8273 +** modify, copy, or redistribute it subject to the terms and conditions
8274 +** of the GNU General Public License v.2.
8276 +*******************************************************************************
8277 +******************************************************************************/
8279 +#ifndef __LOWCOMMS_DOT_H__
8280 +#define __LOWCOMMS_DOT_H__
8282 +/* The old interface */
8283 +int lowcomms_send_message(int csid, char *buf, int len, int allocation);
8285 +/* The new interface */
8286 +struct writequeue_entry;
8287 +extern struct writequeue_entry *lowcomms_get_buffer(int nodeid, int len,
8288 + int allocation, char **ppc);
8289 +extern void lowcomms_commit_buffer(struct writequeue_entry *e);
8291 +int lowcomms_start(void);
8292 +void lowcomms_stop(void);
8293 +void lowcomms_stop_accept(void);
8294 +int lowcomms_close(int nodeid);
8295 +int lowcomms_max_buffer_size(void);
8297 +int lowcomms_our_nodeid(void);
8299 +#endif /* __LOWCOMMS_DOT_H__ */
8300 diff -urN linux-orig/cluster/dlm/main.c linux-patched/cluster/dlm/main.c
8301 --- linux-orig/cluster/dlm/main.c 1970-01-01 07:30:00.000000000 +0730
8302 +++ linux-patched/cluster/dlm/main.c 2004-11-03 11:31:56.000000000 +0800
8304 +/******************************************************************************
8305 +*******************************************************************************
8307 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8308 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8310 +** This copyrighted material is made available to anyone wishing to use,
8311 +** modify, copy, or redistribute it subject to the terms and conditions
8312 +** of the GNU General Public License v.2.
8314 +*******************************************************************************
8315 +******************************************************************************/
8317 +#define EXPORT_SYMTAB
8319 +#include <linux/init.h>
8320 +#include <linux/proc_fs.h>
8321 +#include <linux/ctype.h>
8322 +#include <linux/module.h>
8323 +#include <net/sock.h>
8325 +#include <cluster/cnxman.h>
8327 +#include "dlm_internal.h"
8328 +#include "lockspace.h"
8332 +#include "locking.h"
8333 +#include "config.h"
8334 +#include "memory.h"
8335 +#include "recover.h"
8336 +#include "lowcomms.h"
8338 +int dlm_device_init(void);
8339 +void dlm_device_exit(void);
8340 +void dlm_proc_init(void);
8341 +void dlm_proc_exit(void);
8344 +/* Cluster manager callbacks, we want to know if a node dies
8345 + N.B. this is independent of lockspace-specific event callbacks from SM */
8347 +static void cman_callback(kcl_callback_reason reason, long arg)
8349 + /* This is unconditional, so do what we can to tidy up */
8350 + if (reason == LEAVING) {
8351 + dlm_emergency_shutdown();
8355 +int __init init_dlm(void)
8358 + dlm_lockspace_init();
8360 + dlm_device_init();
8361 + dlm_memory_init();
8362 + dlm_config_init();
8364 + kcl_add_callback(cman_callback);
8366 + printk("DLM %s (built %s %s) installed\n",
8367 + DLM_RELEASE_NAME, __DATE__, __TIME__);
8372 +void __exit exit_dlm(void)
8374 + kcl_remove_callback(cman_callback);
8376 + dlm_device_exit();
8377 + dlm_memory_exit();
8378 + dlm_config_exit();
8382 +MODULE_DESCRIPTION("Distributed Lock Manager " DLM_RELEASE_NAME);
8383 +MODULE_AUTHOR("Red Hat, Inc.");
8384 +MODULE_LICENSE("GPL");
8386 +module_init(init_dlm);
8387 +module_exit(exit_dlm);
8389 +EXPORT_SYMBOL(dlm_init);
8390 +EXPORT_SYMBOL(dlm_release);
8391 +EXPORT_SYMBOL(dlm_new_lockspace);
8392 +EXPORT_SYMBOL(dlm_release_lockspace);
8393 +EXPORT_SYMBOL(dlm_lock);
8394 +EXPORT_SYMBOL(dlm_unlock);
8395 +EXPORT_SYMBOL(dlm_debug_dump);
8396 +EXPORT_SYMBOL(dlm_locks_dump);
8397 diff -urN linux-orig/cluster/dlm/memory.c linux-patched/cluster/dlm/memory.c
8398 --- linux-orig/cluster/dlm/memory.c 1970-01-01 07:30:00.000000000 +0730
8399 +++ linux-patched/cluster/dlm/memory.c 2004-11-03 11:31:56.000000000 +0800
8401 +/******************************************************************************
8402 +*******************************************************************************
8404 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8405 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8407 +** This copyrighted material is made available to anyone wishing to use,
8408 +** modify, copy, or redistribute it subject to the terms and conditions
8409 +** of the GNU General Public License v.2.
8411 +*******************************************************************************
8412 +******************************************************************************/
8416 + * memory allocation routines
8420 +#include "dlm_internal.h"
8421 +#include "memory.h"
8422 +#include "config.h"
8424 +/* as the man says...Shouldn't this be in a header file somewhere? */
8425 +#define BYTES_PER_WORD sizeof(void *)
8427 +static kmem_cache_t *rsb_cache_small;
8428 +static kmem_cache_t *rsb_cache_large;
8429 +static kmem_cache_t *lkb_cache;
8430 +static kmem_cache_t *lvb_cache;
8431 +static kmem_cache_t *resdir_cache_large;
8432 +static kmem_cache_t *resdir_cache_small;
8434 +/* The thresholds above which we allocate large RSBs/direntry rather than small
8435 + * ones. This must make the resultant structure end on a word boundary */
8436 +#define LARGE_RSB_NAME 28
8437 +#define LARGE_RES_NAME 28
8439 +int dlm_memory_init()
8441 + int ret = -ENOMEM;
8445 + kmem_cache_create("dlm_rsb(small)",
8446 + (sizeof(struct dlm_rsb) + LARGE_RSB_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
8447 + __alignof__(struct dlm_rsb), 0, NULL, NULL);
8448 + if (!rsb_cache_small)
8452 + kmem_cache_create("dlm_rsb(large)",
8453 + sizeof(struct dlm_rsb) + DLM_RESNAME_MAXLEN,
8454 + __alignof__(struct dlm_rsb), 0, NULL, NULL);
8455 + if (!rsb_cache_large)
8456 + goto out_free_rsbs;
8458 + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
8459 + __alignof__(struct dlm_lkb), 0, NULL, NULL);
8461 + goto out_free_rsbl;
8463 + resdir_cache_large =
8464 + kmem_cache_create("dlm_resdir(l)",
8465 + sizeof(struct dlm_direntry) + DLM_RESNAME_MAXLEN,
8466 + __alignof__(struct dlm_direntry), 0, NULL, NULL);
8467 + if (!resdir_cache_large)
8468 + goto out_free_lkb;
8470 + resdir_cache_small =
8471 + kmem_cache_create("dlm_resdir(s)",
8472 + (sizeof(struct dlm_direntry) + LARGE_RES_NAME + BYTES_PER_WORD-1) & ~(BYTES_PER_WORD-1),
8473 + __alignof__(struct dlm_direntry), 0, NULL, NULL);
8474 + if (!resdir_cache_small)
8475 + goto out_free_resl;
8477 + /* LVB cache also holds ranges, so should be 64bit aligned */
8478 + lvb_cache = kmem_cache_create("dlm_lvb/range", DLM_LVB_LEN,
8479 + __alignof__(uint64_t), 0, NULL, NULL);
8481 + goto out_free_ress;
8487 + kmem_cache_destroy(resdir_cache_small);
8490 + kmem_cache_destroy(resdir_cache_large);
8493 + kmem_cache_destroy(lkb_cache);
8496 + kmem_cache_destroy(rsb_cache_large);
8499 + kmem_cache_destroy(rsb_cache_small);
8505 +void dlm_memory_exit()
8507 + kmem_cache_destroy(rsb_cache_large);
8508 + kmem_cache_destroy(rsb_cache_small);
8509 + kmem_cache_destroy(lkb_cache);
8510 + kmem_cache_destroy(resdir_cache_small);
8511 + kmem_cache_destroy(resdir_cache_large);
8512 + kmem_cache_destroy(lvb_cache);
8515 +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
8517 + struct dlm_rsb *r;
8519 + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
8521 + if (namelen >= LARGE_RSB_NAME)
8522 + r = kmem_cache_alloc(rsb_cache_large, ls->ls_allocation);
8524 + r = kmem_cache_alloc(rsb_cache_small, ls->ls_allocation);
8527 + memset(r, 0, sizeof(struct dlm_rsb) + namelen);
8532 +void free_rsb(struct dlm_rsb *r)
8534 + int length = r->res_length;
8537 + memset(r, 0x55, sizeof(struct dlm_rsb) + r->res_length);
8540 + if (length >= LARGE_RSB_NAME)
8541 + kmem_cache_free(rsb_cache_large, r);
8543 + kmem_cache_free(rsb_cache_small, r);
8546 +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
8548 + struct dlm_lkb *l;
8550 + l = kmem_cache_alloc(lkb_cache, ls->ls_allocation);
8552 + memset(l, 0, sizeof(struct dlm_lkb));
8557 +void free_lkb(struct dlm_lkb *l)
8560 + memset(l, 0xAA, sizeof(struct dlm_lkb));
8562 + kmem_cache_free(lkb_cache, l);
8565 +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
8567 + struct dlm_direntry *rd;
8569 + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
8571 + if (namelen >= LARGE_RES_NAME)
8572 + rd = kmem_cache_alloc(resdir_cache_large, ls->ls_allocation);
8574 + rd = kmem_cache_alloc(resdir_cache_small, ls->ls_allocation);
8577 + memset(rd, 0, sizeof(struct dlm_direntry));
8582 +void free_direntry(struct dlm_direntry *de)
8584 + if (de->length >= LARGE_RES_NAME)
8585 + kmem_cache_free(resdir_cache_large, de);
8587 + kmem_cache_free(resdir_cache_small, de);
8590 +char *allocate_lvb(struct dlm_ls *ls)
8594 + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
8596 + memset(l, 0, DLM_LVB_LEN);
8601 +void free_lvb(char *l)
8603 + kmem_cache_free(lvb_cache, l);
8606 +/* Ranges are allocated from the LVB cache as they are the same size (4x64
8608 +uint64_t *allocate_range(struct dlm_ls * ls)
8612 + l = kmem_cache_alloc(lvb_cache, ls->ls_allocation);
8614 + memset(l, 0, DLM_LVB_LEN);
8619 +void free_range(uint64_t *l)
8621 + kmem_cache_free(lvb_cache, l);
8624 +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls)
8626 + struct dlm_rcom *rc;
8628 + rc = kmalloc(dlm_config.buffer_size, ls->ls_allocation);
8630 + memset(rc, 0, dlm_config.buffer_size);
8635 +void free_rcom_buffer(struct dlm_rcom *rc)
8639 diff -urN linux-orig/cluster/dlm/memory.h linux-patched/cluster/dlm/memory.h
8640 --- linux-orig/cluster/dlm/memory.h 1970-01-01 07:30:00.000000000 +0730
8641 +++ linux-patched/cluster/dlm/memory.h 2004-11-03 11:31:56.000000000 +0800
8643 +/******************************************************************************
8644 +*******************************************************************************
8646 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8647 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8649 +** This copyrighted material is made available to anyone wishing to use,
8650 +** modify, copy, or redistribute it subject to the terms and conditions
8651 +** of the GNU General Public License v.2.
8653 +*******************************************************************************
8654 +******************************************************************************/
8656 +#ifndef __MEMORY_DOT_H__
8657 +#define __MEMORY_DOT_H__
8659 +int dlm_memory_init(void);
8660 +void dlm_memory_exit(void);
8661 +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
8662 +void free_rsb(struct dlm_rsb *r);
8663 +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
8664 +void free_lkb(struct dlm_lkb *l);
8665 +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
8666 +void free_direntry(struct dlm_direntry *de);
8667 +char *allocate_lvb(struct dlm_ls *ls);
8668 +void free_lvb(char *l);
8669 +struct dlm_rcom *allocate_rcom_buffer(struct dlm_ls *ls);
8670 +void free_rcom_buffer(struct dlm_rcom *rc);
8671 +uint64_t *allocate_range(struct dlm_ls *ls);
8672 +void free_range(uint64_t *l);
8674 +#endif /* __MEMORY_DOT_H__ */
8675 diff -urN linux-orig/cluster/dlm/midcomms.c linux-patched/cluster/dlm/midcomms.c
8676 --- linux-orig/cluster/dlm/midcomms.c 1970-01-01 07:30:00.000000000 +0730
8677 +++ linux-patched/cluster/dlm/midcomms.c 2004-11-03 11:31:56.000000000 +0800
8679 +/******************************************************************************
8680 +*******************************************************************************
8682 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
8683 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
8685 +** This copyrighted material is made available to anyone wishing to use,
8686 +** modify, copy, or redistribute it subject to the terms and conditions
8687 +** of the GNU General Public License v.2.
8689 +*******************************************************************************
8690 +******************************************************************************/
8695 + * This is the appallingly named "mid-level" comms layer.
8697 + * Its purpose is to take packets from the "real" comms layer,
8698 + * split them up into packets and pass them to the interested
8699 + * part of the locking mechanism.
8701 + * It also takes messages from the locking layer, formats them
8702 + * into packets and sends them to the comms layer.
8704 + * It knows the format of the mid-level messages used and nodeids
8705 + * but it does not know how to resolve a nodeid into an IP address
8706 + * or any of the comms channel details
8710 +#include "dlm_internal.h"
8711 +#include "lowcomms.h"
8712 +#include "midcomms.h"
8713 +#include "lockqueue.h"
8715 +#include "reccomms.h"
8716 +#include "config.h"
8718 +/* Byteorder routines */
8720 +static void host_to_network(void *msg)
8722 + struct dlm_header *head = msg;
8723 + struct dlm_request *req = msg;
8724 + struct dlm_reply *rep = msg;
8725 + struct dlm_query_request *qreq = msg;
8726 + struct dlm_query_reply *qrep= msg;
8727 + struct dlm_rcom *rc = msg;
8729 + /* Force into network byte order */
8732 + * Do the common header first
8735 + head->rh_length = cpu_to_le16(head->rh_length);
8736 + head->rh_lockspace = cpu_to_le32(head->rh_lockspace);
8737 + /* Leave the lkid alone as it is transparent at the remote end */
8740 + * Do the fields in the remlockrequest or remlockreply structs
8743 + switch (req->rr_header.rh_cmd) {
8745 + case GDLM_REMCMD_LOCKREQUEST:
8746 + case GDLM_REMCMD_CONVREQUEST:
8747 + req->rr_range_start = cpu_to_le64(req->rr_range_start);
8748 + req->rr_range_end = cpu_to_le64(req->rr_range_end);
8749 + /* Deliberate fall through */
8750 + case GDLM_REMCMD_UNLOCKREQUEST:
8751 + case GDLM_REMCMD_LOOKUP:
8752 + case GDLM_REMCMD_LOCKGRANT:
8753 + case GDLM_REMCMD_SENDBAST:
8754 + case GDLM_REMCMD_SENDCAST:
8755 + case GDLM_REMCMD_REM_RESDATA:
8756 + req->rr_flags = cpu_to_le32(req->rr_flags);
8757 + req->rr_status = cpu_to_le32(req->rr_status);
8760 + case GDLM_REMCMD_LOCKREPLY:
8761 + rep->rl_lockstate = cpu_to_le32(rep->rl_lockstate);
8762 + rep->rl_nodeid = cpu_to_le32(rep->rl_nodeid);
8763 + rep->rl_status = cpu_to_le32(rep->rl_status);
8766 + case GDLM_REMCMD_RECOVERMESSAGE:
8767 + case GDLM_REMCMD_RECOVERREPLY:
8768 + rc->rc_msgid = cpu_to_le32(rc->rc_msgid);
8769 + rc->rc_datalen = cpu_to_le16(rc->rc_datalen);
8772 + case GDLM_REMCMD_QUERY:
8773 + qreq->rq_mstlkid = cpu_to_le32(qreq->rq_mstlkid);
8774 + qreq->rq_query = cpu_to_le32(qreq->rq_query);
8775 + qreq->rq_maxlocks = cpu_to_le32(qreq->rq_maxlocks);
8778 + case GDLM_REMCMD_QUERYREPLY:
8779 + qrep->rq_numlocks = cpu_to_le32(qrep->rq_numlocks);
8780 + qrep->rq_status = cpu_to_le32(qrep->rq_status);
8781 + qrep->rq_grantcount = cpu_to_le32(qrep->rq_grantcount);
8782 + qrep->rq_waitcount = cpu_to_le32(qrep->rq_waitcount);
8783 + qrep->rq_convcount = cpu_to_le32(qrep->rq_convcount);
8787 + printk("dlm: warning, unknown REMCMD type %u\n",
8788 + req->rr_header.rh_cmd);
8792 +static void network_to_host(void *msg)
8794 + struct dlm_header *head = msg;
8795 + struct dlm_request *req = msg;
8796 + struct dlm_reply *rep = msg;
8797 + struct dlm_query_request *qreq = msg;
8798 + struct dlm_query_reply *qrep = msg;
8799 + struct dlm_rcom *rc = msg;
8801 + /* Force into host byte order */
8804 + * Do the common header first
8807 + head->rh_length = le16_to_cpu(head->rh_length);
8808 + head->rh_lockspace = le32_to_cpu(head->rh_lockspace);
8809 + /* Leave the lkid alone as it is transparent at the remote end */
8812 + * Do the fields in the remlockrequest or remlockreply structs
8815 + switch (req->rr_header.rh_cmd) {
8817 + case GDLM_REMCMD_LOCKREQUEST:
8818 + case GDLM_REMCMD_CONVREQUEST:
8819 + req->rr_range_start = le64_to_cpu(req->rr_range_start);
8820 + req->rr_range_end = le64_to_cpu(req->rr_range_end);
8821 + case GDLM_REMCMD_LOOKUP:
8822 + case GDLM_REMCMD_UNLOCKREQUEST:
8823 + case GDLM_REMCMD_LOCKGRANT:
8824 + case GDLM_REMCMD_SENDBAST:
8825 + case GDLM_REMCMD_SENDCAST:
8826 + case GDLM_REMCMD_REM_RESDATA:
8827 + /* Actually, not much to do here as the remote lock IDs are
8828 + * transparent too */
8829 + req->rr_flags = le32_to_cpu(req->rr_flags);
8830 + req->rr_status = le32_to_cpu(req->rr_status);
8833 + case GDLM_REMCMD_LOCKREPLY:
8834 + rep->rl_lockstate = le32_to_cpu(rep->rl_lockstate);
8835 + rep->rl_nodeid = le32_to_cpu(rep->rl_nodeid);
8836 + rep->rl_status = le32_to_cpu(rep->rl_status);
8839 + case GDLM_REMCMD_RECOVERMESSAGE:
8840 + case GDLM_REMCMD_RECOVERREPLY:
8841 + rc->rc_msgid = le32_to_cpu(rc->rc_msgid);
8842 + rc->rc_datalen = le16_to_cpu(rc->rc_datalen);
8846 + case GDLM_REMCMD_QUERY:
8847 + qreq->rq_mstlkid = le32_to_cpu(qreq->rq_mstlkid);
8848 + qreq->rq_query = le32_to_cpu(qreq->rq_query);
8849 + qreq->rq_maxlocks = le32_to_cpu(qreq->rq_maxlocks);
8852 + case GDLM_REMCMD_QUERYREPLY:
8853 + qrep->rq_numlocks = le32_to_cpu(qrep->rq_numlocks);
8854 + qrep->rq_status = le32_to_cpu(qrep->rq_status);
8855 + qrep->rq_grantcount = le32_to_cpu(qrep->rq_grantcount);
8856 + qrep->rq_waitcount = le32_to_cpu(qrep->rq_waitcount);
8857 + qrep->rq_convcount = le32_to_cpu(qrep->rq_convcount);
8861 + printk("dlm: warning, unknown REMCMD type %u\n",
8862 + req->rr_header.rh_cmd);
8866 +static void copy_from_cb(void *dst, const void *base, unsigned offset,
8867 + unsigned len, unsigned limit)
8869 + unsigned copy = len;
8871 + if ((copy + offset) > limit)
8872 + copy = limit - offset;
8873 + memcpy(dst, base + offset, copy);
8876 + memcpy(dst + copy, base, len);
8879 +static void khexdump(const unsigned char *c, int len)
8881 + while (len > 16) {
8883 + "%02x %02x %02x %02x %02x %02x %02x %02x-%02x %02x %02x %02x %02x %02x %02x %02x\n",
8884 + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8],
8885 + c[9], c[10], c[11], c[12], c[13], c[14], c[15]);
8890 + printk(KERN_INFO "%02x %02x %02x %02x\n", c[0], c[1], c[2],
8896 + printk(KERN_INFO "%02x\n", c[0]);
8903 + * Called from the low-level comms layer to process a buffer of
8906 + * Only complete messages are processed here, any "spare" bytes from
8907 + * the end of a buffer are saved and tacked onto the front of the next
8908 + * message that comes in. I doubt this will happen very often but we
8909 + * need to be able to cope with it and I don't want the task to be waiting
8910 + * for packets to come in when there is useful work to be done.
8913 +int midcomms_process_incoming_buffer(int nodeid, const void *base,
8914 + unsigned offset, unsigned len,
8917 + unsigned char __tmp[sizeof(struct dlm_header) + 64];
8918 + struct dlm_header *msg = (struct dlm_header *) __tmp;
8924 + while (len > sizeof(struct dlm_header)) {
8925 + /* Get message header and check it over */
8926 + copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
8928 + msglen = le16_to_cpu(msg->rh_length);
8929 + id = msg->rh_lkid;
8930 + space = msg->rh_lockspace;
8932 + /* Check message size */
8934 + if (msglen < sizeof(struct dlm_header))
8937 + if (msglen > dlm_config.buffer_size) {
8938 + printk("dlm: message size from %d too big %d(pkt len=%d)\n", nodeid, msglen, len);
8939 + khexdump((const unsigned char *) msg, len);
8944 + /* Not enough in buffer yet? wait for some more */
8948 + /* Make sure our temp buffer is large enough */
8949 + if (msglen > sizeof(__tmp) &&
8950 + msg == (struct dlm_header *) __tmp) {
8951 + msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
8956 + copy_from_cb(msg, base, offset, msglen, limit);
8957 + BUG_ON(id != msg->rh_lkid);
8958 + BUG_ON(space != msg->rh_lockspace);
8961 + offset &= (limit - 1);
8963 + network_to_host(msg);
8965 + if ((msg->rh_cmd > 32) ||
8966 + (msg->rh_cmd == 0) ||
8967 + (msg->rh_length < sizeof(struct dlm_header)) ||
8968 + (msg->rh_length > dlm_config.buffer_size)) {
8970 + printk("dlm: midcomms: cmd=%u, flags=%u, length=%hu, "
8971 + "lkid=%u, lockspace=%u\n",
8972 + msg->rh_cmd, msg->rh_flags, msg->rh_length,
8973 + msg->rh_lkid, msg->rh_lockspace);
8975 + printk("dlm: midcomms: base=%p, offset=%u, len=%u, "
8976 + "ret=%u, limit=%08x newbuf=%d\n",
8977 + base, offset, len, ret, limit,
8978 + ((struct dlm_header *) __tmp == msg));
8980 + khexdump((const unsigned char *) msg, msg->rh_length);
8985 + switch (msg->rh_cmd) {
8986 + case GDLM_REMCMD_RECOVERMESSAGE:
8987 + case GDLM_REMCMD_RECOVERREPLY:
8988 + process_recovery_comm(nodeid, msg);
8991 + process_cluster_request(nodeid, msg, FALSE);
8995 + if (msg != (struct dlm_header *) __tmp)
8998 + return err ? err : ret;
9002 + * Send a lowcomms buffer
9005 +void midcomms_send_buffer(struct dlm_header *msg, struct writequeue_entry *e)
9007 + host_to_network(msg);
9008 + lowcomms_commit_buffer(e);
9012 + * Make the message into network byte order and send it
9015 +int midcomms_send_message(uint32_t nodeid, struct dlm_header *msg,
9018 + int len = msg->rh_length;
9020 + host_to_network(msg);
9023 + * Loopback. In fact, the locking code pretty much prevents this from
9024 + * being needed but it can happen when the directory node is also the
9028 + if (nodeid == our_nodeid())
9029 + return midcomms_process_incoming_buffer(nodeid, (char *) msg, 0,
9032 + return lowcomms_send_message(nodeid, (char *) msg, len, allocation);
9034 diff -urN linux-orig/cluster/dlm/midcomms.h linux-patched/cluster/dlm/midcomms.h
9035 --- linux-orig/cluster/dlm/midcomms.h 1970-01-01 07:30:00.000000000 +0730
9036 +++ linux-patched/cluster/dlm/midcomms.h 2004-11-03 11:31:56.000000000 +0800
9038 +/******************************************************************************
9039 +*******************************************************************************
9041 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9042 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9044 +** This copyrighted material is made available to anyone wishing to use,
9045 +** modify, copy, or redistribute it subject to the terms and conditions
9046 +** of the GNU General Public License v.2.
9048 +*******************************************************************************
9049 +******************************************************************************/
9051 +#ifndef __MIDCOMMS_DOT_H__
9052 +#define __MIDCOMMS_DOT_H__
9054 +int midcomms_send_message(uint32_t csid, struct dlm_header *msg,
9056 +int midcomms_process_incoming_buffer(int csid, const void *buf, unsigned offset,
9057 + unsigned len, unsigned limit);
9058 +void midcomms_send_buffer(struct dlm_header *msg,
9059 + struct writequeue_entry *e);
9061 +#endif /* __MIDCOMMS_DOT_H__ */
9062 diff -urN linux-orig/cluster/dlm/nodes.c linux-patched/cluster/dlm/nodes.c
9063 --- linux-orig/cluster/dlm/nodes.c 1970-01-01 07:30:00.000000000 +0730
9064 +++ linux-patched/cluster/dlm/nodes.c 2004-11-03 11:31:56.000000000 +0800
9066 +/******************************************************************************
9067 +*******************************************************************************
9069 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9070 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9072 +** This copyrighted material is made available to anyone wishing to use,
9073 +** modify, copy, or redistribute it subject to the terms and conditions
9074 +** of the GNU General Public License v.2.
9076 +*******************************************************************************
9077 +******************************************************************************/
9079 +#include <net/sock.h>
9080 +#include <cluster/cnxman.h>
9082 +#include "dlm_internal.h"
9083 +#include "lowcomms.h"
9085 +#include "recover.h"
9086 +#include "reccomms.h"
9089 +static struct list_head cluster_nodes;
9090 +static spinlock_t node_lock;
9093 +void dlm_nodes_init(void)
9095 + INIT_LIST_HEAD(&cluster_nodes);
9096 + spin_lock_init(&node_lock);
9099 +static struct dlm_node *search_node(uint32_t nodeid)
9101 + struct dlm_node *node;
9103 + list_for_each_entry(node, &cluster_nodes, list) {
9104 + if (node->nodeid == nodeid)
9112 +static void put_node(struct dlm_node *node)
9114 + spin_lock(&node_lock);
9115 + if (atomic_dec_and_test(&node->refcount)) {
9116 + lowcomms_close(node->nodeid);
9117 + list_del(&node->list);
9118 + spin_unlock(&node_lock);
9122 + spin_unlock(&node_lock);
9125 +static int get_node(uint32_t nodeid, struct dlm_node **ndp)
9127 + struct dlm_node *node, *node2;
9128 + int error = -ENOMEM;
9130 + spin_lock(&node_lock);
9131 + node = search_node(nodeid);
9133 + atomic_inc(&node->refcount);
9134 + spin_unlock(&node_lock);
9139 + node = (struct dlm_node *) kmalloc(sizeof(struct dlm_node), GFP_KERNEL);
9143 + memset(node, 0, sizeof(struct dlm_node));
9144 + node->nodeid = nodeid;
9146 + spin_lock(&node_lock);
9147 + node2 = search_node(nodeid);
9149 + atomic_inc(&node2->refcount);
9150 + spin_unlock(&node_lock);
9156 + atomic_set(&node->refcount, 1);
9157 + list_add_tail(&node->list, &cluster_nodes);
9158 + spin_unlock(&node_lock);
9167 +int init_new_csb(uint32_t nodeid, struct dlm_csb **ret_csb)
9169 + struct dlm_csb *csb;
9170 + struct dlm_node *node;
9171 + int error = -ENOMEM;
9173 + csb = (struct dlm_csb *) kmalloc(sizeof(struct dlm_csb), GFP_KERNEL);
9177 + memset(csb, 0, sizeof(struct dlm_csb));
9179 + error = get_node(nodeid, &node);
9193 +void release_csb(struct dlm_csb *csb)
9195 + put_node(csb->node);
9199 +uint32_t our_nodeid(void)
9201 + return lowcomms_our_nodeid();
9204 +static void make_node_array(struct dlm_ls *ls)
9206 + struct dlm_csb *csb;
9210 + if (ls->ls_node_array) {
9211 + kfree(ls->ls_node_array);
9212 + ls->ls_node_array = NULL;
9215 + array = kmalloc(sizeof(uint32_t) * ls->ls_num_nodes, GFP_KERNEL);
9219 + list_for_each_entry(csb, &ls->ls_nodes, list)
9220 + array[i++] = csb->node->nodeid;
9222 + ls->ls_node_array = array;
9225 +int nodes_reconfig_wait(struct dlm_ls *ls)
9229 + if (ls->ls_low_nodeid == our_nodeid()) {
9230 + error = dlm_wait_status_all(ls, NODES_VALID);
9232 + set_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags);
9234 + /* Experimental: this delay should allow any final messages
9235 + * from the previous node to be received before beginning
9238 + if (ls->ls_num_nodes == 1) {
9239 + current->state = TASK_UNINTERRUPTIBLE;
9240 + schedule_timeout((2) * HZ);
9244 + error = dlm_wait_status_low(ls, NODES_ALL_VALID);
9249 +static void add_ordered_node(struct dlm_ls *ls, struct dlm_csb *new)
9251 + struct dlm_csb *csb = NULL;
9252 + struct list_head *tmp;
9253 + struct list_head *newlist = &new->list;
9254 + struct list_head *head = &ls->ls_nodes;
9256 + list_for_each(tmp, head) {
9257 + csb = list_entry(tmp, struct dlm_csb, list);
9259 + if (new->node->nodeid < csb->node->nodeid)
9264 + list_add_tail(newlist, head);
9266 + /* FIXME: can use list macro here */
9267 + newlist->prev = tmp->prev;
9268 + newlist->next = tmp;
9269 + tmp->prev->next = newlist;
9270 + tmp->prev = newlist;
9274 +int ls_nodes_reconfig(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
9276 + struct dlm_csb *csb, *safe;
9277 + int error, i, found, pos = 0, neg = 0;
9278 + uint32_t low = (uint32_t) (-1);
9281 + * Remove (and save) departed nodes from lockspace's nodes list
9284 + list_for_each_entry_safe(csb, safe, &ls->ls_nodes, list) {
9286 + for (i = 0; i < rv->node_count; i++) {
9287 + if (csb->node->nodeid == rv->nodeids[i]) {
9295 + csb->gone_event = rv->event_id;
9296 + list_del(&csb->list);
9297 + list_add_tail(&csb->list, &ls->ls_nodes_gone);
9298 + ls->ls_num_nodes--;
9299 + log_all(ls, "remove node %u", csb->node->nodeid);
9304 + * Add new nodes to lockspace's nodes list
9307 + for (i = 0; i < rv->node_count; i++) {
9309 + list_for_each_entry(csb, &ls->ls_nodes, list) {
9310 + if (csb->node->nodeid == rv->nodeids[i]) {
9319 + error = init_new_csb(rv->nodeids[i], &csb);
9320 + DLM_ASSERT(!error,);
9322 + add_ordered_node(ls, csb);
9323 + ls->ls_num_nodes++;
9324 + log_all(ls, "add node %u", csb->node->nodeid);
9328 + list_for_each_entry(csb, &ls->ls_nodes, list) {
9329 + if (csb->node->nodeid < low)
9330 + low = csb->node->nodeid;
9333 + ls->ls_low_nodeid = low;
9334 + set_bit(LSFL_NODES_VALID, &ls->ls_flags);
9336 + make_node_array(ls);
9338 + error = nodes_reconfig_wait(ls);
9340 + log_all(ls, "total nodes %d", ls->ls_num_nodes);
9345 +static void nodes_clear(struct list_head *head)
9347 + struct dlm_csb *csb;
9349 + while (!list_empty(head)) {
9350 + csb = list_entry(head->next, struct dlm_csb, list);
9351 + list_del(&csb->list);
9356 +void ls_nodes_clear(struct dlm_ls *ls)
9358 + nodes_clear(&ls->ls_nodes);
9359 + ls->ls_num_nodes = 0;
9362 +void ls_nodes_gone_clear(struct dlm_ls *ls)
9364 + nodes_clear(&ls->ls_nodes_gone);
9367 +int ls_nodes_init(struct dlm_ls *ls, struct dlm_recover *rv)
9369 + struct dlm_csb *csb;
9371 + uint32_t low = (uint32_t) (-1);
9373 + /* nodes may be left from a previous failed start */
9374 + ls_nodes_clear(ls);
9376 + log_all(ls, "add nodes");
9378 + for (i = 0; i < rv->node_count; i++) {
9379 + error = init_new_csb(rv->nodeids[i], &csb);
9383 + add_ordered_node(ls, csb);
9384 + ls->ls_num_nodes++;
9386 + if (csb->node->nodeid < low)
9387 + low = csb->node->nodeid;
9390 + ls->ls_low_nodeid = low;
9391 + set_bit(LSFL_NODES_VALID, &ls->ls_flags);
9392 + make_node_array(ls);
9394 + error = nodes_reconfig_wait(ls);
9396 + log_all(ls, "total nodes %d", ls->ls_num_nodes);
9399 + ls_nodes_clear(ls);
9403 +int in_nodes_gone(struct dlm_ls *ls, uint32_t nodeid)
9405 + struct dlm_csb *csb;
9407 + list_for_each_entry(csb, &ls->ls_nodes_gone, list) {
9408 + if (csb->node->nodeid == nodeid)
9413 diff -urN linux-orig/cluster/dlm/nodes.h linux-patched/cluster/dlm/nodes.h
9414 --- linux-orig/cluster/dlm/nodes.h 1970-01-01 07:30:00.000000000 +0730
9415 +++ linux-patched/cluster/dlm/nodes.h 2004-11-03 11:31:56.000000000 +0800
9417 +/******************************************************************************
9418 +*******************************************************************************
9420 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9421 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9423 +** This copyrighted material is made available to anyone wishing to use,
9424 +** modify, copy, or redistribute it subject to the terms and conditions
9425 +** of the GNU General Public License v.2.
9427 +*******************************************************************************
9428 +******************************************************************************/
9430 +#ifndef __NODES_DOT_H__
9431 +#define __NODES_DOT_H__
9433 +void dlm_nodes_init(void);
9434 +int init_new_csb(uint32_t nodeid, struct dlm_csb ** ret_csb);
9435 +void release_csb(struct dlm_csb * csb);
9436 +uint32_t our_nodeid(void);
9437 +int ls_nodes_reconfig(struct dlm_ls * ls, struct dlm_recover * gr, int *neg);
9438 +int ls_nodes_init(struct dlm_ls * ls, struct dlm_recover * gr);
9439 +int in_nodes_gone(struct dlm_ls * ls, uint32_t nodeid);
9440 +void ls_nodes_clear(struct dlm_ls *ls);
9441 +void ls_nodes_gone_clear(struct dlm_ls *ls);
9443 +#endif /* __NODES_DOT_H__ */
9444 diff -urN linux-orig/cluster/dlm/proc.c linux-patched/cluster/dlm/proc.c
9445 --- linux-orig/cluster/dlm/proc.c 1970-01-01 07:30:00.000000000 +0730
9446 +++ linux-patched/cluster/dlm/proc.c 2004-11-03 11:31:56.000000000 +0800
9448 +/******************************************************************************
9449 +*******************************************************************************
9451 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
9452 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
9454 +** This copyrighted material is made available to anyone wishing to use,
9455 +** modify, copy, or redistribute it subject to the terms and conditions
9456 +** of the GNU General Public License v.2.
9458 +*******************************************************************************
9459 +******************************************************************************/
9461 +#include <linux/init.h>
9462 +#include <linux/proc_fs.h>
9463 +#include <linux/ctype.h>
9464 +#include <linux/seq_file.h>
9465 +#include <linux/module.h>
9467 +#include "dlm_internal.h"
9468 +#include "lockspace.h"
9470 +#if defined(DLM_DEBUG)
9471 +#define DLM_DEBUG_SIZE (1024)
9472 +#define MAX_DEBUG_MSG_LEN (64)
9474 +#define DLM_DEBUG_SIZE (0)
9475 +#define MAX_DEBUG_MSG_LEN (0)
9478 +static char * debug_buf;
9479 +static unsigned int debug_size;
9480 +static unsigned int debug_point;
9481 +static int debug_wrap;
9482 +static spinlock_t debug_lock;
9483 +static struct proc_dir_entry * debug_proc_entry = NULL;
9484 +static char proc_ls_name[255] = "";
9486 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
9487 +static struct proc_dir_entry * locks_proc_entry = NULL;
9488 +static struct seq_operations locks_info_op;
9489 +static struct proc_dir_entry * dir_proc_entry = NULL;
9490 +static struct seq_operations dir_info_op;
9494 + * /proc/cluster/dlm_locks - dump resources and locks
9497 +static int locks_open(struct inode *inode, struct file *file)
9499 + return seq_open(file, &locks_info_op);
9502 +/* Write simply sets the lockspace to use */
9503 +static ssize_t locks_write(struct file *file, const char *buf,
9504 + size_t count, loff_t * ppos)
9506 + if (count < sizeof(proc_ls_name)) {
9507 + copy_from_user(proc_ls_name, buf, count);
9508 + proc_ls_name[count] = '\0';
9510 + /* Remove any trailing LF so that lazy users
9511 + can just echo "lsname" > /proc/cluster/dlm_locks */
9512 + if (proc_ls_name[count - 1] == '\n')
9513 + proc_ls_name[count - 1] = '\0';
9520 +static struct file_operations locks_fops = {
9522 + write:locks_write,
9525 + release:seq_release,
9528 +struct ls_dumpinfo {
9530 + struct list_head *next;
9531 + struct dlm_ls *ls;
9532 + struct dlm_rsb *rsb;
9533 + struct dlm_direntry *de;
9536 +static int print_resource(struct dlm_rsb * res, struct seq_file *s);
9538 +static struct ls_dumpinfo *next_rsb(struct ls_dumpinfo *di)
9543 + /* Find the next non-empty hash bucket */
9544 + for (i = di->entry; i < di->ls->ls_rsbtbl_size; i++) {
9545 + read_lock(&di->ls->ls_rsbtbl[i].lock);
9546 + if (!list_empty(&di->ls->ls_rsbtbl[i].list)) {
9547 + di->next = di->ls->ls_rsbtbl[i].list.next;
9548 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
9551 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
9555 + if (di->entry >= di->ls->ls_rsbtbl_size)
9556 + return NULL; /* End of hash list */
9557 + } else { /* Find the next entry in the list */
9559 + read_lock(&di->ls->ls_rsbtbl[i].lock);
9560 + di->next = di->next->next;
9561 + if (di->next->next == di->ls->ls_rsbtbl[i].list.next) {
9562 + /* End of list - move to next bucket */
9565 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
9566 + return next_rsb(di); /* do the top half of this conditional */
9568 + read_unlock(&di->ls->ls_rsbtbl[i].lock);
9570 + di->rsb = list_entry(di->next, struct dlm_rsb, res_hashchain);
9575 +static void *s_start(struct seq_file *m, loff_t *pos)
9577 + struct ls_dumpinfo *di;
9578 + struct dlm_ls *ls;
9581 + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9585 + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
9590 + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
9597 + for (i = 0; i < *pos; i++)
9598 + if (next_rsb(di) == NULL)
9601 + return next_rsb(di);
9604 +static void *s_next(struct seq_file *m, void *p, loff_t *pos)
9606 + struct ls_dumpinfo *di = p;
9610 + return next_rsb(di);
9613 +static int s_show(struct seq_file *m, void *p)
9615 + struct ls_dumpinfo *di = p;
9616 + return print_resource(di->rsb, m);
9619 +static void s_stop(struct seq_file *m, void *p)
9624 +static struct seq_operations locks_info_op = {
9631 +static char *print_lockmode(int mode)
9653 +static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
9654 + struct dlm_rsb *res)
9657 + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
9659 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9660 + || lkb->lkb_status == GDLM_LKSTS_WAITING)
9661 + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
9663 + if (lkb->lkb_range) {
9664 + /* This warns on Alpha. Tough. Only I see it */
9665 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9666 + || lkb->lkb_status == GDLM_LKSTS_GRANTED)
9667 + seq_printf(s, " %" PRIx64 "-%" PRIx64,
9668 + lkb->lkb_range[GR_RANGE_START],
9669 + lkb->lkb_range[GR_RANGE_END]);
9670 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
9671 + || lkb->lkb_status == GDLM_LKSTS_WAITING)
9672 + seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
9673 + lkb->lkb_range[RQ_RANGE_START],
9674 + lkb->lkb_range[RQ_RANGE_END]);
9677 + if (lkb->lkb_nodeid) {
9678 + if (lkb->lkb_nodeid != res->res_nodeid)
9679 + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
9682 + seq_printf(s, " Master: %08x", lkb->lkb_remid);
9685 + if (lkb->lkb_status != GDLM_LKSTS_GRANTED)
9686 + seq_printf(s, " LQ: %d,0x%x", lkb->lkb_lockqueue_state,
9687 + lkb->lkb_lockqueue_flags);
9689 + seq_printf(s, "\n");
9692 +static int print_resource(struct dlm_rsb *res, struct seq_file *s)
9695 + struct list_head *locklist;
9697 + seq_printf(s, "\nResource %p (parent %p). Name (len=%d) \"", res,
9698 + res->res_parent, res->res_length);
9699 + for (i = 0; i < res->res_length; i++) {
9700 + if (isprint(res->res_name[i]))
9701 + seq_printf(s, "%c", res->res_name[i]);
9703 + seq_printf(s, "%c", '.');
9705 + if (res->res_nodeid)
9706 + seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
9709 + seq_printf(s, "\" \nMaster Copy\n");
9711 + /* Print the LVB: */
9712 + if (res->res_lvbptr) {
9713 + seq_printf(s, "LVB: ");
9714 + for (i = 0; i < DLM_LVB_LEN; i++) {
9715 + if (i == DLM_LVB_LEN / 2)
9716 + seq_printf(s, "\n ");
9717 + seq_printf(s, "%02x ",
9718 + (unsigned char) res->res_lvbptr[i]);
9720 + seq_printf(s, "\n");
9723 + /* Print the locks attached to this resource */
9724 + seq_printf(s, "Granted Queue\n");
9725 + list_for_each(locklist, &res->res_grantqueue) {
9726 + struct dlm_lkb *this_lkb =
9727 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9728 + print_lock(s, this_lkb, res);
9731 + seq_printf(s, "Conversion Queue\n");
9732 + list_for_each(locklist, &res->res_convertqueue) {
9733 + struct dlm_lkb *this_lkb =
9734 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9735 + print_lock(s, this_lkb, res);
9738 + seq_printf(s, "Waiting Queue\n");
9739 + list_for_each(locklist, &res->res_waitqueue) {
9740 + struct dlm_lkb *this_lkb =
9741 + list_entry(locklist, struct dlm_lkb, lkb_statequeue);
9742 + print_lock(s, this_lkb, res);
9750 + * /proc/cluster/dlm_dir - dump resource directory
9753 +static int print_de(struct dlm_direntry *de, struct seq_file *s)
9755 + char strname[DLM_RESNAME_MAXLEN+1];
9757 + memset(strname, 0, DLM_RESNAME_MAXLEN+1);
9758 + memcpy(strname, de->name, de->length);
9760 + seq_printf(s, "%s %u\n", strname, de->master_nodeid);
9764 +static int dir_open(struct inode *inode, struct file *file)
9766 + return seq_open(file, &dir_info_op);
9769 +static ssize_t dir_write(struct file *file, const char *buf,
9770 + size_t count, loff_t *ppos)
9772 + return locks_write(file, buf, count, ppos);
9775 +static struct file_operations dir_fops = {
9777 + .write = dir_write,
9779 + .llseek = seq_lseek,
9780 + .release = seq_release,
9781 + .owner = THIS_MODULE,
9784 +static struct ls_dumpinfo *next_de(struct ls_dumpinfo *di)
9789 + /* Find the next non-empty hash bucket */
9790 + for (i = di->entry; i < di->ls->ls_dirtbl_size; i++) {
9791 + read_lock(&di->ls->ls_dirtbl[i].lock);
9792 + if (!list_empty(&di->ls->ls_dirtbl[i].list)) {
9793 + di->next = di->ls->ls_dirtbl[i].list.next;
9794 + read_unlock(&di->ls->ls_dirtbl[i].lock);
9797 + read_unlock(&di->ls->ls_dirtbl[i].lock);
9801 + if (di->entry >= di->ls->ls_dirtbl_size)
9802 + return NULL; /* End of hash list */
9803 + } else { /* Find the next entry in the list */
9805 + read_lock(&di->ls->ls_dirtbl[i].lock);
9806 + di->next = di->next->next;
9807 + if (di->next->next == di->ls->ls_dirtbl[i].list.next) {
9808 + /* End of list - move to next bucket */
9811 + read_unlock(&di->ls->ls_dirtbl[i].lock);
9812 + return next_de(di); /* do the top half of this conditional */
9814 + read_unlock(&di->ls->ls_dirtbl[i].lock);
9816 + di->de = list_entry(di->next, struct dlm_direntry, list);
9821 +static void *dir_start(struct seq_file *m, loff_t *pos)
9823 + struct ls_dumpinfo *di;
9824 + struct dlm_ls *ls;
9827 + ls = find_lockspace_by_name(proc_ls_name, strlen(proc_ls_name));
9831 + di = kmalloc(sizeof(struct ls_dumpinfo), GFP_KERNEL);
9836 + seq_printf(m, "DLM lockspace '%s'\n", proc_ls_name);
9842 + for (i = 0; i < *pos; i++)
9843 + if (next_de(di) == NULL)
9846 + return next_de(di);
9849 +static void *dir_next(struct seq_file *m, void *p, loff_t *pos)
9851 + struct ls_dumpinfo *di = p;
9855 + return next_de(di);
9858 +static int dir_show(struct seq_file *m, void *p)
9860 + struct ls_dumpinfo *di = p;
9861 + return print_de(di->de, m);
9864 +static void dir_stop(struct seq_file *m, void *p)
9869 +static struct seq_operations dir_info_op = {
9870 + .start = dir_start,
9875 +#endif /* CONFIG_CLUSTER_DLM_PROCLOCKS */
9877 +void dlm_debug_log(struct dlm_ls *ls, const char *fmt, ...)
9880 + int i, n, size, len;
9881 + char buf[MAX_DEBUG_MSG_LEN+1];
9883 + spin_lock(&debug_lock);
9888 + size = MAX_DEBUG_MSG_LEN;
9889 + memset(buf, 0, size+1);
9891 + n = snprintf(buf, size, "%s ", ls->ls_name);
9894 + va_start(va, fmt);
9895 + vsnprintf(buf+n, size, fmt, va);
9898 + len = strlen(buf);
9899 + if (len > MAX_DEBUG_MSG_LEN-1)
9900 + len = MAX_DEBUG_MSG_LEN-1;
9902 + buf[len+1] = '\0';
9904 + for (i = 0; i < strlen(buf); i++) {
9905 + debug_buf[debug_point++] = buf[i];
9907 + if (debug_point == debug_size) {
9913 + spin_unlock(&debug_lock);
9916 +void dlm_debug_dump(void)
9920 + spin_lock(&debug_lock);
9922 + for (i = debug_point; i < debug_size; i++)
9923 + printk("%c", debug_buf[i]);
9925 + for (i = 0; i < debug_point; i++)
9926 + printk("%c", debug_buf[i]);
9927 + spin_unlock(&debug_lock);
9930 +void dlm_debug_setup(int size)
9934 + if (size > PAGE_SIZE)
9937 + b = kmalloc(size, GFP_KERNEL);
9939 + spin_lock(&debug_lock);
9944 + debug_size = size;
9948 + memset(debug_buf, 0, debug_size);
9950 + spin_unlock(&debug_lock);
9953 +static void dlm_debug_init(void)
9959 + spin_lock_init(&debug_lock);
9961 + dlm_debug_setup(DLM_DEBUG_SIZE);
9964 +#ifdef CONFIG_PROC_FS
9965 +int dlm_debug_info(char *b, char **start, off_t offset, int length)
9969 + spin_lock(&debug_lock);
9972 + for (i = debug_point; i < debug_size; i++)
9973 + n += sprintf(b + n, "%c", debug_buf[i]);
9975 + for (i = 0; i < debug_point; i++)
9976 + n += sprintf(b + n, "%c", debug_buf[i]);
9978 + spin_unlock(&debug_lock);
9984 +#ifdef CONFIG_DLM_STATS
9985 +struct dlm_statinfo dlm_stats;
9986 +static struct proc_dir_entry *stats_proc_entry = NULL;
9987 +static int dlm_stats_info(char *b, char **start, off_t offset, int length)
9991 + long lq_locks = 0;
9992 + unsigned long lq_time = 0;
9994 + n += sprintf(b+n, "DLM stats (HZ=%d)\n\n", HZ);
9995 + n += sprintf(b+n, "Lock operations: %7d\n", dlm_stats.lockops);
9996 + n += sprintf(b+n, "Unlock operations: %7d\n", dlm_stats.unlockops);
9997 + n += sprintf(b+n, "Convert operations: %7d\n", dlm_stats.convertops);
9998 + n += sprintf(b+n, "Completion ASTs: %7d\n", dlm_stats.cast);
9999 + n += sprintf(b+n, "Blocking ASTs: %7d\n", dlm_stats.bast);
10000 + n += sprintf(b+n, "\n");
10001 + n += sprintf(b+n, "Lockqueue num waittime ave\n");
10002 + for (i=1; i<=4 ; i++) {
10003 + char *lq_reason="???";
10005 + case 1: lq_reason = "WAIT_RSB ";
10007 + case 2: lq_reason = "WAIT_CONV ";
10009 + case 3: lq_reason = "WAIT_GRANT ";
10011 + case 4: lq_reason = "WAIT_UNLOCK";
10014 + if (dlm_stats.lockqueue_locks[i])
10015 + n += sprintf(b+n, "%s %6lu %7lu %3lu\n",
10017 + dlm_stats.lockqueue_locks[i],
10018 + dlm_stats.lockqueue_time[i],
10019 + dlm_stats.lockqueue_time[i]/
10020 + dlm_stats.lockqueue_locks[i]);
10022 + lq_locks += dlm_stats.lockqueue_locks[i];
10023 + lq_time += dlm_stats.lockqueue_time[i];
10026 + n += sprintf(b+n, "Total %6lu %7lu %3lu\n",
10027 + lq_locks, lq_time, lq_time/lq_locks);
10031 +static int dlm_stats_clear(struct file *file, const char __user *buffer,
10032 + unsigned long count, void *data)
10034 + memset(&dlm_stats, 0, sizeof(dlm_stats));
10037 +#endif /* CONFIG_DLM_STATS */
10039 +void dlm_proc_init(void)
10041 +#ifdef CONFIG_PROC_FS
10042 + debug_proc_entry = create_proc_entry("cluster/dlm_debug", S_IRUGO,
10044 + if (!debug_proc_entry)
10047 + debug_proc_entry->get_info = &dlm_debug_info;
10050 +#ifdef CONFIG_DLM_STATS
10051 + stats_proc_entry = create_proc_entry("cluster/dlm_stats",
10052 + S_IRUSR | S_IWUSR, NULL);
10053 + if (!stats_proc_entry)
10056 + stats_proc_entry->get_info = &dlm_stats_info;
10057 + stats_proc_entry->write_proc = &dlm_stats_clear;
10060 + dlm_debug_init();
10062 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
10063 + locks_proc_entry = create_proc_read_entry("cluster/dlm_locks",
10065 + NULL, NULL, NULL);
10066 + if (!locks_proc_entry)
10068 + locks_proc_entry->proc_fops = &locks_fops;
10070 + dir_proc_entry = create_proc_read_entry("cluster/dlm_dir",
10072 + NULL, NULL, NULL);
10073 + if (!dir_proc_entry)
10075 + dir_proc_entry->proc_fops = &dir_fops;
10079 +void dlm_proc_exit(void)
10081 +#ifdef CONFIG_PROC_FS
10082 + if (debug_proc_entry) {
10083 + remove_proc_entry("cluster/dlm_debug", NULL);
10084 + dlm_debug_setup(0);
10088 +#ifdef CONFIG_DLM_STATS
10089 + if (stats_proc_entry)
10090 + remove_proc_entry("cluster/dlm_stats", NULL);
10093 +#ifdef CONFIG_CLUSTER_DLM_PROCLOCKS
10094 + if (locks_proc_entry)
10095 + remove_proc_entry("cluster/dlm_locks", NULL);
10096 + if (dir_proc_entry)
10097 + remove_proc_entry("cluster/dlm_dir", NULL);
10100 diff -urN linux-orig/cluster/dlm/queries.c linux-patched/cluster/dlm/queries.c
10101 --- linux-orig/cluster/dlm/queries.c 1970-01-01 07:30:00.000000000 +0730
10102 +++ linux-patched/cluster/dlm/queries.c 2004-11-03 11:31:56.000000000 +0800
10104 +/******************************************************************************
10105 +*******************************************************************************
10107 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10108 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10110 +** This copyrighted material is made available to anyone wishing to use,
10111 +** modify, copy, or redistribute it subject to the terms and conditions
10112 +** of the GNU General Public License v.2.
10114 +*******************************************************************************
10115 +******************************************************************************/
10120 + * This file provides the kernel query interface to the DLM.
10124 +#define EXPORT_SYMTAB
10125 +#include <linux/module.h>
10127 +#include "dlm_internal.h"
10128 +#include "lockspace.h"
10129 +#include "lockqueue.h"
10130 +#include "locking.h"
10132 +#include "nodes.h"
10135 +#include "memory.h"
10136 +#include "lowcomms.h"
10137 +#include "midcomms.h"
10140 +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo);
10141 +static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo);
10144 + * API entry point.
10146 +int dlm_query(void *lockspace,
10147 + struct dlm_lksb *lksb,
10149 + struct dlm_queryinfo *qinfo,
10150 + void (ast_routine(void *)),
10153 + int status = -EINVAL;
10154 + struct dlm_lkb *target_lkb;
10155 + struct dlm_lkb *query_lkb = NULL; /* Our temporary LKB */
10156 + struct dlm_ls *ls = find_lockspace_by_local_id(lockspace);
10162 + if (!ast_routine)
10167 + if (!qinfo->gqi_lockinfo)
10168 + qinfo->gqi_locksize = 0;
10170 + /* Find the lkid */
10171 + target_lkb = find_lock_by_id(ls, lksb->sb_lkid);
10175 + /* If the user wants a list of locks that are blocking or
10176 + not blocking this lock, then it must be waiting
10179 + if (((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING ||
10180 + (query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK) &&
10181 + target_lkb->lkb_status == GDLM_LKSTS_GRANTED)
10184 + /* We now allocate an LKB for our own use (so we can hang
10185 + * things like the AST routine and the lksb from it) */
10186 + lksb->sb_status = -EBUSY;
10187 + query_lkb = create_lkb(ls);
10188 + if (!query_lkb) {
10189 + status = -ENOMEM;
10192 + query_lkb->lkb_astaddr = ast_routine;
10193 + query_lkb->lkb_astparam = (long)astarg;
10194 + query_lkb->lkb_resource = target_lkb->lkb_resource;
10195 + query_lkb->lkb_lksb = lksb;
10197 + /* Don't free the resource while we are querying it. This ref
10198 + * will be dropped when the LKB is freed */
10199 + hold_rsb(query_lkb->lkb_resource);
10201 + /* Fill in the stuff that's always local */
10202 + if (qinfo->gqi_resinfo) {
10203 + if (target_lkb->lkb_resource->res_nodeid)
10204 + qinfo->gqi_resinfo->rsi_masternode =
10205 + target_lkb->lkb_resource->res_nodeid;
10207 + qinfo->gqi_resinfo->rsi_masternode = our_nodeid();
10208 + qinfo->gqi_resinfo->rsi_length =
10209 + target_lkb->lkb_resource->res_length;
10210 + memcpy(qinfo->gqi_resinfo->rsi_name,
10211 + target_lkb->lkb_resource->res_name,
10212 + qinfo->gqi_resinfo->rsi_length);
10215 + /* If the master is local (or the user doesn't want the overhead of a
10216 + * remote call) - fill in the details here */
10217 + if (target_lkb->lkb_resource->res_nodeid == 0 ||
10218 + (query & DLM_QUERY_LOCAL)) {
10221 + /* Resource info */
10222 + if (qinfo->gqi_resinfo) {
10223 + query_resource(target_lkb->lkb_resource,
10224 + qinfo->gqi_resinfo);
10228 + if (qinfo->gqi_lockinfo) {
10229 + status = query_locks(query, target_lkb, qinfo);
10232 + query_lkb->lkb_retstatus = status;
10233 + queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
10236 + /* An AST will be delivered so we must return success here */
10241 + /* Remote master */
10242 + if (target_lkb->lkb_resource->res_nodeid != 0)
10244 + struct dlm_query_request *remquery;
10245 + struct writequeue_entry *e;
10247 + /* Clear this cos the receiving end adds to it with
10248 + each incoming packet */
10249 + qinfo->gqi_lockcount = 0;
10251 + /* Squirrel a pointer to the query info struct
10252 + somewhere illegal */
10253 + query_lkb->lkb_request = (struct dlm_request *) qinfo;
10255 + e = lowcomms_get_buffer(query_lkb->lkb_resource->res_nodeid,
10256 + sizeof(struct dlm_query_request),
10257 + ls->ls_allocation,
10258 + (char **) &remquery);
10260 + status = -ENOBUFS;
10264 + /* Build remote packet */
10265 + memset(remquery, 0, sizeof(struct dlm_query_request));
10267 + remquery->rq_maxlocks = qinfo->gqi_locksize;
10268 + remquery->rq_query = query;
10269 + remquery->rq_mstlkid = target_lkb->lkb_remid;
10270 + if (qinfo->gqi_lockinfo)
10271 + remquery->rq_maxlocks = qinfo->gqi_locksize;
10273 + remquery->rq_header.rh_cmd = GDLM_REMCMD_QUERY;
10274 + remquery->rq_header.rh_flags = 0;
10275 + remquery->rq_header.rh_length = sizeof(struct dlm_query_request);
10276 + remquery->rq_header.rh_lkid = query_lkb->lkb_id;
10277 + remquery->rq_header.rh_lockspace = ls->ls_global_id;
10279 + midcomms_send_buffer(&remquery->rq_header, e);
10284 + put_lockspace(ls);
10288 +static inline int valid_range(struct dlm_range *r)
10290 + if (r->ra_start != 0ULL ||
10291 + r->ra_end != 0xFFFFFFFFFFFFFFFFULL)
10297 +static void put_int(int x, char *buf, int *offp)
10299 + x = cpu_to_le32(x);
10300 + memcpy(buf + *offp, &x, sizeof(int));
10301 + *offp += sizeof(int);
10304 +static void put_int64(uint64_t x, char *buf, int *offp)
10306 + x = cpu_to_le64(x);
10307 + memcpy(buf + *offp, &x, sizeof(uint64_t));
10308 + *offp += sizeof(uint64_t);
10311 +static int get_int(char *buf, int *offp)
10314 + memcpy(&value, buf + *offp, sizeof(int));
10315 + *offp += sizeof(int);
10316 + return le32_to_cpu(value);
10319 +static uint64_t get_int64(char *buf, int *offp)
10323 + memcpy(&value, buf + *offp, sizeof(uint64_t));
10324 + *offp += sizeof(uint64_t);
10325 + return le64_to_cpu(value);
10328 +#define LOCK_LEN (sizeof(int)*4 + sizeof(uint8_t)*4)
10330 +/* Called from recvd to get lock info for a remote node */
10331 +int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg)
10333 + struct dlm_query_request *query = (struct dlm_query_request *) msg;
10334 + struct dlm_query_reply *reply;
10335 + struct dlm_resinfo resinfo;
10336 + struct dlm_queryinfo qinfo;
10337 + struct writequeue_entry *e;
10339 + struct dlm_lkb *lkb;
10342 + int finished = 0;
10343 + int cur_lock = 0;
10344 + int start_lock = 0;
10346 + lkb = find_lock_by_id(ls, query->rq_mstlkid);
10348 + status = -EINVAL;
10352 + qinfo.gqi_resinfo = &resinfo;
10353 + qinfo.gqi_locksize = query->rq_maxlocks;
10355 + /* Get the resource bits */
10356 + query_resource(lkb->lkb_resource, &resinfo);
10358 + /* Now get the locks if wanted */
10359 + if (query->rq_maxlocks) {
10360 + qinfo.gqi_lockinfo = kmalloc(sizeof(struct dlm_lockinfo) * query->rq_maxlocks,
10362 + if (!qinfo.gqi_lockinfo) {
10363 + status = -ENOMEM;
10367 + status = query_locks(query->rq_query, lkb, &qinfo);
10368 + if (status && status != -E2BIG) {
10369 + kfree(qinfo.gqi_lockinfo);
10374 + qinfo.gqi_lockinfo = NULL;
10375 + qinfo.gqi_lockcount = 0;
10378 + /* Send as many blocks as needed for all the locks */
10381 + int msg_len = sizeof(struct dlm_query_reply);
10382 + int last_msg_len = msg_len; /* keeps compiler quiet */
10385 + /* First work out how many locks we can fit into a block */
10386 + for (i=cur_lock; i < qinfo.gqi_lockcount && msg_len < PAGE_SIZE; i++) {
10388 + last_msg_len = msg_len;
10390 + msg_len += LOCK_LEN;
10391 + if (valid_range(&qinfo.gqi_lockinfo[i].lki_grrange) ||
10392 + valid_range(&qinfo.gqi_lockinfo[i].lki_rqrange)) {
10394 + msg_len += sizeof(uint64_t) * 4;
10398 + /* There must be a neater way of doing this... */
10399 + if (msg_len > PAGE_SIZE) {
10401 + msg_len = last_msg_len;
10407 + e = lowcomms_get_buffer(nodeid,
10409 + ls->ls_allocation,
10410 + (char **) &reply);
10412 + kfree(qinfo.gqi_lockinfo);
10413 + status = -ENOBUFS;
10417 + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
10418 + reply->rq_header.rh_length = msg_len;
10419 + reply->rq_header.rh_lkid = msg->rh_lkid;
10420 + reply->rq_header.rh_lockspace = msg->rh_lockspace;
10422 + reply->rq_status = status;
10423 + reply->rq_startlock = cur_lock;
10424 + reply->rq_grantcount = qinfo.gqi_resinfo->rsi_grantcount;
10425 + reply->rq_convcount = qinfo.gqi_resinfo->rsi_convcount;
10426 + reply->rq_waitcount = qinfo.gqi_resinfo->rsi_waitcount;
10427 + memcpy(reply->rq_valblk, qinfo.gqi_resinfo->rsi_valblk, DLM_LVB_LEN);
10429 + buf = (char *)reply;
10430 + bufidx = sizeof(struct dlm_query_reply);
10432 + for (; cur_lock < last_lock; cur_lock++) {
10434 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_state;
10435 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_grmode;
10436 + buf[bufidx++] = qinfo.gqi_lockinfo[cur_lock].lki_rqmode;
10437 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_lkid, buf, &bufidx);
10438 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_mstlkid, buf, &bufidx);
10439 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_parent, buf, &bufidx);
10440 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_node, buf, &bufidx);
10441 + put_int(qinfo.gqi_lockinfo[cur_lock].lki_ownpid, buf, &bufidx);
10443 + if (valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_grrange) ||
10444 + valid_range(&qinfo.gqi_lockinfo[cur_lock].lki_rqrange)) {
10446 + buf[bufidx++] = 1;
10447 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_start, buf, &bufidx);
10448 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_grrange.ra_end, buf, &bufidx);
10449 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_start, buf, &bufidx);
10450 + put_int64(qinfo.gqi_lockinfo[cur_lock].lki_rqrange.ra_end, buf, &bufidx);
10453 + buf[bufidx++] = 0;
10457 + if (cur_lock == qinfo.gqi_lockcount) {
10458 + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
10462 + reply->rq_header.rh_flags = 0;
10465 + reply->rq_numlocks = cur_lock - start_lock;
10466 + start_lock = cur_lock;
10468 + midcomms_send_buffer(&reply->rq_header, e);
10469 + } while (!finished);
10471 + kfree(qinfo.gqi_lockinfo);
10476 + e = lowcomms_get_buffer(nodeid,
10477 + sizeof(struct dlm_query_reply),
10478 + ls->ls_allocation,
10479 + (char **) &reply);
10481 + status = -ENOBUFS;
10484 + reply->rq_header.rh_cmd = GDLM_REMCMD_QUERYREPLY;
10485 + reply->rq_header.rh_flags = GDLM_REMFLAG_ENDQUERY;
10486 + reply->rq_header.rh_length = sizeof(struct dlm_query_reply);
10487 + reply->rq_header.rh_lkid = msg->rh_lkid;
10488 + reply->rq_header.rh_lockspace = msg->rh_lockspace;
10489 + reply->rq_status = status;
10490 + reply->rq_numlocks = 0;
10491 + reply->rq_startlock = 0;
10492 + reply->rq_grantcount = 0;
10493 + reply->rq_convcount = 0;
10494 + reply->rq_waitcount = 0;
10496 + midcomms_send_buffer(&reply->rq_header, e);
10501 +/* Reply to a remote query */
10502 +int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg)
10504 + struct dlm_lkb *query_lkb;
10505 + struct dlm_queryinfo *qinfo;
10506 + struct dlm_query_reply *reply;
10511 + query_lkb = find_lock_by_id(ls, msg->rh_lkid);
10515 + qinfo = (struct dlm_queryinfo *) query_lkb->lkb_request;
10516 + reply = (struct dlm_query_reply *) msg;
10518 + /* Copy the easy bits first */
10519 + qinfo->gqi_lockcount += reply->rq_numlocks;
10520 + if (qinfo->gqi_resinfo) {
10521 + qinfo->gqi_resinfo->rsi_grantcount = reply->rq_grantcount;
10522 + qinfo->gqi_resinfo->rsi_convcount = reply->rq_convcount;
10523 + qinfo->gqi_resinfo->rsi_waitcount = reply->rq_waitcount;
10524 + memcpy(qinfo->gqi_resinfo->rsi_valblk, reply->rq_valblk,
10528 + /* Now unpack the locks */
10529 + bufidx = sizeof(struct dlm_query_reply);
10530 + buf = (char *) msg;
10532 + DLM_ASSERT(reply->rq_startlock + reply->rq_numlocks <= qinfo->gqi_locksize,
10533 + printk("start = %d, num + %d. Max= %d\n",
10534 + reply->rq_startlock, reply->rq_numlocks, qinfo->gqi_locksize););
10536 + for (i = reply->rq_startlock;
10537 + i < reply->rq_startlock + reply->rq_numlocks; i++) {
10538 + qinfo->gqi_lockinfo[i].lki_state = buf[bufidx++];
10539 + qinfo->gqi_lockinfo[i].lki_grmode = buf[bufidx++];
10540 + qinfo->gqi_lockinfo[i].lki_rqmode = buf[bufidx++];
10541 + qinfo->gqi_lockinfo[i].lki_lkid = get_int(buf, &bufidx);
10542 + qinfo->gqi_lockinfo[i].lki_mstlkid = get_int(buf, &bufidx);
10543 + qinfo->gqi_lockinfo[i].lki_parent = get_int(buf, &bufidx);
10544 + qinfo->gqi_lockinfo[i].lki_node = get_int(buf, &bufidx);
10545 + qinfo->gqi_lockinfo[i].lki_ownpid = get_int(buf, &bufidx);
10546 + if (buf[bufidx++]) {
10547 + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = get_int64(buf, &bufidx);
10548 + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = get_int64(buf, &bufidx);
10549 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = get_int64(buf, &bufidx);
10550 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = get_int64(buf, &bufidx);
10553 + qinfo->gqi_lockinfo[i].lki_grrange.ra_start = 0ULL;
10554 + qinfo->gqi_lockinfo[i].lki_grrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
10555 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_start = 0ULL;
10556 + qinfo->gqi_lockinfo[i].lki_rqrange.ra_end = 0xFFFFFFFFFFFFFFFFULL;
10560 + /* If this was the last block then now tell the user */
10561 + if (msg->rh_flags & GDLM_REMFLAG_ENDQUERY) {
10562 + query_lkb->lkb_retstatus = reply->rq_status;
10563 + queue_ast(query_lkb, AST_COMP | AST_DEL, 0);
10570 +/* Aggregate resource information */
10571 +static int query_resource(struct dlm_rsb *rsb, struct dlm_resinfo *resinfo)
10573 + struct list_head *tmp;
10575 + if (rsb->res_lvbptr)
10576 + memcpy(resinfo->rsi_valblk, rsb->res_lvbptr, DLM_LVB_LEN);
10578 + down_read(&rsb->res_lock);
10579 + resinfo->rsi_grantcount = 0;
10580 + list_for_each(tmp, &rsb->res_grantqueue) {
10581 + resinfo->rsi_grantcount++;
10584 + resinfo->rsi_waitcount = 0;
10585 + list_for_each(tmp, &rsb->res_waitqueue) {
10586 + resinfo->rsi_waitcount++;
10589 + resinfo->rsi_convcount = 0;
10590 + list_for_each(tmp, &rsb->res_convertqueue) {
10591 + resinfo->rsi_convcount++;
10593 + up_read(&rsb->res_lock);
10598 +static int add_lock(struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo)
10602 +	/* Don't fill it in if the buffer is full */
10603 +	if (qinfo->gqi_lockcount == qinfo->gqi_locksize)
10606 +	/* gqi_lockcount contains the number of locks we have returned */
10607 +	entry = qinfo->gqi_lockcount++;
10609 +	/* Fun with master copies */
10610 +	if (lkb->lkb_flags & GDLM_LKFLG_MSTCPY) {
10611 +		qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_remid;
10612 +		qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_id;
10615 +		qinfo->gqi_lockinfo[entry].lki_lkid = lkb->lkb_id;
10616 +		qinfo->gqi_lockinfo[entry].lki_mstlkid = lkb->lkb_remid;
10619 +	/* Also make sure we always have a valid nodeid in there, the
10620 +	   calling end may not know which node "0" is */
10621 +	if (lkb->lkb_nodeid)
10622 +		qinfo->gqi_lockinfo[entry].lki_node = lkb->lkb_nodeid;
10624 +		qinfo->gqi_lockinfo[entry].lki_node = our_nodeid();
10626 +	if (lkb->lkb_parent)
10627 +		qinfo->gqi_lockinfo[entry].lki_parent = lkb->lkb_parent->lkb_id;
10629 +		qinfo->gqi_lockinfo[entry].lki_parent = 0;
10631 +	qinfo->gqi_lockinfo[entry].lki_state = lkb->lkb_status;
10632 +	qinfo->gqi_lockinfo[entry].lki_rqmode = lkb->lkb_rqmode;
10633 +	qinfo->gqi_lockinfo[entry].lki_grmode = lkb->lkb_grmode;
10634 +	qinfo->gqi_lockinfo[entry].lki_ownpid = lkb->lkb_ownpid;
10636 +	if (lkb->lkb_range) {
10637 +		qinfo->gqi_lockinfo[entry].lki_grrange.ra_start =
10638 +			lkb->lkb_range[GR_RANGE_START];
10639 +		qinfo->gqi_lockinfo[entry].lki_grrange.ra_end =
10640 +			lkb->lkb_range[GR_RANGE_END];
10641 +		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start =
10642 +			lkb->lkb_range[RQ_RANGE_START];
10643 +		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end =
10644 +			lkb->lkb_range[RQ_RANGE_END];
10646 +		/* No range on this lock: use the full-coverage sentinels
10646 +		   (0 .. 0xffffffffffffffff) that valid_range() and the
10646 +		   remote_query_reply() default branch expect.  The original
10646 +		   code assigned 0xffffffffffffffff to ra_start (a copy-paste
10646 +		   slip), clobbering the 0 and leaving ra_end uninitialized. */
10646 +		qinfo->gqi_lockinfo[entry].lki_grrange.ra_start = 0ULL;
10647 +		qinfo->gqi_lockinfo[entry].lki_grrange.ra_end = 0xffffffffffffffffULL;
10648 +		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_start = 0ULL;
10649 +		qinfo->gqi_lockinfo[entry].lki_rqrange.ra_end = 0xffffffffffffffffULL;
10654 +static int query_lkb_queue(struct dlm_rsb *rsb,
10655 + struct list_head *queue, int query,
10656 + struct dlm_queryinfo *qinfo)
10658 + struct list_head *tmp;
10660 + int mode = query & DLM_QUERY_MODE_MASK;
10662 + down_read(&rsb->res_lock);
10663 + list_for_each(tmp, queue) {
10664 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
10667 + if (query & DLM_QUERY_RQMODE)
10668 + lkmode = lkb->lkb_rqmode;
10670 + lkmode = lkb->lkb_grmode;
10672 + /* Add the LKB info to the list if it matches the criteria in
10673 + * the query bitmap */
10674 + switch (query & DLM_QUERY_MASK) {
10675 + case DLM_QUERY_LOCKS_ALL:
10676 + status = add_lock(lkb, qinfo);
10679 + case DLM_QUERY_LOCKS_HIGHER:
10680 + if (lkmode > mode)
10681 + status = add_lock(lkb, qinfo);
10684 + case DLM_QUERY_LOCKS_EQUAL:
10685 + if (lkmode == mode)
10686 + status = add_lock(lkb, qinfo);
10689 + case DLM_QUERY_LOCKS_LOWER:
10690 + if (lkmode < mode)
10691 + status = add_lock(lkb, qinfo);
10693 + case DLM_QUERY_LOCKS_ORPHAN:
10694 + if (lkb->lkb_flags & GDLM_LKFLG_ORPHAN)
10695 + status = add_lock(lkb, qinfo);
10699 + up_read(&rsb->res_lock);
10704 + * Return 1 if the locks' ranges overlap
10705 + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
10707 +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
10709 + if (!lkb1->lkb_range || !lkb2->lkb_range)
10712 + if (lkb1->lkb_range[RQ_RANGE_END] <= lkb2->lkb_range[GR_RANGE_START] ||
10713 + lkb1->lkb_range[RQ_RANGE_START] >= lkb2->lkb_range[GR_RANGE_END])
10718 +extern const int __dlm_compat_matrix[8][8];
10721 +static int get_blocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo)
10723 + struct list_head *tmp;
10726 + down_read(&qlkb->lkb_resource->res_lock);
10727 + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
10728 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
10730 + if (ranges_overlap(lkb, qlkb) &&
10731 + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1])
10732 + status = add_lock(lkb, qinfo);
10734 + up_read(&qlkb->lkb_resource->res_lock);
10739 +static int get_nonblocking_locks(struct dlm_lkb *qlkb, struct dlm_queryinfo *qinfo)
10741 + struct list_head *tmp;
10744 + down_read(&qlkb->lkb_resource->res_lock);
10745 + list_for_each(tmp, &qlkb->lkb_resource->res_grantqueue) {
10746 + struct dlm_lkb *lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
10748 + if (!(ranges_overlap(lkb, qlkb) &&
10749 + !__dlm_compat_matrix[lkb->lkb_grmode + 1][qlkb->lkb_rqmode + 1]))
10750 + status = add_lock(lkb, qinfo);
10752 + up_read(&qlkb->lkb_resource->res_lock);
10757 +/* Gather a list of appropriate locks */
10758 +static int query_locks(int query, struct dlm_lkb *lkb, struct dlm_queryinfo *qinfo)
10763 + /* Mask in the actual granted/requsted mode of the lock if LOCK_THIS
10764 + * was requested as the mode
10766 + if ((query & DLM_QUERY_MODE_MASK) == DLM_LOCK_THIS) {
10767 + query &= ~DLM_QUERY_MODE_MASK;
10768 + if (query & DLM_QUERY_RQMODE)
10769 + query |= lkb->lkb_rqmode;
10771 + query |= lkb->lkb_grmode;
10774 + qinfo->gqi_lockcount = 0;
10776 + /* BLOCKING/NOTBLOCK only look at the granted queue */
10777 + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_BLOCKING)
10778 + return get_blocking_locks(lkb, qinfo);
10780 + if ((query & DLM_QUERY_MASK) == DLM_QUERY_LOCKS_NOTBLOCK)
10781 + return get_nonblocking_locks(lkb, qinfo);
10783 + /* Do the lock queues that were requested */
10784 + if (query & DLM_QUERY_QUEUE_GRANT) {
10785 + status = query_lkb_queue(lkb->lkb_resource,
10786 + &lkb->lkb_resource->res_grantqueue,
10790 + if (!status && (query & DLM_QUERY_QUEUE_CONVERT)) {
10791 + status = query_lkb_queue(lkb->lkb_resource,
10792 + &lkb->lkb_resource->res_convertqueue,
10796 + if (!status && (query & DLM_QUERY_QUEUE_WAIT)) {
10797 + status = query_lkb_queue(lkb->lkb_resource,
10798 + &lkb->lkb_resource->res_waitqueue,
10806 +EXPORT_SYMBOL(dlm_query);
10808 + * Overrides for Emacs so that we follow Linus's tabbing style.
10809 + * Emacs will notice this stuff at the end of the file and automatically
10810 + * adjust the settings for this buffer only. This must remain at the end
10812 + * ---------------------------------------------------------------------------
10813 + * Local variables:
10814 + * c-file-style: "linux"
10817 diff -urN linux-orig/cluster/dlm/queries.h linux-patched/cluster/dlm/queries.h
10818 --- linux-orig/cluster/dlm/queries.h 1970-01-01 07:30:00.000000000 +0730
10819 +++ linux-patched/cluster/dlm/queries.h 2004-11-03 11:31:56.000000000 +0800
10821 +/******************************************************************************
10822 +*******************************************************************************
10824 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10825 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10827 +** This copyrighted material is made available to anyone wishing to use,
10828 +** modify, copy, or redistribute it subject to the terms and conditions
10829 +** of the GNU General Public License v.2.
10831 +*******************************************************************************
10832 +******************************************************************************/
10834 +#ifndef __QUERIES_DOT_H__
10835 +#define __QUERIES_DOT_H__
10837 +extern int remote_query(int nodeid, struct dlm_ls *ls, struct dlm_header *msg);
10838 +extern int remote_query_reply(int nodeid, struct dlm_ls *ls, struct dlm_header *msg);
10840 +#endif /* __QUERIES_DOT_H__ */
10841 diff -urN linux-orig/cluster/dlm/rebuild.c linux-patched/cluster/dlm/rebuild.c
10842 --- linux-orig/cluster/dlm/rebuild.c 1970-01-01 07:30:00.000000000 +0730
10843 +++ linux-patched/cluster/dlm/rebuild.c 2004-11-03 11:31:56.000000000 +0800
10845 +/******************************************************************************
10846 +*******************************************************************************
10848 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
10849 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
10851 +** This copyrighted material is made available to anyone wishing to use,
10852 +** modify, copy, or redistribute it subject to the terms and conditions
10853 +** of the GNU General Public License v.2.
10855 +*******************************************************************************
10856 +******************************************************************************/
10859 + * Rebuild RSB's on new masters. Functions for transferring locks and
10860 + * subresources to new RSB masters during recovery.
10863 +#include "dlm_internal.h"
10864 +#include "reccomms.h"
10867 +#include "nodes.h"
10868 +#include "config.h"
10869 +#include "memory.h"
10870 +#include "recover.h"
10873 +/* Types of entity serialised in remastering messages */
10874 +#define REMASTER_ROOTRSB 1
10875 +#define REMASTER_RSB 2
10876 +#define REMASTER_LKB 3
10878 +struct rcom_fill {
10879 + char * outbuf; /* Beginning of data */
10880 + int offset; /* Current offset into outbuf */
10881 + int maxlen; /* Max value of offset */
10884 + struct dlm_rsb * rsb;
10885 + struct dlm_rsb * subrsb;
10886 + struct dlm_lkb * lkb;
10887 + struct list_head * lkbqueue;
10890 +typedef struct rcom_fill rcom_fill_t;
10893 +struct rebuild_node {
10894 + struct list_head list;
10896 + struct dlm_rsb * rootrsb;
10898 +typedef struct rebuild_node rebuild_node_t;
10902 + * Root rsb passed in for which all lkb's (own and subrsbs) will be sent to new
10903 + * master. The rsb will be "done" with recovery when the new master has
10904 + * replied with all the new remote lockid's for this rsb's lkb's.
10907 +void expect_new_lkids(struct dlm_rsb *rsb)
10909 + rsb->res_newlkid_expect = 0;
10910 + recover_list_add(rsb);
10914 + * This function is called on root rsb or subrsb when another lkb is being sent
10915 + * to the new master for which we expect to receive a corresponding remote lkid
10918 +void need_new_lkid(struct dlm_rsb *rsb)
10920 + struct dlm_rsb *root = rsb;
10922 + if (rsb->res_parent)
10923 + root = rsb->res_root;
10925 + if (!root->res_newlkid_expect)
10926 + recover_list_add(root);
10928 + DLM_ASSERT(test_bit(RESFL_RECOVER_LIST, &root->res_flags),);
10930 + root->res_newlkid_expect++;
10934 + * This function is called for each lkb for which a new remote lkid is
10935 + * received. Decrement the expected number of remote lkids expected for the
10939 +void have_new_lkid(struct dlm_lkb *lkb)
10941 + struct dlm_rsb *root = lkb->lkb_resource;
10943 + if (root->res_parent)
10944 + root = root->res_root;
10946 + down_write(&root->res_lock);
10948 + DLM_ASSERT(root->res_newlkid_expect,
10949 + printk("newlkid_expect=%d\n", root->res_newlkid_expect););
10951 + root->res_newlkid_expect--;
10953 + if (!root->res_newlkid_expect) {
10954 + clear_bit(RESFL_NEW_MASTER, &root->res_flags);
10955 + recover_list_del(root);
10957 + up_write(&root->res_lock);
10961 + * Return the rebuild struct for a node - will create an entry on the rootrsb
10962 + * list if necessary.
10964 + * Currently no locking is needed here as it all happens in the dlm_recvd
10968 +static rebuild_node_t *find_rebuild_root(struct dlm_ls *ls, int nodeid)
10970 + rebuild_node_t *node = NULL;
10972 + list_for_each_entry(node, &ls->ls_rebuild_rootrsb_list, list) {
10973 + if (node->nodeid == nodeid)
10977 + /* Not found, add one */
10978 + node = kmalloc(sizeof(rebuild_node_t), GFP_KERNEL);
10982 + node->nodeid = nodeid;
10983 + node->rootrsb = NULL;
10984 + list_add(&node->list, &ls->ls_rebuild_rootrsb_list);
10990 + * Tidy up after a rebuild run. Called when all recovery has finished
10993 +void rebuild_freemem(struct dlm_ls *ls)
10995 + rebuild_node_t *node = NULL, *s;
10997 + list_for_each_entry_safe(node, s, &ls->ls_rebuild_rootrsb_list, list) {
10998 + list_del(&node->list);
11003 +static void put_int(int x, char *buf, int *offp)
11005 + x = cpu_to_le32(x);
11006 + memcpy(buf + *offp, &x, sizeof(int));
11007 + *offp += sizeof(int);
11010 +static void put_int64(uint64_t x, char *buf, int *offp)
11012 + x = cpu_to_le64(x);
11013 + memcpy(buf + *offp, &x, sizeof(uint64_t));
11014 + *offp += sizeof(uint64_t);
11017 +static void put_bytes(char *x, int len, char *buf, int *offp)
11019 + put_int(len, buf, offp);
11020 + memcpy(buf + *offp, x, len);
11024 +static void put_char(char x, char *buf, int *offp)
11030 +static int get_int(char *buf, int *offp)
11033 + memcpy(&value, buf + *offp, sizeof(int));
11034 + *offp += sizeof(int);
11035 + return le32_to_cpu(value);
11038 +static uint64_t get_int64(char *buf, int *offp)
11042 + memcpy(&value, buf + *offp, sizeof(uint64_t));
11043 + *offp += sizeof(uint64_t);
11044 + return le64_to_cpu(value);
11047 +static char get_char(char *buf, int *offp)
11049 + char x = buf[*offp];
11055 +static void get_bytes(char *bytes, int *len, char *buf, int *offp)
11057 + *len = get_int(buf, offp);
11058 + memcpy(bytes, buf + *offp, *len);
11062 +static int lkb_length(struct dlm_lkb *lkb)
11066 + len += sizeof(int); /* lkb_id */
11067 + len += sizeof(int); /* lkb_resource->res_reamasterid */
11068 + len += sizeof(int); /* lkb_flags */
11069 + len += sizeof(int); /* lkb_status */
11070 + len += sizeof(char); /* lkb_rqmode */
11071 + len += sizeof(char); /* lkb_grmode */
11072 + len += sizeof(int); /* lkb_childcnt */
11073 + len += sizeof(int); /* lkb_parent->lkb_id */
11074 + len += sizeof(int); /* lkb_bastaddr */
11075 + len += sizeof(int); /* lkb_ownpid */
11077 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
11078 + len += sizeof(int); /* number of lvb bytes */
11079 + len += DLM_LVB_LEN;
11082 + if (lkb->lkb_range) {
11083 + len += sizeof(uint64_t);
11084 + len += sizeof(uint64_t);
11085 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT) {
11086 + len += sizeof(uint64_t);
11087 + len += sizeof(uint64_t);
11095 + * It's up to the caller to be sure there's enough space in the buffer.
11098 +static void serialise_lkb(struct dlm_lkb *lkb, char *buf, int *offp)
11102 + /* Need to tell the remote end if we have a range */
11103 + flags = lkb->lkb_flags;
11104 + if (lkb->lkb_range)
11105 + flags |= GDLM_LKFLG_RANGE;
11108 + * See lkb_length()
11109 + * Total: 30 (no lvb) or 66 (with lvb) bytes
11112 + put_int(lkb->lkb_id, buf, offp);
11113 + put_int(lkb->lkb_resource->res_remasterid, buf, offp);
11114 + put_int(flags, buf, offp);
11115 + put_int(lkb->lkb_status, buf, offp);
11116 + put_char(lkb->lkb_rqmode, buf, offp);
11117 + put_char(lkb->lkb_grmode, buf, offp);
11118 + put_int(atomic_read(&lkb->lkb_childcnt), buf, offp);
11120 + if (lkb->lkb_parent)
11121 + put_int(lkb->lkb_parent->lkb_id, buf, offp);
11123 + put_int(0, buf, offp);
11125 + if (lkb->lkb_bastaddr)
11126 + put_int(1, buf, offp);
11128 + put_int(0, buf, offp);
11129 + put_int(lkb->lkb_ownpid, buf, offp);
11131 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
11132 + DLM_ASSERT(lkb->lkb_lvbptr,);
11133 + put_bytes(lkb->lkb_lvbptr, DLM_LVB_LEN, buf, offp);
11136 + /* Only send the range we actually need */
11137 + if (lkb->lkb_range) {
11138 + switch (lkb->lkb_status) {
11139 + case GDLM_LKSTS_CONVERT:
11140 + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
11141 + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
11142 + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
11143 + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
11145 + case GDLM_LKSTS_WAITING:
11146 + put_int64(lkb->lkb_range[RQ_RANGE_START], buf, offp);
11147 + put_int64(lkb->lkb_range[RQ_RANGE_END], buf, offp);
11149 + case GDLM_LKSTS_GRANTED:
11150 + put_int64(lkb->lkb_range[GR_RANGE_START], buf, offp);
11151 + put_int64(lkb->lkb_range[GR_RANGE_END], buf, offp);
11159 +static int rsb_length(struct dlm_rsb *rsb)
11163 + len += sizeof(int); /* number of res_name bytes */
11164 + len += rsb->res_length; /* res_name */
11165 + len += sizeof(int); /* res_remasterid */
11166 + len += sizeof(int); /* res_parent->res_remasterid */
11171 +static inline struct dlm_rsb *next_subrsb(struct dlm_rsb *subrsb)
11173 + struct list_head *tmp;
11174 + struct dlm_rsb *r;
11176 + tmp = subrsb->res_subreslist.next;
11177 + r = list_entry(tmp, struct dlm_rsb, res_subreslist);
11182 +static inline int last_in_list(struct dlm_rsb *r, struct list_head *head)
11184 + struct dlm_rsb *last;
11185 + last = list_entry(head->prev, struct dlm_rsb, res_subreslist);
11191 +static int lkbs_to_remaster_list(struct list_head *head)
11193 + struct dlm_lkb *lkb;
11195 + list_for_each_entry(lkb, head, lkb_statequeue) {
11196 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
11204 + * Used to decide if an rsb should be rebuilt on a new master. An rsb only
11205 + * needs to be rebuilt if we have lkb's queued on it. NOREBUILD lkb's are not
11209 +static int lkbs_to_remaster(struct dlm_rsb *r)
11211 + struct dlm_rsb *sub;
11213 + if (lkbs_to_remaster_list(&r->res_grantqueue))
11215 + if (lkbs_to_remaster_list(&r->res_convertqueue))
11217 + if (lkbs_to_remaster_list(&r->res_waitqueue))
11220 + list_for_each_entry(sub, &r->res_subreslist, res_subreslist) {
11221 + if (lkbs_to_remaster_list(&sub->res_grantqueue))
11223 + if (lkbs_to_remaster_list(&sub->res_convertqueue))
11225 + if (lkbs_to_remaster_list(&sub->res_waitqueue))
11232 +static void serialise_rsb(struct dlm_rsb *rsb, char *buf, int *offp)
11235 + * See rsb_length()
11236 + * Total: 36 bytes (4 + 24 + 4 + 4)
11239 + put_bytes(rsb->res_name, rsb->res_length, buf, offp);
11240 + put_int(rsb->res_remasterid, buf, offp);
11242 + if (rsb->res_parent)
11243 + put_int(rsb->res_parent->res_remasterid, buf, offp);
11245 + put_int(0, buf, offp);
11247 + DLM_ASSERT(!rsb->res_lvbptr,);
11251 + * Flatten an LKB into a buffer for sending to the new RSB master. As a
11252 + * side-effect the nodeid of the lock is set to the nodeid of the new RSB
11256 +static int pack_one_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb,
11257 + rcom_fill_t *fill)
11259 + if (fill->offset + 1 + lkb_length(lkb) > fill->maxlen)
11262 + lkb->lkb_nodeid = r->res_nodeid;
11264 + put_char(REMASTER_LKB, fill->outbuf, &fill->offset);
11265 + serialise_lkb(lkb, fill->outbuf, &fill->offset);
11268 + need_new_lkid(r);
11276 + * Pack all LKB's from a given queue, except for those with the NOREBUILD flag.
11279 +static int pack_lkb_queue(struct dlm_rsb *r, struct list_head *queue,
11280 + rcom_fill_t *fill)
11282 + struct dlm_lkb *lkb;
11285 + list_for_each_entry(lkb, queue, lkb_statequeue) {
11286 + if (lkb->lkb_flags & GDLM_LKFLG_NOREBUILD)
11289 + error = pack_one_lkb(r, lkb, fill);
11298 + fill->lkbqueue = queue;
11303 +static int pack_lkb_queues(struct dlm_rsb *r, rcom_fill_t *fill)
11307 + error = pack_lkb_queue(r, &r->res_grantqueue, fill);
11311 + error = pack_lkb_queue(r, &r->res_convertqueue, fill);
11315 + error = pack_lkb_queue(r, &r->res_waitqueue, fill);
11322 + * Pack remaining lkb's for rsb or subrsb. This may include a partial lkb
11323 + * queue and full lkb queues.
11326 +static int pack_lkb_remaining(struct dlm_rsb *r, rcom_fill_t *fill)
11328 + struct list_head *tmp, *start, *end;
11329 + struct dlm_lkb *lkb;
11333 + * Beginning with fill->lkb, pack remaining lkb's on fill->lkbqueue.
11336 + error = pack_one_lkb(r, fill->lkb, fill);
11340 + start = fill->lkb->lkb_statequeue.next;
11341 + end = fill->lkbqueue;
11343 + for (tmp = start; tmp != end; tmp = tmp->next) {
11344 + lkb = list_entry(tmp, struct dlm_lkb, lkb_statequeue);
11346 + error = pack_one_lkb(r, lkb, fill);
11354 + * Pack all lkb's on r's queues following fill->lkbqueue.
11357 + if (fill->lkbqueue == &r->res_waitqueue)
11359 + if (fill->lkbqueue == &r->res_convertqueue)
11362 + DLM_ASSERT(fill->lkbqueue == &r->res_grantqueue,);
11364 + error = pack_lkb_queue(r, &r->res_convertqueue, fill);
11368 + error = pack_lkb_queue(r, &r->res_waitqueue, fill);
11374 +static int pack_one_subrsb(struct dlm_rsb *rsb, struct dlm_rsb *subrsb,
11375 + rcom_fill_t *fill)
11379 + down_write(&subrsb->res_lock);
11381 + if (fill->offset + 1 + rsb_length(subrsb) > fill->maxlen)
11384 + subrsb->res_nodeid = rsb->res_nodeid;
11385 + subrsb->res_remasterid = ++fill->remasterid;
11387 + put_char(REMASTER_RSB, fill->outbuf, &fill->offset);
11388 + serialise_rsb(subrsb, fill->outbuf, &fill->offset);
11390 + error = pack_lkb_queues(subrsb, fill);
11394 + up_write(&subrsb->res_lock);
11399 + up_write(&subrsb->res_lock);
11400 + fill->subrsb = subrsb;
11405 +static int pack_subrsbs(struct dlm_rsb *rsb, struct dlm_rsb *in_subrsb,
11406 + rcom_fill_t *fill)
11408 + struct dlm_rsb *subrsb;
11412 + * When an initial subrsb is given, we know it needs to be packed.
11413 + * When no initial subrsb is given, begin with the first (if any exist).
11416 + if (!in_subrsb) {
11417 + if (list_empty(&rsb->res_subreslist))
11420 + subrsb = list_entry(rsb->res_subreslist.next, struct dlm_rsb,
11423 + subrsb = in_subrsb;
11426 + error = pack_one_subrsb(rsb, subrsb, fill);
11430 + if (last_in_list(subrsb, &rsb->res_subreslist))
11433 + subrsb = next_subrsb(subrsb);
11441 + * Finish packing whatever is left in an rsb tree. If space runs out while
11442 + * finishing, save subrsb/lkb and this will be called again for the same rsb.
11444 + * !subrsb && lkb, we left off part way through root rsb's lkbs.
11445 + * subrsb && !lkb, we left off just before starting a new subrsb.
11446 + * subrsb && lkb, we left off part way through a subrsb's lkbs.
11447 + * !subrsb && !lkb, we shouldn't be in this function, but starting
11448 + * a new rsb in pack_rsb_tree().
11451 +static int pack_rsb_tree_remaining(struct dlm_ls *ls, struct dlm_rsb *rsb,
11452 + rcom_fill_t *fill)
11454 + struct dlm_rsb *subrsb = NULL;
11457 + if (!fill->subrsb && fill->lkb) {
11458 + error = pack_lkb_remaining(rsb, fill);
11462 + error = pack_subrsbs(rsb, NULL, fill);
11467 + else if (fill->subrsb && !fill->lkb) {
11468 + error = pack_subrsbs(rsb, fill->subrsb, fill);
11473 + else if (fill->subrsb && fill->lkb) {
11474 + error = pack_lkb_remaining(fill->subrsb, fill);
11478 + if (last_in_list(fill->subrsb, &fill->rsb->res_subreslist))
11481 + subrsb = next_subrsb(fill->subrsb);
11483 + error = pack_subrsbs(rsb, subrsb, fill);
11488 + fill->subrsb = NULL;
11489 + fill->lkb = NULL;
11496 + * Pack an RSB, all its LKB's, all its subrsb's and all their LKB's into a
11497 + * buffer. When the buffer runs out of space, save the place to restart (the
11498 + * queue+lkb, subrsb, or subrsb+queue+lkb which wouldn't fit).
11501 +static int pack_rsb_tree(struct dlm_ls *ls, struct dlm_rsb *rsb,
11502 + rcom_fill_t *fill)
11504 + int error = -ENOSPC;
11506 + fill->remasterid = 0;
11509 + * Pack the root rsb itself. A 1 byte type precedes the serialised
11510 + * rsb. Then pack the lkb's for the root rsb.
11513 + down_write(&rsb->res_lock);
11515 + if (fill->offset + 1 + rsb_length(rsb) > fill->maxlen)
11518 + rsb->res_remasterid = ++fill->remasterid;
11519 + put_char(REMASTER_ROOTRSB, fill->outbuf, &fill->offset);
11520 + serialise_rsb(rsb, fill->outbuf, &fill->offset);
11522 + error = pack_lkb_queues(rsb, fill);
11526 + up_write(&rsb->res_lock);
11529 + * Pack subrsb/lkb's under the root rsb.
11532 + error = pack_subrsbs(rsb, NULL, fill);
11537 + up_write(&rsb->res_lock);
11542 + * Given an RSB, return the next RSB that should be sent to a new master.
11545 +static struct dlm_rsb *next_remastered_rsb(struct dlm_ls *ls,
11546 + struct dlm_rsb *rsb)
11548 + struct list_head *tmp, *start, *end;
11549 + struct dlm_rsb *r;
11552 + start = ls->ls_rootres.next;
11554 + start = rsb->res_rootlist.next;
11556 + end = &ls->ls_rootres;
11558 + for (tmp = start; tmp != end; tmp = tmp->next) {
11559 + r = list_entry(tmp, struct dlm_rsb, res_rootlist);
11561 + if (test_bit(RESFL_NEW_MASTER, &r->res_flags)) {
11562 + if (r->res_nodeid && lkbs_to_remaster(r)) {
11563 + expect_new_lkids(r);
11566 + clear_bit(RESFL_NEW_MASTER, &r->res_flags);
11574 + * Given an rcom buffer, fill it with RSB's that need to be sent to a single
11575 + * new master node. In the case where all the data to send to one node
11576 + * requires multiple messages, this function needs to resume filling each
11577 + * successive buffer from the point where it left off when the previous buffer
11581 +static void fill_rcom_buffer(struct dlm_ls *ls, rcom_fill_t *fill,
11582 + uint32_t *nodeid)
11584 + struct dlm_rsb *rsb, *prev_rsb = fill->rsb;
11587 + fill->offset = 0;
11592 + * The first time this function is called.
11595 + rsb = next_remastered_rsb(ls, NULL);
11599 + } else if (fill->subrsb || fill->lkb) {
11602 + * Continue packing an rsb tree that was partially packed last
11603 + * time (fill->subrsb/lkb indicates where packing of last block
11608 + *nodeid = rsb->res_nodeid;
11610 + error = pack_rsb_tree_remaining(ls, rsb, fill);
11611 + if (error == -ENOSPC)
11614 + rsb = next_remastered_rsb(ls, prev_rsb);
11618 + if (rsb->res_nodeid != prev_rsb->res_nodeid)
11625 + * Pack rsb trees into the buffer until we run out of space, run out of
11626 + * new rsb's or hit a new nodeid.
11629 + *nodeid = rsb->res_nodeid;
11632 + error = pack_rsb_tree(ls, rsb, fill);
11633 + if (error == -ENOSPC)
11638 + rsb = next_remastered_rsb(ls, prev_rsb);
11642 + if (rsb->res_nodeid != prev_rsb->res_nodeid)
11656 + * Send lkb's (and subrsb/lkbs) for remastered root rsbs to new masters.
11659 +int rebuild_rsbs_send(struct dlm_ls *ls)
11661 + struct dlm_rcom *rc;
11662 + rcom_fill_t fill;
11666 + DLM_ASSERT(recover_list_empty(ls),);
11668 + log_all(ls, "rebuild locks");
11671 + rc = allocate_rcom_buffer(ls);
11675 + down_read(&ls->ls_root_lock);
11678 + memset(&fill, 0, sizeof(rcom_fill_t));
11679 + fill.outbuf = rc->rc_buf;
11680 + fill.maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
11683 + fill_rcom_buffer(ls, &fill, &nodeid);
11684 + if (!fill.offset)
11687 + rc->rc_datalen = fill.offset;
11688 + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKS, rc, 0);
11690 + up_read(&ls->ls_root_lock);
11695 + error = dlm_recovery_stopped(ls);
11697 + up_read(&ls->ls_root_lock);
11701 + while (fill.more);
11703 + up_read(&ls->ls_root_lock);
11705 + error = dlm_wait_function(ls, &recover_list_empty);
11707 + log_all(ls, "rebuilt %d locks", fill.count);
11710 + free_rcom_buffer(rc);
11716 +static struct dlm_rsb *find_by_remasterid(struct dlm_ls *ls, int remasterid,
11717 + struct dlm_rsb *rootrsb)
11719 + struct dlm_rsb *rsb;
11721 + DLM_ASSERT(rootrsb,);
11723 + if (rootrsb->res_remasterid == remasterid) {
11728 + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
11729 + if (rsb->res_remasterid == remasterid)
11739 + * Search a queue for the given remote lock id (remlkid).
11742 +static struct dlm_lkb *search_remlkid(struct list_head *statequeue, int nodeid,
11745 + struct dlm_lkb *lkb;
11747 + list_for_each_entry(lkb, statequeue, lkb_statequeue) {
11748 + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) {
11757 + * Given a remote lock ID (and a parent resource), return the local LKB for it
11758 + * Hopefully we don't need to do this too often on deep lock trees. This is
11759 + * VERY suboptimal for anything but the smallest lock trees. It searches the
11760 + * lock tree for an LKB with the remote id "remid" and the node "nodeid" and
11761 + * returns the LKB address. OPTIMISATION: we should keep a list of these while
11762 + * we are building up the remastered LKBs
11765 +static struct dlm_lkb *find_by_remlkid(struct dlm_rsb *rootrsb, int nodeid,
11768 + struct dlm_lkb *lkb;
11769 + struct dlm_rsb *rsb;
11771 + lkb = search_remlkid(&rootrsb->res_grantqueue, nodeid, remid);
11775 + lkb = search_remlkid(&rootrsb->res_convertqueue, nodeid, remid);
11779 + lkb = search_remlkid(&rootrsb->res_waitqueue, nodeid, remid);
11783 + list_for_each_entry(rsb, &rootrsb->res_subreslist, res_subreslist) {
11784 + lkb = search_remlkid(&rsb->res_grantqueue, nodeid, remid);
11788 + lkb = search_remlkid(&rsb->res_convertqueue, nodeid, remid);
11792 + lkb = search_remlkid(&rsb->res_waitqueue, nodeid, remid);
11803 + * Unpack an LKB from a remaster operation
11806 +static int deserialise_lkb(struct dlm_ls *ls, int rem_nodeid,
11807 + struct dlm_rsb *rootrsb, char *buf, int *ptr,
11808 + char *outbuf, int *outoffp)
11810 + struct dlm_lkb *lkb, *exist_lkb = NULL;
11811 + struct dlm_rsb *rsb;
11812 + int error = -ENOMEM, parentid, rsb_rmid, remote_lkid, status, temp;
11814 + remote_lkid = get_int(buf, ptr);
11816 + rsb_rmid = get_int(buf, ptr);
11817 + rsb = find_by_remasterid(ls, rsb_rmid, rootrsb);
11818 + DLM_ASSERT(rsb, printk("no RSB for remasterid %d\n", rsb_rmid););
11821 + * We could have received this lkb already from a previous recovery
11822 + * that was interrupted. We still need to advance ptr so read in
11823 + * lkb and then release it. FIXME: verify this is valid.
11825 + lkb = find_by_remlkid(rsb, rem_nodeid, remote_lkid);
11827 + log_all(ls, "lkb %x exists %s", remote_lkid, rsb->res_name);
11831 + lkb = create_lkb(ls);
11835 + lkb->lkb_remid = remote_lkid;
11836 + lkb->lkb_flags = get_int(buf, ptr);
11837 + status = get_int(buf, ptr);
11838 + lkb->lkb_rqmode = get_char(buf, ptr);
11839 + lkb->lkb_grmode = get_char(buf, ptr);
11840 + atomic_set(&lkb->lkb_childcnt, get_int(buf, ptr));
11842 + parentid = get_int(buf, ptr);
11843 + lkb->lkb_bastaddr = (void *) (long) get_int(buf, ptr);
11844 + lkb->lkb_ownpid = get_int(buf, ptr);
11846 + if (lkb->lkb_flags & GDLM_LKFLG_VALBLK) {
11847 + lkb->lkb_lvbptr = allocate_lvb(ls);
11848 + if (!lkb->lkb_lvbptr)
11850 + get_bytes(lkb->lkb_lvbptr, &temp, buf, ptr);
11853 + if (lkb->lkb_flags & GDLM_LKFLG_RANGE) {
11854 + uint64_t start, end;
11856 + /* Don't need to keep the range flag, for comms use only */
11857 + lkb->lkb_flags &= ~GDLM_LKFLG_RANGE;
11858 + start = get_int64(buf, ptr);
11859 + end = get_int64(buf, ptr);
11861 + lkb->lkb_range = allocate_range(ls);
11862 + if (!lkb->lkb_range)
11865 + switch (status) {
11866 + case GDLM_LKSTS_CONVERT:
11867 + lkb->lkb_range[RQ_RANGE_START] = start;
11868 + lkb->lkb_range[RQ_RANGE_END] = end;
11869 + start = get_int64(buf, ptr);
11870 + end = get_int64(buf, ptr);
11871 + lkb->lkb_range[GR_RANGE_START] = start;
11872 + lkb->lkb_range[GR_RANGE_END] = end;
11874 + case GDLM_LKSTS_WAITING:
11875 + lkb->lkb_range[RQ_RANGE_START] = start;
11876 + lkb->lkb_range[RQ_RANGE_END] = end;
11879 + case GDLM_LKSTS_GRANTED:
11880 + lkb->lkb_range[GR_RANGE_START] = start;
11881 + lkb->lkb_range[GR_RANGE_END] = end;
11889 + /* verify lkb and exist_lkb values match? */
11890 + release_lkb(ls, lkb);
11895 + /* Resolve local lock LKB address from parent ID */
11897 + lkb->lkb_parent = find_by_remlkid(rootrsb, rem_nodeid,
11900 + atomic_inc(&rsb->res_ref);
11901 + lkb->lkb_resource = rsb;
11903 + lkb->lkb_flags |= GDLM_LKFLG_MSTCPY;
11904 + lkb->lkb_nodeid = rem_nodeid;
11907 + * Put the lkb on an RSB queue. An lkb that's in the midst of a
11908 + * conversion request (on the requesting node's lockqueue and has
11909 + * LQCONVERT set) should be put on the granted queue. The convert
11910 + * request will be resent by the requesting node.
11913 + if (lkb->lkb_flags & GDLM_LKFLG_LQCONVERT) {
11914 + lkb->lkb_flags &= ~GDLM_LKFLG_LQCONVERT;
11915 + DLM_ASSERT(status == GDLM_LKSTS_CONVERT,
11916 + printk("status=%d\n", status););
11917 + lkb->lkb_rqmode = DLM_LOCK_IV;
11918 + status = GDLM_LKSTS_GRANTED;
11921 + lkb_enqueue(rsb, lkb, status);
11924 + * Update the rsb lvb if the lkb's lvb is up to date (grmode > NL).
11927 + if ((lkb->lkb_flags & GDLM_LKFLG_VALBLK)
11928 + && lkb->lkb_grmode > DLM_LOCK_NL) {
11929 + if (!rsb->res_lvbptr)
11930 + rsb->res_lvbptr = allocate_lvb(ls);
11931 + if (!rsb->res_lvbptr)
11933 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
11937 + * Clear flags that may have been sent over that are only relevant in
11938 + * the context of the sender.
11941 + lkb->lkb_flags &= ~(GDLM_LKFLG_DELETED | GDLM_LKFLG_LQRESEND |
11942 + GDLM_LKFLG_NOREBUILD | GDLM_LKFLG_DEMOTED);
11945 + /* Return the new LKID to the caller's buffer */
11946 + put_int(lkb->lkb_id, outbuf, outoffp);
11947 + put_int(lkb->lkb_remid, outbuf, outoffp);
11954 +static struct dlm_rsb *deserialise_rsb(struct dlm_ls *ls, int nodeid,
11955 + struct dlm_rsb *rootrsb, char *buf,
11960 + int parent_remasterid;
11961 + char name[DLM_RESNAME_MAXLEN];
11963 + struct dlm_rsb *parent = NULL;
11964 + struct dlm_rsb *rsb;
11966 + get_bytes(name, &length, buf, ptr);
11967 + remasterid = get_int(buf, ptr);
11968 + parent_remasterid = get_int(buf, ptr);
11970 + if (parent_remasterid)
11971 + parent = find_by_remasterid(ls, parent_remasterid, rootrsb);
11974 + * The rsb reference from this find_or_create_rsb() will keep the rsb
11975 + * around while we add new lkb's to it from deserialise_lkb. Each of
11976 + * the lkb's will add an rsb reference. The reference added here is
11977 + * removed by release_rsb() after all lkb's are added.
11980 + error = find_rsb(ls, parent, name, length, CREATE, &rsb);
11981 + DLM_ASSERT(!error,);
11983 + set_bit(RESFL_MASTER, &rsb->res_flags);
11985 + /* There is a case where the above needs to create the RSB. */
11986 + if (rsb->res_nodeid == -1)
11987 + rsb->res_nodeid = our_nodeid();
11989 + rsb->res_remasterid = remasterid;
11995 + * Processing at the receiving end of a NEWLOCKS message from a node in
11996 + * rebuild_rsbs_send(). Rebuild a remastered lock tree. Nodeid is the remote
11997 + * node whose locks we are now mastering. For a reply we need to send back the
11998 + * new lockids of the remastered locks so that remote ops can find them.
12001 +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len)
12003 + struct dlm_rcom *rc;
12004 + struct dlm_rsb *rsb = NULL;
12005 + rebuild_node_t *rnode;
12007 + int outptr, ptr = 0, error = -ENOMEM;
12009 + rnode = find_rebuild_root(ls, nodeid);
12014 + * Allocate a buffer for the reply message which is a list of remote
12015 + * lock IDs and their (new) local lock ids. It will always be big
12016 + * enough to fit <n> ID pairs if it already fit <n> LKBs.
12019 + rc = allocate_rcom_buffer(ls);
12022 + outbuf = rc->rc_buf;
12026 + * Unpack RSBs and LKBs, saving new LKB id's in outbuf as they're
12027 + * created. Each deserialise_rsb adds an rsb reference that must be
12028 + * removed with release_rsb once all new lkb's for an rsb have been
12032 + while (ptr < len) {
12035 + type = get_char(buf, &ptr);
12038 + case REMASTER_ROOTRSB:
12040 + release_rsb(rsb);
12041 + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
12043 + rnode->rootrsb = rsb;
12046 + case REMASTER_RSB:
12048 + release_rsb(rsb);
12049 + rsb = deserialise_rsb(ls, nodeid, rnode->rootrsb, buf,
12053 + case REMASTER_LKB:
12054 + deserialise_lkb(ls, nodeid, rnode->rootrsb, buf, &ptr,
12055 + outbuf, &outptr);
12059 + DLM_ASSERT(0, printk("type=%d nodeid=%u ptr=%d "
12060 + "len=%d\n", type, nodeid, ptr,
12066 + release_rsb(rsb);
12069 + * Reply with the new lock IDs.
12072 + rc->rc_datalen = outptr;
12073 + error = rcom_send_message(ls, nodeid, RECCOMM_NEWLOCKIDS, rc, 0);
12075 + free_rcom_buffer(rc);
12082 + * Processing for a NEWLOCKIDS message. Called when we get the reply from the
12083 + * new master telling us what the new remote lock IDs are for the remastered
12087 +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len)
12094 + while (offset < len) {
12097 + struct dlm_lkb *lkb;
12099 + if (offset + 8 > len) {
12100 + log_error(ls, "rebuild_rsbs_lkids_recv: bad data "
12101 + "length nodeid=%d offset=%d len=%d",
12102 + nodeid, offset, len);
12106 + remote_id = get_int(buf, &offset);
12107 + local_id = get_int(buf, &offset);
12109 + lkb = find_lock_by_id(ls, local_id);
12111 + lkb->lkb_remid = remote_id;
12112 + have_new_lkid(lkb);
12114 + log_error(ls, "rebuild_rsbs_lkids_recv: unknown lkid "
12115 + "nodeid=%d id=%x remid=%x offset=%d len=%d",
12116 + nodeid, local_id, remote_id, offset, len);
12120 + if (recover_list_empty(ls))
12121 + wake_up(&ls->ls_wait_general);
12125 diff -urN linux-orig/cluster/dlm/rebuild.h linux-patched/cluster/dlm/rebuild.h
12126 --- linux-orig/cluster/dlm/rebuild.h 1970-01-01 07:30:00.000000000 +0730
12127 +++ linux-patched/cluster/dlm/rebuild.h 2004-11-03 11:31:56.000000000 +0800
12129 +/******************************************************************************
12130 +*******************************************************************************
12132 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12133 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12135 +** This copyrighted material is made available to anyone wishing to use,
12136 +** modify, copy, or redistribute it subject to the terms and conditions
12137 +** of the GNU General Public License v.2.
12139 +*******************************************************************************
12140 +******************************************************************************/
12142 +#ifndef __REBUILD_DOT_H__
12143 +#define __REBUILD_DOT_H__
12145 +int rebuild_rsbs_send(struct dlm_ls *ls);
12146 +int rebuild_rsbs_recv(struct dlm_ls *ls, int nodeid, char *buf, int len);
12147 +int rebuild_rsbs_lkids_recv(struct dlm_ls *ls, int nodeid, char *buf, int len);
12148 +int rebuild_freemem(struct dlm_ls *ls);
12150 +#endif /* __REBUILD_DOT_H__ */
12151 diff -urN linux-orig/cluster/dlm/reccomms.c linux-patched/cluster/dlm/reccomms.c
12152 --- linux-orig/cluster/dlm/reccomms.c 1970-01-01 07:30:00.000000000 +0730
12153 +++ linux-patched/cluster/dlm/reccomms.c 2004-11-03 11:31:56.000000000 +0800
12155 +/******************************************************************************
12156 +*******************************************************************************
12158 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12159 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12161 +** This copyrighted material is made available to anyone wishing to use,
12162 +** modify, copy, or redistribute it subject to the terms and conditions
12163 +** of the GNU General Public License v.2.
12165 +*******************************************************************************
12166 +******************************************************************************/
12168 +#include "dlm_internal.h"
12169 +#include "lowcomms.h"
12170 +#include "midcomms.h"
12171 +#include "reccomms.h"
12172 +#include "nodes.h"
12173 +#include "lockspace.h"
12174 +#include "recover.h"
12176 +#include "config.h"
12177 +#include "rebuild.h"
12178 +#include "memory.h"
12180 +/* Running on the basis that only a single recovery communication will be done
12181 + * at a time per lockspace */
12183 +static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc);
12185 +static int rcom_response(struct dlm_ls *ls)
12187 + return test_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
12191 + * rcom_send_message - send or request recovery data
12192 + * @ls: the lockspace
12193 + * @nodeid: node to which the message is sent
12194 + * @type: type of recovery message
12195 + * @rc: the rc buffer to send
12196 + * @need_reply: wait for reply if this is set
12198 + * Using this interface
12199 + * i) Allocate an rc buffer:
12200 + * rc = allocate_rcom_buffer(ls);
12201 + * ii) Copy data to send beginning at rc->rc_buf:
12202 + * memcpy(rc->rc_buf, mybuf, mylen);
12203 + * iii) Set rc->rc_datalen to the number of bytes copied in (ii):
12204 + * rc->rc_datalen = mylen
12205 + * iv) Submit the rc to this function:
12206 + * rcom_send_message(rc);
12208 + * The max value of "mylen" is dlm_config.buffer_size - sizeof(struct
12209 + * dlm_rcom). If more data must be passed in one send, use
12210 + * rcom_expand_buffer() which incrementally increases the size of the rc buffer
12211 + * by dlm_config.buffer_size bytes.
12213 + * Any data returned for the message (when need_reply is set) will be saved in
12214 + * rc->rc_buf when this function returns and rc->rc_datalen will be set to the
12215 + * number of bytes copied into rc->rc_buf.
12217 + * Returns: 0 on success, -EXXX on failure
12220 +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type,
12221 + struct dlm_rcom *rc, int need_reply)
12225 + if (!rc->rc_datalen)
12226 + rc->rc_datalen = 1;
12229 + * Fill in the header.
12232 + rc->rc_header.rh_cmd = GDLM_REMCMD_RECOVERMESSAGE;
12233 + rc->rc_header.rh_lockspace = ls->ls_global_id;
12234 + rc->rc_header.rh_length = sizeof(struct dlm_rcom) + rc->rc_datalen - 1;
12235 + rc->rc_subcmd = type;
12236 + rc->rc_msgid = ++ls->ls_rcom_msgid;
12239 + * When a reply is received, the reply data goes back into this buffer.
12240 + * Synchronous rcom requests (need_reply=1) are serialised because of
12241 + * the single ls_rcom.
12244 + if (need_reply) {
12245 + down(&ls->ls_rcom_lock);
12246 + ls->ls_rcom = rc;
12250 + * After sending the message we'll wait at the end of this function to
12251 + * get a reply. The READY flag will be set when the reply has been
12252 + * received and requested data has been copied into
12253 + * ls->ls_rcom->rc_buf;
12256 + DLM_ASSERT(!test_bit(LSFL_RECCOMM_READY, &ls->ls_flags),);
12259 + * The WAIT bit indicates that we're waiting for and willing to accept a
12260 + * reply. Any replies are ignored unless this bit is set.
12263 + set_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
12266 + * Process the message locally.
12269 + if (nodeid == our_nodeid()) {
12270 + rcom_process_message(ls, nodeid, rc);
12275 + * Send the message.
12278 + log_debug(ls, "rcom send %d to %u id %u", type, nodeid, rc->rc_msgid);
12280 + error = midcomms_send_message(nodeid, (struct dlm_header *) rc,
12282 + DLM_ASSERT(error >= 0, printk("error = %d\n", error););
12286 + * Wait for a reply. Once a reply is processed from midcomms, the
12287 + * READY bit will be set and we'll be awoken (dlm_wait_function will
12291 + if (need_reply) {
12292 + error = dlm_wait_function(ls, &rcom_response);
12294 + log_debug(ls, "rcom wait error %d", error);
12298 + clear_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags);
12299 + clear_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
12302 + up(&ls->ls_rcom_lock);
12308 + * Runs in same context as midcomms.
12311 +static void rcom_process_message(struct dlm_ls *ls, uint32_t nodeid, struct dlm_rcom *rc)
12313 + struct dlm_rcom rc_stack;
12314 + struct dlm_rcom *reply = NULL;
12315 + int status, datalen, maxlen;
12316 + uint32_t r_nodeid, be_nodeid;
12321 + if (dlm_recovery_stopped(ls) && (rc->rc_subcmd != RECCOMM_STATUS)) {
12322 + log_error(ls, "ignoring recovery message %x from %u",
12323 + rc->rc_subcmd, nodeid);
12327 + switch (rc->rc_subcmd) {
12329 + case RECCOMM_STATUS:
12331 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12332 + reply = &rc_stack;
12334 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12335 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12336 + reply->rc_subcmd = rc->rc_subcmd;
12337 + reply->rc_msgid = rc->rc_msgid;
12338 + reply->rc_buf[0] = 0;
12340 + if (test_bit(LSFL_RESDIR_VALID, &ls->ls_flags))
12341 + reply->rc_buf[0] |= RESDIR_VALID;
12343 + if (test_bit(LSFL_ALL_RESDIR_VALID, &ls->ls_flags))
12344 + reply->rc_buf[0] |= RESDIR_ALL_VALID;
12346 + if (test_bit(LSFL_NODES_VALID, &ls->ls_flags))
12347 + reply->rc_buf[0] |= NODES_VALID;
12349 + if (test_bit(LSFL_ALL_NODES_VALID, &ls->ls_flags))
12350 + reply->rc_buf[0] |= NODES_ALL_VALID;
12352 + reply->rc_datalen = 1;
12353 + reply->rc_header.rh_length =
12354 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12356 + log_debug(ls, "rcom status %x to %u", reply->rc_buf[0], nodeid);
12359 + case RECCOMM_RECOVERNAMES:
12361 + reply = allocate_rcom_buffer(ls);
12362 + DLM_ASSERT(reply,);
12363 + maxlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
12365 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12366 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12367 + reply->rc_subcmd = rc->rc_subcmd;
12368 + reply->rc_msgid = rc->rc_msgid;
12371 + * The other node wants a bunch of resource names. The name of
12372 + * the resource to begin with is in rc->rc_buf.
12375 + datalen = dlm_dir_rebuild_send(ls, rc->rc_buf, rc->rc_datalen,
12376 + reply->rc_buf, maxlen, nodeid);
12378 + reply->rc_datalen = datalen;
12379 + reply->rc_header.rh_length =
12380 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12382 + log_debug(ls, "rcom names len %d to %u id %u", datalen, nodeid,
12383 + reply->rc_msgid);
12386 + case RECCOMM_GETMASTER:
12388 + reply = allocate_rcom_buffer(ls);
12389 + DLM_ASSERT(reply,);
12391 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12392 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12393 + reply->rc_subcmd = rc->rc_subcmd;
12394 + reply->rc_msgid = rc->rc_msgid;
12397 + * The other node wants to know the master of a named resource.
12400 + status = dlm_dir_lookup(ls, nodeid, rc->rc_buf, rc->rc_datalen,
12402 + if (status != 0) {
12403 + log_all(ls, "rcom lookup error %d", status);
12404 + free_rcom_buffer(reply);
12408 + be_nodeid = cpu_to_be32(r_nodeid);
12409 + memcpy(reply->rc_buf, &be_nodeid, sizeof(uint32_t));
12410 + reply->rc_datalen = sizeof(uint32_t);
12411 + reply->rc_header.rh_length =
12412 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12415 + case RECCOMM_BULKLOOKUP:
12417 + reply = allocate_rcom_buffer(ls);
12418 + DLM_ASSERT(reply,);
12420 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12421 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12422 + reply->rc_subcmd = rc->rc_subcmd;
12423 + reply->rc_msgid = rc->rc_msgid;
12426 + * This is a bulk version of the above and just returns a
12427 + * buffer full of node ids to match the resources
12430 + datalen = bulk_master_lookup(ls, nodeid, rc->rc_buf,
12431 + rc->rc_datalen, reply->rc_buf);
12432 + if (datalen < 0) {
12433 + free_rcom_buffer(reply);
12438 + reply->rc_datalen = datalen;
12439 + reply->rc_header.rh_length =
12440 + sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12444 + * These RECCOMM messages don't need replies.
12447 + case RECCOMM_NEWLOCKS:
12448 + rebuild_rsbs_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
12451 + case RECCOMM_NEWLOCKIDS:
12452 + rebuild_rsbs_lkids_recv(ls, nodeid, rc->rc_buf, rc->rc_datalen);
12455 + case RECCOMM_REMRESDATA:
12456 + dlm_dir_remove(ls, nodeid, rc->rc_buf, rc->rc_datalen);
12460 + DLM_ASSERT(0, printk("cmd=%x\n", rc->rc_subcmd););
12464 + if (nodeid == our_nodeid()) {
12465 + DLM_ASSERT(rc == ls->ls_rcom,);
12466 + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
12467 + rc->rc_datalen = reply->rc_datalen;
12469 + midcomms_send_message(nodeid,
12470 + (struct dlm_header *) reply,
12474 + if (reply != &rc_stack)
12475 + free_rcom_buffer(reply);
12479 +static void process_reply_sync(struct dlm_ls *ls, uint32_t nodeid,
12480 + struct dlm_rcom *reply)
12482 + struct dlm_rcom *rc = ls->ls_rcom;
12484 + if (!test_bit(LSFL_RECCOMM_WAIT, &ls->ls_flags)) {
12485 + log_error(ls, "unexpected rcom reply nodeid=%u", nodeid);
12489 + if (reply->rc_msgid != le32_to_cpu(rc->rc_msgid)) {
12490 + log_error(ls, "unexpected rcom msgid %x/%x nodeid=%u",
12491 + reply->rc_msgid, le32_to_cpu(rc->rc_msgid), nodeid);
12495 + memcpy(rc->rc_buf, reply->rc_buf, reply->rc_datalen);
12496 + rc->rc_datalen = reply->rc_datalen;
12499 + * Tell the thread waiting in rcom_send_message() that it can go ahead.
12502 + set_bit(LSFL_RECCOMM_READY, &ls->ls_flags);
12503 + wake_up(&ls->ls_wait_general);
12506 +static void process_reply_async(struct dlm_ls *ls, uint32_t nodeid,
12507 + struct dlm_rcom *reply)
12509 + restbl_rsb_update_recv(ls, nodeid, reply->rc_buf, reply->rc_datalen,
12510 + reply->rc_msgid);
12514 + * Runs in same context as midcomms.
12517 +static void rcom_process_reply(struct dlm_ls *ls, uint32_t nodeid,
12518 + struct dlm_rcom *reply)
12520 + if (dlm_recovery_stopped(ls)) {
12521 + log_error(ls, "ignoring recovery reply %x from %u",
12522 + reply->rc_subcmd, nodeid);
12526 + switch (reply->rc_subcmd) {
12527 + case RECCOMM_GETMASTER:
12528 + process_reply_async(ls, nodeid, reply);
12530 + case RECCOMM_STATUS:
12531 + case RECCOMM_NEWLOCKS:
12532 + case RECCOMM_NEWLOCKIDS:
12533 + case RECCOMM_RECOVERNAMES:
12534 + process_reply_sync(ls, nodeid, reply);
12537 + log_error(ls, "unknown rcom reply subcmd=%x nodeid=%u",
12538 + reply->rc_subcmd, nodeid);
12543 +static int send_ls_not_ready(uint32_t nodeid, struct dlm_header *header)
12545 + struct writequeue_entry *wq;
12546 + struct dlm_rcom *rc = (struct dlm_rcom *) header;
12547 + struct dlm_rcom *reply;
12549 + wq = lowcomms_get_buffer(nodeid, sizeof(struct dlm_rcom), GFP_KERNEL,
12550 + (char **)&reply);
12554 + reply->rc_header.rh_cmd = GDLM_REMCMD_RECOVERREPLY;
12555 + reply->rc_header.rh_lockspace = rc->rc_header.rh_lockspace;
12556 + reply->rc_subcmd = rc->rc_subcmd;
12557 + reply->rc_msgid = rc->rc_msgid;
12558 + reply->rc_buf[0] = 0;
12560 + reply->rc_datalen = 1;
12561 + reply->rc_header.rh_length = sizeof(struct dlm_rcom) + reply->rc_datalen - 1;
12563 + midcomms_send_buffer((struct dlm_header *)reply, wq);
12569 + * Runs in same context as midcomms. Both recovery requests and recovery
12570 + * replies come through this function.
12573 +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header)
12575 + struct dlm_ls *ls = find_lockspace_by_global_id(header->rh_lockspace);
12576 + struct dlm_rcom *rc = (struct dlm_rcom *) header;
12578 + /* If the lockspace doesn't exist then still send a status message
12579 + back; it's possible that it just doesn't have its global_id yet. */
12582 + send_ls_not_ready(nodeid, header);
12586 + switch (header->rh_cmd) {
12587 + case GDLM_REMCMD_RECOVERMESSAGE:
12588 + rcom_process_message(ls, nodeid, rc);
12591 + case GDLM_REMCMD_RECOVERREPLY:
12592 + rcom_process_reply(ls, nodeid, rc);
12596 + DLM_ASSERT(0, printk("cmd=%x\n", header->rh_cmd););
12599 + put_lockspace(ls);
12602 diff -urN linux-orig/cluster/dlm/reccomms.h linux-patched/cluster/dlm/reccomms.h
12603 --- linux-orig/cluster/dlm/reccomms.h 1970-01-01 07:30:00.000000000 +0730
12604 +++ linux-patched/cluster/dlm/reccomms.h 2004-11-03 11:31:56.000000000 +0800
12606 +/******************************************************************************
12607 +*******************************************************************************
12609 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12610 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12612 +** This copyrighted material is made available to anyone wishing to use,
12613 +** modify, copy, or redistribute it subject to the terms and conditions
12614 +** of the GNU General Public License v.2.
12616 +*******************************************************************************
12617 +******************************************************************************/
12619 +#ifndef __RECCOMMS_DOT_H__
12620 +#define __RECCOMMS_DOT_H__
12624 +#define RESDIR_VALID (1)
12625 +#define RESDIR_ALL_VALID (2)
12626 +#define NODES_VALID (4)
12627 +#define NODES_ALL_VALID (8)
12629 +#define RECCOMM_STATUS (1)
12630 +#define RECCOMM_RECOVERNAMES (2)
12631 +#define RECCOMM_GETMASTER (3)
12632 +#define RECCOMM_BULKLOOKUP (4)
12633 +#define RECCOMM_NEWLOCKS (5)
12634 +#define RECCOMM_NEWLOCKIDS (6)
12635 +#define RECCOMM_REMRESDATA (7)
12637 +int rcom_send_message(struct dlm_ls *ls, uint32_t nodeid, int type,
12638 + struct dlm_rcom *rc, int need_reply);
12639 +void process_recovery_comm(uint32_t nodeid, struct dlm_header *header);
12642 diff -urN linux-orig/cluster/dlm/recover.c linux-patched/cluster/dlm/recover.c
12643 --- linux-orig/cluster/dlm/recover.c 1970-01-01 07:30:00.000000000 +0730
12644 +++ linux-patched/cluster/dlm/recover.c 2004-11-03 11:31:56.000000000 +0800
12646 +/******************************************************************************
12647 +*******************************************************************************
12649 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
12650 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
12652 +** This copyrighted material is made available to anyone wishing to use,
12653 +** modify, copy, or redistribute it subject to the terms and conditions
12654 +** of the GNU General Public License v.2.
12656 +*******************************************************************************
12657 +******************************************************************************/
12659 +#include "dlm_internal.h"
12660 +#include "reccomms.h"
12662 +#include "locking.h"
12664 +#include "lockspace.h"
12666 +#include "nodes.h"
12667 +#include "config.h"
12669 +#include "memory.h"
12672 + * Called in recovery routines to check whether the recovery process has been
12673 + * interrupted/stopped by another transition. A recovery in-process will abort
12674 + * if the lockspace is "stopped" so that a new recovery process can start from
12675 + * the beginning when the lockspace is "started" again.
12678 +int dlm_recovery_stopped(struct dlm_ls *ls)
12680 + return test_bit(LSFL_LS_STOP, &ls->ls_flags);
12683 +static void dlm_wait_timer_fn(unsigned long data)
12685 + struct dlm_ls *ls = (struct dlm_ls *) data;
12687 + wake_up(&ls->ls_wait_general);
12691 + * Wait until given function returns non-zero or lockspace is stopped (LS_STOP
12692 + * set due to failure of a node in ls_nodes). When another function thinks it
12693 + * could have completed the waited-on task, they should wake up ls_wait_general
12694 + * to get an immediate response rather than waiting for the timer to detect the
12695 + * result. A timer wakes us up periodically while waiting to see if we should
12696 + * abort due to a node failure.
12699 +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
12701 + struct timer_list timer;
12704 + init_timer(&timer);
12705 + timer.function = dlm_wait_timer_fn;
12706 + timer.data = (long) ls;
12709 + mod_timer(&timer, jiffies + (dlm_config.recover_timer * HZ));
12711 + wchan_cond_sleep_intr(ls->ls_wait_general,
12713 + !test_bit(LSFL_LS_STOP, &ls->ls_flags));
12715 + if (timer_pending(&timer))
12716 + del_timer(&timer);
12721 + if (test_bit(LSFL_LS_STOP, &ls->ls_flags)) {
12730 +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status)
12732 + struct dlm_rcom rc_stack, *rc;
12733 + struct dlm_csb *csb;
12737 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12739 + rc->rc_datalen = 0;
12741 + list_for_each_entry(csb, &ls->ls_nodes, list) {
12743 + error = dlm_recovery_stopped(ls);
12747 + error = rcom_send_message(ls, csb->node->nodeid,
12748 + RECCOMM_STATUS, rc, 1);
12752 + status = rc->rc_buf[0];
12753 + if (status & wait_status)
12756 + set_current_state(TASK_INTERRUPTIBLE);
12757 + schedule_timeout(HZ >> 1);
12766 +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status)
12768 + struct dlm_rcom rc_stack, *rc;
12769 + uint32_t nodeid = ls->ls_low_nodeid;
12773 + memset(&rc_stack, 0, sizeof(struct dlm_rcom));
12775 + rc->rc_datalen = 0;
12778 + error = dlm_recovery_stopped(ls);
12782 + error = rcom_send_message(ls, nodeid, RECCOMM_STATUS, rc, 1);
12786 + status = rc->rc_buf[0];
12787 + if (status & wait_status)
12790 + set_current_state(TASK_INTERRUPTIBLE);
12791 + schedule_timeout(HZ >> 1);
12799 +static int purge_queue(struct dlm_ls *ls, struct list_head *queue)
12801 + struct dlm_lkb *lkb, *safe;
12802 + struct dlm_rsb *rsb;
12805 + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
12806 + if (!lkb->lkb_nodeid)
12809 + DLM_ASSERT(lkb->lkb_flags & GDLM_LKFLG_MSTCPY,);
12811 + if (in_nodes_gone(ls, lkb->lkb_nodeid)) {
12812 + list_del(&lkb->lkb_statequeue);
12814 + rsb = lkb->lkb_resource;
12815 + lkb->lkb_status = 0;
12817 + if (lkb->lkb_status == GDLM_LKSTS_CONVERT
12818 + && &lkb->lkb_duetime)
12819 + remove_from_deadlockqueue(lkb);
12821 + release_lkb(ls, lkb);
12822 + release_rsb_locked(rsb);
12831 + * Go through local restbl and for each rsb we're master of, clear out any
12832 + * lkb's held by departed nodes.
12835 +int restbl_lkb_purge(struct dlm_ls *ls)
12837 + struct list_head *tmp2, *safe2;
12839 + struct dlm_rsb *rootrsb, *safe, *rsb;
12841 + log_all(ls, "purge locks of departed nodes");
12842 + down_write(&ls->ls_root_lock);
12844 + list_for_each_entry_safe(rootrsb, safe, &ls->ls_rootres, res_rootlist) {
12846 + if (rootrsb->res_nodeid)
12849 + hold_rsb(rootrsb);
12850 + down_write(&rootrsb->res_lock);
12852 + /* This traverses the subreslist in reverse order so we purge
12853 + * the children before their parents. */
12855 + for (tmp2 = rootrsb->res_subreslist.prev, safe2 = tmp2->prev;
12856 + tmp2 != &rootrsb->res_subreslist;
12857 + tmp2 = safe2, safe2 = safe2->prev) {
12858 + rsb = list_entry(tmp2, struct dlm_rsb, res_subreslist);
12861 + purge_queue(ls, &rsb->res_grantqueue);
12862 + purge_queue(ls, &rsb->res_convertqueue);
12863 + purge_queue(ls, &rsb->res_waitqueue);
12864 + release_rsb_locked(rsb);
12866 + count += purge_queue(ls, &rootrsb->res_grantqueue);
12867 + count += purge_queue(ls, &rootrsb->res_convertqueue);
12868 + count += purge_queue(ls, &rootrsb->res_waitqueue);
12870 + up_write(&rootrsb->res_lock);
12871 + release_rsb_locked(rootrsb);
12874 + up_write(&ls->ls_root_lock);
12875 + log_all(ls, "purged %d locks", count);
12881 + * Grant any locks that have become grantable after a purge
12884 +int restbl_grant_after_purge(struct dlm_ls *ls)
12886 + struct dlm_rsb *root, *rsb, *safe;
12889 + down_read(&ls->ls_root_lock);
12891 + list_for_each_entry_safe(root, safe, &ls->ls_rootres, res_rootlist) {
12892 + /* only the rsb master grants locks */
12893 + if (root->res_nodeid)
12896 + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
12897 + log_debug(ls, "restbl_grant_after_purge aborted");
12899 + up_read(&ls->ls_root_lock);
12903 + down_write(&root->res_lock);
12904 + grant_pending_locks(root);
12905 + up_write(&root->res_lock);
12907 + list_for_each_entry(rsb, &root->res_subreslist, res_subreslist){
12908 + down_write(&rsb->res_lock);
12909 + grant_pending_locks(rsb);
12910 + up_write(&rsb->res_lock);
12913 + up_read(&ls->ls_root_lock);
12920 + * Set the lock master for all LKBs in a lock queue
12923 +static void set_lock_master(struct list_head *queue, int nodeid)
12925 + struct dlm_lkb *lkb;
12927 + list_for_each_entry(lkb, queue, lkb_statequeue) {
12928 + /* Don't muck around with pre-existing sublocks */
12929 + if (!(lkb->lkb_flags & GDLM_LKFLG_MSTCPY))
12930 + lkb->lkb_nodeid = nodeid;
12934 +static void set_master_lkbs(struct dlm_rsb *rsb)
12936 + set_lock_master(&rsb->res_grantqueue, rsb->res_nodeid);
12937 + set_lock_master(&rsb->res_convertqueue, rsb->res_nodeid);
12938 + set_lock_master(&rsb->res_waitqueue, rsb->res_nodeid);
12942 + * This rsb struct is now the master so it is responsible for keeping the
12943 + * latest rsb. Find if any current lkb's have an up to date copy of the lvb to
12944 + * be used as the rsb copy. An equivalent step occurs as new lkb's arrive for
12945 + * this rsb in deserialise_lkb.
12948 +static void set_rsb_lvb(struct dlm_rsb *rsb)
12950 + struct dlm_lkb *lkb;
12952 + list_for_each_entry(lkb, &rsb->res_grantqueue, lkb_statequeue) {
12954 + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12955 + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12956 + (lkb->lkb_grmode > DLM_LOCK_NL))
12958 + if (!rsb->res_lvbptr)
12959 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12961 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12966 + list_for_each_entry(lkb, &rsb->res_convertqueue, lkb_statequeue) {
12968 + if (!(lkb->lkb_flags & GDLM_LKFLG_DELETED) &&
12969 + (lkb->lkb_flags & GDLM_LKFLG_VALBLK) &&
12970 + (lkb->lkb_grmode > DLM_LOCK_NL))
12972 + if (!rsb->res_lvbptr)
12973 + rsb->res_lvbptr = allocate_lvb(rsb->res_ls);
12975 + memcpy(rsb->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
12982 + * Propagate the new master nodeid to locks, subrsbs, sublocks.
12983 + * The NEW_MASTER flag tells rebuild_rsbs_send() which rsb's to consider.
12986 +static void set_new_master(struct dlm_rsb *rsb, uint32_t nodeid)
12988 + struct dlm_rsb *subrsb;
12990 + down_write(&rsb->res_lock);
12992 + if (nodeid == our_nodeid()) {
12993 + set_bit(RESFL_MASTER, &rsb->res_flags);
12994 + rsb->res_nodeid = 0;
12995 + set_rsb_lvb(rsb);
12997 + rsb->res_nodeid = nodeid;
12999 + set_master_lkbs(rsb);
13001 + list_for_each_entry(subrsb, &rsb->res_subreslist, res_subreslist) {
13002 + subrsb->res_nodeid = rsb->res_nodeid;
13003 + set_master_lkbs(subrsb);
13006 + up_write(&rsb->res_lock);
13008 + set_bit(RESFL_NEW_MASTER, &rsb->res_flags);
13012 + * The recover_list contains all the rsb's for which we've requested the new
13013 + * master nodeid. As replies are returned from the resource directories the
13014 + * rsb's are removed from the list. When the list is empty we're done.
13016 + * The recover_list is later similarly used for all rsb's for which we've sent
13017 + * new lkb's and need to receive new corresponding lkid's.
13020 +int recover_list_empty(struct dlm_ls *ls)
13024 + spin_lock(&ls->ls_recover_list_lock);
13025 + empty = list_empty(&ls->ls_recover_list);
13026 + spin_unlock(&ls->ls_recover_list_lock);
13031 +int recover_list_count(struct dlm_ls *ls)
13035 + spin_lock(&ls->ls_recover_list_lock);
13036 + count = ls->ls_recover_list_count;
13037 + spin_unlock(&ls->ls_recover_list_lock);
13042 +void recover_list_add(struct dlm_rsb *rsb)
13044 + struct dlm_ls *ls = rsb->res_ls;
13046 + spin_lock(&ls->ls_recover_list_lock);
13047 + if (!test_and_set_bit(RESFL_RECOVER_LIST, &rsb->res_flags)) {
13048 + list_add_tail(&rsb->res_recover_list, &ls->ls_recover_list);
13049 + ls->ls_recover_list_count++;
13052 + spin_unlock(&ls->ls_recover_list_lock);
13055 +void recover_list_del(struct dlm_rsb *rsb)
13057 + struct dlm_ls *ls = rsb->res_ls;
13059 + spin_lock(&ls->ls_recover_list_lock);
13060 + clear_bit(RESFL_RECOVER_LIST, &rsb->res_flags);
13061 + list_del(&rsb->res_recover_list);
13062 + ls->ls_recover_list_count--;
13063 + spin_unlock(&ls->ls_recover_list_lock);
13065 + release_rsb(rsb);
13068 +static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, int msgid)
13070 + struct dlm_rsb *rsb = NULL;
13072 + spin_lock(&ls->ls_recover_list_lock);
13074 + list_for_each_entry(rsb, &ls->ls_recover_list, res_recover_list) {
13075 + if (rsb->res_recover_msgid == msgid)
13081 + spin_unlock(&ls->ls_recover_list_lock);
13085 +static int rsb_master_lookup(struct dlm_rsb *rsb, struct dlm_rcom *rc)
13087 + struct dlm_ls *ls = rsb->res_ls;
13088 + uint32_t dir_nodeid, r_nodeid;
13091 + dir_nodeid = get_directory_nodeid(rsb);
13093 + if (dir_nodeid == our_nodeid()) {
13094 + error = dlm_dir_lookup(ls, dir_nodeid, rsb->res_name,
13095 + rsb->res_length, &r_nodeid);
13096 + if (error == -EEXIST) {
13097 + log_all(ls, "rsb_master_lookup %u EEXIST %s",
13098 + r_nodeid, rsb->res_name);
13099 + } else if (error)
13102 + set_new_master(rsb, r_nodeid);
13104 + /* As we are the only thread doing recovery this
13105 + should be safe. If not then we need to use a different
13106 + ID somehow. We must set it in the RSB before rcom_send_msg
13107 + completes because we may get a reply quite quickly.
13109 + rsb->res_recover_msgid = ls->ls_rcom_msgid + 1;
13111 + recover_list_add(rsb);
13113 + memcpy(rc->rc_buf, rsb->res_name, rsb->res_length);
13114 + rc->rc_datalen = rsb->res_length;
13116 + error = rcom_send_message(ls, dir_nodeid, RECCOMM_GETMASTER,
13126 +static int needs_update(struct dlm_ls *ls, struct dlm_rsb *r)
13128 + if (!r->res_nodeid)
13131 + if (r->res_nodeid == -1)
13134 + if (in_nodes_gone(ls, r->res_nodeid))
13141 + * Go through local root resources and for each rsb which has a master which
13142 + * has departed, get the new master nodeid from the resdir. The resdir will
13143 + * assign mastery to the first node to look up the new master. That means
13144 + * we'll discover in this lookup if we're the new master of any rsb's.
13146 + * We fire off all the resdir requests individually and asynchronously to the
13147 + * correct resdir node. The replies are processed in rsb_master_recv().
13150 +int restbl_rsb_update(struct dlm_ls *ls)
13152 + struct dlm_rsb *rsb, *safe;
13153 + struct dlm_rcom *rc;
13154 + int error = -ENOMEM;
13157 + log_all(ls, "update remastered resources");
13159 + rc = allocate_rcom_buffer(ls);
13163 + down_read(&ls->ls_root_lock);
13165 + list_for_each_entry_safe(rsb, safe, &ls->ls_rootres, res_rootlist) {
13166 + error = dlm_recovery_stopped(ls);
13168 + up_read(&ls->ls_root_lock);
13172 + if (needs_update(ls, rsb)) {
13173 + error = rsb_master_lookup(rsb, rc);
13175 + up_read(&ls->ls_root_lock);
13181 + up_read(&ls->ls_root_lock);
13183 + error = dlm_wait_function(ls, &recover_list_empty);
13185 + log_all(ls, "updated %d resources", count);
13187 + free_rcom_buffer(rc);
13192 +int restbl_rsb_update_recv(struct dlm_ls *ls, uint32_t nodeid, char *buf,
13193 + int length, int msgid)
13195 + struct dlm_rsb *rsb;
13196 + uint32_t be_nodeid;
13198 + rsb = recover_list_find(ls, msgid);
13200 + log_error(ls, "restbl_rsb_update_recv rsb not found %d", msgid);
13204 + memcpy(&be_nodeid, buf, sizeof(uint32_t));
13205 + set_new_master(rsb, be32_to_cpu(be_nodeid));
13206 + recover_list_del(rsb);
13208 + if (recover_list_empty(ls))
13209 + wake_up(&ls->ls_wait_general);
13216 + * This function is no longer used.
13219 +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen,
13222 + char *inbufptr, *outbufptr;
13225 + * The other node wants nodeids matching the resource names in inbuf.
13226 + * The resource names are packed into inbuf as
13227 + * [len1][name1][len2][name2]... where lenX is 1 byte and nameX is
13228 + * lenX bytes. Matching nodeids are packed into outbuf in order
13229 + * [nodeid1][nodeid2]...
13232 + inbufptr = inbuf;
13233 + outbufptr = outbuf;
13235 + while (inbufptr < inbuf + inlen) {
13236 + uint32_t r_nodeid, be_nodeid;
13239 + status = dlm_dir_lookup(ls, nodeid, inbufptr + 1, *inbufptr,
13244 + inbufptr += *inbufptr + 1;
13246 + be_nodeid = cpu_to_be32(r_nodeid);
13247 + memcpy(outbufptr, &be_nodeid, sizeof(uint32_t));
13248 + outbufptr += sizeof(uint32_t);
13250 + /* add assertion that outbufptr - outbuf is not > than ... */
13253 + return (outbufptr - outbuf);
13257 diff -urN linux-orig/cluster/dlm/recover.h linux-patched/cluster/dlm/recover.h
13258 --- linux-orig/cluster/dlm/recover.h 1970-01-01 07:30:00.000000000 +0730
13259 +++ linux-patched/cluster/dlm/recover.h 2004-11-03 11:31:56.000000000 +0800
13261 +/******************************************************************************
13262 +*******************************************************************************
13264 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13265 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13267 +** This copyrighted material is made available to anyone wishing to use,
13268 +** modify, copy, or redistribute it subject to the terms and conditions
13269 +** of the GNU General Public License v.2.
13271 +*******************************************************************************
13272 +******************************************************************************/
13274 +#ifndef __RECOVER_DOT_H__
13275 +#define __RECOVER_DOT_H__
13277 +int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls * ls));
13278 +int dlm_wait_status_all(struct dlm_ls *ls, unsigned int wait_status);
13279 +int dlm_wait_status_low(struct dlm_ls *ls, unsigned int wait_status);
13280 +int dlm_recovery_stopped(struct dlm_ls *ls);
13281 +int recover_list_empty(struct dlm_ls *ls);
13282 +int recover_list_count(struct dlm_ls *ls);
13283 +void recover_list_add(struct dlm_rsb *rsb);
13284 +void recover_list_del(struct dlm_rsb *rsb);
13285 +int restbl_lkb_purge(struct dlm_ls *ls);
13286 +void restbl_grant_after_purge(struct dlm_ls *ls);
13287 +int restbl_rsb_update(struct dlm_ls *ls);
13288 +int restbl_rsb_update_recv(struct dlm_ls *ls, int nodeid, char *buf, int len,
13290 +int bulk_master_lookup(struct dlm_ls *ls, int nodeid, char *inbuf, int inlen,
13293 +#endif /* __RECOVER_DOT_H__ */
13294 diff -urN linux-orig/cluster/dlm/recoverd.c linux-patched/cluster/dlm/recoverd.c
13295 --- linux-orig/cluster/dlm/recoverd.c 1970-01-01 07:30:00.000000000 +0730
13296 +++ linux-patched/cluster/dlm/recoverd.c 2004-11-03 11:31:56.000000000 +0800
13298 +/******************************************************************************
13299 +*******************************************************************************
13301 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
13302 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
13304 +** This copyrighted material is made available to anyone wishing to use,
13305 +** modify, copy, or redistribute it subject to the terms and conditions
13306 +** of the GNU General Public License v.2.
13308 +*******************************************************************************
13309 +******************************************************************************/
13311 +#include "dlm_internal.h"
13312 +#include "nodes.h"
13315 +#include "recover.h"
13316 +#include "lockspace.h"
13317 +#include "lowcomms.h"
13318 +#include "lockqueue.h"
13320 +#include "rebuild.h"
13323 + * next_move actions
13326 +#define DO_STOP (1)
13327 +#define DO_START (2)
13328 +#define DO_FINISH (3)
13329 +#define DO_FINISH_STOP (4)
13330 +#define DO_FINISH_START (5)
13333 + * Queue of lockspaces (dlm_recover structs) which need to be
13334 + * started/recovered
13337 +static int enable_locking(struct dlm_ls *ls, int event_id)
13341 + spin_lock(&ls->ls_recover_lock);
13342 + if (ls->ls_last_stop < event_id) {
13343 + set_bit(LSFL_LS_RUN, &ls->ls_flags);
13344 + up_write(&ls->ls_in_recovery);
13347 + log_debug(ls, "enable_locking: abort %d", event_id);
13349 + spin_unlock(&ls->ls_recover_lock);
13353 +static int ls_first_start(struct dlm_ls *ls, struct dlm_recover *rv)
13357 + log_all(ls, "recover event %u (first)", rv->event_id);
13359 + kcl_global_service_id(ls->ls_local_id, &ls->ls_global_id);
13361 + error = ls_nodes_init(ls, rv);
13363 + log_error(ls, "nodes_init failed %d", error);
13367 + error = dlm_dir_rebuild_local(ls);
13369 + log_error(ls, "dlm_dir_rebuild_local failed %d", error);
13373 + error = dlm_dir_rebuild_wait(ls);
13375 + log_error(ls, "dlm_dir_rebuild_wait failed %d", error);
13379 + log_all(ls, "recover event %u done", rv->event_id);
13380 + kcl_start_done(ls->ls_local_id, rv->event_id);
13387 + * We are given here a new group of nodes which are in the lockspace. We first
13388 + * figure out the differences in ls membership from when we were last running.
13389 + * If nodes from before are gone, then there will be some lock recovery to do.
13390 + * If there are only nodes which have joined, then there's no lock recovery.
13392 + * note: cman requires an rc to finish starting on an revent (where nodes die)
13393 + * before it allows an sevent (where nodes join) to be processed. This means
13394 + * that we won't get start1 with nodeA gone, stop/cancel, start2 with nodeA
13398 +static int ls_reconfig(struct dlm_ls *ls, struct dlm_recover *rv)
13400 + int error, neg = 0;
13402 + log_all(ls, "recover event %u", rv->event_id);
13405 + * this list may be left over from a previous aborted recovery
13408 + rebuild_freemem(ls);
13411 + * Add or remove nodes from the lockspace's ls_nodes list.
13414 + error = ls_nodes_reconfig(ls, rv, &neg);
13416 + log_error(ls, "nodes_reconfig failed %d", error);
13421 + * Rebuild our own share of the resdir by collecting from all other
13422 + * nodes rsb name/master pairs for which the name hashes to us.
13425 + error = dlm_dir_rebuild_local(ls);
13427 + log_error(ls, "dlm_dir_rebuild_local failed %d", error);
13432 + * Purge resdir-related requests that are being held in requestqueue.
13433 + * All resdir requests from before recovery started are invalid now due
13434 + * to the resdir rebuild and will be resent by the requesting nodes.
13437 + purge_requestqueue(ls);
13438 + set_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
13441 + * Wait for all nodes to complete resdir rebuild.
13444 + error = dlm_dir_rebuild_wait(ls);
13446 + log_error(ls, "dlm_dir_rebuild_wait failed %d", error);
13451 + * Mark our own lkb's waiting in the lockqueue for remote replies from
13452 + * nodes that are now departed. These will be resent to the new
13453 + * masters in resend_cluster_requests. Also mark resdir lookup
13454 + * requests for resending.
13457 + lockqueue_lkb_mark(ls);
13459 + error = dlm_recovery_stopped(ls);
13465 + * Clear lkb's for departed nodes. This can't fail since it
13466 + * doesn't involve communicating with other nodes.
13469 + restbl_lkb_purge(ls);
13472 + * Get new master id's for rsb's of departed nodes. This fails
13473 + * if we can't communicate with other nodes.
13476 + error = restbl_rsb_update(ls);
13478 + log_error(ls, "restbl_rsb_update failed %d", error);
13483 + * Send our lkb info to new masters. This fails if we can't
13484 + * communicate with a node.
13487 + error = rebuild_rsbs_send(ls);
13489 + log_error(ls, "rebuild_rsbs_send failed %d", error);
13494 + clear_bit(LSFL_REQUEST_WARN, &ls->ls_flags);
13496 + log_all(ls, "recover event %u done", rv->event_id);
13497 + kcl_start_done(ls->ls_local_id, rv->event_id);
13501 + log_all(ls, "recover event %d error %d", rv->event_id, error);
13505 +static void clear_finished_nodes(struct dlm_ls *ls, int finish_event)
13507 + struct dlm_csb *csb, *safe;
13509 + list_for_each_entry_safe(csb, safe, &ls->ls_nodes_gone, list) {
13510 + if (csb->gone_event <= finish_event) {
13511 + list_del(&csb->list);
13512 + release_csb(csb);
13518 + * Between calls to this routine for a ls, there can be multiple stop/start
13519 + * events from cman where every start but the latest is cancelled by stops.
13520 + * There can only be a single finish from cman because every finish requires us
13521 + * to call start_done. A single finish event could be followed by multiple
13522 + * stop/start events. This routine takes any combination of events from cman
13523 + * and boils them down to one course of action.
13526 +static int next_move(struct dlm_ls *ls, struct dlm_recover **rv_out,
13529 + LIST_HEAD(events);
13530 + unsigned int cmd = 0, stop, start, finish;
13531 + unsigned int last_stop, last_start, last_finish;
13532 + struct dlm_recover *rv = NULL, *start_rv = NULL;
13535 + * Grab the current state of cman/sm events.
13538 + spin_lock(&ls->ls_recover_lock);
13540 + stop = test_and_clear_bit(LSFL_LS_STOP, &ls->ls_flags) ? 1 : 0;
13541 + start = test_and_clear_bit(LSFL_LS_START, &ls->ls_flags) ? 1 : 0;
13542 + finish = test_and_clear_bit(LSFL_LS_FINISH, &ls->ls_flags) ? 1 : 0;
13544 + last_stop = ls->ls_last_stop;
13545 + last_start = ls->ls_last_start;
13546 + last_finish = ls->ls_last_finish;
13548 + while (!list_empty(&ls->ls_recover)) {
13549 + rv = list_entry(ls->ls_recover.next, struct dlm_recover, list);
13550 + list_del(&rv->list);
13551 + list_add_tail(&rv->list, &events);
13555 + * There are two cases where we need to adjust these event values:
13556 + * 1. - we get a first start
13557 + * - we get a stop
13558 + * - we process the start + stop here and notice this special case
13560 + * 2. - we get a first start
13561 + * - we process the start
13562 + * - we get a stop
13563 + * - we process the stop here and notice this special case
13565 + * In both cases, the first start we received was aborted by a
13566 + * stop before we received a finish. last_finish being zero is the
13567 + * indication that this is the "first" start, i.e. we've not yet
13568 + * finished a start; if we had, last_finish would be non-zero.
13569 + * Part of the problem arises from the fact that when we initially
13570 + * get start/stop/start, SM uses the same event id for both starts
13571 + * (since the first was cancelled).
13573 + * In both cases, last_start and last_stop will be equal.
13574 + * In both cases, finish=0.
13575 + * In the first case start=1 && stop=1.
13576 + * In the second case start=0 && stop=1.
13578 + * In both cases, we need to make adjustments to values so:
13579 + * - we process the current event (now) as a normal stop
13580 + * - the next start we receive will be processed normally
13581 + * (taking into account the assertions below)
13583 + * In the first case, dlm_ls_start() will have printed the
13584 + * "repeated start" warning.
13586 + * In the first case we need to get rid of the recover event struct.
13588 + * - set stop=1, start=0, finish=0 for case 4 below
13589 + * - last_stop and last_start must be set equal per the case 4 assert
13590 + * - ls_last_stop = 0 so the next start will be larger
13591 + * - ls_last_start = 0 not really necessary (avoids dlm_ls_start print)
13594 + if (!last_finish && (last_start == last_stop)) {
13595 + log_all(ls, "move reset %u,%u,%u ids %u,%u,%u", stop,
13596 + start, finish, last_stop, last_start, last_finish);
13602 + ls->ls_last_stop = 0;
13603 + ls->ls_last_start = 0;
13605 + while (!list_empty(&events)) {
13606 + rv = list_entry(events.next, struct dlm_recover, list);
13607 + list_del(&rv->list);
13608 + kfree(rv->nodeids);
13612 + spin_unlock(&ls->ls_recover_lock);
13614 + log_debug(ls, "move flags %u,%u,%u ids %u,%u,%u", stop, start, finish,
13615 + last_stop, last_start, last_finish);
13618 + * Toss start events which have since been cancelled.
13621 + while (!list_empty(&events)) {
13622 + DLM_ASSERT(start,);
13623 + rv = list_entry(events.next, struct dlm_recover, list);
13624 + list_del(&rv->list);
13626 + if (rv->event_id <= last_stop) {
13627 + log_debug(ls, "move skip event %u", rv->event_id);
13628 + kfree(rv->nodeids);
13632 + log_debug(ls, "move use event %u", rv->event_id);
13633 + DLM_ASSERT(!start_rv,);
13639 + * Eight possible combinations of events.
13643 + if (!stop && !start && !finish) {
13644 + DLM_ASSERT(!start_rv,);
13650 + if (!stop && !start && finish) {
13651 + DLM_ASSERT(!start_rv,);
13652 + DLM_ASSERT(last_start > last_stop,);
13653 + DLM_ASSERT(last_finish == last_start,);
13655 + *finish_out = last_finish;
13660 + if (!stop && start && !finish) {
13661 + DLM_ASSERT(start_rv,);
13662 + DLM_ASSERT(last_start > last_stop,);
13664 + *rv_out = start_rv;
13669 + if (!stop && start && finish) {
13670 + DLM_ASSERT(0, printk("finish and start with no stop\n"););
13674 + if (stop && !start && !finish) {
13675 + DLM_ASSERT(!start_rv,);
13676 + DLM_ASSERT(last_start == last_stop,);
13682 + if (stop && !start && finish) {
13683 + DLM_ASSERT(!start_rv,);
13684 + DLM_ASSERT(last_finish == last_start,);
13685 + DLM_ASSERT(last_stop == last_start,);
13686 + cmd = DO_FINISH_STOP;
13687 + *finish_out = last_finish;
13692 + if (stop && start && !finish) {
13694 + DLM_ASSERT(last_start > last_stop,);
13696 + *rv_out = start_rv;
13698 + DLM_ASSERT(last_stop == last_start,);
13705 + if (stop && start && finish) {
13707 + DLM_ASSERT(last_start > last_stop,);
13708 + DLM_ASSERT(last_start > last_finish,);
13709 + cmd = DO_FINISH_START;
13710 + *finish_out = last_finish;
13711 + *rv_out = start_rv;
13713 + DLM_ASSERT(last_start == last_stop,);
13714 + DLM_ASSERT(last_start > last_finish,);
13715 + cmd = DO_FINISH_STOP;
13716 + *finish_out = last_finish;
13726 + * This function decides what to do given every combination of current
13727 + * lockspace state and next lockspace state.
13730 +static void do_ls_recovery(struct dlm_ls *ls)
13732 + struct dlm_recover *rv = NULL;
13733 + int error, cur_state, next_state = 0, do_now, finish_event = 0;
13735 + do_now = next_move(ls, &rv, &finish_event);
13739 + cur_state = ls->ls_state;
13742 + DLM_ASSERT(!test_bit(LSFL_LS_RUN, &ls->ls_flags),
13743 + log_error(ls, "curstate=%d donow=%d", cur_state, do_now););
13746 + * LSST_CLEAR - we're not in any recovery state. We can get a stop or
13747 + * a stop and start which equates with a START.
13750 + if (cur_state == LSST_CLEAR) {
13751 + switch (do_now) {
13753 + next_state = LSST_WAIT_START;
13757 + error = ls_reconfig(ls, rv);
13759 + next_state = LSST_WAIT_START;
13761 + next_state = LSST_RECONFIG_DONE;
13764 + case DO_FINISH: /* invalid */
13765 + case DO_FINISH_STOP: /* invalid */
13766 + case DO_FINISH_START: /* invalid */
13774 + * LSST_WAIT_START - we're not running because of getting a stop or
13775 + * failing a start. We wait in this state for another stop/start or
13776 + * just the next start to begin another reconfig attempt.
13779 + if (cur_state == LSST_WAIT_START) {
13780 + switch (do_now) {
13785 + error = ls_reconfig(ls, rv);
13787 + next_state = LSST_WAIT_START;
13789 + next_state = LSST_RECONFIG_DONE;
13792 + case DO_FINISH: /* invalid */
13793 + case DO_FINISH_STOP: /* invalid */
13794 + case DO_FINISH_START: /* invalid */
13802 + * LSST_RECONFIG_DONE - we entered this state after successfully
13803 + * completing ls_reconfig and calling kcl_start_done. We expect to get
13804 + * a finish if everything goes ok. A finish could be followed by stop
13805 + * or stop/start before we get here to check it. Or a finish may never
13806 + * happen, only stop or stop/start.
13809 + if (cur_state == LSST_RECONFIG_DONE) {
13810 + switch (do_now) {
13812 + rebuild_freemem(ls);
13814 + clear_finished_nodes(ls, finish_event);
13815 + next_state = LSST_CLEAR;
13817 + error = enable_locking(ls, finish_event);
13821 + error = process_requestqueue(ls);
13825 + error = resend_cluster_requests(ls);
13829 + restbl_grant_after_purge(ls);
13831 + log_all(ls, "recover event %u finished", finish_event);
13835 + next_state = LSST_WAIT_START;
13838 + case DO_FINISH_STOP:
13839 + clear_finished_nodes(ls, finish_event);
13840 + next_state = LSST_WAIT_START;
13843 + case DO_FINISH_START:
13844 + clear_finished_nodes(ls, finish_event);
13845 + /* fall into DO_START */
13848 + error = ls_reconfig(ls, rv);
13850 + next_state = LSST_WAIT_START;
13852 + next_state = LSST_RECONFIG_DONE;
13862 + * LSST_INIT - state after ls is created and before it has been
13863 + * started. A start operation will cause the ls to be started for the
13864 + * first time. A failed start will cause it to just wait in INIT for
13865 + * another stop/start.
13868 + if (cur_state == LSST_INIT) {
13869 + switch (do_now) {
13871 + error = ls_first_start(ls, rv);
13873 + next_state = LSST_INIT_DONE;
13879 + case DO_FINISH: /* invalid */
13880 + case DO_FINISH_STOP: /* invalid */
13881 + case DO_FINISH_START: /* invalid */
13889 + * LSST_INIT_DONE - after the first start operation is completed
13890 + * successfully and kcl_start_done() called. If there are no errors, a
13891 + * finish will arrive next and we'll move to LSST_CLEAR.
13894 + if (cur_state == LSST_INIT_DONE) {
13895 + switch (do_now) {
13897 + case DO_FINISH_STOP:
13898 + next_state = LSST_WAIT_START;
13902 + case DO_FINISH_START:
13903 + error = ls_reconfig(ls, rv);
13905 + next_state = LSST_WAIT_START;
13907 + next_state = LSST_RECONFIG_DONE;
13911 + next_state = LSST_CLEAR;
13913 + enable_locking(ls, finish_event);
13915 + process_requestqueue(ls);
13917 + log_all(ls, "recover event %u finished", finish_event);
13928 + ls->ls_state = next_state;
13931 + kfree(rv->nodeids);
13936 +int dlm_recoverd(void *arg)
13938 + struct dlm_ls *ls = arg;
13940 + hold_lockspace(ls);
13943 + set_current_state(TASK_INTERRUPTIBLE);
13944 + if (!test_bit(LSFL_WORK, &ls->ls_flags))
13946 + set_current_state(TASK_RUNNING);
13948 + if (test_bit(LSFL_RECOVERD_EXIT, &ls->ls_flags)) {
13949 + down(&ls->ls_recoverd_lock);
13950 + ls->ls_recoverd_task = NULL;
13951 + up(&ls->ls_recoverd_lock);
13955 + if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags)) {
13956 + do_ls_recovery(ls);
13958 + down(&ls->ls_recoverd_lock);
13959 + if (ls->ls_state == LSST_CLEAR &&
13960 + !test_bit(LSFL_WORK, &ls->ls_flags)) {
13961 + ls->ls_recoverd_task = NULL;
13962 + up(&ls->ls_recoverd_lock);
13965 + up(&ls->ls_recoverd_lock);
13970 + put_lockspace(ls);
13974 +void dlm_recoverd_kick(struct dlm_ls *ls)
13976 + struct task_struct *p;
13978 + down(&ls->ls_recoverd_lock);
13979 + set_bit(LSFL_WORK, &ls->ls_flags);
13981 + if (!ls->ls_recoverd_task) {
13982 + p = kthread_run(dlm_recoverd, (void *) ls, 0, "dlm_recoverd");
13984 + log_error(ls, "can't start dlm_recoverd %ld",
13988 + ls->ls_recoverd_task = p;
13990 + wake_up_process(ls->ls_recoverd_task);
13992 + up(&ls->ls_recoverd_lock);
13995 +void dlm_recoverd_stop(struct dlm_ls *ls)
13997 + set_bit(LSFL_RECOVERD_EXIT, &ls->ls_flags);
14000 + down(&ls->ls_recoverd_lock);
14001 + if (!ls->ls_recoverd_task) {
14002 + up(&ls->ls_recoverd_lock);
14005 + wake_up_process(ls->ls_recoverd_task);
14006 + up(&ls->ls_recoverd_lock);
14011 diff -urN linux-orig/cluster/dlm/recoverd.h linux-patched/cluster/dlm/recoverd.h
14012 --- linux-orig/cluster/dlm/recoverd.h 1970-01-01 07:30:00.000000000 +0730
14013 +++ linux-patched/cluster/dlm/recoverd.h 2004-11-03 11:31:56.000000000 +0800
14015 +/******************************************************************************
14016 +*******************************************************************************
14018 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14019 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14021 +** This copyrighted material is made available to anyone wishing to use,
14022 +** modify, copy, or redistribute it subject to the terms and conditions
14023 +** of the GNU General Public License v.2.
14025 +*******************************************************************************
14026 +******************************************************************************/
14028 +#ifndef __RECOVERD_DOT_H__
14029 +#define __RECOVERD_DOT_H__
14031 +int dlm_recoverd(void *arg);
14032 +void dlm_recoverd_kick(struct dlm_ls *ls);
14033 +void dlm_recoverd_stop(struct dlm_ls *ls);
14035 +#endif /* __RECOVERD_DOT_H__ */
14036 diff -urN linux-orig/cluster/dlm/rsb.c linux-patched/cluster/dlm/rsb.c
14037 --- linux-orig/cluster/dlm/rsb.c 1970-01-01 07:30:00.000000000 +0730
14038 +++ linux-patched/cluster/dlm/rsb.c 2004-11-03 11:31:56.000000000 +0800
14040 +/******************************************************************************
14041 +*******************************************************************************
14043 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14044 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14046 +** This copyrighted material is made available to anyone wishing to use,
14047 +** modify, copy, or redistribute it subject to the terms and conditions
14048 +** of the GNU General Public License v.2.
14050 +*******************************************************************************
14051 +******************************************************************************/
14053 +#include "dlm_internal.h"
14054 +#include "locking.h"
14055 +#include "memory.h"
14056 +#include "lockqueue.h"
14057 +#include "nodes.h"
14062 +static struct dlm_rsb *search_hashchain(struct list_head *head,
14063 + struct dlm_rsb *parent,
14064 + char *name, int namelen)
14066 + struct dlm_rsb *r;
14068 + list_for_each_entry(r, head, res_hashchain) {
14069 + if ((parent == r->res_parent) && (namelen == r->res_length) &&
14070 + (memcmp(name, r->res_name, namelen) == 0)) {
14079 + * A way to arbitrarily hold onto an rsb to which we already hold a reference,
14080 + * to make sure it doesn't go away. Opposite of release_rsb().
14083 +void hold_rsb(struct dlm_rsb *r)
14085 + atomic_inc(&r->res_ref);
14089 + * release_rsb() - Decrement reference count on rsb struct. Free the rsb
14090 + * struct when there are zero references. Every lkb for the rsb adds a
14091 + * reference. When ref is zero there can be no more lkb's for the rsb, on the
14092 + * queue's or anywhere else.
14095 +static void _release_rsb(struct dlm_rsb *r, int locked)
14097 + struct dlm_ls *ls = r->res_ls;
14099 + int removed = FALSE;
14101 + write_lock(&ls->ls_rsbtbl[r->res_bucket].lock);
14102 + if (atomic_dec_and_test(&r->res_ref)) {
14103 + DLM_ASSERT(list_empty(&r->res_grantqueue), print_rsb(r););
14104 + DLM_ASSERT(list_empty(&r->res_waitqueue), print_rsb(r););
14105 + DLM_ASSERT(list_empty(&r->res_convertqueue), print_rsb(r););
14107 + list_del(&r->res_hashchain);
14109 + write_unlock(&ls->ls_rsbtbl[r->res_bucket].lock);
14115 + down_write(&ls->ls_root_lock);
14116 + if (r->res_parent)
14117 + list_del(&r->res_subreslist);
14119 + list_del(&r->res_rootlist);
14121 + up_write(&ls->ls_root_lock);
14123 + if (r->res_parent || !test_bit(RESFL_MASTER, &r->res_flags))
14126 + nodeid = get_directory_nodeid(r);
14128 + if (nodeid != our_nodeid())
14129 + remote_remove_direntry(ls, nodeid, r->res_name, r->res_length);
14131 + dlm_dir_remove(ls, nodeid, r->res_name, r->res_length);
14133 + if (r->res_lvbptr)
14134 + free_lvb(r->res_lvbptr);
14139 +void release_rsb(struct dlm_rsb *r)
14141 + _release_rsb(r, 0);
14144 +void release_rsb_locked(struct dlm_rsb *r)
14146 + _release_rsb(r, 1);
14149 +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb)
14151 + struct dlm_rsb *r = lkb->lkb_resource;
14156 + * find_or_create_rsb() - Get an rsb struct, or create one if it doesn't exist.
14157 + * If the rsb exists, its ref count is incremented by this function. If it
14158 + * doesn't exist, it's created with a ref count of one.
14161 +int find_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name, int len,
14162 + int flags, struct dlm_rsb **rp)
14165 + struct dlm_rsb *r, *tmp;
14166 + int error = -ENOMEM;
14168 + DLM_ASSERT(len <= DLM_RESNAME_MAXLEN,);
14170 + bucket = dlm_hash(name, len);
14171 + bucket &= (ls->ls_rsbtbl_size - 1);
14173 + read_lock(&ls->ls_rsbtbl[bucket].lock);
14174 + r = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, len);
14176 + if (r->res_nodeid != 0 && (flags & MASTER))
14179 + atomic_inc(&r->res_ref);
14181 + read_unlock(&ls->ls_rsbtbl[bucket].lock);
14186 + /* Always create sublocks */
14187 + if (!(flags & CREATE) && !parent) {
14192 + r = allocate_rsb(ls, len);
14196 + INIT_LIST_HEAD(&r->res_subreslist);
14197 + INIT_LIST_HEAD(&r->res_grantqueue);
14198 + INIT_LIST_HEAD(&r->res_convertqueue);
14199 + INIT_LIST_HEAD(&r->res_waitqueue);
14201 + memcpy(r->res_name, name, len);
14202 + r->res_length = len;
14204 + init_rwsem(&r->res_lock);
14205 + atomic_set(&r->res_ref, 1);
14206 + r->res_bucket = bucket;
14209 + r->res_parent = parent;
14210 + r->res_depth = parent->res_depth + 1;
14211 + r->res_root = parent->res_root;
14212 + r->res_nodeid = parent->res_nodeid;
14214 + r->res_parent = NULL;
14215 + r->res_depth = 1;
14217 + r->res_nodeid = -1;
14220 + write_lock(&ls->ls_rsbtbl[bucket].lock);
14221 + tmp = search_hashchain(&ls->ls_rsbtbl[bucket].list, parent, name, len);
14223 + atomic_inc(&tmp->res_ref);
14224 + write_unlock(&ls->ls_rsbtbl[bucket].lock);
14228 + list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
14229 + write_unlock(&ls->ls_rsbtbl[bucket].lock);
14231 + down_write(&ls->ls_root_lock);
14233 + list_add_tail(&r->res_subreslist,
14234 + &r->res_root->res_subreslist);
14236 + list_add(&r->res_rootlist, &ls->ls_rootres);
14237 + up_write(&ls->ls_root_lock);
14251 + * Add a LKB to a resource's grant/convert/wait queue. in order
14254 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode)
14256 + struct dlm_lkb *lkb = NULL;
14258 + list_for_each_entry(lkb, head, lkb_statequeue) {
14259 + if (lkb->lkb_rqmode < mode)
14264 + /* No entries in the queue, we are alone */
14265 + list_add_tail(new, head);
14267 + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
14272 + * The rsb res_lock must be held in write when this function is called.
14275 +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
14277 + DLM_ASSERT(!lkb->lkb_status,
14281 + lkb->lkb_status = type;
14284 + case GDLM_LKSTS_WAITING:
14285 + if (lkb->lkb_lockqueue_flags & DLM_LKF_HEADQUE)
14286 + list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
14288 + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
14291 + case GDLM_LKSTS_GRANTED:
14292 + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
14293 + lkb->lkb_grmode);
14296 + case GDLM_LKSTS_CONVERT:
14297 + if (lkb->lkb_lockqueue_flags & DLM_LKF_HEADQUE)
14298 + list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
14300 + list_add_tail(&lkb->lkb_statequeue,
14301 + &r->res_convertqueue);
14309 +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
14311 + down_write(&r->res_lock);
14312 + lkb_enqueue(r, lkb, type);
14313 + up_write(&r->res_lock);
14317 + * The rsb res_lock must be held in write when this function is called.
14320 +int lkb_dequeue(struct dlm_lkb *lkb)
14322 + int status = lkb->lkb_status;
14327 + lkb->lkb_status = 0;
14328 + list_del(&lkb->lkb_statequeue);
14334 +int res_lkb_dequeue(struct dlm_lkb *lkb)
14338 + down_write(&lkb->lkb_resource->res_lock);
14339 + status = lkb_dequeue(lkb);
14340 + up_write(&lkb->lkb_resource->res_lock);
14346 + * The rsb res_lock must be held in write when this function is called.
14349 +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
14353 + status = lkb_dequeue(lkb);
14354 + lkb_enqueue(r, lkb, type);
14359 +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type)
14363 + down_write(&r->res_lock);
14364 + status = lkb_swqueue(r, lkb, type);
14365 + up_write(&r->res_lock);
14369 diff -urN linux-orig/cluster/dlm/rsb.h linux-patched/cluster/dlm/rsb.h
14370 --- linux-orig/cluster/dlm/rsb.h 1970-01-01 07:30:00.000000000 +0730
14371 +++ linux-patched/cluster/dlm/rsb.h 2004-11-03 11:31:56.000000000 +0800
14373 +/******************************************************************************
14374 +*******************************************************************************
14376 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14377 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14379 +** This copyrighted material is made available to anyone wishing to use,
14380 +** modify, copy, or redistribute it subject to the terms and conditions
14381 +** of the GNU General Public License v.2.
14383 +*******************************************************************************
14384 +******************************************************************************/
14386 +#ifndef __RSB_DOT_H__
14387 +#define __RSB_DOT_H__
14392 +void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode);
14393 +void release_rsb(struct dlm_rsb *r);
14394 +void release_rsb_locked(struct dlm_rsb *r);
14395 +void hold_rsb(struct dlm_rsb *r);
14396 +int find_rsb(struct dlm_ls *ls, struct dlm_rsb *parent, char *name,
14397 + int namelen, int flags, struct dlm_rsb **rp);
14398 +struct dlm_rsb *find_rsb_to_unlock(struct dlm_ls *ls, struct dlm_lkb *lkb);
14399 +void lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
14400 +void res_lkb_enqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
14401 +int lkb_dequeue(struct dlm_lkb *lkb);
14402 +int res_lkb_dequeue(struct dlm_lkb *lkb);
14403 +int lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
14404 +int res_lkb_swqueue(struct dlm_rsb *r, struct dlm_lkb *lkb, int type);
14406 +#endif /* __RSB_DOT_H__ */
14407 diff -urN linux-orig/cluster/dlm/util.c linux-patched/cluster/dlm/util.c
14408 --- linux-orig/cluster/dlm/util.c 1970-01-01 07:30:00.000000000 +0730
14409 +++ linux-patched/cluster/dlm/util.c 2004-11-03 11:31:56.000000000 +0800
14411 +/******************************************************************************
14412 +*******************************************************************************
14414 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14415 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14417 +** This copyrighted material is made available to anyone wishing to use,
14418 +** modify, copy, or redistribute it subject to the terms and conditions
14419 +** of the GNU General Public License v.2.
14421 +*******************************************************************************
14422 +******************************************************************************/
14424 +#include "dlm_internal.h"
14426 +static const uint32_t crc_32_tab[] = {
14427 + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
14428 + 0xe963a535, 0x9e6495a3,
14429 + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd,
14430 + 0xe7b82d07, 0x90bf1d91,
14431 + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb,
14432 + 0xf4d4b551, 0x83d385c7,
14433 + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
14434 + 0xfa0f3d63, 0x8d080df5,
14435 + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447,
14436 + 0xd20d85fd, 0xa50ab56b,
14437 + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75,
14438 + 0xdcd60dcf, 0xabd13d59,
14439 + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
14440 + 0xcfba9599, 0xb8bda50f,
14441 + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11,
14442 + 0xc1611dab, 0xb6662d3d,
14443 + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
14444 + 0x9fbfe4a5, 0xe8b8d433,
14445 + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
14446 + 0x91646c97, 0xe6635c01,
14447 + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b,
14448 + 0x8208f4c1, 0xf50fc457,
14449 + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49,
14450 + 0x8cd37cf3, 0xfbd44c65,
14451 + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
14452 + 0xa4d1c46d, 0xd3d6f4fb,
14453 + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
14454 + 0xaa0a4c5f, 0xdd0d7cc9,
14455 + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3,
14456 + 0xb966d409, 0xce61e49f,
14457 + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
14458 + 0xb7bd5c3b, 0xc0ba6cad,
14459 + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af,
14460 + 0x04db2615, 0x73dc1683,
14461 + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d,
14462 + 0x0a00ae27, 0x7d079eb1,
14463 + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
14464 + 0x196c3671, 0x6e6b06e7,
14465 + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9,
14466 + 0x17b7be43, 0x60b08ed5,
14467 + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767,
14468 + 0x3fb506dd, 0x48b2364b,
14469 + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
14470 + 0x316e8eef, 0x4669be79,
14471 + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703,
14472 + 0x220216b9, 0x5505262f,
14473 + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
14474 + 0x2cd99e8b, 0x5bdeae1d,
14475 + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
14476 + 0x72076785, 0x05005713,
14477 + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d,
14478 + 0x7cdcefb7, 0x0bdbdf21,
14479 + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b,
14480 + 0x6fb077e1, 0x18b74777,
14481 + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
14482 + 0x616bffd3, 0x166ccf45,
14483 + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
14484 + 0x4969474d, 0x3e6e77db,
14485 + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5,
14486 + 0x47b2cf7f, 0x30b5ffe9,
14487 + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
14488 + 0x54de5729, 0x23d967bf,
14489 + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1,
14490 + 0x5a05df1b, 0x2d02ef8d
14494 + * dlm_hash - hash an array of data
14495 + * @data: the data to be hashed
14496 + * @len: the length of data to be hashed
14498 + * Copied from GFS.
14500 + * Take some data and convert it to a 32-bit hash.
14502 + * The hash function is a 32-bit CRC of the data. The algorithm uses
14503 + * the crc_32_tab table above.
14505 + * This may not be the fastest hash function, but it does a fair bit better
14506 + * at providing uniform results than the others I've looked at. That's
14507 + * really important for efficient directories.
14509 + * Returns: the hash
14512 +uint32_t dlm_hash(const char *data, int len)
14514 + uint32_t hash = 0xFFFFFFFF;
14516 + for (; len--; data++)
14517 + hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8);
14524 +void print_lkb(struct dlm_lkb *lkb)
14526 + printk("dlm: lkb\n"
14543 + lkb->lkb_lockqueue_state,
14544 + lkb->lkb_lockqueue_flags);
14547 +void print_rsb(struct dlm_rsb *r)
14549 + printk("dlm: rsb\n"
14557 + atomic_read(&r->res_ref));
14560 +void print_request(struct dlm_request *req)
14562 + printk("dlm: request\n"
14569 + req->rr_header.rh_cmd,
14570 + req->rr_header.rh_lkid,
14577 +void print_reply(struct dlm_reply *rp)
14579 + printk("dlm: reply\n"
14586 + rp->rl_header.rh_cmd,
14587 + rp->rl_header.rh_lkid,
14588 + rp->rl_lockstate,
14594 diff -urN linux-orig/cluster/dlm/util.h linux-patched/cluster/dlm/util.h
14595 --- linux-orig/cluster/dlm/util.h 1970-01-01 07:30:00.000000000 +0730
14596 +++ linux-patched/cluster/dlm/util.h 2004-11-03 11:31:56.000000000 +0800
14598 +/******************************************************************************
14599 +*******************************************************************************
14601 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14602 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14604 +** This copyrighted material is made available to anyone wishing to use,
14605 +** modify, copy, or redistribute it subject to the terms and conditions
14606 +** of the GNU General Public License v.2.
14608 +*******************************************************************************
14609 +******************************************************************************/
14611 +#ifndef __UTIL_DOT_H__
14612 +#define __UTIL_DOT_H__
14614 +uint32_t dlm_hash(const char *data, int len);
14616 +void print_lkb(struct dlm_lkb *lkb);
14617 +void print_rsb(struct dlm_rsb *r);
14618 +void print_request(struct dlm_request *req);
14619 +void print_reply(struct dlm_reply *rp);
14622 diff -urN linux-orig/include/cluster/dlm.h linux-patched/include/cluster/dlm.h
14623 --- linux-orig/include/cluster/dlm.h 1970-01-01 07:30:00.000000000 +0730
14624 +++ linux-patched/include/cluster/dlm.h 2004-11-03 11:31:56.000000000 +0800
14626 +/******************************************************************************
14627 +*******************************************************************************
14629 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
14630 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
14632 +** This copyrighted material is made available to anyone wishing to use,
14633 +** modify, copy, or redistribute it subject to the terms and conditions
14634 +** of the GNU General Public License v.2.
14636 +*******************************************************************************
14637 +******************************************************************************/
14639 +#ifndef __DLM_DOT_H__
14640 +#define __DLM_DOT_H__
14643 + * Interface to DLM - routines and structures to use DLM lockspaces.
14650 +#define DLM_LOCK_IV (-1) /* invalid */
14651 +#define DLM_LOCK_NL (0) /* null */
14652 +#define DLM_LOCK_CR (1) /* concurrent read */
14653 +#define DLM_LOCK_CW (2) /* concurrent write */
14654 +#define DLM_LOCK_PR (3) /* protected read */
14655 +#define DLM_LOCK_PW (4) /* protected write */
14656 +#define DLM_LOCK_EX (5) /* exclusive */
14659 + * Maximum size in bytes of a dlm_lock name
14662 +#define DLM_RESNAME_MAXLEN (64)
14665 + * Size in bytes of Lock Value Block
14668 +#define DLM_LVB_LEN (32)
14671 + * Flags to dlm_new_lockspace
14673 + * DLM_LSF_NOTIMERS
14675 + * Do not subject locks in this lockspace to time-outs.
14678 +#define DLM_LSF_NOTIMERS (1)
14681 + * Flags to dlm_lock
14683 + * DLM_LKF_NOQUEUE
14685 + * Do not queue the lock request on the wait queue if it cannot be granted
14686 + * immediately. If the lock cannot be granted because of this flag, DLM will
14687 + * either return -EAGAIN from the dlm_lock call or will return 0 from
14688 + * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
14690 + * DLM_LKF_CONVERT
14692 + * Indicates a lock conversion request. For conversions the name and namelen
14693 + * are ignored and the lock ID in the LKSB is used to identify the lock.
14697 + * Requests DLM to return the current contents of the lock value block in the
14698 + * lock status block. When this flag is set in a lock conversion from PW or EX
14699 + * modes, DLM assigns the value specified in the lock status block to the lock
14700 + * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
14701 + * containing application-specific information.
14705 + * Force a conversion request to be queued, even if it is compatible with
14706 + * the granted modes of other locks on the same resource.
14710 + * Used to cancel a pending conversion (with dlm_unlock). Lock is returned to
14711 + * previously granted mode.
14713 + * DLM_LKF_IVVALBLK
14715 + * Invalidate/clear the lock value block.
14717 + * DLM_LKF_CONVDEADLK
14719 + * The granted mode of a lock being converted (from a non-NL mode) can be
14720 + * changed to NL in the process of acquiring the requested mode to avoid
14721 + * conversion deadlock.
14723 + * DLM_LKF_PERSISTENT
14725 + * Only relevant to locks originating in userspace. Signals to the ioctl.c code
14726 + * that this lock should not be unlocked when the process exits.
14728 + * DLM_LKF_NODLCKWT
14730 + * This lock is not to be checked for conversion deadlocks.
14732 + * DLM_LKF_NODLCKBLK
14734 + * not yet implemented
14736 + * DLM_LKF_EXPEDITE
14738 + * Used only with new requests for NL mode locks. Tells the lock manager
14739 + * to grant the lock, ignoring other locks in convert and wait queues.
14741 + * DLM_LKF_NOQUEUEBAST
14743 + * Send blocking AST's before returning -EAGAIN to the caller. It is only
14744 + * used along with the NOQUEUE flag. Blocking AST's are not sent for failed
14745 + * NOQUEUE requests otherwise.
14747 + * DLM_LKF_HEADQUE
14749 + * Add a lock to the head of the convert or wait queue rather than the tail.
14751 + * DLM_LKF_NOORDER
14753 + * Disregard the standard grant order rules and grant a lock as soon as it
14754 + * is compatible with other granted locks.
14757 +#define DLM_LKF_NOQUEUE (0x00000001)
14758 +#define DLM_LKF_CANCEL (0x00000002)
14759 +#define DLM_LKF_CONVERT (0x00000004)
14760 +#define DLM_LKF_VALBLK (0x00000008)
14761 +#define DLM_LKF_QUECVT (0x00000010)
14762 +#define DLM_LKF_IVVALBLK (0x00000020)
14763 +#define DLM_LKF_CONVDEADLK (0x00000040)
14764 +#define DLM_LKF_PERSISTENT (0x00000080)
14765 +#define DLM_LKF_NODLCKWT (0x00000100)
14766 +#define DLM_LKF_NODLCKBLK (0x00000200)
14767 +#define DLM_LKF_EXPEDITE (0x00000400)
14768 +#define DLM_LKF_NOQUEUEBAST (0x00000800)
14769 +#define DLM_LKF_HEADQUE (0x00001000)
14770 +#define DLM_LKF_NOORDER (0x00002000)
14771 +#define DLM_LKF_ORPHAN (0x00004000)
14774 + * Some return codes that are not in errno.h
14777 +#define DLM_ECANCEL (0x10001)
14778 +#define DLM_EUNLOCK (0x10002)
14780 +typedef void dlm_lockspace_t;
14783 + * Lock range structure
14786 +struct dlm_range {
14787 + uint64_t ra_start;
14792 + * Lock status block
14794 + * Use this structure to specify the contents of the lock value block. For a
14795 + * conversion request, this structure is used to specify the lock ID of the
14796 + * lock. DLM writes the status of the lock request and the lock ID assigned
14797 + * to the request in the lock status block.
14799 + * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
14800 + * It is available when dlm_lock returns.
14802 + * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
14803 + * shown for the DLM_LKF_VALBLK flag.
14805 + * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
14806 + * it was first demoted to NL to avoid conversion deadlock.
14808 + * sb_status: the returned status of the lock request set prior to AST
14809 + * execution. Possible return values:
14811 + * 0 if lock request was successful
14812 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14813 + * -ENOMEM if there is no memory to process request
14814 + * -EINVAL if there are invalid parameters
14815 + * -DLM_EUNLOCK if unlock request was successful
14819 +#define DLM_SBF_DEMOTED (0x01)
14823 + uint32_t sb_lkid;
14825 + char * sb_lvbptr;
14829 + * These defines are the bits that make up the query code.
14832 +/* Bits 0, 1, 2, the lock mode or DLM_LOCK_THIS, see DLM_LOCK_NL etc in
14833 + * dlm.h Ignored for DLM_QUERY_LOCKS_ALL */
14834 +#define DLM_LOCK_THIS 0x0007
14835 +#define DLM_QUERY_MODE_MASK 0x0007
14837 +/* Bits 3, 4, 5 bitmap of queue(s) to query */
14838 +#define DLM_QUERY_QUEUE_WAIT 0x0008
14839 +#define DLM_QUERY_QUEUE_CONVERT 0x0010
14840 +#define DLM_QUERY_QUEUE_GRANT 0x0020
14841 +#define DLM_QUERY_QUEUE_GRANTED 0x0030 /* Shorthand */
14842 +#define DLM_QUERY_QUEUE_ALL 0x0038 /* Shorthand */
14844 +/* Bit 6, Return only the information that can be established without a network
14845 + * round-trip. The caller must be aware of the implications of this. Useful for
14846 + * just getting the master node id or resource name. */
14847 +#define DLM_QUERY_LOCAL 0x0040
14849 +/* Bits 8 up, query type */
14850 +#define DLM_QUERY_LOCKS_HIGHER 0x0100
14851 +#define DLM_QUERY_LOCKS_LOWER 0x0200
14852 +#define DLM_QUERY_LOCKS_EQUAL 0x0300
14853 +#define DLM_QUERY_LOCKS_BLOCKING 0x0400
14854 +#define DLM_QUERY_LOCKS_NOTBLOCK 0x0500
14855 +#define DLM_QUERY_LOCKS_ALL 0x0600
14856 +#define DLM_QUERY_LOCKS_ORPHAN 0x0700
14857 +#define DLM_QUERY_MASK 0x0F00
14859 +/* GRMODE is the default for mode comparisons,
14860 + RQMODE might also be handy */
14861 +#define DLM_QUERY_GRMODE 0x0000
14862 +#define DLM_QUERY_RQMODE 0x1000
14864 +/* Structures passed into and out of the query */
14866 +struct dlm_lockinfo {
14867 + int lki_lkid; /* Lock ID on originating node */
14868 + int lki_mstlkid; /* Lock ID on master node */
14870 + int lki_node; /* Originating node (not master) */
14871 + int lki_ownpid; /* Owner pid on originating node */
14872 + uint8_t lki_state; /* Queue the lock is on */
14873 + uint8_t lki_grmode; /* Granted mode */
14874 + uint8_t lki_rqmode; /* Requested mode */
14875 + struct dlm_range lki_grrange; /* Granted range, if applicable */
14876 + struct dlm_range lki_rqrange; /* Requested range, if applicable */
14879 +struct dlm_resinfo {
14881 + int rsi_grantcount; /* No. of nodes on grant queue */
14882 + int rsi_convcount; /* No. of nodes on convert queue */
14883 + int rsi_waitcount; /* No. of nodes on wait queue */
14884 + int rsi_masternode; /* Master for this resource */
14885 + char rsi_name[DLM_RESNAME_MAXLEN]; /* Resource name */
14886 + char rsi_valblk[DLM_LVB_LEN]; /* Master's LVB contents, if applicable
14890 +struct dlm_queryinfo {
14891 + struct dlm_resinfo *gqi_resinfo;
14892 + struct dlm_lockinfo *gqi_lockinfo; /* This points to an array
14894 + int gqi_locksize; /* input */
14895 + int gqi_lockcount; /* output */
14902 + * Starts and initializes DLM threads and structures. Creation of the first
14903 + * lockspace will call this if it has not been called already.
14905 + * Returns: 0 if successful, -EXXX on error
14908 +int dlm_init(void);
14913 + * Stops DLM threads.
14915 + * Returns: 0 if successful, -EXXX on error
14918 +int dlm_release(void);
14921 + * dlm_new_lockspace
14923 + * Starts a lockspace with the given name. If the named lockspace exists in
14924 + * the cluster, the calling node joins it.
14927 +int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
14931 + * dlm_release_lockspace
14933 + * Stop a lockspace.
14936 +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
14941 + * Make an asynchronous request to acquire or convert a lock on a named
14944 + * lockspace: context for the request
14945 + * mode: the requested mode of the lock (DLM_LOCK_)
14946 + * lksb: lock status block for input and async return values
14947 + * flags: input flags (DLM_LKF_)
14948 + * name: name of the resource to lock, can be binary
14949 + * namelen: the length in bytes of the resource name (DLM_RESNAME_MAXLEN)
14950 + * parent: the lock ID of a parent lock or 0 if none
14951 + * lockast: function DLM executes when it completes processing the request
14952 + * astarg: argument passed to lockast and bast functions
14953 + * bast: function DLM executes when this lock later blocks another request
14956 + * 0 if request is successfully queued for processing
14957 + * -EINVAL if any input parameters are invalid
14958 + * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
14959 + * -ENOMEM if there is no memory to process request
14960 + * -ENOTCONN if there is a communication error
14962 + * If the call to dlm_lock returns an error then the operation has failed and
14963 + * the AST routine will not be called. If dlm_lock returns 0 it is still
14964 + * possible that the lock operation will fail. The AST routine will be called
14965 + * when the locking is complete and the status is returned in the lksb.
14967 + * If the AST routines or parameters are passed to a conversion operation then
14968 + * they will overwrite those values that were passed to a previous dlm_lock
14971 + * AST routines should not block (at least not for long), but may make
14972 + * any locking calls they please.
14975 +int dlm_lock(dlm_lockspace_t *lockspace,
14977 + struct dlm_lksb *lksb,
14980 + unsigned int namelen,
14982 + void (*lockast) (void *astarg),
14984 + void (*bast) (void *astarg, int mode),
14985 + struct dlm_range *range);
14990 + * Asynchronously release a lock on a resource. The AST routine is called
14991 + * when the resource is successfully unlocked.
14993 + * lockspace: context for the request
14994 + * lkid: the lock ID as returned in the lksb
14995 + * flags: input flags (DLM_LKF_)
14996 + * lksb: if NULL the lksb parameter passed to last lock request is used
14997 + * astarg: the arg used with the completion ast for the unlock
15000 + * 0 if request is successfully queued for processing
15001 + * -EINVAL if any input parameters are invalid
15002 + * -ENOTEMPTY if the lock still has sublocks
15003 + * -EBUSY if the lock is waiting for a remote lock operation
15004 + * -ENOTCONN if there is a communication error
15007 +extern int dlm_unlock(dlm_lockspace_t *lockspace,
15010 + struct dlm_lksb *lksb,
15013 +/* Query interface
15015 + * Query the other holders of a resource, given a known lock ID
15017 + * lockspace: context for the request
15018 + * lksb: LKSB, sb_lkid contains the lock ID of a valid lock
15019 + * on the resource. sb_status will contain the status
15020 + * of the request on completion.
15021 + * query: query bitmap see DLM_QUERY_* above
15022 + * qinfo: pointer to dlm_queryinfo structure
15023 + * ast_routine: AST routine to call on completion
15024 + * astarg: argument to AST routine. It is "traditional"
15025 + * to put the qinfo pointer into lksb->sb_lvbptr
15026 + * and pass the lksb in here.
15028 +extern int dlm_query(dlm_lockspace_t *lockspace,
15029 + struct dlm_lksb *lksb,
15031 + struct dlm_queryinfo *qinfo,
15032 + void (ast_routine(void *)),
15036 +void dlm_debug_dump(void);
15037 +void dlm_locks_dump(void);
15039 +#endif /* __KERNEL__ */
15041 +#endif /* __DLM_DOT_H__ */
15042 diff -urN linux-orig/include/cluster/dlm_device.h linux-patched/include/cluster/dlm_device.h
15043 --- linux-orig/include/cluster/dlm_device.h 1970-01-01 07:30:00.000000000 +0730
15044 +++ linux-patched/include/cluster/dlm_device.h 2004-11-03 11:31:56.000000000 +0800
15046 +/******************************************************************************
15047 +*******************************************************************************
15049 +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
15050 +** Copyright (C) 2004 Red Hat, Inc. All rights reserved.
15052 +** This copyrighted material is made available to anyone wishing to use,
15053 +** modify, copy, or redistribute it subject to the terms and conditions
15054 +** of the GNU General Public License v.2.
15056 +*******************************************************************************
15057 +******************************************************************************/
15059 +/* This is the device interface for dlm, most users will use a library
15063 +/* Version of the device interface */
15064 +#define DLM_DEVICE_VERSION_MAJOR 2
15065 +#define DLM_DEVICE_VERSION_MINOR 0
15066 +#define DLM_DEVICE_VERSION_PATCH 0
15068 +/* struct passed to the lock write */
15069 +struct dlm_lock_params {
15070 + uint32_t version[3];
15076 + struct dlm_range range;
15082 + struct dlm_lksb *lksb;
15087 +/* struct read from the "device" fd,
15088 + consists mainly of userspace pointers for the library to use */
15089 +struct dlm_lock_result {
15092 + void (*astaddr)(void *astparam);
15093 + struct dlm_lksb *user_lksb;
15094 + struct dlm_lksb lksb; /* But this has real data in it */
15095 + uint8_t bast_mode; /* Not yet used */
15098 +/* commands passed to the device */
15099 +#define DLM_USER_LOCK 1
15100 +#define DLM_USER_UNLOCK 2
15101 +#define DLM_USER_QUERY 3
15103 +/* Arbitrary length restriction */
15104 +#define MAX_LS_NAME_LEN 64
15106 +/* ioctls on the device */
15107 +#define DLM_CREATE_LOCKSPACE _IOW('D', 0x01, char *)
15108 +#define DLM_RELEASE_LOCKSPACE _IOW('D', 0x02, char *)
15109 +#define DLM_FORCE_RELEASE_LOCKSPACE _IOW('D', 0x03, char *)